# NOTE(review): `spaces` stays the very first import, as in the original file;
# presumably Hugging Face ZeroGPU needs it loaded before torch - confirm before
# reordering.
import spaces

import datetime
import math
import os
import tempfile
from enum import Enum
from io import BytesIO
from pathlib import Path
from typing import Optional

import cv2
import dds_cloudapi_sdk
import gradio as gr
import httpx
import numpy as np
import safetensors.torch as sf
import supervision as sv
import torch
from dds_cloudapi_sdk import Config, Client, TextPrompt
from dds_cloudapi_sdk.tasks import DetectionTarget
from dds_cloudapi_sdk.tasks.detection import DetectionTask
from dds_cloudapi_sdk.tasks.dinox import DinoxTask
from depth_anything_v2.dpt import DepthAnythingV2
from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    DPMSolverMultistepScheduler,
    EulerAncestralDiscreteScheduler,
    StableDiffusionImg2ImgPipeline,
    StableDiffusionPipeline,
    UNet2DConditionModel,
)
from diffusers.models.attention_processor import AttnProcessor2_0
from PIL import Image
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor
from torch.hub import download_url_to_file
from transformers import AutoModelForImageSegmentation, CLIPTextModel, CLIPTokenizer

# Shared HTTP client with a 10 second timeout.
client = httpx.Client(timeout=httpx.Timeout(10.0))

# Local model paths
model_path = './models/iclight_sd15_fc.safetensors'
model_path2 = './checkpoints/depth_anything_v2_vits.pth'
model_path3 = './checkpoints/sam2_hiera_large.pt'
model_path4 = './checkpoints/config.json'
model_path5 = './checkpoints/preprocessor_config.json'
model_path6 = './configs/sam2_hiera_l.yaml'
model_path7 = './mvadapter_i2mv_sdxl.safetensors'

# Base URL for the repository
BASE_URL = 'https://huggingface.co/Ashoka74/Placement/resolve/main/'

# Local path -> file name under BASE_URL
model_urls = {
    model_path: 'iclight_sd15_fc.safetensors',
    model_path2: 'depth_anything_v2_vits.pth',
    model_path3: 'sam2_hiera_large.pt',
    model_path4: 'config.json',
    model_path5: 'preprocessor_config.json',
    model_path6: 'sam2_hiera_l.yaml',
    model_path7: 'mvadapter_i2mv_sdxl.safetensors'
}


def ensure_directories():
    """Create the parent directory of every path listed in ``model_urls``."""
    for path in model_urls:
        directory = os.path.dirname(path)
        # A bare file name has an empty dirname; os.makedirs('') would raise.
        if directory:
            os.makedirs(directory, exist_ok=True)


def download_models():
    """Download every file in ``model_urls`` that is not present locally.

    Best effort: a failed download is reported on stdout and the loop
    continues with the next file.
    """
    for local_path, filename in model_urls.items():
        if os.path.exists(local_path):
            continue
        try:
            url = f"{BASE_URL}{filename}"
            print(f"Downloading {filename}")
            download_url_to_file(url, local_path)
            print(f"Successfully downloaded {filename}")
        except Exception as e:
            print(f"Error downloading {filename}: {e}")


ensure_directories()
download_models()

try:
    import xformers
    import xformers.ops
    XFORMERS_AVAILABLE = True
    print("xformers is available - Using memory efficient attention")
except ImportError:
    XFORMERS_AVAILABLE = False
    print("xformers not available - Using default attention")

# Memory optimizations for RTX 2070
torch.backends.cudnn.benchmark = True
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    # NOTE(review): the original also assigned
    # `torch.backends.cuda.max_split_size_mb = 512`. That attribute is not a
    # PyTorch setting, so the assignment had no effect and was dropped. If a
    # split-size limit is wanted it has to go through the
    # PYTORCH_CUDA_ALLOC_CONF environment variable before CUDA initialises.
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# 'stablediffusionapi/realistic-vision-v51'
# 'runwayml/stable-diffusion-v1-5'
sd15_name = 'stablediffusionapi/realistic-vision-v51'
tokenizer = CLIPTokenizer.from_pretrained(sd15_name, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(sd15_name, subfolder="text_encoder")
vae = AutoencoderKL.from_pretrained(sd15_name, subfolder="vae")
unet = UNet2DConditionModel.from_pretrained(sd15_name, subfolder="unet")

# Background-removal model; kept in float32.
rmbg = AutoModelForImageSegmentation.from_pretrained("briaai/RMBG-1.4", trust_remote_code=True)
rmbg = rmbg.to(device=device, dtype=torch.float32)

# Depth model; loaded from the same checkpoint path that download_models() fills.
model = DepthAnythingV2(encoder='vits', features=64, out_channels=[48, 96, 192, 384])
model.load_state_dict(torch.load(model_path2, map_location=device))
model = model.to(device)
model.eval()

# Change UNet: widen conv_in from 4 to 8 input channels. The first 4 channels
# keep the pretrained weights; the 4 new channels start at zero.
with torch.no_grad():
    new_conv_in = torch.nn.Conv2d(8, unet.conv_in.out_channels, unet.conv_in.kernel_size,
                                  unet.conv_in.stride, unet.conv_in.padding)
    new_conv_in.weight.zero_()
    new_conv_in.weight[:, :4, :, :].copy_(unet.conv_in.weight)
    new_conv_in.bias = unet.conv_in.bias
    unet.conv_in = new_conv_in

unet_original_forward = unet.forward


def _use_default_attn_processor():
    """Install ``AttnProcessor2_0`` on both the UNet and the VAE."""
    unet.set_attn_processor(AttnProcessor2_0())
    vae.set_attn_processor(AttnProcessor2_0())


def enable_efficient_attention():
    """Enable xformers attention when available, else ``AttnProcessor2_0``.

    The log messages still say "sliced attention" (kept verbatim from the
    original), although the fallback installs ``AttnProcessor2_0``.
    """
    if not XFORMERS_AVAILABLE:
        print("Using sliced attention")
        _use_default_attn_processor()
        return
    try:
        unet.set_use_memory_efficient_attention_xformers(True)
        vae.set_use_memory_efficient_attention_xformers(True)
        print("Enabled xformers memory efficient attention")
    except Exception as e:
        print(f"Xformers error: {e}")
        print("Falling back to sliced attention")
        _use_default_attn_processor()


def clear_memory():
    """Release cached CUDA memory and wait for pending kernels (no-op on CPU)."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()


# Enable efficient attention
enable_efficient_attention()


def hooked_unet_forward(sample, timestep, encoder_hidden_states, **kwargs):
    """UNet forward that appends the conditioning latents to ``sample``.

    The conditioning tensor is read from
    ``kwargs['cross_attention_kwargs']['concat_conds']``, repeated along the
    batch axis to match ``sample`` and concatenated on the channel axis.
    ``cross_attention_kwargs`` is then emptied before the original forward
    is called.
    """
    c_concat = kwargs['cross_attention_kwargs']['concat_conds'].to(sample)
    c_concat = torch.cat([c_concat] * (sample.shape[0] // c_concat.shape[0]), dim=0)
    new_sample = torch.cat([sample, c_concat], dim=1)
    kwargs['cross_attention_kwargs'] = {}
    return unet_original_forward(new_sample, timestep, encoder_hidden_states, **kwargs)


unet.forward = hooked_unet_forward

# Merge the IC-Light weights: the file stores offsets that are added to the
# base UNet weights key by key.
sd_offset = sf.load_file(model_path)
sd_origin = unet.state_dict()
sd_merged = {k: sd_origin[k] + sd_offset[k] for k in sd_origin}
unet.load_state_dict(sd_merged, strict=True)
del sd_offset, sd_origin, sd_merged
# Device and dtype setup.
# Select CUDA only when it is actually available. The original unconditionally
# assigned torch.device('cuda') here, which overrode the CPU fallback chosen
# earlier in the file and made every `.to(device)` below fail on CPU-only hosts.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# float16 on GPU (as before); float32 on the CPU fallback path.
dtype = torch.float16 if device.type == 'cuda' else torch.float32

# Memory optimizations for RTX 2070
torch.backends.cudnn.benchmark = True
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    # NOTE(review): the original also assigned
    # `torch.backends.cuda.max_split_size_mb = 128`. That attribute is not a
    # PyTorch setting, so the assignment had no effect and was dropped; use the
    # PYTORCH_CUDA_ALLOC_CONF environment variable if a limit is required.

# Move models to device with consistent dtype
text_encoder = text_encoder.to(device=device, dtype=dtype)
vae = vae.to(device=device, dtype=dtype)
unet = unet.to(device=device, dtype=dtype)
rmbg = rmbg.to(device=device, dtype=torch.float32)  # Keep this as float32

ddim_scheduler = DDIMScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False,
    steps_offset=1,
)

euler_a_scheduler = EulerAncestralDiscreteScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    steps_offset=1
)

dpmpp_2m_sde_karras_scheduler = DPMSolverMultistepScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    algorithm_type="sde-dpmsolver++",
    use_karras_sigmas=True,
    steps_offset=1
)

# Pipelines: both share the same VAE, text encoder, UNet and scheduler.
t2i_pipe = StableDiffusionPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=dpmpp_2m_sde_karras_scheduler,
    safety_checker=None,
    requires_safety_checker=False,
    feature_extractor=None,
    image_encoder=None
)

i2i_pipe = StableDiffusionImg2ImgPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=dpmpp_2m_sde_karras_scheduler,
    safety_checker=None,
    requires_safety_checker=False,
    feature_extractor=None,
    image_encoder=None
)
@torch.inference_mode()
def encode_prompt_inner(txt: str):
    """Encode a prompt of arbitrary length with the CLIP text encoder.

    The prompt is tokenized without truncation and split into chunks of
    ``model_max_length - 2`` tokens. Each chunk is wrapped in BOS/EOS and
    padded with EOS up to ``model_max_length``.

    Returns:
        ``last_hidden_state`` of the text encoder, one row per chunk.
    """
    max_length = tokenizer.model_max_length
    chunk_length = max_length - 2
    id_start = tokenizer.bos_token_id
    id_end = tokenizer.eos_token_id
    id_pad = id_end

    def pad(x, p, i):
        return x[:i] if len(x) >= i else x + [p] * (i - len(x))

    tokens = tokenizer(txt, truncation=False, add_special_tokens=False)["input_ids"]
    # max(..., 1): an empty prompt still produces one (BOS, EOS, pad...) chunk
    # instead of an empty tensor that the text encoder cannot process.
    chunks = [[id_start] + tokens[i: i + chunk_length] + [id_end]
              for i in range(0, max(len(tokens), 1), chunk_length)]
    chunks = [pad(ck, id_pad, max_length) for ck in chunks]

    token_ids = torch.tensor(chunks).to(device=device, dtype=torch.int64)
    return text_encoder(token_ids).last_hidden_state


@torch.inference_mode()
def encode_prompt_pair(positive_prompt, negative_prompt):
    """Encode both prompts and make their chunk counts equal.

    The shorter embedding is repeated along the chunk axis until it has as
    many chunks as the longer one; the chunks of each prompt are then joined
    along the token axis.

    Returns:
        ``(conds, unconds)`` embeddings with a leading batch axis of 1.
    """
    c = encode_prompt_inner(positive_prompt)
    uc = encode_prompt_inner(negative_prompt)

    max_chunk = max(len(c), len(uc))
    c_repeat = int(math.ceil(max_chunk / float(len(c))))
    uc_repeat = int(math.ceil(max_chunk / float(len(uc))))

    c = torch.cat([c] * c_repeat, dim=0)[:max_chunk]
    uc = torch.cat([uc] * uc_repeat, dim=0)[:max_chunk]

    c = torch.cat([p[None, ...] for p in c], dim=1)
    uc = torch.cat([p[None, ...] for p in uc], dim=1)
    return c, uc


@spaces.GPU(duration=60)
@torch.inference_mode()
def pytorch2numpy(imgs, quant=True):
    """Convert channel-first tensors in [-1, 1] to channel-last numpy images.

    With ``quant=True`` the result is uint8 in [0, 255]; otherwise float32
    in [0, 1].
    """
    results = []
    for x in imgs:
        y = x.movedim(0, -1)
        if quant:
            y = y * 127.5 + 127.5
            y = y.detach().float().cpu().numpy().clip(0, 255).astype(np.uint8)
        else:
            y = y * 0.5 + 0.5
            y = y.detach().float().cpu().numpy().clip(0, 1).astype(np.float32)
        results.append(y)
    return results


@spaces.GPU(duration=60)
@torch.inference_mode()
def numpy2pytorch(imgs):
    """Stack channel-last images into one channel-first float tensor."""
    h = torch.from_numpy(np.stack(imgs, axis=0)).float() / 127.0 - 1.0  # so that 127 must be strictly 0.0
    h = h.movedim(-1, 1)
    return h


def resize_and_center_crop(image, target_width, target_height):
    """Scale ``image`` to cover the target size, then crop the centre."""
    pil_image = Image.fromarray(image)
    original_width, original_height = pil_image.size
    scale_factor = max(target_width / original_width, target_height / original_height)
    resized_width = int(round(original_width * scale_factor))
    resized_height = int(round(original_height * scale_factor))
    resized_image = pil_image.resize((resized_width, resized_height), Image.LANCZOS)
    left = (resized_width - target_width) / 2
    top = (resized_height - target_height) / 2
    right = (resized_width + target_width) / 2
    bottom = (resized_height + target_height) / 2
    cropped_image = resized_image.crop((left, top, right, bottom))
    return np.array(cropped_image)


def resize_without_crop(image, target_width, target_height):
    """Resize ``image`` to exactly the target size (aspect ratio not kept)."""
    pil_image = Image.fromarray(image)
    resized_image = pil_image.resize((target_width, target_height), Image.LANCZOS)
    return np.array(resized_image)


@spaces.GPU(duration=60)
@torch.inference_mode()
def run_rmbg(img, sigma=0.0):
    """Remove the background of ``img`` with the RMBG model.

    RGBA input is first composited onto a white background.

    Returns:
        ``(result, rgba)``: the RGB image blended towards grey (127) by the
        predicted alpha, and the RGB image with the predicted alpha attached
        as a fourth channel.

    Raises:
        ValueError: if the image does not have 3 channels after compositing.
    """
    # Convert RGBA to RGB if needed
    if img.shape[-1] == 4:
        # Use white background for alpha composition
        alpha = img[..., 3:] / 255.0
        rgb = img[..., :3]
        white_bg = np.ones_like(rgb) * 255
        img = (rgb * alpha + white_bg * (1 - alpha)).astype(np.uint8)

    H, W, C = img.shape
    if C != 3:
        raise ValueError(f"run_rmbg expects an RGB or RGBA image, got {C} channels")

    k = (256.0 / float(H * W)) ** 0.5
    # max(64, ...): extreme aspect ratios could otherwise round a side to 0.
    feed_width = max(64, int(64 * round(W * k)))
    feed_height = max(64, int(64 * round(H * k)))
    feed = resize_without_crop(img, feed_width, feed_height)
    feed = numpy2pytorch([feed]).to(device=device, dtype=torch.float32)
    alpha = rmbg(feed)[0][0]
    alpha = torch.nn.functional.interpolate(alpha, size=(H, W), mode="bilinear")
    alpha = alpha.movedim(1, -1)[0]
    alpha = alpha.detach().float().cpu().numpy().clip(0, 1)

    # Create RGBA image
    rgba = np.dstack((img, alpha * 255)).astype(np.uint8)
    result = 127 + (img.astype(np.float32) - 127 + sigma) * alpha
    return result.clip(0, 255).astype(np.uint8), rgba


def _synthetic_background(bg_source, width, height, bright, dark):
    """Build the grey or gradient background for a non-upload ``BGSource``.

    ``bright``/``dark`` are the two end values of the gradient; the bright
    end sits on the side named by the source (LEFT, RIGHT, TOP, BOTTOM).
    GREY is a constant image of value 64.

    Raises:
        ValueError: for sources that have no synthetic background.
    """
    if bg_source == BGSource.GREY:
        return np.zeros(shape=(height, width, 3), dtype=np.uint8) + 64
    if bg_source == BGSource.LEFT:
        image = np.tile(np.linspace(bright, dark, width), (height, 1))
    elif bg_source == BGSource.RIGHT:
        image = np.tile(np.linspace(dark, bright, width), (height, 1))
    elif bg_source == BGSource.TOP:
        image = np.tile(np.linspace(bright, dark, height)[:, None], (1, width))
    elif bg_source == BGSource.BOTTOM:
        image = np.tile(np.linspace(dark, bright, height)[:, None], (1, width))
    else:
        raise ValueError('Wrong background source!')
    return np.stack((image,) * 3, axis=-1).astype(np.uint8)


@spaces.GPU(duration=60)
@torch.inference_mode()
def process(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source):
    """Relight ``input_fg`` using a text prompt and a lighting preference.

    Works at the input image's own size (``image_width``/``image_height``
    are accepted for interface compatibility but not used): a low-res pass,
    an upscale by ``highres_scale`` rounded to multiples of 64, a high-res
    img2img pass, and a final resize back to the input size.

    Returns:
        A stacked uint8 array of the generated images.
    """
    clear_memory()

    # Get input dimensions
    input_height, input_width = input_fg.shape[:2]

    bg_source = BGSource(bg_source)
    if bg_source in (BGSource.UPLOAD, BGSource.UPLOAD_FLIP):
        # This entry point receives no background image, so the upload
        # sources start from pure noise (text-to-image branch below). The
        # original referenced an undefined `input_bg` here and then fell
        # into `raise '<str>'`, which is itself a TypeError.
        input_bg = None
    else:
        input_bg = _synthetic_background(bg_source, input_width, input_height, bright=255, dark=0)

    rng = torch.Generator(device=device).manual_seed(int(seed))

    # Use input dimensions directly
    fg = resize_without_crop(input_fg, input_width, input_height)

    concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor

    conds, unconds = encode_prompt_pair(positive_prompt=prompt + ', ' + a_prompt, negative_prompt=n_prompt)

    if input_bg is None:
        latents = t2i_pipe(
            prompt_embeds=conds,
            negative_prompt_embeds=unconds,
            width=input_width,
            height=input_height,
            num_inference_steps=steps,
            num_images_per_prompt=num_samples,
            generator=rng,
            output_type='latent',
            guidance_scale=cfg,
            cross_attention_kwargs={'concat_conds': concat_conds},
        ).images.to(vae.dtype) / vae.config.scaling_factor
    else:
        bg = resize_without_crop(input_bg, input_width, input_height)
        bg_latent = numpy2pytorch([bg]).to(device=vae.device, dtype=vae.dtype)
        bg_latent = vae.encode(bg_latent).latent_dist.mode() * vae.config.scaling_factor
        latents = i2i_pipe(
            image=bg_latent,
            strength=lowres_denoise,
            prompt_embeds=conds,
            negative_prompt_embeds=unconds,
            width=input_width,
            height=input_height,
            num_inference_steps=int(round(steps / lowres_denoise)),
            num_images_per_prompt=num_samples,
            generator=rng,
            output_type='latent',
            guidance_scale=cfg,
            cross_attention_kwargs={'concat_conds': concat_conds},
        ).images.to(vae.dtype) / vae.config.scaling_factor

    pixels = vae.decode(latents).sample
    pixels = pytorch2numpy(pixels)
    pixels = [resize_without_crop(
        image=p,
        target_width=int(round(input_width * highres_scale / 64.0) * 64),
        target_height=int(round(input_height * highres_scale / 64.0) * 64))
        for p in pixels]

    pixels = numpy2pytorch(pixels).to(device=vae.device, dtype=vae.dtype)
    latents = vae.encode(pixels).latent_dist.mode() * vae.config.scaling_factor
    latents = latents.to(device=unet.device, dtype=unet.dtype)

    highres_height, highres_width = latents.shape[2] * 8, latents.shape[3] * 8

    fg = resize_without_crop(input_fg, highres_width, highres_height)
    concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor

    latents = i2i_pipe(
        image=latents,
        strength=highres_denoise,
        prompt_embeds=conds,
        negative_prompt_embeds=unconds,
        width=highres_width,
        height=highres_height,
        num_inference_steps=int(round(steps / highres_denoise)),
        num_images_per_prompt=num_samples,
        generator=rng,
        output_type='latent',
        guidance_scale=cfg,
        cross_attention_kwargs={'concat_conds': concat_conds},
    ).images.to(vae.dtype) / vae.config.scaling_factor

    pixels = vae.decode(latents).sample
    pixels = pytorch2numpy(pixels)

    # Resize back to input dimensions
    pixels = [resize_without_crop(p, input_width, input_height) for p in pixels]
    pixels = np.stack(pixels)

    return pixels


def extract_foreground(image):
    """Run background removal and hand the RGBA cut-out to ``mask_mover``.

    Returns the blended result plus two ``gr.update(visible=True)`` values.
    """
    if image is None:
        return None, gr.update(visible=True), gr.update(visible=True)
    result, rgba = run_rmbg(image)
    mask_mover.set_extracted_fg(rgba)
    return result, gr.update(visible=True), gr.update(visible=True)


@torch.inference_mode()
def process_bg(input_fg, input_bg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, bg_source):
    """Relight ``input_fg`` conditioned on a background image.

    The background is the uploaded image (optionally flipped) or a synthetic
    grey/gradient image, depending on ``bg_source``.

    Returns:
        ``(pixels, [fg, bg])``: float32 images in [0, 1] and the high-res
        foreground/background crops used as conditioning.

    Raises:
        ValueError: for an unknown source, or an upload source without an
            uploaded background.
    """
    clear_memory()
    bg_source = BGSource(bg_source)

    if bg_source in (BGSource.UPLOAD, BGSource.UPLOAD_FLIP):
        if input_bg is None:
            raise ValueError('A background image is required for this background source!')
        if bg_source == BGSource.UPLOAD_FLIP:
            input_bg = np.fliplr(input_bg)
    else:
        # Raises ValueError for unknown sources (the original raised a plain
        # string, which is a TypeError).
        input_bg = _synthetic_background(bg_source, image_width, image_height, bright=224, dark=32)

    rng = torch.Generator(device=device).manual_seed(int(seed))

    fg = resize_and_center_crop(input_fg, image_width, image_height)
    bg = resize_and_center_crop(input_bg, image_width, image_height)
    concat_conds = numpy2pytorch([fg, bg]).to(device=vae.device, dtype=vae.dtype)
    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor
    concat_conds = torch.cat([c[None, ...] for c in concat_conds], dim=1)

    conds, unconds = encode_prompt_pair(positive_prompt=prompt + ', ' + a_prompt, negative_prompt=n_prompt)

    latents = t2i_pipe(
        prompt_embeds=conds,
        negative_prompt_embeds=unconds,
        width=image_width,
        height=image_height,
        num_inference_steps=steps,
        num_images_per_prompt=num_samples,
        generator=rng,
        output_type='latent',
        guidance_scale=cfg,
        cross_attention_kwargs={'concat_conds': concat_conds},
    ).images.to(vae.dtype) / vae.config.scaling_factor

    pixels = vae.decode(latents).sample
    pixels = pytorch2numpy(pixels)
    pixels = [resize_without_crop(
        image=p,
        target_width=int(round(image_width * highres_scale / 64.0) * 64),
        target_height=int(round(image_height * highres_scale / 64.0) * 64))
        for p in pixels]

    pixels = numpy2pytorch(pixels).to(device=vae.device, dtype=vae.dtype)
    latents = vae.encode(pixels).latent_dist.mode() * vae.config.scaling_factor
    latents = latents.to(device=unet.device, dtype=unet.dtype)

    image_height, image_width = latents.shape[2] * 8, latents.shape[3] * 8
    fg = resize_and_center_crop(input_fg, image_width, image_height)
    bg = resize_and_center_crop(input_bg, image_width, image_height)
    concat_conds = numpy2pytorch([fg, bg]).to(device=vae.device, dtype=vae.dtype)
    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor
    concat_conds = torch.cat([c[None, ...] for c in concat_conds], dim=1)

    latents = i2i_pipe(
        image=latents,
        strength=highres_denoise,
        prompt_embeds=conds,
        negative_prompt_embeds=unconds,
        width=image_width,
        height=image_height,
        num_inference_steps=int(round(steps / highres_denoise)),
        num_images_per_prompt=num_samples,
        generator=rng,
        output_type='latent',
        guidance_scale=cfg,
        cross_attention_kwargs={'concat_conds': concat_conds},
    ).images.to(vae.dtype) / vae.config.scaling_factor

    pixels = vae.decode(latents).sample
    pixels = pytorch2numpy(pixels, quant=False)

    clear_memory()
    return pixels, [fg, bg]


@torch.inference_mode()
def process_relight(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source):
    """Gradio entry point for foreground-only relighting; forwards to ``process``."""
    results = process(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source)
    return results


@torch.inference_mode()
def process_relight_bg(input_fg, input_bg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, bg_source):
    """Gradio entry point for background-conditioned relighting.

    Coerces the numeric inputs, removes the foreground's background, runs
    ``process_bg`` and saves the uint8 results via ``save_images``.

    Returns:
        The list of generated uint8 images.

    Raises:
        ValueError: for an invalid ``bg_source``.
    """
    bg_source = BGSource(bg_source)

    # Convert numerical inputs to appropriate types
    image_width = int(image_width)
    image_height = int(image_height)
    num_samples = int(num_samples)
    seed = int(seed)
    steps = int(steps)
    cfg = float(cfg)
    highres_scale = float(highres_scale)
    highres_denoise = float(highres_denoise)

    # The background is derived from `bg_source` once, inside process_bg().
    # The original also built it here, so UPLOAD_FLIP was flipped twice and
    # the flip cancelled out.
    input_fg, matting = run_rmbg(input_fg)
    results, extra_images = process_bg(input_fg, input_bg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, bg_source)
    results = [(x * 255.0).clip(0, 255).astype(np.uint8) for x in results]

    # Save the generated images
    save_images(results, prefix="relight")

    return results


quick_prompts = [
    'sunshine from window',
    'neon light, city',
    'sunset over sea',
    'golden time',
    'sci-fi RGB glowing, cyberpunk',
    'natural lighting',
    'warm atmosphere, at home, bedroom',
    'magic lit',
    'evil, gothic, Yharnam',
    'light and shadow',
    'shadow from window',
    'soft studio lighting',
    'home atmosphere, cozy bedroom illumination',
    'neon, Wong Kar-wai, warm'
]
quick_prompts = [[x] for x in quick_prompts]

quick_subjects = [
    'modern sofa, high quality leather',
    'elegant dining table, polished wood',
    'luxurious bed, premium mattress',
    'minimalist office desk, clean design',
    'vintage wooden cabinet, antique finish',
]
quick_subjects = [[x] for x in quick_subjects]


class BGSource(Enum):
    """Background / lighting choices; values are the labels shown in the UI."""
    UPLOAD = "Use Background Image"
    UPLOAD_FLIP = "Use Flipped Background Image"
    LEFT = "Left Light"
    RIGHT = "Right Light"
    TOP = "Top Light"
    BOTTOM = "Bottom Light"
    GREY = "Ambient"


def save_images(images, prefix="relight"):
    """Save ``images`` as PNG files under ``outputs/``.

    File names follow ``{prefix}_{timestamp}_{index}.png`` with a 1-based
    index. Numpy arrays are converted to PIL images first.

    Returns:
        The list of paths written (the original always returned ``[]``).
    """
    # Create output directory if it doesn't exist
    output_dir = Path("outputs")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create timestamp for unique filenames
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    saved_paths = []
    for i, img in enumerate(images):
        if isinstance(img, np.ndarray):
            img = Image.fromarray(img)
        filepath = output_dir / f"{prefix}_{timestamp}_{i+1}.png"
        img.save(filepath)
        saved_paths.append(filepath)

    return saved_paths


class MaskMover:
    """Holds an extracted RGBA foreground and pastes it onto backgrounds."""

    def __init__(self):
        self.extracted_fg = None
        self.original_fg = None  # Store original foreground

    def set_extracted_fg(self, fg_image):
        """Store the extracted foreground with alpha channel"""
        if isinstance(fg_image, np.ndarray):
            self.extracted_fg = fg_image.copy()
            self.original_fg = fg_image.copy()
        else:
            self.extracted_fg = np.array(fg_image)
            self.original_fg = np.array(fg_image)
        return self.extracted_fg

    def create_composite(self, background, x_pos, y_pos, scale=1.0):
        """Create composite with foreground at specified position

        The foreground is scaled by ``scale`` and centred on
        ``(x_pos, y_pos)``. Returns ``background`` unchanged when either
        image is missing; otherwise an RGB numpy array.
        """
        if self.original_fg is None or background is None:
            return background

        # Convert inputs to PIL Images
        if isinstance(background, np.ndarray):
            bg = Image.fromarray(background).convert('RGBA')
        else:
            bg = background.convert('RGBA')

        if isinstance(self.original_fg, np.ndarray):
            fg = Image.fromarray(self.original_fg).convert('RGBA')
        else:
            fg = self.original_fg.convert('RGBA')

        # Scale the foreground size; keep at least one pixel per side so a
        # very small scale cannot ask PIL for a zero-sized image.
        new_width = max(1, int(fg.width * scale))
        new_height = max(1, int(fg.height * scale))
        fg = fg.resize((new_width, new_height), Image.LANCZOS)

        # Center the scaled foreground at the position
        x = int(x_pos - new_width / 2)
        y = int(y_pos - new_height / 2)

        # Create composite
        result = bg.copy()
        result.paste(fg, (x, y), fg)  # Use fg as the mask (requires fg to be in 'RGBA' mode)

        return np.array(result.convert('RGB'))  # Convert back to 'RGB' if needed


def get_depth(image):
    """Return an INFERNO-coloured depth visualisation of ``image`` as a PIL image.

    Returns ``None`` when ``image`` is ``None``.
    """
    if image is None:
        return None
    # Convert from PIL/gradio format to cv2
    raw_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    # Get depth map
    depth = model.infer_image(raw_img)  # HxW raw depth map

    # Normalize depth for visualization; a constant map would divide by zero.
    depth_min = depth.min()
    depth_range = depth.max() - depth_min
    if depth_range > 0:
        depth = ((depth - depth_min) / depth_range * 255).astype(np.uint8)
    else:
        depth = np.zeros_like(depth, dtype=np.uint8)

    # Convert to RGB for display
    depth_colored = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
    depth_colored = cv2.cvtColor(depth_colored, cv2.COLOR_BGR2RGB)

    return Image.fromarray(depth_colored)


from PIL import Image


def compress_image(image):
    """Shrink ``image`` to at most 1024 px per side and JPEG-compress it.

    Quality starts at 95 and drops in steps of 5 until the encoded size is
    at most 100 KB or the quality has fallen below 20.

    The image is encoded in memory: the original wrote a fixed
    ``compressed_image.jpg`` in the working directory, which concurrent
    requests would overwrite. Input with an alpha channel is converted to
    RGB because JPEG cannot store alpha.

    Returns:
        The compressed image decoded back into a numpy array.
    """
    # Convert Gradio image (numpy array) to PIL Image
    img = Image.fromarray(image)
    if img.mode != "RGB":
        img = img.convert("RGB")

    # Resize image if dimensions are too large
    max_size = 1024  # Maximum dimension size
    if img.width > max_size or img.height > max_size:
        ratio = min(max_size / img.width, max_size / img.height)
        new_size = (int(img.width * ratio), int(img.height * ratio))
        img = img.resize(new_size, Image.Resampling.LANCZOS)

    def encode(jpeg_quality):
        buffer = BytesIO()
        img.save(buffer, "JPEG", quality=jpeg_quality)
        return buffer

    size_limit = 100 * 1024  # 100KB limit
    quality = 95  # Start with high quality
    encoded = encode(quality)
    while encoded.getbuffer().nbytes > size_limit:
        quality -= 5  # Decrease quality
        encoded = encode(quality)
        if quality < 20:  # Prevent quality from going too low
            break

    # Convert back to numpy array for Gradio
    encoded.seek(0)
    return np.array(Image.open(encoded))


def _upload_image_for_detection(dds_client, input_image):
    """Write ``input_image`` to a temporary JPEG, upload it, return its URL.

    The array is assumed to be RGB(A) as delivered by Gradio and is
    converted to BGR for ``cv2.imwrite``. The temporary file is removed
    even when the upload fails.
    """
    if input_image.ndim == 3 and input_image.shape[2] == 4:
        bgr = cv2.cvtColor(input_image, cv2.COLOR_RGBA2BGR)
    elif input_image.ndim == 3 and input_image.shape[2] == 3:
        bgr = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR)
    else:
        bgr = input_image

    with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmpfile:
        tmp_path = tmpfile.name
    try:
        cv2.imwrite(tmp_path, bgr)
        return dds_client.upload_file(tmp_path)
    finally:
        os.remove(tmp_path)


def _cutout_first_detection(input_image, detections):
    """Return the first detection as an RGBA crop, or ``None`` if unavailable.

    Alpha is 255 inside the first mask and 0 elsewhere; the crop is the
    bounding box of the mask.
    """
    if len(detections) == 0:
        return None
    first_mask = detections.mask[0]
    y_indices, x_indices = np.where(first_mask)
    if y_indices.size == 0:
        # Empty mask: min()/max() below would raise.
        return None

    H, W = input_image.shape[:2]
    alpha = np.zeros((H, W, 1), dtype=np.uint8)
    alpha[first_mask] = 255
    rgba = np.dstack((input_image[..., :3], alpha)).astype(np.uint8)

    y_min, y_max = y_indices.min(), y_indices.max()
    x_min, x_max = x_indices.min(), x_indices.max()
    return rgba[y_min:y_max + 1, x_min:x_max + 1]


@spaces.GPU(duration=60)
@torch.inference_mode()
def process_image(input_image, input_text):
    """Main processing function for the Gradio interface

    Uploads the image to the DDS cloud API, runs DINO-X detection
    (prompted by ``input_text``, or unprompted when it is empty), draws
    boxes, labels and masks, and stores the first detection's RGBA cut-out
    in ``mask_mover``.

    Returns:
        ``(annotated_frame, cropped_rgba, update, update)``. The cut-out is
        ``None`` and both updates hide their component when nothing usable
        was detected.
    """
    hidden = (gr.update(visible=False), gr.update(visible=False))
    if input_image is None:
        return (None, None) + hidden
    input_text = input_text or ""

    # NOTE(review): a credential is hard-coded in source. It is kept only as a
    # backward-compatible fallback; set DDS_API_TOKEN and rotate this token.
    api_token = os.environ.get("DDS_API_TOKEN", "9c8c865e10ec1821bea79d9fa9dc8720")
    output_dir = Path("outputs/grounded_sam2_dinox_demo")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Initialize DDS client (local; distinct from the module-level httpx client)
    dds_client = Client(Config(api_token))
    image_url = _upload_image_for_detection(dds_client, input_image)

    # Class ids: prompt classes first, in prompt order.
    prompt_classes = [x.strip().lower() for x in input_text.split('.') if x.strip()]
    class_name_to_id = {name: idx for idx, name in enumerate(prompt_classes)}

    # Run DINO-X detection. As in the original, explicit targets are only
    # requested when a text prompt is given.
    task_kwargs = {"image_url": image_url, "prompts": [TextPrompt(text=input_text)]}
    if input_text:
        task_kwargs["targets"] = [DetectionTarget.BBox, DetectionTarget.Mask]
    task = DinoxTask(**task_kwargs)
    dds_client.run_task(task)
    predictions = task.result.objects

    if not predictions:
        return (input_image.copy(), None) + hidden

    boxes = []
    masks = []
    confidences = []
    class_names = []
    class_ids = []
    for obj in predictions:
        boxes.append(obj.bbox)
        # convert mask to np.array using DDS API
        masks.append(DetectionTask.rle2mask(DetectionTask.string2rle(obj.mask.counts), obj.mask.size))
        confidences.append(obj.score)
        cls_name = obj.category.lower().strip()
        class_names.append(cls_name)
        # Categories missing from the prompt (or any category in unprompted
        # mode) get the next free id instead of raising KeyError.
        class_ids.append(class_name_to_id.setdefault(cls_name, len(class_name_to_id)))

    boxes = np.array(boxes)
    masks = np.array(masks)
    class_ids = np.array(class_ids)
    if masks.ndim == 4:
        masks = masks.squeeze(1)

    labels = [
        f"{class_name} {confidence:.2f}"
        for class_name, confidence in zip(class_names, confidences)
    ]

    detections = sv.Detections(
        xyxy=boxes,
        mask=masks.astype(bool),
        class_id=class_ids,
    )

    annotated_frame = input_image.copy()
    annotated_frame = sv.BoxAnnotator().annotate(scene=annotated_frame, detections=detections)
    annotated_frame = sv.LabelAnnotator().annotate(scene=annotated_frame, detections=detections, labels=labels)
    annotated_frame = sv.MaskAnnotator().annotate(scene=annotated_frame, detections=detections)

    # Create transparent cut-out for first detected object
    cropped_rgba = _cutout_first_detection(input_image, detections)
    if cropped_rgba is None:
        return (annotated_frame, None) + hidden

    # Set extracted foreground for mask mover
    mask_mover.set_extracted_fg(cropped_rgba)
    return annotated_frame, cropped_rgba, gr.update(visible=True), gr.update(visible=True)


# NOTE(review): the source reached review with its indentation stripped, so the
# nesting of the layout contexts below is reconstructed - confirm against the
# running UI.
block = gr.Blocks().queue()
with block:
    with gr.Tab("Text"):
        with gr.Row():
            gr.Markdown("## Product Placement from Text")
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    input_fg = gr.Image(type="numpy", label="Image", height=480)
                with gr.Row():
                    with gr.Group():
                        find_objects_button = gr.Button(value="(Option 1) Segment Object from text")
                        text_prompt = gr.Textbox(
                            label="Text Prompt",
                            placeholder="Enter object classes separated by periods (e.g. 'car . person .'), leave empty to get all objects",
                            value=""
                        )
                    extract_button = gr.Button(value="Remove Background")
                with gr.Row():
                    extracted_objects = gr.Image(type="numpy", label="Extracted Foreground", height=480)
                    extracted_fg = gr.Image(type="numpy", label="Extracted Foreground", height=480)

                with gr.Group():
                    prompt = gr.Textbox(label="Prompt")
                    # The two upload sources are skipped: this tab has no background input.
                    bg_source = gr.Radio(choices=[e.value for e in list(BGSource)[2:]],
                                         value=BGSource.LEFT.value,
                                         label="Lighting Preference (Initial Latent)", type='value')
                    example_quick_subjects = gr.Dataset(samples=quick_subjects, label='Subject Quick List', samples_per_page=1000, components=[prompt])
                    example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Lighting Quick List', samples_per_page=1000, components=[prompt])
                relight_button = gr.Button(value="Relight")

                with gr.Group(visible=False):
                    with gr.Row():
                        num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
                        seed = gr.Number(label="Seed", value=12345, precision=0)
                    with gr.Row():
                        image_width = gr.Slider(label="Image Width", minimum=256, maximum=1024, value=512, step=64)
                        image_height = gr.Slider(label="Image Height", minimum=256, maximum=1024, value=640, step=64)

                with gr.Accordion("Advanced options", open=False):
                    steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=15, step=1)
                    cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=2, step=0.01, visible=False)
                    lowres_denoise = gr.Slider(label="Lowres Denoise (for initial latent)", minimum=0.1, maximum=1.0, value=0.9, step=0.01)
                    highres_scale = gr.Slider(label="Highres Scale", minimum=1.0, maximum=3.0, value=1.5, step=0.01)
                    highres_denoise = gr.Slider(label="Highres Denoise", minimum=0.1, maximum=1.0, value=0.5, step=0.01)
                    a_prompt = gr.Textbox(label="Added Prompt", value='best quality', visible=False)
                    n_prompt = gr.Textbox(label="Negative Prompt", value='lowres, bad anatomy, bad hands, cropped, worst quality', visible=False)
                    x_slider = gr.Slider(
                        minimum=0,
                        maximum=1000,
                        label="X Position",
                        value=500,
                        visible=False
                    )
                    y_slider = gr.Slider(
                        minimum=0,
                        maximum=1000,
                        label="Y Position",
                        value=500,
                        visible=False
                    )
            with gr.Column():
                result_gallery = gr.Gallery(height=832, object_fit='contain', label='Outputs')
        with gr.Row():
            dummy_image_for_outputs = gr.Image(visible=False, label='Result')

        # Input order must match the parameter order of process_relight().
        ips = [input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source]
        relight_button.click(fn=process_relight, inputs=ips, outputs=[result_gallery])
        # NOTE(review): this call is cut off at the end of the reviewed chunk.
        # It is closed here after its last visible argument; restore any
        # further arguments from the full file.
        example_quick_prompts.click(
            lambda x, y: ', '.join(y.split(', ')[:2] + [x[0]]),
            inputs=[example_quick_prompts, prompt],
            outputs=prompt,
            show_progress=False,
        )
queue=False) example_quick_subjects.click(lambda x: x[0], inputs=example_quick_subjects, outputs=prompt, show_progress=False, queue=False) find_objects_button.click( fn=process_image, inputs=[input_fg, text_prompt], outputs=[extracted_objects, extracted_fg] ) extract_button.click( fn=extract_foreground, inputs=[input_fg], outputs=[extracted_fg, x_slider, y_slider] ) with gr.Tab("Background", visible=False): # empty cache mask_mover = MaskMover() # with torch.no_grad(): # # Update the input channels to 12 # new_conv_in = torch.nn.Conv2d(12, unet.conv_in.out_channels, unet.conv_in.kernel_size, unet.conv_in.stride, unet.conv_in.padding) # Changed from 8 to 12 # new_conv_in.weight.zero_() # new_conv_in.weight[:, :4, :, :].copy_(unet.conv_in.weight) # new_conv_in.bias = unet.conv_in.bias # unet.conv_in = new_conv_in with gr.Row(): gr.Markdown("## IC-Light (Relighting with Foreground and Background Condition)") gr.Markdown("💾 Generated images are automatically saved to 'outputs' folder") with gr.Row(): with gr.Column(): # Step 1: Input and Extract with gr.Row(): with gr.Group(): gr.Markdown("### Step 1: Extract Foreground") input_image = gr.Image(type="numpy", label="Input Image", height=480) # find_objects_button = gr.Button(value="Find Objects") extract_button = gr.Button(value="Remove Background") extracted_fg = gr.Image(type="numpy", label="Extracted Foreground", height=480) with gr.Row(): # Step 2: Background and Position with gr.Group(): gr.Markdown("### Step 2: Position on Background") input_bg = gr.Image(type="numpy", label="Background Image", height=480) with gr.Row(): x_slider = gr.Slider( minimum=0, maximum=1000, label="X Position", value=500, visible=False ) y_slider = gr.Slider( minimum=0, maximum=1000, label="Y Position", value=500, visible=False ) fg_scale_slider = gr.Slider( label="Foreground Scale", minimum=0.01, maximum=3.0, value=1.0, step=0.01 ) editor = gr.ImageEditor( type="numpy", label="Position Foreground", height=480, visible=False ) 
                        # Depth preview controls; the button is wired to
                        # get_depth further down in the file.
                        get_depth_button = gr.Button(value="Get Depth")
                        depth_image = gr.Image(type="numpy", label="Depth Image", height=480)

                # Step 3: Relighting Options
                with gr.Group():
                    gr.Markdown("### Step 3: Relighting Settings")
                    prompt = gr.Textbox(label="Prompt")
                    # Hidden selector; defaults to the UPLOAD member of BGSource.
                    bg_source = gr.Radio(
                        choices=[e.value for e in BGSource],
                        value=BGSource.UPLOAD.value,
                        label="Background Source",
                        type='value',
                        visible=False
                    )
                    example_prompts = gr.Dataset(
                        samples=quick_prompts,
                        label='Prompt Quick List',
                        components=[prompt]
                    )
                    # Disabled background quick list, kept for reference:
                    # bg_gallery = gr.Gallery(
                    #     height=450,
                    #     label='Background Quick List',
                    #     value=db_examples.bg_samples,
                    #     columns=5,
                    #     allow_preview=False
                    # )
                    relight_button_bg = gr.Button(value="Relight")

                # Additional settings
                # NOTE: these names (num_samples, seed, image_width, ...) rebind
                # the ones created for the "Text" tab, so later references
                # resolve to the components below.
                with gr.Group():
                    with gr.Row():
                        num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
                        seed = gr.Number(label="Seed", value=12345, precision=0)
                    with gr.Row():
                        image_width = gr.Slider(label="Image Width", minimum=256, maximum=1024, value=512, step=64)
                        image_height = gr.Slider(label="Image Height", minimum=256, maximum=1024, value=640, step=64)

                with gr.Accordion("Advanced options", open=False):
                    steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
                    cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=7.0, step=0.01)
                    highres_scale = gr.Slider(label="Highres Scale", minimum=1.0, maximum=2.0, value=1.2, step=0.01)
                    highres_denoise = gr.Slider(label="Highres Denoise", minimum=0.1, maximum=0.9, value=0.5, step=0.01)
                    a_prompt = gr.Textbox(label="Added Prompt", value='best quality')
                    n_prompt = gr.Textbox(
                        label="Negative Prompt",
                        value='lowres, bad anatomy, bad hands, cropped, worst quality'
                    )

            with gr.Column():
                # Single-image output (the "Text" tab uses a Gallery instead).
                result_gallery = gr.Image(height=832, label='Outputs')

        def extract_foreground(image):
            """Remove the background from ``image`` and remember the cut-out.

            Args:
                image: Input image value from the UI, or ``None`` when nothing
                    has been provided.

            Returns:
                A 3-tuple ``(result, update, update)``: the first value
                returned by ``run_rmbg`` (``None`` when ``image`` is ``None``)
                followed by two ``gr.update(visible=True)`` objects for the
                position sliders this handler is wired to.
            """
            if image is None:
                # Nothing to process; the sliders are still made visible.
                return None, gr.update(visible=True), gr.update(visible=True)
            result, rgba = run_rmbg(image)
            # Store the RGBA cut-out on the shared mask_mover for compositing.
            mask_mover.set_extracted_fg(rgba)
            return result, gr.update(visible=True), gr.update(visible=True)

        # NOTE(review): appears to be a leftover of the commented-out
        # module-global update_position(); BackgroundManager keeps its own
        # copy. Confirm nothing else reads it before removing.
        original_bg = None
extract_button.click( fn=extract_foreground, inputs=[input_image], outputs=[extracted_fg, x_slider, y_slider] ) find_objects_button.click( fn=process_image, inputs=[input_image, text_prompt], outputs=[extracted_objects, extracted_fg, x_slider, y_slider] ) get_depth_button.click( fn=get_depth, inputs=[input_bg], outputs=[depth_image] ) # def update_position(background, x_pos, y_pos, scale): # """Update composite when position changes""" # global original_bg # if background is None: # return None # if original_bg is None: # original_bg = background.copy() # # Convert string values to float # x_pos = float(x_pos) # y_pos = float(y_pos) # scale = float(scale) # return mask_mover.create_composite(original_bg, x_pos, y_pos, scale) class BackgroundManager: def __init__(self): self.original_bg = None def update_position(self, background, x_pos, y_pos, scale): """Update composite when position changes""" if background is None: return None if self.original_bg is None: self.original_bg = background.copy() # Convert string values to float x_pos = float(x_pos) y_pos = float(y_pos) scale = float(scale) return mask_mover.create_composite(self.original_bg, x_pos, y_pos, scale) # Create an instance of BackgroundManager bg_manager = BackgroundManager() x_slider.change( fn=lambda bg, x, y, scale: bg_manager.update_position(bg, x, y, scale), inputs=[input_bg, x_slider, y_slider, fg_scale_slider], outputs=[input_bg] ) y_slider.change( fn=lambda bg, x, y, scale: bg_manager.update_position(bg, x, y, scale), inputs=[input_bg, x_slider, y_slider, fg_scale_slider], outputs=[input_bg] ) fg_scale_slider.change( fn=lambda bg, x, y, scale: bg_manager.update_position(bg, x, y, scale), inputs=[input_bg, x_slider, y_slider, fg_scale_slider], outputs=[input_bg] ) # Update inputs list to include fg_scale_slider def process_relight_with_position(*args): if mask_mover.extracted_fg is None: gr.Warning("Please extract foreground first") return None background = args[1] # Get background image x_pos = 
float(args[-3]) # x_slider value y_pos = float(args[-2]) # y_slider value scale = float(args[-1]) # fg_scale_slider value # Get original foreground size after scaling fg = Image.fromarray(mask_mover.original_fg) new_width = int(fg.width * scale) new_height = int(fg.height * scale) # Calculate crop region around foreground position crop_x = int(x_pos - new_width/2) crop_y = int(y_pos - new_height/2) crop_width = new_width crop_height = new_height # Add padding for context (20% extra on each side) padding = 0.2 crop_x = int(crop_x - crop_width * padding) crop_y = int(crop_y - crop_height * padding) crop_width = int(crop_width * (1 + 2 * padding)) crop_height = int(crop_height * (1 + 2 * padding)) # Ensure crop dimensions are multiples of 8 crop_width = ((crop_width + 7) // 8) * 8 crop_height = ((crop_height + 7) // 8) * 8 # Ensure crop region is within image bounds bg_height, bg_width = background.shape[:2] crop_x = max(0, min(crop_x, bg_width - crop_width)) crop_y = max(0, min(crop_y, bg_height - crop_height)) # Get actual crop dimensions after boundary check crop_width = min(crop_width, bg_width - crop_x) crop_height = min(crop_height, bg_height - crop_y) # Ensure dimensions are multiples of 8 again crop_width = (crop_width // 8) * 8 crop_height = (crop_height // 8) * 8 # Crop region from background crop_region = background[crop_y:crop_y+crop_height, crop_x:crop_x+crop_width] # Create composite in cropped region fg_local_x = int(new_width/2 + crop_width*padding) fg_local_y = int(new_height/2 + crop_height*padding) cropped_composite = mask_mover.create_composite(crop_region, fg_local_x, fg_local_y, scale) # Process the cropped region crop_args = list(args) crop_args[0] = cropped_composite crop_args[1] = crop_region crop_args[3] = crop_width crop_args[4] = crop_height crop_args = crop_args[:-3] # Remove position and scale arguments # Get relit result relit_crop = process_relight_bg(*crop_args)[0] # Resize relit result to match crop dimensions if needed if 
relit_crop.shape[:2] != (crop_height, crop_width): relit_crop = resize_without_crop(relit_crop, crop_width, crop_height) # Place relit crop back into original background result = background.copy() result[crop_y:crop_y+crop_height, crop_x:crop_x+crop_width] = relit_crop return result ips_bg = [input_fg, input_bg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, bg_source] # Update button click events with new inputs list relight_button_bg.click( fn=process_relight_with_position, inputs=ips_bg, outputs=[result_gallery] ) example_prompts.click( fn=lambda x: x[0], inputs=example_prompts, outputs=prompt, show_progress=False, queue=False ) block.launch(server_name='0.0.0.0', share=False)