import os import random import sys from typing import Sequence, Mapping, Any, Union import torch import gradio as gr from PIL import Image from huggingface_hub import hf_hub_download import spaces import spaces import argparse import random import os import math import gradio as gr import numpy as np import torch import safetensors.torch as sf import datetime from pathlib import Path from io import BytesIO from PIL import Image from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline from diffusers import AutoencoderKL, UNet2DConditionModel, DDIMScheduler, EulerAncestralDiscreteScheduler, DPMSolverMultistepScheduler from diffusers.models.attention_processor import AttnProcessor2_0 from transformers import CLIPTextModel, CLIPTokenizer import dds_cloudapi_sdk from dds_cloudapi_sdk import Config, Client, TextPrompt from dds_cloudapi_sdk.tasks.dinox import DinoxTask from dds_cloudapi_sdk.tasks import DetectionTarget from dds_cloudapi_sdk.tasks.detection import DetectionTask from transformers import AutoModelForImageSegmentation from enum import Enum from torch.hub import download_url_to_file import tempfile from sam2.build_sam import build_sam2 from sam2.sam2_image_predictor import SAM2ImagePredictor import cv2 from transformers import AutoModelForImageSegmentation from inference_i2mv_sdxl import prepare_pipeline, remove_bg, run_pipeline from torchvision import transforms from typing import Optional from depth_anything_v2.dpt import DepthAnythingV2 import httpx import gradio as gr import torch from diffusers import FluxFillPipeline from diffusers.utils import load_image from PIL import Image, ImageDraw import numpy as np import spaces from huggingface_hub import hf_hub_download client = httpx.Client(timeout=httpx.Timeout(10.0)) # Set timeout to 10 seconds NUM_VIEWS = 6 HEIGHT = 768 WIDTH = 768 MAX_SEED = np.iinfo(np.int32).max import supervision as sv import torch from PIL import Image import logging # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s') transform_image = transforms.Compose( [ transforms.Resize((1024, 1024)), transforms.ToTensor(), transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), ] ) #hf_hub_download(repo_id="YarvixPA/FLUX.1-Fill-dev-gguf", filename="flux1-fill-dev-Q5_K_S.gguf", local_dir="models/") fill_pipe = FluxFillPipeline.from_single_file( "https://huggingface.co/SporkySporkness/FLUX.1-Fill-dev-GGUF/flux1-fill-dev-fp16-Q5_0-GGUF.gguf", torch_dtype=torch.bfloat16 ).to("cuda") # Load # Model paths model_path = './models/iclight_sd15_fc.safetensors' model_path2 = './checkpoints/depth_anything_v2_vits.pth' model_path3 = './checkpoints/sam2_hiera_large.pt' model_path4 = './checkpoints/config.json' model_path5 = './checkpoints/preprocessor_config.json' model_path6 = './configs/sam2_hiera_l.yaml' model_path7 = './mvadapter_i2mv_sdxl.safetensors' # Base URL for the repository BASE_URL = 'https://huggingface.co/Ashoka74/Placement/resolve/main/' # Model URLs model_urls = { model_path: 'iclight_sd15_fc.safetensors', model_path2: 'depth_anything_v2_vits.pth', model_path3: 'sam2_hiera_large.pt', model_path4: 'config.json', model_path5: 'preprocessor_config.json', model_path6: 'sam2_hiera_l.yaml', model_path7: 'mvadapter_i2mv_sdxl.safetensors' } # Ensure directories exist def ensure_directories(): for path in model_urls.keys(): os.makedirs(os.path.dirname(path), exist_ok=True) # Download models def download_models(): for local_path, filename in model_urls.items(): if not os.path.exists(local_path): try: url = f"{BASE_URL}{filename}" print(f"Downloading {filename}") download_url_to_file(url, local_path) print(f"Successfully downloaded {filename}") except Exception as e: print(f"Error downloading {filename}: {e}") ensure_directories() download_models() hf_hub_download(repo_id="black-forest-labs/FLUX.1-Redux-dev", filename="flux1-redux-dev.safetensors", local_dir="models/style_models") hf_hub_download(repo_id="black-forest-labs/FLUX.1-Depth-dev", filename="flux1-depth-dev.safetensors", local_dir="models/diffusion_models") hf_hub_download(repo_id="Comfy-Org/sigclip_vision_384", filename="sigclip_vision_patch14_384.safetensors", local_dir="models/clip_vision") hf_hub_download(repo_id="Kijai/DepthAnythingV2-safetensors", filename="depth_anything_v2_vitl_fp32.safetensors", local_dir="models/depthanything") hf_hub_download(repo_id="black-forest-labs/FLUX.1-dev", filename="ae.safetensors", local_dir="models/vae/FLUX1") hf_hub_download(repo_id="comfyanonymous/flux_text_encoders", filename="clip_l.safetensors", local_dir="models/text_encoders") t5_path = hf_hub_download(repo_id="comfyanonymous/flux_text_encoders", filename="t5xxl_fp16.safetensors", local_dir="models/text_encoders/t5") sd15_name = 'stablediffusionapi/realistic-vision-v51' tokenizer = CLIPTokenizer.from_pretrained(sd15_name, subfolder="tokenizer") text_encoder = CLIPTextModel.from_pretrained(sd15_name, subfolder="text_encoder") vae = AutoencoderKL.from_pretrained(sd15_name, subfolder="vae") unet = UNet2DConditionModel.from_pretrained(sd15_name, subfolder="unet") try: import xformers import xformers.ops XFORMERS_AVAILABLE = True print("xformers is available - Using memory efficient attention") except ImportError: XFORMERS_AVAILABLE = False print("xformers not available - Using default attention") # Memory optimizations for RTX 2070 torch.backends.cudnn.benchmark = True if torch.cuda.is_available(): torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True # Set a smaller attention slice size for RTX 2070 torch.backends.cuda.max_split_size_mb = 512 device = torch.device('cuda') else: device = torch.device('cpu') rmbg = AutoModelForImageSegmentation.from_pretrained( "ZhengPeng7/BiRefNet", trust_remote_code=True ) rmbg = rmbg.to(device=device, dtype=torch.float32) model = DepthAnythingV2(encoder='vits', features=64, out_channels=[48, 96, 192, 384]) model.load_state_dict(torch.load('checkpoints/depth_anything_v2_vits.pth', map_location=device)) model = model.to(device) model.eval() with torch.no_grad(): new_conv_in = torch.nn.Conv2d(8, unet.conv_in.out_channels, unet.conv_in.kernel_size, unet.conv_in.stride, unet.conv_in.padding) new_conv_in.weight.zero_() new_conv_in.weight[:, :4, :, :].copy_(unet.conv_in.weight) new_conv_in.bias = unet.conv_in.bias unet.conv_in = new_conv_in unet_original_forward = unet.forward def can_expand(source_width, source_height, target_width, target_height, alignment): if alignment in ("Left", "Right") and source_width >= target_width: return False if alignment in ("Top", "Bottom") and source_height >= target_height: return False return True def prepare_image_and_mask(image, width, height, overlap_percentage, resize_option, custom_resize_percentage, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom): target_size = (width, height) scale_factor = min(target_size[0] / image.width, target_size[1] / image.height) new_width = int(image.width * scale_factor) new_height = int(image.height * scale_factor) source = image.resize((new_width, new_height), Image.LANCZOS) if resize_option == "Full": resize_percentage = 100 elif resize_option == "75%": resize_percentage = 75 elif resize_option == "50%": resize_percentage = 50 elif resize_option == "33%": resize_percentage = 33 elif resize_option == "25%": resize_percentage = 25 else: # Custom resize_percentage = custom_resize_percentage # Calculate new dimensions based on percentage resize_factor = resize_percentage / 100 new_width = int(source.width * resize_factor) new_height = int(source.height * resize_factor) # Ensure minimum size of 64 pixels new_width = max(new_width, 64) new_height = max(new_height, 64) # Resize the image source = source.resize((new_width, new_height), Image.LANCZOS) # Calculate the overlap in pixels based on the percentage overlap_x = int(new_width * (overlap_percentage / 100)) overlap_y = int(new_height * (overlap_percentage / 100)) # Ensure minimum overlap of 1 pixel overlap_x = max(overlap_x, 1) overlap_y = max(overlap_y, 1) # Calculate margins based on alignment if alignment == "Middle": margin_x = (target_size[0] - new_width) // 2 margin_y = (target_size[1] - new_height) // 2 elif alignment == "Left": margin_x = 0 margin_y = (target_size[1] - new_height) // 2 elif alignment == "Right": margin_x = target_size[0] - new_width margin_y = (target_size[1] - new_height) // 2 elif alignment == "Top": margin_x = (target_size[0] - new_width) // 2 margin_y = 0 elif alignment == "Bottom": margin_x = (target_size[0] - new_width) // 2 margin_y = target_size[1] - new_height # Adjust margins to eliminate gaps margin_x = max(0, min(margin_x, target_size[0] - new_width)) margin_y = max(0, min(margin_y, target_size[1] - new_height)) # Create a new background image and paste the resized source image background = Image.new('RGB', target_size, (255, 255, 255)) background.paste(source, (margin_x, margin_y)) # Create the mask mask = Image.new('L', target_size, 255) mask_draw = ImageDraw.Draw(mask) # Calculate overlap areas white_gaps_patch = 2 left_overlap = margin_x + overlap_x if overlap_left else margin_x + white_gaps_patch right_overlap = margin_x + new_width - overlap_x if overlap_right else margin_x + new_width - white_gaps_patch top_overlap = margin_y + overlap_y if overlap_top else margin_y + white_gaps_patch bottom_overlap = margin_y + new_height - overlap_y if overlap_bottom else margin_y + new_height - white_gaps_patch if alignment == "Left": left_overlap = margin_x + overlap_x if overlap_left else margin_x elif alignment == "Right": right_overlap = margin_x + new_width - overlap_x if overlap_right else margin_x + new_width elif alignment == "Top": top_overlap = margin_y + overlap_y if overlap_top else margin_y elif alignment == "Bottom": bottom_overlap = margin_y + new_height - overlap_y if overlap_bottom else margin_y + new_height # Draw the mask mask_draw.rectangle([ (left_overlap, top_overlap), (right_overlap, bottom_overlap) ], fill=0) return background, mask @spaces.GPU def inpaint(image, width, height, overlap_percentage, num_inference_steps, resize_option, custom_resize_percentage, prompt_input, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom, progress=gr.Progress(track_tqdm=True)): background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, resize_option, custom_resize_percentage, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom) if not can_expand(background.width, background.height, width, height, alignment): alignment = "Middle" cnet_image = background.copy() cnet_image.paste(0, (0, 0), mask) final_prompt = prompt_input #generator = torch.Generator(device="cuda").manual_seed(42) result = fill_pipe( prompt=final_prompt, height=height, width=width, image=cnet_image, mask_image=mask, num_inference_steps=num_inference_steps, guidance_scale=30, ).images[0] result = result.convert("RGBA") cnet_image.paste(result, (0, 0), mask) return cnet_image, background def preview_image_and_mask(image, width, height, overlap_percentage, resize_option, custom_resize_percentage, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom): background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, resize_option, custom_resize_percentage, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom) preview = background.copy().convert('RGBA') red_overlay = Image.new('RGBA', background.size, (255, 0, 0, 64)) red_mask = Image.new('RGBA', background.size, (0, 0, 0, 0)) red_mask.paste(red_overlay, (0, 0), mask) preview = Image.alpha_composite(preview, red_mask) return preview def clear_result(): return gr.update(value=None) def preload_presets(target_ratio, ui_width, ui_height): if target_ratio == "9:16": return 720, 1280, gr.update() elif target_ratio == "16:9": return 1280, 720, gr.update() elif target_ratio == "1:1": return 1024, 1024, gr.update() elif target_ratio == "Custom": return ui_width, ui_height, gr.update(open=True) def select_the_right_preset(user_width, user_height): if user_width == 720 and user_height == 1280: return "9:16" elif user_width == 1280 and user_height == 720: return "16:9" elif user_width == 1024 and user_height == 1024: return "1:1" else: return "Custom" def toggle_custom_resize_slider(resize_option): return gr.update(visible=(resize_option == "Custom")) def update_history(new_image, history): if history is None: history = [] history.insert(0, new_image) return history def enable_efficient_attention(): if XFORMERS_AVAILABLE: try: # RTX 2070 specific settings unet.set_use_memory_efficient_attention_xformers(True) vae.set_use_memory_efficient_attention_xformers(True) print("Enabled xformers memory efficient attention") except Exception as e: print(f"Xformers error: {e}") print("Falling back to sliced attention") # Use sliced attention for RTX 2070 # unet.set_attention_slice_size(4) # vae.set_attention_slice_size(4) unet.set_attn_processor(AttnProcessor2_0()) vae.set_attn_processor(AttnProcessor2_0()) else: # Fallback for when xformers is not available print("Using sliced attention") # unet.set_attention_slice_size(4) # vae.set_attention_slice_size(4) unet.set_attn_processor(AttnProcessor2_0()) vae.set_attn_processor(AttnProcessor2_0()) # Add memory clearing function def clear_memory(): if torch.cuda.is_available(): torch.cuda.empty_cache() torch.cuda.synchronize() # Enable efficient attention enable_efficient_attention() def hooked_unet_forward(sample, timestep, encoder_hidden_states, **kwargs): c_concat = kwargs['cross_attention_kwargs']['concat_conds'].to(sample) c_concat = torch.cat([c_concat] * (sample.shape[0] // c_concat.shape[0]), dim=0) new_sample = torch.cat([sample, c_concat], dim=1) kwargs['cross_attention_kwargs'] = {} return unet_original_forward(new_sample, timestep, encoder_hidden_states, **kwargs) unet.forward = hooked_unet_forward sd_offset = sf.load_file(model_path) sd_origin = unet.state_dict() keys = sd_origin.keys() sd_merged = {k: sd_origin[k] + sd_offset[k] for k in sd_origin.keys()} unet.load_state_dict(sd_merged, strict=True) del sd_offset, sd_origin, sd_merged, keys # Device and dtype setup device = torch.device('cuda') #dtype = torch.float16 # RTX 2070 works well with float16 dtype = torch.bfloat16 pipe = prepare_pipeline( base_model="stabilityai/stable-diffusion-xl-base-1.0", vae_model="madebyollin/sdxl-vae-fp16-fix", unet_model=None, lora_model=None, adapter_path="huanngzh/mv-adapter", scheduler=None, num_views=NUM_VIEWS, device=device, dtype=dtype, ) # Move models to device with consistent dtype text_encoder = text_encoder.to(device=device, dtype=dtype) vae = vae.to(device=device, dtype=dtype) # Changed from bfloat16 to float16 unet = unet.to(device=device, dtype=dtype) #rmbg = rmbg.to(device=device, dtype=torch.float32) # Keep this as float32 rmbg = rmbg.to(device) ddim_scheduler = DDIMScheduler( num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False, steps_offset=1, ) euler_a_scheduler = EulerAncestralDiscreteScheduler( num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012, steps_offset=1 ) dpmpp_2m_sde_karras_scheduler = DPMSolverMultistepScheduler( num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012, algorithm_type="sde-dpmsolver++", use_karras_sigmas=True, steps_offset=1 ) # Pipelines t2i_pipe = StableDiffusionPipeline( vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=dpmpp_2m_sde_karras_scheduler, safety_checker=None, requires_safety_checker=False, feature_extractor=None, image_encoder=None ) i2i_pipe = StableDiffusionImg2ImgPipeline( vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=dpmpp_2m_sde_karras_scheduler, safety_checker=None, requires_safety_checker=False, feature_extractor=None, image_encoder=None ) @torch.inference_mode() def encode_prompt_inner(txt: str): max_length = tokenizer.model_max_length chunk_length = tokenizer.model_max_length - 2 id_start = tokenizer.bos_token_id id_end = tokenizer.eos_token_id id_pad = id_end def pad(x, p, i): return x[:i] if len(x) >= i else x + [p] * (i - len(x)) tokens = tokenizer(txt, truncation=False, add_special_tokens=False)["input_ids"] chunks = [[id_start] + tokens[i: i + chunk_length] + [id_end] for i in range(0, len(tokens), chunk_length)] chunks = [pad(ck, id_pad, max_length) for ck in chunks] token_ids = torch.tensor(chunks).to(device=device, dtype=torch.int64) conds = text_encoder(token_ids).last_hidden_state return conds @torch.inference_mode() def encode_prompt_pair(positive_prompt, negative_prompt): c = encode_prompt_inner(positive_prompt) uc = encode_prompt_inner(negative_prompt) c_len = float(len(c)) uc_len = float(len(uc)) max_count = max(c_len, uc_len) c_repeat = int(math.ceil(max_count / c_len)) uc_repeat = int(math.ceil(max_count / uc_len)) max_chunk = max(len(c), len(uc)) c = torch.cat([c] * c_repeat, dim=0)[:max_chunk] uc = torch.cat([uc] * uc_repeat, dim=0)[:max_chunk] c = torch.cat([p[None, ...] for p in c], dim=1) uc = torch.cat([p[None, ...] for p in uc], dim=1) return c, uc # @spaces.GPU(duration=60) # @torch.inference_mode() @spaces.GPU(duration=60) @torch.inference_mode() def infer( prompt, image, # This is already RGBA with background removed do_rembg=True, seed=42, randomize_seed=False, guidance_scale=3.0, num_inference_steps=50, reference_conditioning_scale=1.0, negative_prompt="watermark, ugly, deformed, noisy, blurry, low contrast", progress=gr.Progress(track_tqdm=True), ): #logging.info(f"Input image shape: {image.shape}, dtype: {image.dtype}") # Convert input to PIL if needed if isinstance(image, np.ndarray): if image.shape[-1] == 4: # RGBA image = Image.fromarray(image, 'RGBA') else: # RGB image = Image.fromarray(image, 'RGB') #logging.info(f"Converted to PIL Image mode: {image.mode}") # No need for remove_bg_fn since image is already processed remove_bg_fn = None if randomize_seed: seed = random.randint(0, MAX_SEED) images, preprocessed_image = run_pipeline( pipe, num_views=NUM_VIEWS, text=prompt, image=image, height=HEIGHT, width=WIDTH, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, seed=seed, remove_bg_fn=remove_bg_fn, # Set to None since preprocessing is done reference_conditioning_scale=reference_conditioning_scale, negative_prompt=negative_prompt, device=device, ) # logging.info(f"Output images shape: {[img.shape for img in images]}") # logging.info(f"Preprocessed image shape: {preprocessed_image.shape if preprocessed_image is not None else None}") return images @spaces.GPU(duration=60) @torch.inference_mode() def pytorch2numpy(imgs, quant=True): results = [] for x in imgs: y = x.movedim(0, -1) if quant: y = y * 127.5 + 127.5 y = y.detach().float().cpu().numpy().clip(0, 255).astype(np.uint8) else: y = y * 0.5 + 0.5 y = y.detach().float().cpu().numpy().clip(0, 1).astype(np.float32) results.append(y) return results @spaces.GPU(duration=60) @torch.inference_mode() def numpy2pytorch(imgs): h = torch.from_numpy(np.stack(imgs, axis=0)).float() / 127.0 - 1.0 # so that 127 must be strictly 0.0 h = h.movedim(-1, 1) return h def resize_and_center_crop(image, target_width, target_height): pil_image = Image.fromarray(image) original_width, original_height = pil_image.size scale_factor = max(target_width / original_width, target_height / original_height) resized_width = int(round(original_width * scale_factor)) resized_height = int(round(original_height * scale_factor)) resized_image = pil_image.resize((resized_width, resized_height), Image.LANCZOS) left = (resized_width - target_width) / 2 top = (resized_height - target_height) / 2 right = (resized_width + target_width) / 2 bottom = (resized_height + target_height) / 2 cropped_image = resized_image.crop((left, top, right, bottom)) return np.array(cropped_image) def resize_without_crop(image, target_width, target_height): pil_image = Image.fromarray(image) resized_image = pil_image.resize((target_width, target_height), Image.LANCZOS) return np.array(resized_image) # @spaces.GPU(duration=60) # @torch.inference_mode() # def run_rmbg(img, sigma=0.0): # # Convert RGBA to RGB if needed # if img.shape[-1] == 4: # # Use white background for alpha composition # alpha = img[..., 3:] / 255.0 # rgb = img[..., :3] # white_bg = np.ones_like(rgb) * 255 # img = (rgb * alpha + white_bg * (1 - alpha)).astype(np.uint8) # H, W, C = img.shape # assert C == 3 # k = (256.0 / float(H * W)) ** 0.5 # feed = resize_without_crop(img, int(64 * round(W * k)), int(64 * round(H * k))) # feed = numpy2pytorch([feed]).to(device=device, dtype=torch.float32) # alpha = rmbg(feed)[0][0] # alpha = torch.nn.functional.interpolate(alpha, size=(H, W), mode="bilinear") # alpha = alpha.movedim(1, -1)[0] # alpha = alpha.detach().float().cpu().numpy().clip(0, 1) # # Create RGBA image # rgba = np.dstack((img, alpha * 255)).astype(np.uint8) # result = 127 + (img.astype(np.float32) - 127 + sigma) * alpha # return result.clip(0, 255).astype(np.uint8), rgba @spaces.GPU @torch.inference_mode() def run_rmbg(image): image_size = image.size input_images = transform_image(image).unsqueeze(0).to("cuda") # Prediction with torch.no_grad(): preds = rmbg(input_images)[-1].sigmoid().cpu() pred = preds[0].squeeze() pred_pil = transforms.ToPILImage()(pred) mask = pred_pil.resize(image_size) image.putalpha(mask) return image def preprocess_image(image: Image.Image, height=768, width=768): image = np.array(image) alpha = image[..., 3] > 0 H, W = alpha.shape # get the bounding box of alpha y, x = np.where(alpha) y0, y1 = max(y.min() - 1, 0), min(y.max() + 1, H) x0, x1 = max(x.min() - 1, 0), min(x.max() + 1, W) image_center = image[y0:y1, x0:x1] # resize the longer side to H * 0.9 H, W, _ = image_center.shape if H > W: W = int(W * (height * 0.9) / H) H = int(height * 0.9) else: H = int(H * (width * 0.9) / W) W = int(width * 0.9) image_center = np.array(Image.fromarray(image_center).resize((W, H))) # pad to H, W start_h = (height - H) // 2 start_w = (width - W) // 2 image = np.zeros((height, width, 4), dtype=np.uint8) image[start_h : start_h + H, start_w : start_w + W] = image_center image = image.astype(np.float32) / 255.0 image = image[:, :, :3] * image[:, :, 3:4] + (1 - image[:, :, 3:4]) * 0.5 image = (image * 255).clip(0, 255).astype(np.uint8) image = Image.fromarray(image) return image @spaces.GPU(duration=60) @torch.inference_mode() def process(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source): clear_memory() # Get input dimensions input_height, input_width = input_fg.shape[:2] bg_source = BGSource(bg_source) if bg_source == BGSource.UPLOAD: pass elif bg_source == BGSource.UPLOAD_FLIP: input_bg = np.fliplr(input_bg) if bg_source == BGSource.GREY: input_bg = np.zeros(shape=(input_height, input_width, 3), dtype=np.uint8) + 64 elif bg_source == BGSource.LEFT: gradient = np.linspace(255, 0, input_width) image = np.tile(gradient, (input_height, 1)) input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8) elif bg_source == BGSource.RIGHT: gradient = np.linspace(0, 255, input_width) image = np.tile(gradient, (input_height, 1)) input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8) elif bg_source == BGSource.TOP: gradient = np.linspace(255, 0, input_height)[:, None] image = np.tile(gradient, (1, input_width)) input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8) elif bg_source == BGSource.BOTTOM: gradient = np.linspace(0, 255, input_height)[:, None] image = np.tile(gradient, (1, input_width)) input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8) else: raise 'Wrong initial latent!' rng = torch.Generator(device=device).manual_seed(int(seed)) # Use input dimensions directly fg = resize_without_crop(input_fg, input_width, input_height) concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype) concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor conds, unconds = encode_prompt_pair(positive_prompt=prompt + ', ' + a_prompt, negative_prompt=n_prompt) if input_bg is None: latents = t2i_pipe( prompt_embeds=conds, negative_prompt_embeds=unconds, width=input_width, height=input_height, num_inference_steps=steps, num_images_per_prompt=num_samples, generator=rng, output_type='latent', guidance_scale=cfg, cross_attention_kwargs={'concat_conds': concat_conds}, ).images.to(vae.dtype) / vae.config.scaling_factor else: bg = resize_without_crop(input_bg, input_width, input_height) bg_latent = numpy2pytorch([bg]).to(device=vae.device, dtype=vae.dtype) bg_latent = vae.encode(bg_latent).latent_dist.mode() * vae.config.scaling_factor latents = i2i_pipe( image=bg_latent, strength=lowres_denoise, prompt_embeds=conds, negative_prompt_embeds=unconds, width=input_width, height=input_height, num_inference_steps=int(round(steps / lowres_denoise)), num_images_per_prompt=num_samples, generator=rng, output_type='latent', guidance_scale=cfg, cross_attention_kwargs={'concat_conds': concat_conds}, ).images.to(vae.dtype) / vae.config.scaling_factor pixels = vae.decode(latents).sample pixels = pytorch2numpy(pixels) pixels = [resize_without_crop( image=p, target_width=int(round(input_width * highres_scale / 64.0) * 64), target_height=int(round(input_height * highres_scale / 64.0) * 64)) for p in pixels] pixels = numpy2pytorch(pixels).to(device=vae.device, dtype=vae.dtype) latents = vae.encode(pixels).latent_dist.mode() * vae.config.scaling_factor latents = latents.to(device=unet.device, dtype=unet.dtype) highres_height, highres_width = latents.shape[2] * 8, latents.shape[3] * 8 fg = resize_without_crop(input_fg, highres_width, highres_height) concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype) concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor latents = i2i_pipe( image=latents, strength=highres_denoise, prompt_embeds=conds, negative_prompt_embeds=unconds, width=highres_width, height=highres_height, num_inference_steps=int(round(steps / highres_denoise)), num_images_per_prompt=num_samples, generator=rng, output_type='latent', guidance_scale=cfg, cross_attention_kwargs={'concat_conds': concat_conds}, ).images.to(vae.dtype) / vae.config.scaling_factor pixels = vae.decode(latents).sample pixels = pytorch2numpy(pixels) # Resize back to input dimensions pixels = [resize_without_crop(p, input_width, input_height) for p in pixels] pixels = np.stack(pixels) return pixels def extract_foreground(image): if image is None: return None, gr.update(visible=True), gr.update(visible=True) #logging.info(f"Input image shape: {image.shape}, dtype: {image.dtype}") #result, rgba = run_rmbg(image) result = run_rmbg(image) result = preprocess_image(result) #logging.info(f"Result shape: {result.shape}, dtype: {result.dtype}") #logging.info(f"RGBA shape: {rgba.shape}, dtype: {rgba.dtype}") return result, gr.update(visible=True), gr.update(visible=True) def update_extracted_fg_height(selected_image: gr.SelectData): if selected_image: # Get the height of the selected image height = selected_image.value['image']['shape'][0] # Assuming the image is in numpy format return gr.update(height=height) # Update the height of extracted_fg return gr.update(height=480) # Default height if no image is selected @torch.inference_mode() def process_relight(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source): # Convert input foreground from PIL to NumPy array if it's in PIL format if isinstance(input_fg, Image.Image): input_fg = np.array(input_fg) logging.info(f"Input foreground shape: {input_fg.shape}, dtype: {input_fg.dtype}") results = process(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source) logging.info(f"Results shape: {results.shape}, dtype: {results.dtype}") return results quick_prompts = [ 'sunshine from window', 'golden time', 'natural lighting', 'warm atmosphere, at home, bedroom', 'shadow from window', 'soft studio lighting', 'home atmosphere, cozy bedroom illumination', ] quick_prompts = [[x] for x in quick_prompts] quick_subjects = [ 'modern sofa, high quality leather', 'elegant dining table, polished wood', 'luxurious bed, premium mattress', 'minimalist office desk, clean design', 'vintage wooden cabinet, antique finish', ] quick_subjects = [[x] for x in quick_subjects] class BGSource(Enum): UPLOAD = "Use Background Image" UPLOAD_FLIP = "Use Flipped Background Image" NONE = "None" LEFT = "Left Light" RIGHT = "Right Light" TOP = "Top Light" BOTTOM = "Bottom Light" GREY = "Ambient" # Add save function def save_images(images, prefix="relight"): # Create output directory if it doesn't exist output_dir = Path("outputs") output_dir.mkdir(exist_ok=True) # Create timestamp for unique filenames timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") saved_paths = [] for i, img in enumerate(images): if isinstance(img, np.ndarray): # Convert to PIL Image if numpy array img = Image.fromarray(img) # Create filename with timestamp filename = f"{prefix}_{timestamp}_{i+1}.png" filepath = output_dir / filename # Save image img.save(filepath) # print(f"Saved {len(saved_paths)} images to {output_dir}") return saved_paths class MaskMover: def __init__(self): self.extracted_fg = None self.original_fg = None # Store original foreground def set_extracted_fg(self, fg_image): """Store the extracted foreground with alpha channel""" if isinstance(fg_image, np.ndarray): self.extracted_fg = fg_image.copy() self.original_fg = fg_image.copy() else: self.extracted_fg = np.array(fg_image) self.original_fg = np.array(fg_image) return self.extracted_fg def create_composite(self, background, x_pos, y_pos, scale=1.0): """Create composite with foreground at specified position""" if self.original_fg is None or background is None: return background # Convert inputs to PIL Images if isinstance(background, np.ndarray): bg = Image.fromarray(background).convert('RGBA') else: bg = background.convert('RGBA') if isinstance(self.original_fg, np.ndarray): fg = Image.fromarray(self.original_fg).convert('RGBA') else: fg = self.original_fg.convert('RGBA') # Scale the foreground size new_width = int(fg.width * scale) new_height = int(fg.height * scale) fg = fg.resize((new_width, new_height), Image.LANCZOS) # Center the scaled foreground at the position x = int(x_pos - new_width / 2) y = int(y_pos - new_height / 2) # Create composite result = bg.copy() result.paste(fg, (x, y), fg) # Use fg as the mask (requires fg to be in 'RGBA' mode) return np.array(result.convert('RGB')) # Convert back to 'RGB' if needed def get_depth(image): if image is None: return None # Convert from PIL/gradio format to cv2 raw_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) # Get depth map depth = model.infer_image(raw_img) # HxW raw depth map # Normalize depth for visualization depth = ((depth - depth.min()) / (depth.max() - depth.min()) * 255).astype(np.uint8) # Convert to RGB for display depth_colored = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO) depth_colored = cv2.cvtColor(depth_colored, cv2.COLOR_BGR2RGB) return Image.fromarray(depth_colored) from PIL import Image def compress_image(image): # Convert Gradio image (numpy array) to PIL Image img = Image.fromarray(image) # Resize image if dimensions are too large max_size = 1024 # Maximum dimension size if img.width > max_size or img.height > max_size: ratio = min(max_size/img.width, max_size/img.height) new_size = (int(img.width * ratio), int(img.height * ratio)) img = img.resize(new_size, Image.Resampling.LANCZOS) quality = 95 # Start with high quality img.save("compressed_image.jpg", "JPEG", quality=quality) # Initial save # Check file size and adjust quality if necessary while os.path.getsize("compressed_image.jpg") > 100 * 1024: # 100KB limit quality -= 5 # Decrease quality img.save("compressed_image.jpg", "JPEG", quality=quality) if quality < 20: # Prevent quality from going too low break # Convert back to numpy array for Gradio compressed_img = np.array(Image.open("compressed_image.jpg")) return compressed_img def use_orientation(selected_image:gr.SelectData): return selected_image.value['image']['path'] @spaces.GPU(duration=60) @torch.inference_mode def process_image(input_image, input_text): """Main processing function for the Gradio interface""" if isinstance(input_image, Image.Image): input_image = np.array(input_image) # Initialize configs API_TOKEN = "9c8c865e10ec1821bea79d9fa9dc8720" SAM2_CHECKPOINT = "./checkpoints/sam2_hiera_large.pt" SAM2_MODEL_CONFIG = os.path.join(os.path.dirname(os.path.abspath(__file__)), "configs/sam2_hiera_l.yaml") DEVICE = "cuda" if torch.cuda.is_available() else "cpu" OUTPUT_DIR = Path("outputs/grounded_sam2_dinox_demo") OUTPUT_DIR.mkdir(parents=True, exist_ok=True) HEIGHT = 768 WIDTH = 768 # Initialize DDS client config = Config(API_TOKEN) client = Client(config) # Process classes from text prompt classes = [x.strip().lower() for x in input_text.split('.') if x] class_name_to_id = {name: id for id, name in enumerate(classes)} class_id_to_name = {id: name for name, id in class_name_to_id.items()} # Save input image to temp file and get URL with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmpfile: cv2.imwrite(tmpfile.name, input_image) image_url = client.upload_file(tmpfile.name) os.remove(tmpfile.name) # Process detection results input_boxes = [] masks = [] confidences = [] class_names = [] class_ids = [] if len(input_text) == 0: task = DinoxTask( image_url=image_url, prompts=[TextPrompt(text="")], # targets=[DetectionTarget.BBox, DetectionTarget.Mask] ) client.run_task(task) predictions = task.result.objects classes = [pred.category for pred in predictions] classes = list(set(classes)) class_name_to_id = {name: id for id, name in enumerate(classes)} class_id_to_name = {id: name for name, id in class_name_to_id.items()} for idx, obj in enumerate(predictions): input_boxes.append(obj.bbox) masks.append(DetectionTask.rle2mask(DetectionTask.string2rle(obj.mask.counts), obj.mask.size)) # convert mask to np.array using DDS API confidences.append(obj.score) cls_name = obj.category.lower().strip() class_names.append(cls_name) class_ids.append(class_name_to_id[cls_name]) boxes = np.array(input_boxes) masks = np.array(masks) class_ids = np.array(class_ids) labels = [ f"{class_name} {confidence:.2f}" for class_name, confidence in zip(class_names, confidences) ] detections = sv.Detections( xyxy=boxes, mask=masks.astype(bool), class_id=class_ids ) box_annotator = sv.BoxAnnotator() label_annotator = sv.LabelAnnotator() mask_annotator = sv.MaskAnnotator() annotated_frame = input_image.copy() annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections) annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels) annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections) # Create transparent mask for first detected object if len(detections) > 0: # Get first mask first_mask = detections.mask[0] # Get original RGB image img = input_image.copy() H, W, C = img.shape # Create RGBA image with default 255 alpha alpha = np.zeros((H, W, 1), dtype=np.uint8) alpha[~first_mask] = 0 # 128 # for semi-transparency background alpha[first_mask] = 255 # Make the foreground opaque alpha = alpha.squeeze(-1) # Remove singleton dimension to become 2D rgba = np.dstack((img, alpha)).astype(np.uint8) # get the bounding box of alpha y, x = np.where(alpha > 0) y0, y1 = max(y.min() - 1, 0), min(y.max() + 1, H) x0, x1 = max(x.min() - 1, 0), min(x.max() + 1, W) image_center = rgba[y0:y1, x0:x1] # resize the longer side to H * 0.9 H, W, _ = image_center.shape if H > W: W = int(W * (HEIGHT * 0.9) / H) H = int(HEIGHT * 0.9) else: H = int(H * (WIDTH * 0.9) / W) W = int(WIDTH * 0.9) image_center = np.array(Image.fromarray(image_center).resize((W, H), Image.LANCZOS)) # pad to H, W start_h = (HEIGHT - H) // 2 start_w = (WIDTH - W) // 2 image = np.zeros((HEIGHT, WIDTH, 4), dtype=np.uint8) image[start_h : start_h + H, start_w : start_w + W] = image_center image = image.astype(np.float32) / 255.0 image = image[:, :, :3] * image[:, :, 3:4] + (1 - image[:, :, 3:4]) * 0.5 image = (image * 255).clip(0, 255).astype(np.uint8) image = Image.fromarray(image) return annotated_frame, image, gr.update(visible=False), gr.update(visible=False) return annotated_frame, None, gr.update(visible=False), gr.update(visible=False) else: # Run DINO-X detection task = DinoxTask( image_url=image_url, prompts=[TextPrompt(text=input_text)], targets=[DetectionTarget.BBox, DetectionTarget.Mask] ) client.run_task(task) result = task.result objects = result.objects predictions = task.result.objects classes = [x.strip().lower() for x in input_text.split('.') if x] class_name_to_id = {name: id for id, name in enumerate(classes)} class_id_to_name = {id: name for name, id in class_name_to_id.items()} boxes = [] masks = [] confidences = [] class_names = [] class_ids = [] for idx, obj in enumerate(predictions): boxes.append(obj.bbox) masks.append(DetectionTask.rle2mask(DetectionTask.string2rle(obj.mask.counts), obj.mask.size)) # convert mask to np.array using DDS API confidences.append(obj.score) cls_name = obj.category.lower().strip() class_names.append(cls_name) class_ids.append(class_name_to_id[cls_name]) boxes = np.array(boxes) masks = np.array(masks) class_ids = np.array(class_ids) labels = [ f"{class_name} {confidence:.2f}" for class_name, confidence in zip(class_names, confidences) ] detections = sv.Detections( xyxy=boxes, mask=masks.astype(bool), class_id=class_ids, ) box_annotator = sv.BoxAnnotator() label_annotator = sv.LabelAnnotator() mask_annotator = sv.MaskAnnotator() annotated_frame = input_image.copy() annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections) annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels) annotated_frame = mask_annotator.annotate(scene=annotated_frame, detections=detections) # Create transparent mask for first detected object if len(detections) > 0: # Get first mask first_mask = detections.mask[0] # Get original RGB image img = input_image.copy() H, W, C = img.shape # Create RGBA image with default 255 alpha alpha = np.zeros((H, W, 1), dtype=np.uint8) alpha[~first_mask] = 0 # 128 for semi-transparency background alpha[first_mask] = 255 # Make the foreground opaque alpha = alpha.squeeze(-1) # Remove singleton dimension to become 2D rgba = np.dstack((img, alpha)).astype(np.uint8) # get the bounding box of alpha y, x = np.where(alpha > 0) y0, y1 = max(y.min() - 1, 0), min(y.max() + 1, H) x0, x1 = max(x.min() - 1, 0), min(x.max() + 1, W) image_center = rgba[y0:y1, x0:x1] # resize the longer side to H * 0.9 H, W, _ = image_center.shape if H > W: W = int(W * (HEIGHT * 0.9) / H) H = int(HEIGHT * 0.9) else: H = int(H * (WIDTH * 0.9) / W) W = int(WIDTH * 0.9) image_center = np.array(Image.fromarray(image_center).resize((W, H), Image.LANCZOS)) # pad to H, W start_h = (HEIGHT - H) // 2 start_w = (WIDTH - W) // 2 image = np.zeros((HEIGHT, WIDTH, 4), dtype=np.uint8) image[start_h : start_h + H, start_w : start_w + W] = image_center image = image.astype(np.float32) / 255.0 image = image[:, :, :3] * image[:, :, 3:4] + (1 - image[:, :, 3:4]) * 0.5 image = (image * 255).clip(0, 255).astype(np.uint8) image = Image.fromarray(image) return annotated_frame, image, gr.update(visible=False), gr.update(visible=False) return annotated_frame, None, gr.update(visible=False), gr.update(visible=False) # Import all the necessary functions from the original script def get_value_at_index(obj: Union[Sequence, Mapping], index: int) -> Any: try: return obj[index] except KeyError: return obj["result"][index] # Add all the necessary setup functions from the original script def find_path(name: str, path: str = None) -> str: if path is None: path = os.getcwd() if name in os.listdir(path): path_name = os.path.join(path, name) print(f"{name} found: {path_name}") return path_name parent_directory = os.path.dirname(path) if parent_directory == path: return None return find_path(name, parent_directory) def add_comfyui_directory_to_sys_path() -> None: comfyui_path = find_path("ComfyUI") if comfyui_path is not None and os.path.isdir(comfyui_path): sys.path.append(comfyui_path) print(f"'{comfyui_path}' added to sys.path") def add_extra_model_paths() -> None: try: from main import load_extra_path_config except ImportError: from utils.extra_config import load_extra_path_config extra_model_paths = find_path("extra_model_paths.yaml") if extra_model_paths is not None: load_extra_path_config(extra_model_paths) else: print("Could not find the extra_model_paths config file.") # Initialize paths add_comfyui_directory_to_sys_path() add_extra_model_paths() def import_custom_nodes() -> None: import asyncio import execution from nodes import init_extra_nodes import server loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) server_instance = server.PromptServer(loop) execution.PromptQueue(server_instance) init_extra_nodes() # Import all necessary nodes from nodes import ( StyleModelLoader, VAEEncode, NODE_CLASS_MAPPINGS, LoadImage, CLIPVisionLoader, SaveImage, VAELoader, CLIPVisionEncode, DualCLIPLoader, EmptyLatentImage, VAEDecode, UNETLoader, CLIPTextEncode, ) # Initialize all constant nodes and models in global context import_custom_nodes() # Global variables for preloaded models and constants #with torch.inference_mode(): # Initialize constants intconstant = NODE_CLASS_MAPPINGS["INTConstant"]() CONST_1024 = intconstant.get_value(value=1024) # Load CLIP dualcliploader = DualCLIPLoader() CLIP_MODEL = dualcliploader.load_clip( clip_name1="t5/t5xxl_fp16.safetensors", clip_name2="clip_l.safetensors", type="flux", ) # Load VAE vaeloader = VAELoader() VAE_MODEL = vaeloader.load_vae(vae_name="FLUX1/ae.safetensors") # Load UNET unetloader = UNETLoader() UNET_MODEL = unetloader.load_unet( unet_name="flux1-depth-dev.safetensors", weight_dtype="default" ) # Load CLIP Vision clipvisionloader = CLIPVisionLoader() CLIP_VISION_MODEL = clipvisionloader.load_clip( clip_name="sigclip_vision_patch14_384.safetensors" ) # Load Style Model stylemodelloader = StyleModelLoader() STYLE_MODEL = stylemodelloader.load_style_model( style_model_name="flux1-redux-dev.safetensors" ) # Initialize samplers ksamplerselect = NODE_CLASS_MAPPINGS["KSamplerSelect"]() SAMPLER = ksamplerselect.get_sampler(sampler_name="euler") # Initialize depth model cr_clip_input_switch = NODE_CLASS_MAPPINGS["CR Clip Input Switch"]() downloadandloaddepthanythingv2model = NODE_CLASS_MAPPINGS["DownloadAndLoadDepthAnythingV2Model"]() DEPTH_MODEL = downloadandloaddepthanythingv2model.loadmodel( model="depth_anything_v2_vitl_fp32.safetensors" ) cliptextencode = CLIPTextEncode() loadimage = LoadImage() vaeencode = VAEEncode() fluxguidance = NODE_CLASS_MAPPINGS["FluxGuidance"]() instructpixtopixconditioning = NODE_CLASS_MAPPINGS["InstructPixToPixConditioning"]() clipvisionencode = CLIPVisionEncode() stylemodelapplyadvanced = NODE_CLASS_MAPPINGS["StyleModelApplyAdvanced"]() emptylatentimage = EmptyLatentImage() basicguider = NODE_CLASS_MAPPINGS["BasicGuider"]() basicscheduler = NODE_CLASS_MAPPINGS["BasicScheduler"]() randomnoise = NODE_CLASS_MAPPINGS["RandomNoise"]() samplercustomadvanced = NODE_CLASS_MAPPINGS["SamplerCustomAdvanced"]() vaedecode = VAEDecode() cr_text = NODE_CLASS_MAPPINGS["CR Text"]() saveimage = SaveImage() getimagesizeandcount = NODE_CLASS_MAPPINGS["GetImageSizeAndCount"]() depthanything_v2 = NODE_CLASS_MAPPINGS["DepthAnything_V2"]() imageresize = NODE_CLASS_MAPPINGS["ImageResize+"]() @spaces.GPU def generate_image(prompt, structure_image, style_image, depth_strength=15, style_strength=0.5, progress=gr.Progress(track_tqdm=True)) -> str: """Main generation function that processes inputs and returns the path to the generated image.""" with torch.inference_mode(): # Set up CLIP clip_switch = cr_clip_input_switch.switch( Input=1, clip1=get_value_at_index(CLIP_MODEL, 0), clip2=get_value_at_index(CLIP_MODEL, 0), ) # Encode text text_encoded = cliptextencode.encode( text=prompt, clip=get_value_at_index(clip_switch, 0), ) empty_text = cliptextencode.encode( text="", clip=get_value_at_index(clip_switch, 0), ) # Process structure image structure_img = loadimage.load_image(image=structure_image) # Resize image resized_img = imageresize.execute( width=get_value_at_index(CONST_1024, 0), height=get_value_at_index(CONST_1024, 0), interpolation="bicubic", method="keep proportion", condition="always", multiple_of=16, image=get_value_at_index(structure_img, 0), ) # Get image size size_info = getimagesizeandcount.getsize( image=get_value_at_index(resized_img, 0) ) # Encode VAE vae_encoded = vaeencode.encode( pixels=get_value_at_index(size_info, 0), vae=get_value_at_index(VAE_MODEL, 0), ) # Process depth depth_processed = depthanything_v2.process( da_model=get_value_at_index(DEPTH_MODEL, 0), images=get_value_at_index(size_info, 0), ) # Apply Flux guidance flux_guided = fluxguidance.append( guidance=depth_strength, conditioning=get_value_at_index(text_encoded, 0), ) # Process style image style_img = loadimage.load_image(image=style_image) # Encode style with CLIP Vision style_encoded = clipvisionencode.encode( crop="center", clip_vision=get_value_at_index(CLIP_VISION_MODEL, 0), image=get_value_at_index(style_img, 0), ) # Set up conditioning conditioning = instructpixtopixconditioning.encode( positive=get_value_at_index(flux_guided, 0), negative=get_value_at_index(empty_text, 0), vae=get_value_at_index(VAE_MODEL, 0), pixels=get_value_at_index(depth_processed, 0), ) # Apply style style_applied = stylemodelapplyadvanced.apply_stylemodel( strength=style_strength, conditioning=get_value_at_index(conditioning, 0), style_model=get_value_at_index(STYLE_MODEL, 0), clip_vision_output=get_value_at_index(style_encoded, 0), ) # Set up empty latent empty_latent = emptylatentimage.generate( width=get_value_at_index(resized_img, 1), height=get_value_at_index(resized_img, 2), batch_size=1, ) # Set up guidance guided = basicguider.get_guider( model=get_value_at_index(UNET_MODEL, 0), conditioning=get_value_at_index(style_applied, 0), ) # Set up scheduler schedule = basicscheduler.get_sigmas( scheduler="simple", steps=28, denoise=1, model=get_value_at_index(UNET_MODEL, 0), ) # Generate random noise noise = randomnoise.get_noise(noise_seed=random.randint(1, 2**64)) # Sample sampled = samplercustomadvanced.sample( noise=get_value_at_index(noise, 0), guider=get_value_at_index(guided, 0), sampler=get_value_at_index(SAMPLER, 0), sigmas=get_value_at_index(schedule, 0), latent_image=get_value_at_index(empty_latent, 0), ) # Decode VAE decoded = vaedecode.decode( samples=get_value_at_index(sampled, 0), vae=get_value_at_index(VAE_MODEL, 0), ) # Save image prefix = cr_text.text_multiline(text="Flux_BFL_Depth_Redux") saved = saveimage.save_images( filename_prefix=get_value_at_index(prefix, 0), images=get_value_at_index(decoded, 0), ) saved_path = f"output/{saved['ui']['images'][0]['filename']}" return saved_path # Create Gradio interface examples = [ ["", "chair_input_1.jpg", "chair_input_2.png", 15, 0.5], ] output_image = gr.Image(label="Generated Image") with gr.Blocks() as app: with gr.Tab("Relighting"): with gr.Row(): gr.Markdown("## Product Placement from Text") with gr.Row(): with gr.Column(): with gr.Row(): input_fg = gr.Image(type="pil", label="Image", height=480) with gr.Row(): with gr.Group(): find_objects_button = gr.Button(value="(Option 1) Segment Object from text") text_prompt = gr.Textbox( label="Text Prompt", placeholder="Enter object classes separated by periods (e.g. 'car . person .'), leave empty to get all objects", value="" ) extract_button = gr.Button(value="Remove Background") with gr.Row(): extracted_objects = gr.Image(type="numpy", label="Extracted Foreground", height=480) extracted_fg = gr.Image(type="pil", label="Extracted Foreground", height=480) angles_fg = gr.Image(type="pil", label="Converted Foreground", height=480, visible=False) # output_bg = gr.Image(type="numpy", label="Preprocessed Foreground", height=480) with gr.Group(): run_button = gr.Button("Generate alternative angles") orientation_result = gr.Gallery( label="Result", show_label=False, columns=[3], rows=[2], object_fit="fill", height="auto", allow_preview=False, ) if orientation_result: orientation_result.select(use_orientation, inputs=None, outputs=extracted_fg) dummy_image_for_outputs = gr.Image(visible=False, label='Result', type='pil') with gr.Column(): with gr.Row(): with gr.Column(4): result_gallery = gr.Gallery(height=832, label='Outputs', object_fit='contain', selected_index=0) if result_gallery: result_gallery.select(use_orientation, inputs=None, outputs=dummy_image_for_outputs) with gr.Column(1): with gr.Group(): gr.Markdown("Outpaint") with gr.Row(): with gr.Column(scale=2): prompt_fill = gr.Textbox(label="Prompt (Optional)") with gr.Column(scale=1): fill_button = gr.Button("Generate") target_ratio = gr.Radio( label="Image Ratio", choices=["9:16", "16:9", "1:1", "Custom"], value="9:16", scale=3 ) alignment_dropdown = gr.Dropdown( choices=["Middle", "Left", "Right", "Top", "Bottom"], value="Middle", label="Alignment", ) resize_option = gr.Radio( label="Resize input image", choices=["Full", "75%", "50%", "33%", "25%", "Custom"], value="75%" ) custom_resize_percentage = gr.Slider( label="Custom resize (%)", minimum=1, maximum=100, step=1, value=50, visible=False ) fill_result = gr.Image( interactive=False, label="Generated Image", ) with gr.Accordion(label="Advanced settings", open=False) as settings_panel: with gr.Column(): with gr.Row(): width_slider = gr.Slider( label="Target Width", minimum=720, maximum=1536, step=8, value=720, ) height_slider = gr.Slider( label="Target Height", minimum=720, maximum=1536, step=8, value=1280, ) num_inference_steps = gr.Slider(label="Steps", minimum=2, maximum=50, step=1, value=28) with gr.Group(): overlap_percentage = gr.Slider( label="Mask overlap (%)", minimum=1, maximum=50, value=10, step=1 ) with gr.Row(): overlap_top = gr.Checkbox(label="Overlap Top", value=True) overlap_right = gr.Checkbox(label="Overlap Right", value=True) with gr.Row(): overlap_left = gr.Checkbox(label="Overlap Left", value=True) overlap_bottom = gr.Checkbox(label="Overlap Bottom", value=True) with gr.Row(): with gr.Group(): prompt = gr.Textbox(label="Prompt") bg_source = gr.Radio(choices=[e.value for e in list(BGSource)[2:]], value=BGSource.LEFT.value, label="Lighting Preference (Initial Latent)", type='value') example_quick_subjects = gr.Dataset(samples=quick_subjects, label='Subject Quick List', samples_per_page=1000, components=[prompt]) example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Lighting Quick List', samples_per_page=1000, components=[prompt]) with gr.Row(): relight_button = gr.Button(value="Relight") with gr.Group(visible=False): with gr.Row(): num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) seed = gr.Number(label="Seed", value=12345, precision=0) with gr.Row(): image_width = gr.Slider(label="Image Width", minimum=256, maximum=1024, value=512, step=64) image_height = gr.Slider(label="Image Height", minimum=256, maximum=1024, value=640, step=64) with gr.Accordion("Advanced options", open=False): steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=15, step=1) cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=2, step=0.01, visible=False) lowres_denoise = gr.Slider(label="Lowres Denoise (for initial latent)", minimum=0.1, maximum=1.0, value=0.9, step=0.01) highres_scale = gr.Slider(label="Highres Scale", minimum=1.0, maximum=3.0, value=1.5, step=0.01) highres_denoise = gr.Slider(label="Highres Denoise", minimum=0.1, maximum=1.0, value=0.5, step=0.01) a_prompt = gr.Textbox(label="Added Prompt", value='best quality', visible=False) n_prompt = gr.Textbox(label="Negative Prompt", value='lowres, bad anatomy, bad hands, cropped, worst quality', visible=False) x_slider = gr.Slider( minimum=0, maximum=1000, label="X Position", value=500, visible=False ) y_slider = gr.Slider( minimum=0, maximum=1000, label="Y Position", value=500, visible=False ) # with gr.Row(): # gr.Examples( # fn=lambda *args: ([args[-1]], None), # examples=db_examples.foreground_conditioned_examples, # inputs=[ # input_fg, prompt, bg_source, image_width, image_height, seed, dummy_image_for_outputs # ], # outputs=[result_gallery, output_bg], # run_on_click=True, examples_per_page=1024 # ) ips = [extracted_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source] relight_button.click(fn=process_relight, inputs=ips, outputs=[result_gallery]) example_quick_prompts.click(lambda x, y: ', '.join(y.split(', ')[:2] + [x[0]]), inputs=[example_quick_prompts, prompt], outputs=prompt, show_progress=False, queue=False) example_quick_subjects.click(lambda x: x[0], inputs=example_quick_subjects, outputs=prompt, show_progress=False, queue=False) # def use_output_as_input(output_image): # return output_image # use_as_input_button.click( # fn=use_output_as_input, # inputs=[fill_result], # outputs=[input_image] # ) target_ratio.change( fn=preload_presets, inputs=[target_ratio, width_slider, height_slider], outputs=[width_slider, height_slider, settings_panel], queue=False ) width_slider.change( fn=select_the_right_preset, inputs=[width_slider, height_slider], outputs=[target_ratio], queue=False ) height_slider.change( fn=select_the_right_preset, inputs=[width_slider, height_slider], outputs=[target_ratio], queue=False ) resize_option.change( fn=toggle_custom_resize_slider, inputs=[resize_option], outputs=[custom_resize_percentage], queue=False ) fill_button.click( fn=clear_result, inputs=None, outputs=fill_result, ).then( fn=inpaint, inputs=[dummy_image_for_outputs, width_slider, height_slider, overlap_percentage, num_inference_steps, resize_option, custom_resize_percentage, prompt_fill, alignment_dropdown, overlap_left, overlap_right, overlap_top, overlap_bottom], outputs=[fill_result]) # ).then( # fn=lambda: gr.update(visible=True), # inputs=None, # outputs=use_as_input_button, # ) prompt_fill.submit( fn=clear_result, inputs=None, outputs=fill_result, ).then( fn=inpaint, inputs=[dummy_image_for_outputs, width_slider, height_slider, overlap_percentage, num_inference_steps, resize_option, custom_resize_percentage, prompt_fill, alignment_dropdown, overlap_left, overlap_right, overlap_top, overlap_bottom], outputs=[fill_result]) def convert_to_pil(image): try: #logging.info(f"Input image shape: {image.shape}, dtype: {image.dtype}") image = image.astype(np.uint8) logging.info(f"Converted image shape: {image.shape}, dtype: {image.dtype}") return image except Exception as e: logging.error(f"Error converting image: {e}") return image run_button.click( fn=convert_to_pil, inputs=extracted_fg, # This is already RGBA with removed background outputs=angles_fg ).then( fn=infer, inputs=[ text_prompt, extracted_fg, # Already processed RGBA image ], outputs=[orientation_result], ) find_objects_button.click( fn=process_image, inputs=[input_fg, text_prompt], outputs=[extracted_objects, extracted_fg] ) extract_button.click( fn=extract_foreground, inputs=[input_fg], outputs=[extracted_fg, x_slider, y_slider] ) with gr.Tab("Style Transfer"): gr.Markdown("## Apply the style of an image to another one") with gr.Row(): with gr.Column(): prompt_input = gr.Textbox(label="Prompt", placeholder="Enter your prompt here...") with gr.Row(): with gr.Group(): structure_image = gr.Image(label="Structure Image", type="filepath") depth_strength = gr.Slider(minimum=0, maximum=50, value=15, label="Depth Strength") with gr.Group(): style_image = gr.Image(label="Style Image", type="filepath") style_strength = gr.Slider(minimum=0, maximum=1, value=0.5, label="Style Strength") generate_btn = gr.Button("Generate") gr.Examples( examples=examples, inputs=[prompt_input, structure_image, style_image, depth_strength, style_strength], outputs=[output_image], fn=generate_image, cache_examples=True, cache_mode="lazy" ) with gr.Column(): output_image.render() transfer_btn = gr.Button("Send to relight") def send_img(img_result): return img_result transfer_btn.click(send_img, [output_image], [input_fg]) generate_btn.click( fn=generate_image, inputs=[prompt_input, structure_image, style_image, depth_strength, style_strength], outputs=[output_image] ) if __name__ == "__main__": app.launch(share=True)