import sys
sys.path.append('./')
import gradio as gr
import spaces
import os
import subprocess
import numpy as np
from PIL import Image
import cv2
import torch
import random
from transformers import pipeline
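# Install the bundled controlnet_aux package in editable mode so its annotators can be imported below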
os.system("pip install -e ./controlnet_aux")
from controlnet_aux import OpenposeDetector, CannyDetector
from depth_anything_v2.dpt import DepthAnythingV2
from huggingface_hub import hf_hub_download, login
hf_token = os.environ.get("HF_TOKEN_GATED")
if hf_token:
    login(token=hf_token)
MAX_SEED = np.iinfo(np.int32).max
# Translator setup: Korean prompts are translated to English before generation
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")
def translate_to_english(text):
    if any('\uAC00' <= char <= '\uD7A3' for char in text):
        return translator(text, max_length=512)[0]['translation_text']
    return text
def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    return seed
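# Depth Anything V2 (ViT-L) is loaded to extract depth maps for the "Depth" control mode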
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
model_configs = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
    'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
}
encoder = 'vitl'
model = DepthAnythingV2(**model_configs[encoder])
filepath = hf_hub_download(repo_id="depth-anything/Depth-Anything-V2-Large", filename="depth_anything_v2_vitl.pth", repo_type="model")
state_dict = torch.load(filepath, map_location="cpu")
model.load_state_dict(state_dict)
model = model.to(DEVICE).eval()
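# Load FLUX.1-dev with the Shakker-Labs ControlNet Union Pro, wrapped as a multi-ControlNet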
from diffusers.utils import load_image
from diffusers import FluxControlNetPipeline, FluxControlNetModel
from diffusers.models import FluxMultiControlNetModel
base_model = 'black-forest-labs/FLUX.1-dev'
controlnet_model = 'Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro'
controlnet = FluxControlNetModel.from_pretrained(controlnet_model, torch_dtype=torch.bfloat16)
controlnet = FluxMultiControlNetModel([controlnet])
pipe = FluxControlNetPipeline.from_pretrained(base_model, controlnet=controlnet, torch_dtype=torch.bfloat16)
pipe.to("cuda")
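# Map UI mode names to the ControlNet Union Pro control-mode indices
# (0: canny, 1: tile, 2: depth, 3: blur, 4: pose, 5: gray, 6: low quality) and to default strengths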
mode_mapping = {"Canny": 0, "Tile": 1, "Depth": 2, "Blur": 3, "OpenPose": 4, "Grayscale": 5, "Low quality": 6}
strength_mapping = {"Canny": 0.65, "Tile": 0.45, "Depth": 0.55, "Blur": 0.45, "OpenPose": 0.55, "Grayscale": 0.45, "Low quality": 0.4}
canny = CannyDetector()
open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators")
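# Memory/throughput tweaks: TF32 matmuls, VAE tiling/slicing, and model CPU offload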
torch.backends.cuda.matmul.allow_tf32 = True
pipe.vae.enable_tiling()
pipe.vae.enable_slicing()
pipe.enable_model_cpu_offload() # for saving memory
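# Preprocessing helpers: each turns an input image into the control image for one mode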
def convert_from_image_to_cv2(img: Image.Image) -> np.ndarray:
    return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
def convert_from_cv2_to_image(img: np.ndarray) -> Image.Image:
    return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
def extract_depth(image):
    image = np.asarray(image)
    depth = model.infer_image(image[:, :, ::-1])
    depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
    depth = depth.astype(np.uint8)
    gray_depth = Image.fromarray(depth).convert('RGB')
    return gray_depth
def extract_openpose(img):
    processed_image_open_pose = open_pose(img, hand_and_face=True)
    return processed_image_open_pose
def extract_canny(image):
    processed_image_canny = canny(image)
    return processed_image_canny
def apply_gaussian_blur(image, kernel_size=(21, 21)):
    image = convert_from_image_to_cv2(image)
    blurred_image = convert_from_cv2_to_image(cv2.GaussianBlur(image, kernel_size, 0))
    return blurred_image
def convert_to_grayscale(image):
    image = convert_from_image_to_cv2(image)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Convert back to 3 channels so the shared BGR->RGB helper returns an RGB image
    gray_image = convert_from_cv2_to_image(cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR))
    return gray_image
def add_gaussian_noise(image, mean=0, sigma=10):
    image = convert_from_image_to_cv2(image)
    noise = np.random.normal(mean, sigma, image.shape)
    noisy_image = convert_from_cv2_to_image(np.clip(image.astype(np.float32) + noise, 0, 255).astype(np.uint8))
    return noisy_image
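# Resize so the short side equals `resolution`, snapping both sides to multiples of 64 (used for the Tile mode)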
def tile(input_image, resolution=768):
    input_image = convert_from_image_to_cv2(input_image)
    H, W, C = input_image.shape
    H = float(H)
    W = float(W)
    k = float(resolution) / min(H, W)
    H *= k
    W *= k
    H = int(np.round(H / 64.0)) * 64
    W = int(np.round(W / 64.0)) * 64
    img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
    img = convert_from_cv2_to_image(img)
    return img
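# Resize the reference image to fit max_side/min_side with both sides snapped to multiples of base_pixel_number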
def resize_img(input_image, max_side=768, min_side=512, size=None,
               pad_to_max_side=False, mode=Image.BILINEAR, base_pixel_number=64):
    w, h = input_image.size
    if size is not None:
        w_resize_new, h_resize_new = size
    else:
        ratio = min_side / min(h, w)
        w, h = round(ratio*w), round(ratio*h)
        ratio = max_side / max(h, w)
        input_image = input_image.resize([round(ratio*w), round(ratio*h)], mode)
        w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
        h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
    input_image = input_image.resize([w_resize_new, h_resize_new], mode)
    if pad_to_max_side:
        res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
        offset_x = (max_side - w_resize_new) // 2
        offset_y = (max_side - h_resize_new) // 2
        res[offset_y:offset_y+h_resize_new, offset_x:offset_x+w_resize_new] = np.array(input_image)
        input_image = Image.fromarray(res)
    return input_image
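# Main generation entry point: build the control image (or use the uploaded one), then run the FLUX ControlNet pipeline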
@spaces.GPU()
def infer(cond_in, image_in, prompt, inference_steps, guidance_scale, control_mode, control_strength, seed, progress=gr.Progress(track_tqdm=True)):
    control_mode_num = mode_mapping[control_mode]
    prompt = translate_to_english(prompt)
    if cond_in is None:
        if image_in is not None:
            image_in = resize_img(load_image(image_in))
            if control_mode == "Canny":
                control_image = extract_canny(image_in)
            elif control_mode == "Depth":
                control_image = extract_depth(image_in)
            elif control_mode == "OpenPose":
                control_image = extract_openpose(image_in)
            elif control_mode == "Blur":
                control_image = apply_gaussian_blur(image_in)
            elif control_mode == "Low quality":
                control_image = add_gaussian_noise(image_in)
            elif control_mode == "Grayscale":
                control_image = convert_to_grayscale(image_in)
            elif control_mode == "Tile":
                control_image = tile(image_in)
        else:
            raise gr.Error("Please upload either a preprocessed control image or a reference image.")
    else:
        control_image = resize_img(load_image(cond_in))
    width, height = control_image.size
    image = pipe(
        prompt,
        control_image=[control_image],
        control_mode=[control_mode_num],
        width=width,
        height=height,
        controlnet_conditioning_scale=[control_strength],
        num_inference_steps=inference_steps,
        guidance_scale=guidance_scale,
        generator=torch.manual_seed(seed),
    ).images[0]
    torch.cuda.empty_cache()
    return image, control_image
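# Hide the default Gradio footer via custom CSS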
css = """
footer {
visibility: hidden;
}
"""
with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as demo:
    with gr.Column(elem_id="col-container"):
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    with gr.Row(equal_height=True):
                        cond_in = gr.Image(label="Upload a preprocessed control image", sources=["upload"], type="filepath")
                        image_in = gr.Image(label="Extract the condition from a reference image (optional)", sources=["upload"], type="filepath")
                    prompt = gr.Textbox(label="Prompt", value="best quality")
                    with gr.Accordion("ControlNet"):
                        control_mode = gr.Radio(
                            ["Canny", "Depth", "OpenPose", "Grayscale", "Blur", "Tile", "Low quality"], label="Mode", value="Grayscale",
                            info="Select the control mode; it is applied to every image"
                        )
                        control_strength = gr.Slider(
                            label="Control strength",
                            minimum=0,
                            maximum=1.0,
                            step=0.05,
                            value=0.50,
                        )
                        seed = gr.Slider(
                            label="Seed",
                            minimum=0,
                            maximum=MAX_SEED,
                            step=1,
                            value=42,
                        )
                        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                    with gr.Accordion("Advanced settings", open=False):
                        with gr.Column():
                            with gr.Row():
                                inference_steps = gr.Slider(label="Inference steps", minimum=1, maximum=50, step=1, value=24)
                                guidance_scale = gr.Slider(label="Guidance scale", minimum=1.0, maximum=10.0, step=0.1, value=3.5)
                    submit_btn = gr.Button("Submit")
                with gr.Column():
                    result = gr.Image(label="Result")
                    processed_cond = gr.Image(label="Preprocessed condition")

    submit_btn.click(
        fn=randomize_seed_fn,
        inputs=[seed, randomize_seed],
        outputs=seed,
        queue=False,
        api_name=False
    ).then(
        fn=infer,
        inputs=[cond_in, image_in, prompt, inference_steps, guidance_scale, control_mode, control_strength, seed],
        outputs=[result, processed_cond],
        show_api=False
    )
demo.queue(api_open=False)
demo.launch()