# Hugging Face Space status: Running on Zero.
import os
import random
import re
import subprocess
import sys

# NOTE(review): `spaces` is kept ahead of the torch-based libraries, as in the
# original import order — confirm whether ZeroGPU requires this ordering.
import spaces
import gradio as gr
import torch
from PIL import Image
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor, pipeline
from transformers import AutoProcessor, AutoModelForCausalLM
from huggingface_hub import snapshot_download
from kolors.pipelines.pipeline_stable_diffusion_xl_chatglm_256 import StableDiffusionXLPipeline
from kolors.models.modeling_chatglm import ChatGLMModel
from kolors.models.tokenization_chatglm import ChatGLMTokenizer
from diffusers import UNet2DConditionModel, AutoencoderKL
from diffusers import EulerDiscreteScheduler
# Install flash-attn at start-up (presumably needed by remotely loaded model
# code — confirm). The install is best-effort: the return code is not checked.
subprocess.run(
    # Use the running interpreter's pip and an argument list (no shell).
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    # Extend the inherited environment instead of replacing it; passing only
    # the one variable would drop PATH, HOME, proxy settings, etc.
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,
)

# Initialize models
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is used for the Kolors components regardless of `device`.
dtype = torch.float16

# Download Kolors model
ckpt_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors")

# Load Kolors models
text_encoder = ChatGLMModel.from_pretrained(os.path.join(ckpt_dir, 'text_encoder'), torch_dtype=dtype).to(device)
tokenizer = ChatGLMTokenizer.from_pretrained(os.path.join(ckpt_dir, 'text_encoder'))
vae = AutoencoderKL.from_pretrained(os.path.join(ckpt_dir, "vae"), revision=None).to(dtype).to(device)
scheduler = EulerDiscreteScheduler.from_pretrained(os.path.join(ckpt_dir, "scheduler"))
unet = UNet2DConditionModel.from_pretrained(os.path.join(ckpt_dir, "unet"), revision=None).to(dtype).to(device)

kolors_pipe = StableDiffusionXLPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=scheduler,
    force_zeros_for_empty_prompt=False
).to(device)

# VLM Captioner
vlm_model = PaliGemmaForConditionalGeneration.from_pretrained("gokaygokay/sd3-long-captioner-v2").to(device).eval()
vlm_processor = PaliGemmaProcessor.from_pretrained("gokaygokay/sd3-long-captioner-v2")

# Initialize Florence model
florence_model = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True).to(device).eval()
florence_processor = AutoProcessor.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True)

# Prompt Enhancer
enhancer_medium = pipeline("summarization", model="gokaygokay/Lamini-Prompt-Enchance", device=device)
enhancer_long = pipeline("summarization", model="gokaygokay/Lamini-Prompt-Enchance-Long", device=device)

# Upper bound for both the random seed and the seed slider.
MAX_SEED = 2**32 - 1
# Florence caption function
def florence_caption(image):
    """Caption *image* with the Florence model's ``<MORE_DETAILED_CAPTION>`` task.

    Input that is not already a PIL image is converted with
    ``Image.fromarray``. Generation uses 3 beams, no sampling, and at most
    1024 new tokens. Returns the value that the processor's post-processing
    stores under the task key.
    """
    task = "<MORE_DETAILED_CAPTION>"
    pil_image = image if isinstance(image, Image.Image) else Image.fromarray(image)

    encoded = florence_processor(text=task, images=pil_image, return_tensors="pt").to(device)
    token_ids = florence_model.generate(
        input_ids=encoded["input_ids"],
        pixel_values=encoded["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )

    # Special tokens are kept in the decoded text that is handed to the
    # processor's post-processing step.
    raw_text = florence_processor.batch_decode(token_ids, skip_special_tokens=False)[0]
    parsed = florence_processor.post_process_generation(
        raw_text,
        task=task,
        image_size=(pil_image.width, pil_image.height),
    )
    return parsed[task]
# VLM Captioner function
def create_captions_rich(image):
    """Caption *image* with the long-captioner VLM and clean up the result.

    The model is prompted with ``"caption en"`` and decoded greedily (no
    sampling, repetition penalty 1.10, at most 256 new tokens). Only the
    tokens generated after the prompt are decoded, and the text is passed
    through ``modify_caption`` before being returned.
    """
    encoded = vlm_processor(text="caption en", images=image, return_tensors="pt").to(device)
    prompt_length = encoded["input_ids"].shape[-1]

    with torch.inference_mode():
        sequences = vlm_model.generate(
            **encoded,
            repetition_penalty=1.10,
            max_new_tokens=256,
            do_sample=False,
        )
        # Drop the echoed prompt tokens; keep only the continuation.
        new_tokens = sequences[0][prompt_length:]
        caption = vlm_processor.decode(new_tokens, skip_special_tokens=True)

    return modify_caption(caption)
# Helper function for caption modification
def modify_caption(caption: str) -> str:
    """Remove the first "captured from " / "captured at " phrase from *caption*.

    Matching is case-insensitive and only the first occurrence (anywhere in
    the string) is replaced. Captions without either phrase are returned
    unchanged.

    Args:
        caption: Caption text to clean up.

    Returns:
        The caption with the first matching phrase replaced by its
        configured substitute (currently the empty string).
    """
    prefix_substrings = [
        ('captured from ', ''),
        ('captured at ', ''),
    ]
    pattern = '|'.join(re.escape(opening) for opening, _ in prefix_substrings)
    # Keys are lower-cased so lookups work for any casing of the match.
    replacers = {opening.lower(): replacer for opening, replacer in prefix_substrings}

    def replace_fn(match):
        # The pattern is applied with re.IGNORECASE, so the matched text may
        # be e.g. "Captured from "; normalise it before the dict lookup to
        # avoid a KeyError.
        return replacers[match.group(0).lower()]

    return re.sub(pattern, replace_fn, caption, count=1, flags=re.IGNORECASE)
# Prompt Enhancer function
def enhance_prompt(input_prompt, model_choice):
    """Expand *input_prompt* with one of the two prompt-enhancer pipelines.

    ``model_choice == "Medium"`` uses the medium enhancer; any other value
    uses the long enhancer and returns its ``summary_text`` untouched.

    For the medium enhancer the output is post-processed: if it contains
    "of" followed by whitespace, everything up to and including that is
    dropped, the text up to the first period (or the end) is passed through
    ``str.capitalize``, and the stripped remainder is appended after a
    single space. Output without such a match is returned as-is.
    """
    request = "Enhance the description: " + input_prompt

    if model_choice != "Medium":  # Long
        return enhancer_long(request)[0]['summary_text']

    summary = enhancer_medium(request)[0]['summary_text']
    lead = re.match(r'^.*?of\s+(.*?(?:\.|$))', summary, re.IGNORECASE | re.DOTALL)
    if lead is None:
        return summary

    first_sentence = lead.group(1).capitalize()
    tail = summary[lead.end():].strip()
    return first_sentence + ' ' + tail
def generate_image(prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps, num_images_per_prompt):
    """Run the Kolors pipeline and return ``(images, seed)``.

    When *randomize_seed* is truthy the supplied *seed* is replaced by a
    random integer in ``[0, MAX_SEED]``. The seed that was actually used is
    returned alongside the pipeline output's ``images``.
    """
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    # A generator seeded on the target device makes the run reproducible
    # for a given seed.
    rng = torch.Generator(device=device).manual_seed(seed)

    result = kolors_pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
        width=width,
        height=height,
        num_images_per_prompt=num_images_per_prompt,
        generator=rng,
    )
    return result.images, seed
def process_workflow(image, text_prompt, vlm_model_choice, use_enhancer, model_choice, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps, num_images_per_prompt):
    """Build a prompt (from an image or from text) and generate images.

    If *image* is given it is captioned: with ``create_captions_rich`` when
    *vlm_model_choice* is ``"Long Captioner"``, otherwise with
    ``florence_caption``. Without an image, *text_prompt* is used directly.
    The prompt is optionally run through ``enhance_prompt`` and then passed
    to ``generate_image``.

    Returns:
        ``(images, prompt, seed)`` — the generated images, the final prompt
        that was sent to the pipeline, and the seed that was used.
    """
    if image is None:
        prompt = text_prompt
    else:
        # Convert image to PIL if it's not already
        pil_image = image if isinstance(image, Image.Image) else Image.fromarray(image)
        captioner = create_captions_rich if vlm_model_choice == "Long Captioner" else florence_caption
        prompt = captioner(pil_image)

    if use_enhancer:
        prompt = enhance_prompt(prompt, model_choice)

    images, final_seed = generate_image(
        prompt,
        negative_prompt,
        seed,
        randomize_seed,
        width,
        height,
        guidance_scale,
        num_inference_steps,
        num_images_per_prompt,
    )
    return images, prompt, final_seed
# Stylesheet for the page: framed input/output groups and a blue submit
# button with a lighter hover colour.
custom_css = """
.input-group, .output-group {
    border: 1px solid #e0e0e0;
    border-radius: 10px;
    padding: 20px;
    margin-bottom: 20px;
    background-color: #f9f9f9;
}
.submit-btn {
    background-color: #2980b9 !important;
    color: white !important;
}
.submit-btn:hover {
    background-color: #3498db !important;
}
"""

# Page header: heading, links to the models used, and a one-line tagline.
# The links and the tagline are sibling centred paragraphs; a <p> element may
# not contain <center> or another <p>.
title = """<h1 align="center">Kolors with VLM Captioner and Prompt Enhancer</h1>
<p align="center">
<a href="https://huggingface.co/Kwai-Kolors/Kolors" target="_blank">[Kolors Model]</a>
<a href="https://huggingface.co/microsoft/Florence-2-base" target="_blank">[Florence-2 Model]</a>
<a href="https://huggingface.co/gokaygokay/sd3-long-captioner-v2" target="_blank">[Long Captioner Model]</a>
<a href="https://huggingface.co/gokaygokay/Lamini-Prompt-Enchance-Long" target="_blank">[Prompt Enhancer Long]</a>
<a href="https://huggingface.co/gokaygokay/Lamini-Prompt-Enchance" target="_blank">[Prompt Enhancer Medium]</a>
</p>
<p align="center">Create long prompts from images or enhance your short prompts with prompt enhancer</p>
"""
# Two-column page: inputs and settings on the left, results on the right.
# NOTE(review): the nesting of the containers below was reconstructed from the
# `with` statements alone — confirm against the intended layout.
soft_theme = gr.themes.Soft(primary_hue="blue", secondary_hue="gray")

with gr.Blocks(css=custom_css, theme=soft_theme) as demo:
    gr.HTML(title)

    with gr.Row():
        # Left column: image input, captioner choice, advanced settings.
        with gr.Column(scale=1):
            with gr.Group(elem_classes="input-group"):
                input_image = gr.Image(label="Input Image (VLM Captioner)")
                vlm_model_choice = gr.Radio(
                    ["Florence-2", "Long Captioner"],
                    label="VLM Model",
                    value="Florence-2",
                )

            with gr.Accordion("Advanced Settings", open=False):
                text_prompt = gr.Textbox(label="Text Prompt (optional, used if no image is uploaded)")
                use_enhancer = gr.Checkbox(label="Use Prompt Enhancer", value=False)
                model_choice = gr.Radio(["Medium", "Long"], label="Enhancer Model", value="Long")
                negative_prompt = gr.Textbox(label="Negative Prompt")
                seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                width = gr.Slider(label="Width", minimum=512, maximum=2048, step=64, value=1024)
                height = gr.Slider(label="Height", minimum=512, maximum=2048, step=64, value=1024)
                guidance_scale = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=20.0, step=0.5, value=5.0)
                num_inference_steps = gr.Slider(label="Inference Steps", minimum=20, maximum=50, step=1, value=20)
                num_images_per_prompt = gr.Slider(
                    minimum=1,
                    maximum=4,
                    value=1,
                    step=1,
                    label="Number of images per prompt",
                )

            generate_btn = gr.Button("Generate Image", elem_classes="submit-btn")

        # Right column: generated images plus the prompt and seed used.
        with gr.Column(scale=1):
            with gr.Group(elem_classes="output-group"):
                output_image = gr.Gallery(label="Result", elem_id="gallery", show_label=False)
                final_prompt = gr.Textbox(label="Final Prompt Used")
                used_seed = gr.Number(label="Seed Used")

    # Order must match the positional parameters of process_workflow.
    workflow_inputs = [
        input_image,
        text_prompt,
        vlm_model_choice,
        use_enhancer,
        model_choice,
        negative_prompt,
        seed,
        randomize_seed,
        width,
        height,
        guidance_scale,
        num_inference_steps,
        num_images_per_prompt,
    ]
    # Order must match the tuple returned by process_workflow.
    workflow_outputs = [output_image, final_prompt, used_seed]

    generate_btn.click(
        fn=process_workflow,
        inputs=workflow_inputs,
        outputs=workflow_outputs,
    )

demo.launch(debug=True)