Kolors-Controlnet_and_IPA

Running on Zero

App Files Files Community

Kolors-Controlnet_and_IPA / app.py

lixiang46

update

8cf7184 16 days ago

raw

history blame contribute delete

No virus

14.7 kB

	import spaces
	import random
	import torch
	import cv2
	import gradio as gr
	import numpy as np
	from huggingface_hub import snapshot_download
	from transformers import CLIPVisionModelWithProjection,CLIPImageProcessor
	from diffusers.utils import load_image
	from kolors.pipelines.pipeline_controlnet_xl_kolors_img2img import StableDiffusionXLControlNetImg2ImgPipeline
	from kolors.models.modeling_chatglm import ChatGLMModel
	from kolors.models.tokenization_chatglm import ChatGLMTokenizer
	from kolors.models.controlnet import ControlNetModel
	from diffusers import AutoencoderKL
	from kolors.models.unet_2d_condition import UNet2DConditionModel
	from diffusers import EulerDiscreteScheduler
	from PIL import Image
	from annotator.midas import MidasDetector
	from annotator.dwpose import DWposeDetector
	from annotator.util import resize_image, HWC3


	# Target device for every module loaded below.
	device = "cuda"

	# Local snapshot directories of the base model and its add-on checkpoints.
	ckpt_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors")
	ckpt_dir_depth = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Depth")
	ckpt_dir_canny = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Canny")
	ckpt_dir_ipa = snapshot_download(repo_id="Kwai-Kolors/Kolors-IP-Adapter-Plus")
	ckpt_dir_pose = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Pose")

	# --- Base Kolors components (cast to half precision, moved to `device`) ---
	text_encoder = ChatGLMModel.from_pretrained(
	    f'{ckpt_dir}/text_encoder',
	    torch_dtype=torch.float16,
	)
	text_encoder = text_encoder.half().to(device)

	tokenizer = ChatGLMTokenizer.from_pretrained(f'{ckpt_dir}/text_encoder')

	vae = AutoencoderKL.from_pretrained(f"{ckpt_dir}/vae", revision=None)
	vae = vae.half().to(device)

	scheduler = EulerDiscreteScheduler.from_pretrained(f"{ckpt_dir}/scheduler")

	unet = UNet2DConditionModel.from_pretrained(f"{ckpt_dir}/unet", revision=None)
	unet = unet.half().to(device)

	# --- One ControlNet per conditioning type ---
	controlnet_depth = ControlNetModel.from_pretrained(f"{ckpt_dir_depth}", revision=None)
	controlnet_depth = controlnet_depth.half().to(device)

	controlnet_canny = ControlNetModel.from_pretrained(f"{ckpt_dir_canny}", revision=None)
	controlnet_canny = controlnet_canny.half().to(device)

	controlnet_pose = ControlNetModel.from_pretrained(f"{ckpt_dir_pose}", revision=None)
	controlnet_pose = controlnet_pose.half().to(device)

	# --- Image encoder and preprocessor for the IP-Adapter reference image ---
	image_encoder = CLIPVisionModelWithProjection.from_pretrained(
	    f'{ckpt_dir_ipa}/image_encoder',
	    ignore_mismatched_sizes=True,
	)
	image_encoder = image_encoder.to(dtype=torch.float16, device=device)

	# The same edge length is used for both resize and crop of the reference image.
	ip_img_size = 336
	clip_image_processor = CLIPImageProcessor(size=ip_img_size, crop_size=ip_img_size)

	def _build_pipeline(controlnet):
	    """Assemble an img2img ControlNet pipeline around the shared components.

	    Only ``controlnet`` varies between the pipelines; the VAE, text encoder,
	    tokenizer, UNet, scheduler, image encoder and feature extractor are the
	    same module-level objects in every pipeline.
	    """
	    return StableDiffusionXLControlNetImg2ImgPipeline(
	        vae=vae,
	        controlnet=controlnet,
	        text_encoder=text_encoder,
	        tokenizer=tokenizer,
	        unet=unet,
	        scheduler=scheduler,
	        image_encoder=image_encoder,
	        feature_extractor=clip_image_processor,
	        force_zeros_for_empty_prompt=False,
	    )


	# One pipeline per conditioning type, built in the order depth, canny, pose.
	pipe_depth = _build_pipeline(controlnet_depth)
	pipe_canny = _build_pipeline(controlnet_canny)
	pipe_pose = _build_pipeline(controlnet_pose)

	# Keep a handle on the UNet's existing `encoder_hid_proj` under the name
	# `text_encoder_hid_proj` before any IP-Adapter weights are loaded.
	# NOTE(review): presumably load_ip_adapter replaces `encoder_hid_proj` and the
	# Kolors UNet reads the alias afterwards - confirm against the kolors package.
	for pipe in (pipe_depth,):
	    if not hasattr(pipe.unet, 'encoder_hid_proj'):
	        continue
	    pipe.unet.text_encoder_hid_proj = pipe.unet.encoder_hid_proj

	# Load the same IP-Adapter weight file into each pipeline: depth, canny, pose.
	for ip_pipe in (pipe_depth, pipe_canny, pipe_pose):
	    ip_pipe.load_ip_adapter(
	        f'{ckpt_dir_ipa}',
	        subfolder="",
	        weight_name=["ip_adapter_plus_general.bin"],
	    )

	@spaces.GPU
	def process_canny_condition(image, canny_threods=(100, 200)):
	    """Turn an image array into a 3-channel Canny edge map.

	    Args:
	        image: Image array handed to ``cv2.Canny``; it is copied first, so the
	            caller's array is left untouched.
	            (assumed to be a uint8 image array - TODO confirm against callers)
	        canny_threods: Sequence whose first two items are the lower and upper
	            thresholds passed to ``cv2.Canny``. The default is a tuple rather
	            than a list so that no mutable object is shared between calls;
	            lists are still accepted.

	    Returns:
	        A ``PIL.Image`` holding the edge map replicated over three channels.
	    """
	    low_threshold, high_threshold = canny_threods[0], canny_threods[1]
	    edges = cv2.Canny(image.copy(), low_threshold, high_threshold)
	    # cv2.Canny yields a single-channel map; replicate it to H x W x 3.
	    edges_rgb = np.concatenate([edges[:, :, None]] * 3, axis=2)
	    return Image.fromarray(HWC3(edges_rgb))

	# Depth estimator, created once at import time and reused by every call.
	model_midas = MidasDetector()


	@spaces.GPU
	def process_depth_condition_midas(img, res = 1024):
	    """Produce a depth-map control image with the same size as ``img``.

	    Args:
	        img: Image array with three dimensions (height, width, channels).
	        res: Resolution handed to ``resize_image`` before depth estimation.

	    Returns:
	        A ``PIL.Image`` of the detector output, resized back to the input's
	        width and height.
	    """
	    orig_h, orig_w, _channels = img.shape
	    detector_input = resize_image(HWC3(img), res)
	    depth_map = HWC3(model_midas(detector_input))
	    # cv2.resize takes the target size as (width, height).
	    restored = cv2.resize(depth_map, (orig_w, orig_h))
	    return Image.fromarray(restored)

	# Pose detector, created once at import time and reused by every call.
	model_dwpose = DWposeDetector()


	@spaces.GPU
	def process_dwpose_condition(image, res=1024):
	    """Produce a pose control image with the same size as ``image``.

	    The detector now receives the ``HWC3``-normalized, ``res``-resized copy of
	    the input. Previously that copy was computed and then discarded, so the
	    raw ``image`` was used and ``res`` had no effect; this also brings the
	    function in line with ``process_depth_condition_midas``.

	    Args:
	        image: Image array with three dimensions (height, width, channels).
	        res: Resolution handed to ``resize_image`` before pose detection.

	    Returns:
	        A ``PIL.Image`` of the rendered pose, resized back to the input's
	        width and height.
	    """
	    h, w, _ = image.shape
	    img = resize_image(HWC3(image), res)
	    # The detector returns a pair; only the second item (the image) is used.
	    _, out_img = model_dwpose(img)
	    result = HWC3(out_img)
	    # cv2.resize takes the target size as (width, height).
	    result = cv2.resize(result, (w, h))
	    return Image.fromarray(result)

	# Upper bound for seeds: the largest value representable as a signed 32-bit int.
	MAX_SEED = int(np.iinfo(np.int32).max)
	# Resolution handed to resize_image and the condition preprocessors.
	MAX_IMAGE_SIZE = 1024

	# Default negative prompt shared by the depth and canny entry points.
	_DEFAULT_NEGATIVE_PROMPT = "nsfw，脸部阴影，低分辨率，糟糕的解剖结构、糟糕的手，缺失手指、质量最差、低质量、jpeg伪影、模糊、糟糕，黑脸，霓虹灯"


	def _run_controlnet_ipa(
	    pipeline,
	    build_condition,
	    prompt,
	    image,
	    ipa_img,
	    negative_prompt,
	    seed,
	    randomize_seed,
	    guidance_scale,
	    num_inference_steps,
	    controlnet_conditioning_scale,
	    control_guidance_end,
	    strength,
	    ip_scale,
	):
	    """Shared body of ``infer_depth``, ``infer_canny`` and ``infer_pose``.

	    Args:
	        pipeline: The ControlNet img2img pipeline to run.
	        build_condition: Callable that maps the resized input image (as a
	            numpy array) to the control image for ``pipeline``.
	        prompt, negative_prompt: Text prompts forwarded to the pipeline.
	        image: Input image; resized with ``resize_image`` to ``MAX_IMAGE_SIZE``.
	        ipa_img: Reference image forwarded as ``ip_adapter_image``.
	        seed: Seed used when ``randomize_seed`` is false.
	        randomize_seed: Draw a fresh seed in ``[0, MAX_SEED]`` instead.
	        guidance_scale, num_inference_steps, controlnet_conditioning_scale,
	        control_guidance_end, strength: Forwarded to the pipeline unchanged
	            (the step count is coerced to ``int``).
	        ip_scale: IP-Adapter scale set on the pipeline before running.

	    Returns:
	        ``([control_image, generated_image], seed)`` where ``seed`` is the
	        seed that was actually used.

	    Raises:
	        gr.Error: If ``image`` or ``ipa_img`` is missing.
	    """
	    # Both images default to None in the public signatures; fail with a
	    # readable message instead of an opaque error further down.
	    if image is None:
	        raise gr.Error("Please upload an input image.")
	    if ipa_img is None:
	        raise gr.Error("Please upload an IP-Adapter reference image.")

	    # UI widgets may hand over floats; manual_seed needs an integer.
	    seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
	    generator = torch.Generator().manual_seed(seed)

	    init_image = resize_image(image, MAX_IMAGE_SIZE)
	    pipe = pipeline.to(device)
	    pipe.set_ip_adapter_scale([ip_scale])
	    condi_img = build_condition(np.array(init_image))

	    generated = pipe(
	        prompt=prompt,
	        image=init_image,
	        controlnet_conditioning_scale=controlnet_conditioning_scale,
	        control_guidance_end=control_guidance_end,
	        ip_adapter_image=[ipa_img],
	        strength=strength,
	        control_image=condi_img,
	        negative_prompt=negative_prompt,
	        num_inference_steps=int(num_inference_steps),
	        guidance_scale=guidance_scale,
	        num_images_per_prompt=1,
	        generator=generator,
	    ).images[0]
	    return [condi_img, generated], seed


	@spaces.GPU
	def infer_depth(
	    prompt,
	    image=None,
	    ipa_img=None,
	    negative_prompt=_DEFAULT_NEGATIVE_PROMPT,
	    seed=66,
	    randomize_seed=False,
	    guidance_scale=5.0,
	    num_inference_steps=50,
	    controlnet_conditioning_scale=0.5,
	    control_guidance_end=0.9,
	    strength=1.0,
	    ip_scale=0.5,
	):
	    """Generate an image with the depth ControlNet pipeline.

	    The control image comes from ``process_depth_condition_midas``.
	    Returns ``([control_image, generated_image], seed_used)`` and raises
	    ``gr.Error`` when ``image`` or ``ipa_img`` is missing.
	    """
	    return _run_controlnet_ipa(
	        pipe_depth,
	        lambda frame: process_depth_condition_midas(frame, MAX_IMAGE_SIZE),
	        prompt=prompt,
	        image=image,
	        ipa_img=ipa_img,
	        negative_prompt=negative_prompt,
	        seed=seed,
	        randomize_seed=randomize_seed,
	        guidance_scale=guidance_scale,
	        num_inference_steps=num_inference_steps,
	        controlnet_conditioning_scale=controlnet_conditioning_scale,
	        control_guidance_end=control_guidance_end,
	        strength=strength,
	        ip_scale=ip_scale,
	    )


	@spaces.GPU
	def infer_canny(
	    prompt,
	    image=None,
	    ipa_img=None,
	    negative_prompt=_DEFAULT_NEGATIVE_PROMPT,
	    seed=66,
	    randomize_seed=False,
	    guidance_scale=5.0,
	    num_inference_steps=50,
	    controlnet_conditioning_scale=0.5,
	    control_guidance_end=0.9,
	    strength=1.0,
	    ip_scale=0.5,
	):
	    """Generate an image with the canny ControlNet pipeline.

	    The control image comes from ``process_canny_condition`` with its default
	    thresholds. Returns ``([control_image, generated_image], seed_used)`` and
	    raises ``gr.Error`` when ``image`` or ``ipa_img`` is missing.
	    """
	    return _run_controlnet_ipa(
	        pipe_canny,
	        lambda frame: process_canny_condition(frame),
	        prompt=prompt,
	        image=image,
	        ipa_img=ipa_img,
	        negative_prompt=negative_prompt,
	        seed=seed,
	        randomize_seed=randomize_seed,
	        guidance_scale=guidance_scale,
	        num_inference_steps=num_inference_steps,
	        controlnet_conditioning_scale=controlnet_conditioning_scale,
	        control_guidance_end=control_guidance_end,
	        strength=strength,
	        ip_scale=ip_scale,
	    )


	@spaces.GPU
	def infer_pose(
	    prompt,
	    image=None,
	    ipa_img=None,
	    # The pose entry point uses a shorter default negative prompt than the
	    # depth and canny entry points.
	    negative_prompt="nsfw，脸部阴影，低分辨率，jpeg伪影、模糊、糟糕，黑脸，霓虹灯",
	    seed=66,
	    randomize_seed=False,
	    guidance_scale=5.0,
	    num_inference_steps=50,
	    controlnet_conditioning_scale=0.5,
	    control_guidance_end=0.9,
	    strength=1.0,
	    ip_scale=0.5,
	):
	    """Generate an image with the pose ControlNet pipeline.

	    The control image comes from ``process_dwpose_condition``.
	    Returns ``([control_image, generated_image], seed_used)`` and raises
	    ``gr.Error`` when ``image`` or ``ipa_img`` is missing.
	    """
	    return _run_controlnet_ipa(
	        pipe_pose,
	        lambda frame: process_dwpose_condition(frame, MAX_IMAGE_SIZE),
	        prompt=prompt,
	        image=image,
	        ipa_img=ipa_img,
	        negative_prompt=negative_prompt,
	        seed=seed,
	        randomize_seed=randomize_seed,
	        guidance_scale=guidance_scale,
	        num_inference_steps=num_inference_steps,
	        controlnet_conditioning_scale=controlnet_conditioning_scale,
	        control_guidance_end=control_guidance_end,
	        strength=strength,
	        ip_scale=ip_scale,
	    )

	# Example rows for the UI. Each row is [prompt, image path, second image path],
	# matching the three inputs the example widgets fill in.
	canny_examples = [
	    [
	        "一个红色头发的女孩，唯美风景，清新明亮，斑驳的光影，最好的质量，超细节，8K画质",
	        "image/woman_2.png",
	        "image/2.png",
	    ],
	]

	depth_examples = [
	    [
	        "一个漂亮的女孩，最好的质量，超细节，8K画质",
	        "image/1.png",
	        "image/woman_1.png",
	    ],
	]

	pose_examples = [
	    [
	        "一位穿着紫色泡泡袖连衣裙、戴着皇冠和白色蕾丝手套的女孩，超高分辨率，最佳品质，8k画质",
	        "image/woman_3.png",
	        "image/woman_4.png",
	    ],
	]


	# Stylesheet passed to gr.Blocks(css=...). It centers and caps the width of
	# the elements with ids "col-left" and "col-right" and colors the element(s)
	# with id "button" blue.
	css="""
	#col-left {
	margin: 0 auto;
	max-width: 600px;
	}
	#col-right {
	margin: 0 auto;
	max-width: 750px;
	}
	#button {
	color: blue;
	}
	"""

	def load_description(fp):
	    """Return the full text of the UTF-8 encoded file at ``fp``."""
	    with open(fp, mode='r', encoding='utf-8') as handle:
	        return handle.read()

	# Gradio UI definition.
	# NOTE(review): the reviewed copy of this file had lost its indentation; the
	# nesting below is reconstructed from the statement order - confirm it
	# against the deployed layout before relying on it.
	with gr.Blocks(css=css) as Kolors:
	    # Page header, read from a file at start-up.
	    gr.HTML(load_description("assets/title.md"))
	    with gr.Row():
	        # Left column: prompt, the two image inputs, settings and buttons.
	        with gr.Column(elem_id="col-left"):
	            with gr.Row():
	                prompt = gr.Textbox(
	                    label="Prompt",
	                    placeholder="Enter your prompt",
	                    lines=2
	                )
	            with gr.Row():
	                image = gr.Image(label="Image", type="pil")
	                ipa_image = gr.Image(label="IP-Adapter-Image", type="pil")
	            with gr.Accordion("Advanced Settings", open=False):
	                negative_prompt = gr.Textbox(
	                    label="Negative prompt",
	                    placeholder="Enter a negative prompt",
	                    visible=True,
	                    value="nsfw，脸部阴影，低分辨率，糟糕的解剖结构、糟糕的手，缺失手指、质量最差、低质量、jpeg伪影、模糊、糟糕，黑脸，霓虹灯"
	                )
	                seed = gr.Slider(
	                    label="Seed",
	                    minimum=0,
	                    maximum=MAX_SEED,
	                    step=1,
	                    value=0,
	                )
	                # Checked by default, so the seed slider value is ignored
	                # unless the user unticks this box.
	                randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
	                with gr.Row():
	                    guidance_scale = gr.Slider(
	                        label="Guidance scale",
	                        minimum=0.0,
	                        maximum=10.0,
	                        step=0.1,
	                        value=5.0,
	                    )
	                    num_inference_steps = gr.Slider(
	                        label="Number of inference steps",
	                        minimum=10,
	                        maximum=50,
	                        step=1,
	                        value=30,
	                    )
	                with gr.Row():
	                    controlnet_conditioning_scale = gr.Slider(
	                        label="Controlnet Conditioning Scale",
	                        minimum=0.0,
	                        maximum=1.0,
	                        step=0.1,
	                        value=0.5,
	                    )
	                    control_guidance_end = gr.Slider(
	                        label="Control Guidance End",
	                        minimum=0.0,
	                        maximum=1.0,
	                        step=0.1,
	                        value=0.9,
	                    )
	                with gr.Row():
	                    strength = gr.Slider(
	                        label="Strength",
	                        minimum=0.0,
	                        maximum=1.0,
	                        step=0.1,
	                        value=1.0,
	                    )
	                    ip_scale = gr.Slider(
	                        label="IP_Scale",
	                        minimum=0.0,
	                        maximum=1.0,
	                        step=0.1,
	                        value=0.5,
	                    )
	            # One button per ControlNet type; all three share the same elem_id.
	            with gr.Row():
	                canny_button = gr.Button("Canny", elem_id="button")
	                depth_button = gr.Button("Depth", elem_id="button")
	                pose_button = gr.Button("Pose", elem_id="button")

	        # Right column: output gallery and the seed that was used.
	        with gr.Column(elem_id="col-right"):
	            result = gr.Gallery(label="Result", show_label=False, columns=2)
	            seed_used = gr.Number(label="Seed Used")

	    # Example rows only provide prompt, image and IP-Adapter image; the
	    # remaining parameters of the infer_* functions keep their defaults.
	    with gr.Row():
	        gr.Examples(
	            fn = infer_canny,
	            examples = canny_examples,
	            inputs = [prompt, image, ipa_image],
	            outputs = [result, seed_used],
	            label = "Canny"
	        )
	    with gr.Row():
	        gr.Examples(
	            fn = infer_depth,
	            examples = depth_examples,
	            inputs = [prompt, image, ipa_image],
	            outputs = [result, seed_used],
	            label = "Depth"
	        )
	    with gr.Row():
	        gr.Examples(
	            fn = infer_pose,
	            examples = pose_examples,
	            inputs = [prompt, image, ipa_image],
	            outputs = [result, seed_used],
	            label = "Pose"
	        )

	    # Each button passes the same twelve inputs, in the positional order of
	    # the infer_* signatures, and writes to the gallery and the seed field.
	    canny_button.click(
	        fn = infer_canny,
	        inputs = [prompt, image, ipa_image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength, ip_scale],
	        outputs = [result, seed_used]
	    )

	    depth_button.click(
	        fn = infer_depth,
	        inputs = [prompt, image, ipa_image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength, ip_scale],
	        outputs = [result, seed_used]
	    )

	    pose_button.click(
	        fn = infer_pose,
	        inputs = [prompt, image, ipa_image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength, ip_scale],
	        outputs = [result, seed_used]
	    )

	# Call queue() on the app, then launch it with debug output enabled.
	Kolors.queue().launch(debug=True)