Spaces:

adaface-neurips
/

adaface-animate

Sleeping

adaface-animate / adaface /test_img_prompt_model.py

adaface-neurips

Integrate do_neg_id_prompt_weight, fix bugs, various refinements

f0b9ada about 2 months ago

10 kB

	import torch
	from PIL import Image
	import os, argparse, glob
	import numpy as np
	from .face_id_to_ada_prompt import create_id2ada_prompt_encoder
	from .util import create_consistentid_pipeline
	from .arc2face_models import create_arc2face_pipeline
	from transformers import CLIPTextModel

	def save_images(images, subject_name, id2img_prompt_encoder_type,
	                prompt, perturb_std, save_dir = "samples-ada"):
	    """Tile `images` into a single grid image and save it as a PNG in `save_dir`.

	    Each image is resized to 512x512 and pasted row by row into a grid that is
	    two tiles wide. The grid is at least two rows tall (1024x1024 for up to
	    four images) and gains a row for every further pair of images.

	    Args:
	        images: iterable of PIL images to tile.
	        subject_name: first component of the output file name.
	        id2img_prompt_encoder_type: second component of the output file name.
	        prompt: third component of the file name; spaces and commas are
	            replaced with underscores.
	        perturb_std: number formatted with two decimals into the file name.
	        save_dir: output directory, created if it does not exist.

	    The file is named "<subject>-<encoder>-<prompt>-perturb<std>.png". If that
	    path already exists, "-2", "-3", ... is appended before the extension until
	    an unused path is found, so earlier results are never overwritten.
	    """
	    os.makedirs(save_dir, exist_ok=True)

	    tile_size = 512
	    grid_cols = 2
	    images = list(images)
	    # Grow the canvas with the image count so that no image is pasted outside
	    # of it; never go below the 2x2 layout.
	    grid_rows = max(2, -(-len(images) // grid_cols))
	    grid_image = Image.new('RGB', (tile_size * grid_cols, tile_size * grid_rows))
	    for i, image in enumerate(images):
	        row, col = divmod(i, grid_cols)
	        grid_image.paste(image.resize((tile_size, tile_size)),
	                         (tile_size * col, tile_size * row))

	    prompt_sig = prompt.replace(" ", "_").replace(",", "_")
	    file_stem = "-".join([subject_name, id2img_prompt_encoder_type,
	                          prompt_sig, f"perturb{perturb_std:.02f}"])
	    grid_filepath = os.path.join(save_dir, file_stem + ".png")

	    # On a name collision, try "<stem>-2.png", "<stem>-3.png", ... in turn.
	    grid_count = 2
	    while os.path.exists(grid_filepath):
	        grid_filepath = os.path.join(save_dir, "-".join([file_stem, str(grid_count)]) + ".png")
	        grid_count += 1

	    grid_image.save(grid_filepath)
	    print(f"Saved to {grid_filepath}")

	def seed_everything(seed):
	    """Seed the Python, NumPy and PyTorch random number generators with `seed`.

	    Also switches cuDNN to deterministic mode with benchmarking disabled, and
	    exports the seed through the PL_GLOBAL_SEED environment variable.

	    Args:
	        seed: integer seed applied to every generator.
	    """
	    # Imported locally because the module-level imports do not include `random`.
	    import random

	    # Python's own generator is seeded too, so code that relies on `random`
	    # is reproducible along with the NumPy and PyTorch code.
	    random.seed(seed)
	    np.random.seed(seed)
	    torch.manual_seed(seed)
	    torch.cuda.manual_seed_all(seed)
	    torch.backends.cudnn.deterministic = True
	    torch.backends.cudnn.benchmark = False
	    os.environ["PL_GLOBAL_SEED"] = str(seed)

	def _parse_args():
	    """Define the command-line options of this test script and parse them."""
	    parser = argparse.ArgumentParser()

	    # --base_model_path models/Realistic_Vision_V4.0_noVAE
	    parser.add_argument("--base_model_path", type=str, default="models/sar/sar.safetensors")
	    # Required: the pipeline to build is chosen from this value, so there is
	    # nothing sensible to do when it is missing.
	    parser.add_argument("--id2img_prompt_encoder_type", type=str, required=True,
	                        choices=["arc2face", "consistentID"],
	                        help="Types of the ID2Img prompt encoder")
	    parser.add_argument("--subject", type=str, default="subjects-celebrity/taylorswift")
	    parser.add_argument("--example_image_count", type=int, default=5, help="Number of example images to use")
	    parser.add_argument("--out_image_count", type=int, default=4, help="Number of images to generate")
	    parser.add_argument("--init_img", type=str, default=None)
	    parser.add_argument("--prompt", type=str, default="portrait photo of a person in superman costume")
	    parser.add_argument("--use_core_only", action="store_true")
	    parser.add_argument("--truncate_prompt_at", type=int, default=-1,
	                        help="Truncate the prompt to this length")
	    parser.add_argument("--randface", action="store_true")
	    parser.add_argument("--seed", type=int, default=-1)
	    parser.add_argument("--perturb_std", type=float, default=1)

	    return parser.parse_args()


	def _resolve_subject_images(subject, max_count):
	    """Return (subject_name, image_paths) for a subject image file or folder.

	    If `subject` is a file, it is the only image and the subject name is the
	    name of its parent directory. Otherwise `subject` is treated as a folder:
	    the subject name is the folder name, and at most `max_count` .jpg/.png/.jpeg
	    files directly inside it are returned.
	    """
	    image_folder = subject
	    if image_folder.endswith("/"):
	        image_folder = image_folder[:-1]

	    if os.path.isfile(image_folder):
	        # Get the second to the last part of the path
	        return os.path.basename(os.path.dirname(image_folder)), [image_folder]

	    subject_name = os.path.basename(image_folder)
	    alltype_image_paths = []
	    # Every pattern needs the "*" wildcard; a bare ".jpg" would only match a
	    # file literally named ".jpg".
	    for image_type in ("*.jpg", "*.png", "*.jpeg"):
	        # glob returns the full path, in arbitrary order. Sort so that the
	        # images kept by the slice below are the same on every run.
	        alltype_image_paths.extend(sorted(glob.glob(os.path.join(image_folder, image_type))))
	    # Keep at most max_count full image paths.
	    return subject_name, alltype_image_paths[:max_count]


	if __name__ == "__main__":
	    args = _parse_args()
	    # A negative seed (the default) leaves the RNGs unseeded; 0 is a valid seed.
	    if args.seed >= 0:
	        seed_everything(args.seed)

	    if args.id2img_prompt_encoder_type == "arc2face":
	        pipeline = create_arc2face_pipeline(args.base_model_path)
	        use_teacher_neg = False
	    elif args.id2img_prompt_encoder_type == "consistentID":
	        pipeline = create_consistentid_pipeline(args.base_model_path)
	        use_teacher_neg = True
	    else:
	        raise ValueError(f"Unsupported id2img_prompt_encoder_type: {args.id2img_prompt_encoder_type}")

	    pipeline = pipeline.to('cuda', torch.float16)

	    # When the second argument, adaface_ckpt_path = None, create_id2ada_prompt_encoder()
	    # returns an id2ada_prompt_encoder object, with .subj_basis_generator uninitialized.
	    # But it doesn't matter, as we don't use the subj_basis_generator to generate ada embeddings.
	    id2img_prompt_encoder = create_id2ada_prompt_encoder([args.id2img_prompt_encoder_type],
	                                                         num_static_img_suffix_embs=0)
	    id2img_prompt_encoder.to('cuda')

	    if args.randface:
	        image_paths = None
	        # torch.seed() reseeds the RNGs with a fresh non-deterministic value, which
	        # would discard a seed given with --seed. Only draw a new seed when the
	        # user did not fix one; otherwise just report the seed in effect.
	        run_seed = torch.initial_seed() if args.seed >= 0 else torch.seed()
	        subject_name = "randface-" + str(run_seed)
	    else:
	        subject_name, image_paths = _resolve_subject_images(args.subject, args.example_image_count)
	        if not image_paths:
	            print(f"WARNING: no .jpg/.png/.jpeg images found under '{args.subject}'")

	    id_batch_size = args.out_image_count

	    orig_text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16).to("cuda")

	    # The same initial latents are reused for every run of the loop below.
	    noise = torch.randn(args.out_image_count, 4, 64, 64, device='cuda', dtype=torch.float16)
	    if args.randface:
	        init_id_embs = torch.randn(1, 512, device='cuda', dtype=torch.float16)
	        if args.id2img_prompt_encoder_type == "arc2face":
	            pre_clip_features = None
	        elif args.id2img_prompt_encoder_type == "consistentID":
	            # For ConsistentID, random clip features are much better than zero clip features.
	            rand_clip_fgbg_features = torch.randn(1, 514, 1280, device='cuda', dtype=torch.float16)
	            pre_clip_features = rand_clip_fgbg_features
	        else:
	            raise ValueError(f"Unsupported id2img_prompt_encoder_type: {args.id2img_prompt_encoder_type}")
	    else:
	        init_id_embs = None
	        pre_clip_features = None

	    # The text prompts are encoded with the stock CLIP text encoder loaded above
	    # instead of the text encoder that came with the pipeline. The prompts do not
	    # change between runs, so they are encoded once, before the loop.
	    pipeline.text_encoder = orig_text_encoder
	    comp_prompt = args.prompt
	    negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
	    # text_prompt_embeds, full_negative_prompt_embeds_: [4, 77, 768]
	    text_prompt_embeds, full_negative_prompt_embeds_ = \
	        pipeline.encode_prompt(comp_prompt, device='cuda', num_images_per_prompt=args.out_image_count,
	                               do_classifier_free_guidance=True, negative_prompt=negative_prompt)

	    # perturb_std is the relative std of the noise added to the face ID embeddings.
	    # For Arc2Face, a perturb_std of 0.08 could change gender, but 0.06 is usually safe.
	    # For ConsistentID, the image prompt embeddings are extremely robust to noise,
	    # and the perturb_std can be set to 0.5, only leading to a slight change in the result images.
	    # Seems ConsistentID mainly relies on CLIP features, instead of the face ID embeddings.
	    # The second run, with perturb_std = 0, produces the unperturbed images.
	    for perturb_std in (args.perturb_std, 0):
	        # id_prompt_emb is in the image prompt space.
	        # neg_id_prompt_emb is used in ConsistentID only.
	        face_image_count, faceid_embeds, id_prompt_emb, neg_id_prompt_emb \
	            = id2img_prompt_encoder.get_img_prompt_embs(
	                init_id_embs=init_id_embs,
	                pre_clip_features=pre_clip_features,
	                image_paths=image_paths,
	                image_objs=None,
	                id_batch_size=id_batch_size,
	                perturb_at_stage='img_prompt_emb',
	                perturb_std=perturb_std,
	                avg_at_stage='id_emb',
	                verbose=True)

	        prompt_embeds_ = text_prompt_embeds
	        negative_prompt_embeds_ = full_negative_prompt_embeds_
	        if args.truncate_prompt_at >= 0:
	            prompt_embeds_ = prompt_embeds_[:, :args.truncate_prompt_at]
	            negative_prompt_embeds_ = negative_prompt_embeds_[:, :args.truncate_prompt_at]

	        # M: number of ID prompt tokens.
	        M = id_prompt_emb.shape[1]
	        use_neg_id_emb = use_teacher_neg and neg_id_prompt_emb is not None

	        if args.use_core_only:
	            # Drop the text prompt and condition on the ID prompt embeddings alone.
	            prompt_embeds_ = id_prompt_emb
	            if use_neg_id_emb:
	                negative_prompt_embeds_ = neg_id_prompt_emb
	            else:
	                negative_prompt_embeds_ = full_negative_prompt_embeds_[:, :M]
	        else:
	            # Append the id prompt embeddings to the prompt embeddings.
	            # For arc2face, id_prompt_emb can be either pre- or post-pended.
	            # But for ConsistentID, id_prompt_emb has to be post-pended. Otherwise, the result images are blank.
	            prompt_embeds_ = torch.cat([prompt_embeds_, id_prompt_emb], dim=1)
	            if use_neg_id_emb:
	                # NOTE: For ConsistentID, neg_id_prompt_emb has to be present in the negative prompt embeddings.
	                # Otherwise, the result images are cartoonish.
	                negative_prompt_embeds_ = torch.cat([negative_prompt_embeds_, neg_id_prompt_emb], dim=1)
	            else:
	                # For arc2face, neg_id_prompt_emb is None. So we concatenate the last M negative prompt embeddings,
	                # to make the negative prompt embeddings have the same length as the prompt embeddings.
	                negative_prompt_embeds_ = torch.cat([negative_prompt_embeds_, full_negative_prompt_embeds_[:, -M:]], dim=1)

	        # Kept as a loop so that more guidance scales can be tried by extending the list.
	        for guidance_scale in [6]:
	            images = pipeline(latents=noise,
	                              prompt_embeds=prompt_embeds_,
	                              negative_prompt_embeds=negative_prompt_embeds_,
	                              num_inference_steps=50,
	                              guidance_scale=guidance_scale,
	                              num_images_per_prompt=1).images

	            save_images(images, subject_name, args.id2img_prompt_encoder_type,
	                        f"guide{guidance_scale}", perturb_std)