Spaces:

PAIR
/

StreamingSVD

Running on Zero

App Files Files Community

StreamingSVD / diffusion_trainer /streaming_svd.py

lev1

Initial commit

8fd2f2f 3 months ago

raw

history blame

22.2 kB

	from modules.loader.module_loader import GenericModuleLoader
	from modules.params.diffusion_trainer.params_streaming_diff_trainer import DiffusionTrainerParams
	import torch
	from modules.params.diffusion.inference_params import InferenceParams
	from utils import result_processor
	from modules.loader.module_loader import GenericModuleLoader
	from tqdm import tqdm
	from PIL import Image, ImageFilter
	from utils.inference_utils import resize_and_crop,get_padding_for_aspect_ratio
	import numpy as np
	from safetensors.torch import load_file as load_safetensors
	import math
	from einops import repeat, rearrange
	from torchvision.transforms import ToTensor
	from models.svd.sgm.modules.autoencoding.temporal_ae import VideoDecoder
	import PIL
	from modules.params.vfi import VFIParams
	from modules.params.i2v_enhance import I2VEnhanceParams
	from typing import List,Union
	from models.diffusion.wrappers import StreamingWrapper
	from diffusion_trainer.abstract_trainer import AbstractTrainer
	from utils.loader import download_ckpt
	import torchvision.transforms.functional as TF
	from diffusers import AutoPipelineForInpainting, DEISMultistepScheduler
	from transformers import BlipProcessor, BlipForConditionalGeneration

	class StreamingSVD(AbstractTrainer):
	    """Image-to-video generation: one SVD chunk followed by autoregressive StreamingSVD chunks.

	    The first chunk is produced by ``self.svd_pipeline``. Every further chunk is
	    sampled with ``self.sampler`` through a ``StreamingWrapper`` and is conditioned
	    on the last frames of the previously generated chunk.

	    Attributes such as ``svd_pipeline``, ``conditioner``, ``sampler``, ``denoiser``,
	    ``model``, ``controlnet``, ``first_stage_model``, ``device`` and ``trainer`` are
	    not assigned in this class; presumably they are provided by ``AbstractTrainer``
	    or the module loader -- verify against the base class.
	    """

	    # Upper bound on the number of latent frames handed to the first stage per decode call.
	    _DECODE_CHUNK_SIZE = 8

	    def __init__(self,
	                 module_loader: GenericModuleLoader,
	                 diff_trainer_params: DiffusionTrainerParams,
	                 inference_params: InferenceParams,
	                 vfi: VFIParams,
	                 i2v_enhance: I2VEnhanceParams,
	                 ):
	        """
	        Set up the trainer and store the post-processing parameter objects.

	        Parameters:
	        - module_loader: GenericModuleLoader
	            Forwarded to the base class.
	        - diff_trainer_params: DiffusionTrainerParams
	            Forwarded to the base class.
	        - inference_params: InferenceParams
	            Forwarded to the base class.
	        - vfi: VFIParams
	            Stored as ``self.vfi``.
	        - i2v_enhance: I2VEnhanceParams
	            Stored as ``self.i2v_enhance``.
	        """
	        super().__init__(inference_params=inference_params,
	                         diff_trainer_params=diff_trainer_params,
	                         module_loader=module_loader,
	                         )

	        # network config is wrapped by OpenAIWrapper, so we dont need a direct reference anymore
	        # this corresponds to the config yaml defined at model.module_loader.module_config.model.dependent_modules
	        del self.network_config
	        # Annotation only (no assignment): narrows the attribute type for type checkers.
	        self.diff_trainer_params: DiffusionTrainerParams
	        self.vfi = vfi
	        self.i2v_enhance = i2v_enhance

	    def on_inference_epoch_start(self):
	        """Build the combined inference model after running the base-class hook."""
	        super().on_inference_epoch_start()

	        # for StreamingSVD we use a model wrapper that combines the base SVD model and the control model.
	        self.inference_model = StreamingWrapper(
	            diffusion_model=self.model.diffusion_model,
	            controlnet=self.controlnet,
	            num_frame_conditioning=self.inference_params.num_conditional_frames
	        )

	    def post_init(self):
	        """
	        Prepare auxiliary models: offload settings, the inpainting pipeline and the captioner.

	        Side effects:
	        - moves the first stage model and two conditioner sub-modules to the CPU,
	        - sets ``self.image_encoder_apm`` (when a matching embedder exists),
	          ``self.inpaint_pipe`` and ``self.blip``.
	        """
	        use_offload = self.device.type != "cpu"

	        self.svd_pipeline.set_progress_bar_config(disable=True)
	        if use_offload:
	            self.svd_pipeline.enable_model_cpu_offload(gpu_id=self.device.index)

	        # re-use the open clip already loaded for image conditioner for image_encoder_apm
	        for embedder in self.conditioner.embedders:
	            if hasattr(embedder, "input_key") and embedder.input_key == "cond_frames_without_noise":
	                self.image_encoder_apm = embedder.open_clip
	        self.first_stage_model.to("cpu")
	        # NOTE(review): embedder indices 3 and 0 are hard-coded; they depend on the conditioner config.
	        self.conditioner.embedders[3].encoder.to("cpu")
	        self.conditioner.embedders[0].open_clip.to("cpu")

	        pipe = AutoPipelineForInpainting.from_pretrained(
	            'Lykon/dreamshaper-8-inpainting', torch_dtype=torch.float16, variant="fp16",
	            safety_checker=None, requires_safety_checker=False)
	        pipe.scheduler = DEISMultistepScheduler.from_config(pipe.scheduler.config)
	        pipe = pipe.to(self.device)
	        # Same guard as for the SVD pipeline above: offloading needs an accelerator device.
	        if use_offload:
	            pipe.enable_model_cpu_offload(gpu_id=self.device.index)
	        self.inpaint_pipe = pipe

	        processor = BlipProcessor.from_pretrained(
	            "Salesforce/blip-image-captioning-large")
	        model = BlipForConditionalGeneration.from_pretrained(
	            "Salesforce/blip-image-captioning-large", torch_dtype=torch.float16).to(self.device)

	        def blip(x):
	            """Return a caption for image ``x`` generated by the BLIP model."""
	            # Inputs must live on the same device as the model (previously hard-coded to "cuda").
	            inputs = processor(x, return_tensors='pt').to(self.device, torch.float16)
	            token_ids = model.generate(**inputs)[0]
	            return processor.decode(token_ids, skip_special_tokens=True)

	        self.blip = blip

	    # Adapted from https://github.com/Stability-AI/generative-models/blob/main/scripts/sampling/simple_video_sample.py
	    def get_unique_embedder_keys_from_conditioner(self, conditioner):
	        """Return the distinct ``input_key`` values of ``conditioner.embedders`` (order unspecified)."""
	        return list({embedder.input_key for embedder in conditioner.embedders})

	    # Adapted from https://github.com/Stability-AI/generative-models/blob/main/scripts/sampling/simple_video_sample.py
	    def get_batch_sgm(self, keys, value_dict, N, T, device):
	        """
	        Build the conditional and unconditional input batches for the conditioner.

	        Parameters:
	        - keys: iterable of str
	            The embedder input keys to fill.
	        - value_dict: dict
	            Source values, looked up by key.
	        - N: sequence of int
	            Batch dimensions; scalar conditionings are repeated ``prod(N)`` times,
	            frame conditionings ``N[0]`` times.
	        - T: int or None
	            If not None, stored under ``"num_video_frames"``.
	        - device:
	            Device the scalar conditionings are moved to.

	        Returns:
	        - tuple (batch, batch_uc)
	            ``batch_uc`` holds a clone of every tensor entry of ``batch``.
	        """
	        batch = {}
	        n_total = math.prod(N)

	        for key in keys:
	            if key in ("fps_id", "motion_bucket_id"):
	                batch[key] = (
	                    torch.tensor([value_dict[key]])
	                    .to(device)
	                    .repeat(int(n_total))
	                )
	            elif key == "cond_aug":
	                batch[key] = repeat(
	                    torch.tensor([value_dict["cond_aug"]]).to(device),
	                    "1 -> b",
	                    b=n_total,
	                )
	            elif key in ("cond_frames", "cond_frames_without_noise"):
	                batch[key] = repeat(value_dict[key], "1 ... -> b ...", b=N[0])
	            else:
	                batch[key] = value_dict[key]

	        if T is not None:
	            batch["num_video_frames"] = T

	        # Non-tensor entries (e.g. "num_video_frames") are intentionally not copied.
	        batch_uc = {
	            key: torch.clone(value)
	            for key, value in batch.items()
	            if isinstance(value, torch.Tensor)
	        }
	        return batch, batch_uc

	    # Adapted from https://github.com/Stability-AI/generative-models/blob/main/sgm/models/diffusion.py
	    @torch.no_grad()
	    def decode_first_stage(self, z):
	        """
	        Decode latents with the first stage model, a few frames at a time.

	        The first stage model is moved to ``self.device`` for decoding and moved
	        back to the CPU afterwards, also when decoding fails.

	        Parameters:
	        - z: torch.Tensor
	            Latents; the first dimension is split into chunks of at most
	            ``_DECODE_CHUNK_SIZE`` entries.

	        Returns:
	        - torch.Tensor
	            The decoded chunks concatenated along dimension 0.

	        Raises:
	        - ValueError
	            If ``z`` has no entries along dimension 0.
	        """
	        if z.shape[0] == 0:
	            raise ValueError("Cannot decode an empty latent batch.")

	        z = 1.0 / self.diff_trainer_params.scale_factor * z
	        chunk_size = min(z.shape[0], self._DECODE_CHUNK_SIZE)
	        autocast_enabled = not self.diff_trainer_params.disable_first_stage_autocast

	        self.first_stage_model.to(self.device)
	        try:
	            # The video decoder needs the number of frames contained in each chunk.
	            is_video_decoder = isinstance(self.first_stage_model.decoder, VideoDecoder)
	            decoded_chunks = []
	            with torch.autocast("cuda", enabled=autocast_enabled):
	                for start in range(0, z.shape[0], chunk_size):
	                    z_chunk = z[start: start + chunk_size]
	                    kwargs = {"timesteps": len(z_chunk)} if is_video_decoder else {}
	                    decoded_chunks.append(
	                        self.first_stage_model.decode(z_chunk, **kwargs))
	            return torch.cat(decoded_chunks, dim=0)
	        finally:
	            # Always release accelerator memory, even if decoding raised.
	            self.first_stage_model.to("cpu")

	    # Adapted from https://github.com/Stability-AI/generative-models/blob/main/scripts/sampling/simple_video_sample.py
	    def _generate_conditional_output(self, svd_input_frame, inference_params: InferenceParams, **params):
	        """
	        Sample one video chunk conditioned on an input frame and control frames.

	        Parameters:
	        - svd_input_frame: torch.Tensor
	            The conditioning frame; its last two dimensions define the output size.
	        - inference_params: InferenceParams
	            Currently unused here; ``self.inference_params`` is read instead.
	        - params:
	            Must contain ``ctrl_frames``, which is forwarded to the inference model.
	            Other entries are ignored.

	        Returns:
	        - torch.Tensor
	            The decoded frames, clamped to [-1, 1].
	        """
	        latent_channels = 4
	        spatial_factor = 8  # spatial compression TODO read from model

	        height = svd_input_frame.shape[-2]
	        width = svd_input_frame.shape[-1]
	        num_frames = self.sampler.guider.num_frames

	        shape = (num_frames, latent_channels, height // spatial_factor, width // spatial_factor)
	        batch_size = 1

	        image = svd_input_frame[None, :]
	        cond_aug = 0.02

	        # NOTE(review): rand_like draws uniform noise; the script this is adapted from
	        # appears to use Gaussian noise (randn_like) -- confirm which one is intended.
	        value_dict = {
	            "motion_bucket_id": 127,
	            "fps_id": 6,
	            "cond_aug": cond_aug,
	            "cond_frames_without_noise": image,
	            "cond_frames": image + cond_aug * torch.rand_like(image),
	        }

	        batch, batch_uc = self.get_batch_sgm(
	            self.get_unique_embedder_keys_from_conditioner(
	                self.conditioner),
	            value_dict,
	            [1, num_frames],
	            T=num_frames,
	            device=self.device,
	        )

	        # The conditioner sub-modules only stay on the device while they are needed.
	        self.conditioner.embedders[3].encoder.to(self.device)
	        self.conditioner.embedders[0].open_clip.to(self.device)
	        c, uc = self.conditioner.get_unconditional_conditioning(
	            batch,
	            batch_uc=batch_uc,
	            force_uc_zero_embeddings=[
	                "cond_frames",
	                "cond_frames_without_noise",
	            ],
	        )
	        self.conditioner.embedders[3].encoder.to("cpu")
	        self.conditioner.embedders[0].open_clip.to("cpu")

	        # Replicate the per-sample conditioning for every frame and fold frames into the batch axis.
	        for k in ["crossattn", "concat"]:
	            for cond in (uc, c):
	                per_frame = repeat(cond[k], "b ... -> b t ...", t=num_frames)
	                cond[k] = rearrange(per_frame, "b t ... -> (b t) ...", t=num_frames)

	        randn = torch.randn(shape, device=self.device)

	        additional_model_inputs = {
	            "image_only_indicator": torch.zeros(2 * batch_size, num_frames).to(self.device),
	            "num_video_frames": batch["num_video_frames"],
	            # StreamingSVD inputs
	            "batch_size": 2 * batch_size,
	            "num_conditional_frames": self.inference_params.num_conditional_frames,
	            "ctrl_frames": params["ctrl_frames"],
	        }

	        self.inference_model.diffusion_model = self.inference_model.diffusion_model.to(
	            self.device)
	        self.inference_model.controlnet = self.inference_model.controlnet.to(
	            self.device)

	        c["vector"] = c["vector"].to(randn.dtype)
	        uc["vector"] = uc["vector"].to(randn.dtype)

	        def denoiser(input, sigma, c):
	            return self.denoiser(self.inference_model, input, sigma, c, **additional_model_inputs)

	        samples_z = self.sampler(denoiser, randn, cond=c, uc=uc)

	        # Free accelerator memory before the first stage decoder is moved onto the device.
	        self.inference_model.diffusion_model = self.inference_model.diffusion_model.to("cpu")
	        self.inference_model.controlnet = self.inference_model.controlnet.to("cpu")
	        samples_x = self.decode_first_stage(samples_z)

	        return torch.clamp(samples_x, min=-1.0, max=1.0)

	    @staticmethod
	    def _to_batched_video(video, input_range):
	        """Convert a copy of ``video`` to range [-1, 1] and reorder it to ``1 F C W H``."""
	        video = result_processor.convert_range(
	            video=video.clone(), input_range=input_range, output_range=[-1, 1])

	        if video.shape[1] == 3 and video.shape[0] > 3:
	            return rearrange(video, "F C W H -> 1 F C W H")
	        if video.shape[0] > 3 and video.shape[-1] == 3:
	            return rearrange(video, "F W H C -> 1 F C W H")
	        raise NotImplementedError(f"Unexpected video input format: {video.shape}")

	    def extract_anchor_frames(self, video, input_range, inference_params: InferenceParams):
	        """
	        Extracts anchor frames from the input video based on the provided inference parameters.

	        Parameters:
	        - video: torch.Tensor
	            The input video tensor, either channel-second (F C W H) or channel-last (F W H C),
	            with more than 3 frames.
	        - input_range: list
	            The pixel value range of input video.
	        - inference_params: InferenceParams
	            An object containing inference parameters. Its ``anchor_frames`` string is either
	            a single frame index, or a range "a:b" selecting frames a up to, but not
	            including, b (Python slice semantics).

	        Returns:
	        - torch.Tensor
	            The extracted anchor frames from the input video.

	        Raises:
	        - NotImplementedError
	            If the video layout is not recognised.
	        """
	        video = self._to_batched_video(video, input_range)

	        if ":" in inference_params.anchor_frames:
	            bounds = [int(bound) for bound in inference_params.anchor_frames.split(":")]
	            assert len(bounds) == 2, "Anchor frames encoding wrong."
	            return video[:, bounds[0]:bounds[1]]

	        anchor_frame = int(inference_params.anchor_frames)
	        return video[:, anchor_frame].unsqueeze(0)

	    def extract_ctrl_frames(self, video: torch.Tensor, input_range: List[int], inference_params: InferenceParams):
	        """
	        Extracts control frames from the input video.

	        Parameters:
	        - video: torch.Tensor
	            The input video tensor, either channel-second (F C W H) or channel-last (F W H C),
	            with more than 3 frames.
	        - input_range: list
	            The pixel value range of input video.
	        - inference_params: InferenceParams
	            An object containing inference parameters.

	        Returns:
	        - torch.Tensor
	            The last ``num_conditional_frames`` frames of the video, in range [-1, 1].

	        Raises:
	        - NotImplementedError
	            If the video layout is not recognised.
	        """
	        video = self._to_batched_video(video, input_range)

	        # return the last num_conditional_frames frames
	        return video[:, -inference_params.num_conditional_frames:]

	    def _autoregressive_generation(self, initial_generation: Union[torch.Tensor, List[torch.Tensor]], inference_params: InferenceParams):
	        """
	        Perform autoregressive generation of video chunks based on the initial generation and inference parameters.

	        Parameters:
	        - initial_generation: torch.Tensor or list of torch.Tensor
	            The initial generation or list of initial generation video chunks, range [-1, 1].
	            A passed list is not modified.
	        - inference_params: InferenceParams
	            An object containing inference parameters.

	        Returns:
	        - The concatenation of all chunks as produced by ``result_processor.concat_chunks``,
	          converted to range [0, 255].
	        """
	        # input is [-1,1] float
	        # Work on a copy so that appending new chunks does not mutate the caller's list.
	        if isinstance(initial_generation, list):
	            result_chunks = list(initial_generation)
	        else:
	            result_chunks = [initial_generation]

	        # make sure the chunks are channel-second; the layout is detected on the first chunk
	        first_chunk = result_chunks[0]
	        if (first_chunk.shape[1] > 3) and (first_chunk.shape[-1] == 3):
	            result_chunks = [rearrange(chunk, "F W H C -> F C W H") for chunk in result_chunks]

	        # generating chunk by conditioning on the previous chunks
	        for _ in tqdm(range(inference_params.n_autoregressive_generations), desc="StreamingSVD"):

	            # extract anchor frames based on the entire, so far generated, video
	            # note that we do not use anchor frames in StreamingSVD (apart from the anchor frame already used by SVD).
	            anchor_frames = self.extract_anchor_frames(
	                video=torch.cat(result_chunks),
	                inference_params=inference_params,
	                input_range=[-1, 1],
	            )

	            # extract control frames based on the last generated chunk
	            ctrl_frames = self.extract_ctrl_frames(
	                video=result_chunks[-1],
	                input_range=[-1, 1],
	                inference_params=inference_params,
	            )

	            # select the anchor frame for svd
	            # NOTE(review): int(...) only works for the single-index form of anchor_frames, not "a:b".
	            svd_input_frame = result_chunks[0][int(inference_params.anchor_frames)]

	            # generate the next chunk
	            # result is [F, C, H, W], range is [-1,1] float.
	            result = self._generate_conditional_output(
	                svd_input_frame=svd_input_frame,
	                inference_params=inference_params,
	                anchor_frames=anchor_frames,
	                ctrl_frames=ctrl_frames,
	            )

	            # from each generation, we keep all frames except for the first <num_conditional_frames> frames
	            result_chunks.append(result[inference_params.num_conditional_frames:])
	            torch.cuda.empty_cache()

	        # concat all chunks to one long video
	        result_chunks = [
	            result_processor.convert_range(chunk, output_range=[0, 255], input_range=[-1, 1])
	            for chunk in result_chunks
	        ]
	        result = result_processor.concat_chunks(result_chunks)
	        torch.cuda.empty_cache()
	        return result

	    def ensure_image_ratio(self, source_image: Image.Image, target_aspect_ratio=16/9):
	        """
	        Pad the image to the target aspect ratio and fill the padded area by inpainting.

	        Parameters:
	        - source_image: PIL.Image.Image
	            The input image.
	        - target_aspect_ratio: float
	            Width / height ratio at which the image is returned unchanged.

	        Returns:
	        - tuple (image, mask)
	            ``(source_image, None)`` if the ratio already matches; otherwise the outpainted
	            image and the inverted, padded alpha mask.
	        """
	        if math.isclose(source_image.width / source_image.height, target_aspect_ratio):
	            return source_image, None

	        image = source_image.copy().convert("RGBA")
	        mask = image.split()[-1]
	        image = image.convert("RGB")
	        # NOTE(review): target_aspect_ratio is not forwarded; the helper presumably applies
	        # its own default ratio -- confirm against utils.inference_utils.
	        padding = get_padding_for_aspect_ratio(image)

	        mask_padded = TF.pad(mask, padding)
	        mask_padded_size = mask_padded.size
	        outpainting_mask = TF.invert(mask_padded)
	        mask_padded_resized = TF.resize(mask_padded, (512, 512),
	                                        interpolation=TF.InterpolationMode.NEAREST)
	        mask_padded_resized = TF.invert(mask_padded_resized)

	        # image
	        padded_input_image = TF.pad(image, padding, padding_mode="reflect")
	        resized_image = TF.resize(padded_input_image, (512, 512))

	        # Start the inpainting from the noised reflect-padded image instead of from pure noise.
	        # Use the configured device (previously hard-coded via .cuda()).
	        image_tensor = self.inpaint_pipe.image_processor.preprocess(
	            resized_image).to(device=self.device, dtype=torch.float16)
	        latent_tensor = self.inpaint_pipe._encode_vae_image(image_tensor, None)
	        self.inpaint_pipe.scheduler.set_timesteps(999)
	        noisy_latent_tensor = self.inpaint_pipe.scheduler.add_noise(
	            latent_tensor,
	            torch.randn_like(latent_tensor),
	            self.inpaint_pipe.scheduler.timesteps[:1],
	        )

	        prompt = self.blip(source_image)
	        if prompt.startswith("there is "):
	            prompt = prompt[len("there is "):]

	        output_image_normalized_size = self.inpaint_pipe(
	            prompt=prompt,
	            image=resized_image,
	            mask_image=mask_padded_resized,
	            latents=noisy_latent_tensor,
	        ).images[0]

	        # PIL sizes are (width, height) while TF.resize expects (height, width).
	        output_image_extended_size = TF.resize(
	            output_image_normalized_size, mask_padded_size[::-1])

	        # Blur the mask border to blend the inpainted area with the original image.
	        blured_outpainting_mask = outpainting_mask.filter(
	            ImageFilter.GaussianBlur(radius=5))

	        final_image = Image.composite(
	            output_image_extended_size, padded_input_image, blured_outpainting_mask)
	        return final_image, outpainting_mask

	    def image_to_video(self, batch, inference_params: InferenceParams, batch_idx):
	        """
	        Performs image to video based on the input batch and inference parameters.
	        It runs the SVD pipeline once to generate the first chunk, then auto-regressively applies StreamingSVD.

	        Parameters:
	        - batch: dict
	            The input batch; ``batch["image"][0]`` is the start image.
	        - inference_params: InferenceParams
	            An object containing inference parameters.
	        - batch_idx: int
	            The index of the batch (currently unused).

	        Returns:
	        - tuple (video, scaled_outpainted_image, expanded_size)
	            The generated video, the 1024x576 image the first chunk was generated from, and
	            the second return value of ``resize_and_crop``.
	        """
	        batch_key = "image"
	        input_image = PIL.Image.fromarray(batch[batch_key][0].cpu().numpy())
	        # TODO remove conversion forth and back

	        outpainted_image, _ = self.ensure_image_ratio(input_image)

	        scaled_outpainted_image, expanded_size = resize_and_crop(outpainted_image)
	        assert scaled_outpainted_image.width == 1024 and scaled_outpainted_image.height == 576, f"Wrong shape for file {batch[batch_key]} with shape {scaled_outpainted_image.width}:{scaled_outpainted_image.height}."

	        # Generating first chunk
	        with torch.autocast(device_type="cuda", enabled=False):
	            video_chunks = self.svd_pipeline(
	                scaled_outpainted_image, decode_chunk_size=8).frames[0]

	        video_chunks = torch.stack([ToTensor()(frame) for frame in video_chunks])
	        video_chunks = video_chunks * 2.0 - 1  # [-1,1], float

	        video_chunks = video_chunks.to(self.device)

	        video = self._autoregressive_generation(
	            initial_generation=video_chunks,
	            inference_params=inference_params)

	        return video, scaled_outpainted_image, expanded_size

	    def generate_output(self, batch, batch_idx, inference_params: InferenceParams):
	        """
	        Generate output video based on the input batch and inference parameters.

	        Parameters:
	        - batch: dict
	            The input batch containing data for generating the output video.
	        - batch_idx: int
	            The index of the batch (currently unused; ``batch["sample_id"]`` is used instead).
	        - inference_params: InferenceParams
	            An object containing inference parameters.

	        Returns:
	        - torch.Tensor
	            The generated video. Note the result is also accessible, as a NumPy array,
	            via self.trainer.generated_video
	        """
	        sample_id = batch["sample_id"].item()
	        video, scaled_outpainted_image, expanded_size = self.image_to_video(
	            batch, inference_params=inference_params, batch_idx=sample_id)

	        # Tensor.numpy() only works for CPU tensors that do not require grad.
	        self.trainer.generated_video = video.detach().cpu().numpy()
	        self.trainer.expanded_size = expanded_size
	        self.trainer.scaled_outpainted_image = scaled_outpainted_image
	        return video