import os
import shutil
from enum import Enum
import cv2
import einops
import gradio as gr
import numpy as np
import torch
import torch.nn.functional as F
import torchvision.transforms as T
from blendmodes.blend import BlendType, blendLayers
from PIL import Image
from pytorch_lightning import seed_everything
from safetensors.torch import load_file
from skimage import exposure
import src.import_util # noqa: F401
from ControlNet.annotator.canny import CannyDetector
from ControlNet.annotator.hed import HEDdetector
from ControlNet.annotator.midas import MidasDetector
from ControlNet.annotator.util import HWC3
from ControlNet.cldm.model import create_model, load_state_dict
from gmflow_module.gmflow.gmflow import GMFlow
from flow.flow_utils import get_warped_and_mask
from sd_model_cfg import model_dict
from src.config import RerenderConfig
from src.controller import AttentionControl
from src.ddim_v_hacked import DDIMVSampler
from src.img_util import find_flat_region, numpy2tensor
from src.video_util import (frame_to_video, get_fps, get_frame_count,
prepare_frames)
import huggingface_hub
# Hugging Face Hub repository that hosts the example videos; it is also the
# default repository for the Stable Diffusion checkpoints.
REPO_NAME = 'Anonymous-sub/Rerender'

# Fetch the example videos into ./videos when the module is imported.
for example_video in (
        'pexels-koolshooters-7322716.mp4',
        'pexels-antoni-shkraba-8048492-540x960-25fps.mp4',
        'pexels-cottonbro-studio-6649832-960x506-25fps.mp4',
):
    huggingface_hub.hf_hub_download(REPO_NAME,
                                    example_video,
                                    local_dir='videos')

# Reverse of ``model_dict``: checkpoint path -> name shown in the UI.
inversed_model_dict = {path: name for name, path in model_dict.items()}

# Shared image helpers: PIL -> uint8 tensor, and the blur used to soften
# the occlusion masks.
to_tensor = T.PILToTensor()
blur = T.GaussianBlur(kernel_size=(9, 9), sigma=(18, 18))

# Run on the GPU whenever one is visible to torch.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
class ProcessingState(Enum):
    """Stage of the translation pipeline that was run most recently.

    NULL (0): nothing has been generated yet.
    FIRST_IMG (1): the first key frame has been generated.
    KEY_IMGS (2): the key frames have been generated.
    """
    NULL, FIRST_IMG, KEY_IMGS = range(3)
# Upper bound for the key-frame sliders; override it with the MAX_KEYFRAME
# environment variable (defaults to 8). Deliberately parsed as a float.
MAX_KEYFRAME = float(os.getenv('MAX_KEYFRAME', 8))
class GlobalState:
    """Process-wide cache of the loaded models and first-frame results.

    Attributes:
        sd_model / control_type: name of the base model and ControlNet type
            the current sampler was built for (``None`` when nothing is
            loaded).
        ddim_v_sampler: ``DDIMVSampler`` wrapping the loaded model.
        detector_type / canny_thresholds / detector: the cached control-map
            detector and the settings it was built with.
        controller: ``AttentionControl`` used during sampling.
        processing_state: last pipeline stage that was run.
        first_result / first_img / color_corrections: outputs of the first
            key frame that the key-frame stage reuses.
        flow_model: GMFlow network used for optical-flow estimation.
    """

    # ControlNet checkpoint (inside the ``lllyasviel/ControlNet`` hub repo)
    # for every supported control type.
    _CONTROL_WEIGHTS = {
        'HED': 'models/control_sd15_hed.pth',
        'canny': 'models/control_sd15_canny.pth',
        'depth': 'models/control_sd15_depth.pth',
    }

    def __init__(self):
        self.sd_model = None
        self.control_type = None
        self.ddim_v_sampler = None
        self.detector_type = None
        self.canny_thresholds = None
        self.detector = None
        self.controller = None
        self.processing_state = ProcessingState.NULL
        # Filled in by the first-frame stage; initialised here so that the
        # key-frame stage can always test them against None.
        self.first_result = None
        self.first_img = None
        self.color_corrections = None

        flow_model = GMFlow(
            feature_channels=128,
            num_scales=1,
            upsample_factor=8,
            num_head=1,
            attention_type='swin',
            ffn_dim_expansion=4,
            num_transformer_layers=6,
        ).to(device)
        # Deserialize on the CPU; load_state_dict copies the weights into
        # the model that already lives on ``device``.
        checkpoint = torch.load('models/gmflow_sintel-0c07dcb3.pth',
                                map_location=lambda storage, loc: storage)
        weights = checkpoint['model'] if 'model' in checkpoint else checkpoint
        flow_model.load_state_dict(weights, strict=False)
        flow_model.eval()
        self.flow_model = flow_model

    def update_controller(self, inner_strength, mask_period, cross_period,
                          ada_period, warp_period):
        """Replace the attention controller with one using these settings."""
        self.controller = AttentionControl(inner_strength, mask_period,
                                           cross_period, ada_period,
                                           warp_period)

    def update_sd_model(self, sd_model, control_type):
        """Load the ControlNet + base model pair unless it is already loaded.

        Args:
            sd_model: key of ``model_dict`` naming the base model.
            control_type: 'HED', 'canny' or 'depth'; any other value leaves
                the ControlNet weights of the freshly created model
                untouched.
        """
        # The ControlNet weights depend on the control type, so both values
        # have to match for the cached sampler to be reusable.
        if sd_model == self.sd_model and control_type == self.control_type:
            return

        # Release the previous model before building the next one. This
        # also guarantees that a failed load cannot leave a stale sampler
        # behind that looks valid to the check above.
        self.clear_sd_model()

        model = create_model('./ControlNet/models/cldm_v15.yaml').cpu()
        control_weights = self._CONTROL_WEIGHTS.get(control_type)
        if control_weights is not None:
            model.load_state_dict(
                load_state_dict(huggingface_hub.hf_hub_download(
                    'lllyasviel/ControlNet', control_weights),
                                location=device))
        model.to(device)

        sd_model_path = model_dict[sd_model]
        if sd_model_path:
            repo_name = REPO_NAME
            # check if sd_model is repo_id/name otherwise use global REPO_NAME
            if sd_model.count('/') == 1:
                repo_name = sd_model

            model_ext = os.path.splitext(sd_model_path)[1]
            downloaded_model = huggingface_hub.hf_hub_download(
                repo_name, sd_model_path)
            if model_ext == '.safetensors':
                model.load_state_dict(load_file(downloaded_model),
                                      strict=False)
            elif model_ext in ('.ckpt', '.pth'):
                model.load_state_dict(
                    torch.load(downloaded_model)['state_dict'], strict=False)

        # Best effort: the fine-tuned VAE is optional, so a failure here
        # only produces a warning.
        try:
            model.first_stage_model.load_state_dict(torch.load(
                huggingface_hub.hf_hub_download(
                    'stabilityai/sd-vae-ft-mse-original',
                    'vae-ft-mse-840000-ema-pruned.ckpt'))['state_dict'],
                                                    strict=False)
        except Exception:
            print('Warning: We suggest you download the fine-tuned VAE',
                  'otherwise the generation quality will be degraded')

        self.ddim_v_sampler = DDIMVSampler(model)
        # Record what was loaded only once everything succeeded.
        self.sd_model = sd_model
        self.control_type = control_type

    def clear_sd_model(self):
        """Forget the loaded model and release cached CUDA memory."""
        self.sd_model = None
        self.control_type = None
        self.ddim_v_sampler = None
        if device == 'cuda':
            torch.cuda.empty_cache()

    def update_detector(self, control_type, canny_low=100, canny_high=200):
        """Build the control-map detector unless the cached one still fits.

        Args:
            control_type: 'HED', 'canny' or 'depth'; any other value keeps
                the current detector unchanged.
            canny_low: lower Canny threshold (only used for 'canny').
            canny_high: upper Canny threshold (only used for 'canny').
        """
        thresholds = (canny_low, canny_high)
        # The Canny thresholds are baked into the detector closure, so a
        # cached Canny detector is only reusable with identical thresholds.
        if self.detector_type == control_type and (
                control_type != 'canny'
                or self.canny_thresholds == thresholds):
            return

        if control_type == 'HED':
            detector = HEDdetector()
        elif control_type == 'canny':
            canny_detector = CannyDetector()

            def apply_canny(x):
                return canny_detector(x, canny_low, canny_high)

            detector = apply_canny
        elif control_type == 'depth':
            midas = MidasDetector()

            def apply_midas(x):
                # Only the first element of the MiDaS output is used.
                detected_map, _ = midas(x)
                return detected_map

            detector = apply_midas
        else:
            return

        self.detector = detector
        # Remember what was built so that the check above can short-circuit.
        self.detector_type = control_type
        self.canny_thresholds = thresholds
# Single model/result cache shared by every request.
global_state = GlobalState()
# Path and frame count of the video currently selected in the UI; both stay
# None until a video has been provided.
global_video_path = video_frame_count = None
def create_cfg(input_path, prompt, image_resolution, control_strength,
               color_preserve, left_crop, right_crop, top_crop, bottom_crop,
               control_type, low_threshold, high_threshold, ddim_steps, scale,
               seed, sd_model, a_prompt, n_prompt, interval, keyframe_count,
               x0_strength, use_constraints, cross_start, cross_end,
               style_update_freq, warp_start, warp_end, mask_start, mask_end,
               ada_start, ada_end, mask_strength, inner_strength,
               smooth_boundary):
    """Turn the raw UI values into a ``RerenderConfig``.

    The parameters mirror the UI controls one to one. ``use_constraints``
    is the collection of ticked cross-frame constraints; every constraint
    that is not ticked gets the period ``(1, 0)`` instead of its sliders.
    The output video is placed at ``result/<video name>/blend.mp4``.

    Returns:
        The populated ``RerenderConfig``.
    """
    # Start after end -- presumably an empty interval that keeps the
    # constraint switched off; confirm against AttentionControl.
    disabled = (1, 0)
    warp_period = ((warp_start, warp_end)
                   if 'shape-aware fusion' in use_constraints else disabled)
    mask_period = ((mask_start, mask_end)
                   if 'pixel-aware fusion' in use_constraints else disabled)
    ada_period = ((ada_start, ada_end)
                  if 'color-aware AdaIN' in use_constraints else disabled)

    # File name up to its first dot, used as the result sub-directory.
    video_name = os.path.basename(input_path).split('.')[0]
    output_path = os.path.join('result', video_name, 'blend.mp4')

    settings = dict(
        a_prompt=a_prompt,
        n_prompt=n_prompt,
        frame_count=2 + keyframe_count * interval,
        interval=interval,
        crop=[left_crop, right_crop, top_crop, bottom_crop],
        sd_model=sd_model,
        ddim_steps=ddim_steps,
        scale=scale,
        control_type=control_type,
        control_strength=control_strength,
        canny_low=low_threshold,
        canny_high=high_threshold,
        seed=seed,
        image_resolution=image_resolution,
        x0_strength=x0_strength,
        style_update_freq=style_update_freq,
        cross_period=(cross_start, cross_end),
        warp_period=warp_period,
        mask_period=mask_period,
        ada_period=ada_period,
        mask_strength=mask_strength,
        inner_strength=inner_strength,
        smooth_boundary=smooth_boundary,
        color_preserve=color_preserve,
    )

    cfg = RerenderConfig()
    cfg.create_from_parameters(input_path, output_path, prompt, **settings)
    return cfg
def cfg_to_input(filename):
    """Load a config file and flatten it into one row of UI input values.

    The row starts with the video path and then lists the values in the
    order of ``create_cfg``'s parameters after ``input_path``. All three
    cross-frame constraints are always reported as enabled, and a
    checkpoint that is missing from ``inversed_model_dict`` falls back to
    'Stable Diffusion 1.5'.
    """
    cfg = RerenderConfig()
    cfg.create_from_path(filename)

    enabled_constraints = [
        'shape-aware fusion', 'pixel-aware fusion', 'color-aware AdaIN'
    ]
    model_name = inversed_model_dict.get(cfg.sd_model, 'Stable Diffusion 1.5')
    # Inverse of ``frame_count = 2 + keyframe_count * interval``.
    n_keyframes = (cfg.frame_count - 2) // cfg.interval

    row = [
        cfg.input_path, cfg.prompt, cfg.image_resolution,
        cfg.control_strength, cfg.color_preserve
    ]
    row.extend(cfg.crop)
    row.extend([
        cfg.control_type, cfg.canny_low, cfg.canny_high, cfg.ddim_steps,
        cfg.scale, cfg.seed, model_name, cfg.a_prompt, cfg.n_prompt,
        cfg.interval, n_keyframes, cfg.x0_strength, enabled_constraints
    ])
    row.extend(cfg.cross_period)
    row.append(cfg.style_update_freq)
    for period in (cfg.warp_period, cfg.mask_period, cfg.ada_period):
        row.extend(period)
    row.extend([cfg.mask_strength, cfg.inner_strength, cfg.smooth_boundary])
    return row
def setup_color_correction(image):
    """Return a LAB copy of ``image`` to use as a colour reference.

    ``image`` is expected to be an RGB image offering ``copy()`` (a PIL
    image in this file).
    """
    rgb_pixels = np.asarray(image.copy())
    return cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2LAB)
def apply_color_correction(correction, original_image):
    """Recolour ``original_image`` to follow the ``correction`` reference.

    The image is histogram-matched to ``correction`` in LAB space, converted
    back to RGB, and finally blended with the untouched original using the
    LUMINOSITY blend mode.

    Args:
        correction: LAB reference from ``setup_color_correction``.
        original_image: RGB PIL image to recolour.

    Returns:
        The blended image produced by ``blendLayers``.
    """
    source_lab = cv2.cvtColor(np.asarray(original_image), cv2.COLOR_RGB2LAB)
    matched_lab = exposure.match_histograms(source_lab,
                                            correction,
                                            channel_axis=2)
    matched_rgb = cv2.cvtColor(matched_lab, cv2.COLOR_LAB2RGB).astype('uint8')
    recoloured = Image.fromarray(matched_rgb)
    return blendLayers(recoloured, original_image, BlendType.LUMINOSITY)
@torch.no_grad()
def process(*args):
    """Run both stages with the same UI values.

    Returns:
        ``(first key frame, path of the key-frame video)``.
    """
    # Tuple elements are evaluated left to right, so the first frame is
    # always produced before the key frames.
    return process1(*args), process2(*args)
@torch.no_grad()
def process0(*args):
    """Entry point for the examples: the first value is the video path.

    The path is stored in ``global_video_path`` and the remaining values are
    forwarded to ``process``.
    """
    global global_video_path
    global_video_path, ui_values = args[0], args[1:]
    return process(*ui_values)
@torch.no_grad()
def process1(*args):
    """Translate the first key frame of the current video.

    ``args`` are the UI values in the order of ``create_cfg``'s parameters
    after ``input_path``. The decoded result, the source frame and the
    optional colour reference are stored on ``global_state`` for the
    key-frame stage, and the image is saved as ``first.jpg`` in
    ``cfg.first_dir``.

    Returns:
        The translated first frame as a uint8 array.

    Raises:
        gr.Error: no video has been selected, or its first frame cannot be
            read.
    """
    global global_video_path
    global global_state
    if global_video_path is None:
        raise gr.Error('Please upload a video first.')

    cfg = create_cfg(global_video_path, *args)
    global_state.update_sd_model(cfg.sd_model, cfg.control_type)
    global_state.update_controller(cfg.inner_strength, cfg.mask_period,
                                   cfg.cross_period, cfg.ada_period,
                                   cfg.warp_period)
    global_state.update_detector(cfg.control_type, cfg.canny_low,
                                 cfg.canny_high)
    # The previous first frame is about to be replaced; the state is only
    # flagged as ready again once the new result has been stored, so a
    # failure below cannot unlock the key-frame stage.
    global_state.processing_state = ProcessingState.NULL

    prepare_frames(cfg.input_path, cfg.input_dir, cfg.image_resolution,
                   cfg.crop)

    ddim_v_sampler = global_state.ddim_v_sampler
    model = ddim_v_sampler.model
    detector = global_state.detector
    controller = global_state.controller
    model.control_scales = [cfg.control_strength] * 13
    model.to(device)

    num_samples = 1
    eta = 0.0
    imgs = [
        os.path.join(cfg.input_dir, name)
        for name in sorted(os.listdir(cfg.input_dir))
    ]
    model.cond_stage_model.device = device

    # cv2.imread signals an unreadable file by returning None.
    frame = cv2.imread(imgs[0]) if imgs else None
    if frame is None:
        raise gr.Error('Cannot read the first frame of the input video.')
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    img = HWC3(frame)
    H, W, _ = img.shape

    img_ = numpy2tensor(img)

    def generate_first_img(img_, strength):
        """Sample one image from ``img_``; the control map comes from ``img``.

        Returns the decoded tensor and its uint8 (b, h, w, c) version.
        """
        encoder_posterior = model.encode_first_stage(img_.to(device))
        x0 = model.get_first_stage_encoding(encoder_posterior).detach()

        detected_map = detector(img)
        detected_map = HWC3(detected_map)

        control = torch.from_numpy(
            detected_map.copy()).float().to(device) / 255.0
        control = torch.stack([control for _ in range(num_samples)], dim=0)
        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
        cond = {
            'c_concat': [control],
            'c_crossattn': [
                model.get_learned_conditioning(
                    [cfg.prompt + ', ' + cfg.a_prompt] * num_samples)
            ]
        }
        un_cond = {
            'c_concat': [control],
            'c_crossattn':
            [model.get_learned_conditioning([cfg.n_prompt] * num_samples)]
        }
        shape = (4, H // 8, W // 8)

        controller.set_task('initfirst')
        seed_everything(cfg.seed)

        samples, _ = ddim_v_sampler.sample(
            cfg.ddim_steps,
            num_samples,
            shape,
            cond,
            verbose=False,
            eta=eta,
            unconditional_guidance_scale=cfg.scale,
            unconditional_conditioning=un_cond,
            controller=controller,
            x0=x0,
            strength=strength)
        x_samples = model.decode_first_stage(samples)
        x_samples_np = (
            einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 +
            127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
        return x_samples, x_samples_np

    if cfg.color_preserve:
        # Do not let a colour reference from an earlier run leak into the
        # key-frame stage.
        global_state.color_corrections = None
        x_samples, x_samples_np = generate_first_img(img_,
                                                     1 - cfg.x0_strength)
    else:
        # When not preserve color, draw a different frame at first and use
        # its color to redraw the first frame.
        _, draft_np = generate_first_img(img_, -1)
        color_corrections = setup_color_correction(
            Image.fromarray(draft_np[0]))
        global_state.color_corrections = color_corrections
        img_ = apply_color_correction(color_corrections,
                                      Image.fromarray(img))
        # PIL -> tensor in [-1, 1] with a batch axis, first 3 channels only.
        img_ = to_tensor(img_).unsqueeze(0)[:, :3] / 127.5 - 1
        x_samples, x_samples_np = generate_first_img(img_,
                                                     1 - cfg.x0_strength)

    global_state.first_result = x_samples
    global_state.first_img = img
    global_state.processing_state = ProcessingState.FIRST_IMG

    Image.fromarray(x_samples_np[0]).save(
        os.path.join(cfg.first_dir, 'first.jpg'))

    return x_samples_np[0]
@torch.no_grad()
def process2(*args):
    """Translate the remaining key frames and assemble them into a video.

    ``args`` are the UI values in the order of ``create_cfg``'s parameters
    after ``input_path``. Every key frame is written to ``cfg.key_dir`` as
    ``<frame id>.png``; the frames are then combined into ``key.mp4`` inside
    ``cfg.work_dir``.

    Returns:
        Path of the key-frame video.

    Raises:
        gr.Error: the first key frame has not been generated beforehand.
    """
    global global_state
    global global_video_path
    if global_state.processing_state != ProcessingState.FIRST_IMG:
        raise gr.Error('Please generate the first key image before generating'
                       ' all key images')

    cfg = create_cfg(global_video_path, *args)
    global_state.update_sd_model(cfg.sd_model, cfg.control_type)
    global_state.update_detector(cfg.control_type, cfg.canny_low,
                                 cfg.canny_high)
    global_state.processing_state = ProcessingState.KEY_IMGS

    # reset key dir (a directory that does not exist yet is not an error)
    shutil.rmtree(cfg.key_dir, ignore_errors=True)
    os.makedirs(cfg.key_dir, exist_ok=True)

    ddim_v_sampler = global_state.ddim_v_sampler
    model = ddim_v_sampler.model
    detector = global_state.detector
    controller = global_state.controller
    flow_model = global_state.flow_model
    model.control_scales = [cfg.control_strength] * 13

    num_samples = 1
    eta = 0.0
    # Never cleared in this function, so the warp passed to the controller
    # is always the one relative to the first frame.
    firstx0 = True
    pixelfusion = cfg.use_mask
    imgs = sorted(os.listdir(cfg.input_dir))
    imgs = [os.path.join(cfg.input_dir, img) for img in imgs]

    first_result = global_state.first_result
    first_img = global_state.first_img
    pre_result = first_result
    pre_img = first_img
    # The attribute only exists once a first frame has been generated
    # without colour preservation; treat its absence like None.
    color_corrections = getattr(global_state, 'color_corrections', None)

    # Never index past the frames that were actually extracted.
    last_start = min(len(imgs), cfg.frame_count) - 1
    for i in range(0, last_start, cfg.interval):
        cid = i + 1
        frame = cv2.imread(imgs[i + 1])
        print(cid)
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img = HWC3(frame)
        H, W, _ = img.shape

        if cfg.color_preserve or color_corrections is None:
            img_ = numpy2tensor(img)
        else:
            img_ = apply_color_correction(color_corrections,
                                          Image.fromarray(img))
            img_ = to_tensor(img_).unsqueeze(0)[:, :3] / 127.5 - 1
        encoder_posterior = model.encode_first_stage(img_.to(device))
        x0 = model.get_first_stage_encoding(encoder_posterior).detach()

        detected_map = detector(img)
        detected_map = HWC3(detected_map)

        control = torch.from_numpy(
            detected_map.copy()).float().to(device) / 255.0
        control = torch.stack([control for _ in range(num_samples)], dim=0)
        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
        cond = {
            'c_concat': [control],
            'c_crossattn': [
                model.get_learned_conditioning(
                    [cfg.prompt + ', ' + cfg.a_prompt] * num_samples)
            ]
        }
        un_cond = {
            'c_concat': [control],
            'c_crossattn':
            [model.get_learned_conditioning([cfg.n_prompt] * num_samples)]
        }
        shape = (4, H // 8, W // 8)

        # Warp the previous result and the first result onto this frame and
        # derive softened blend masks from the returned occlusion maps.
        image1 = torch.from_numpy(pre_img).permute(2, 0, 1).float()
        image2 = torch.from_numpy(img).permute(2, 0, 1).float()
        warped_pre, bwd_occ_pre, bwd_flow_pre = get_warped_and_mask(
            flow_model, image1, image2, pre_result, False)
        blend_mask_pre = blur(
            F.max_pool2d(bwd_occ_pre, kernel_size=9, stride=1, padding=4))
        blend_mask_pre = torch.clamp(blend_mask_pre + bwd_occ_pre, 0, 1)

        image1 = torch.from_numpy(first_img).permute(2, 0, 1).float()
        warped_0, bwd_occ_0, bwd_flow_0 = get_warped_and_mask(
            flow_model, image1, image2, first_result, False)
        blend_mask_0 = blur(
            F.max_pool2d(bwd_occ_0, kernel_size=9, stride=1, padding=4))
        blend_mask_0 = torch.clamp(blend_mask_0 + bwd_occ_0, 0, 1)

        # Flow and mask are brought down to 1/8 resolution, matching the
        # latent shape used above.
        if firstx0:
            mask = 1 - F.max_pool2d(blend_mask_0, kernel_size=8)
            controller.set_warp(
                F.interpolate(bwd_flow_0 / 8.0,
                              scale_factor=1. / 8,
                              mode='bilinear'), mask)
        else:
            mask = 1 - F.max_pool2d(blend_mask_pre, kernel_size=8)
            controller.set_warp(
                F.interpolate(bwd_flow_pre / 8.0,
                              scale_factor=1. / 8,
                              mode='bilinear'), mask)

        controller.set_task('keepx0, keepstyle')
        seed_everything(cfg.seed)
        samples, _ = ddim_v_sampler.sample(
            cfg.ddim_steps,
            num_samples,
            shape,
            cond,
            verbose=False,
            eta=eta,
            unconditional_guidance_scale=cfg.scale,
            unconditional_conditioning=un_cond,
            controller=controller,
            x0=x0,
            strength=1 - cfg.x0_strength)
        direct_result = model.decode_first_stage(samples)

        if not pixelfusion:
            pre_result = direct_result
            pre_img = img
            viz = (
                einops.rearrange(direct_result, 'b c h w -> b h w c') * 127.5 +
                127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
        else:
            # Fuse the direct result with both warped references.
            blend_results = (1 - blend_mask_pre
                             ) * warped_pre + blend_mask_pre * direct_result
            blend_results = (
                1 - blend_mask_0) * warped_0 + blend_mask_0 * blend_results

            bwd_occ = 1 - torch.clamp(1 - bwd_occ_pre + 1 - bwd_occ_0, 0, 1)
            blend_mask = blur(
                F.max_pool2d(bwd_occ, kernel_size=9, stride=1, padding=4))
            blend_mask = 1 - torch.clamp(blend_mask + bwd_occ, 0, 1)

            # Encode the fused image, measure what one decode/encode round
            # trip changes, and push the latent the opposite way.
            encoder_posterior = model.encode_first_stage(blend_results)
            xtrg = model.get_first_stage_encoding(encoder_posterior).detach()
            blend_results_rec = model.decode_first_stage(xtrg)
            encoder_posterior = model.encode_first_stage(blend_results_rec)
            xtrg_rec = model.get_first_stage_encoding(
                encoder_posterior).detach()
            xtrg_ = (xtrg + 1 * (xtrg - xtrg_rec))
            blend_results_rec_new = model.decode_first_stage(xtrg_)
            # Regions where the compensated reconstruction is still far off.
            tmp = (abs(blend_results_rec_new - blend_results).mean(
                dim=1, keepdims=True) > 0.25).float()
            mask_x = F.max_pool2d((F.interpolate(tmp,
                                                 scale_factor=1 / 8.,
                                                 mode='bilinear') > 0).float(),
                                  kernel_size=3,
                                  stride=1,
                                  padding=1)

            mask = (1 - F.max_pool2d(1 - blend_mask, kernel_size=8))

            if cfg.smooth_boundary:
                noise_rescale = find_flat_region(mask)
            else:
                noise_rescale = torch.ones_like(mask)

            # One mask per DDIM step: the fusion mask strictly inside the
            # mask period, None everywhere else. The loop variable must
            # not reuse ``i``: the frame index is still needed below.
            masks = []
            for step in range(cfg.ddim_steps):
                if (step <= cfg.ddim_steps * cfg.mask_period[0]
                        or step >= cfg.ddim_steps * cfg.mask_period[1]):
                    masks.append(None)
                else:
                    masks.append(mask * cfg.mask_strength)

            xtrg = (xtrg + (1 - mask_x) * (xtrg - xtrg_rec)) * mask

            tasks = 'keepstyle, keepx0'
            if not firstx0:
                tasks += ', updatex0'
            # NOTE(review): ``i`` is the frame offset (a multiple of
            # cfg.interval), while the UI describes the frequency in key
            # frames -- confirm which unit is intended.
            if i % cfg.style_update_freq == 0:
                tasks += ', updatestyle'
            controller.set_task(tasks, 1.0)

            seed_everything(cfg.seed)
            samples, _ = ddim_v_sampler.sample(
                cfg.ddim_steps,
                num_samples,
                shape,
                cond,
                verbose=False,
                eta=eta,
                unconditional_guidance_scale=cfg.scale,
                unconditional_conditioning=un_cond,
                controller=controller,
                x0=x0,
                strength=1 - cfg.x0_strength,
                xtrg=xtrg,
                mask=masks,
                noise_rescale=noise_rescale)
            x_samples = model.decode_first_stage(samples)
            pre_result = x_samples
            pre_img = img

            viz = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 +
                   127.5).cpu().numpy().clip(0, 255).astype(np.uint8)

        Image.fromarray(viz[0]).save(
            os.path.join(cfg.key_dir, f'{cid:04d}.png'))

    key_video_path = os.path.join(cfg.work_dir, 'key.mp4')
    # An interval above the source frame rate would floor to 0 fps; keep at
    # least 1.
    fps = max(1, get_fps(cfg.input_path) // cfg.interval)
    frame_to_video(key_video_path, cfg.key_dir, fps, False)

    return key_video_path
# Markdown rendered at the top of the page.
DESCRIPTION = '''
## [Rerender A Video](https://github.com/williamyang1991/Rerender_A_Video)
### This space provides the function of key frame translation. Full code for full video translation will be released upon the publication of the paper.
### To avoid overload, we set limitations to the **maximum frame number** (8) and the maximum frame resolution (512x768).
### The running time of a video of size 512x640 is about 1 minute per keyframe under T4 GPU.
### How to use:
1. **Run 1st Key Frame**: only translate the first frame, so you can adjust the prompts/models/parameters to find your ideal output appearance before running the whole video.
2. **Run Key Frames**: translate all the key frames based on the settings of the first frame
3. **Run All**: **Run 1st Key Frame** and **Run Key Frames**
4. **Run Propagation**: propagate the key frames to other frames for full video translation. This function is supported [here](https://github.com/williamyang1991/Rerender_A_Video#webui-recommended)
### Tips:
1. This method cannot handle large or quick motions where the optical flow is hard to estimate. **Videos with stable motions are preferred**.
2. Pixel-aware fusion may not work for large or quick motions.
3. Try different color-aware AdaIN settings and even disable it to avoid color jittering.
4. `revAnimated_v11` model for non-photorealistic style, `realisticVisionV20_v20` model for photorealistic style.
5. To use your own SD/LoRA model, you may clone the space and specify your model with [sd_model_cfg.py](https://huggingface.co/spaces/Anonymous-sub/Rerender/blob/main/sd_model_cfg.py).
6. This method is based on the original SD model. You may need to [convert](https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py) Diffuser/Automatic1111 models to the original one.
**This code is for research purpose and non-commercial use only.**
[![Duplicate this Space](https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg)](https://huggingface.co/spaces/Anonymous-sub/Rerender?duplicate=true) for no queue on your own hardware.
'''
# Markdown footer: repository link, citation, license and contact.
ARTICLE = r"""
If Rerender-A-Video is helpful, please help to ⭐ the Github Repo. Thanks!
[![GitHub Stars](https://img.shields.io/github/stars/williamyang1991/Rerender_A_Video?style=social)](https://github.com/williamyang1991/Rerender_A_Video)
---
📝 **Citation**
If our work is useful for your research, please consider citing:
```bibtex
@inproceedings{yang2023rerender,
title = {Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation},
author = {Yang, Shuai and Zhou, Yifan and Liu, Ziwei and Loy, Chen Change},
booktitle = {ACM SIGGRAPH Asia Conference Proceedings},
year = {2023},
}
```
📋 **License**
This project is licensed under S-Lab License 1.0.
Redistribution and use for non-commercial purposes should follow this license.
📧 **Contact**
If you have any questions, please feel free to reach out to me at williamyang@pku.edu.cn.
"""
# Gradio user interface. Left column: inputs and options; right column:
# results; below: the example gallery and the footer.
block = gr.Blocks().queue()
with block:
    with gr.Row():
        gr.Markdown(DESCRIPTION)
    with gr.Row():
        with gr.Column():
            input_path = gr.Video(label='Input Video',
                                  source='upload',
                                  format='mp4',
                                  visible=True)
            prompt = gr.Textbox(label='Prompt')
            seed = gr.Slider(label='Seed',
                             minimum=0,
                             maximum=2147483647,
                             step=1,
                             value=0,
                             randomize=True)
            run_button = gr.Button(value='Run All')
            with gr.Row():
                run_button1 = gr.Button(value='Run 1st Key Frame')
                run_button2 = gr.Button(value='Run Key Frames')
                run_button3 = gr.Button(value='Run Propagation')
            with gr.Accordion('Advanced options for the 1st frame translation',
                              open=False):
                image_resolution = gr.Slider(
                    label='Frame resolution',
                    minimum=256,
                    maximum=512,
                    value=512,
                    step=64,
                    info='To avoid overload, maximum 512')
                control_strength = gr.Slider(label='ControlNet strength',
                                             minimum=0.0,
                                             maximum=2.0,
                                             value=1.0,
                                             step=0.01)
                x0_strength = gr.Slider(
                    label='Denoising strength',
                    minimum=0.00,
                    maximum=1.05,
                    value=0.75,
                    step=0.05,
                    info=('0: fully recover the input.'
                          '1.05: fully rerender the input.'))
                color_preserve = gr.Checkbox(
                    label='Preserve color',
                    value=True,
                    info='Keep the color of the input video')
                with gr.Row():
                    left_crop = gr.Slider(label='Left crop length',
                                          minimum=0,
                                          maximum=512,
                                          value=0,
                                          step=1)
                    right_crop = gr.Slider(label='Right crop length',
                                           minimum=0,
                                           maximum=512,
                                           value=0,
                                           step=1)
                with gr.Row():
                    top_crop = gr.Slider(label='Top crop length',
                                         minimum=0,
                                         maximum=512,
                                         value=0,
                                         step=1)
                    bottom_crop = gr.Slider(label='Bottom crop length',
                                            minimum=0,
                                            maximum=512,
                                            value=0,
                                            step=1)
                with gr.Row():
                    control_type = gr.Dropdown(['HED', 'canny', 'depth'],
                                               label='Control type',
                                               value='HED')
                    low_threshold = gr.Slider(label='Canny low threshold',
                                              minimum=1,
                                              maximum=255,
                                              value=100,
                                              step=1)
                    high_threshold = gr.Slider(label='Canny high threshold',
                                               minimum=1,
                                               maximum=255,
                                               value=200,
                                               step=1)
                ddim_steps = gr.Slider(label='Steps',
                                       minimum=1,
                                       maximum=20,
                                       value=20,
                                       step=1,
                                       info='To avoid overload, maximum 20')
                scale = gr.Slider(label='CFG scale',
                                  minimum=0.1,
                                  maximum=30.0,
                                  value=7.5,
                                  step=0.1)
                sd_model_list = list(model_dict.keys())
                sd_model = gr.Dropdown(sd_model_list,
                                       label='Base model',
                                       value='Stable Diffusion 1.5')
                a_prompt = gr.Textbox(label='Added prompt',
                                      value='best quality, extremely detailed')
                n_prompt = gr.Textbox(
                    label='Negative prompt',
                    value=('longbody, lowres, bad anatomy, bad hands, '
                           'missing fingers, extra digit, fewer digits, '
                           'cropped, worst quality, low quality'))
            with gr.Accordion('Advanced options for the key frame translation',
                              open=False):
                interval = gr.Slider(
                    label='Key frame frequency (K)',
                    minimum=1,
                    maximum=MAX_KEYFRAME,
                    value=1,
                    step=1,
                    info='Uniformly sample the key frames every K frames')
                keyframe_count = gr.Slider(
                    label='Number of key frames',
                    minimum=1,
                    maximum=MAX_KEYFRAME,
                    value=1,
                    step=1,
                    # Report the limit that is actually configured.
                    info=('To avoid overload, maximum '
                          f'{MAX_KEYFRAME:g} key frames'))
                # No trailing comma here: it would turn the component into
                # a one-element tuple.
                use_constraints = gr.CheckboxGroup(
                    [
                        'shape-aware fusion', 'pixel-aware fusion',
                        'color-aware AdaIN'
                    ],
                    label='Select the cross-frame constraints to be used',
                    value=[
                        'shape-aware fusion', 'pixel-aware fusion',
                        'color-aware AdaIN'
                    ])
                with gr.Row():
                    cross_start = gr.Slider(
                        label='Cross-frame attention start',
                        minimum=0,
                        maximum=1,
                        value=0,
                        step=0.05)
                    cross_end = gr.Slider(label='Cross-frame attention end',
                                          minimum=0,
                                          maximum=1,
                                          value=1,
                                          step=0.05)
                style_update_freq = gr.Slider(
                    label='Cross-frame attention update frequency',
                    minimum=1,
                    maximum=100,
                    value=1,
                    step=1,
                    info=('Update the key and value for '
                          'cross-frame attention every N key frames '
                          '(recommend N*K>=10)'))
                with gr.Row():
                    warp_start = gr.Slider(label='Shape-aware fusion start',
                                           minimum=0,
                                           maximum=1,
                                           value=0,
                                           step=0.05)
                    warp_end = gr.Slider(label='Shape-aware fusion end',
                                         minimum=0,
                                         maximum=1,
                                         value=0.1,
                                         step=0.05)
                with gr.Row():
                    mask_start = gr.Slider(label='Pixel-aware fusion start',
                                           minimum=0,
                                           maximum=1,
                                           value=0.5,
                                           step=0.05)
                    mask_end = gr.Slider(label='Pixel-aware fusion end',
                                         minimum=0,
                                         maximum=1,
                                         value=0.8,
                                         step=0.05)
                with gr.Row():
                    ada_start = gr.Slider(label='Color-aware AdaIN start',
                                          minimum=0,
                                          maximum=1,
                                          value=0.8,
                                          step=0.05)
                    ada_end = gr.Slider(label='Color-aware AdaIN end',
                                        minimum=0,
                                        maximum=1,
                                        value=1,
                                        step=0.05)
                mask_strength = gr.Slider(label='Pixel-aware fusion strength',
                                          minimum=0,
                                          maximum=1,
                                          value=0.5,
                                          step=0.01)
                inner_strength = gr.Slider(
                    label='Pixel-aware fusion detail level',
                    minimum=0.5,
                    maximum=1,
                    value=0.9,
                    step=0.01,
                    info='Use a low value to prevent artifacts')
                smooth_boundary = gr.Checkbox(
                    label='Smooth fusion boundary',
                    value=True,
                    info='Select to prevent artifacts at boundary')

            with gr.Accordion('Example configs', open=True):
                config_dir = 'config'
                # Cached examples are addressed by position, so keep the
                # order of the configs stable between launches.
                config_list = sorted(os.listdir(config_dir))
                args_list = []
                for config in config_list:
                    try:
                        config_path = os.path.join(config_dir, config)
                        args = cfg_to_input(config_path)
                        args_list.append(args)
                    except FileNotFoundError:
                        # The video file does not exist, skipped
                        pass

                # Same order as create_cfg's parameters after input_path.
                ips = [
                    prompt, image_resolution, control_strength, color_preserve,
                    left_crop, right_crop, top_crop, bottom_crop, control_type,
                    low_threshold, high_threshold, ddim_steps, scale, seed,
                    sd_model, a_prompt, n_prompt, interval, keyframe_count,
                    x0_strength, use_constraints, cross_start, cross_end,
                    style_update_freq, warp_start, warp_end, mask_start,
                    mask_end, ada_start, ada_end, mask_strength,
                    inner_strength, smooth_boundary
                ]

        with gr.Column():
            result_image = gr.Image(label='Output first frame',
                                    type='numpy',
                                    interactive=False)
            result_keyframe = gr.Video(label='Output key frame video',
                                       format='mp4',
                                       interactive=False)
    with gr.Row():
        gr.Examples(examples=args_list,
                    inputs=[input_path, *ips],
                    fn=process0,
                    outputs=[result_image, result_keyframe],
                    cache_examples=True)

    gr.Markdown(ARTICLE)

    def _max_keyframes(frame_count, interval):
        """Largest key-frame count the video allows: at least 1, capped."""
        return min(max((frame_count - 2) // interval, 1), MAX_KEYFRAME)

    def input_uploaded(path):
        """Adapt both key-frame sliders to a freshly uploaded video.

        Raises gr.Error when the video has two frames or fewer.
        """
        frame_count = get_frame_count(path)
        if frame_count <= 2:
            raise gr.Error('The input video is too short! '
                           'Please input another video.')

        default_interval = min(10, frame_count - 2)
        max_keyframe = _max_keyframes(frame_count, default_interval)

        global video_frame_count
        video_frame_count = frame_count
        global global_video_path
        global_video_path = path

        return gr.Slider.update(value=default_interval,
                                maximum=frame_count - 2), gr.Slider.update(
                                    value=max_keyframe, maximum=max_keyframe)

    def input_changed(path):
        """Adapt the slider ranges whenever the selected video changes."""
        # The video component reports None once the video is cleared.
        if path is None:
            return gr.Slider.update(), gr.Slider.update()

        frame_count = get_frame_count(path)
        if frame_count <= 2:
            return gr.Slider.update(maximum=1), gr.Slider.update(maximum=1)

        default_interval = min(10, frame_count - 2)
        max_keyframe = _max_keyframes(frame_count, default_interval)

        global video_frame_count
        video_frame_count = frame_count
        global global_video_path
        global_video_path = path

        return gr.Slider.update(value=default_interval,
                                maximum=frame_count - 2), \
            gr.Slider.update(maximum=max_keyframe)

    def interval_changed(interval):
        """Recompute the key-frame limit for the chosen interval."""
        global video_frame_count
        if video_frame_count is None:
            return gr.Slider.update()

        max_keyframe = _max_keyframes(video_frame_count, interval)

        return gr.Slider.update(value=max_keyframe, maximum=max_keyframe)

    input_path.change(input_changed, input_path, [interval, keyframe_count])
    input_path.upload(input_uploaded, input_path, [interval, keyframe_count])
    interval.change(interval_changed, interval, keyframe_count)

    run_button.click(fn=process,
                     inputs=ips,
                     outputs=[result_image, result_keyframe])
    run_button1.click(fn=process1, inputs=ips, outputs=[result_image])
    run_button2.click(fn=process2, inputs=ips, outputs=[result_keyframe])

    def process3():
        """Placeholder for propagation, which this demo does not offer."""
        raise gr.Error(
            "Coming Soon. Full code for full video translation will be "
            "released upon the publication of the paper.")

    run_button3.click(fn=process3, outputs=[result_keyframe])

# One job at a time, with at most 20 requests waiting.
block.queue(concurrency_count=1, max_size=20)
block.launch(server_name='0.0.0.0')