Spaces:
Paused
Paused
eranlevinlt
committed on
Commit
•
f1a05f0
1
Parent(s):
af41e07
add app
Browse files
app.py
ADDED
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import spaces
|
2 |
+
import gradio as gr
|
3 |
+
import torch
|
4 |
+
from huggingface_hub import snapshot_download
|
5 |
+
|
6 |
+
from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
|
7 |
+
from xora.models.transformers.transformer3d import Transformer3DModel
|
8 |
+
from xora.models.transformers.symmetric_patchifier import SymmetricPatchifier
|
9 |
+
from xora.schedulers.rf import RectifiedFlowScheduler
|
10 |
+
from xora.pipelines.pipeline_xora_video import XoraVideoPipeline
|
11 |
+
from transformers import T5EncoderModel, T5Tokenizer
|
12 |
+
from xora.utils.conditioning_method import ConditioningMethod
|
13 |
+
from pathlib import Path
|
14 |
+
import safetensors.torch
|
15 |
+
import json
|
16 |
+
import numpy as np
|
17 |
+
import cv2
|
18 |
+
from PIL import Image
|
19 |
+
import tempfile
|
20 |
+
import os
|
21 |
+
|
22 |
+
# Load Hugging Face token if needed
|
23 |
+
hf_token = os.environ.get("HF_TOKEN")

# Local directory that holds the downloaded model snapshot; the download is
# skipped when the directory is already present.
model_path = "asset"
if not Path(model_path).exists():
    snapshot_download("Lightricks/Xora", local_dir=model_path, repo_type="model", token=hf_token)

# Per-component sub-directories inside the snapshot
vae_dir, unet_dir, scheduler_dir = (
    Path(model_path) / component for component in ("vae", "unet", "scheduler")
)

# Prefer CUDA when PyTorch reports it as available, otherwise run on CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
36 |
+
|
37 |
+
|
38 |
+
def load_vae(vae_dir):
    """Build the causal video autoencoder stored in ``vae_dir``.

    Reads ``config.json`` and ``vae_diffusion_pytorch_model.safetensors`` from
    ``vae_dir``, instantiates ``CausalVideoAutoencoder`` from the config and
    loads the checkpoint weights into it.

    Args:
        vae_dir: Directory containing the VAE config and checkpoint. Must
            support the ``/`` operator (a ``pathlib.Path`` in this file).

    Returns:
        The loaded autoencoder, moved to the module-level ``device`` and cast
        to ``torch.bfloat16``.
    """
    vae_ckpt_path = vae_dir / "vae_diffusion_pytorch_model.safetensors"
    vae_config_path = vae_dir / "config.json"
    with open(vae_config_path, "r", encoding="utf-8") as f:
        vae_config = json.load(f)
    vae = CausalVideoAutoencoder.from_config(vae_config)
    vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
    vae.load_state_dict(vae_state_dict)
    # Use the shared ``device`` (like load_unet and the pipeline) instead of an
    # unconditional ``.cuda()``, which raises on hosts without CUDA.
    return vae.to(device).to(torch.bfloat16)
|
47 |
+
|
48 |
+
|
49 |
+
def load_unet(unet_dir):
    """Instantiate the 3D transformer from ``unet_dir`` and load its weights.

    The model is created from ``config.json``; the checkpoint
    ``unet_diffusion_pytorch_model.safetensors`` is loaded with
    ``strict=True``, and the result is moved to the module-level ``device``.
    """
    config = Transformer3DModel.load_config(unet_dir / "config.json")
    model = Transformer3DModel.from_config(config)
    weights = safetensors.torch.load_file(unet_dir / "unet_diffusion_pytorch_model.safetensors")
    model.load_state_dict(weights, strict=True)
    return model.to(device)
|
57 |
+
|
58 |
+
|
59 |
+
def load_scheduler(scheduler_dir):
    """Create a ``RectifiedFlowScheduler`` from ``scheduler_config.json`` in ``scheduler_dir``."""
    config = RectifiedFlowScheduler.load_config(scheduler_dir / "scheduler_config.json")
    return RectifiedFlowScheduler.from_config(config)
|
63 |
+
|
64 |
+
|
65 |
+
# Helper function for image processing
|
66 |
+
def center_crop_and_resize(frame, target_height, target_width):
    """Center-crop ``frame`` to the target aspect ratio, then resize it.

    Args:
        frame: Array whose ``shape`` unpacks to (height, width, channels).
        target_height: Output height in pixels.
        target_width: Output width in pixels.

    Returns:
        The result of ``cv2.resize`` on the cropped region, requested at
        (``target_width``, ``target_height``).
    """
    src_h, src_w, _channels = frame.shape
    wanted_ratio = target_width / target_height

    if src_w / src_h > wanted_ratio:
        # Source is wider than the target: keep all rows, trim columns evenly.
        crop_w = int(src_h * wanted_ratio)
        left = (src_w - crop_w) // 2
        rows, cols = slice(None), slice(left, left + crop_w)
    else:
        # Source is taller than (or equal to) the target: keep all columns, trim rows.
        crop_h = int(src_w / wanted_ratio)
        top = (src_h - crop_h) // 2
        rows, cols = slice(top, top + crop_h), slice(None)

    return cv2.resize(frame[rows, cols], (target_width, target_height))
|
80 |
+
|
81 |
+
|
82 |
+
def load_image_to_tensor_with_resize(image_path, target_height=512, target_width=768):
    """Load an image file as a normalized tensor with a singleton frame axis.

    The image is converted to RGB, center-cropped and resized with
    ``center_crop_and_resize``, rearranged from (H, W, C) to (C, H, W), and
    rescaled with ``x / 127.5 - 1.0`` (so 0..255 maps to -1..1).

    Args:
        image_path: Path of the image file to open.
        target_height: Height passed to ``center_crop_and_resize``.
        target_width: Width passed to ``center_crop_and_resize``.

    Returns:
        A float tensor with two extra singleton axes inserted at positions 0
        and 2, i.e. laid out as (1, C, 1, H, W).
    """
    # The context manager closes the underlying file handle deterministically;
    # a bare Image.open() can leave it open until garbage collection.
    with Image.open(image_path) as source:
        image_np = np.array(source.convert("RGB"))
    frame_resized = center_crop_and_resize(image_np, target_height, target_width)
    frame_tensor = torch.tensor(frame_resized).permute(2, 0, 1).float()
    frame_tensor = (frame_tensor / 127.5) - 1.0
    return frame_tensor.unsqueeze(0).unsqueeze(2)
|
89 |
+
|
90 |
+
|
91 |
+
# Preset options for resolution and frame configuration
|
92 |
+
# (height, width, num_frames) for every fixed preset, in dropdown order.
_PRESET_DIMENSIONS = (
    (704, 1216, 41),
    (704, 1088, 49),
    (640, 1056, 57),
    (608, 992, 65),
    (608, 896, 73),
    (544, 896, 81),
    (544, 832, 89),
    (512, 800, 97),
    (512, 768, 97),
    (480, 800, 105),
    (480, 736, 113),
    (480, 704, 121),
    (448, 704, 129),
    (448, 672, 137),
    (416, 640, 153),
    (384, 672, 161),
    (384, 640, 169),
    (384, 608, 177),
    (384, 576, 185),
    (352, 608, 193),
    (352, 576, 201),
    (352, 544, 209),
    (352, 512, 225),
    (352, 512, 233),
    (320, 544, 241),
    (320, 512, 249),
    (320, 512, 257),
)

# Each entry carries the dropdown label plus the values applied when selected.
preset_options = [
    {"label": f"{h}x{w}, {n} frames", "height": h, "width": w, "num_frames": n}
    for h, w, n in _PRESET_DIMENSIONS
]
# "Custom" has no fixed values; the user sets them through the sliders.
preset_options.append({"label": "Custom", "height": None, "width": None, "num_frames": None})
|
122 |
+
|
123 |
+
|
124 |
+
# Function to toggle visibility of sliders based on preset selection
|
125 |
+
def preset_changed(preset):
    """Translate a preset label into slider values and visibility updates.

    Args:
        preset: Label chosen in the preset dropdown.

    Returns:
        A 6-tuple: height, width and num_frames of the matching preset
        (all ``None`` for "Custom"), followed by three ``gr.update`` objects
        whose ``visible`` flag is True only for "Custom".

    Raises:
        StopIteration: If ``preset`` is neither "Custom" nor a known label.
    """
    is_custom = preset == "Custom"
    visibility = tuple(gr.update(visible=is_custom) for _ in range(3))

    if is_custom:
        return (None, None, None, *visibility)

    chosen = next(item for item in preset_options if item["label"] == preset)
    return (chosen["height"], chosen["width"], chosen["num_frames"], *visibility)
|
138 |
+
|
139 |
+
|
140 |
+
# Load models
|
141 |
+
# Repository that provides both the T5 text encoder and its tokenizer
_TEXT_MODEL_REPO = "PixArt-alpha/PixArt-XL-2-1024-MS"

vae = load_vae(vae_dir)
unet = load_unet(unet_dir)
scheduler = load_scheduler(scheduler_dir)
patchifier = SymmetricPatchifier(patch_size=1)

text_encoder = T5EncoderModel.from_pretrained(_TEXT_MODEL_REPO, subfolder="text_encoder")
text_encoder = text_encoder.to(device)
tokenizer = T5Tokenizer.from_pretrained(_TEXT_MODEL_REPO, subfolder="tokenizer")

# Assemble the pipeline from the individual components and place it on ``device``
_pipeline_components = {
    "transformer": unet,
    "patchifier": patchifier,
    "text_encoder": text_encoder,
    "tokenizer": tokenizer,
    "scheduler": scheduler,
    "vae": vae,
}
pipeline = XoraVideoPipeline(**_pipeline_components).to(device)
|
156 |
+
|
157 |
+
|
158 |
+
# Modified function to include validation with gr.Error
|
159 |
+
@spaces.GPU(duration=120)
def generate_video(image_path=None, prompt="", negative_prompt="",
                   seed=171198, num_inference_steps=40, num_images_per_prompt=1,
                   guidance_scale=3, height=512, width=768, num_frames=121, frame_rate=25, progress=gr.Progress()):
    """Generate an MP4 video conditioned on a first-frame image and a prompt.

    Args:
        image_path: Path of the conditioning image (required).
        prompt: Text prompt; must be at least 50 characters after stripping.
        negative_prompt: Negative text prompt forwarded to the pipeline.
        seed: Seed for the CPU ``torch.Generator`` passed to the pipeline.
        num_inference_steps: Step count forwarded to the pipeline; also used
            as the denominator of the progress fraction.
        num_images_per_prompt: Forwarded to the pipeline. Only the first
            generated sample is written to the output file.
        guidance_scale: Forwarded to the pipeline.
        height: Target height for the conditioning image and the pipeline.
        width: Target width for the conditioning image and the pipeline.
        num_frames: Forwarded to the pipeline.
        frame_rate: Forwarded to the pipeline and used as the FPS of the file.
        progress: Gradio progress tracker updated after every step.

    Returns:
        Path of the written ``.mp4`` file.

    Raises:
        gr.Error: If the prompt is shorter than 50 characters.
        ValueError: If ``image_path`` is empty.
    """
    # Check prompt length and raise an error if it's too short
    if len(prompt.strip()) < 50:
        raise gr.Error("Prompt must be at least 50 characters long. Please provide more details for the best results.", duration=5)

    if not image_path:
        raise ValueError("Image path must be provided.")
    media_items = load_image_to_tensor_with_resize(image_path, height, width).to(device)

    sample = {
        "prompt": prompt,
        'prompt_attention_mask': None,
        'negative_prompt': negative_prompt,
        'negative_prompt_attention_mask': None,
        'media_items': media_items,
    }

    # manual_seed() needs an integer; UI sliders may deliver a float.
    generator = torch.Generator(device="cpu").manual_seed(int(seed))

    def gradio_progress_callback(self, step, timestep, kwargs):
        progress((step + 1) / num_inference_steps)

    images = pipeline(
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=num_images_per_prompt,
        guidance_scale=guidance_scale,
        generator=generator,
        output_type="pt",
        height=height,
        width=width,
        num_frames=num_frames,
        frame_rate=frame_rate,
        **sample,
        is_video=True,
        vae_per_channel_normalize=True,
        conditioning_method=ConditioningMethod.FIRST_FRAME,
        mixed_precision=True,
        callback_on_step_end=gradio_progress_callback
    ).images

    # With a batch axis present, take the first sample. For a batch of one this
    # equals the former squeeze(0); for larger batches squeeze(0) was a no-op
    # and the 4-axis permute below raised.
    first_video = images[0] if images.dim() == 5 else images
    video_np = first_video.permute(1, 2, 3, 0).cpu().float().numpy()
    # Clip before the cast so out-of-range values saturate instead of wrapping.
    video_np = np.clip(video_np * 255, 0, 255).astype(np.uint8)
    frame_height, frame_width = video_np.shape[1:3]
    # Reverse the channel axis for OpenCV and hand it contiguous frames.
    video_bgr = np.ascontiguousarray(video_np[..., ::-1])

    # tempfile.mktemp() is deprecated and race-prone; reserve the file atomically.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
        output_path = tmp_file.name

    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), frame_rate, (frame_width, frame_height))
    try:
        for frame in video_bgr:
            out.write(frame)
    finally:
        # Always finalize the container, even if a write fails.
        out.release()

    return output_path
|
213 |
+
|
214 |
+
|
215 |
+
# Define the Gradio interface with presets
|
216 |
+
# NOTE(review): block nesting below is reconstructed from the statement order;
# confirm the layout against the original file.
with gr.Blocks() as iface:
    gr.Markdown("# Video Generation with Xora")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="filepath", label="Image Input")
            prompt = gr.Textbox(label="Prompt", value="A man riding a motorcycle down a winding road, surrounded by lush, green scenery and distant mountains. The sky is clear with a few wispy clouds, and the sunlight glistens on the motorcycle as it speeds along. The rider is dressed in a black leather jacket and helmet, leaning slightly forward as the wind rustles through nearby trees. The wheels kick up dust, creating a slight trail behind the motorcycle, adding a sense of speed and excitement to the scene.")
            negative_prompt = gr.Textbox(label="Negative Prompt", value="worst quality, inconsistent motion...")

            # Preset dropdown for resolution and frame settings
            preset_dropdown = gr.Dropdown(
                choices=[p["label"] for p in preset_options],
                value="704x1216, 41 frames",
                label="Resolution Preset"
            )

            # Advanced options section
            with gr.Accordion("Advanced Options", open=False):
                seed = gr.Slider(label="Seed", minimum=0, maximum=1000000, step=1, value=171198)
                inference_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=100, step=1, value=40)
                images_per_prompt = gr.Slider(label="Images per Prompt", minimum=1, maximum=10, step=1, value=1)
                guidance_scale = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=20.0, step=0.1, value=3.0)

                # Sliders shown only for the "Custom" preset. Ranges and steps
                # cover every preset value: sizes are multiples of 32 up to
                # 1216 wide, and frame counts go up to 257.
                height_slider = gr.Slider(label="Height", minimum=256, maximum=1024, step=32, value=704, visible=False)
                width_slider = gr.Slider(label="Width", minimum=256, maximum=1216, step=32, value=1216, visible=False)
                num_frames_slider = gr.Slider(label="Number of Frames", minimum=1, maximum=257, step=1, value=41,
                                              visible=False)

                frame_rate = gr.Slider(label="Frame Rate", minimum=1, maximum=60, step=1, value=25, visible=False)

            generate_button = gr.Button("Generate Video")

        with gr.Column():
            output_video = gr.Video(label="Generated Video")

    # preset_changed returns three values followed by three visibility updates;
    # both groups target the same three sliders (height, width, num_frames).
    preset_dropdown.change(
        fn=preset_changed,
        inputs=[preset_dropdown],
        outputs=[height_slider, width_slider, num_frames_slider, height_slider, width_slider, num_frames_slider]
    )

    generate_button.click(
        fn=generate_video,
        inputs=[image_input, prompt, negative_prompt, seed, inference_steps, images_per_prompt, guidance_scale,
                height_slider, width_slider, num_frames_slider, frame_rate],
        outputs=output_video
    )

iface.launch(share=True)
|