File size: 5,592 Bytes
9ac31b8
07c1f7a
9ac31b8
3b06696
f226a29
3b06696
 
 
 
9ac31b8
 
3b06696
 
a9235bb
25d3956
9de0ad3
e3d310b
07c1f7a
d56d267
36be800
 
9de0ad3
 
 
9ac31b8
48f2aa4
 
 
d56d267
9ac31b8
 
8010ebe
9ac31b8
 
d56d267
c907ab3
45f3f73
0cd72ee
53a5202
3b06696
492fffc
 
3b06696
9ac31b8
d56d267
 
3b06696
36be800
 
 
 
311fe07
 
 
9664ead
36be800
9ac31b8
 
 
 
d56d267
9ac31b8
 
 
0cd72ee
efa319b
9ac31b8
 
f58ac54
 
 
 
 
 
 
9ac31b8
23224a3
 
 
 
f58ac54
3b06696
0cd72ee
d56d267
 
 
3b06696
d56d267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3b06696
d56d267
 
 
ff46702
d56d267
45f3f73
 
 
32097c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45f3f73
 
 
 
32097c5
45f3f73
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import gradio as gr
#import gradio.helpers
import torch
import os
import base64
from glob import glob
from pathlib import Path
from typing import Optional

from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video
from PIL import Image

import uuid
import random
from huggingface_hub import login, hf_hub_download

#gradio.helpers.CACHED_FOLDER = '/data/cache'

# Shared secret every caller must send; generate_video compares against it.
SECRET_TOKEN = os.environ.get('SECRET_TOKEN', 'default_secret')

# Hugging Face Hub credentials are read from the environment and used to log in
# before the model weights are requested.
HF_API_KEY = os.environ.get('HF_API_KEY', '')
login(token=HF_API_KEY)

# Load the fp16 variant of the image-to-video pipeline once, at import time.
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt-1-1",
    variant="fp16",
    torch_dtype=torch.half,
)
pipe.to("cuda")
# Only the UNet is compiled; compiling the VAE is left disabled below.
pipe.unet = torch.compile(pipe.unet, fullgraph=True, mode="reduce-overhead")
#pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=True)

# Largest signed 64-bit integer; used as the upper bound of the seed slider.
max_64_bit_int = (1 << 63) - 1

def generate_video(
    secret_token: str,
    image: Image.Image,
    seed: int,
    motion_bucket_id: int = 127,
    fps_id: int = 6,
    version: str = "svd_xt",
    cond_aug: float = 0.02,
    decoding_t: int = 3,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
    device: str = "cuda",
    output_folder: str = "outputs",
):
    """Generate an MP4 clip from a still image and return it as a base64 data URI.

    Args:
        secret_token: Must equal the module-level ``SECRET_TOKEN``.
        image: Conditioning picture. It is resized/cropped by ``resize_image``
            and converted to RGB when it is in any other mode.
        seed: Seed handed to ``torch.manual_seed``.
        motion_bucket_id: Forwarded to the pipeline as ``motion_bucket_id``.
        fps_id: Frame rate used when the frames are written to the MP4 file.
        version: Unused; kept so existing callers keep working.
        cond_aug: Unused; the pipeline is called with a fixed
            ``noise_aug_strength`` of 0.1.
        decoding_t: Forwarded to the pipeline as ``decode_chunk_size``.
        device: Unused; the pipeline is moved to its device at module load.
        output_folder: Directory that holds the temporary MP4 file.

    Returns:
        A ``data:video/mp4;base64,...`` string containing the encoded video.

    Raises:
        gr.Error: If the token does not match or no image was supplied.
    """
    if secret_token != SECRET_TOKEN:
        raise gr.Error(
            'Invalid secret token. Please fork the original space if you want to use it for yourself.')

    # Without this guard a missing upload fails later with an opaque AttributeError.
    if image is None:
        raise gr.Error('No input image provided.')

    # Inputs are usually 1024x576 already, but every image is still normalized
    # to that size here.
    # TODO: experiment with vertical videos and 1024x512 videos.
    image = resize_image(image)

    # Convert every non-RGB mode (RGBA, P, L, ...), not just RGBA.
    if image.mode != "RGB":
        image = image.convert("RGB")

    generator = torch.manual_seed(seed)

    os.makedirs(output_folder, exist_ok=True)
    # A random name per request: counting existing files yields the same name
    # for every call because the file is deleted at the end of each run.
    video_path = os.path.join(output_folder, f"{uuid.uuid4().hex}.mp4")

    try:
        frames = pipe(
            image,
            decode_chunk_size=decoding_t,
            generator=generator,
            motion_bucket_id=motion_bucket_id,
            noise_aug_strength=0.1,
            num_frames=25,
        ).frames[0]
        export_to_video(frames, video_path, fps=fps_id)
        # NOTE(review): re-seeding after generation looks redundant (each call
        # seeds at the start); kept to leave the global RNG state unchanged.
        torch.manual_seed(seed)

        # Read the content of the video file and encode it to base64
        with open(video_path, "rb") as video_file:
            video_base64 = base64.b64encode(video_file.read()).decode('utf-8')
    finally:
        # Clean up even when a step fails, so a stale video can never be
        # served to a later request ("ghosting").
        try:
            os.remove(video_path)
        except FileNotFoundError:
            # Nothing was written (failure happened before export).
            pass

    # Prepend the appropriate data URI header with MIME type
    return 'data:video/mp4;base64,' + video_base64

def resize_image(image, output_size=(1024, 576)):
    """Scale *image* so it covers ``output_size``, then center-crop to that size.

    Args:
        image: Object exposing ``width``, ``height``, ``resize`` and ``crop``
            (a PIL image in this file).
        output_size: ``(width, height)`` of the result in pixels.

    Returns:
        The resized and cropped image, ``output_size`` pixels large.
    """
    target_width, target_height = output_size[0], output_size[1]
    target_aspect = target_width / target_height  # Aspect ratio of the desired size
    image_aspect = image.width / image.height  # Aspect ratio of the original image

    if image_aspect > target_aspect:
        # Source is wider than the target: match the height, crop the sides.
        new_height = target_height
        # max() guards against truncation leaving the image narrower than the target.
        new_width = max(target_width, int(new_height * image_aspect))
    else:
        # Source is as tall or taller than the target: match the width,
        # crop top and bottom.
        new_width = target_width
        new_height = max(target_height, int(new_width / image_aspect))

    resized_image = image.resize((new_width, new_height), Image.LANCZOS)

    # Integer crop box anchored at the rounded center offset, so the result is
    # always exactly target_width x target_height.
    # NOTE(review): round() is chosen to match how Pillow is understood to
    # round fractional crop boxes, keeping results unchanged for the default
    # size — confirm against the Pillow version in use.
    left = round((new_width - target_width) / 2)
    top = round((new_height - target_height) / 2)
    return resized_image.crop(
        (left, top, left + target_width, top + target_height)
    )

# UI definition. The full-page HTML overlay hides the widgets from browser
# visitors; the components exist so the click handler is exposed as the
# "run" API endpoint.
with gr.Blocks() as demo:
    secret_token = gr.Text(
        placeholder='Enter your secret token',
        max_lines=1,
        label='Secret Token',
    )
    gr.HTML("""
        <div style="z-index: 100; position: fixed; top: 0px; right: 0px; left: 0px; bottom: 0px; width: 100%; height: 100%; background: white; display: flex; align-items: center; justify-content: center; color: black;">
        <div style="text-align: center; color: black;">
        <p style="color: black;">This space is a REST API to programmatically generate MP4 videos.</p>
        <p style="color: black;">Interested in using it? Look no further than the <a href="https://huggingface.co/spaces/multimodalart/stable-video-diffusion" target="_blank">original space</a>!</p>
        </div>
        </div>""")
    image = gr.Image(type="pil", label="Upload your image")
    generate_btn = gr.Button("Generate")
    base64_out = gr.Textbox(label="Base64 Video")
    seed = gr.Slider(
        label="Seed",
        value=42,
        randomize=False,
        minimum=0,
        maximum=max_64_bit_int,
        step=1,
    )
    motion_bucket_id = gr.Slider(
        label="Motion bucket id",
        info="Controls how much motion to add/remove from the image",
        value=127,
        minimum=1,
        maximum=255,
    )
    fps_id = gr.Slider(
        label="Frames per second",
        info="The length of your video in seconds will be 25/fps",
        value=6,
        minimum=5,
        maximum=30,
    )

    # Only the first five parameters of generate_video are wired to inputs;
    # the rest keep their defaults.
    generate_btn.click(
        api_name="run",
        fn=generate_video,
        inputs=[secret_token, image, seed, motion_bucket_id, fps_id],
        outputs=base64_out,
    )

# Queue holds at most 20 pending requests; launch starts the server.
demo.queue(max_size=20)
demo.launch()