StableDiffusionVideoTo3D

Runtime error

App Files Files Community

heheyas committed on Mar 13

Commit

b1531dc

•

1 Parent(s): 3667a5a

update app.py

Browse files

Files changed (2) hide show

app.py +107 -139
app_bkp.py +294 -0

app.py CHANGED Viewed

@@ -25,6 +25,7 @@ from glob import glob
 from mediapy import write_video
 from pathlib import Path
 import spaces
 @spaces.GPU
@@ -142,153 +143,120 @@ def do_sample(
     return video_path
 def change_model_params(model, min_cfg, max_cfg):
     model.sampler.guider.max_scale = max_cfg
     model.sampler.guider.min_scale = min_cfg
-@spaces.GPU
-def launch(device="cuda", share=False):
-    model_config = "./scripts/pub/configs/V3D_512.yaml"
-    num_frames = OmegaConf.load(
-        model_config
-    ).model.params.sampler_config.params.guider_config.params.num_frames
-    print("Detected num_frames:", num_frames)
-    # num_steps = default(num_steps, 25)
-    num_steps = 25
-    output_folder = "outputs/V3D_512"
-    sd = load_safetensors("./ckpts/svd_xt.safetensors")
-    clip_model_config = OmegaConf.load("./configs/embedder/clip_image.yaml")
-    clip_model = instantiate_from_config(clip_model_config).eval()
-    clip_sd = dict()
-    for k, v in sd.items():
-        if "conditioner.embedders.0" in k:
-            clip_sd[k.replace("conditioner.embedders.0.", "")] = v
-    clip_model.load_state_dict(clip_sd)
-    clip_model = clip_model.to(device)
-    ae_model_config = OmegaConf.load("./configs/ae/video.yaml")
-    ae_model = instantiate_from_config(ae_model_config).eval()
-    encoder_sd = dict()
-    for k, v in sd.items():
-        if "first_stage_model" in k:
-            encoder_sd[k.replace("first_stage_model.", "")] = v
-    ae_model.load_state_dict(encoder_sd)
-    ae_model = ae_model.to(device)
-    rembg_session = rembg.new_session()
-    model, _ = load_model(
-        model_config, device, num_frames, num_steps, min_cfg=3.5, max_cfg=3.5
     )
-    with gr.Blocks(title="V3D", theme=gr.themes.Monochrome()) as demo:
-        with gr.Row(equal_height=True):
-            with gr.Column():
-                input_image = gr.Image(value=None, label="Input Image")
-                border_ratio_slider = gr.Slider(
-                    value=0.3,
-                    label="Border Ratio",
-                    minimum=0.05,
-                    maximum=0.5,
-                    step=0.05,
-                )
-                decoding_t_slider = gr.Slider(
-                    value=1,
-                    label="Number of Decoding frames",
-                    minimum=1,
-                    maximum=num_frames,
-                    step=1,
-                )
-                min_guidance_slider = gr.Slider(
-                    value=3.5,
-                    label="Min CFG Value",
-                    minimum=0.05,
-                    maximum=0.5,
-                    step=0.05,
-                )
-                max_guidance_slider = gr.Slider(
-                    value=3.5,
-                    label="Max CFG Value",
-                    minimum=0.05,
-                    maximum=0.5,
-                    step=0.05,
-                )
-                run_button = gr.Button(value="Run V3D")
-            with gr.Column():
-                output_video = gr.Video(value=None, label="Output Orbit Video")
-        @run_button.click(
-            inputs=[
-                input_image,
-                border_ratio_slider,
-                min_guidance_slider,
-                max_guidance_slider,
-                decoding_t_slider,
-            ],
-            outputs=[output_video],
         )
-        def _(image, border_ratio, min_guidance, max_guidance, decoding_t):
-            change_model_params(model, min_guidance, max_guidance)
-            return do_sample(
-                image,
-                model,
-                clip_model,
-                ae_model,
-                device,
-                num_frames,
-                num_steps,
-                int(decoding_t),
-                border_ratio,
-                False,
-                rembg_session,
-                output_folder,
-            )
-    # do_sample(
-    #     np.asarray(Image.open("assets/baby_yoda.png")),
-    #     model,
-    #     clip_model,
-    #     ae_model,
-    #     device,
-    #     num_frames,
-    #     num_steps,
-    #     1,
-    #     0.3,
-    #     False,
-    #     rembg_session,
-    #     output_folder,
-    # )
-    demo.launch(inbrowser=True, inline=False, share=share, show_error=True)
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--device", type=str, default="cuda")
-    parser.add_argument("--share", action="store_true")
-    opt = parser.parse_args()
-    test = OmegaConf.load("./scripts/pub/configs/V3D_512.yaml")
-    print(test)
-    def download_if_need(path, url):
-        if Path(path).exists():
-            return
-        import wget
-        path = Path(path)
-        path.parent.mkdir(parents=True, exist_ok=True)
-        wget.download(url, out=str(path))
-    # download_if_need(
-    #     "ckpts/svd_xt.safetensors",
-    #     "https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/resolve/main/svd_xt.safetensors",
-    # )
-    # download_if_need(
-    #     "ckpts/V3D_512.ckpt", "https://huggingface.co/heheyas/V3D/resolve/main/V3D.ckpt"
-    # )
-    launch(opt.device, opt.share)

 from mediapy import write_video
 from pathlib import Path
 import spaces
+from huggingface_hub import hf_hub_download
 @spaces.GPU
     return video_path
+@spaces.GPU
 def change_model_params(model, min_cfg, max_cfg):
     model.sampler.guider.max_scale = max_cfg
     model.sampler.guider.min_scale = min_cfg
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# download
+V3D_ckpt_path = hf_hub_download(repo_id="heheyas/V3D", filename="V3D.ckpt")
+svd_xt_ckpt_path = hf_hub_download(
+    repo_id="stabilityai/stable-video-diffusion-img2vid-xt",
+    filename="svd_xt.safetensors",
+)
+model_config = "./scripts/pub/configs/V3D_512.yaml"
+num_frames = OmegaConf.load(
+    model_config
+).model.params.sampler_config.params.guider_config.params.num_frames
+print("Detected num_frames:", num_frames)
+# num_steps = default(num_steps, 25)
+num_steps = 25
+output_folder = "outputs/V3D_512"
+sd = load_safetensors(svd_xt_ckpt_path)
+clip_model_config = OmegaConf.load("./configs/embedder/clip_image.yaml")
+clip_model = instantiate_from_config(clip_model_config).eval()
+clip_sd = dict()
+for k, v in sd.items():
+    if "conditioner.embedders.0" in k:
+        clip_sd[k.replace("conditioner.embedders.0.", "")] = v
+clip_model.load_state_dict(clip_sd)
+clip_model = clip_model.to(device)
+ae_model_config = OmegaConf.load("./configs/ae/video.yaml")
+ae_model = instantiate_from_config(ae_model_config).eval()
+encoder_sd = dict()
+for k, v in sd.items():
+    if "first_stage_model" in k:
+        encoder_sd[k.replace("first_stage_model.", "")] = v
+ae_model.load_state_dict(encoder_sd)
+ae_model = ae_model.to(device)
+rembg_session = rembg.new_session()
+model_config.model.params.ckpt_path = V3D_ckpt_path
+model, _ = load_model(
+    model_config, device, num_frames, num_steps, min_cfg=3.5, max_cfg=3.5
+)
+model = model.to(device)
+with gr.Blocks(title="V3D", theme=gr.themes.Monochrome()) as demo:
+    with gr.Row(equal_height=True):
+        with gr.Column():
+            input_image = gr.Image(value=None, label="Input Image")
+            border_ratio_slider = gr.Slider(
+                value=0.3,
+                label="Border Ratio",
+                minimum=0.05,
+                maximum=0.5,
+                step=0.05,
+            )
+            decoding_t_slider = gr.Slider(
+                value=1,
+                label="Number of Decoding frames",
+                minimum=1,
+                maximum=num_frames,
+                step=1,
+            )
+            min_guidance_slider = gr.Slider(
+                value=3.5,
+                label="Min CFG Value",
+                minimum=0.05,
+                maximum=0.5,
+                step=0.05,
+            )
+            max_guidance_slider = gr.Slider(
+                value=3.5,
+                label="Max CFG Value",
+                minimum=0.05,
+                maximum=0.5,
+                step=0.05,
+            )
+            run_button = gr.Button(value="Run V3D")
+        with gr.Column():
+            output_video = gr.Video(value=None, label="Output Orbit Video")
+    @run_button.click(
+        inputs=[
+            input_image,
+            border_ratio_slider,
+            min_guidance_slider,
+            max_guidance_slider,
+            decoding_t_slider,
+        ],
+        outputs=[output_video],
     )
+    def _(image, border_ratio, min_guidance, max_guidance, decoding_t):
+        change_model_params(model, min_guidance, max_guidance)
+        return do_sample(
+            image,
+            model,
+            clip_model,
+            ae_model,
+            device,
+            num_frames,
+            num_steps,
+            int(decoding_t),
+            border_ratio,
+            False,
+            rembg_session,
+            output_folder,
         )
+demo.launch()

app_bkp.py ADDED Viewed

	@@ -0,0 +1,294 @@

+# TODO
+import numpy as np
+import argparse
+import torch
+from torchvision.utils import make_grid
+import tempfile
+import gradio as gr
+from omegaconf import OmegaConf
+from einops import rearrange
+from scripts.pub.V3D_512 import (
+    sample_one,
+    get_batch,
+    get_unique_embedder_keys_from_conditioner,
+    load_model,
+)
+from sgm.util import default, instantiate_from_config
+from safetensors.torch import load_file as load_safetensors
+from PIL import Image
+from kiui.op import recenter
+from torchvision.transforms import ToTensor
+from einops import rearrange, repeat
+import rembg
+import os
+from glob import glob
+from mediapy import write_video
+from pathlib import Path
+import spaces
+@spaces.GPU
+def do_sample(
+    image,
+    model,
+    clip_model,
+    ae_model,
+    device,
+    num_frames,
+    num_steps,
+    decoding_t,
+    border_ratio,
+    ignore_alpha,
+    rembg_session,
+    output_folder,
+):
+    # if image.mode == "RGBA":
+    #     image = image.convert("RGB")
+    image = Image.fromarray(image)
+    w, h = image.size
+    if border_ratio > 0:
+        if image.mode != "RGBA" or ignore_alpha:
+            image = image.convert("RGB")
+            image = np.asarray(image)
+            carved_image = rembg.remove(image, session=rembg_session)  # [H, W, 4]
+        else:
+            image = np.asarray(image)
+            carved_image = image
+        mask = carved_image[..., -1] > 0
+        image = recenter(carved_image, mask, border_ratio=border_ratio)
+        image = image.astype(np.float32) / 255.0
+        if image.shape[-1] == 4:
+            image = image[..., :3] * image[..., 3:4] + (1 - image[..., 3:4])
+        image = Image.fromarray((image * 255).astype(np.uint8))
+    else:
+        print("Ignore border ratio")
+    image = image.resize((512, 512))
+    image = ToTensor()(image)
+    image = image * 2.0 - 1.0
+    image = image.unsqueeze(0).to(device)
+    H, W = image.shape[2:]
+    assert image.shape[1] == 3
+    F = 8
+    C = 4
+    shape = (num_frames, C, H // F, W // F)
+    value_dict = {}
+    value_dict["motion_bucket_id"] = 0
+    value_dict["fps_id"] = 0
+    value_dict["cond_aug"] = 0.05
+    value_dict["cond_frames_without_noise"] = clip_model(image)
+    value_dict["cond_frames"] = ae_model.encode(image)
+    value_dict["cond_frames"] += 0.05 * torch.randn_like(value_dict["cond_frames"])
+    value_dict["cond_aug"] = 0.05
+    with torch.no_grad():
+        with torch.autocast(device):
+            batch, batch_uc = get_batch(
+                get_unique_embedder_keys_from_conditioner(model.conditioner),
+                value_dict,
+                [1, num_frames],
+                T=num_frames,
+                device=device,
+            )
+            c, uc = model.conditioner.get_unconditional_conditioning(
+                batch,
+                batch_uc=batch_uc,
+                force_uc_zero_embeddings=[
+                    "cond_frames",
+                    "cond_frames_without_noise",
+                ],
+            )
+            for k in ["crossattn", "concat"]:
+                uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
+                uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
+                c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
+                c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
+            randn = torch.randn(shape, device=device)
+            randn = randn.to(device)
+            additional_model_inputs = {}
+            additional_model_inputs["image_only_indicator"] = torch.zeros(
+                2, num_frames
+            ).to(device)
+            additional_model_inputs["num_video_frames"] = batch["num_video_frames"]
+            def denoiser(input, sigma, c):
+                return model.denoiser(
+                    model.model, input, sigma, c, **additional_model_inputs
+                )
+            samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
+            model.en_and_decode_n_samples_a_time = decoding_t
+            samples_x = model.decode_first_stage(samples_z)
+            samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
+            os.makedirs(output_folder, exist_ok=True)
+            base_count = len(glob(os.path.join(output_folder, "*.mp4")))
+            video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
+            frames = (
+                (rearrange(samples, "t c h w -> t h w c") * 255)
+                .cpu()
+                .numpy()
+                .astype(np.uint8)
+            )
+            write_video(video_path, frames, fps=6)
+    return video_path
+def change_model_params(model, min_cfg, max_cfg):
+    model.sampler.guider.max_scale = max_cfg
+    model.sampler.guider.min_scale = min_cfg
+@spaces.GPU
+def launch(device="cuda", share=False):
+    model_config = "./scripts/pub/configs/V3D_512.yaml"
+    num_frames = OmegaConf.load(
+        model_config
+    ).model.params.sampler_config.params.guider_config.params.num_frames
+    print("Detected num_frames:", num_frames)
+    # num_steps = default(num_steps, 25)
+    num_steps = 25
+    output_folder = "outputs/V3D_512"
+    sd = load_safetensors("./ckpts/svd_xt.safetensors")
+    clip_model_config = OmegaConf.load("./configs/embedder/clip_image.yaml")
+    clip_model = instantiate_from_config(clip_model_config).eval()
+    clip_sd = dict()
+    for k, v in sd.items():
+        if "conditioner.embedders.0" in k:
+            clip_sd[k.replace("conditioner.embedders.0.", "")] = v
+    clip_model.load_state_dict(clip_sd)
+    clip_model = clip_model.to(device)
+    ae_model_config = OmegaConf.load("./configs/ae/video.yaml")
+    ae_model = instantiate_from_config(ae_model_config).eval()
+    encoder_sd = dict()
+    for k, v in sd.items():
+        if "first_stage_model" in k:
+            encoder_sd[k.replace("first_stage_model.", "")] = v
+    ae_model.load_state_dict(encoder_sd)
+    ae_model = ae_model.to(device)
+    rembg_session = rembg.new_session()
+    model, _ = load_model(
+        model_config, device, num_frames, num_steps, min_cfg=3.5, max_cfg=3.5
+    )
+    with gr.Blocks(title="V3D", theme=gr.themes.Monochrome()) as demo:
+        with gr.Row(equal_height=True):
+            with gr.Column():
+                input_image = gr.Image(value=None, label="Input Image")
+                border_ratio_slider = gr.Slider(
+                    value=0.3,
+                    label="Border Ratio",
+                    minimum=0.05,
+                    maximum=0.5,
+                    step=0.05,
+                )
+                decoding_t_slider = gr.Slider(
+                    value=1,
+                    label="Number of Decoding frames",
+                    minimum=1,
+                    maximum=num_frames,
+                    step=1,
+                )
+                min_guidance_slider = gr.Slider(
+                    value=3.5,
+                    label="Min CFG Value",
+                    minimum=0.05,
+                    maximum=0.5,
+                    step=0.05,
+                )
+                max_guidance_slider = gr.Slider(
+                    value=3.5,
+                    label="Max CFG Value",
+                    minimum=0.05,
+                    maximum=0.5,
+                    step=0.05,
+                )
+                run_button = gr.Button(value="Run V3D")
+            with gr.Column():
+                output_video = gr.Video(value=None, label="Output Orbit Video")
+        @run_button.click(
+            inputs=[
+                input_image,
+                border_ratio_slider,
+                min_guidance_slider,
+                max_guidance_slider,
+                decoding_t_slider,
+            ],
+            outputs=[output_video],
+        )
+        def _(image, border_ratio, min_guidance, max_guidance, decoding_t):
+            change_model_params(model, min_guidance, max_guidance)
+            return do_sample(
+                image,
+                model,
+                clip_model,
+                ae_model,
+                device,
+                num_frames,
+                num_steps,
+                int(decoding_t),
+                border_ratio,
+                False,
+                rembg_session,
+                output_folder,
+            )
+    # do_sample(
+    #     np.asarray(Image.open("assets/baby_yoda.png")),
+    #     model,
+    #     clip_model,
+    #     ae_model,
+    #     device,
+    #     num_frames,
+    #     num_steps,
+    #     1,
+    #     0.3,
+    #     False,
+    #     rembg_session,
+    #     output_folder,
+    # )
+    demo.launch(inbrowser=True, inline=False, share=share, show_error=True)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--device", type=str, default="cuda")
+    parser.add_argument("--share", action="store_true")
+    opt = parser.parse_args()
+    test = OmegaConf.load("./scripts/pub/configs/V3D_512.yaml")
+    print(test)
+    def download_if_need(path, url):
+        if Path(path).exists():
+            return
+        import wget
+        path = Path(path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        wget.download(url, out=str(path))
+    # download_if_need(
+    #     "ckpts/svd_xt.safetensors",
+    #     "https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/resolve/main/svd_xt.safetensors",
+    # )
+    # download_if_need(
+    #     "ckpts/V3D_512.ckpt", "https://huggingface.co/heheyas/V3D/resolve/main/V3D.ckpt"
+    # )
+    launch(opt.device, opt.share)