Lotus_Depth_video

Paused

App Files Files Community

haodongli committed on Oct 6

Commit

dc78df8

•

1 Parent(s): 74af050

v1

Browse files

Files changed (9) hide show

.gitattributes +1 -0
.gitignore +3 -0
README.md +1 -1
app.py +218 -217
files/videos/K_0005_IN.mp4 +3 -0
files/videos/obama.mp4 +0 -0
infer.py +134 -28
pipeline.py +0 -1
requirements.txt +3 -2

.gitattributes CHANGED Viewed

@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 files/images/01.jpg filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 files/images/01.jpg filter=lfs diff=lfs merge=lfs -text
+files/videos/K_0005_IN.mp4 filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+__pycache__/
+output/
+gradio_cached_examples/

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 🚀
 colorFrom: blue
 colorTo: indigo
 sdk: gradio
-sdk_version: 4.21.0
 app_file: app.py
 pinned: false
 license: mit

 colorFrom: blue
 colorTo: indigo
 sdk: gradio
+sdk_version: 4.44.0
 app_file: app.py
 pinned: false
 license: mit

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
-from __future__ import annotations
 from gradio_imageslider import ImageSlider
 import functools
 import os
@@ -14,224 +13,226 @@ from tqdm import tqdm
 from pathlib import Path
 import gradio
 from gradio.utils import get_cache_folder
-from infer import lotus
-# def process_image_check(path_input):
-#     if path_input is None:
-#         raise gr.Error(
-#             "Missing image in the first pane: upload a file or use one from the gallery below."
-#         )
-# def infer(path_input, seed=0):
-#     print(f"==> Processing image {path_input}")
-#     return path_input
-#     return [path_input, path_input]
-#     # name_base, name_ext = os.path.splitext(os.path.basename(path_input))
-#     # print(f"==> Processing image {name_base}{name_ext}")
-#     # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-#     # print(f"==> Device: {device}")
-#     # output_g, output_d = lotus(path_input, 'depth', seed, device)
-#     # if not os.path.exists("files/output"):
-#     #     os.makedirs("files/output")
-#     # g_save_path = os.path.join("files/output", f"{name_base}_g{name_ext}")
-#     # d_save_path = os.path.join("files/output", f"{name_base}_d{name_ext}")
-#     # output_g.save(g_save_path)
-#     # output_d.save(d_save_path)
-#     # yield [path_input, g_save_path], [path_input, d_save_path]
-# def run_demo_server():
-#     gradio_theme = gr.themes.Default()
-#     with gr.Blocks(
-#         theme=gradio_theme,
-#         title="LOTUS (Depth)",
-#         css="""
-#             #download {
-#                 height: 118px;
-#             }
-#             .slider .inner {
-#                 width: 5px;
-#                 background: #FFF;
-#             }
-#             .viewport {
-#                 aspect-ratio: 4/3;
-#             }
-#             .tabs button.selected {
-#                 font-size: 20px !important;
-#                 color: crimson !important;
-#             }
-#             h1 {
-#                 text-align: center;
-#                 display: block;
-#             }
-#             h2 {
-#                 text-align: center;
-#                 display: block;
-#             }
-#             h3 {
-#                 text-align: center;
-#                 display: block;
-#             }
-#             .md_feedback li {
-#                 margin-bottom: 0px !important;
-#             }
-#         """,
-#         head="""
-#             <script async src="https://www.googletagmanager.com/gtag/js?id=G-1FWSVCGZTG"></script>
-#             <script>
-#                 window.dataLayer = window.dataLayer || [];
-#                 function gtag() {dataLayer.push(arguments);}
-#                 gtag('js', new Date());
-#                 gtag('config', 'G-1FWSVCGZTG');
-#             </script>
-#         """,
-#     ) as demo:
-#         gr.Markdown(
-#             """
-#             # LOTUS: Diffusion-based Visual Foundation Model for High-quality Dense Prediction
-#             <p align="center">
-#             <a title="Page" href="https://lotus3d.github.io/" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
-#                 <img src="https://img.shields.io/badge/Project-Website-pink?logo=googlechrome&logoColor=white">
-#             </a>
-#             <a title="arXiv" href="https://arxiv.org/abs/2409.18124" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
-#                 <img src="https://img.shields.io/badge/arXiv-Paper-b31b1b?logo=arxiv&logoColor=white">
-#             </a>
-#             <a title="Github" href="https://github.com/EnVision-Research/Lotus" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
-#                 <img src="https://img.shields.io/github/stars/EnVision-Research/Lotus?label=GitHub%20%E2%98%85&logo=github&color=C8C" alt="badge-github-stars">
-#             </a>
-#             <a title="Social" href="https://x.com/haodongli00/status/1839524569058582884" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
-#                 <img src="https://www.obukhov.ai/img/badges/badge-social.svg" alt="social">
-#             </a>
-#         """
-#         )
-#         with gr.Tabs(elem_classes=["tabs"]):
-#             with gr.Tab("IMAGE"):
-#                 with gr.Row():
-#                     with gr.Column():
-#                         image_input = gr.Image(
-#                             label="Input Image",
-#                             type="filepath",
-#                         )
-#                         seed = gr.Number(
-#                             label="Seed",
-#                             minimum=0,
-#                             maximum=999999,
-#                         )
-#                         with gr.Row():
-#                             image_submit_btn = gr.Button(
-#                                 value="Predict Depth!", variant="primary"
-#                             )
-#                             # image_reset_btn = gr.Button(value="Reset")
-#                     with gr.Column():
-#                         image_output_g = gr.Image(
-#                             label="Output (Generative)",
-#                             type="filepath",
-#                         )
-#                         # image_output_g = ImageSlider(
-#                         #     label="Output (Generative)",
-#                         #     type="filepath",
-#                         #     show_download_button=True,
-#                         #     show_share_button=True,
-#                         #     interactive=False,
-#                         #     elem_classes="slider",
-#                         #     position=0.25,
-#                         # )
-#                         # with gr.Row():
-#                         #     image_output_d = gr.Image(
-#                         #         label="Output (Generative)",
-#                         #         type="filepath",
-#                         #     )
-#                         #     image_output_d = ImageSlider(
-#                         #         label="Output (Discriminative)",
-#                         #         type="filepath",
-#                         #         show_download_button=True,
-#                         #         show_share_button=True,
-#                         #         interactive=False,
-#                         #         elem_classes="slider",
-#                         #         position=0.25,
-#                         #     )
-#                 # gr.Examples(
-#                 #     fn=infer,
-#                 #     examples=sorted([
-#                 #         os.path.join("files", "images", name)
-#                 #         for name in os.listdir(os.path.join("files", "images"))
-#                 #     ]),
-#                 #     inputs=[image_input],
-#                 #     outputs=[image_output_g],
-#                 #     cache_examples=True,
-#                 # )
-#             with gr.Tab("VIDEO"):
-#                 with gr.Column():
-#                     gr.Markdown("Coming soon")
-#         ### Image
-#         image_submit_btn.click(
-#             fn=infer,
-#             inputs=[
-#                 image_input
-#             ],
-#             outputs=image_output_g,
-#             concurrency_limit=1,
-#         )
-#         # image_reset_btn.click(
-#         #     fn=lambda: (
-#         #         None,
-#         #         None,
-#         #         None,
-#         #     ),
-#         #     inputs=[],
-#         #     outputs=image_output_g,
-#         #     queue=False,
-#         # )
-#         ### Video
-#         ### Server launch
-#         demo.queue(
-#             api_open=False,
-#         ).launch(
-#             server_name="0.0.0.0",
-#             server_port=7860,
-#         )
-# def main():
-#     os.system("pip freeze")
-#     run_demo_server()
-# if __name__ == "__main__":
-#     main()
-def flip_text(x):
-    return x[::-1]
-def flip_image(x):
-    return np.fliplr(x)
-with gr.Blocks() as demo:
-    gr.Markdown("Flip text or image files using this demo.")
-    with gr.Tab("Flip Text"):
-        text_input = gr.Textbox()
-        text_output = gr.Textbox()
-        text_button = gr.Button("Flip")
-    with gr.Tab("Flip Image"):
-        with gr.Row():
-            image_input = gr.Image()
-            image_output = gr.Image()
-        image_button = gr.Button("Flip")
-    with gr.Accordion("Open for More!", open=False):
-        gr.Markdown("Look at me...")
-        temp_slider = gr.Slider(
-            0, 1,
-            value=0.1,
-            step=0.1,
-            interactive=True,
-            label="Slide me",
         )
-    text_button.click(flip_text, inputs=text_input, outputs=text_output)
-    image_button.click(flip_image, inputs=image_input, outputs=image_output)
-demo.launch(share=True)

 from gradio_imageslider import ImageSlider
 import functools
 import os
 from pathlib import Path
 import gradio
 from gradio.utils import get_cache_folder
+from infer import lotus, lotus_video
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+def infer(path_input, seed=0):
+    name_base, name_ext = os.path.splitext(os.path.basename(path_input))
+    output_g, output_d = lotus(path_input, 'depth', seed, device)
+    if not os.path.exists("files/output"):
+        os.makedirs("files/output")
+    g_save_path = os.path.join("files/output", f"{name_base}_g{name_ext}")
+    d_save_path = os.path.join("files/output", f"{name_base}_d{name_ext}")
+    output_g.save(g_save_path)
+    output_d.save(d_save_path)
+    return [path_input, g_save_path], [path_input, d_save_path]
+def infer_video(path_input, seed=0):
+    frames_g, frames_d = lotus_video(path_input, 'depth', seed, device)
+    if not os.path.exists("files/output"):
+        os.makedirs("files/output")
+    name_base, _ = os.path.splitext(os.path.basename(path_input))
+    g_save_path = os.path.join("files/output", f"{name_base}_g.mp4")
+    d_save_path = os.path.join("files/output", f"{name_base}_d.mp4")
+    imageio.mimsave(g_save_path, frames_g)
+    imageio.mimsave(d_save_path, frames_d)
+    return [g_save_path, d_save_path]
+def run_demo_server():
+    gradio_theme = gr.themes.Default()
+    with gr.Blocks(
+        theme=gradio_theme,
+        title="LOTUS (Depth)",
+        css="""
+            #download {
+                height: 118px;
+            }
+            .slider .inner {
+                width: 5px;
+                background: #FFF;
+            }
+            .viewport {
+                aspect-ratio: 4/3;
+            }
+            .tabs button.selected {
+                font-size: 20px !important;
+                color: crimson !important;
+            }
+            h1 {
+                text-align: center;
+                display: block;
+            }
+            h2 {
+                text-align: center;
+                display: block;
+            }
+            h3 {
+                text-align: center;
+                display: block;
+            }
+            .md_feedback li {
+                margin-bottom: 0px !important;
+            }
+        """,
+        head="""
+            <script async src="https://www.googletagmanager.com/gtag/js?id=G-1FWSVCGZTG"></script>
+            <script>
+                window.dataLayer = window.dataLayer || [];
+                function gtag() {dataLayer.push(arguments);}
+                gtag('js', new Date());
+                gtag('config', 'G-1FWSVCGZTG');
+            </script>
+        """,
+    ) as demo:
+        gr.Markdown(
+            """
+            # LOTUS: Diffusion-based Visual Foundation Model for High-quality Dense Prediction
+            <p align="center">
+            <a title="Page" href="https://lotus3d.github.io/" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                <img src="https://img.shields.io/badge/Project-Website-pink?logo=googlechrome&logoColor=white">
+            </a>
+            <a title="arXiv" href="https://arxiv.org/abs/2409.18124" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                <img src="https://img.shields.io/badge/arXiv-Paper-b31b1b?logo=arxiv&logoColor=white">
+            </a>
+            <a title="Github" href="https://github.com/EnVision-Research/Lotus" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                <img src="https://img.shields.io/github/stars/EnVision-Research/Lotus?label=GitHub%20%E2%98%85&logo=github&color=C8C" alt="badge-github-stars">
+            </a>
+            <a title="Social" href="https://x.com/haodongli00/status/1839524569058582884" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                <img src="https://www.obukhov.ai/img/badges/badge-social.svg" alt="social">
+            </a>
+        """
+        )
+        with gr.Tabs(elem_classes=["tabs"]):
+            with gr.Tab("IMAGE"):
+                with gr.Row():
+                    with gr.Column():
+                        image_input = gr.Image(
+                            label="Input Image",
+                            type="filepath",
+                        )
+                        seed = gr.Number(
+                            label="Seed (only for Generative mode)",
+                            minimum=0,
+                            maximum=999999999,
+                        )
+                        with gr.Row():
+                            image_submit_btn = gr.Button(
+                                value="Predict Depth!", variant="primary"
+                            )
+                            image_reset_btn = gr.Button(value="Reset")
+                    with gr.Column():
+                        image_output_g = ImageSlider(
+                            label="Output (Generative)",
+                            type="filepath",
+                            interactive=False,
+                            elem_classes="slider",
+                            position=0.25,
+                        )
+                        with gr.Row():
+                            image_output_d = ImageSlider(
+                                label="Output (Discriminative)",
+                                type="filepath",
+                                interactive=False,
+                                elem_classes="slider",
+                                position=0.25,
+                            )
+                gr.Examples(
+                    fn=infer,
+                    examples=sorted([
+                        os.path.join("files", "images", name)
+                        for name in os.listdir(os.path.join("files", "images"))
+                    ]),
+                    inputs=[image_input],
+                    outputs=[image_output_g, image_output_d],
+                    cache_examples=True,
+                )
+            with gr.Tab("VIDEO"):
+                with gr.Row():
+                    with gr.Column():
+                        input_video = gr.Video(
+                            label="Input Video",
+                            autoplay=True,
+                            loop=True,
+                        )
+                        seed = gr.Number(
+                            label="Seed (only for Generative mode)",
+                            minimum=0,
+                            maximum=999999999,
+                        )
+                        with gr.Row():
+                            video_submit_btn = gr.Button(
+                                value="Compute Depth!", variant="primary"
+                            )
+                            video_reset_btn = gr.Button(value="Reset")
+                    with gr.Column():
+                        video_output_g = gr.Video(
+                            label="Output (Generative)",
+                            interactive=False,
+                            autoplay=True,
+                            loop=True,
+                            show_share_button=True,
+                        )
+                        with gr.Row():
+                            video_output_d = gr.Video(
+                                label="Output (Discriminative)",
+                                interactive=False,
+                                autoplay=True,
+                                loop=True,
+                                show_share_button=True,
+                            )
+                gr.Examples(
+                    fn=infer_video,
+                    examples=sorted([
+                        os.path.join("files", "videos", name)
+                        for name in os.listdir(os.path.join("files", "videos"))
+                    ]),
+                    inputs=[input_video],
+                    outputs=[video_output_g, video_output_d],
+                    cache_examples=True,
+                )
+        ### Image
+        image_submit_btn.click(
+            fn=infer,
+            inputs=[image_input, seed],
+            outputs=[image_output_g, image_output_d],
+            concurrency_limit=1,
+        )
+        image_reset_btn.click(
+            fn=lambda: (
+                None,
+                None,
+                None,
+            ),
+            inputs=[],
+            outputs=[image_output_g, image_output_d],
+            queue=False,
+        )
+        ### Video
+        video_submit_btn.click(
+            fn=infer_video,
+            inputs=[input_video, seed],
+            outputs=[video_output_g, video_output_d],
+            queue=True,
+        )
+        ### Server launch
+        demo.queue(
+            api_open=False,
+        ).launch(
+            server_name="0.0.0.0",
+            server_port=7860,
         )
+def main():
+    os.system("pip freeze")
+    run_demo_server()
+if __name__ == "__main__":
+    main()

files/videos/K_0005_IN.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9a532ba2738716dbb244e0d7172cf681879218cbbdad09980404fa08ef6b9ecc
+size 3095352

files/videos/obama.mp4 DELETED Viewed

Binary file (320 kB)

infer.py CHANGED Viewed

@@ -14,6 +14,9 @@ from pipeline import LotusGPipeline, LotusDPipeline
 from utils.image_utils import colorize_depth_map
 from utils.seed_all import seed_all
 check_min_version('0.28.0.dev0')
 def infer_pipe(pipe, image_input, task_name, seed, device):
@@ -22,36 +25,137 @@ def infer_pipe(pipe, image_input, task_name, seed, device):
     else:
         generator = torch.Generator(device=device).manual_seed(seed)
-    test_image = Image.open(image_input).convert('RGB')
-    test_image = np.array(test_image).astype(np.float32)
-    test_image = torch.tensor(test_image).permute(2,0,1).unsqueeze(0)
-    test_image = test_image / 127.5 - 1.0
-    test_image = test_image.to(device)
-    task_emb = torch.tensor([1, 0]).float().unsqueeze(0).repeat(1, 1).to(device)
-    task_emb = torch.cat([torch.sin(task_emb), torch.cos(task_emb)], dim=-1).repeat(1, 1)
-    # Run
-    pred = pipe(
-        rgb_in=test_image,
-        prompt='',
-        num_inference_steps=1,
-        generator=generator,
-        # guidance_scale=0,
-        output_type='np',
-        timesteps=[999],
-        task_emb=task_emb,
-        ).images[0]
-    # Post-process the prediction
     if task_name == 'depth':
-        output_npy = pred.mean(axis=-1)
-        output_color = colorize_depth_map(output_npy)
     else:
-        output_npy = pred
-        output_color = Image.fromarray((output_npy * 255).astype(np.uint8))
-    return output_color
 def lotus(image_input, task_name, seed, device):
     if task_name == 'depth':
@@ -61,7 +165,7 @@ def lotus(image_input, task_name, seed, device):
         model_g = 'jingheya/lotus-normal-g-v1-0'
         model_d = 'jingheya/lotus-normal-d-v1-0'
-    dtype = torch.float32
     pipe_g = LotusGPipeline.from_pretrained(
         model_g,
         torch_dtype=dtype,
@@ -72,6 +176,8 @@ def lotus(image_input, task_name, seed, device):
     )
     pipe_g.to(device)
     pipe_d.to(device)
     logging.info(f"Successfully loading pipeline from {model_g} and {model_d}.")
     output_g = infer_pipe(pipe_g, image_input, task_name, seed, device)
     output_d = infer_pipe(pipe_d, image_input, task_name, seed, device)
@@ -158,7 +264,7 @@ def main():
         dtype = torch.float16
         logging.info(f"Running with half precision ({dtype}).")
     else:
-        dtype = torch.float32
     # -------------------- Device --------------------
     if torch.cuda.is_available():
@@ -206,7 +312,7 @@ def main():
         for i in tqdm(range(len(test_images))):
             # Preprocess validation image
             test_image = Image.open(test_images[i]).convert('RGB')
-            test_image = np.array(test_image).astype(np.float32)
             test_image = torch.tensor(test_image).permute(2,0,1).unsqueeze(0)
             test_image = test_image / 127.5 - 1.0
             test_image = test_image.to(device)

 from utils.image_utils import colorize_depth_map
 from utils.seed_all import seed_all
+from contextlib import nullcontext
+import cv2
 check_min_version('0.28.0.dev0')
 def infer_pipe(pipe, image_input, task_name, seed, device):
     else:
         generator = torch.Generator(device=device).manual_seed(seed)
+    if torch.backends.mps.is_available():
+        autocast_ctx = nullcontext()
+    else:
+        autocast_ctx = torch.autocast(pipe.device.type)
+    with autocast_ctx:
+        test_image = Image.open(image_input).convert('RGB')
+        test_image = np.array(test_image).astype(np.float16)
+        test_image = torch.tensor(test_image).permute(2,0,1).unsqueeze(0)
+        test_image = test_image / 127.5 - 1.0
+        test_image = test_image.to(device)
+        task_emb = torch.tensor([1, 0]).float().unsqueeze(0).repeat(1, 1).to(device)
+        task_emb = torch.cat([torch.sin(task_emb), torch.cos(task_emb)], dim=-1).repeat(1, 1)
+        # Run
+        pred = pipe(
+            rgb_in=test_image,
+            prompt='',
+            num_inference_steps=1,
+            generator=generator,
+            # guidance_scale=0,
+            output_type='np',
+            timesteps=[999],
+            task_emb=task_emb,
+            ).images[0]
+        # Post-process the prediction
+        if task_name == 'depth':
+            output_npy = pred.mean(axis=-1)
+            output_color = colorize_depth_map(output_npy)
+        else:
+            output_npy = pred
+            output_color = Image.fromarray((output_npy * 255).astype(np.uint8))
+    return output_color
+def lotus_video(input_video, task_name, seed, device):
     if task_name == 'depth':
+        model_g = 'jingheya/lotus-depth-g-v1-0'
+        model_d = 'jingheya/lotus-depth-d-v1-0'
     else:
+        model_g = 'jingheya/lotus-normal-g-v1-0'
+        model_d = 'jingheya/lotus-normal-d-v1-0'
+    dtype = torch.float16
+    pipe_g = LotusGPipeline.from_pretrained(
+        model_g,
+        torch_dtype=dtype,
+    )
+    pipe_d = LotusDPipeline.from_pretrained(
+        model_d,
+        torch_dtype=dtype,
+    )
+    pipe_g.to(device)
+    pipe_d.to(device)
+    pipe_g.set_progress_bar_config(disable=True)
+    pipe_d.set_progress_bar_config(disable=True)
+    logging.info(f"Successfully loading pipeline from {model_g} and {model_d}.")
+    # load the video and split it into frames
+    cap = cv2.VideoCapture(input_video)
+    frames = []
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break
+        frames.append(frame)
+    cap.release()
+    logging.info(f"There are {len(frames)} frames in the video.")
+    if seed is None:
+        generator = None
+    else:
+        generator = torch.Generator(device=device).manual_seed(seed)
+    task_emb = torch.tensor([1, 0]).float().unsqueeze(0).repeat(1, 1).to(device)
+    task_emb = torch.cat([torch.sin(task_emb), torch.cos(task_emb)], dim=-1).repeat(1, 1)
+    output_g = []
+    output_d = []
+    for frame in frames:
+        if torch.backends.mps.is_available():
+            autocast_ctx = nullcontext()
+        else:
+            autocast_ctx = torch.autocast(pipe_g.device.type)
+        with autocast_ctx:
+            test_image = frame
+            test_image = np.array(test_image).astype(np.float16)
+            test_image = torch.tensor(test_image).permute(2,0,1).unsqueeze(0)
+            test_image = test_image / 127.5 - 1.0
+            test_image = test_image.to(device)
+            # Run
+            pred_g = pipe_g(
+                rgb_in=test_image,
+                prompt='',
+                num_inference_steps=1,
+                generator=generator,
+                # guidance_scale=0,
+                output_type='np',
+                timesteps=[999],
+                task_emb=task_emb,
+                ).images[0]
+            pred_d = pipe_d(
+                rgb_in=test_image,
+                prompt='',
+                num_inference_steps=1,
+                generator=generator,
+                # guidance_scale=0,
+                output_type='np',
+                timesteps=[999],
+                task_emb=task_emb,
+                ).images[0]
+            # Post-process the prediction
+            if task_name == 'depth':
+                output_npy_g = pred_g.mean(axis=-1)
+                output_color_g = colorize_depth_map(output_npy_g)
+                output_npy_d = pred_d.mean(axis=-1)
+                output_color_d = colorize_depth_map(output_npy_d)
+            else:
+                output_npy_g = pred_g
+                output_color_g = Image.fromarray((output_npy_g * 255).astype(np.uint8))
+                output_npy_d = pred_d
+                output_color_d = Image.fromarray((output_npy_d * 255).astype(np.uint8))
+            output_g.append(output_color_g)
+            output_d.append(output_color_d)
+    return output_g, output_d
 def lotus(image_input, task_name, seed, device):
     if task_name == 'depth':
         model_g = 'jingheya/lotus-normal-g-v1-0'
         model_d = 'jingheya/lotus-normal-d-v1-0'
+    dtype = torch.float16
     pipe_g = LotusGPipeline.from_pretrained(
         model_g,
         torch_dtype=dtype,
     )
     pipe_g.to(device)
     pipe_d.to(device)
+    pipe_g.set_progress_bar_config(disable=True)
+    pipe_d.set_progress_bar_config(disable=True)
     logging.info(f"Successfully loading pipeline from {model_g} and {model_d}.")
     output_g = infer_pipe(pipe_g, image_input, task_name, seed, device)
     output_d = infer_pipe(pipe_d, image_input, task_name, seed, device)
         dtype = torch.float16
         logging.info(f"Running with half precision ({dtype}).")
     else:
+        dtype = torch.float16
     # -------------------- Device --------------------
     if torch.cuda.is_available():
         for i in tqdm(range(len(test_images))):
             # Preprocess validation image
             test_image = Image.open(test_images[i]).convert('RGB')
+            test_image = np.array(test_image).astype(np.float16)
             test_image = torch.tensor(test_image).permute(2,0,1).unsqueeze(0)
             test_image = test_image / 127.5 - 1.0
             test_image = test_image.to(device)

pipeline.py CHANGED Viewed

@@ -1197,7 +1197,6 @@ class LotusGPipeline(DirectDiffusionPipeline):
         # 2. Define call parameters
         batch_size = rgb_in.shape[0]
         device = self._execution_device
-        print("Device: ", device)
         # 3. Encode input prompt
         prompt_embeds, _ = self.encode_prompt(

         # 2. Define call parameters
         batch_size = rgb_in.shape[0]
         device = self._execution_device
         # 3. Encode input prompt
         prompt_embeds, _ = self.encode_prompt(

requirements.txt CHANGED Viewed

@@ -17,7 +17,8 @@ h5py==3.11.0
 omegaconf==2.3.0
 tabulate==0.9.0
 imageio==2.35.1
 spaces==0.28.3
-gradio==4.21.0
 gradio-imageslider==0.0.16
-gradio_client==0.12.0

 omegaconf==2.3.0
 tabulate==0.9.0
 imageio==2.35.1
+imageio-ffmpeg==0.5.1
 spaces==0.28.3
+gradio==4.44.0
 gradio-imageslider==0.0.16
+gradio-client==1.3.0