jadechoghari committed
Commit
7c3177c
1 Parent(s): e090f2e

Update app.py

Files changed (1)
  1. app.py +41 -13
app.py CHANGED
@@ -1,16 +1,18 @@
 from typing import Tuple, Union
-
 import gradio as gr
 import numpy as np
 import see2sound
 import spaces
 import torch
 import yaml
+import os
 from huggingface_hub import snapshot_download
+from PIL import Image
 
 model_id = "rishitdagli/see-2-sound"
 base_path = snapshot_download(repo_id=model_id)
 
+# load and update the configuration
 with open("config.yaml", "r") as file:
     data = yaml.safe_load(file)
 data_str = yaml.dump(data)
@@ -22,20 +24,43 @@ with open("config.yaml", "w") as file:
 model = see2sound.See2Sound(config_path="config.yaml")
 model.setup()
 
+CACHE_DIR = "gradio_cached_examples"
+
+# function to create cached output directory
+def create_cache_dir(image_path):
+    image_name = os.path.basename(image_path).split('.')[0]
+    cached_dir = os.path.join(CACHE_DIR, image_name)
+    os.makedirs(cached_dir, exist_ok=True)
+    return cached_dir
 
+# fn to process image and cache outputs
 @spaces.GPU(duration=280)
 @torch.no_grad()
 def process_image(
     image: str, num_audios: int, prompt: Union[str, None], steps: Union[int, None]
 ) -> Tuple[str, str]:
+    cached_dir = create_cache_dir(image)
+    cached_image_path = os.path.join(cached_dir, "processed_image.png")
+    cached_audio_path = os.path.join(cached_dir, "audio.wav")
+
+    # check if cached outputs exist, if yes, return them
+    if os.path.exists(cached_image_path) and os.path.exists(cached_audio_path):
+        return cached_image_path, cached_audio_path
+
+    # run the model if outputs are not cached
     model.run(
         path=image,
-        output_path="audio.wav",
+        output_path=cached_audio_path,  # Save audio in cache directory
        num_audios=num_audios,
         prompt=prompt,
         steps=steps,
     )
-    return image, "audio.wav"
+
+    # save the processed image to the cache directory (use original image or any transformations)
+    processed_image = Image.open(image)  # Assuming image is a file path
+    processed_image.save(cached_image_path)
+
+    return cached_image_path, cached_audio_path
 
 
 description_text = """# SEE-2-SOUND πŸ”Š Demo
@@ -43,8 +68,6 @@ description_text = """# SEE-2-SOUND πŸ”Š Demo
 Official demo for *SEE-2-SOUND πŸ”Š: Zero-Shot Spatial Environment-to-Spatial Sound*.
 Please refer to our [paper](https://arxiv.org/abs/2406.06612), [project page](https://see2sound.github.io/), or [github](https://github.com/see2sound/see2sound) for more details.
 > Note: You should make sure that your hardware supports spatial audio.
-
-This demo allows you to generate spatial audio given an image. Upload an image (with an optional text prompt in the advanced settings) to geenrate spatial audio to accompany the image.
 """
 
 css = """
@@ -92,18 +115,23 @@ with gr.Blocks(css=css) as demo:
         ),
     )
 
+    # load examples with manually cached outputs
     gr.Examples(
-        examples=[[f"examples/{i}.png"] for i in range(1, 10)],
-        inputs=[image],
+        examples=[
+            ["examples/1.png", 3, "A scenic mountain view", 500],
+            ["examples/2.png", 2, "A forest with birds", 500],
+            ["examples/3.png", 1, "A crowded city", 500]
+        ],
+        inputs=[image, num_audios, prompt, steps],
         outputs=[processed_image, generated_audio],
-        cache_examples="lazy"
+        cache_examples="lazy",  # Cache outputs as users interact
+        fn=process_image
     )
 
-    gr.on(
-        triggers=[submit_button.click],
-        fn=process_image,
-        inputs=[image, num_audios, prompt, steps],
-        outputs=[processed_image, generated_audio],
+    submit_button.click(
+        process_image,
+        inputs=[image, num_audios, prompt, steps],
+        outputs=[processed_image, generated_audio]
    )
 
 if __name__ == "__main__":
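
For reference, a minimal standalone sketch of the cache layout this commit introduces; the example path is illustrative, and the helper mirrors the create_cache_dir added above:

import os

CACHE_DIR = "gradio_cached_examples"

def create_cache_dir(image_path: str) -> str:
    # same logic as the helper in this commit: one cache folder per file-name stem
    image_name = os.path.basename(image_path).split(".")[0]
    cached_dir = os.path.join(CACHE_DIR, image_name)
    os.makedirs(cached_dir, exist_ok=True)
    return cached_dir

# "examples/1.png" maps to "gradio_cached_examples/1"; process_image then
# writes "processed_image.png" and "audio.wav" into that folder and returns
# the cached copies on repeat runs with the same file name.
print(create_cache_dir("examples/1.png"))  # gradio_cached_examples/1

Note that keying the cache on the basename stem alone means two inputs with the same file name in different directories would share one cache entry.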