laizeqiang committed
Commit 38e694c • 1 Parent(s): c160de1
{imagebind/models → anything2image}/__init__.py RENAMED
File without changes
anything2image/api.py ADDED
@@ -0,0 +1,59 @@
+ import soundfile as sf
+ import torch
+ import numpy as np
+ from diffusers import StableUnCLIPImg2ImgPipeline
+ from PIL import Image
+
+ from . import imagebind
+
+
+ class Anything2Image:
+     def __init__(
+         self,
+         device="cuda:0" if torch.cuda.is_available() else "cpu",
+         imagebind_download_dir="checkpoints",
+     ):
+         self.pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
+             "stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch.float16
+         ).to(device)
+         self.model = imagebind.imagebind_huge(pretrained=True, download_dir=imagebind_download_dir).eval().to(device)
+         self.device = device
+
+     @torch.no_grad()
+     def __call__(self, prompt=None, audio=None, image=None, text=None):
+         device, model, pipe = self.device, self.model, self.pipe
+
+         if audio is not None:
+             sr, waveform = audio
+             sf.write('tmp.wav', waveform, sr)
+             embeddings = model.forward({
+                 imagebind.ModalityType.AUDIO: imagebind.load_and_transform_audio_data(['tmp.wav'], device),
+             })
+             audio_embeddings = embeddings[imagebind.ModalityType.AUDIO]
+         if image is not None:
+             Image.fromarray(image).save('tmp.png')
+             embeddings = model.forward({
+                 imagebind.ModalityType.VISION: imagebind.load_and_transform_vision_data(['tmp.png'], device),
+             }, normalize=False)
+             image_embeddings = embeddings[imagebind.ModalityType.VISION]
+
+         if audio is not None and image is not None:
+             embeddings = (audio_embeddings + image_embeddings) / 2
+         elif image is not None:
+             embeddings = image_embeddings
+         elif audio is not None:
+             embeddings = audio_embeddings
+         else:
+             embeddings = None
+
+         if text is not None and text != "":
+             embeddings = self.model.forward({
+                 imagebind.ModalityType.TEXT: imagebind.load_and_transform_text([text], device),
+             }, normalize=False)
+             embeddings = embeddings[imagebind.ModalityType.TEXT]
+
+         if embeddings is not None:
+             embeddings = embeddings.half()
+
+         images = pipe(prompt=prompt, image_embeds=embeddings).images
+         return images[0]
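
Usage note: a minimal sketch of calling the new API directly (it mirrors anything2image/cli.py below; the asset path and output filename here are illustrative):

    import soundfile as sf
    from anything2image.api import Anything2Image

    anything2img = Anything2Image(imagebind_download_dir='checkpoints')
    data, samplerate = sf.read('assets/wav/cat.wav')  # illustrative path from the repo's assets
    image = anything2img(prompt='A painting', audio=(samplerate, data))  # returns a PIL.Image
    image.save('audio_to_image.png')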
anything2image/app.py ADDED
@@ -0,0 +1,34 @@
+ import gradio as gr
+ import fire
+ import os
+ from anything2image.api import Anything2Image
+
+
+ def main(ckpt_dir=os.path.join(os.path.expanduser('~'), 'anything2image', 'checkpoints'), ip='0.0.0.0', port=10049, share=False):
+     anything2img = Anything2Image(imagebind_download_dir=ckpt_dir)
+
+     with gr.Blocks() as demo:
+         gr.HTML(
+             """
+             <div align='center'> <h1>Anything To Image</h1> </div>
+             <p align="center"> Generate an image from anything, using ImageBind's unified latent space and stable-diffusion-2-1-unclip. </p>
+             <p align="center"><a href="https://github.com/Zeqiang-Lai/Anything2Image"><b>https://github.com/Zeqiang-Lai/Anything2Image</b></a></p>
+             """)
+         gr.Interface(fn=anything2img,
+                      inputs=[gr.Text(placeholder="Enter a prompt in addition to the audio, image, or text condition below", label="Prompt (can be empty)"),
+                              "audio",
+                              "image",
+                              "text"
+                              ],
+                      outputs="image",
+                      examples=[['', 'assets/wav/dog_audio.wav', None, None],
+                                ['A painting', 'assets/wav/cat.wav', None, None],
+                                ['', 'assets/wav/wave.wav', 'assets/image/bird.png', None],
+                                ['', None, 'assets/image/bird_image.jpg', None],
+                                ['', None, None, 'A sunset over the ocean.'],
+                                ],
+                      cache_examples=True,
+                      )
+     demo.queue(1).launch(server_name=ip, server_port=port, share=share)
+
+ fire.Fire(main)
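
Usage note: fire.Fire(main) turns main's keyword arguments into command-line flags, so the demo can be launched with an invocation along these lines (flags shown are illustrative, mirroring main()'s signature):

    python -m anything2image.app --ip 0.0.0.0 --port 10049 --share True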
anything2image/cli.py ADDED
@@ -0,0 +1,21 @@
+ import os
+ import fire
+ from anything2image.api import Anything2Image
+ import soundfile as sf
+ from PIL import Image
+ import numpy as np
+
+ def main(
+     prompt='', audio=None, image=None, text=None,
+     ckpt_dir=os.path.join(os.path.expanduser('~'), 'anything2image', 'checkpoints')
+ ):
+     anything2img = Anything2Image(imagebind_download_dir=ckpt_dir)
+     if audio is not None:
+         data, samplerate = sf.read(audio)
+         audio = (samplerate, data)
+     if image is not None:
+         image = np.array(Image.open(image))
+     image = anything2img(prompt=prompt, audio=audio, image=image, text=text)
+     image.save('cli_output.png')
+
+ fire.Fire(main)
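
Usage note: an illustrative invocation of this CLI (the asset path comes from this repo; the result is written to cli_output.png in the working directory):

    python -m anything2image.cli --prompt "A painting" --audio assets/wav/cat.wav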
{imagebind → anything2image/imagebind}/__init__.py RENAMED
File without changes
{imagebind → anything2image/imagebind}/bpe/bpe_simple_vocab_16e6.txt.gz RENAMED
File without changes
{imagebind → anything2image/imagebind}/data.py RENAMED
@@ -7,6 +7,7 @@
 
  import logging
  import math
+ import os
 
  import torch
  import torch.nn as nn
@@ -22,7 +23,8 @@ from .models.multimodal_preprocessors import SimpleTokenizer
 
  DEFAULT_AUDIO_FRAME_SHIFT_MS = 10 # in milliseconds
 
- BPE_PATH = "bpe/bpe_simple_vocab_16e6.txt.gz"
+ CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+ BPE_PATH = os.path.join(CURRENT_DIR, "bpe/bpe_simple_vocab_16e6.txt.gz")
 
 
  def waveform2melspec(waveform, sample_rate, num_mel_bins, target_length):
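
Note: anchoring BPE_PATH to the module's own directory means the tokenizer vocabulary resolves correctly no matter where the process is launched from, which the new package-style imports rely on:

    # BPE_PATH now resolves relative to the installed package, not the current working directory:
    # <...>/anything2image/imagebind/bpe/bpe_simple_vocab_16e6.txt.gz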
anything2image/imagebind/models/__init__.py ADDED
File without changes
{imagebind → anything2image/imagebind}/models/helpers.py RENAMED
File without changes
{imagebind → anything2image/imagebind}/models/imagebind_model.py RENAMED
@@ -452,7 +452,7 @@ class ImageBindModel(nn.Module):
 
         return nn.ModuleDict(modality_postprocessors)
 
-    def forward(self, inputs):
+    def forward(self, inputs, normalize=True):
         outputs = {}
         for modality_key, modality_value in inputs.items():
             reduce_list = (
@@ -474,9 +474,10 @@ class ImageBindModel(nn.Module):
             modality_value = self.modality_heads[modality_key](
                 modality_value, **head_inputs
             )
-            modality_value = self.modality_postprocessors[modality_key](
-                modality_value
-            )
+            if normalize:
+                modality_value = self.modality_postprocessors[modality_key](
+                    modality_value
+                )
 
             if reduce_list:
                 modality_value = modality_value.reshape(B, S, -1)
@@ -487,7 +488,7 @@ class ImageBindModel(nn.Module):
         return outputs
 
 
-def imagebind_huge(pretrained=False):
+def imagebind_huge(pretrained=False, download_dir="checkpoints"):
     model = ImageBindModel(
         vision_embed_dim=1280,
         vision_num_blocks=32,
@@ -501,17 +502,20 @@ def imagebind_huge(pretrained=False):
     )
 
     if pretrained:
-        if not os.path.exists("checkpoints/imagebind_huge.pth"):
-            print(
-                "Downloading imagebind weights to .checkpoints/imagebind_huge.pth ..."
-            )
-            os.makedirs("checkpoints", exist_ok=True)
+        path = os.path.join(download_dir, 'imagebind_huge.pth')
+        # If a checkpoint already exists in the current directory, do not download.
+        default_path = os.path.join('checkpoints', 'imagebind_huge.pth')
+        if os.path.exists(default_path):
+            path = default_path
+        if not os.path.exists(path):
+            print(f"Downloading imagebind weights to {path} ...")
+            os.makedirs(download_dir, exist_ok=True)
             torch.hub.download_url_to_file(
                 "https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth",
-                "checkpoints/imagebind_huge.pth",
+                path,
                 progress=True,
             )
 
-    model.load_state_dict(torch.load("checkpoints/imagebind_huge.pth"))
+    model.load_state_dict(torch.load(path))
 
     return model
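
Note on the two changes above: forward() gains a normalize flag so callers can skip the modality postprocessors (the final embedding normalization), which anything2image/api.py uses for its vision and text branches, and imagebind_huge() gains a download_dir so the checkpoint need not live in the working directory. A sketch of the call sites (names as introduced in this diff):

    model = imagebind_huge(pretrained=True, download_dir='checkpoints')
    embeddings = model.forward(inputs, normalize=False)  # raw head outputs, postprocessors skipped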
{imagebind → anything2image/imagebind}/models/multimodal_preprocessors.py RENAMED
File without changes
{imagebind → anything2image/imagebind}/models/transformer.py RENAMED
File without changes
app.py CHANGED
@@ -1,49 +1,30 @@
  import gradio as gr
- import imagebind
- import soundfile as sf
- import torch
- from diffusers import StableUnCLIPImg2ImgPipeline
- from PIL import Image
+ from anything2image.api import Anything2Image
 
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
- pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
-     "stabilityai/stable-diffusion-2-1-unclip",
- )
- pipe = pipe.to(device)
 
- model = imagebind.imagebind_huge(pretrained=True)
- model.eval()
- model.to(device)
+ anything2img = Anything2Image(imagebind_download_dir='checkpoints')
 
- @torch.no_grad()
- def anything2img(prompt, audio, image):
-     if audio is not None:
-         sr, waveform = audio
-         sf.write('tmp.wav', waveform, sr)
-         embeddings = model.forward({
-             imagebind.ModalityType.AUDIO: imagebind.load_and_transform_audio_data(['tmp.wav'], device),
-         })
-         audio_embeddings = embeddings[imagebind.ModalityType.AUDIO]
-     if image is not None:
-         Image.fromarray(image).save('tmp.png')
-         embeddings = model.forward({
-             imagebind.ModalityType.VISION: imagebind.load_and_transform_vision_data(['tmp.png'], device),
-         })
-         image_embeddings = embeddings[imagebind.ModalityType.VISION]
-
-     if audio_embeddings is not None and image_embeddings is not None:
-         embeddings = audio_embeddings + image_embeddings
-     elif image_embeddings is not None:
-         embeddings = image_embeddings
-     elif audio_embeddings is not None:
-         embeddings = audio_embeddings
-     else:
-         embeddings = None
-
-     images = pipe(prompt=prompt, image_embeds=embeddings).images
-     return images[0]
-
-
- demo = gr.Interface(fn=anything2img, inputs=["text", "audio", "image"], outputs="image")
- # demo.launch(server_name='0.0.0.0', server_port=10051, share=True)
- demo.launch(server_name='0.0.0.0', server_port=10047, share=True)
+ with gr.Blocks() as demo:
+     gr.HTML(
+         """
+         <div align='center'> <h1>Anything To Image</h1> </div>
+         <p align="center"> Generate an image from anything, using ImageBind's unified latent space and stable-diffusion-2-1-unclip. </p>
+         <p align="center"><a href="https://github.com/Zeqiang-Lai/Anything2Image"><b>https://github.com/Zeqiang-Lai/Anything2Image</b></a></p>
+         """)
+     gr.Interface(fn=anything2img,
+                  inputs=[gr.Text(placeholder="Enter a prompt in addition to the audio, image, or text condition below", label="Prompt (can be empty)"),
+                          "audio",
+                          "image",
+                          "text"
+                          ],
+                  outputs="image",
+                  examples=[['', 'assets/wav/dog_audio.wav', None, None],
+                            ['A painting', 'assets/wav/cat.wav', None, None],
+                            ['', 'assets/wav/wave.wav', 'assets/image/bird.png', None],
+                            ['', None, 'assets/image/bird_image.jpg', None],
+                            ['', None, None, 'A sunset over the ocean.'],
+                            ],
+                  cache_examples=True,
+                  )
+ demo.queue(1).launch()
assets/generated/audio_image_to_image/bird_rain.png CHANGED
Git LFS Details (before)
  • SHA256: d86e2b91b1a4f3719a41c696845731b9a586aef241e2c8f56cc779c6c4c7ea6d
  • Pointer size: 132 Bytes
  • Size of remote file: 1.08 MB
Git LFS Details (after)
  • SHA256: 09b7ae7d8155b45ecd59e81165f508f35745e0b2e74ef10ba1f94dc9deb5767a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.09 MB
assets/generated/audio_image_to_image/bird_wave.png CHANGED
Git LFS Details (before)
  • SHA256: 0c80e2177421e6922651c4f75a016716ce153dffa479e5d385dd34402878707b
  • Pointer size: 131 Bytes
  • Size of remote file: 748 kB
Git LFS Details (after)
  • SHA256: dc6195dc6d1ddaefb9d628fd7ff84f97cbd578dccbdc8cfadfa0d47e296eea5f
  • Pointer size: 131 Bytes
  • Size of remote file: 693 kB
assets/generated/image_to_image/bird_image.png ADDED
Git LFS Details
  • SHA256: 7fce7238cd230431128e61f37dfc057818f6ae3cea9e7b8f7a6bee24b10516d4
  • Pointer size: 131 Bytes
  • Size of remote file: 833 kB
assets/generated/image_to_image/car_image.png CHANGED
Git LFS Details (before)
  • SHA256: e0acc986045fd06a3bc7561207bb1d8fd6164c6e7ef0b74986aa4f31c7847b4f
  • Pointer size: 131 Bytes
  • Size of remote file: 986 kB
Git LFS Details (after)
  • SHA256: 031b35a8fc906405aacc46d96814b1309fa99f90bbccd607ebb64e9c08784cc9
  • Pointer size: 131 Bytes
  • Size of remote file: 812 kB
assets/generated/image_to_image/dog_image.png ADDED
Git LFS Details
  • SHA256: 89fea42f3f0c7c22e7b6cb8b99361b9f358d1e4f3886b65f42c55de525d09223
  • Pointer size: 131 Bytes
  • Size of remote file: 835 kB
assets/generated/image_to_image/room.png ADDED
Git LFS Details
  • SHA256: 127568497fe93a2b5fa765e66a3a2ed4c0b7716bf3d09745721ab3204a83ed57
  • Pointer size: 131 Bytes
  • Size of remote file: 662 kB
assets/generated/text_to_image/car.png ADDED
Git LFS Details
  • SHA256: 29a9de7e2326de9984ce861964188c3f0dca29466fd0602f173563cafa894967
  • Pointer size: 131 Bytes
  • Size of remote file: 937 kB
assets/generated/text_to_image/city.png ADDED
Git LFS Details
  • SHA256: 6cf2473f6b746e7308024aed7a074c589c178a7471edba04d5e7606ea69dfc6f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.05 MB
assets/generated/text_to_image/flower.png ADDED
Git LFS Details
  • SHA256: d38db856f827e76352c64e746e64df93e5eeba46c102199eb067f302533a73fc
  • Pointer size: 131 Bytes
  • Size of remote file: 679 kB
assets/generated/text_to_image/sunset.png ADDED
Git LFS Details
  • SHA256: 6f18894f703078b4e80d9a79f60c12f070ab1912804d900e68680b5b8d26c86a
  • Pointer size: 131 Bytes
  • Size of remote file: 687 kB
assets/image/room.png ADDED
Git LFS Details
  • SHA256: 4bc790c3526b24a5aad9e502f351343599192c41320ed24530128dc40359d630
  • Pointer size: 131 Bytes
  • Size of remote file: 190 kB
requirements.txt CHANGED
@@ -2,7 +2,7 @@ diffusers
  torch==1.13
  torchvision==0.14.0
  torchaudio==0.13.0
- pytorchvideo @ git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d
+ pytorchvideo
  timm==0.6.7
  ftfy
  regex
@@ -10,4 +10,7 @@ einops
  fvcore
  decord==0.6.0
  soundfile
- transformers
+ transformers
+ gradio
+ fire
+ accelerate
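
Note: the pytorchvideo pin to a specific git commit is relaxed to the plain package, and gradio, fire, and accelerate are added to support the new web demo, the Fire-based CLI, and (likely) the fp16 pipeline loading, respectively.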