laizeqiang committed
Commit • 38e694c
1 Parent(s): c160de1
update
- {imagebind/models → anything2image}/__init__.py +0 -0
- anything2image/api.py +59 -0
- anything2image/app.py +34 -0
- anything2image/cli.py +21 -0
- {imagebind → anything2image/imagebind}/__init__.py +0 -0
- {imagebind → anything2image/imagebind}/bpe/bpe_simple_vocab_16e6.txt.gz +0 -0
- {imagebind → anything2image/imagebind}/data.py +3 -1
- anything2image/imagebind/models/__init__.py +0 -0
- {imagebind → anything2image/imagebind}/models/helpers.py +0 -0
- {imagebind → anything2image/imagebind}/models/imagebind_model.py +16 -12
- {imagebind → anything2image/imagebind}/models/multimodal_preprocessors.py +0 -0
- {imagebind → anything2image/imagebind}/models/transformer.py +0 -0
- app.py +25 -44
- assets/generated/audio_image_to_image/bird_rain.png +2 -2
- assets/generated/audio_image_to_image/bird_wave.png +2 -2
- assets/generated/image_to_image/bird_image.png +3 -0
- assets/generated/image_to_image/car_image.png +2 -2
- assets/generated/image_to_image/dog_image.png +3 -0
- assets/generated/image_to_image/room.png +3 -0
- assets/generated/text_to_image/car.png +3 -0
- assets/generated/text_to_image/city.png +3 -0
- assets/generated/text_to_image/flower.png +3 -0
- assets/generated/text_to_image/sunset.png +3 -0
- assets/image/room.png +3 -0
- requirements.txt +5 -2
{imagebind/models → anything2image}/__init__.py
RENAMED
File without changes
anything2image/api.py
ADDED
@@ -0,0 +1,59 @@
+import soundfile as sf
+import torch
+import numpy as np
+from diffusers import StableUnCLIPImg2ImgPipeline
+from PIL import Image
+
+from . import imagebind
+
+
+class Anything2Image:
+    def __init__(
+        self,
+        device = "cuda:0" if torch.cuda.is_available() else "cpu",
+        imagebind_download_dir="checkpoints"
+    ):
+        self.pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch.float16
+        ).to(device)
+        self.model = imagebind.imagebind_huge(pretrained=True, download_dir=imagebind_download_dir).eval().to(device)
+        self.device = device
+
+    @torch.no_grad()
+    def __call__(self, prompt=None, audio=None, image=None, text=None):
+        device, model, pipe = self.device, self.model, self.pipe
+
+        if audio is not None:
+            sr, waveform = audio
+            sf.write('tmp.wav', waveform, sr)
+            embeddings = model.forward({
+                imagebind.ModalityType.AUDIO: imagebind.load_and_transform_audio_data(['tmp.wav'], device),
+            })
+            audio_embeddings = embeddings[imagebind.ModalityType.AUDIO]
+        if image is not None:
+            Image.fromarray(image).save('tmp.png')
+            embeddings = model.forward({
+                imagebind.ModalityType.VISION: imagebind.load_and_transform_vision_data(['tmp.png'], device),
+            }, normalize=False)
+            image_embeddings = embeddings[imagebind.ModalityType.VISION]
+
+        if audio is not None and image is not None:
+            embeddings = (audio_embeddings + image_embeddings) / 2
+        elif image is not None:
+            embeddings = image_embeddings
+        elif audio is not None:
+            embeddings = audio_embeddings
+        else:
+            embeddings = None
+
+        if text is not None and text != "":
+            embeddings = self.model.forward({
+                imagebind.ModalityType.TEXT: imagebind.load_and_transform_text([text], device),
+            }, normalize=False)
+            embeddings = embeddings[imagebind.ModalityType.TEXT]
+
+        if embeddings is not None:
+            embeddings = embeddings.half()
+
+        images = pipe(prompt=prompt, image_embeds=embeddings).images
+        return images[0]
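For reference, a minimal usage sketch of the new Anything2Image API (this snippet is not part of the commit; the audio path is one of the example assets referenced in app.py, and the output filename is illustrative):

# Sketch: generate an image conditioned on an audio clip, mirroring cli.py below.
import soundfile as sf
from anything2image.api import Anything2Image

anything2img = Anything2Image(imagebind_download_dir='checkpoints')
data, samplerate = sf.read('assets/wav/dog_audio.wav')     # audio is passed as a (sample_rate, waveform) tuple
image = anything2img(prompt='', audio=(samplerate, data))  # returns a PIL image
image.save('output.png')                                   # illustrative output path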
anything2image/app.py
ADDED
@@ -0,0 +1,34 @@
+import gradio as gr
+import fire
+import os
+from anything2image.api import Anything2Image
+
+
+def main(ckpt_dir=os.path.join(os.path.expanduser('~'), 'anything2image', 'checkpoints'), ip='0.0.0.0', port=10049, share=False):
+    anything2img = Anything2Image(imagebind_download_dir=ckpt_dir)
+
+    with gr.Blocks() as demo:
+        gr.HTML(
+            """
+            <div align='center'> <h1>Anything To Image </h1> </div>
+            <p align="center"> Generate image from anything with ImageBind's unified latent space and stable-diffusion-2-1-unclip. </p>
+            <p align="center"><a href="https://github.com/Zeqiang-Lai/Anything2Image"><b>https://github.com/Zeqiang-Lai/Anything2Image</b></p>
+            """)
+        gr.Interface(fn=anything2img,
+                     inputs=[gr.Text(placeholder="Enter a prompt in addition to the audio, image, text condition below", label="Prompt (Could be empty)"),
+                             "audio",
+                             "image",
+                             "text"
+                             ],
+                     outputs="image",
+                     examples=[['', 'assets/wav/dog_audio.wav', None, None],
+                               ['A painting', 'assets/wav/cat.wav', None, None],
+                               ['', 'assets/wav/wave.wav', 'assets/image/bird.png', None],
+                               ['', None, 'assets/image/bird_image.jpg', None],
+                               ['', None, None, 'A sunset over the ocean.'],
+                               ],
+                     cache_examples=True,
+                     )
+    demo.queue(1).launch(server_name=ip, server_port=port, share=share)
+
+fire.Fire(main)
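Because main is wrapped in fire.Fire, the Gradio demo can also be started from a shell; an illustrative invocation (flag names come from main's keyword arguments, and the values shown are the defaults; the package must be importable):

python -m anything2image.app --ip 0.0.0.0 --port 10049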
anything2image/cli.py
ADDED
@@ -0,0 +1,21 @@
+import os
+import fire
+from anything2image.api import Anything2Image
+import soundfile as sf
+from PIL import Image
+import numpy as np
+
+def main(
+    prompt='', audio=None, image=None, text=None,
+    ckpt_dir=os.path.join(os.path.expanduser('~'), 'anything2image', 'checkpoints')
+):
+    anything2img = Anything2Image(imagebind_download_dir=ckpt_dir)
+    if audio is not None:
+        data, samplerate = sf.read(audio)
+        audio = (samplerate, data)
+    if image is not None:
+        image = np.array(Image.open(image))
+    image = anything2img(prompt=prompt, audio=audio, image=image, text=text)
+    image.save('cli_output.png')
+
+fire.Fire(main)
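The command-line entry point works the same way; an illustrative invocation matching one of the Gradio example rows (the result is written to cli_output.png):

python -m anything2image.cli --prompt 'A painting' --audio assets/wav/cat.wav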
{imagebind → anything2image/imagebind}/__init__.py
RENAMED
File without changes
{imagebind → anything2image/imagebind}/bpe/bpe_simple_vocab_16e6.txt.gz
RENAMED
File without changes
{imagebind → anything2image/imagebind}/data.py
RENAMED
@@ -7,6 +7,7 @@
 
 import logging
 import math
+import os
 
 import torch
 import torch.nn as nn
@@ -22,7 +23,8 @@ from .models.multimodal_preprocessors import SimpleTokenizer
 
 DEFAULT_AUDIO_FRAME_SHIFT_MS = 10  # in milliseconds
 
-BPE_PATH = "bpe/bpe_simple_vocab_16e6.txt.gz"
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+BPE_PATH = os.path.join(CURRENT_DIR, "bpe/bpe_simple_vocab_16e6.txt.gz")
 
 
 def waveform2melspec(waveform, sample_rate, num_mel_bins, target_length):
anything2image/imagebind/models/__init__.py
ADDED
File without changes
{imagebind → anything2image/imagebind}/models/helpers.py
RENAMED
File without changes
{imagebind → anything2image/imagebind}/models/imagebind_model.py
RENAMED
@@ -452,7 +452,7 @@ class ImageBindModel(nn.Module):
 
         return nn.ModuleDict(modality_postprocessors)
 
-    def forward(self, inputs):
+    def forward(self, inputs, normalize=True):
         outputs = {}
         for modality_key, modality_value in inputs.items():
             reduce_list = (
@@ -474,9 +474,10 @@
                 modality_value = self.modality_heads[modality_key](
                     modality_value, **head_inputs
                 )
-                modality_value = self.modality_postprocessors[modality_key](
-                    modality_value
-                )
+                if normalize:
+                    modality_value = self.modality_postprocessors[modality_key](
+                        modality_value
+                    )
 
                 if reduce_list:
                     modality_value = modality_value.reshape(B, S, -1)
@@ -487,7 +488,7 @@
         return outputs
 
 
-def imagebind_huge(pretrained=False):
+def imagebind_huge(pretrained=False, download_dir="checkpoints"):
     model = ImageBindModel(
         vision_embed_dim=1280,
         vision_num_blocks=32,
@@ -501,17 +502,20 @@
     )
 
     if pretrained:
-        if not os.path.exists(".checkpoints/imagebind_huge.pth"):
-            print(
-                "Downloading imagebind weights to .checkpoints/imagebind_huge.pth ..."
-            )
-            os.makedirs(".checkpoints", exist_ok=True)
+        path = os.path.join(download_dir, 'imagebind_huge.pth')
+        # if we have ckpt in current dir, do not download.
+        default_path = os.path.join('checkpoints', 'imagebind_huge.pth')
+        if os.path.exists(default_path):
+            path = default_path
+        if not os.path.exists(path):
+            print(f"Downloading imagebind weights to {path} ...")
+            os.makedirs(download_dir, exist_ok=True)
             torch.hub.download_url_to_file(
                 "https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth",
-                ".checkpoints/imagebind_huge.pth",
+                path,
                 progress=True,
            )
 
-        model.load_state_dict(torch.load(".checkpoints/imagebind_huge.pth"))
+        model.load_state_dict(torch.load(path))
 
     return model
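For context, a minimal sketch of how the two new keyword arguments are used (it relies on the package-level re-exports already exercised by anything2image/api.py; the checkpoint directory and example sentence are illustrative):

from anything2image import imagebind

# Weights are cached under download_dir; an existing checkpoints/imagebind_huge.pth is reused instead of re-downloading.
model = imagebind.imagebind_huge(pretrained=True, download_dir='checkpoints').eval()

# normalize=False skips the per-modality postprocessors, as api.py does for its vision and text branches.
inputs = {imagebind.ModalityType.TEXT: imagebind.load_and_transform_text(['A sunset over the ocean.'], 'cpu')}
text_embeddings = model.forward(inputs, normalize=False)[imagebind.ModalityType.TEXT]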
{imagebind → anything2image/imagebind}/models/multimodal_preprocessors.py
RENAMED
File without changes
{imagebind → anything2image/imagebind}/models/transformer.py
RENAMED
File without changes
app.py
CHANGED
@@ -1,49 +1,30 @@
 import gradio as gr
-import 
-import soundfile as sf
-import torch
-from diffusers import StableUnCLIPImg2ImgPipeline
-from PIL import Image
+from anything2image.api import Anything2Image
 
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-2-1-unclip",
-)
-pipe = pipe.to(device)
 
-
-model.eval()
-model.to(device)
+anything2img = Anything2Image(imagebind_download_dir='checkpoints')
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+with gr.Blocks() as demo:
+    gr.HTML(
+        """
+        <div align='center'> <h1>Anything To Image </h1> </div>
+        <p align="center"> Generate image from anything with ImageBind's unified latent space and stable-diffusion-2-1-unclip. </p>
+        <p align="center"><a href="https://github.com/Zeqiang-Lai/Anything2Image"><b>https://github.com/Zeqiang-Lai/Anything2Image</b></p>
+        """)
+    gr.Interface(fn=anything2img,
+                 inputs=[gr.Text(placeholder="Enter a prompt in addition to the audio, image, text condition below", label="Prompt (Could be empty)"),
+                         "audio",
+                         "image",
+                         "text"
+                         ],
+                 outputs="image",
+                 examples=[['', 'assets/wav/dog_audio.wav', None, None],
+                           ['A painting', 'assets/wav/cat.wav', None, None],
+                           ['', 'assets/wav/wave.wav', 'assets/image/bird.png', None],
+                           ['', None, 'assets/image/bird_image.jpg', None],
+                           ['', None, None, 'A sunset over the ocean.'],
+                           ],
+                 cache_examples=True,
+                 )
+demo.queue(1).launch()
 
-    if audio_embeddings is not None and image_embeddings is not None:
-        embeddings = audio_embeddings + image_embeddings
-    elif image_embeddings is not None:
-        embeddings = image_embeddings
-    elif audio_embeddings is not None:
-        embeddings = audio_embeddings
-    else:
-        embeddings = None
-
-    images = pipe(prompt=prompt, image_embeds=embeddings).images
-    return images[0]
-
-
-demo = gr.Interface(fn=anything2img, inputs=["text", "audio", "image"], outputs="image")
-# demo.launch(server_name='0.0.0.0', server_port=10051, share=True)
-demo.launch(server_name='0.0.0.0', server_port=10047, share=True)
assets/generated/audio_image_to_image/bird_rain.png
CHANGED
assets/generated/audio_image_to_image/bird_wave.png
CHANGED
assets/generated/image_to_image/bird_image.png
ADDED
assets/generated/image_to_image/car_image.png
CHANGED
assets/generated/image_to_image/dog_image.png
ADDED
assets/generated/image_to_image/room.png
ADDED
assets/generated/text_to_image/car.png
ADDED
assets/generated/text_to_image/city.png
ADDED
assets/generated/text_to_image/flower.png
ADDED
assets/generated/text_to_image/sunset.png
ADDED
assets/image/room.png
ADDED
requirements.txt
CHANGED
@@ -2,7 +2,7 @@ diffusers
 torch==1.13
 torchvision==0.14.0
 torchaudio==0.13.0
-pytorchvideo
+pytorchvideo
 timm==0.6.7
 ftfy
 regex
@@ -10,4 +10,7 @@ einops
 fvcore
 decord==0.6.0
 soundfile
-transformers
+transformers
+gradio
+fire
+accelerate