laizeqiang committed
Commit 38e694c • 1 Parent(s): c160de1
{imagebind/models → anything2image}/__init__.py RENAMED
File without changes
anything2image/api.py ADDED
@@ -0,0 +1,59 @@
+ import soundfile as sf
+ import torch
+ import numpy as np
+ from diffusers import StableUnCLIPImg2ImgPipeline
+ from PIL import Image
+
+ from . import imagebind
+
+
+ class Anything2Image:
+     def __init__(
+         self,
+         device="cuda:0" if torch.cuda.is_available() else "cpu",
+         imagebind_download_dir="checkpoints",
+     ):
+         self.pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
+             "stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch.float16
+         ).to(device)
+         self.model = imagebind.imagebind_huge(pretrained=True, download_dir=imagebind_download_dir).eval().to(device)
+         self.device = device
+
+     @torch.no_grad()
+     def __call__(self, prompt=None, audio=None, image=None, text=None):
+         device, model, pipe = self.device, self.model, self.pipe
+
+         if audio is not None:
+             sr, waveform = audio
+             sf.write('tmp.wav', waveform, sr)
+             embeddings = model.forward({
+                 imagebind.ModalityType.AUDIO: imagebind.load_and_transform_audio_data(['tmp.wav'], device),
+             })
+             audio_embeddings = embeddings[imagebind.ModalityType.AUDIO]
+         if image is not None:
+             Image.fromarray(image).save('tmp.png')
+             embeddings = model.forward({
+                 imagebind.ModalityType.VISION: imagebind.load_and_transform_vision_data(['tmp.png'], device),
+             }, normalize=False)
+             image_embeddings = embeddings[imagebind.ModalityType.VISION]
+
+         if audio is not None and image is not None:
+             embeddings = (audio_embeddings + image_embeddings) / 2
+         elif image is not None:
+             embeddings = image_embeddings
+         elif audio is not None:
+             embeddings = audio_embeddings
+         else:
+             embeddings = None
+
+         if text is not None and text != "":
+             embeddings = self.model.forward({
+                 imagebind.ModalityType.TEXT: imagebind.load_and_transform_text([text], device),
+             }, normalize=False)
+             embeddings = embeddings[imagebind.ModalityType.TEXT]
+
+         if embeddings is not None:
+             embeddings = embeddings.half()
+
+         images = pipe(prompt=prompt, image_embeds=embeddings).images
+         return images[0]
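
Usage note: a minimal sketch of calling the new API directly (it mirrors anything2image/cli.py below; the asset path and output filename here are illustrative):

    import soundfile as sf
    from anything2image.api import Anything2Image

    anything2img = Anything2Image(imagebind_download_dir='checkpoints')
    data, samplerate = sf.read('assets/wav/cat.wav')  # illustrative path from the repo's assets
    image = anything2img(prompt='A painting', audio=(samplerate, data))  # returns a PIL.Image
    image.save('audio_to_image.png')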
anything2image/app.py ADDED
@@ -0,0 +1,34 @@
+ import gradio as gr
+ import fire
+ import os
+ from anything2image.api import Anything2Image
+
+
+ def main(ckpt_dir=os.path.join(os.path.expanduser('~'), 'anything2image', 'checkpoints'), ip='0.0.0.0', port=10049, share=False):
+     anything2img = Anything2Image(imagebind_download_dir=ckpt_dir)
+
+     with gr.Blocks() as demo:
+         gr.HTML(
+             """
+             <div align='center'> <h1>Anything To Image</h1> </div>
+             <p align="center"> Generate an image from anything, using ImageBind's unified latent space and stable-diffusion-2-1-unclip. </p>
+             <p align="center"><a href="https://github.com/Zeqiang-Lai/Anything2Image"><b>https://github.com/Zeqiang-Lai/Anything2Image</b></a></p>
+             """)
+         gr.Interface(fn=anything2img,
+                      inputs=[gr.Text(placeholder="Enter a prompt in addition to the audio, image, or text condition below", label="Prompt (can be empty)"),
+                              "audio",
+                              "image",
+                              "text"
+                              ],
+                      outputs="image",
+                      examples=[['', 'assets/wav/dog_audio.wav', None, None],
+                                ['A painting', 'assets/wav/cat.wav', None, None],
+                                ['', 'assets/wav/wave.wav', 'assets/image/bird.png', None],
+                                ['', None, 'assets/image/bird_image.jpg', None],
+                                ['', None, None, 'A sunset over the ocean.'],
+                                ],
+                      cache_examples=True,
+                      )
+     demo.queue(1).launch(server_name=ip, server_port=port, share=share)
+
+ fire.Fire(main)
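
Usage note: fire.Fire(main) turns main's keyword arguments into command-line flags, so the demo can be launched with an invocation along these lines (flags shown are illustrative, mirroring main()'s signature):

    python -m anything2image.app --ip 0.0.0.0 --port 10049 --share True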
anything2image/cli.py ADDED
@@ -0,0 +1,21 @@
+ import os
+ import fire
+ from anything2image.api import Anything2Image
+ import soundfile as sf
+ from PIL import Image
+ import numpy as np
+
+ def main(
+     prompt='', audio=None, image=None, text=None,
+     ckpt_dir=os.path.join(os.path.expanduser('~'), 'anything2image', 'checkpoints')
+ ):
+     anything2img = Anything2Image(imagebind_download_dir=ckpt_dir)
+     if audio is not None:
+         data, samplerate = sf.read(audio)
+         audio = (samplerate, data)
+     if image is not None:
+         image = np.array(Image.open(image))
+     image = anything2img(prompt=prompt, audio=audio, image=image, text=text)
+     image.save('cli_output.png')
+
+ fire.Fire(main)
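
Usage note: an illustrative invocation of this CLI (the asset path comes from this repo; the result is written to cli_output.png in the working directory):

    python -m anything2image.cli --prompt "A painting" --audio assets/wav/cat.wav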
{imagebind → anything2image/imagebind}/__init__.py RENAMED
File without changes
{imagebind → anything2image/imagebind}/bpe/bpe_simple_vocab_16e6.txt.gz RENAMED
File without changes
{imagebind → anything2image/imagebind}/data.py RENAMED
@@ -7,6 +7,7 @@
 
  import logging
  import math
+ import os
 
  import torch
  import torch.nn as nn
@@ -22,7 +23,8 @@ from .models.multimodal_preprocessors import SimpleTokenizer
 
  DEFAULT_AUDIO_FRAME_SHIFT_MS = 10 # in milliseconds
 
- BPE_PATH = "bpe/bpe_simple_vocab_16e6.txt.gz"
+ CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+ BPE_PATH = os.path.join(CURRENT_DIR, "bpe/bpe_simple_vocab_16e6.txt.gz")
 
 
  def waveform2melspec(waveform, sample_rate, num_mel_bins, target_length):
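
Note: anchoring BPE_PATH to the module's own directory means the tokenizer vocabulary resolves correctly no matter where the process is launched from, which the new package-style imports rely on:

    # BPE_PATH now resolves relative to the installed package, not the current working directory:
    # <...>/anything2image/imagebind/bpe/bpe_simple_vocab_16e6.txt.gz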
anything2image/imagebind/models/__init__.py ADDED
File without changes
{imagebind → anything2image/imagebind}/models/helpers.py RENAMED
File without changes
{imagebind → anything2image/imagebind}/models/imagebind_model.py RENAMED
@@ -452,7 +452,7 @@ class ImageBindModel(nn.Module):
 
         return nn.ModuleDict(modality_postprocessors)
 
-    def forward(self, inputs):
+    def forward(self, inputs, normalize=True):
         outputs = {}
         for modality_key, modality_value in inputs.items():
             reduce_list = (
@@ -474,9 +474,10 @@ class ImageBindModel(nn.Module):
             modality_value = self.modality_heads[modality_key](
                 modality_value, **head_inputs
             )
-            modality_value = self.modality_postprocessors[modality_key](
-                modality_value
-            )
+            if normalize:
+                modality_value = self.modality_postprocessors[modality_key](
+                    modality_value
+                )
 
             if reduce_list:
                 modality_value = modality_value.reshape(B, S, -1)
@@ -487,7 +488,7 @@ class ImageBindModel(nn.Module):
         return outputs
 
 
-def imagebind_huge(pretrained=False):
+def imagebind_huge(pretrained=False, download_dir="checkpoints"):
     model = ImageBindModel(
         vision_embed_dim=1280,
         vision_num_blocks=32,
@@ -501,17 +502,20 @@ def imagebind_huge(pretrained=False):
     )
 
     if pretrained:
-        if not os.path.exists("checkpoints/imagebind_huge.pth"):
-            print(
-                "Downloading imagebind weights to .checkpoints/imagebind_huge.pth ..."
-            )
-            os.makedirs("checkpoints", exist_ok=True)
+        path = os.path.join(download_dir, 'imagebind_huge.pth')
+        # If a checkpoint already exists in the current directory, do not download.
+        default_path = os.path.join('checkpoints', 'imagebind_huge.pth')
+        if os.path.exists(default_path):
+            path = default_path
+        if not os.path.exists(path):
+            print(f"Downloading imagebind weights to {path} ...")
+            os.makedirs(download_dir, exist_ok=True)
             torch.hub.download_url_to_file(
                 "https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth",
-                "checkpoints/imagebind_huge.pth",
+                path,
                 progress=True,
             )
 
-    model.load_state_dict(torch.load("checkpoints/imagebind_huge.pth"))
+    model.load_state_dict(torch.load(path))
 
     return model
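
Note on the two changes above: forward() gains a normalize flag so callers can skip the modality postprocessors (the final embedding normalization), which anything2image/api.py uses for its vision and text branches, and imagebind_huge() gains a download_dir so the checkpoint need not live in the working directory. A sketch of the call sites (names as introduced in this diff):

    model = imagebind_huge(pretrained=True, download_dir='checkpoints')
    embeddings = model.forward(inputs, normalize=False)  # raw head outputs, postprocessors skipped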
{imagebind → anything2image/imagebind}/models/multimodal_preprocessors.py RENAMED
File without changes
{imagebind → anything2image/imagebind}/models/transformer.py RENAMED
File without changes
app.py CHANGED
@@ -1,49 +1,30 @@
  import gradio as gr
- import imagebind
- import soundfile as sf
- import torch
- from diffusers import StableUnCLIPImg2ImgPipeline
- from PIL import Image
+ from anything2image.api import Anything2Image
 
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
- pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
-     "stabilityai/stable-diffusion-2-1-unclip",
- )
- pipe = pipe.to(device)
 
- model = imagebind.imagebind_huge(pretrained=True)
- model.eval()
- model.to(device)
+ anything2img = Anything2Image(imagebind_download_dir='checkpoints')
 
- @torch.no_grad()
- def anything2img(prompt, audio, image):
-     if audio is not None:
-         sr, waveform = audio
-         sf.write('tmp.wav', waveform, sr)
-         embeddings = model.forward({
-             imagebind.ModalityType.AUDIO: imagebind.load_and_transform_audio_data(['tmp.wav'], device),
-         })
-         audio_embeddings = embeddings[imagebind.ModalityType.AUDIO]
-     if image is not None:
-         Image.fromarray(image).save('tmp.png')
-         embeddings = model.forward({
-             imagebind.ModalityType.VISION: imagebind.load_and_transform_vision_data(['tmp.png'], device),
-         })
-         image_embeddings = embeddings[imagebind.ModalityType.VISION]
-
-     if audio_embeddings is not None and image_embeddings is not None:
-         embeddings = audio_embeddings + image_embeddings
-     elif image_embeddings is not None:
-         embeddings = image_embeddings
-     elif audio_embeddings is not None:
-         embeddings = audio_embeddings
-     else:
-         embeddings = None
-
-     images = pipe(prompt=prompt, image_embeds=embeddings).images
-     return images[0]
-
-
- demo = gr.Interface(fn=anything2img, inputs=["text", "audio", "image"], outputs="image")
- # demo.launch(server_name='0.0.0.0', server_port=10051, share=True)
- demo.launch(server_name='0.0.0.0', server_port=10047, share=True)
+ with gr.Blocks() as demo:
+     gr.HTML(
+         """
+         <div align='center'> <h1>Anything To Image</h1> </div>
+         <p align="center"> Generate an image from anything, using ImageBind's unified latent space and stable-diffusion-2-1-unclip. </p>
+         <p align="center"><a href="https://github.com/Zeqiang-Lai/Anything2Image"><b>https://github.com/Zeqiang-Lai/Anything2Image</b></a></p>
+         """)
+     gr.Interface(fn=anything2img,
+                  inputs=[gr.Text(placeholder="Enter a prompt in addition to the audio, image, or text condition below", label="Prompt (can be empty)"),
+                          "audio",
+                          "image",
+                          "text"
+                          ],
+                  outputs="image",
+                  examples=[['', 'assets/wav/dog_audio.wav', None, None],
+                            ['A painting', 'assets/wav/cat.wav', None, None],
+                            ['', 'assets/wav/wave.wav', 'assets/image/bird.png', None],
+                            ['', None, 'assets/image/bird_image.jpg', None],
+                            ['', None, None, 'A sunset over the ocean.'],
+                            ],
+                  cache_examples=True,
+                  )
+ demo.queue(1).launch()
assets/generated/audio_image_to_image/bird_rain.png CHANGED
Git LFS Details (before)
  • SHA256: d86e2b91b1a4f3719a41c696845731b9a586aef241e2c8f56cc779c6c4c7ea6d
  • Pointer size: 132 Bytes
  • Size of remote file: 1.08 MB
Git LFS Details (after)
  • SHA256: 09b7ae7d8155b45ecd59e81165f508f35745e0b2e74ef10ba1f94dc9deb5767a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.09 MB
assets/generated/audio_image_to_image/bird_wave.png CHANGED
Git LFS Details (before)
  • SHA256: 0c80e2177421e6922651c4f75a016716ce153dffa479e5d385dd34402878707b
  • Pointer size: 131 Bytes
  • Size of remote file: 748 kB
Git LFS Details (after)
  • SHA256: dc6195dc6d1ddaefb9d628fd7ff84f97cbd578dccbdc8cfadfa0d47e296eea5f
  • Pointer size: 131 Bytes
  • Size of remote file: 693 kB
assets/generated/image_to_image/bird_image.png ADDED
Git LFS Details
  • SHA256: 7fce7238cd230431128e61f37dfc057818f6ae3cea9e7b8f7a6bee24b10516d4
  • Pointer size: 131 Bytes
  • Size of remote file: 833 kB
assets/generated/image_to_image/car_image.png CHANGED
Git LFS Details (before)
  • SHA256: e0acc986045fd06a3bc7561207bb1d8fd6164c6e7ef0b74986aa4f31c7847b4f
  • Pointer size: 131 Bytes
  • Size of remote file: 986 kB
Git LFS Details (after)
  • SHA256: 031b35a8fc906405aacc46d96814b1309fa99f90bbccd607ebb64e9c08784cc9
  • Pointer size: 131 Bytes
  • Size of remote file: 812 kB
assets/generated/image_to_image/dog_image.png ADDED
Git LFS Details
  • SHA256: 89fea42f3f0c7c22e7b6cb8b99361b9f358d1e4f3886b65f42c55de525d09223
  • Pointer size: 131 Bytes
  • Size of remote file: 835 kB
assets/generated/image_to_image/room.png ADDED
Git LFS Details
  • SHA256: 127568497fe93a2b5fa765e66a3a2ed4c0b7716bf3d09745721ab3204a83ed57
  • Pointer size: 131 Bytes
  • Size of remote file: 662 kB
assets/generated/text_to_image/car.png ADDED
Git LFS Details
  • SHA256: 29a9de7e2326de9984ce861964188c3f0dca29466fd0602f173563cafa894967
  • Pointer size: 131 Bytes
  • Size of remote file: 937 kB
assets/generated/text_to_image/city.png ADDED
Git LFS Details
  • SHA256: 6cf2473f6b746e7308024aed7a074c589c178a7471edba04d5e7606ea69dfc6f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.05 MB
assets/generated/text_to_image/flower.png ADDED
Git LFS Details
  • SHA256: d38db856f827e76352c64e746e64df93e5eeba46c102199eb067f302533a73fc
  • Pointer size: 131 Bytes
  • Size of remote file: 679 kB
assets/generated/text_to_image/sunset.png ADDED
Git LFS Details
  • SHA256: 6f18894f703078b4e80d9a79f60c12f070ab1912804d900e68680b5b8d26c86a
  • Pointer size: 131 Bytes
  • Size of remote file: 687 kB
assets/image/room.png ADDED
Git LFS Details
  • SHA256: 4bc790c3526b24a5aad9e502f351343599192c41320ed24530128dc40359d630
  • Pointer size: 131 Bytes
  • Size of remote file: 190 kB
requirements.txt CHANGED
@@ -2,7 +2,7 @@ diffusers
  torch==1.13
  torchvision==0.14.0
  torchaudio==0.13.0
- pytorchvideo @ git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d
+ pytorchvideo
  timm==0.6.7
  ftfy
  regex
@@ -10,4 +10,7 @@ einops
  fvcore
  decord==0.6.0
  soundfile
- transformers
+ transformers
+ gradio
+ fire
+ accelerate
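
Note: the pytorchvideo pin to a specific git commit is relaxed to the plain package, and gradio, fire, and accelerate are added to support the new web demo, the Fire-based CLI, and (likely) the fp16 pipeline loading, respectively.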