laizeqiang committed on
Commit
c160de1
1 Parent(s): b6d24d0
Files changed (36) hide show
  1. .gitattributes +2 -0
  2. app.py +31 -14
  3. assets/bird_image.jpg +0 -0
  4. assets/car_image.jpg +0 -0
  5. assets/dog_image.jpg +0 -0
  6. assets/generated/audio_image_to_image/bird_rain.png +3 -0
  7. assets/generated/audio_image_to_image/bird_wave.png +3 -0
  8. assets/generated/audio_text_to_image/bird_a_painting.png +3 -0
  9. assets/generated/audio_text_to_image/bird_a_photo.png +3 -0
  10. assets/generated/audio_text_to_image/cat_a_painting.png +3 -0
  11. assets/generated/audio_text_to_image/cat_a_photo.png +3 -0
  12. assets/generated/audio_to_image/bird_audio.png +3 -0
  13. assets/generated/audio_to_image/car_audio.png +3 -0
  14. assets/generated/audio_to_image/cat.png +3 -0
  15. assets/generated/audio_to_image/cattle.png +3 -0
  16. assets/generated/audio_to_image/dog_audio.png +3 -0
  17. assets/generated/audio_to_image/fire_engine.png +3 -0
  18. assets/generated/audio_to_image/goat.png +3 -0
  19. assets/generated/audio_to_image/motorcycle.png +3 -0
  20. assets/generated/audio_to_image/plane.png +3 -0
  21. assets/generated/audio_to_image/train.png +3 -0
  22. assets/generated/bird_audio.png +0 -0
  23. assets/generated/cattle.png +0 -0
  24. assets/generated/dog_audio.png +0 -0
  25. assets/generated/goat.png +0 -0
  26. assets/generated/image_to_image/car_image.png +3 -0
  27. assets/image/bird.png +3 -0
  28. assets/image/bird_image.jpg +3 -0
  29. assets/image/car_image.jpg +3 -0
  30. assets/image/dog_image.jpg +3 -0
  31. assets/wav/cat.wav +3 -0
  32. assets/wav/fire_engine.wav +3 -0
  33. assets/wav/motorcycle.wav +3 -0
  34. assets/wav/plane.wav +3 -0
  35. assets/wav/train.wav +3 -0
  36. assets/wav/wave.wav +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  *.wav filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  *.wav filter=lfs diff=lfs merge=lfs -text
36
+ *.png filter=lfs diff=lfs merge=lfs -text
37
+ *.jpg filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,12 +1,13 @@
1
  import gradio as gr
2
  import imagebind
 
3
  import torch
4
  from diffusers import StableUnCLIPImg2ImgPipeline
5
- import soundfile as sf
6
 
7
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
8
  pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
9
- "stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch.float16, variation="fp16"
10
  )
11
  pipe = pipe.to(device)
12
 
@@ -15,18 +16,34 @@ model.eval()
15
  model.to(device)
16
 
17
  @torch.no_grad()
18
- def anything2img(prompt, audio):
19
- sr, waveform = audio
20
- audio_path = 'tmp.wav'
21
- sf.write(audio_path, waveform, sr)
22
- audio_paths=[audio_path]
23
- embeddings = model.forward({
24
- imagebind.ModalityType.AUDIO: imagebind.load_and_transform_audio_data(audio_paths, device),
25
- })
26
- embeddings = embeddings[imagebind.ModalityType.AUDIO]
27
- images = pipe(prompt=prompt, image_embeds=embeddings.half()).images
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  return images[0]
29
 
30
 
31
- demo = gr.Interface(fn=anything2img, inputs=["text", "audio"], outputs="image")
32
- demo.launch()
 
 
1
  import gradio as gr
2
  import imagebind
3
+ import soundfile as sf
4
  import torch
5
  from diffusers import StableUnCLIPImg2ImgPipeline
6
+ from PIL import Image
7
 
8
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
9
  pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
10
+ "stabilityai/stable-diffusion-2-1-unclip",
11
  )
12
  pipe = pipe.to(device)
13
 
 
16
  model.to(device)
17
 
18
  @torch.no_grad()
19
+ def anything2img(prompt, audio, image):
20
+ if audio is not None:
21
+ sr, waveform = audio
22
+ sf.write('tmp.wav', waveform, sr)
23
+ embeddings = model.forward({
24
+ imagebind.ModalityType.AUDIO: imagebind.load_and_transform_audio_data(['tmp.wav'], device),
25
+ })
26
+ audio_embeddings = embeddings[imagebind.ModalityType.AUDIO]
27
+ if image is not None:
28
+ Image.fromarray(image).save('tmp.png')
29
+ embeddings = model.forward({
30
+ imagebind.ModalityType.VISION: imagebind.load_and_transform_vision_data(['tmp.png'], device),
31
+ })
32
+ image_embeddings = embeddings[imagebind.ModalityType.VISION]
33
+
34
+ if audio_embeddings is not None and image_embeddings is not None:
35
+ embeddings = audio_embeddings + image_embeddings
36
+ elif image_embeddings is not None:
37
+ embeddings = image_embeddings
38
+ elif audio_embeddings is not None:
39
+ embeddings = audio_embeddings
40
+ else:
41
+ embeddings = None
42
+
43
+ images = pipe(prompt=prompt, image_embeds=embeddings).images
44
  return images[0]
45
 
46
 
47
+ demo = gr.Interface(fn=anything2img, inputs=["text", "audio", "image"], outputs="image")
48
+ # demo.launch(server_name='0.0.0.0', server_port=10051, share=True)
49
+ demo.launch(server_name='0.0.0.0', server_port=10047, share=True)
assets/bird_image.jpg DELETED
Binary file (115 kB)
 
assets/car_image.jpg DELETED
Binary file (59.3 kB)
 
assets/dog_image.jpg DELETED
Binary file (86.1 kB)
 
assets/generated/audio_image_to_image/bird_rain.png ADDED

Git LFS Details

  • SHA256: d86e2b91b1a4f3719a41c696845731b9a586aef241e2c8f56cc779c6c4c7ea6d
  • Pointer size: 132 Bytes
  • Size of remote file: 1.08 MB
assets/generated/audio_image_to_image/bird_wave.png ADDED

Git LFS Details

  • SHA256: 0c80e2177421e6922651c4f75a016716ce153dffa479e5d385dd34402878707b
  • Pointer size: 131 Bytes
  • Size of remote file: 748 kB
assets/generated/audio_text_to_image/bird_a_painting.png ADDED

Git LFS Details

  • SHA256: 1e045df59a428a18b2a7b1e8fea3787b31ff9746138105e1353aa5b64fa0cdd8
  • Pointer size: 131 Bytes
  • Size of remote file: 747 kB
assets/generated/audio_text_to_image/bird_a_photo.png ADDED

Git LFS Details

  • SHA256: 78c036f81c2a080eddd9c420f3fdd8c74725b1b8d2dfca99e0d2140d8e1f5a92
  • Pointer size: 131 Bytes
  • Size of remote file: 848 kB
assets/generated/audio_text_to_image/cat_a_painting.png ADDED

Git LFS Details

  • SHA256: 1beca95d1c1255d18362766e434f1dac16d1ea9dd9026e110cf2a520ba29b67a
  • Pointer size: 131 Bytes
  • Size of remote file: 963 kB
assets/generated/audio_text_to_image/cat_a_photo.png ADDED

Git LFS Details

  • SHA256: 334b7aa8095bf0f71c4b2e235ba735ccb1dbe5f5cbb302d3b5930cc1775c3468
  • Pointer size: 132 Bytes
  • Size of remote file: 1.04 MB
assets/generated/audio_to_image/bird_audio.png ADDED

Git LFS Details

  • SHA256: 93d656e8a0ac00672616e852cd1cb5f3c68a0be6791ac909b7b2e6df766944d6
  • Pointer size: 131 Bytes
  • Size of remote file: 760 kB
assets/generated/audio_to_image/car_audio.png ADDED

Git LFS Details

  • SHA256: dc09121ca3f985e2e8cf560149552caaaa3009a214a1762e5e97f1859891b1d1
  • Pointer size: 131 Bytes
  • Size of remote file: 853 kB
assets/generated/audio_to_image/cat.png ADDED

Git LFS Details

  • SHA256: 3a18cc97276cf6d684d8f6a3760dfd912503fd034490973b155de9171e828af3
  • Pointer size: 132 Bytes
  • Size of remote file: 1.1 MB
assets/generated/audio_to_image/cattle.png ADDED

Git LFS Details

  • SHA256: 133906b7151e79e9a43586554e8a486a610df22d3ed19f34ca6af1f2facb9eb6
  • Pointer size: 131 Bytes
  • Size of remote file: 760 kB
assets/generated/audio_to_image/dog_audio.png ADDED

Git LFS Details

  • SHA256: 85e135a8bfec3837ce542d9d651ffd0e3c15fdd2c4d337c98c4cb564cf943601
  • Pointer size: 131 Bytes
  • Size of remote file: 819 kB
assets/generated/audio_to_image/fire_engine.png ADDED

Git LFS Details

  • SHA256: 7d7be8ccad183ab43007bfb295dd3c2d9624f618d4203b0258d176efd5d8ab85
  • Pointer size: 131 Bytes
  • Size of remote file: 615 kB
assets/generated/audio_to_image/goat.png ADDED

Git LFS Details

  • SHA256: f5c4585b5fdd1b619bf08e04de815b106cd6d542c703d45be9b490a51b60421b
  • Pointer size: 131 Bytes
  • Size of remote file: 874 kB
assets/generated/audio_to_image/motorcycle.png ADDED

Git LFS Details

  • SHA256: 719415a1299fa8b51fa1516c4c91ba8de4ea2de2886d297182a6a6833492daf5
  • Pointer size: 131 Bytes
  • Size of remote file: 644 kB
assets/generated/audio_to_image/plane.png ADDED

Git LFS Details

  • SHA256: f5172cf1bd5dab7d5bca91a6a527c64dbcf1b9be99f09326ef944efcc4996f27
  • Pointer size: 131 Bytes
  • Size of remote file: 884 kB
assets/generated/audio_to_image/train.png ADDED

Git LFS Details

  • SHA256: d6874e3a394137fbf3275e0cf5018b15c00276af2a5f1743a336397a577f1d7a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.01 MB
assets/generated/bird_audio.png DELETED
Binary file (760 kB)
 
assets/generated/cattle.png DELETED
Binary file (760 kB)
 
assets/generated/dog_audio.png DELETED
Binary file (819 kB)
 
assets/generated/goat.png DELETED
Binary file (874 kB)
 
assets/generated/image_to_image/car_image.png ADDED

Git LFS Details

  • SHA256: e0acc986045fd06a3bc7561207bb1d8fd6164c6e7ef0b74986aa4f31c7847b4f
  • Pointer size: 131 Bytes
  • Size of remote file: 986 kB
assets/image/bird.png ADDED

Git LFS Details

  • SHA256: 5505ad315c1b95a3b3a09e8f84b7e8db6ab754838aa286a6fe9ea8d0644ba2ea
  • Pointer size: 132 Bytes
  • Size of remote file: 1.28 MB
assets/image/bird_image.jpg ADDED

Git LFS Details

  • SHA256: 1d9d891f7785a0a7b85556f5c8e7bedd82edce78b46b7f55a569f4b2f3e9e5d1
  • Pointer size: 131 Bytes
  • Size of remote file: 115 kB
assets/image/car_image.jpg ADDED

Git LFS Details

  • SHA256: 45d920dfe748a66d513796600564f691240c2b00c8da4316c45d29e11ce067b2
  • Pointer size: 130 Bytes
  • Size of remote file: 59.3 kB
assets/image/dog_image.jpg ADDED

Git LFS Details

  • SHA256: c11d7d454d9db85a6ebd765ddb8eff30c6d0088c3cd14e7fdc74548a474d36d1
  • Pointer size: 130 Bytes
  • Size of remote file: 86.1 kB
assets/wav/cat.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa3d926fa712eb851c4e27d81dff0804cafc72e4f1bd716800478b2a4b40a02d
3
+ size 640044
assets/wav/fire_engine.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29165227485a6d457146dbee150610cc92a099af01c4a228172b328bcdcef3f0
3
+ size 882078
assets/wav/motorcycle.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45cd2e6a875984813afe9097be6dae8c911e7616698ae50cdd3cdb1ab6900b17
3
+ size 457918
assets/wav/plane.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e816bca22a06dc50e75d1cab715ca99ab3334a4097e5635c955eaee0c0b2ccf
3
+ size 1058478
assets/wav/train.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54eaa334a44b11f8d8888cdbf7cfc0340b70a87fb57c97c09adb454f26de7f60
3
+ size 483959
assets/wav/wave.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ce431cb79b82390adf8640c0f68536aae71cdf4b48da412516c4fee0428831a
3
+ size 1411278