Anything2Image / app.py
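# Combines ImageBind embeddings from optional audio and/or image inputs with a
# text prompt, then feeds them to Stable unCLIP (img2img) to generate an image.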
import gradio as gr
import imagebind
import soundfile as sf
import torch
from diffusers import StableUnCLIPImg2ImgPipeline
from PIL import Image

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Stable unCLIP pipeline that conditions generation on CLIP image embeddings.
pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-unclip",
)
pipe = pipe.to(device)

# ImageBind encoder used to embed the audio / image inputs.
model = imagebind.imagebind_huge(pretrained=True)
model.eval()
model.to(device)
@torch.no_grad()
def anything2img(prompt, audio, image):
    # Embeddings stay None when the corresponding input is not provided.
    audio_embeddings = None
    image_embeddings = None

    if audio is not None:
        # Gradio supplies audio as (sample_rate, waveform); ImageBind loads from a file path.
        sr, waveform = audio
        sf.write('tmp.wav', waveform, sr)
        embeddings = model.forward({
            imagebind.ModalityType.AUDIO: imagebind.load_and_transform_audio_data(['tmp.wav'], device),
        })
        audio_embeddings = embeddings[imagebind.ModalityType.AUDIO]

    if image is not None:
        Image.fromarray(image).save('tmp.png')
        embeddings = model.forward({
            imagebind.ModalityType.VISION: imagebind.load_and_transform_vision_data(['tmp.png'], device),
        })
        image_embeddings = embeddings[imagebind.ModalityType.VISION]

    # Combine whichever conditioning embeddings are available.
    if audio_embeddings is not None and image_embeddings is not None:
        embeddings = audio_embeddings + image_embeddings
    elif image_embeddings is not None:
        embeddings = image_embeddings
    elif audio_embeddings is not None:
        embeddings = audio_embeddings
    else:
        embeddings = None

    # Generate with Stable unCLIP, conditioned on the prompt and the combined embeddings (if any).
    images = pipe(prompt=prompt, image_embeds=embeddings).images
    return images[0]
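
# Gradio UI: a text prompt plus optional audio and image inputs; returns the generated image.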
demo = gr.Interface(fn=anything2img, inputs=["text", "audio", "image"], outputs="image")
# demo.launch(server_name='0.0.0.0', server_port=10051, share=True)
demo.launch(server_name='0.0.0.0', server_port=10047, share=True)