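# Hugging Face Space app: text-to-audio generation with Tango 2
# (declare-lab/tango2), served through a Gradio interface.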
import gradio as gr
import json
import torch
import wavio
import spaces
from tqdm import tqdm
from huggingface_hub import snapshot_download
from models import AudioDiffusion, DDPMScheduler
from audioldm.audio.stft import TacotronSTFT
from audioldm.variational_autoencoder import AutoencoderKL
# Automatic device detection
if torch.cuda.is_available():
    device_type = "cuda"
    device_selection = "cuda:0"
else:
    device_type = "cpu"
    device_selection = "cpu"
class Tango:
    def __init__(self, name = "declare-lab/tango2", device = device_selection):
        path = snapshot_download(repo_id = name)

        vae_config = json.load(open("{}/vae_config.json".format(path)))
        stft_config = json.load(open("{}/stft_config.json".format(path)))
        main_config = json.load(open("{}/main_config.json".format(path)))

        self.vae = AutoencoderKL(**vae_config).to(device)
        self.stft = TacotronSTFT(**stft_config).to(device)
        self.model = AudioDiffusion(**main_config).to(device)

        vae_weights = torch.load("{}/pytorch_model_vae.bin".format(path), map_location = device)
        stft_weights = torch.load("{}/pytorch_model_stft.bin".format(path), map_location = device)
        main_weights = torch.load("{}/pytorch_model_main.bin".format(path), map_location = device)

        self.vae.load_state_dict(vae_weights)
        self.stft.load_state_dict(stft_weights)
        self.model.load_state_dict(main_weights)
        print("Successfully loaded checkpoint from:", name)

        self.vae.eval()
        self.stft.eval()
        self.model.eval()

        self.scheduler = DDPMScheduler.from_pretrained(main_config["scheduler_name"], subfolder = "scheduler")
    def chunks(self, lst, n):
        """ Yield successive n-sized chunks from a list. """
        for i in range(0, len(lst), n):
            yield lst[i:i + n]
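    # For example, chunks([a, b, c, d, e], 2) yields [a, b], [c, d], [e];
    # generate_for_batch below uses this to regroup outputs per prompt.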
    def generate(self, prompt, steps = 100, guidance = 3, samples = 1, disable_progress = True):
        """ Generate audio for a single prompt string. """
        with torch.no_grad():
            latents = self.model.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress = disable_progress)
            mel = self.vae.decode_first_stage(latents)
            wave = self.vae.decode_to_waveform(mel)
        return wave[0]
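    # Usage sketch (comments only, not executed; assumes a Tango instance
    # like the one created below):
    #   wave = tango.generate("A dog barking", steps = 100, guidance = 3)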
    def generate_for_batch(self, prompts, steps = 200, guidance = 3, samples = 1, batch_size = 8, disable_progress = True):
        """ Generate audio for a list of prompt strings. """
        outputs = []
        for k in tqdm(range(0, len(prompts), batch_size)):
            batch = prompts[k: k + batch_size]
            with torch.no_grad():
                latents = self.model.inference(batch, self.scheduler, steps, guidance, samples, disable_progress = disable_progress)
                mel = self.vae.decode_first_stage(latents)
                wave = self.vae.decode_to_waveform(mel)
                outputs += [item for item in wave]
        if samples == 1:
            return outputs
        return list(self.chunks(outputs, samples))
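# Batch usage sketch (comments only, not executed; uses the `tango` instance
# created below):
#   waves = tango.generate_for_batch(["A dog barking", "Rain on a roof"])
#   # With samples > 1, the flat output list is regrouped into one chunk
#   # of `samples` waveforms per prompt.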
# Initialize TANGO on CPU, then move its submodules to the detected device
tango = Tango(device = "cpu")
tango.vae.to(device_type)
tango.stft.to(device_type)
tango.model.to(device_type)
# `spaces.GPU` requests a GPU for this handler on ZeroGPU hardware
# (assumption: this Space targets ZeroGPU, which is why `spaces` is imported;
# on regular hardware the decorator is a no-op).
@spaces.GPU
def gradio_generate(prompt, steps, guidance):
    output_wave = tango.generate(prompt, steps, guidance)
    # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
    output_filename = "temp.wav"
    wavio.write(output_filename, output_wave, rate = 16000, sampwidth = 2)

    return output_filename
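# Sketch of an alternative (an assumption, not part of the original code):
# write each result to a unique temporary file so that concurrent requests
# cannot overwrite each other's "temp.wav".
#
#   import tempfile
#   output_filename = tempfile.NamedTemporaryFile(suffix = ".wav", delete = False).name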
description_text = """
<p style="text-align: center;">
<b><big><big><big>Text-to-Audio</big></big></big></b>
<br/>Generates an audio file that you can download, free of charge, with no account and no watermark.
</p>
<br/>
<br/>
Powered by <i>Tango 2</i> AI.
<br/>
<ul>
<li>If you need to generate <b>music</b>, I recommend using <i>MusicGen</i> instead.</li>
</ul>
<br/>
Slow process... Your computer must <b><u>not</u></b> go into standby mode.<br/>You can duplicate this space on a free account; it works on CPU.<br/>
<a href='https://huggingface.co/spaces/Fabrice-TIERCELIN/Text-to-Audio?duplicate=true'><img src='https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14'></a>
<br/>
You can use, modify and share the generated sounds, but not for commercial use.
"""
# Gradio input and output components
input_text = gr.Textbox(label = "Prompt", value = "Snort of a horse", lines = 2, autofocus = True)
output_audio = gr.Audio(label = "Generated Audio", type = "filepath")
denoising_steps = gr.Slider(label = "Steps", minimum = 100, maximum = 200, value = 100, step = 1, interactive = True)
guidance_scale = gr.Slider(label = "Guidance Scale", minimum = 1, maximum = 10, value = 3, step = 0.1, interactive = True)
# Gradio interface
gr_interface = gr.Interface(
    fn = gradio_generate,
    inputs = [input_text, denoising_steps, guidance_scale],
    outputs = [output_audio],
    title = "",
    description = description_text,
    allow_flagging = "never", # expects "never"/"auto"/"manual", not False
    examples = [
        ["Quiet speech and then an airplane flying away"],
        ["A bicycle pedaling on dirt and gravel followed by a man speaking then laughing"],
        ["Ducks quack and water splashes with some animal screeching in the background"],
        ["Describe the sound of the ocean"],
        ["A woman and a baby are having a conversation"],
        ["A man speaks followed by a popping noise and laughter"],
        ["A cup is filled from a faucet"],
        ["An audience cheering and clapping"],
        ["Rolling thunder with lightning strikes"],
        ["A dog barking and a cat mewing and a racing car passes by"],
        ["Gentle water stream, birds chirping and sudden gun shot"],
        ["A man talking followed by a goat baaing then a metal gate sliding shut as ducks quack and wind blows into a microphone."],
        ["A dog barking"],
        ["A cat meowing"],
        ["Wooden table tapping sound while water pouring"],
        ["Applause from a crowd with distant clicking and a man speaking over a loudspeaker"],
        ["Two gunshots followed by birds flying away while chirping"],
        ["Whistling with birds chirping"],
        ["A person snoring"],
        ["Motor vehicles are driving with loud engines and a person whistles"],
        ["People cheering in a stadium while thunder and lightning strikes"],
        ["A helicopter is in flight"],
        ["A dog barking and a man talking and a racing car passes by"],
    ],
    cache_examples = "lazy", # Turn on to cache.
)
# Launch Gradio app
# (assumption: the bare positional 10 was meant as a queue size limit,
# so it is made explicit here)
gr_interface.queue(max_size = 10).launch()