import gradio as gr
import json
import torch
import wavio
import spaces
from tqdm import tqdm
from huggingface_hub import snapshot_download
from models import AudioDiffusion, DDPMScheduler
from audioldm.audio.stft import TacotronSTFT
from audioldm.variational_autoencoder import AutoencoderKL

# Automatic device detection
if torch.cuda.is_available():
    device_type = "cuda"
    device_selection = "cuda:0"
else:
    device_type = "cpu"
    device_selection = "cpu"


class Tango:
    def __init__(self, name = "declare-lab/tango2", device = device_selection):
        path = snapshot_download(repo_id = name)

        vae_config = json.load(open("{}/vae_config.json".format(path)))
        stft_config = json.load(open("{}/stft_config.json".format(path)))
        main_config = json.load(open("{}/main_config.json".format(path)))

        self.vae = AutoencoderKL(**vae_config).to(device)
        self.stft = TacotronSTFT(**stft_config).to(device)
        self.model = AudioDiffusion(**main_config).to(device)

        vae_weights = torch.load("{}/pytorch_model_vae.bin".format(path), map_location = device)
        stft_weights = torch.load("{}/pytorch_model_stft.bin".format(path), map_location = device)
        main_weights = torch.load("{}/pytorch_model_main.bin".format(path), map_location = device)

        self.vae.load_state_dict(vae_weights)
        self.stft.load_state_dict(stft_weights)
        self.model.load_state_dict(main_weights)
        print("Successfully loaded checkpoint from:", name)

        self.vae.eval()
        self.stft.eval()
        self.model.eval()

        self.scheduler = DDPMScheduler.from_pretrained(main_config["scheduler_name"], subfolder = "scheduler")

    def chunks(self, lst, n):
        """ Yield successive n-sized chunks from a list. """
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

    def generate(self, prompt, steps = 100, guidance = 3, samples = 1, disable_progress = True):
        """ Generate audio for a single prompt string. """
        with torch.no_grad():
            latents = self.model.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress = disable_progress)
            mel = self.vae.decode_first_stage(latents)
            wave = self.vae.decode_to_waveform(mel)
        return wave[0]

    def generate_for_batch(self, prompts, steps = 200, guidance = 3, samples = 1, batch_size = 8, disable_progress = True):
        """ Generate audio for a list of prompt strings. """
        outputs = []
        for k in tqdm(range(0, len(prompts), batch_size)):
            batch = prompts[k: k + batch_size]
            with torch.no_grad():
                latents = self.model.inference(batch, self.scheduler, steps, guidance, samples, disable_progress = disable_progress)
                mel = self.vae.decode_first_stage(latents)
                wave = self.vae.decode_to_waveform(mel)
                outputs += [item for item in wave]
        if samples == 1:
            return outputs
        return list(self.chunks(outputs, samples))


# Initialize TANGO on CPU, then move the submodules to the detected device
tango = Tango(device = "cpu")
tango.vae.to(device_type)
tango.stft.to(device_type)
tango.model.to(device_type)


@spaces.GPU(duration = 60)
def gradio_generate(prompt, steps, guidance):
    output_wave = tango.generate(prompt, steps, guidance)
    # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
    output_filename = "temp.wav"
    wavio.write(output_filename, output_wave, rate = 16000, sampwidth = 2)
    return output_filename


description_text = """

Text-to-Audio
Generates a downloadable audio file from a text prompt, for free, without an account and without a watermark.

🚀 Powered by the Tango 2 model.

🐌 Generation is slow; make sure your computer does not enter standby mode.
You can duplicate this Space on a free account; it runs on CPU.

⚖️ You may use, modify, and share the generated sounds, but not for commercial purposes.
"""

# Gradio input and output components
input_text = gr.Textbox(label = "Prompt", value = "Snort of a horse", lines = 2, autofocus = True)
output_audio = gr.Audio(label = "Generated Audio", type = "filepath")
denoising_steps = gr.Slider(label = "Steps", minimum = 100, maximum = 200, value = 100, step = 1, interactive = True)
guidance_scale = gr.Slider(label = "Guidance Scale", minimum = 1, maximum = 10, value = 3, step = 0.1, interactive = True)

# Gradio interface
gr_interface = gr.Interface(
    fn = gradio_generate,
    inputs = [input_text, denoising_steps, guidance_scale],
    outputs = [output_audio],
    title = "",
    description = description_text,
    allow_flagging = "never",
    examples = [
        ["Quiet speech and then an airplane flying away"],
        ["A bicycle pedaling on dirt and gravel followed by a man speaking then laughing"],
        ["Ducks quack and water splashes with some animal screeching in the background"],
        ["Describe the sound of the ocean"],
        ["A woman and a baby are having a conversation"],
        ["A man speaks followed by a popping noise and laughter"],
        ["A cup is filled from a faucet"],
        ["An audience cheering and clapping"],
        ["Rolling thunder with lightning strikes"],
        ["A dog barking and a cat mewing and a racing car passes by"],
        ["Gentle water stream, birds chirping and sudden gunshot"],
        ["A man talking followed by a goat baaing then a metal gate sliding shut as ducks quack and wind blows into a microphone."],
        ["A dog barking"],
        ["A cat meowing"],
        ["Wooden table tapping sound while water pouring"],
        ["Applause from a crowd with distant clicking and a man speaking over a loudspeaker"],
        ["Two gunshots followed by birds flying away while chirping"],
        ["Whistling with birds chirping"],
        ["A person snoring"],
        ["Motor vehicles are driving with loud engines and a person whistles"],
        ["People cheering in a stadium while thunder and lightning strikes"],
        ["A helicopter is in flight"],
        ["A dog barking and a man talking and a racing car passes by"],
    ],
    cache_examples = "lazy",  # Cache example outputs on first request.
)

# Launch Gradio app
gr_interface.queue(max_size = 10).launch()
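# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original Space): the Tango wrapper
# defined above can also be driven headlessly, e.g. from a batch script.
# The prompts and output filenames below are illustrative assumptions only.
#
#   tango = Tango(device = "cpu")
#   wave = tango.generate("Thunder rumbling in the distance", steps = 100, guidance = 3)
#   wavio.write("thunder.wav", wave, rate = 16000, sampwidth = 2)
#
#   waves = tango.generate_for_batch(["A dog barking", "A cat meowing"], steps = 100)
#   for i, w in enumerate(waves):
#       wavio.write("sample_{}.wav".format(i), w, rate = 16000, sampwidth = 2)
# ---------------------------------------------------------------------------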