Spaces:
Runtime error
Runtime error
File size: 7,350 Bytes
ffead1e 559b00c ffead1e 559b00c ffead1e 559b00c ffead1e 559b00c ffead1e 559b00c ffead1e 559b00c ffead1e 559b00c ffead1e 559b00c ffead1e 559b00c ffead1e 559b00c ffead1e 559b00c ffead1e 559b00c ffead1e a258601 559b00c a117171 559b00c ffead1e 559b00c ffead1e 0353aff 559b00c ffead1e 559b00c ffead1e 559b00c c0683f9 ffead1e c0683f9 ffead1e c0683f9 ffead1e c0683f9 ffead1e 559b00c ffead1e b76c2d1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 |
import gradio as gr
import json
import torch
import wavio
import spaces
from tqdm import tqdm
from huggingface_hub import snapshot_download
from models import AudioDiffusion, DDPMScheduler
from audioldm.audio.stft import TacotronSTFT
from audioldm.variational_autoencoder import AutoencoderKL
from gradio import Markdown
# Automatic device detection: use the first CUDA device when torch reports
# one, otherwise run everything on the CPU.
device_type, device_selection = (
    ("cuda", "cuda:0") if torch.cuda.is_available() else ("cpu", "cpu")
)
class Tango:
    """Text-to-audio pipeline assembled from a Hugging Face Hub checkpoint.

    The constructor downloads the repository snapshot, builds the VAE, the
    STFT module and the main ``AudioDiffusion`` model from their JSON configs,
    loads their weights and switches all three to eval mode.
    """

    def __init__(self, name="declare-lab/tango2", device=device_selection):
        """Download checkpoint ``name`` and load every sub-module onto ``device``.

        Args:
            name: Hub repository id passed to ``snapshot_download``.
            device: Device the modules are moved to and the weights are
                mapped to when loading.
        """
        path = snapshot_download(repo_id=name)

        vae_config = self._read_json(path, "vae_config.json")
        stft_config = self._read_json(path, "stft_config.json")
        main_config = self._read_json(path, "main_config.json")

        self.vae = AutoencoderKL(**vae_config).to(device)
        self.stft = TacotronSTFT(**stft_config).to(device)
        self.model = AudioDiffusion(**main_config).to(device)

        # NOTE(review): torch.load unpickles the downloaded files, so only
        # point `name` at repositories you trust (consider weights_only=True
        # if the installed torch version supports it).
        vae_weights = torch.load("{}/pytorch_model_vae.bin".format(path), map_location=device)
        stft_weights = torch.load("{}/pytorch_model_stft.bin".format(path), map_location=device)
        main_weights = torch.load("{}/pytorch_model_main.bin".format(path), map_location=device)

        self.vae.load_state_dict(vae_weights)
        self.stft.load_state_dict(stft_weights)
        self.model.load_state_dict(main_weights)
        print("Successfully loaded checkpoint from:", name)

        # Inference only: put every module in eval mode.
        self.vae.eval()
        self.stft.eval()
        self.model.eval()

        self.scheduler = DDPMScheduler.from_pretrained(main_config["scheduler_name"], subfolder="scheduler")

    @staticmethod
    def _read_json(directory, filename):
        """Parse ``directory/filename`` as JSON, closing the file afterwards."""
        with open("{}/{}".format(directory, filename)) as handle:
            return json.load(handle)

    def chunks(self, lst, n):
        """ Yield successive n-sized chunks from a list. """
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

    def generate(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
        """Generate audio for a single prompt string.

        Runs ``self.model.inference`` on ``[prompt]``, decodes the resulting
        latents with the VAE (``decode_first_stage`` followed by
        ``decode_to_waveform``) and returns the first decoded waveform.
        """
        with torch.no_grad():
            latents = self.model.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
            mel = self.vae.decode_first_stage(latents)
            wave = self.vae.decode_to_waveform(mel)
        return wave[0]

    def generate_for_batch(self, prompts, steps=200, guidance=3, samples=1, batch_size=8, disable_progress=True):
        """Generate audio for a list of prompt strings.

        Prompts are processed ``batch_size`` at a time. Returns a flat list of
        waveforms when ``samples == 1``; otherwise the flat list is regrouped
        into consecutive sub-lists of ``samples`` waveforms.
        """
        outputs = []
        for k in tqdm(range(0, len(prompts), batch_size)):
            batch = prompts[k: k + batch_size]
            with torch.no_grad():
                latents = self.model.inference(batch, self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
                mel = self.vae.decode_first_stage(latents)
                wave = self.vae.decode_to_waveform(mel)
            outputs.extend(wave)
        if samples == 1:
            return outputs
        return list(self.chunks(outputs, samples))
# Initialize TANGO: construct the pipeline on the CPU, then move each
# sub-module to the detected device type.
tango = Tango(device="cpu")
for _submodule in (tango.vae, tango.stft, tango.model):
    _submodule.to(device_type)
@spaces.GPU(duration=60)
def gradio_generate(prompt, steps=100, guidance=3):
    """Generate audio for ``prompt`` and return the path of the written WAV file.

    Args:
        prompt: Text prompt forwarded to ``tango.generate``.
        steps: Step count forwarded to ``tango.generate``. Defaults to 100,
            the same default ``Tango.generate`` uses, so the function can be
            called with the prompt alone (the interface examples only supply
            a prompt).
        guidance: Guidance value forwarded to ``tango.generate``. Defaults
            to 3 for the same reason.

    Returns:
        The file name ``"temp.wav"``.
    """
    output_wave = tango.generate(prompt, steps, guidance)
    # Every call writes to the same file name, so the previous result is
    # overwritten.
    output_filename = "temp.wav"
    wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
    return output_filename
# HTML blurb passed as `description` to the Gradio interface below.
# NOTE(review): the "π" and "βοΈ" glyphs at the start of three lines look like
# mis-decoded emoji — confirm against the original file before changing them.
description_text = """
<p style="text-align: center;">
<b><big><big><big>Text-to-Audio</big></big></big></b>
<br/>Generates an audio file, freely, without account, without watermark, that you can download.
</p>
<br/>
<br/>
π Powered by <i>Tango 2</i> AI.
<br/>
<ul>
<li>If you need to generate <b>music</b>, I recommend you to use <i>MusicGen</i>,</li>
</ul>
<br/>
π Slow process... Your computer must <b><u>not</u></b> enter into standby mode.<br/>You can duplicate this space on a free account, it works on CPU.<br/>
<a href='https://huggingface.co/spaces/Fabrice-TIERCELIN/Text-to-Audio?duplicate=true'><img src='https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=&logoWidth=14'></a>
<br/>
βοΈ You can use, modify and share the generated sounds but not for commercial uses.
"""
# Gradio input and output components
input_text = gr.Textbox(
    label="Prompt",
    value="Snort of a horse",
    lines=2,
    autofocus=True,
)
output_audio = gr.Audio(
    label="Generated Audio",
    type="filepath",
)
denoising_steps = gr.Slider(
    label="Steps",
    minimum=100,
    maximum=200,
    value=100,
    step=1,
    interactive=True,
)
guidance_scale = gr.Slider(
    label="Guidance Scale",
    minimum=1,
    maximum=10,
    value=3,
    step=0.1,
    interactive=True,
)
# Gradio interface: one prompt box and two sliders in, one audio file out.
gr_interface = gr.Interface(
    fn=gradio_generate,
    inputs=[input_text, denoising_steps, guidance_scale],
    outputs=[output_audio],
    title="",
    description=description_text,
    # Gradio takes a string here; the boolean form is deprecated.
    allow_flagging="never",
    # Each example supplies the prompt only.
    examples=[
        ["Quiet speech and then and airplane flying away"],
        ["A bicycle peddling on dirt and gravel followed by a man speaking then laughing"],
        ["Ducks quack and water splashes with some animal screeching in the background"],
        ["Describe the sound of the ocean"],
        ["A woman and a baby are having a conversation"],
        ["A man speaks followed by a popping noise and laughter"],
        ["A cup is filled from a faucet"],
        ["An audience cheering and clapping"],
        ["Rolling thunder with lightning strikes"],
        ["A dog barking and a cat mewing and a racing car passes by"],
        ["Gentle water stream, birds chirping and sudden gun shot"],
        ["A man talking followed by a goat baaing then a metal gate sliding shut as ducks quack and wind blows into a microphone."],
        ["A dog barking"],
        ["A cat meowing"],
        ["Wooden table tapping sound while water pouring"],
        ["Applause from a crowd with distant clicking and a man speaking over a loudspeaker"],
        ["two gunshots followed by birds flying away while chirping"],
        ["Whistling with birds chirping"],
        ["A person snoring"],
        ["Motor vehicles are driving with loud engines and a person whistles"],
        ["People cheering in a stadium while thunder and lightning strikes"],
        ["A helicopter is in flight"],
        ["A dog barking and a man talking and a racing car passes by"],
    ],
    cache_examples="lazy",  # Turn on to cache.
)
# Launch Gradio app
# NOTE(review): the positional 10 binds to whichever parameter comes first in
# the installed Gradio release's `queue()` signature, which differs between
# major versions — confirm the intent and pass it by keyword.
gr_interface.queue(10).launch()