Fabrice-TIERCELIN committed • Commit 33e899a
1 Parent(s): b8d94a7
Seed
app.py CHANGED
@@ -10,6 +10,8 @@ from audioldm.audio.stft import TacotronSTFT
 from audioldm.variational_autoencoder import AutoencoderKL
 from pydub import AudioSegment
 
+max_64_bit_int = 2**63 - 1
+
 # Automatic device detection
 if torch.cuda.is_available():
     device_type = "cuda"
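The new constant is the largest signed 64-bit integer, which keeps any randomly drawn seed inside the range that torch.manual_seed accepts. A quick standalone illustration (plain Python, not the Space's code):

import random
import torch

max_64_bit_int = 2**63 - 1               # largest signed 64-bit integer
seed = random.randint(0, max_64_bit_int)
torch.manual_seed(seed)                  # any value in [0, 2**63 - 1] is valid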
@@ -81,11 +83,18 @@ tango.vae.to(device_type)
 tango.stft.to(device_type)
 tango.model.to(device_type)
 
+def update_seed(is_randomize_seed, seed):
+    if is_randomize_seed:
+        return random.randint(0, max_64_bit_int)
+    return seed
+
 def check(
     prompt,
     output_number,
     steps,
-    guidance
+    guidance,
+    is_randomize_seed,
+    seed
 ):
     if prompt is None or prompt == "":
         raise gr.Error("Please provide a prompt input.")
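update_seed runs before validation and either draws a fresh seed or passes the slider value through unchanged. A minimal sketch of the same helper in isolation, with assertion-style usage (illustrative, mirroring the diff rather than importing the Space):

import random

max_64_bit_int = 2**63 - 1

def update_seed(is_randomize_seed, seed):
    # Fresh seed when randomization is requested, otherwise keep the user's value.
    if is_randomize_seed:
        return random.randint(0, max_64_bit_int)
    return seed

assert update_seed(False, 123) == 123                    # passthrough
assert 0 <= update_seed(True, 123) <= max_64_bit_int     # fresh draw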
@@ -104,9 +113,18 @@ def text2audio(
     prompt,
     output_number,
     steps,
-    guidance
+    guidance,
+    is_randomize_seed,
+    seed
 ):
     start = time.time()
+
+    if seed is None:
+        seed = random.randint(0, max_64_bit_int)
+
+    random.seed(seed)
+    torch.manual_seed(seed)
+
     output_wave = tango.generate(prompt, steps, guidance, output_number)
 
     output_wave_1 = gr.make_waveform((16000, output_wave[0]))
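Seeding both Python's random module and torch before calling tango.generate is what makes a run reproducible: the same prompt, steps, guidance, and seed should yield the same output (modulo any nondeterministic CUDA kernels). A self-contained sketch of the idea, where seeded_noise is a hypothetical stand-in for the generation call:

import random
import torch

def seeded_noise(seed, n = 4):
    # Re-seeding both RNGs pins every random draw that follows,
    # so the same seed reproduces the same tensor.
    random.seed(seed)
    torch.manual_seed(seed)
    return torch.randn(n)

assert torch.equal(seeded_noise(123), seeded_noise(123))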
@@ -162,6 +180,8 @@ with gr.Blocks() as interface:
     output_number = gr.Slider(label = "Number of generations", info = "1, 2 or 3 output files", minimum = 1, maximum = 3, value = 3, step = 1, interactive = True)
     denoising_steps = gr.Slider(label = "Steps", info = "lower=faster & variant, higher=audio quality & similar", minimum = 10, maximum = 200, value = 10, step = 1, interactive = True)
     guidance_scale = gr.Slider(label = "Guidance Scale", info = "lower=audio quality, higher=follow the prompt", minimum = 1, maximum = 10, value = 3, step = 0.1, interactive = True)
+    randomize_seed = gr.Checkbox(label = "\U0001F3B2 Randomize seed", value = True, info = "If checked, result is always different")
+    seed = gr.Slider(minimum = 0, maximum = max_64_bit_int, step = 1, randomize = True, label = "Seed")
 
     submit = gr.Button("🚀 Generate", variant = "primary")
 
@@ -170,11 +190,18 @@ with gr.Blocks() as interface:
     output_audio_3 = gr.Audio(label = "Generated Audio #3/3", format = "wav", type="numpy")
     information = gr.Label(label = "Information")
 
-    submit.click(fn = check, inputs = [
+    submit.click(fn = update_seed, inputs = [
+        randomize_seed,
+        seed
+    ], outputs = [
+        seed
+    ], queue = False, show_progress = False).then(fn = check, inputs = [
         input_text,
         output_number,
         denoising_steps,
-        guidance_scale
+        guidance_scale,
+        randomize_seed,
+        seed
     ], outputs = [], queue = False, show_progress = False).success(fn = update_output, inputs = [
         output_format,
         output_number
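The click handler is now a three-stage chain: .click() resolves the seed and writes it back to the slider, .then() runs check unconditionally, and .success() only fires when the previous step raised no gr.Error, so a bad prompt aborts generation. A minimal runnable sketch of the same chain with generic components (keyword arguments mirror the diff's Gradio 3-era API):

import random
import gradio as gr

def update_seed(is_randomize_seed, seed):
    return random.randint(0, 2**63 - 1) if is_randomize_seed else seed

def check(prompt):
    if prompt is None or prompt == "":
        raise gr.Error("Please provide a prompt input.")

def generate(prompt, seed):
    return f"Would generate '{prompt}' with seed {seed}"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label = "Prompt")
    randomize_seed = gr.Checkbox(label = "Randomize seed", value = True)
    seed = gr.Slider(minimum = 0, maximum = 2**63 - 1, step = 1, label = "Seed")
    result = gr.Textbox(label = "Result")
    submit = gr.Button("Generate")

    # .then() always runs next; .success() runs only if no gr.Error was raised.
    submit.click(fn = update_seed, inputs = [randomize_seed, seed], outputs = [seed],
                 queue = False, show_progress = False
    ).then(fn = check, inputs = [prompt], outputs = [],
           queue = False, show_progress = False
    ).success(fn = generate, inputs = [prompt, seed], outputs = [result])

# demo.launch()  # uncomment to try the chain locally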
@@ -187,7 +214,9 @@ with gr.Blocks() as interface:
         input_text,
         output_number,
         denoising_steps,
-        guidance_scale
+        guidance_scale,
+        randomize_seed,
+        seed
     ], outputs = [
         output_audio_1,
         output_audio_2,
@@ -209,7 +238,9 @@ with gr.Blocks() as interface:
             input_text,
             output_number,
             denoising_steps,
-            guidance_scale
+            guidance_scale,
+            randomize_seed,
+            seed
         ],
         outputs = [
             output_audio_1,
@@ -218,11 +249,11 @@ with gr.Blocks() as interface:
             information
         ],
         examples = [
-            ["A hammer is hitting a wooden surface", 3, 100, 3],
-            ["Peaceful and calming ambient music with singing bowl and other instruments.", 3, 100, 3],
-            ["A man is speaking in a small room.", 2, 100, 3],
-            ["A female is speaking followed by footstep sound", 1, 100, 3],
-            ["Wooden table tapping sound followed by water pouring sound.", 3, 200, 3],
+            ["A hammer is hitting a wooden surface", 3, 100, 3, False, 123],
+            ["Peaceful and calming ambient music with singing bowl and other instruments.", 3, 100, 3, False, 123],
+            ["A man is speaking in a small room.", 2, 100, 3, False, 123],
+            ["A female is speaking followed by footstep sound", 1, 100, 3, False, 123],
+            ["Wooden table tapping sound followed by water pouring sound.", 3, 200, 3, False, 123],
         ],
         cache_examples = "lazy",
     )
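Each example row now carries six values in the positional order of the handler's inputs: prompt, output count, steps, guidance, randomize flag, seed. Pinning the flag to False and the seed to 123 keeps the lazily cached examples reproducible; schematically:

# Illustrative only: example rows line up positionally with the inputs list.
inputs  = ["input_text", "output_number", "denoising_steps",
           "guidance_scale", "randomize_seed", "seed"]
example = ["A hammer is hitting a wooden surface", 3, 100, 3, False, 123]
assert len(example) == len(inputs)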