Fabrice-TIERCELIN committed on
Commit
33e899a
•
1 Parent(s): b8d94a7
Files changed (1)
  1. app.py +42 -11
app.py CHANGED
@@ -10,6 +10,8 @@ from audioldm.audio.stft import TacotronSTFT
 from audioldm.variational_autoencoder import AutoencoderKL
 from pydub import AudioSegment
 
+max_64_bit_int = 2**63 - 1
+
 # Automatic device detection
 if torch.cuda.is_available():
     device_type = "cuda"
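Note: 2**63 - 1 is the largest signed 64-bit integer, so it is a safe inclusive upper bound for seeds handed to int64-backed RNG APIs. A minimal sketch of what that bound guarantees:

import random
import torch

max_64_bit_int = 2**63 - 1  # int64 maximum

seed = random.randint(0, max_64_bit_int)  # inclusive on both ends
torch.manual_seed(seed)                   # in range for torch's seed API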
@@ -81,11 +83,18 @@ tango.vae.to(device_type)
 tango.stft.to(device_type)
 tango.model.to(device_type)
 
+def update_seed(is_randomize_seed, seed):
+    if is_randomize_seed:
+        return random.randint(0, max_64_bit_int)
+    return seed
+
 def check(
     prompt,
     output_number,
     steps,
-    guidance
+    guidance,
+    is_randomize_seed,
+    seed
 ):
     if prompt is None or prompt == "":
         raise gr.Error("Please provide a prompt input.")
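Note: update_seed either re-rolls the seed or passes the slider value through, and check gains the same two trailing parameters even though it only validates input, because check and text2audio are wired to one shared inputs list further down, so their signatures must stay aligned. A standalone sketch of the helper's contract, assuming random is already imported in app.py:

import random

max_64_bit_int = 2**63 - 1

def update_seed(is_randomize_seed, seed):
    # Re-roll only when asked; otherwise keep the value from the UI.
    if is_randomize_seed:
        return random.randint(0, max_64_bit_int)
    return seed

assert update_seed(False, 123) == 123                  # pass-through
assert 0 <= update_seed(True, 123) <= max_64_bit_int   # fresh draw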
@@ -104,9 +113,18 @@ def text2audio(
     prompt,
     output_number,
     steps,
-    guidance
+    guidance,
+    is_randomize_seed,
+    seed
 ):
     start = time.time()
+
+    if seed is None:
+        seed = random.randint(0, max_64_bit_int)
+
+    random.seed(seed)
+    torch.manual_seed(seed)
+
     output_wave = tango.generate(prompt, steps, guidance, output_number)
 
     output_wave_1 = gr.make_waveform((16000, output_wave[0]))
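Note: seeding both Python's random module and torch right before tango.generate makes a run repeatable for a given (prompt, steps, guidance, seed) tuple, and the None guard covers callers that skip the UI. One hedged caveat: torch.manual_seed seeds the CUDA generators too, but bit-exact GPU reproducibility can also depend on deterministic-algorithm settings, which this commit leaves untouched. A minimal sketch of the pattern, with a hypothetical generate() standing in for tango.generate and seeded_call as an illustrative name:

import random
import torch

max_64_bit_int = 2**63 - 1

def seeded_call(generate, prompt, steps, guidance, count, seed = None):
    # Guard: callers that pass no seed still get a reproducible run.
    if seed is None:
        seed = random.randint(0, max_64_bit_int)

    # Seed the RNGs the pipeline may draw from (CPU and CUDA).
    random.seed(seed)
    torch.manual_seed(seed)

    # Returning the seed lets callers log or replay the exact run.
    return generate(prompt, steps, guidance, count), seed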
@@ -162,6 +180,8 @@ with gr.Blocks() as interface:
     output_number = gr.Slider(label = "Number of generations", info = "1, 2 or 3 output files", minimum = 1, maximum = 3, value = 3, step = 1, interactive = True)
     denoising_steps = gr.Slider(label = "Steps", info = "lower=faster & variant, higher=audio quality & similar", minimum = 10, maximum = 200, value = 10, step = 1, interactive = True)
     guidance_scale = gr.Slider(label = "Guidance Scale", info = "lower=audio quality, higher=follow the prompt", minimum = 1, maximum = 10, value = 3, step = 0.1, interactive = True)
+    randomize_seed = gr.Checkbox(label = "\U0001F3B2 Randomize seed", value = True, info = "If checked, result is always different")
+    seed = gr.Slider(minimum = 0, maximum = max_64_bit_int, step = 1, randomize = True, label = "Seed")
 
     submit = gr.Button("🚀 Generate", variant = "primary")
 
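Note: "\U0001F3B2" is the Python escape for the 🎲 emoji, and randomize = True gives the seed slider a fresh value on each page load, independent of the checkbox. A caveat worth hedging for a 0..2**63-1 slider: browsers carry slider values as IEEE-754 doubles, which cannot represent every integer above 2**53, so an exact typed-in seed can be rounded. A numeric field is one hypothetical alternative when exactness matters:

import gradio as gr

with gr.Blocks() as demo:
    # Hypothetical alternative to the Seed slider: gr.Number preserves the
    # typed integer (precision = 0 forces int) instead of a slider position.
    seed = gr.Number(label = "Seed", value = 123, precision = 0)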
@@ -170,11 +190,18 @@ with gr.Blocks() as interface:
     output_audio_3 = gr.Audio(label = "Generated Audio #3/3", format = "wav", type="numpy")
     information = gr.Label(label = "Information")
 
-    submit.click(fn = check, inputs = [
+    submit.click(fn = update_seed, inputs = [
+        randomize_seed,
+        seed
+    ], outputs = [
+        seed
+    ], queue = False, show_progress = False).then(fn = check, inputs = [
         input_text,
         output_number,
         denoising_steps,
-        guidance_scale
+        guidance_scale,
+        randomize_seed,
+        seed
     ], outputs = [], queue = False, show_progress = False).success(fn = update_output, inputs = [
         output_format,
         output_number
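Note: the rewiring makes update_seed the first link of the chain and writes its result back into the seed slider, so the UI always shows the seed the run actually used. In Gradio's event chaining, .then() queues the next step unconditionally while .success() fires only if the previous step raised no error, which is how check's gr.Error stops the chain before any generation work. A reduced sketch of the same three-step pattern, with toy handlers (roll, validate, run are illustrative names):

import random
import gradio as gr

def roll(randomize, seed):
    # Mirrors update_seed: re-roll or pass through.
    return random.randint(0, 2**63 - 1) if randomize else seed

def validate(text, seed):
    # Mirrors check: raising gr.Error aborts the rest of the chain.
    if not text:
        raise gr.Error("Please provide a prompt input.")

def run(text, seed):
    # Stands in for text2audio.
    return f"{text} (seed = {seed})"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label = "Prompt")
    randomize = gr.Checkbox(label = "Randomize seed", value = True)
    seed = gr.Slider(minimum = 0, maximum = 2**63 - 1, step = 1, label = "Seed")
    result = gr.Textbox(label = "Result")
    go = gr.Button("Go")

    go.click(fn = roll, inputs = [randomize, seed], outputs = [seed],
             queue = False, show_progress = False
    ).then(fn = validate, inputs = [prompt, seed], outputs = [],
           queue = False, show_progress = False
    ).success(fn = run, inputs = [prompt, seed], outputs = [result])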
@@ -187,7 +214,9 @@ with gr.Blocks() as interface:
         input_text,
         output_number,
         denoising_steps,
-        guidance_scale
+        guidance_scale,
+        randomize_seed,
+        seed
     ], outputs = [
         output_audio_1,
         output_audio_2,
@@ -209,7 +238,9 @@ with gr.Blocks() as interface:
         input_text,
         output_number,
         denoising_steps,
-        guidance_scale
+        guidance_scale,
+        randomize_seed,
+        seed
     ],
     outputs = [
         output_audio_1,
@@ -218,11 +249,11 @@ with gr.Blocks() as interface:
         information
     ],
     examples = [
-        ["A hammer is hitting a wooden surface", 3, 100, 3],
-        ["Peaceful and calming ambient music with singing bowl and other instruments.", 3, 100, 3],
-        ["A man is speaking in a small room.", 2, 100, 3],
-        ["A female is speaking followed by footstep sound", 1, 100, 3],
-        ["Wooden table tapping sound followed by water pouring sound.", 3, 200, 3],
+        ["A hammer is hitting a wooden surface", 3, 100, 3, False, 123],
+        ["Peaceful and calming ambient music with singing bowl and other instruments.", 3, 100, 3, False, 123],
+        ["A man is speaking in a small room.", 2, 100, 3, False, 123],
+        ["A female is speaking followed by footstep sound", 1, 100, 3, False, 123],
+        ["Wooden table tapping sound followed by water pouring sound.", 3, 200, 3, False, 123],
     ],
     cache_examples = "lazy",
 )
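Note: each example row now carries one value per wired input, in order: prompt, output_number, steps, guidance, randomize_seed, seed. Pinning randomize_seed to False and the seed to 123 matters with cache_examples = "lazy": the audio cached for an example should then be reproducible if the cache is ever rebuilt, instead of differing on every build.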
 