mrfakename committed
Commit 9df052c
1 Parent(s): 1755826

Sync from GitHub repo


This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions there.

Files changed (1):
  1. app.py +15 -107
app.py CHANGED
@@ -10,7 +10,6 @@ import numpy as np
 import soundfile as sf
 import torchaudio
 from cached_path import cached_path
-from pydub import AudioSegment
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 try:
@@ -114,58 +113,6 @@ def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_
     return (final_sample_rate, final_wave), spectrogram_path
 
 
-@gpu_decorator
-def generate_podcast(
-    script, speaker1_name, ref_audio1, ref_text1, speaker2_name, ref_audio2, ref_text2, model, remove_silence
-):
-    # Split the script into speaker blocks
-    speaker_pattern = re.compile(f"^({re.escape(speaker1_name)}|{re.escape(speaker2_name)}):", re.MULTILINE)
-    speaker_blocks = speaker_pattern.split(script)[1:]  # Skip the first empty element
-
-    generated_audio_segments = []
-
-    for i in range(0, len(speaker_blocks), 2):
-        speaker = speaker_blocks[i]
-        text = speaker_blocks[i + 1].strip()
-
-        # Determine which speaker is talking
-        if speaker == speaker1_name:
-            ref_audio = ref_audio1
-            ref_text = ref_text1
-        elif speaker == speaker2_name:
-            ref_audio = ref_audio2
-            ref_text = ref_text2
-        else:
-            continue  # Skip if the speaker is neither speaker1 nor speaker2
-
-        # Generate audio for this block
-        audio, _ = infer(ref_audio, ref_text, text, model, remove_silence)
-
-        # Convert the generated audio to a numpy array
-        sr, audio_data = audio
-
-        # Save the audio data as a WAV file
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
-            sf.write(temp_file.name, audio_data, sr)
-            audio_segment = AudioSegment.from_wav(temp_file.name)
-
-        generated_audio_segments.append(audio_segment)
-
-        # Add a short pause between speakers
-        pause = AudioSegment.silent(duration=500)  # 500ms pause
-        generated_audio_segments.append(pause)
-
-    # Concatenate all audio segments
-    final_podcast = sum(generated_audio_segments)
-
-    # Export the final podcast
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
-        podcast_path = temp_file.name
-        final_podcast.export(podcast_path, format="wav")
-
-    return podcast_path
-
-
 with gr.Blocks() as app_credits:
     gr.Markdown("""
 # Credits
@@ -225,53 +172,6 @@ with gr.Blocks() as app_tts:
         outputs=[audio_output, spectrogram_output],
     )
 
-with gr.Blocks() as app_podcast:
-    gr.Markdown("# Podcast Generation")
-    speaker1_name = gr.Textbox(label="Speaker 1 Name")
-    ref_audio_input1 = gr.Audio(label="Reference Audio (Speaker 1)", type="filepath")
-    ref_text_input1 = gr.Textbox(label="Reference Text (Speaker 1)", lines=2)
-
-    speaker2_name = gr.Textbox(label="Speaker 2 Name")
-    ref_audio_input2 = gr.Audio(label="Reference Audio (Speaker 2)", type="filepath")
-    ref_text_input2 = gr.Textbox(label="Reference Text (Speaker 2)", lines=2)
-
-    script_input = gr.Textbox(
-        label="Podcast Script",
-        lines=10,
-        placeholder="Enter the script with speaker names at the start of each block, e.g.:\nSean: How did you start studying...\n\nMeghan: I came to my interest in technology...\nIt was a long journey...\n\nSean: That's fascinating. Can you elaborate...",
-    )
-
-    podcast_model_choice = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
-    podcast_remove_silence = gr.Checkbox(
-        label="Remove Silences",
-        value=True,
-    )
-    generate_podcast_btn = gr.Button("Generate Podcast", variant="primary")
-    podcast_output = gr.Audio(label="Generated Podcast")
-
-    def podcast_generation(
-        script, speaker1, ref_audio1, ref_text1, speaker2, ref_audio2, ref_text2, model, remove_silence
-    ):
-        return generate_podcast(
-            script, speaker1, ref_audio1, ref_text1, speaker2, ref_audio2, ref_text2, model, remove_silence
-        )
-
-    generate_podcast_btn.click(
-        podcast_generation,
-        inputs=[
-            script_input,
-            speaker1_name,
-            ref_audio_input1,
-            ref_text_input1,
-            speaker2_name,
-            ref_audio_input2,
-            ref_text_input2,
-            podcast_model_choice,
-            podcast_remove_silence,
-        ],
-        outputs=podcast_output,
-    )
-
 
 def parse_speechtypes_text(gen_text):
     # Pattern to find {speechtype}
@@ -298,7 +198,7 @@ def parse_speechtypes_text(gen_text):
     return segments
 
 
-with gr.Blocks() as app_emotional:
+with gr.Blocks() as app_multistyle:
     # New section for emotional generation
     gr.Markdown(
         """
@@ -306,9 +206,13 @@ with gr.Blocks() as app_emotional:
 
     This section allows you to upload different audio clips for each speech type. 'Regular' emotion is mandatory. You can add additional speech types by clicking the "Add Speech Type" button. Enter your text in the format shown below, and the system will generate speech using the appropriate emotions. If unspecified, the model will use the regular speech type. The current speech type will be used until the next speech type is specified.
 
-    **Example Input:**
-
-    {Regular} Hello, I'd like to order a sandwich please. {Surprised} What do you mean you're out of bread? {Sad} I really wanted a sandwich though... {Angry} You know what, darn you and your little shop, you suck! {Whisper} I'll just go back home and cry now. {Shouting} Why me?!
+    **Example Input:**
+    {Regular} Hello, I'd like to order a sandwich please.
+    {Surprised} What do you mean you're out of bread?
+    {Sad} I really wanted a sandwich though...
+    {Angry} You know what, darn you and your little shop!
+    {Whisper} I'll just go back home and cry now.
+    {Shouting} Why me?!
     """
     )
 
@@ -392,7 +296,11 @@ with gr.Blocks() as app_emotional:
     delete_btn.click(delete_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows)
 
     # Text input for the prompt
-    gen_text_input_emotional = gr.Textbox(label="Text to Generate", lines=10)
+    gen_text_input_emotional = gr.Textbox(
+        label="Text to Generate ( Make sure the type names you entered match the Speech Type Name above ! ! ! )",
+        lines=10,
+        placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
+    )
 
     # Model choice
     model_choice_emotional = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
@@ -694,8 +602,8 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
     """
     )
     gr.TabbedInterface(
-        [app_tts, app_podcast, app_emotional, app_chat, app_credits],
-        ["TTS", "Podcast", "Multi-Style", "Voice-Chat", "Credits"],
+        [app_tts, app_multistyle, app_chat, app_credits],
+        ["TTS", "Multi-Style", "Voice-Chat", "Credits"],
     )
 
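A side note on the parsing trick in the removed `generate_podcast`: it relies on the fact that `re.split` keeps the text of capturing groups in its result, so the split list alternates speaker labels and dialogue. A minimal, self-contained sketch of that behavior — the speaker names and script below are purely illustrative, not taken from app.py:

```python
import re

# Illustrative inputs; the app read these from Gradio components.
speaker1_name, speaker2_name = "Sean", "Meghan"
script = "Sean: How did you start?\nMeghan: It was a long journey.\nSean: Fascinating."

# The capturing group makes re.split() keep each matched speaker name,
# so the result alternates [speaker, text, speaker, text, ...].
speaker_pattern = re.compile(
    f"^({re.escape(speaker1_name)}|{re.escape(speaker2_name)}):", re.MULTILINE
)
blocks = speaker_pattern.split(script)[1:]  # drop the empty string before the first label

for speaker, text in zip(blocks[0::2], blocks[1::2]):
    print(speaker, "->", text.strip())
# Sean -> How did you start?
# Meghan -> It was a long journey.
# Sean -> Fascinating.
```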
 
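For context on the Multi-Style tab that this commit keeps: the diff shows only the signature of `parse_speechtypes_text` and its `# Pattern to find {speechtype}` comment; the body is elided. Below is a hedged sketch of how a parser for the `{speechtype}` format in the example input might work — an illustration under that assumption, not the actual implementation in app.py:

```python
import re

def parse_speechtypes_text(gen_text):
    # Split on {SpeechType} markers; the capturing group keeps each marker's
    # name in the result, at the odd indices of the token list.
    tokens = re.split(r"\{(.*?)\}", gen_text)
    segments = []
    current_style = "Regular"  # text before any marker defaults to Regular
    for i, token in enumerate(tokens):
        if i % 2 == 1:
            current_style = token.strip()  # a captured style name
        elif token.strip():
            segments.append({"emotion": current_style, "text": token.strip()})
    return segments

print(parse_speechtypes_text("{Regular} Hello. {Surprised} What?!"))
# [{'emotion': 'Regular', 'text': 'Hello.'}, {'emotion': 'Surprised', 'text': 'What?!'}]
```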