drewThomasson committed on
Commit
d084eaa
1 Parent(s): a553ade

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -300
app.py CHANGED
@@ -1,305 +1,34 @@
1
  import gradio as gr
2
  from outetts.v0_1.interface import InterfaceHF
3
- import logging
4
- import os
5
- import tempfile
6
 
7
- # Import faster-whisper for transcription
8
- from faster_whisper import WhisperModel
9
 
10
- # Configure logging to display information in the terminal
11
- logging.basicConfig(level=logging.INFO)
12
- logger = logging.getLogger(__name__)
13
-
14
- # Initialize the OuteTTS interface with the Hugging Face model
15
- try:
16
- logger.info("Initializing OuteTTS InterfaceHF with model 'OuteAI/OuteTTS-0.1-350M'")
17
- interface = InterfaceHF("OuteAI/OuteTTS-0.1-350M")
18
- logger.info("OuteTTS model loaded successfully.")
19
- except Exception as e:
20
- logger.error(f"Failed to load OuteTTS model: {e}")
21
- raise e
22
-
23
- # Initialize the faster-whisper model
24
- try:
25
- logger.info("Initializing faster-whisper model for transcription.")
26
- whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8")
27
- logger.info("faster-whisper model loaded successfully.")
28
- except Exception as e:
29
- logger.error(f"Failed to load faster-whisper model: {e}")
30
- raise e
31
-
32
- def generate_tts_basic(text, temperature, repetition_penalty, max_length):
33
- """
34
- Generates speech from the input text using the OuteTTS model (Basic TTS).
35
-
36
- Parameters:
37
- text (str): The input text for TTS.
38
- temperature (float): Sampling temperature.
39
- repetition_penalty (float): Repetition penalty.
40
- max_length (int): Maximum length of the generated audio tokens.
41
-
42
- Returns:
43
- str: Path to the generated audio file.
44
- """
45
- logger.info("Received Basic TTS generation request.")
46
- logger.info(f"Parameters - Text: {text}, Temperature: {temperature}, Repetition Penalty: {repetition_penalty}, Max Length: {max_length}")
47
-
48
- try:
49
- # Due to a typo in interface.py, use 'max_lenght' instead of 'max_length'
50
- output = interface.generate(
51
- text=text,
52
- temperature=temperature,
53
- repetition_penalty=repetition_penalty,
54
- max_lenght=max_length # Pass the parameter with typo
55
- )
56
- logger.info("Basic TTS generation complete.")
57
-
58
- # Save the output to a temporary WAV file
59
- output_path = os.path.join(tempfile.gettempdir(), "basic_output.wav")
60
- output.save(output_path)
61
- logger.info(f"Basic TTS audio saved to {output_path}")
62
-
63
- return output_path # Gradio will handle the audio playback
64
- except Exception as e:
65
- logger.error(f"Error during Basic TTS generation: {e}")
66
- return None
67
-
68
- def transcribe_audio(audio_path):
69
- """
70
- Transcribes the given audio file using faster-whisper.
71
-
72
- Parameters:
73
- audio_path (str): Path to the audio file.
74
-
75
- Returns:
76
- str: Transcribed text.
77
- """
78
- logger.info(f"Transcribing audio file: {audio_path}")
79
- try:
80
- segments, info = whisper_model.transcribe(audio_path)
81
- transcript = " ".join([segment.text for segment in segments])
82
- logger.info(f"Transcription complete: {transcript}")
83
- return transcript
84
- except Exception as e:
85
- logger.error(f"Error during transcription: {e}")
86
- return ""
87
-
88
- def create_speaker_with_transcription(audio_file):
89
- """
90
- Creates a custom speaker from a reference audio file by automatically transcribing it.
91
-
92
- Parameters:
93
- audio_file (file): Uploaded reference audio file.
94
-
95
- Returns:
96
- dict: Speaker configuration or empty dict if failed.
97
- """
98
- logger.info("Received Voice Cloning request with audio file.")
99
-
100
- try:
101
- # Save uploaded audio to a temporary file
102
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
103
- temp_audio_path = temp_audio.name
104
- temp_audio.write(audio_file.read())
105
- logger.info(f"Reference audio saved to {temp_audio_path}")
106
-
107
- # Transcribe the audio file
108
- transcript = transcribe_audio(temp_audio_path)
109
-
110
- if not transcript.strip():
111
- logger.error("Transcription resulted in empty text.")
112
- os.remove(temp_audio_path)
113
- return {}
114
-
115
- # Create speaker using the transcribed text
116
- speaker = interface.create_speaker(temp_audio_path, transcript)
117
- logger.info("Speaker created successfully.")
118
-
119
- # Clean up the temporary audio file
120
- os.remove(temp_audio_path)
121
- logger.info(f"Temporary audio file {temp_audio_path} removed.")
122
-
123
- return speaker
124
- except Exception as e:
125
- logger.error(f"Error during speaker creation: {e}")
126
- return {}
127
-
128
- def generate_tts_cloned(text, temperature, repetition_penalty, max_length, speaker):
129
- """
130
- Generates speech from the input text using the OuteTTS model with cloned voice.
131
-
132
- Parameters:
133
- text (str): The input text for TTS.
134
- temperature (float): Sampling temperature.
135
- repetition_penalty (float): Repetition penalty.
136
- max_length (int): Maximum length of the generated audio tokens.
137
- speaker (dict): Speaker configuration for voice cloning.
138
-
139
- Returns:
140
- str: Path to the generated audio file.
141
- """
142
- logger.info("Received Cloned TTS generation request.")
143
- logger.info(f"Parameters - Text: {text}, Temperature: {temperature}, Repetition Penalty: {repetition_penalty}, Max Length: {max_length}, Speaker Provided: {speaker is not None}")
144
-
145
- try:
146
- if not speaker:
147
- logger.error("Speaker configuration is missing.")
148
- return None
149
-
150
- # Due to a typo in interface.py, use 'max_lenght' instead of 'max_length'
151
- output = interface.generate(
152
- text=text,
153
- temperature=temperature,
154
- repetition_penalty=repetition_penalty,
155
- max_lenght=max_length, # Pass the parameter with typo
156
- speaker=speaker
157
- )
158
- logger.info("Cloned TTS generation complete.")
159
-
160
- # Save the output to a temporary WAV file
161
- output_path = os.path.join(tempfile.gettempdir(), "cloned_output.wav")
162
- output.save(output_path)
163
- logger.info(f"Cloned TTS audio saved to {output_path}")
164
-
165
- return output_path # Gradio will handle the audio playback
166
- except Exception as e:
167
- logger.error(f"Error during Cloned TTS generation: {e}")
168
- return None
169
-
170
- # Define the Gradio Blocks interface
171
- with gr.Blocks() as demo:
172
- gr.Markdown("# 🎤 OuteTTS - Text to Speech Interface")
173
- gr.Markdown(
174
- """
175
- Generate speech from text using the **OuteTTS-0.1-350M** model.
176
-
177
- **Key Features:**
178
- - Pure language modeling approach to TTS
179
- - Voice cloning capabilities with automatic transcription
180
- - Compatible with LLaMa architecture
181
- """
182
  )
183
-
184
- with gr.Tab("Basic TTS"):
185
- with gr.Row():
186
- text_input_basic = gr.Textbox(
187
- label="📄 Text Input",
188
- placeholder="Enter the text for TTS generation",
189
- lines=3
190
- )
191
-
192
- with gr.Row():
193
- temperature_basic = gr.Slider(
194
- minimum=0.1,
195
- maximum=1.0,
196
- value=0.1,
197
- step=0.01,
198
- label="🌡️ Temperature"
199
- )
200
- repetition_penalty_basic = gr.Slider(
201
- minimum=0.5,
202
- maximum=2.0,
203
- value=1.1,
204
- step=0.1,
205
- label="🔁 Repetition Penalty"
206
- )
207
- max_length_basic = gr.Slider(
208
- minimum=256,
209
- maximum=4096,
210
- value=1024,
211
- step=256,
212
- label="📏 Max Length"
213
- )
214
-
215
- generate_button_basic = gr.Button("🔊 Generate Speech")
216
-
217
- output_audio_basic = gr.Audio(
218
- label="🎧 Generated Speech",
219
- type="filepath" # Expecting a file path to the audio
220
- )
221
-
222
- # Define the button click event for Basic TTS
223
- generate_button_basic.click(
224
- fn=generate_tts_basic,
225
- inputs=[text_input_basic, temperature_basic, repetition_penalty_basic, max_length_basic],
226
- outputs=output_audio_basic
227
- )
228
-
229
- with gr.Tab("Voice Cloning"):
230
- with gr.Row():
231
- reference_audio = gr.Audio(
232
- label="🔊 Reference Audio",
233
- type="filepath" # Receive the path to the uploaded file
234
- )
235
-
236
- create_speaker_button = gr.Button("🎤 Create Speaker")
237
-
238
- speaker_info = gr.JSON(label="🗂️ Speaker Configuration") # Removed interactive=False
239
-
240
- with gr.Row():
241
- generate_cloned_speech = gr.Textbox(
242
- label="📄 Text Input",
243
- placeholder="Enter the text for TTS generation with cloned voice",
244
- lines=3
245
- )
246
-
247
- with gr.Row():
248
- temperature_clone = gr.Slider(
249
- minimum=0.1,
250
- maximum=1.0,
251
- value=0.1,
252
- step=0.01,
253
- label="🌡️ Temperature"
254
- )
255
- repetition_penalty_clone = gr.Slider(
256
- minimum=0.5,
257
- maximum=2.0,
258
- value=1.1,
259
- step=0.1,
260
- label="🔁 Repetition Penalty"
261
- )
262
- max_length_clone = gr.Slider(
263
- minimum=256,
264
- maximum=4096,
265
- value=1024,
266
- step=256,
267
- label="📏 Max Length"
268
- )
269
-
270
- generate_cloned_button = gr.Button("🔊 Generate Cloned Speech")
271
-
272
- output_cloned_audio = gr.Audio(
273
- label="🎧 Generated Cloned Speech",
274
- type="filepath" # Expecting a file path to the audio
275
- )
276
-
277
- # Define the button click event for creating a speaker
278
- create_speaker_button.click(
279
- fn=create_speaker_with_transcription,
280
- inputs=[reference_audio],
281
- outputs=speaker_info
282
- )
283
-
284
- # Define the button click event for generating speech with the cloned voice
285
- generate_cloned_button.click(
286
- fn=generate_tts_cloned,
287
- inputs=[generate_cloned_speech, temperature_clone, repetition_penalty_clone, max_length_clone, speaker_info],
288
- outputs=output_cloned_audio
289
- )
290
-
291
- gr.Markdown(
292
- """
293
- ---
294
- **Technical Blog:** [OuteTTS-0.1-350M](https://www.outeai.com/blog/OuteTTS-0.1-350M)
295
-
296
- **Credits:**
297
- - [WavTokenizer](https://github.com/jishengpeng/WavTokenizer)
298
- - [CTC Forced Alignment](https://pytorch.org/audio/stable/tutorials/ctc_forced_alignment_api_tutorial.html)
299
- - [faster-whisper](https://github.com/guillaumekln/faster-whisper)
300
- """
301
- )
302
-
303
- # Launch the Gradio app without a loading bar
304
- if __name__ == "__main__":
305
- demo.launch(share=True, show_progress=False)
 
1
  import gradio as gr
2
  from outetts.v0_1.interface import InterfaceHF
 
 
 
3
 
4
# Hugging Face model id of the OuteTTS checkpoint used by this demo.
_MODEL_ID = "OuteAI/OuteTTS-0.1-350M"

# Build the shared TTS interface once at import time; generate_tts() reuses it
# for every request.
interface = InterfaceHF(_MODEL_ID)
6
 
7
def generate_tts(text, temperature=0.1, repetition_penalty=1.1, max_length=4096):
    """Generate speech for ``text`` and return the path of the saved WAV file.

    Args:
        text: Text to convert to speech.
        temperature: Sampling temperature forwarded to ``interface.generate``.
        repetition_penalty: Repetition penalty forwarded to
            ``interface.generate``.
        max_length: Maximum generation length forwarded to
            ``interface.generate``.

    Returns:
        Path (str) of a newly created ``.wav`` file holding the generated audio.
    """
    # Local imports keep this function self-contained; the module's import
    # block only brings in gradio and outetts.
    import os
    import tempfile

    output = interface.generate(
        text=text,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        # Not a typo on our side: the previous revision of this file notes that
        # interface.py spells this keyword "max_lenght".
        max_lenght=max_length,
    )

    # Write each result to its own file. A single shared "output.wav" in the
    # working directory can be overwritten by another request before it has
    # been served, and requires the working directory to be writable.
    # NOTE(review): the file is not deleted here because the caller still needs
    # to read it; confirm whether periodic temp-dir cleanup is required.
    fd, output_path = tempfile.mkstemp(prefix="outetts_", suffix=".wav")
    os.close(fd)  # output.save() reopens the path itself
    output.save(output_path)
    return output_path
18
+
19
# Gradio interface for TTS: one text box plus three sliders whose order matches
# the positional parameters of generate_tts(text, temperature,
# repetition_penalty, max_length).
demo = gr.Interface(
    fn=generate_tts,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter text to convert to speech", label="Text"),
        gr.Slider(0.1, 1.0, value=0.1, label="Temperature"),
        gr.Slider(1.0, 2.0, value=1.1, label="Repetition Penalty"),
        gr.Slider(512, 4096, value=4096, step=256, label="Max Length"),
    ],
    # generate_tts returns a path to a WAV file, so the Audio output must use
    # type="filepath" ("file" is not an accepted value for gr.Audio).
    outputs=gr.Audio(type="filepath", label="Generated Speech"),
    title="OuteTTS Text-to-Speech Demo",
    description="Convert text to speech using the OuteTTS model.",
)

# Launch the Gradio app
demo.launch()