Spaces:

drewThomasson
/

OuteTTS-DEMO

Running

App Files Files Community

drewThomasson commited on Nov 5

Commit

d084eaa

•

1 Parent(s): a553ade

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -300

app.py CHANGED Viewed

@@ -1,305 +1,34 @@
 import gradio as gr
 from outetts.v0_1.interface import InterfaceHF
-import logging
-import os
-import tempfile
-# Import faster-whisper for transcription
-from faster_whisper import WhisperModel
-# Configure logging to display information in the terminal
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# Initialize the OuteTTS interface with the Hugging Face model
-try:
-    logger.info("Initializing OuteTTS InterfaceHF with model 'OuteAI/OuteTTS-0.1-350M'")
-    interface = InterfaceHF("OuteAI/OuteTTS-0.1-350M")
-    logger.info("OuteTTS model loaded successfully.")
-except Exception as e:
-    logger.error(f"Failed to load OuteTTS model: {e}")
-    raise e
-# Initialize the faster-whisper model
-try:
-    logger.info("Initializing faster-whisper model for transcription.")
-    whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8")
-    logger.info("faster-whisper model loaded successfully.")
-except Exception as e:
-    logger.error(f"Failed to load faster-whisper model: {e}")
-    raise e
-def generate_tts_basic(text, temperature, repetition_penalty, max_length):
-    """
-    Generates speech from the input text using the OuteTTS model (Basic TTS).
-    Parameters:
-        text (str): The input text for TTS.
-        temperature (float): Sampling temperature.
-        repetition_penalty (float): Repetition penalty.
-        max_length (int): Maximum length of the generated audio tokens.
-    Returns:
-        str: Path to the generated audio file.
-    """
-    logger.info("Received Basic TTS generation request.")
-    logger.info(f"Parameters - Text: {text}, Temperature: {temperature}, Repetition Penalty: {repetition_penalty}, Max Length: {max_length}")
-    try:
-        # Due to a typo in interface.py, use 'max_lenght' instead of 'max_length'
-        output = interface.generate(
-            text=text,
-            temperature=temperature,
-            repetition_penalty=repetition_penalty,
-            max_lenght=max_length  # Pass the parameter with typo
-        )
-        logger.info("Basic TTS generation complete.")
-        # Save the output to a temporary WAV file
-        output_path = os.path.join(tempfile.gettempdir(), "basic_output.wav")
-        output.save(output_path)
-        logger.info(f"Basic TTS audio saved to {output_path}")
-        return output_path  # Gradio will handle the audio playback
-    except Exception as e:
-        logger.error(f"Error during Basic TTS generation: {e}")
-        return None
-def transcribe_audio(audio_path):
-    """
-    Transcribes the given audio file using faster-whisper.
-    Parameters:
-        audio_path (str): Path to the audio file.
-    Returns:
-        str: Transcribed text.
-    """
-    logger.info(f"Transcribing audio file: {audio_path}")
-    try:
-        segments, info = whisper_model.transcribe(audio_path)
-        transcript = " ".join([segment.text for segment in segments])
-        logger.info(f"Transcription complete: {transcript}")
-        return transcript
-    except Exception as e:
-        logger.error(f"Error during transcription: {e}")
-        return ""
-def create_speaker_with_transcription(audio_file):
-    """
-    Creates a custom speaker from a reference audio file by automatically transcribing it.
-    Parameters:
-        audio_file (file): Uploaded reference audio file.
-    Returns:
-        dict: Speaker configuration or empty dict if failed.
-    """
-    logger.info("Received Voice Cloning request with audio file.")
-    try:
-        # Save uploaded audio to a temporary file
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
-            temp_audio_path = temp_audio.name
-            temp_audio.write(audio_file.read())
-            logger.info(f"Reference audio saved to {temp_audio_path}")
-        # Transcribe the audio file
-        transcript = transcribe_audio(temp_audio_path)
-        if not transcript.strip():
-            logger.error("Transcription resulted in empty text.")
-            os.remove(temp_audio_path)
-            return {}
-        # Create speaker using the transcribed text
-        speaker = interface.create_speaker(temp_audio_path, transcript)
-        logger.info("Speaker created successfully.")
-        # Clean up the temporary audio file
-        os.remove(temp_audio_path)
-        logger.info(f"Temporary audio file {temp_audio_path} removed.")
-        return speaker
-    except Exception as e:
-        logger.error(f"Error during speaker creation: {e}")
-        return {}
-def generate_tts_cloned(text, temperature, repetition_penalty, max_length, speaker):
-    """
-    Generates speech from the input text using the OuteTTS model with cloned voice.
-    Parameters:
-        text (str): The input text for TTS.
-        temperature (float): Sampling temperature.
-        repetition_penalty (float): Repetition penalty.
-        max_length (int): Maximum length of the generated audio tokens.
-        speaker (dict): Speaker configuration for voice cloning.
-    Returns:
-        str: Path to the generated audio file.
-    """
-    logger.info("Received Cloned TTS generation request.")
-    logger.info(f"Parameters - Text: {text}, Temperature: {temperature}, Repetition Penalty: {repetition_penalty}, Max Length: {max_length}, Speaker Provided: {speaker is not None}")
-    try:
-        if not speaker:
-            logger.error("Speaker configuration is missing.")
-            return None
-        # Due to a typo in interface.py, use 'max_lenght' instead of 'max_length'
-        output = interface.generate(
-            text=text,
-            temperature=temperature,
-            repetition_penalty=repetition_penalty,
-            max_lenght=max_length,  # Pass the parameter with typo
-            speaker=speaker
-        )
-        logger.info("Cloned TTS generation complete.")
-        # Save the output to a temporary WAV file
-        output_path = os.path.join(tempfile.gettempdir(), "cloned_output.wav")
-        output.save(output_path)
-        logger.info(f"Cloned TTS audio saved to {output_path}")
-        return output_path  # Gradio will handle the audio playback
-    except Exception as e:
-        logger.error(f"Error during Cloned TTS generation: {e}")
-        return None
-# Define the Gradio Blocks interface
-with gr.Blocks() as demo:
-    gr.Markdown("# 🎤 OuteTTS - Text to Speech Interface")
-    gr.Markdown(
-        """
-        Generate speech from text using the **OuteTTS-0.1-350M** model.
-        **Key Features:**
-        - Pure language modeling approach to TTS
-        - Voice cloning capabilities with automatic transcription
-        - Compatible with LLaMa architecture
-        """
     )
-    with gr.Tab("Basic TTS"):
-        with gr.Row():
-            text_input_basic = gr.Textbox(
-                label="📄 Text Input",
-                placeholder="Enter the text for TTS generation",
-                lines=3
-            )
-        with gr.Row():
-            temperature_basic = gr.Slider(
-                minimum=0.1,
-                maximum=1.0,
-                value=0.1,
-                step=0.01,
-                label="🌡️ Temperature"
-            )
-            repetition_penalty_basic = gr.Slider(
-                minimum=0.5,
-                maximum=2.0,
-                value=1.1,
-                step=0.1,
-                label="🔁 Repetition Penalty"
-            )
-            max_length_basic = gr.Slider(
-                minimum=256,
-                maximum=4096,
-                value=1024,
-                step=256,
-                label="📏 Max Length"
-            )
-        generate_button_basic = gr.Button("🔊 Generate Speech")
-        output_audio_basic = gr.Audio(
-            label="🎧 Generated Speech",
-            type="filepath"  # Expecting a file path to the audio
-        )
-        # Define the button click event for Basic TTS
-        generate_button_basic.click(
-            fn=generate_tts_basic,
-            inputs=[text_input_basic, temperature_basic, repetition_penalty_basic, max_length_basic],
-            outputs=output_audio_basic
-        )
-    with gr.Tab("Voice Cloning"):
-        with gr.Row():
-            reference_audio = gr.Audio(
-                label="🔊 Reference Audio",
-                type="filepath"  # Receive the path to the uploaded file
-            )
-        create_speaker_button = gr.Button("🎤 Create Speaker")
-        speaker_info = gr.JSON(label="🗂️ Speaker Configuration")  # Removed interactive=False
-        with gr.Row():
-            generate_cloned_speech = gr.Textbox(
-                label="📄 Text Input",
-                placeholder="Enter the text for TTS generation with cloned voice",
-                lines=3
-            )
-        with gr.Row():
-            temperature_clone = gr.Slider(
-                minimum=0.1,
-                maximum=1.0,
-                value=0.1,
-                step=0.01,
-                label="🌡️ Temperature"
-            )
-            repetition_penalty_clone = gr.Slider(
-                minimum=0.5,
-                maximum=2.0,
-                value=1.1,
-                step=0.1,
-                label="🔁 Repetition Penalty"
-            )
-            max_length_clone = gr.Slider(
-                minimum=256,
-                maximum=4096,
-                value=1024,
-                step=256,
-                label="📏 Max Length"
-            )
-        generate_cloned_button = gr.Button("🔊 Generate Cloned Speech")
-        output_cloned_audio = gr.Audio(
-            label="🎧 Generated Cloned Speech",
-            type="filepath"  # Expecting a file path to the audio
-        )
-        # Define the button click event for creating a speaker
-        create_speaker_button.click(
-            fn=create_speaker_with_transcription,
-            inputs=[reference_audio],
-            outputs=speaker_info
-        )
-        # Define the button click event for generating speech with the cloned voice
-        generate_cloned_button.click(
-            fn=generate_tts_cloned,
-            inputs=[generate_cloned_speech, temperature_clone, repetition_penalty_clone, max_length_clone, speaker_info],
-            outputs=output_cloned_audio
-        )
-    gr.Markdown(
-        """
-        ---
-        **Technical Blog:** [OuteTTS-0.1-350M](https://www.outeai.com/blog/OuteTTS-0.1-350M)
-        **Credits:**
-        - [WavTokenizer](https://github.com/jishengpeng/WavTokenizer)
-        - [CTC Forced Alignment](https://pytorch.org/audio/stable/tutorials/ctc_forced_alignment_api_tutorial.html)
-        - [faster-whisper](https://github.com/guillaumekln/faster-whisper)
-        """
-    )
-# Launch the Gradio app without a loading bar
-if __name__ == "__main__":
-    demo.launch(share=True, show_progress=False)

 import gradio as gr
 from outetts.v0_1.interface import InterfaceHF
+# Initialize the TTS model interface
+interface = InterfaceHF("OuteAI/OuteTTS-0.1-350M")
+# Define a function to generate and save TTS output from input text
+def generate_tts(text, temperature=0.1, repetition_penalty=1.1, max_length=4096):
+    output = interface.generate(
+        text=text,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        max_lenght=max_length
     )
+    # Save the output audio to a file
+    output.save("output.wav")
+    return "output.wav"
+# Gradio interface for TTS
+demo = gr.Interface(
+    fn=generate_tts,
+    inputs=[
+        gr.Textbox(lines=2, placeholder="Enter text to convert to speech", label="Text"),
+        gr.Slider(0.1, 1.0, value=0.1, label="Temperature"),
+        gr.Slider(1.0, 2.0, value=1.1, label="Repetition Penalty"),
+        gr.Slider(512, 4096, value=4096, step=256, label="Max Length")
+    ],
+    outputs=gr.Audio(type="file", label="Generated Speech"),
+    title="OuteTTS Text-to-Speech Demo",
+    description="Convert text to speech using the OuteTTS model."
+)
+# Launch the Gradio app
+demo.launch()