import numpy as np  # Used to join the generated audio frames
import gradio as gr
from transformers import pipeline
from tortoise.api import TextToSpeech
from tortoise.utils.text import split_and_recombine_text
from tortoise.utils.audio import load_voice

# STT Initialization
model_id = "openai/whisper-tiny"
pipe = pipeline("automatic-speech-recognition", model=model_id)

# TTS Initialization
VOICE_OPTIONS = [
    "indian_F_1",
    "indian_F_2",
    "indian_F_3",
    "indian_M_1",
    "indian_M_2",
    "indian_M_3",
]

tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)


def convert_audio(filepath, voice="indian_F_1"):
    # Transcribe the uploaded audio to text using the Whisper STT pipeline
    transcription_output = pipe(filepath)
    transcribed_text = transcription_output["text"]

    # Split the transcribed text into chunks Tortoise can handle,
    # then load the reference samples and latents for the selected voice
    texts = split_and_recombine_text(transcribed_text)
    voice_samples, conditioning_latents = load_voice(voice)

    audio_frames = []
    for text in texts:
        for audio_frame in tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            k=1,
        ):
            audio_frames.append(audio_frame.cpu().detach().numpy())

    # Join the audio frames for output using numpy's concatenate.
    # Tortoise renders audio at 24 kHz; yielding (rather than returning)
    # the (sample_rate, data) tuple satisfies Gradio's streaming output API.
    final_audio = np.concatenate(audio_frames, axis=0)
    yield (24000, final_audio)


interface = gr.Interface(
    fn=convert_audio,
    inputs=[
        gr.Audio(source="upload", type="filepath"),
        gr.Dropdown(VOICE_OPTIONS, value="indian_F_1", label="Select voice:", type="value"),
    ],
    outputs=gr.Audio(label="Streaming audio:", streaming=True, autoplay=True),
    title="STT to TTS",
    description="Convert spoken words into a different voice",
)

interface.launch()
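
# Optional smoke test without the UI: a minimal sketch, assuming a local
# recording named "sample.wav" exists and a CUDA GPU is available (the
# half-precision/DeepSpeed settings above require one). "sample.wav" and
# "out.wav" are hypothetical file names. Run this *instead of*
# interface.launch(), since launch() blocks; convert_audio is a generator,
# so iterate over its yielded (sample_rate, audio) tuples.
#
#   import scipy.io.wavfile as wavfile
#   for sample_rate, audio in convert_audio("sample.wav", voice="indian_F_1"):
#       wavfile.write("out.wav", sample_rate, audio)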