# tortoise-tts-v2 / app.py
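"""Gradio demo: transcribe an uploaded recording with Whisper (STT),
then re-speak the transcript in a selected Tortoise-TTS voice (TTS)."""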
import numpy as np  # audio chunks are concatenated as numpy arrays
import gradio as gr
from transformers import pipeline
from tortoise.api import TextToSpeech
from tortoise.utils.text import split_and_recombine_text
from tortoise.utils.audio import load_voice
# STT Initialization
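# whisper-tiny is the smallest Whisper checkpoint; it keeps the demo fast
# at some cost in transcription accuracy.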
model_id = "openai/whisper-tiny"
pipe = pipeline("automatic-speech-recognition", model=model_id)
# TTS Initialization
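# Voice names must match voice folders that load_voice() can find
# (note the mixed-case entries; folder names are case-sensitive).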
VOICE_OPTIONS = [
"indian_f_1", "indian_F_2", "indian_F_3",
"indian_M_1", "indian_M_2", "indian_M_3"
]
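# kv_cache speeds up autoregressive decoding, half=True runs inference in fp16,
# and use_deepspeed requires the deepspeed package to be installed.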
tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)
def convert_audio(filepath, voice="indian_f_1"):
    # Step 1: transcribe the recording to English text with Whisper
    transcription_output = pipe(
        filepath,
        max_new_tokens=256,
        generate_kwargs={
            "task": "transcribe",
            "language": "english",
        },
        chunk_length_s=30,  # process long recordings in 30-second windows
        batch_size=8,
    )
    transcribed_text = transcription_output["text"]
    # Step 2: re-synthesize the transcript in the selected Tortoise voice
    texts = split_and_recombine_text(transcribed_text)
    voice_samples, conditioning_latents = load_voice(voice)
    audio_frames = []
    for text in texts:
        for audio_frame in tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset="ultra_fast",
            k=1,
        ):
            # squeeze to 1-D so the chunks concatenate into a single waveform
            audio_frames.append(audio_frame.detach().cpu().numpy().squeeze())
    final_audio = np.concatenate(audio_frames)
    return (24000, final_audio)  # Tortoise generates audio at 24 kHz
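
# Quick local check (hypothetical sample path, not part of the repo):
#   sample_rate, waveform = convert_audio("sample.wav", voice="indian_M_1")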
interface = gr.Interface(
    fn=convert_audio,
    inputs=[
        gr.Audio(type="filepath"),  # the recording/upload is passed in as a file path
        gr.Dropdown(VOICE_OPTIONS, value="indian_f_1", label="Select voice:", type="value"),
    ],
    outputs=gr.Audio(label="Output audio:", streaming=True, autoplay=True),
    title="STT to TTS",
    description="Convert spoken words into a different voice",
)

interface.launch()