Spaces:
Runtime error
Runtime error
import torch | |
import gradio as gr | |
import torchaudio | |
import time | |
from datetime import datetime | |
import numpy as np # Add this import for handling numpy arrays | |
from transformers import pipeline | |
from tortoise.api import TextToSpeech | |
from tortoise.utils.text import split_and_recombine_text | |
from tortoise.utils.audio import load_audio, load_voice, load_voices | |
# --- Speech-to-text -------------------------------------------------------
# Hugging Face ASR pipeline backed by the checkpoint named in `model_id`.
model_id = "openai/whisper-tiny"
pipe = pipeline(task="automatic-speech-recognition", model=model_id)

# --- Text-to-speech -------------------------------------------------------
# Voice names offered in the UI dropdown and handed to `load_voice`.
# NOTE(review): the first entry is spelled with a lowercase "f" while the
# other female voices use "F" -- confirm against the actual voice folders.
VOICE_OPTIONS = [
    "indian_f_1",
    "indian_F_2",
    "indian_F_3",
    "indian_M_1",
    "indian_M_2",
    "indian_M_3",
]

# NOTE(review): `use_deepspeed` and `half` presumably require a CUDA device
# with DeepSpeed installed -- verify for the deployment target.
tts = TextToSpeech(half=True, kv_cache=True, use_deepspeed=True)
def convert_audio(filepath, voice="indian_F_1"):
    """Transcribe a recording and re-synthesise the text in another voice.

    The audio at ``filepath`` is transcribed with the module-level ASR
    pipeline ``pipe``; the resulting text is split into chunks and each
    chunk is synthesised with the module-level ``tts`` model using the
    requested voice. All produced frames are joined into one array.

    Args:
        filepath: Path of the input audio file (the Gradio audio input is
            configured with ``type="filepath"``).
        voice: Voice name passed to ``load_voice``.
            NOTE(review): the default "indian_F_1" does not appear in
            VOICE_OPTIONS (which lists "indian_f_1") -- confirm which
            spelling matches the voice data.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 24000
        and ``samples`` is the numpy concatenation of all generated frames.

    Raises:
        gr.Error: If no input audio was supplied, or if nothing could be
            transcribed/synthesised (previously this surfaced as an opaque
            ``ValueError`` from ``np.concatenate`` on an empty list).
    """
    if not filepath:
        raise gr.Error("No audio provided. Please record or upload a clip.")

    # Transcribe audio to text using STT.
    transcription_output = pipe(
        filepath,
        max_new_tokens=256,
        generate_kwargs={
            "task": "transcribe",
            "language": "english",
        },
        chunk_length_s=30,
        batch_size=8,
    )
    transcribed_text = transcription_output["text"]
    if not transcribed_text or not transcribed_text.strip():
        raise gr.Error("No speech could be transcribed from the audio.")

    # Use the transcribed text for TTS, one manageable chunk at a time.
    texts = split_and_recombine_text(transcribed_text)
    voice_samples, conditioning_latents = load_voice(voice)

    audio_frames = []
    for text in texts:
        for audio_frame in tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset="ultra_fast",
            k=1,
        ):
            # Move to host memory and drop autograd state before converting.
            audio_frames.append(audio_frame.cpu().detach().numpy())

    # np.concatenate rejects an empty sequence, so fail with a clear message.
    if not audio_frames:
        raise gr.Error("No audio could be generated from the transcription.")

    # Join the audio frames into a single array for output.
    final_audio = np.concatenate(audio_frames, axis=0)
    return (24000, final_audio)
# Gradio UI: one audio input plus a voice selector, wired to convert_audio.
interface = gr.Interface(
    fn=convert_audio,
    inputs=[
        # The handler receives the recording as a path on disk
        # (no explicit `source` argument is passed here).
        gr.Audio(type="filepath"),
        gr.Dropdown(
            choices=VOICE_OPTIONS,
            value="indian_f_1",
            label="Select voice:",
            type="value",
        ),
    ],
    outputs=gr.Audio(
        label="streaming audio:",
        streaming=True,
        autoplay=True,
    ),
    title="STT to TTS",
    description="Convert spoken words into a different voice",
)

# Start the web server for the app.
interface.launch()