"""Gradio demo: text-to-speech with a FastSpeech2 acoustic model and a MelGAN-family vocoder.

Three pre-trained TensorFlowTTS checkpoints are loaded at import time
(processor, FastSpeech2, multi-band MelGAN) and exposed through a single-text-box
Gradio interface.
"""

from typing import Optional, Tuple

import gradio as gr
import numpy as np
import tensorflow as tf
from tensorflow_tts.inference import AutoProcessor, TFAutoModel

# Sample rate (Hz) reported to Gradio for playback. The LJSpeech corpus these
# checkpoints are named after is recorded at 22.05 kHz.
SAMPLE_RATE = 22050

# Load pre-trained models once at start-up so each request only runs inference.
processor = AutoProcessor.from_pretrained("tensorspeech/tts-fastspeech2-ljspeech-en")
fastspeech2 = TFAutoModel.from_pretrained("tensorspeech/tts-fastspeech2-ljspeech-en")
melgan = TFAutoModel.from_pretrained("tensorspeech/tts-mb_melgan-ljspeech-en")


def tts_inference(text: str) -> Optional[Tuple[int, np.ndarray]]:
    """Synthesize speech for *text*.

    Args:
        text: The sentence to speak.

    Returns:
        ``(sample_rate, waveform)`` in the order Gradio's audio output
        expects, or ``None`` (no audio) when *text* is empty or only
        whitespace.
    """
    # Nothing to synthesize; returning None leaves the audio output empty
    # instead of pushing an empty sequence through the models.
    if not text or not text.strip():
        return None

    # Convert text to a sequence of symbol ids.
    input_ids = processor.text_to_sequence(text)

    # Generate the mel spectrogram. The ratio tensors are part of the
    # FastSpeech2 inference signature; 1.0 keeps the model's predicted
    # speed / pitch / energy unchanged.
    # Per the published model usage, inference returns
    # (mel_before, mel_after, durations, f0, energy); only the post-net
    # output `mel_after` is fed to the vocoder.
    _, mel_after, _, _, _ = fastspeech2.inference(
        input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
        speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
        speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    )

    # Convert the mel spectrogram to a waveform; keep the first batch item
    # and the first channel, then move it out of TensorFlow.
    audio = melgan.inference(mel_after)[0, :, 0].numpy()

    # Gradio's audio output takes (sample_rate, data), in that order.
    return SAMPLE_RATE, audio


# Create the Gradio interface.
# NOTE(review): the title/description advertise Vietnamese, but the checkpoints
# loaded above are the LJSpeech *English* models — confirm which is intended
# and align either the model ids or these strings.
iface = gr.Interface(
    fn=tts_inference,
    inputs="text",
    outputs="audio",
    title="FastSpeech2_vi TTS",
    description="Enter Vietnamese text and generate speech using FastSpeech2",
)

iface.launch()