"""Streamlit demo: text-to-speech with FastSpeech2 and a MB-MelGAN vocoder.

The script loads a text processor, a FastSpeech2 acoustic model and a
MB-MelGAN vocoder, then renders a text box and a "Speak" button.  On click
the text is synthesized, written to ``output`` as a WAV file and played back
in the page.
"""

import streamlit as st
from tensorflow_tts.inference import AutoProcessor, TFAutoModel
import tensorflow as tf
import numpy as np
import soundfile as sf
import yaml

# NOTE(review): Streamlit re-executes this script on every interaction, so
# these loaders run again on each rerun.  Consider wrapping them in the
# caching decorator supported by the installed Streamlit version.
processor = AutoProcessor.from_pretrained("MarcNg/fastspeech2-vi-infore")
fastspeech2 = TFAutoModel.from_pretrained("MarcNg/fastspeech2-vi-infore")
mb_melgan = TFAutoModel.from_pretrained("tensorspeech/tts-mb_melgan-ljspeech-en")

# Path of the WAV file that is (over)written on every synthesis.
output = "output.wav"

# Sample rate handed to soundfile when writing the WAV file.
# NOTE(review): presumably the rate the vocoder was trained at -- confirm.
SAMPLE_RATE = 22050

st.header("MarcNg/fastspeech2-vi-infore Demo")


def tts(text):
    """Run the FastSpeech2 model on *text* and return its mel output.

    Args:
        text: The sentence to synthesize.

    Returns:
        The second value returned by ``fastspeech2.inference`` (named
        ``mel_after`` here), which the caller feeds to the vocoder.
    """
    input_ids = processor.text_to_sequence(text)
    # Only the second of the five outputs is needed; the rest are discarded.
    _, mel_after, _, _, _ = fastspeech2.inference(
        # The id sequence gets a leading axis of size 1 before inference.
        input_ids=tf.expand_dims(
            tf.convert_to_tensor(input_ids, dtype=tf.int32), 0
        ),
        speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
        # All ratios are fixed at 1.0 (presumably "unmodified" speed, pitch
        # and energy -- confirm against the model documentation).
        speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    )
    return mel_after


text = st.text_input("Text to process")

if st.button("Speak"):
    if not text or not text.strip():
        # Nothing to synthesize: skip inference instead of feeding the
        # acoustic model an empty sequence.
        st.warning("Please enter some text to synthesize.")
    else:
        mel_after = tts(text)
        # Index [0, :, 0] keeps the first entry along the first and last
        # axes of the vocoder output, leaving a 1-D waveform.
        audio_after = mb_melgan.inference(mel_after)[0, :, 0]
        sf.write(output, audio_after, SAMPLE_RATE, subtype="PCM_16")
        st.audio(output, format="audio/wav")