TriNguyenPO committed on
Commit
b6e1372
1 Parent(s): 1942c17

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -12
app.py CHANGED
@@ -1,21 +1,38 @@
1
  import gradio as gr
2
- from transformers import AutoModelForSpeechSeq2Seq, AutoTokenizer, pipeline
 
 
3
 
4
- tokenizer = AutoTokenizer.from_pretrained("MarcNg/fastspeech2-vi-infore")
5
- model = AutoModelForSpeechSeq2Seq.from_pretrained("MarcNg/fastspeech2-vi-infore")
 
 
6
 
7
- tts_pipeline = pipeline("text-to-speech", model=model, tokenizer=tokenizer)
8
-
9
- def text_to_speech(text):
10
- output = tts_pipeline(text)
11
- return output["audio"].numpy()
 
 
 
 
 
 
 
 
 
 
 
 
12
 
 
13
  iface = gr.Interface(
14
- fn=text_to_speech,
15
- inputs="text",
16
  outputs="audio",
17
- title="Vietnamese Text-to-Speech",
18
- description="Enter Vietnamese text to convert to speech using FastSpeech 2 model."
19
  )
20
 
21
  iface.launch()
 
1
  import gradio as gr
2
+ import tensorflow as tf
3
+ import numpy as np
4
+ from tensorflow_tts.inference import TFAutoModel, AutoProcessor
5
 
6
# Repository IDs handed to ``from_pretrained``. The text processor and the
# acoustic model are loaded from the same FastSpeech2 repository; the vocoder
# has its own. Both names carry the "ljspeech-en" suffix.
_ACOUSTIC_REPO = "tensorspeech/tts-fastspeech2-ljspeech-en"
_VOCODER_REPO = "tensorspeech/tts-mb_melgan-ljspeech-en"

# Loaded once at import time so every request reuses the same objects.
processor = AutoProcessor.from_pretrained(_ACOUSTIC_REPO)
fastspeech2 = TFAutoModel.from_pretrained(_ACOUSTIC_REPO)
melgan = TFAutoModel.from_pretrained(_VOCODER_REPO)
10
 
11
+ # Define inference function
12
def tts_inference(text):
    """Synthesize speech for *text* and return it in the form Gradio plays.

    The text is turned into token IDs by ``processor``, ``fastspeech2``
    predicts a mel spectrogram from those IDs, and ``melgan`` vocodes the
    spectrogram into a waveform.

    Args:
        text: Input string coming from the Gradio text box.

    Returns:
        A ``(sample_rate, waveform)`` tuple, which is the order Gradio's
        audio output expects. ``waveform`` is a 1-D NumPy array.
    """
    # Sample rate reported to Gradio for playback (same value as before).
    sample_rate = 22050

    # Convert text to a sequence of token IDs.
    input_ids = processor.text_to_sequence(text)

    # FastSpeech2 inference needs the speed/f0/energy ratio tensors in
    # addition to the IDs; 1.0 keeps the model's own predictions unchanged.
    # It returns (mel_before, mel_after, durations, f0, energy); only the
    # post-net mel spectrogram is passed on to the vocoder.
    _, mel_after, *_ = fastspeech2.inference(
        input_ids=tf.expand_dims(
            tf.convert_to_tensor(input_ids, dtype=tf.int32), 0
        ),
        speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
        speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    )

    # Vocode the mel spectrogram; take the first batch item and the single
    # output channel to get a flat waveform.
    audio = melgan.inference(mel_after)[0, :, 0].numpy()

    # Gradio's audio output takes (sample_rate, data), not (data, sample_rate).
    return sample_rate, audio
28
 
29
# Web UI: a single text box feeds ``tts_inference`` and the result is shown
# in an audio player.
# NOTE(review): the description mentions Vietnamese input, while the
# checkpoints loaded in this file are named "ljspeech-en" - confirm which
# one is intended.
iface = gr.Interface(
    tts_inference,
    "text",
    "audio",
    title="FastSpeech2_vi TTS",
    description="Enter Vietnamese text and generate speech using FastSpeech2",
)

# Start serving the interface.
iface.launch()