Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -11,6 +11,10 @@ pipe = pipeline("automatic-speech-recognition",
|
|
11 |
device=device
|
12 |
)
|
13 |
|
|
|
|
|
|
|
|
|
14 |
|
15 |
# Define a function to translate an audio, in French here
|
16 |
def translate(audio):
|
@@ -19,11 +23,6 @@ def translate(audio):
|
|
19 |
return outputs["text"]
|
20 |
|
21 |
|
22 |
-
# Load the model checkpoint and tokenizer
|
23 |
-
model = VitsModel.from_pretrained("Matthijs/mms-tts-fra")
|
24 |
-
tokenizer = VitsTokenizer.from_pretrained("Matthijs/mms-tts-fra")
|
25 |
-
|
26 |
-
|
27 |
# Define function to generate the waveform output
|
28 |
def synthesise(text):
|
29 |
inputs = tokenizer(text, return_tensors="pt")
|
@@ -35,17 +34,12 @@ def synthesise(text):
|
|
35 |
return outputs.audio[0]
|
36 |
|
37 |
|
38 |
-
# Define global variables
|
39 |
-
target_dtype = np.int16 # format expected by Gradio
|
40 |
-
max_range = np.iinfo(target_dtype).max
|
41 |
-
|
42 |
-
|
43 |
# Define the pipeline
|
44 |
def speech_to_speech_translation(audio):
|
45 |
translated_text = translate(audio)
|
46 |
synthesised_speech = synthesise(translated_text)
|
47 |
synthesised_speech = (
|
48 |
-
synthesised_speech.numpy() *
|
49 |
return 16000, synthesised_speech
|
50 |
|
51 |
|
|
|
11 |
device=device
|
12 |
)
|
13 |
|
14 |
+
# Load the model checkpoint and tokenizer
|
15 |
+
model = VitsModel.from_pretrained("Matthijs/mms-tts-fra")
|
16 |
+
tokenizer = VitsTokenizer.from_pretrained("Matthijs/mms-tts-fra")
|
17 |
+
|
18 |
|
19 |
# Define a function to translate an audio, in French here
|
20 |
def translate(audio):
|
|
|
23 |
return outputs["text"]
|
24 |
|
25 |
|
|
|
|
|
|
|
|
|
|
|
26 |
# Define function to generate the waveform output
|
27 |
def synthesise(text):
|
28 |
inputs = tokenizer(text, return_tensors="pt")
|
|
|
34 |
return outputs.audio[0]
|
35 |
|
36 |
|
|
|
|
|
|
|
|
|
|
|
37 |
# Define the pipeline
|
38 |
def speech_to_speech_translation(audio):
|
39 |
translated_text = translate(audio)
|
40 |
synthesised_speech = synthesise(translated_text)
|
41 |
synthesised_speech = (
|
42 |
+
synthesised_speech.numpy() * 32767).astype(np.int16)
|
43 |
return 16000, synthesised_speech
|
44 |
|
45 |
|