hamza666 committed
Commit c4d170b
1 Parent(s): e6a5261

Create app.py

Files changed (1)
  1. app.py +65 -0
app.py ADDED
@@ -0,0 +1,65 @@
+ import numpy as np
+ import torch
+ import gradio as gr
+ from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+ from datasets import load_dataset
+ 
+ # Run on GPU when available, otherwise fall back to CPU.
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ 
+ # Whisper handles the speech-recognition-and-translation half of the pipeline.
+ pipe = pipeline(
+     "automatic-speech-recognition", model="openai/whisper-base", device=device
+ )
+ 
+ # SpeechT5 (text-to-speech) plus its HiFi-GAN vocoder handle the synthesis half.
+ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+ model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+ 
+ # SpeechT5 needs a speaker embedding; index 7306 picks one fixed speaker
+ # from the CMU ARCTIC x-vector dataset.
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+ 
+ 
+ def translate(audio):
+     # Generate with Whisper's translation task, forcing the French language token.
+     outputs = pipe(
+         audio,
+         max_new_tokens=256,
+         generate_kwargs={"task": "translate", "language": "fr"},
+     )
+     return outputs["text"]
+ 
+ 
+ def synthesise(text):
+     inputs = processor(text=text, return_tensors="pt")
+     speech = model.generate_speech(
+         inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
+     )
+     return speech.cpu()
+ 
+ 
+ # Gradio's numpy audio output expects int16 PCM, so rescale the float waveform.
+ target_dtype = np.int16
+ max_range = np.iinfo(target_dtype).max
+ 
+ 
+ def speech_to_speech_translation(audio):
+     translated_text = translate(audio)
+     print(translated_text)
+     synthesised_speech = synthesise(translated_text)
+     synthesised_speech = (synthesised_speech.numpy() * max_range).astype(np.int16)
+     # SpeechT5 generates audio at a 16 kHz sample rate.
+     return 16000, synthesised_speech
+ 
+ 
+ demo = gr.Blocks()
+ 
+ mic_translate = gr.Interface(
+     fn=speech_to_speech_translation,
+     inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
+     outputs=gr.Audio(label="Generated Speech", type="numpy"),
+ )
+ 
+ file_translate = gr.Interface(
+     fn=speech_to_speech_translation,
+     inputs=gr.Audio(sources=["upload"], type="filepath"),
+     outputs=gr.Audio(label="Generated Speech", type="numpy"),
+ )
+ 
+ with demo:
+     gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
+ 
+ demo.launch()
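
Since the app only exercises the model chain through the Gradio UI, a quick way to sanity-check it locally is to call speech_to_speech_translation directly on a recording and write the result to disk. A minimal sketch, assuming scipy is available and a speech WAV named sample_fr.wav sits next to app.py (both the filename and the presence of scipy are assumptions, not part of this commit):

import scipy.io.wavfile as wavfile

# Hypothetical smoke test: run the full translate-then-synthesise chain on a local file.
sample_rate, waveform = speech_to_speech_translation("sample_fr.wav")
# waveform is already int16 PCM at 16 kHz, so it can be written as-is.
wavfile.write("translated.wav", sample_rate, waveform)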