speech-to-speech-translation

Runtime error

App Files Files Community

crowbarmassage committed on Aug 23, 2023

Commit

147fb27

•

1 Parent(s): 316ede8

Upload app.py

Browse files

Files changed (1) hide show

app.py +99 -36

app.py CHANGED Viewed

@@ -1,72 +1,135 @@
-import gradio as gr
-import numpy as np
-import torch
 from datasets import load_dataset
-from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-# load speech translation checkpoint
-asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
-# load text-to-speech checkpoint and speaker embeddings
-processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
-embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-def translate(audio):
-    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
-    return outputs["text"]
 def synthesise(text):
-    inputs = processor(text=text, return_tensors="pt")
-    speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
-    return speech.cpu()
 def speech_to_speech_translation(audio):
-    translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text)
-    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
     return 16000, synthesised_speech
-title = "Cascaded STST"
-description = """
-Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
-[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
-![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
-"""
 demo = gr.Blocks()
 mic_translate = gr.Interface(
-    fn=speech_to_speech_translation,
     inputs=gr.Audio(source="microphone", type="filepath"),
     outputs=gr.Audio(label="Generated Speech", type="numpy"),
-    title=title,
-    description=description,
 )
 file_translate = gr.Interface(
-    fn=speech_to_speech_translation,
     inputs=gr.Audio(source="upload", type="filepath"),
     outputs=gr.Audio(label="Generated Speech", type="numpy"),
-    examples=[["./example.wav"]],
-    title=title,
-    description=description,
 )
 with demo:
     gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
-demo.launch()

+# -*- coding: utf-8 -*-
+"""app.ipynb
+Automatically generated by Colaboratory.
+Original file is located at
+    https://colab.research.google.com/drive/143eWt9oxUTcF59OBiVybOgKXJB3QOTsK
+"""
+# Beginning of Unit 7
+#!pip install git+https://github.com/huggingface/transformers.git
+!pip install torch accelerate torchaudio datasets gradio sentencepiece
+!pip install -U transformers
+#!pip install sacremoses
+#!pip install -Uqq datasets[audio]
+#!pip install git+https://github.com/huggingface/transformers
+from transformers.models.markuplm.tokenization_markuplm import MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING
+import torch, torchaudio
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+import sentencepiece
+from transformers import MarianMTModel, MarianTokenizer
 from datasets import load_dataset
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from IPython.display import Audio
+import numpy as np
+# Sample dtype for 16-bit signed PCM audio.
+target_dtype = np.int16
+# Largest value representable in target_dtype (32767 for int16).
+# NOTE(review): neither constant is referenced by the code visible in this
+# file, which hard-codes 32767 where it scales audio; confirm before removing.
+max_range = np.iinfo(target_dtype).max
+# Speech recognition: transcribe the source (Spanish) audio to text with Whisper
+def transcribe(audio):
+    model_id_asr = "openai/whisper-small"
+    processor_asr = WhisperProcessor.from_pretrained(model_id_asr)
+    model_asr = WhisperForConditionalGeneration.from_pretrained(model_id_asr)
+    model_asr.config.forced_decoder_ids = None
+    input_features = processor_asr(audio["audio"]["array"], sampling_rate=audio["audio"]["sampling_rate"], return_tensors="pt").input_features
+    predicted_ids = model_asr.generate(input_features)
+    # decode token ids to text
+    transcription = processor_asr.batch_decode(predicted_ids, skip_special_tokens=True)
+    return transcription[0]
+# Machine translation: translate the transcribed Spanish text to French
+def translate(text):
+    model_id_mt = "Helsinki-NLP/opus-mt-es-fr"
+    tokenizer_mt = MarianTokenizer.from_pretrained(model_id_mt)
+    model_mt = MarianMTModel.from_pretrained(model_id_mt)
+    # Tokenize the input text
+    input_ids = tokenizer_mt.encode(text, return_tensors="pt")
+    # Generate translation
+    with torch.no_grad():
+        translated_ids = model_mt.generate(input_ids)
+    # Decode the translated text
+    translated_text = tokenizer_mt.decode(translated_ids[0], skip_special_tokens=True)
+    return translated_text
 def synthesise(text):
+    processor_tts = SpeechT5Processor.from_pretrained("crowbarmassage/speecht5_finetuned_voxpopuli_fr")
+    model_tts = SpeechT5ForTextToSpeech.from_pretrained("crowbarmassage/speecht5_finetuned_voxpopuli_fr")
+    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+    inputs = processor_tts(text=text, return_tensors="pt")
+    speech = model_tts.generate_speech(
+        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
+    )
+    return speech
 def speech_to_speech_translation(audio):
+    transcribed_text = transcribe(audio)
+    translated_text = translate(transcribed_text)
     synthesised_speech = synthesise(translated_text)
     return 16000, synthesised_speech
+def adjusted_speech_to_speech_translation(audio_filepath):
+    # Load the audio file
+    waveform, sampling_rate = torchaudio.load(audio_filepath)
+    if sampling_rate != 16000:
+      resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
+      waveform = resampler(waveform)
+      sampling_rate = 16000
+    # Convert the waveform to a numpy array and construct the expected dictionary format
+    audio_dict = {
+        "audio": {
+            "array": waveform.numpy(),
+            "sampling_rate": sampling_rate
+        }
+    }
+    transcribed_text = transcribe(audio_dict)
+    translated_text = translate(transcribed_text)
+    #print(transcribed_text)
+    #print(translated_text)
+    synthesised_speech = synthesise(translated_text)
+    #print(synthesised_speech)
+    #print(torch.min(synthesised_speech), torch.max(synthesised_speech))
+    synthesised_speech = (synthesised_speech * 32767).numpy().astype(np.int16)
+    #print(synthesised_speech)
+    #print(np.min(synthesised_speech), np.max(synthesised_speech))
+    return 16000, synthesised_speech
+# Gradio UI: two tabs (microphone recording and file upload) that both call
+# adjusted_speech_to_speech_translation.
+import gradio as gr
 demo = gr.Blocks()
+# Microphone tab: the input audio is handed to the function as a file path
+# and the result is returned as numpy audio.
 mic_translate = gr.Interface(
+    fn=adjusted_speech_to_speech_translation,
     inputs=gr.Audio(source="microphone", type="filepath"),
     outputs=gr.Audio(label="Generated Speech", type="numpy"),
 )
+# Upload tab: same function and output as the microphone tab; only the audio
+# source differs.
 file_translate = gr.Interface(
+    fn=adjusted_speech_to_speech_translation,
     inputs=gr.Audio(source="upload", type="filepath"),
     outputs=gr.Audio(label="Generated Speech", type="numpy"),
 )
+# Place both interfaces in a tabbed layout inside the Blocks container.
 with demo:
     gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
+# Start the app with debug enabled and sharing disabled.
+demo.launch(debug=True, share=False)