# --- Hugging Face Spaces file-viewer residue (not part of the program) ---
# Spaces:
# Sleeping
# Sleeping
# File size: 2,520 Bytes
# 1c2b467 7beb980 1c2b467 7beb980 1c2b467 20945f0 7beb980 20945f0 7beb980 1c2b467 602984b 1c2b467 7beb980 1c2b467 7beb980 1c2b467 7beb980 1c2b467 7beb980 1c2b467 7beb980 602984b 1c2b467 7beb980 1c2b467 20945f0 7beb980 1c2b467 7beb980 |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import torch
from transformers import pipeline, VitsModel, VitsTokenizer
import numpy as np
import gradio as gr
# Use the first CUDA device when torch reports one; otherwise run on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Speech recognition stage: Whisper-small wrapped in a transformers pipeline
# that is placed on `device`.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
    device=device,
)

# Text-to-speech stage: the VITS weights and the matching tokenizer are both
# loaded from the same checkpoint name.
model = VitsModel.from_pretrained("Matthijs/mms-tts-fra")
tokenizer = VitsTokenizer.from_pretrained("Matthijs/mms-tts-fra")
# Define a function to translate an audio, in French here
def translate(audio):
    """Run the module-level Whisper pipeline on ``audio`` and return its text.

    Args:
        audio: Input handed unchanged to ``pipe``.

    Returns:
        The ``"text"`` entry of the pipeline result.

    NOTE(review): the generation task is "transcribe" with the language set
    to "fr"; presumably this steers Whisper into emitting French text for
    any source language -- confirm that this is the intended behaviour.
    """
    generation_options = {"task": "transcribe", "language": "fr"}
    result = pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs=generation_options,
    )
    return result["text"]
# Define function to generate the waveform output
def synthesise(text):
    """Convert ``text`` to speech with the module-level VITS model.

    Args:
        text: String to speak; tokenized with the module-level ``tokenizer``.

    Returns:
        The first item of the model's generated audio batch.
        NOTE(review): presumably a 1-D float tensor of samples -- confirm
        against the installed transformers version.
    """
    encoded = tokenizer(text, return_tensors="pt")

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(encoded["input_ids"])

    # Released transformers versions name the generated audio field
    # ``waveform``; this script was written against an output object that
    # called it ``audio``. Prefer the current name and fall back to the old
    # one so both variants work instead of raising AttributeError.
    waveform = getattr(outputs, "waveform", None)
    if waveform is None:
        waveform = outputs.audio
    return waveform[0]
# Define the pipeline
def speech_to_speech_translation(audio):
    """Run the full cascade: input speech -> text -> synthesised speech.

    Args:
        audio: Input forwarded unchanged to ``translate``.

    Returns:
        A ``(sampling_rate, samples)`` tuple, where ``samples`` is a NumPy
        ``int16`` array built from the synthesised waveform.
    """
    translated_text = translate(audio)

    # `.cpu()` is a no-op for CPU tensors and keeps `.numpy()` valid if the
    # TTS model is ever moved to an accelerator.
    waveform = synthesise(translated_text).cpu().numpy()

    # Clip to [-1, 1] before scaling: a sample outside that range would
    # exceed the int16 limits after multiplication and wrap around on the
    # cast, which is audible as loud clicks.
    pcm_samples = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)

    # NOTE(review): 16000 is assumed to match the TTS model's output sampling
    # rate -- confirm against the loaded model's configuration.
    return 16000, pcm_samples
# Page title and Markdown description; both are passed to every gr.Interface
# created below (microphone tab and file-upload tab).
title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in French. Demo uses OpenAI's [Whisper Small](https://huggingface.co/openai/whisper-small) model for speech translation, and Facebook's
[MMS TTS](https://huggingface.co/facebook/mms-tts) model, finetuned by [Matthijs](https://huggingface.co/Matthijs), for text-to-speech:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
# Top-level container that hosts the tabbed UI.
demo = gr.Blocks()

# Tab 1: audio captured from the microphone, run through the full cascade.
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

# Tab 2: same cascade for an uploaded audio file, with one bundled example.
file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    title=title,
    description=description,
)

# Mount both interfaces as tabs inside the Blocks container.
with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

# Start the web app. The stray trailing "|" that followed this call was
# viewer residue and a syntax error, so it has been removed.
demo.launch()