speech-to-speech-translation

Sleeping

App Files Files Community

speech-to-speech-translation / app.py

64FC

Change Whisper-small to Distil-Whisper-large

894def0 about 1 year ago

raw

history blame

2.69 kB

	import torch
	from transformers import pipeline, VitsModel, VitsTokenizer
	import numpy as np
	import gradio as gr

	# Run speech recognition on the first CUDA device when PyTorch can see one,
	# otherwise fall back to the CPU.
	if torch.cuda.is_available():
	    device = "cuda:0"
	else:
	    device = "cpu"

	# Speech-recognition pipeline backed by Distil-Whisper large-v2 (this
	# checkpoint replaced the previously used "openai/whisper-small").
	# NOTE(review): the Distil-Whisper model card presents this checkpoint as
	# English-only, while translate() forces language "fr" -- confirm that the
	# French output is acceptable with this model.
	pipe = pipeline(
	    task="automatic-speech-recognition",
	    model="distil-whisper/distil-large-v2",
	    device=device,
	)

	# The VITS text-to-speech model and its tokenizer are loaded from the same
	# checkpoint. The model is not moved to `device` here, so it stays wherever
	# from_pretrained() places it.
	TTS_CHECKPOINT = "Matthijs/mms-tts-fra"
	model = VitsModel.from_pretrained(TTS_CHECKPOINT)
	tokenizer = VitsTokenizer.from_pretrained(TTS_CHECKPOINT)


	# Define a function that turns the input speech into French text
	def translate(audio):
	    """Run the speech-recognition pipeline on ``audio`` and return its text.

	    Args:
	        audio: Input forwarded unchanged to the ASR pipeline (the Gradio
	            inputs in this file are configured with ``type="filepath"``).

	    Returns:
	        The ``"text"`` entry of the pipeline output.
	    """
	    # The task is "transcribe" with the language forced to French --
	    # presumably so the decoder emits French whatever language is spoken;
	    # verify against the loaded checkpoint.
	    decoding_options = {"task": "transcribe", "language": "fr"}
	    result = pipe(audio, max_new_tokens=256, generate_kwargs=decoding_options)
	    return result["text"]


	# Define function to generate the waveform output
	def synthesise(text):
	    """Generate a speech waveform for ``text`` with the text-to-speech model.

	    Args:
	        text: The string to be spoken.

	    Returns:
	        The first element of the model output's ``audio`` field, i.e. the
	        waveform produced for the single input text.
	    """
	    token_ids = tokenizer(text, return_tensors="pt")["input_ids"]

	    # Inference only: no gradients are needed for the forward pass.
	    with torch.no_grad():
	        waveform = model(token_ids).audio[0]

	    return waveform


	# Define the pipeline
	def speech_to_speech_translation(audio):
	    """Turn input speech into synthesised speech of its recognised text.

	    The audio is first converted to text with ``translate`` and that text is
	    then spoken with ``synthesise``.

	    Args:
	        audio: Input passed straight through to ``translate``.

	    Returns:
	        A ``(sampling_rate, samples)`` tuple where ``samples`` is a NumPy
	        ``int16`` array, the format used by the ``type="numpy"`` audio
	        outputs in this file.
	    """
	    translated_text = translate(audio)
	    # .numpy() requires the synthesised tensor to live on the CPU.
	    waveform = synthesise(translated_text).numpy()

	    # Clip before scaling: any float sample outside [-1, 1] would otherwise
	    # exceed the int16 range and be corrupted by the cast.
	    int16_max = np.iinfo(np.int16).max  # 32767
	    pcm_samples = (np.clip(waveform, -1.0, 1.0) * int16_max).astype(np.int16)

	    # Report the rate the TTS model actually generates at (16000 Hz is the
	    # VITS config default) instead of a hard-coded constant.
	    return model.config.sampling_rate, pcm_samples


	# Title and Markdown description displayed by each Gradio interface.
	# The description names the Distil-Whisper checkpoint that the
	# speech-recognition pipeline in this file actually loads.
	title = "Cascaded STST"
	description = """
	Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in French. Demo uses the [Distil-Whisper large-v2](https://huggingface.co/distil-whisper/distil-large-v2) model for speech translation, and Facebook's
	[MMS TTS](https://huggingface.co/facebook/mms-tts) model, finetuned by [Matthijs](https://huggingface.co/Matthijs), for text-to-speech:
	![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
	"""

	def _audio_interface(source, **extra):
	    """Build one Gradio interface around the speech-to-speech pipeline.

	    Args:
	        source: Value for the input ``gr.Audio`` component's ``source``
	            argument ("microphone" or "upload" in this file).
	        **extra: Additional keyword arguments forwarded to ``gr.Interface``.

	    Returns:
	        The configured ``gr.Interface``.
	    """
	    # NOTE(review): `source=` is the Gradio 3.x spelling; newer releases use
	    # `sources=[...]` -- confirm against the pinned Gradio version.
	    return gr.Interface(
	        fn=speech_to_speech_translation,
	        inputs=gr.Audio(source=source, type="filepath"),
	        outputs=gr.Audio(label="Generated Speech", type="numpy"),
	        title=title,
	        description=description,
	        **extra,
	    )


	demo = gr.Blocks()

	# One tab records from the microphone, the other accepts an uploaded file
	# and ships with a bundled example clip.
	mic_translate = _audio_interface("microphone")
	file_translate = _audio_interface("upload", examples=[["./example.wav"]])

	with demo:
	    gr.TabbedInterface(
	        [mic_translate, file_translate],
	        ["Microphone", "Audio File"],
	    )

	demo.launch()