speech-to-speech-translation-ca

Runtime error

App Files Files Community

speech-to-speech-translation-ca / app.py

JanLilan

Update app.py

613f65f 10 months ago

raw

history blame contribute delete

5.05 kB

	import os
	import torch
	import gradio as gr
	import numpy as np
	import torch
	from datasets import load_dataset, Audio
	from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
	from speechbrain.pretrained import EncoderClassifier

	# Run every model on the first CUDA GPU when one is usable, otherwise on CPU.
	device = "cuda:0" if torch.cuda.is_available() else "cpu"

	# Speech front end: Whisper Base wrapped in an ASR pipeline.
	asr_pipe = pipeline(
	    task="automatic-speech-recognition",
	    model="openai/whisper-base",
	    device=device,
	)

	# Text processor of the base SpeechT5 TTS checkpoint.
	processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

	# The acoustic model uses the "JanLilan/speecht5_finetuned_openslr-slr69-cat"
	# weights instead of the base "microsoft/speecht5_tts" checkpoint.
	model = SpeechT5ForTextToSpeech.from_pretrained(
	    "JanLilan/speecht5_finetuned_openslr-slr69-cat"
	)
	model = model.to(device)

	# HiFi-GAN vocoder handed to `generate_speech` to produce the waveform.
	vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
	vocoder = vocoder.to(device)

	######################################################################################
	################################## SPEAKER EMBEDDING #################################
	######################################################################################
	# The synthesised voice is driven by an x-vector computed from one recording
	# of this dataset, so the dataset and a speaker encoder are loaded here.
	dataset = load_dataset(
	    "projecte-aina/openslr-slr69-ca-trimmed-denoised",
	    split="train",
	)
	# Have the "audio" column decoded at a 16 kHz sampling rate.
	dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

	# SpeechBrain x-vector speaker encoder; `savedir` points under /tmp.
	spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
	speaker_model = EncoderClassifier.from_hparams(
	    source=spk_model_name,
	    savedir=os.path.join("/tmp", spk_model_name),
	    run_opts={"device": device},
	)

	def create_speaker_embedding(waveform):
	    """Return the L2-normalised speaker embedding of ``waveform``.

	    The waveform is converted to a tensor, encoded with the module-level
	    ``speaker_model`` and normalised along dimension 2. The result is
	    squeezed, moved to the CPU and returned as a NumPy array.
	    """
	    samples = torch.tensor(waveform)
	    # Inference only: no gradients are needed for the encoder.
	    with torch.no_grad():
	        raw_embedding = speaker_model.encode_batch(samples)
	        unit_embedding = torch.nn.functional.normalize(raw_embedding, dim=2)
	        return unit_embedding.squeeze().cpu().numpy()

	# One speaker embedding is needed; it is computed with the SpeechT5 processor.
	# `processor` was already loaded from this exact checkpoint during model
	# setup earlier in the file, so it is reused here instead of being loaded a
	# second time.
	checkpoint = "microsoft/speecht5_tts"

	# Turn one dataset row into processor features plus a speaker embedding.
	def prepare_dataset(example):
	    """Process a dataset row and attach its speaker embedding.

	    ``example`` must provide ``"transcription"`` and an ``"audio"`` mapping
	    with ``"array"`` and ``"sampling_rate"``. The row is passed through the
	    module-level ``processor``. The returned object has ``"labels"`` reduced
	    to the first batch item and ``"speaker_embeddings"`` set to the x-vector
	    of the audio array.
	    """
	    audio = example["audio"]
	    waveform = audio["array"]

	    features = processor(
	        text=example["transcription"],
	        audio_target=waveform,
	        sampling_rate=audio["sampling_rate"],
	        return_attention_mask=False,
	    )

	    # The processor returns batched labels; keep only the single item.
	    features["labels"] = features["labels"][0]

	    # x-vector obtained with the SpeechBrain encoder.
	    features["speaker_embeddings"] = create_speaker_embedding(waveform)

	    return features

	# The recording at index 2 of the dataset supplies the reference voice.
	processed_example = prepare_dataset(dataset[2])
	speaker_embeddings = torch.tensor(processed_example["speaker_embeddings"])
	# Add a leading batch dimension.
	speaker_embeddings = speaker_embeddings.unsqueeze(0)

	# Alternative kept for reference: a precomputed x-vector from another dataset.
	# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
	# speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)


	def translate(audio):
	    """Run the ASR pipeline on ``audio`` and return the produced text.

	    Whisper is called with task "transcribe" and the language set to
	    "catalan", generating at most 256 new tokens.
	    NOTE(review): presumably forcing the language is what yields Catalan
	    text for non-Catalan speech, as the demo description suggests - confirm.
	    """
	    generation_options = {"task": "transcribe", "language": "catalan"}
	    result = asr_pipe(
	        audio,
	        max_new_tokens=256,
	        generate_kwargs=generation_options,
	    )
	    return result["text"]


	def synthesise(text):
	    """Generate speech for ``text`` and return it as a CPU tensor.

	    The text is tokenised with the module-level ``processor``. The model is
	    then called with the module-level ``speaker_embeddings`` and ``vocoder``.
	    """
	    token_ids = processor(text=text, return_tensors="pt")["input_ids"]
	    waveform = model.generate_speech(
	        token_ids.to(device),
	        speaker_embeddings.to(device),
	        vocoder=vocoder,
	    )
	    return waveform.cpu()


	def speech_to_speech_translation(audio):
	    """Convert input speech into synthesised speech for the Gradio UI.

	    Args:
	        audio: The value supplied by the Gradio audio input. The interfaces
	            in this file use ``type="filepath"``, so this is a path to the
	            recording, or ``None`` when nothing was recorded or uploaded.

	    Returns:
	        A ``(sampling_rate, samples)`` tuple, where ``sampling_rate`` is
	        16000 and ``samples`` is an ``int16`` NumPy array.

	    Raises:
	        gr.Error: If ``audio`` is ``None``.
	    """
	    # Fail fast with a message the UI can display instead of letting the ASR
	    # pipeline raise an opaque error on a missing input.
	    if audio is None:
	        raise gr.Error("No audio was provided. Please record or upload an audio clip.")

	    translated_text = translate(audio)
	    synthesised_speech = synthesise(translated_text)
	    # Scale the floating-point waveform to the 16-bit PCM range.
	    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
	    return 16000, synthesised_speech


	# Heading passed as `title` to both Gradio interfaces.
	title = "Demo STST - Multilingual to Català Speech"
	# Markdown text passed as `description` to both Gradio interfaces.
	description = """
	Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Català. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation to català, and Microsoft's
	[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech fine-tuned on [projecte-aina/openslr-slr69-ca-trimmed-denoised](https://huggingface.co/datasets/projecte-aina/openslr-slr69-ca-trimmed-denoised).

	This demo can be improve updating it with [projecte-aina/tts-ca-coqui-vits-multispeaker](https://huggingface.co/projecte-aina/tts-ca-coqui-vits-multispeaker) model:

	![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
	"""

	# Container that hosts the tabbed UI.
	demo = gr.Blocks()

	# Both tabs share the callback, title and description; only the audio
	# source (and the bundled example for uploads) differs.
	# NOTE(review): `source=` is the Gradio 3.x spelling; later major versions
	# use `sources=[...]` - confirm against the Gradio version pinned for this app.
	_shared_options = {
	    "fn": speech_to_speech_translation,
	    "title": title,
	    "description": description,
	}

	mic_translate = gr.Interface(
	    inputs=gr.Audio(source="microphone", type="filepath"),
	    outputs=gr.Audio(label="Generated Speech", type="numpy"),
	    **_shared_options,
	)

	file_translate = gr.Interface(
	    inputs=gr.Audio(source="upload", type="filepath"),
	    outputs=gr.Audio(label="Generated Speech", type="numpy"),
	    examples=[["./example.wav"]],
	    **_shared_options,
	)

	_tab_interfaces = [mic_translate, file_translate]
	_tab_names = ["Microphone", "Audio File"]

	with demo:
	    gr.TabbedInterface(_tab_interfaces, _tab_names)

	demo.launch()