Spaces:

KevinGeng
/

Laronix_voice_quality_checking_system_FILEIO

Build error

App Files Files Community

Laronix_voice_quality_checking_system_FILEIO / app.py

KevinGeng

test python files

b33c328 about 2 years ago

raw

history blame

2.54 kB


	from random import sample
	import gradio as gr
	import torchaudio
	import torch
	import torch.nn as nn
	import lightning_module
	import pdb
	import jiwer
	# ASR part
	from transformers import pipeline
	# Speech-recognition pipeline, constructed once when the module is imported
	# and reused by every calc_mos() call. No model name is passed here, so the
	# checkpoint is whatever transformers selects by default for this task.
	# NOTE(review): consider pinning an explicit model/revision so transcripts
	# do not change when the library default changes.
	p = pipeline("automatic-speech-recognition")

	# WER part
	# Text normalisation chain handed to jiwer.wer() in calc_mos() for both the
	# reference and the ASR hypothesis, so the two are compared on equal terms.
	transformation = jiwer.Compose([
	# Make the comparison case-insensitive.
	jiwer.ToLowerCase(),
	# Turn whitespace characters into plain spaces ...
	jiwer.RemoveWhiteSpace(replace_by_space=True),
	# ... then collapse any resulting runs of spaces.
	jiwer.RemoveMultipleSpaces(),
	# Final step: split each sentence on single spaces into a list of words.
	jiwer.ReduceToListOfListOfWords(word_delimiter=" ")
	])

	class ChangeSampleRate(nn.Module):
	    """Resample a waveform by linear interpolation between adjacent samples.

	    Each output sample ``k`` is read at the fractional input position
	    ``k * input_rate / output_rate`` and blended from the two input samples
	    surrounding that position. No anti-aliasing filter is applied.

	    Args:
	        input_rate: Sample rate of the incoming waveform.
	        output_rate: Sample rate of the returned waveform.
	    """

	    def __init__(self, input_rate: int, output_rate: int):
	        super().__init__()
	        self.output_rate = output_rate
	        self.input_rate = input_rate

	    def forward(self, wav: torch.Tensor) -> torch.Tensor:
	        """Return ``wav`` resampled from ``input_rate`` to ``output_rate``.

	        Args:
	            wav: Waveform tensor. It is flattened to
	                ``(wav.size(0), -1)``, so every dimension after the first is
	                merged into the time axis; a 3-D input is therefore only
	                meaningful with a single channel.

	        Returns:
	            Tensor of shape ``(wav.size(0), new_length)`` where
	            ``new_length = time * output_rate // input_rate``.
	        """
	        # Only accepts 1-channel waveform input
	        # reshape (rather than view) also accepts non-contiguous tensors.
	        wav = wav.reshape(wav.size(0), -1)
	        num_samples = wav.size(-1)
	        new_length = num_samples * self.output_rate // self.input_rate

	        # Fractional read positions in the input, created on the same device
	        # as the waveform so the blend below never mixes devices.
	        positions = torch.arange(new_length, device=wav.device) * (self.input_rate / self.output_rate)

	        lower = positions.long()
	        # The neighbour to the right is clamped so the last sample is reused
	        # instead of indexing past the end of the waveform.
	        upper = (lower + 1).clamp(max=num_samples - 1)
	        # Weight of the right-hand neighbour; broadcast over the first axis.
	        frac = positions.fmod(1.).unsqueeze(0)

	        return wav[:, lower] * (1. - frac) + wav[:, upper] * frac

	# MOS predictor restored from the checkpoint file and switched to eval mode;
	# loaded once at import time and shared by every request.
	model = lightning_module.BaselineLightningModule.load_from_checkpoint("epoch=3-step=7459.ckpt").eval()


	def calc_mos(audio_path: str, ref: str):
	    """Score one audio file: predicted MOS, ASR transcript and WER.

	    Args:
	        audio_path: Path of the audio file to evaluate.
	        ref: Reference transcript the ASR output is compared against.

	    Returns:
	        A tuple ``(predic_mos, trans, wer)``: the model output averaged over
	        dim 1 and rescaled with ``* 2 + 3`` (as a NumPy value), the text
	        returned by the ASR pipeline, and the word error rate of that text
	        against ``ref``.
	    """
	    wav, sr = torchaudio.load(audio_path)
	    # Resample to the rate fed to the MOS model.
	    osr = 16_000
	    csr = ChangeSampleRate(sr, osr)
	    out_wavs = csr(wav)
	    # NOTE(review): a multi-channel file reaches the model with one row per
	    # channel while 'domains'/'judge_id' below hold a single entry - confirm
	    # whether inputs should be down-mixed to mono first.

	    # ASR
	    trans = p(audio_path)["text"]
	    # WER
	    # The same normalisation is applied to the reference and the hypothesis.
	    wer = jiwer.wer(ref, trans, truth_transform=transformation, hypothesis_transform=transformation)

	    # NOTE(review): domain 0 and judge id 288 are fixed conditioning values;
	    # their meaning is defined by the training setup, not visible here.
	    batch = {
	        'wav': out_wavs,
	        'domains': torch.tensor([0]),
	        'judge_id': torch.tensor([288])
	    }
	    with torch.no_grad():
	        output = model(batch)

	    # Presumably maps a normalised model score back onto the 1-5 MOS scale -
	    # TODO confirm against the training code.
	    predic_mos = output.mean(dim=1).squeeze().detach().numpy()*2 + 3

	    return predic_mos, trans, wer

	# Markdown text shown under the title of the demo page.
	description = """
	MOS prediction demo using UTMOS-strong w/o phoneme encoder model, which is trained on the main track dataset.
	This demo only accepts .wav format. Best at 16 kHz sampling rate.

	Paper is available [here](https://arxiv.org/abs/2204.02152)
	"""

	# Web UI: one audio file plus a reference transcript in; predicted MOS, ASR
	# hypothesis and WER out (same order as the tuple returned by calc_mos).
	iface = gr.Interface(
	    fn=calc_mos,
	    inputs=[gr.Audio(type='filepath'), gr.Textbox(placeholder="Insert referance here", label="Referance")],
	    # The first positional argument of gr.Textbox is its initial value, so
	    # the captions must be passed as label=; otherwise the boxes start
	    # pre-filled with the caption text and carry no label.
	    outputs=[gr.Textbox(label="Predicted MOS"), gr.Textbox(label="Hypothesis"), gr.Textbox(label="WER")],
	    title="UTMOS Demo",
	    description=description,
	    allow_flagging="auto",
	)
	iface.launch()