Spaces:

KevinGeng
/

Laronix_voice_quality_checking_system_MICROPHONE

Runtime error

App Files Files Community

KevinGeng committed on Oct 9, 2023

Commit

4ea1ce4

•

1 Parent(s): f3b6079

Update app.py

Browse files

1. Support multi-channel audio (stereo input is averaged down to mono).
2. Better recognition rate (switched ASR to a fine-tuned Whisper model).

Files changed (1) hide show

app.py +15 -11

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
 from random import sample
 import gradio as gr
 import torchaudio
@@ -10,8 +9,12 @@ import jiwer
 # ASR part
 from transformers import pipeline
-p = pipeline("automatic-speech-recognition")
 # WER part
 transformation = jiwer.Compose([
     jiwer.ToLowerCase(),
@@ -44,7 +47,9 @@ class ChangeSampleRate(nn.Module):
 model = lightning_module.BaselineLightningModule.load_from_checkpoint("epoch=3-step=7459.ckpt").eval()
 def calc_mos(audio_path, ref):
-    wav, sr = torchaudio.load(audio_path)
     osr = 16_000
     batch = wav.unsqueeze(0).repeat(10, 1, 1)
     csr = ChangeSampleRate(sr, osr)
@@ -73,6 +78,7 @@ def calc_mos(audio_path, ref):
     return predic_mos, trans, wer, phone_transcription, ppm
 description ="""
 MOS prediction demo using UTMOS-strong w/o phoneme encoder model, which is trained on the main track dataset.
 This demo only accepts .wav format. Best at 16 kHz sampling rate.
@@ -86,15 +92,13 @@ Add WER interface.
 iface = gr.Interface(
   fn=calc_mos,
-  inputs=[gr.Audio(source="microphone", type='filepath', label="Audio to evaluate"),
-          gr.Textbox(value="Once upon a time there was a young rat named Arthur who couldn’t make up his mind.",
-                    placeholder="Input reference here",
-                    label="Reference")],
-  outputs=[gr.Textbox(placeholder="Predicted MOS", label="Predicted MOS"),
            gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
-           gr.Textbox(placeholder="Word Error Rate", label = "WER"),
            gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes"),
-           gr.Textbox(placeholder="Phonemes per minutes", label="PPM")],
   title="Laronix's Voice Quality Checking System Demo",
   description=description,
   allow_flagging="auto",

 from random import sample
 import gradio as gr
 import torchaudio
 # ASR part
 from transformers import pipeline
+# p = pipeline("automatic-speech-recognition")
+p = pipeline(
+    "automatic-speech-recognition",
+    model="KevinGeng/whipser_medium_en_PAL300_step25",
+    device=0,
+)
 # WER part
 transformation = jiwer.Compose([
     jiwer.ToLowerCase(),
 model = lightning_module.BaselineLightningModule.load_from_checkpoint("epoch=3-step=7459.ckpt").eval()
 def calc_mos(audio_path, ref):
+    wav, sr = torchaudio.load(audio_path, channels_first=True)
+    if wav.shape[0] > 1:
+        wav = wav.mean(dim=0, keepdim=True) # Mono channel
     osr = 16_000
     batch = wav.unsqueeze(0).repeat(10, 1, 1)
     csr = ChangeSampleRate(sr, osr)
     return predic_mos, trans, wer, phone_transcription, ppm
 description ="""
 MOS prediction demo using UTMOS-strong w/o phoneme encoder model, which is trained on the main track dataset.
 This demo only accepts .wav format. Best at 16 kHz sampling rate.
 iface = gr.Interface(
   fn=calc_mos,
+  inputs=[gr.Audio(type='filepath', label="Audio to evaluate"),
+          gr.Textbox(placeholder="Input reference here (Don't keep this empty)", label="Reference")],
+  outputs=[gr.Textbox(placeholder="Naturalness evaluation, ranged 1 to 5, the higher the better.", label="Predicted MOS"),
            gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
+           gr.Textbox(placeholder="Word Error Rate: Only valid when Reference is given", label = "WER"),
            gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes"),
+           gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="PPM")],
   title="Laronix's Voice Quality Checking System Demo",
   description=description,
   allow_flagging="auto",