Spaces:

KevinGeng
/

Laronix_voice_quality_checking_system_MICROPHONE

Runtime error

App Files Files Community

KevinGeng committed on Dec 13, 2023

Commit

4497f4b

•

1 Parent(s): ac46a87

add new phoneme mode;

Browse files

Files changed (1) hide show

app.py +9 -10

app.py CHANGED Viewed

@@ -23,8 +23,8 @@ transformation = jiwer.Compose([
 # WPM part
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
-# processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
-# phoneme_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
 # phoneme_model =  pipeline(model="facebook/wav2vec2-xlsr-53-espeak-cv-ft")
 class ChangeSampleRate(nn.Module):
     def __init__(self, input_rate: int, output_rate: int):
@@ -79,10 +79,10 @@ def calc_mos(audio_path, ref):
     MOS_fig = Naturalness_Plot(AVA_MOS)
     # Phonemes per minute (PPM)
-    # with torch.no_grad():
-    #     logits = phoneme_model(out_wavs).logits
-    # phone_predicted_ids = torch.argmax(logits, dim=-1)
-    # phone_transcription = processor.batch_decode(phone_predicted_ids)
     # Disable PPM for now
     phone_transcription = ['D U M M Y']
@@ -95,8 +95,7 @@ def calc_mos(audio_path, ref):
     ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
     # pdb.set_trace()
-    # return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, phone_transcription, ppm, f0_db_fig
-    return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, f0_db_fig
 with open("local/description.md") as f:
@@ -118,6 +117,8 @@ iface = gr.Interface(
            gr.Textbox(placeholder="Intelligibility Score", label = "Intelligibility Score, range from 0 to 100, the higher the better", visible=False),
            gr.Plot(label="Intelligibility Score, range from 0 to 100, the higher the better", show_label=True, container=True),
            gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
            gr.Plot(label="Pitch Contour and dB Analysis", show_label=True, container=True)],
   title="Speech Analysis by Laronix AI",
   description=description,
@@ -125,7 +126,5 @@ iface = gr.Interface(
   examples=examples,
 )
         # Currently remove PPM and Phonemes
-        #    gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes", visible=False),
-        #    gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="Speaking Rate, Phonemes per minutes", visible=False),
 # add password to protect the interface
 iface.launch(share=False, auth=['Laronix', 'LaronixSLP'], auth_message="Authentication Required, ask kevin@laronix.com for password.\n Thanks for your cooperation!")

 # WPM part
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+processor = Wav2Vec2Processor.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
+model = Wav2Vec2ForCTC.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
 # phoneme_model =  pipeline(model="facebook/wav2vec2-xlsr-53-espeak-cv-ft")
 class ChangeSampleRate(nn.Module):
     def __init__(self, input_rate: int, output_rate: int):
     MOS_fig = Naturalness_Plot(AVA_MOS)
     # Phonemes per minute (PPM)
+    with torch.no_grad():
+        logits = phoneme_model(out_wavs).logits
+    phone_predicted_ids = torch.argmax(logits, dim=-1)
+    phone_transcription = processor.batch_decode(phone_predicted_ids)
     # Disable PPM for now
     phone_transcription = ['D U M M Y']
     ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
     # pdb.set_trace()
+    return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, phone_transcription, ppm, f0_db_fig
 with open("local/description.md") as f:
            gr.Textbox(placeholder="Intelligibility Score", label = "Intelligibility Score, range from 0 to 100, the higher the better", visible=False),
            gr.Plot(label="Intelligibility Score, range from 0 to 100, the higher the better", show_label=True, container=True),
            gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
+           gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes", visible=False),
+           gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="Speaking Rate, Phonemes per minutes", visible=False),
            gr.Plot(label="Pitch Contour and dB Analysis", show_label=True, container=True)],
   title="Speech Analysis by Laronix AI",
   description=description,
   examples=examples,
 )
         # Currently remove PPM and Phonemes
 # add password to protect the interface
 iface.launch(share=False, auth=['Laronix', 'LaronixSLP'], auth_message="Authentication Required, ask kevin@laronix.com for password.\n Thanks for your cooperation!")