Disable PPM
Browse files
app.py
CHANGED
@@ -23,8 +23,8 @@ transformation = jiwer.Compose([
|
|
23 |
|
24 |
# WPM part
|
25 |
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
26 |
-
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
|
27 |
-
phoneme_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
|
28 |
# phoneme_model = pipeline(model="facebook/wav2vec2-xlsr-53-espeak-cv-ft")
|
29 |
class ChangeSampleRate(nn.Module):
|
30 |
def __init__(self, input_rate: int, output_rate: int):
|
@@ -79,10 +79,13 @@ def calc_mos(audio_path, ref):
|
|
79 |
MOS_fig = Naturalness_Plot(AVA_MOS)
|
80 |
|
81 |
# Phonemes per minute (PPM)
|
82 |
-
with torch.no_grad():
|
83 |
-
|
84 |
-
phone_predicted_ids = torch.argmax(logits, dim=-1)
|
85 |
-
phone_transcription = processor.batch_decode(phone_predicted_ids)
|
|
|
|
|
|
|
86 |
lst_phonemes = phone_transcription[0].split(" ")
|
87 |
wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
|
88 |
|
@@ -92,7 +95,7 @@ def calc_mos(audio_path, ref):
|
|
92 |
ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
|
93 |
|
94 |
# pdb.set_trace()
|
95 |
-
return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, phone_transcription, ppm
|
96 |
|
97 |
|
98 |
with open("local/description.md") as f:
|
|
|
23 |
|
24 |
# WPM part
|
25 |
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
26 |
+
# processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
|
27 |
+
# phoneme_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
|
28 |
# phoneme_model = pipeline(model="facebook/wav2vec2-xlsr-53-espeak-cv-ft")
|
29 |
class ChangeSampleRate(nn.Module):
|
30 |
def __init__(self, input_rate: int, output_rate: int):
|
|
|
79 |
MOS_fig = Naturalness_Plot(AVA_MOS)
|
80 |
|
81 |
# Phonemes per minute (PPM)
|
82 |
+
# with torch.no_grad():
|
83 |
+
# logits = phoneme_model(out_wavs).logits
|
84 |
+
# phone_predicted_ids = torch.argmax(logits, dim=-1)
|
85 |
+
# phone_transcription = processor.batch_decode(phone_predicted_ids)
|
86 |
+
|
87 |
+
# Disable PPM for now
|
88 |
+
phone_transcription = None
|
89 |
lst_phonemes = phone_transcription[0].split(" ")
|
90 |
wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
|
91 |
|
|
|
95 |
ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
|
96 |
|
97 |
# pdb.set_trace()
|
98 |
+
return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, phone_transcription, ppm, f0_db_fig
|
99 |
|
100 |
|
101 |
with open("local/description.md") as f:
|