add MOS and WER conversion
Browse files- .gitignore +22 -0
- app.py +11 -3
- local/WER2INTELI.png +0 -0
- local/convert_metrics.py +71 -0
- local/nat2avaMOS.png +0 -0
.gitignore
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Python
|
2 |
+
__pycache__/
|
3 |
+
*.pyc
|
4 |
+
*.pyo
|
5 |
+
*.pyd
|
6 |
+
__pycache__/
|
7 |
+
*.db
|
8 |
+
*.sqlite3
|
9 |
+
*.sqlite
|
10 |
+
*.log
|
11 |
+
*.bak
|
12 |
+
*.swp
|
13 |
+
*.swo
|
14 |
+
*.tmp
|
15 |
+
*.tmp.*
|
16 |
+
*~
|
17 |
+
|
18 |
+
# flagged
|
19 |
+
flagged/
|
20 |
+
|
21 |
+
#
|
22 |
+
*.wav
|
app.py
CHANGED
@@ -6,6 +6,7 @@ import torch.nn as nn
|
|
6 |
import lightning_module
|
7 |
import pdb
|
8 |
import jiwer
|
|
|
9 |
|
10 |
# ASR part
|
11 |
from transformers import pipeline
|
@@ -54,6 +55,10 @@ def calc_mos(audio_path, ref):
|
|
54 |
trans = p(audio_path)["text"]
|
55 |
# WER
|
56 |
wer = jiwer.wer(ref, trans, truth_transform=transformation, hypothesis_transform=transformation)
|
|
|
|
|
|
|
|
|
57 |
# MOS
|
58 |
batch = {
|
59 |
'wav': out_wavs,
|
@@ -63,6 +68,8 @@ def calc_mos(audio_path, ref):
|
|
63 |
with torch.no_grad():
|
64 |
output = model(batch)
|
65 |
predic_mos = output.mean(dim=1).squeeze().detach().numpy()*2 + 3
|
|
|
|
|
66 |
# Phonemes per minute (PPM)
|
67 |
with torch.no_grad():
|
68 |
logits = phoneme_model(out_wavs).logits
|
@@ -72,7 +79,8 @@ def calc_mos(audio_path, ref):
|
|
72 |
wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
|
73 |
ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
|
74 |
|
75 |
-
return
|
|
|
76 |
|
77 |
|
78 |
description ="""
|
@@ -91,9 +99,9 @@ iface = gr.Interface(
|
|
91 |
fn=calc_mos,
|
92 |
inputs=[gr.Audio(source='microphone', type="filepath", label="Audio to evaluate"),
|
93 |
gr.Textbox(placeholder="Input reference here (Don't keep this empty)", label="Reference")],
|
94 |
-
outputs=[gr.Textbox(placeholder="Naturalness
|
|
|
95 |
gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
|
96 |
-
gr.Textbox(placeholder="Word Error Rate: Only valid when Reference is given", label = "WER"),
|
97 |
gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes"),
|
98 |
gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="PPM")],
|
99 |
title="Laronix's Voice Quality Checking System Demo",
|
|
|
6 |
import lightning_module
|
7 |
import pdb
|
8 |
import jiwer
|
9 |
+
from local.convert_metrics import nat2avaMOS, WER2INTELI
|
10 |
|
11 |
# ASR part
|
12 |
from transformers import pipeline
|
|
|
55 |
trans = p(audio_path)["text"]
|
56 |
# WER
|
57 |
wer = jiwer.wer(ref, trans, truth_transform=transformation, hypothesis_transform=transformation)
|
58 |
+
|
59 |
+
# Convert WER (as a percentage) to an Intelligibility score
|
60 |
+
INTELI_score = WER2INTELI(wer*100)
|
61 |
+
|
62 |
# MOS
|
63 |
batch = {
|
64 |
'wav': out_wavs,
|
|
|
68 |
with torch.no_grad():
|
69 |
output = model(batch)
|
70 |
predic_mos = output.mean(dim=1).squeeze().detach().numpy()*2 + 3
|
71 |
+
# MOS to AVA MOS
|
72 |
+
AVA_MOS = nat2avaMOS(predic_mos)
|
73 |
# Phonemes per minute (PPM)
|
74 |
with torch.no_grad():
|
75 |
logits = phoneme_model(out_wavs).logits
|
|
|
79 |
wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
|
80 |
ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
|
81 |
|
82 |
+
return AVA_MOS, INTELI_score, trans, phone_transcription, ppm
|
83 |
+
|
84 |
|
85 |
|
86 |
description ="""
|
|
|
99 |
fn=calc_mos,
|
100 |
inputs=[gr.Audio(source='microphone', type="filepath", label="Audio to evaluate"),
|
101 |
gr.Textbox(placeholder="Input reference here (Don't keep this empty)", label="Reference")],
|
102 |
+
outputs=[gr.Textbox(placeholder="Naturalness Score", label="Naturalness Score, ranged from 0 to 5, the higher the better."),
|
103 |
+
gr.Textbox(placeholder="Intelligibility Score", label = "Intelligibility Score, range from 0 to 100, the higher the better"),
|
104 |
gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
|
|
|
105 |
gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes"),
|
106 |
gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="PPM")],
|
107 |
title="Laronix's Voice Quality Checking System Demo",
|
local/WER2INTELI.png
ADDED
local/convert_metrics.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
|
4 |
+
# Natural MOS to AVA MOS
|
5 |
+
|
6 |
+
def linear_function(x):
    """Return the linear segment of the MOS mapping: ``8 * x - 8``.

    Evaluates to 0 at x = 1 and to 4 at x = 1.5.
    """
    return 8 * x - 8


def quadratic_function(x):
    """Return the quadratic segment of the MOS mapping.

    Computes ``-0.0816 * (x - 5) ** 2 + 5``, which peaks at 5 when x = 5 and
    is approximately 4 at x = 1.5, so it joins the linear segment there.
    """
    return -0.0816 * (x - 5) ** 2 + 5


# Natural MOS to AVA MOS
def nat2avaMOS(x):
    """Map a naturalness MOS value onto the AVA MOS scale.

    The mapping is piecewise: ``linear_function`` for x <= 1.5 and
    ``quadratic_function`` for 1.5 < x <= 5.

    The input is clamped to [1, 5] before mapping. Without the clamp a value
    above 5 matched no branch and the function returned ``None``, and a value
    below 1 produced a negative score; with it the result always lies in
    [0, 5].

    Args:
        x: Scalar naturalness MOS (a Python number or a 0-d NumPy value).

    Returns:
        The converted score, in the range 0 to 5.
    """
    # Keep the score inside the domain the two segments were designed for.
    x = min(max(x, 1.0), 5.0)
    if x <= 1.5:
        return linear_function(x)
    return quadratic_function(x)
|
18 |
+
|
19 |
+
# Word error rate to Intelligibility score (x is a percentage)
|
20 |
+
def WER2INTELI(x):
    """Convert a word error rate (in percent) to an intelligibility score.

    The mapping is piecewise:
      * x <= 10        -> 100
      * 10 < x <= 100  -> straight line from (10, 100) down to (100, 30)
      * x > 100        -> exponential decay starting from 30 at x = 100

    The exponential tail is anchored at 30 so that it continues from the end
    of the linear segment. Anchoring it at 100 made the score jump from 30
    back up to almost 100 as soon as the WER exceeded 100 %.

    Args:
        x: Word error rate as a percentage (e.g. 25 for 25 %). It can exceed
            100; the caller passes ``wer * 100`` uncapped.

    Returns:
        The intelligibility score, at most 100.
    """
    if x <= 10:
        return 100
    if x <= 100:
        slope = (30 - 100) / (100 - 10)
        intercept = 100 - slope * 10
        return slope * x + intercept
    # Decay from the value reached at x = 100 so the curve stays continuous
    # and monotonically non-increasing.
    return 30 * np.exp(-0.01 * (x - 100))
|
29 |
+
|
30 |
+
# # Generate the x values
|
31 |
+
# x = np.linspace(0, 200, 400)  # 400 points from 0 to 200
|
32 |
+
|
33 |
+
# # Compute the corresponding y values
|
34 |
+
# y = [WER2INTELI(xi) for xi in x]
|
35 |
+
|
36 |
+
# # Plot the function
|
37 |
+
# plt.plot(x, y)
|
38 |
+
# plt.xlabel('x')
|
39 |
+
# plt.ylabel('f(x)')
|
40 |
+
# plt.title('Custom Function')
|
41 |
+
# plt.grid(True)
|
42 |
+
# plt.show()
|
43 |
+
|
44 |
+
# # Generate the x ranges
|
45 |
+
# x1 = np.linspace(1, 1.5, 100)
|
46 |
+
# x2 = np.linspace(1.5, 5, 100)
|
47 |
+
|
48 |
+
# # Compute the corresponding y values
|
49 |
+
# y1 = linear_function(x1)
|
50 |
+
# y2 = quadratic_function(x2)
|
51 |
+
|
52 |
+
# # Plot the linear segment
|
53 |
+
# plt.plot(x1, y1, label='Linear Function (1 <= x <= 1.5)')
|
54 |
+
|
55 |
+
# # Plot the quadratic segment
|
56 |
+
# plt.plot(x2, y2, label='Quadratic Function (1.5 <= x <= 5)')
|
57 |
+
|
58 |
+
# # Add axis labels and a title
|
59 |
+
# plt.xlabel('Natural Mean Opinion Score')
|
60 |
+
# plt.ylabel('AVA Mean Opinion Score')
|
61 |
+
# plt.title('nat2avaMOS')
|
62 |
+
|
63 |
+
# # Add the legend
|
64 |
+
# plt.legend()
|
65 |
+
|
66 |
+
# # Show the grid on the figure
|
67 |
+
# plt.grid(True)
|
68 |
+
|
69 |
+
# # Save the figure to disk
|
70 |
+
# plt.savefig("./local/nat2avaMOS.png")
|
71 |
+
# plt.savefig("./local/WER2INT.png")
|
local/nat2avaMOS.png
ADDED