gradio-4 support

Files changed:
- app.py (+26 -21)
- local/description.md (+5 -0)
- local/indicator_plot.py (+112 -0)
- requirements.txt (+8 -96)
app.py
CHANGED
@@ -7,6 +7,7 @@ import lightning_module
 import pdb
 import jiwer
 from local.convert_metrics import nat2avaMOS, WER2INTELI
+from local.indicator_plot import Intelligibility_Plot, Naturalness_Plot
 
 # ASR part
 from transformers import pipeline
@@ -55,10 +56,11 @@ def calc_mos(audio_path, ref):
     trans = p(audio_path)["text"]
     # WER
     wer = jiwer.wer(ref, trans, truth_transform=transformation, hypothesis_transform=transformation)
-
     # WER convert to Intellibility score
     INTELI_score = WER2INTELI(wer*100)
 
+    INT_fig = Intelligibility_Plot(INTELI_score)
+
     # MOS
     batch = {
         'wav': out_wavs,
@@ -70,6 +72,9 @@ def calc_mos(audio_path, ref):
     predic_mos = output.mean(dim=1).squeeze().detach().numpy()*2 + 3
     # MOS to AVA MOS
     AVA_MOS = nat2avaMOS(predic_mos)
+
+    MOS_fig = Naturalness_Plot(AVA_MOS)
+
     # Phonemes per minute (PPM)
     with torch.no_grad():
         logits = phoneme_model(out_wavs).logits
@@ -78,34 +83,34 @@ def calc_mos(audio_path, ref):
     lst_phonemes = phone_transcription[0].split(" ")
     wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
     ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
-
-    return AVA_MOS, INTELI_score, trans, phone_transcription, ppm
+    # pdb.set_trace()
+    return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, phone_transcription, ppm
 
 
+with open("local/description.md") as f:
+    description = f.read()
 
-
-
-
-
-
-
-Add ASR based on wav2vec-960, currently only English available.
-This is a lite version of ASR, delievring faster calculation and compromise to recognition performance
-Add WER interface by Laronix Pty LTD
-"""
-
-
+# calc_mos("audio_2023-11-01_15-57-39.wav", "hello world")
+# pdb.set_trace()
+examples = [
+    [None, "Once upon a time, there was a young rat named Arthur who couldn't make up his mind."],
+    [None, "Whenever the other rats asked Arthur if he wanted to go to the park, he would say, 'I don't know.'"],
+]
 iface = gr.Interface(
     fn=calc_mos,
-    inputs=[gr.Audio(
+    inputs=[gr.Audio(type='filepath', label="Audio to evaluate"),
             gr.Textbox(placeholder="Input reference here (Don't keep this empty)", label="Reference")],
-    outputs=[gr.Textbox(placeholder="Naturalness Score", label="Naturalness Score, ranged from 0 to 5, the higher the better."),
-             gr.
+    outputs=[gr.Textbox(placeholder="Naturalness Score, ranged from 0 to 5, the higher the better.", label="Naturalness Score, ranged from 0 to 5, the higher the better.", visible=False),
+             gr.Plot(label="Naturalness Score, ranged from 0 to 5, the higher the better.", show_label=True, container=True),
+             gr.Textbox(placeholder="Intelligibility Score", label="Intelligibility Score, range from 0 to 100, the higher the better", visible=False),
+             gr.Plot(label="Intelligibility Score, range from 0 to 100, the higher the better", show_label=True, container=True),
              gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
-             gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes"),
-             gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="
-    title="
+             gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes", visible=False),
+             gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="Speaking Rate, Phonemes per minutes", visible=False)],
+    title="Speech Analysis by Laronix AI",
     description=description,
     allow_flagging="auto",
+    examples=examples,
 )
+
 iface.launch()
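The reworked interface above follows one pattern throughout: each raw numeric score keeps a `gr.Textbox` output hidden with `visible=False`, a `gr.Plot` gauge is shown in its place, and `calc_mos` returns one value per declared output, in order. Below is a minimal sketch of that wiring, assuming gradio 4.x and plotly are installed; the `analyze` stub is hypothetical, standing in for `calc_mos`.

```python
import gradio as gr
import plotly.graph_objects as go

def analyze(audio_path, ref):
    # Hypothetical stand-in for calc_mos: returns one value per declared
    # output component, in the same order as the outputs list below.
    fig = go.Figure(go.Indicator(mode="number+gauge", value=4.2,
                                 gauge={"shape": "bullet"}))
    return 4.2, fig, 87.0, fig, "hypothesis text", "h ɛ l oʊ", 250.0

iface = gr.Interface(
    fn=analyze,
    inputs=[gr.Audio(type="filepath", label="Audio to evaluate"),
            gr.Textbox(label="Reference")],
    outputs=[gr.Textbox(visible=False),     # raw naturalness score, hidden
             gr.Plot(label="Naturalness"),  # gauge shown instead
             gr.Textbox(visible=False),     # raw intelligibility score, hidden
             gr.Plot(label="Intelligibility"),
             gr.Textbox(label="Hypothesis"),
             gr.Textbox(visible=False),     # phonemes, hidden
             gr.Textbox(visible=False)],    # speaking rate, hidden
)

if __name__ == "__main__":
    iface.launch()
```

Keeping the hidden Textboxes means the raw scores still flow through flagging and the API, while users only see the gauges.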
local/description.md
ADDED
@@ -0,0 +1,5 @@
+## Let’s get started!
++ Use this in a quiet environment.
++ Make sure your microphone is working and attached.
++ If prompted in a pop-up, allow the browser access to your microphone.
++ When you’re ready, hit record and read the reference sentence below:
local/indicator_plot.py
ADDED
@@ -0,0 +1,112 @@
+import plotly.graph_objects as go
+
+def Intelligibility_Plot(Int_Score, fair_thre=30, good_thre=70, Upper=100, Lower=0):
+    '''
+    Int_Score: a float number between 0 and 100
+    Upper: the upper bound of the plot
+    Lower: the lower bound of the plot
+    '''
+    # Assert Int_Score is a float number between 0 and 100
+    assert isinstance(Int_Score, float | int)
+    assert Int_Score >= Lower
+    assert Int_Score <= Upper
+    # Indicator plot in different colors: under fair_thre the bar is red, then yellow, then green
+    # Design 1: show the bar in a color chosen by the thresholds
+
+    color = "#75DA99"
+    if Int_Score <= fair_thre:
+        color = "#F2ADA0"
+    elif Int_Score <= good_thre:
+        color = "#e8ee89"
+    else:
+        color = "#75DA99"
+
+    fig = go.Figure(go.Indicator(
+        mode="number+gauge",
+        gauge={'shape': "bullet",
+               'axis': {'range': [Lower, Upper+10]},
+               'bgcolor': 'white',
+               'bar': {'color': color},
+               },
+        value=Int_Score,
+        domain={'x': [0, 1], 'y': [0, 1]},
+        )
+    )
+    # # Design 2: show all thresholds in the background
+    # fig = go.Figure(go.Indicator(
+    #     mode = "number+gauge",
+    #     gauge = {'shape': "bullet",
+    #              'axis': {'range': [Lower, Upper]},
+    #              'bgcolor': 'white',
+    #              'steps': [
+    #                  {'range': [Lower, fair_thre], 'color': "#F2ADA0"},
+    #                  {'range': [fair_thre, good_thre], 'color': "#e8ee89"},
+    #                  {'range': [good_thre, Upper], 'color': "#75DA99"}],
+    #              'bar': {'color': "grey"},
+    #              },
+    #     value = Int_Score,
+    #     domain = {'x': [0, 1], 'y': [0, 1]},
+    #     )
+    # )
+    fig.update_layout(
+        autosize=False,
+        width=650,
+        height=250,
+        margin=dict(
+            l=10,
+            r=10,
+            b=10,
+            t=10,
+            pad=4
+        ),)
+    return fig
+
+def Naturalness_Plot(Nat_Score, fair_thre=2, good_thre=4, Upper=5, Lower=0):
+    '''
+    Nat_Score: a float number between 0 and 5
+    Upper: the upper bound of the plot
+    Lower: the lower bound of the plot
+    '''
+    # Assert Nat_Score is a float number between 0 and 5
+    assert isinstance(Nat_Score, float)
+    assert Nat_Score >= Lower
+    assert Nat_Score <= Upper
+
+    color = "#75DA99"
+    if Nat_Score <= fair_thre:
+        color = "#F2ADA0"
+    elif Nat_Score <= good_thre:
+        color = "#e8ee89"
+    else:
+        color = "#75DA99"
+    fig = go.Figure(go.Indicator(
+        mode="number+gauge",
+        gauge={'shape': "bullet",
+               'axis': {'range': [Lower, Upper+0.4]},
+               'bar': {'color': color}},
+        value=Nat_Score,
+        domain={'x': [0, 1], 'y': [0, 1]},
+        )
+    )
+    fig.update_layout(
+        autosize=False,
+        width=650,
+        height=250,
+        margin=dict(
+            l=10,
+            r=10,
+            b=10,
+            t=10,
+            pad=4
+        ),)
+    return fig
+
+# test cases for Intelligibility_Plot and Naturalness_Plot
+# x = Intelligibility_Plot(10)
+# x.show()
+# x = Naturalness_Plot(3.5)
+# x.show()
+# x = Intelligibility_Plot(50)
+# x.show()
+# x = Intelligibility_Plot(90)
+# x.show()
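The commented test cases at the bottom of the new module can be run directly. A minimal sketch, assuming plotly is installed and Python 3.10+ (the `float | int` check in `Intelligibility_Plot` uses PEP 604 union syntax); note that `Naturalness_Plot` asserts a plain `float`, so pass 3.5 rather than 3:

```python
# Mirrors the commented test cases at the bottom of local/indicator_plot.py.
from local.indicator_plot import Intelligibility_Plot, Naturalness_Plot

Intelligibility_Plot(10).show()   # <= fair_thre (30): red bar (#F2ADA0)
Intelligibility_Plot(50).show()   # <= good_thre (70): yellow bar (#e8ee89)
Intelligibility_Plot(90).show()   # above good_thre: green bar (#75DA99)
Naturalness_Plot(3.5).show()      # between 2 and 4 on the 0-5 scale: yellow bar
```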
requirements.txt
CHANGED
@@ -1,101 +1,13 @@
-
-
-
-
-antlr4-python3-runtime==4.8
-anyio==3.5.0
-asgiref==3.5.0
-async-timeout==4.0.2
-attrs==21.4.0
-backoff==1.10.0
-bcrypt==3.2.0
-bitarray==2.4.0
-cachetools==5.0.0
-certifi==2021.10.8
-cffi==1.15.0
-charset-normalizer==2.0.12
-click==8.0.4
-colorama==0.4.4
-cryptography==36.0.1
-cycler==0.11.0
-Cython==0.29.28
-fairseq @ git+https://github.com/pytorch/fairseq.git@d03f4e771484a433f025f47744017c2eb6e9c6bc
-fastapi==0.75.0
-ffmpy==0.3.0
-fonttools==4.30.0
-frozenlist==1.3.0
-fsspec==2022.2.0
-future==0.18.2
-google-auth==2.6.0
-google-auth-oauthlib==0.4.6
-gradio==3.46.1
-grpcio==1.44.0
-h11==0.12.0
-hydra-core==1.0.7
-idna==3.3
-importlib-metadata==4.11.3
-Jinja2==3.0.3
-kiwisolver==1.3.2
-linkify-it-py==1.0.3
-Markdown==3.3.6
-markdown-it-py==2.0.1
-MarkupSafe==2.1.0
-matplotlib==3.5.1
-mdit-py-plugins==0.3.0
-mdurl==0.1.0
-monotonic==1.6
-multidict==6.0.2
-numpy==1.22.3
-oauthlib==3.2.0
-omegaconf==2.0.6
-orjson==3.6.7
-packaging==21.3
-pandas==1.4.1
-paramiko==2.10.1
-Pillow==9.0.1
-portalocker==2.4.0
-protobuf==3.19.4
-pyasn1==0.4.8
-pyasn1-modules==0.2.8
-pycparser==2.21
-pycryptodome==3.14.1
-pydantic==1.9.0
-pyDeprecate==0.3.1
-pydub==0.25.1
-PyNaCl==1.5.0
-pyparsing==3.0.7
-python-dateutil==2.8.2
-python-multipart==0.0.5
-pytorch-lightning==1.5.10
-pytz==2021.3
-PyYAML==6.0
-regex==2022.3.2
-requests==2.27.1
-requests-oauthlib==1.3.1
-rsa==4.8
-sacrebleu==2.0.0
-six==1.16.0
-sniffio==1.2.0
-starlette==0.17.1
-tabulate==0.8.9
-tensorboard==2.8.0
-tensorboard-data-server==0.6.1
-tensorboard-plugin-wit==1.8.1
-torch==1.11.0
-torchaudio==0.11.0
-torchmetrics==0.7.2
-tqdm==4.63.0
-typing-extensions==4.1.1
-uc-micro-py==1.0.1
-urllib3==1.26.8
-uvicorn==0.17.6
-Werkzeug==2.0.3
-yarl==1.7.2
-zipp==3.7.0
-
+gradio
+pytorch_lightning
+torch
+jiwer
 transformers
+numpy
+fairseq @ git+https://github.com/pytorch/fairseq.git@d03f4e771484a433f025f47744017c2eb6e9c6bc
 tensorboardX
 jiwer
 phonemizer
 librosa
-speake
+speake
+plotly
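With the pin list trimmed from ~100 entries to 13 unpinned ones, a quick sanity check is to import each distribution's top-level module. A minimal sketch, assuming the conventional module names for these packages; the `speake` entry is skipped because its name appears truncated in the file:

```python
import importlib

# Top-level module names assumed for the distributions in requirements.txt;
# "speake" is omitted since the entry looks truncated.
modules = ["gradio", "pytorch_lightning", "torch", "jiwer", "transformers",
           "numpy", "fairseq", "tensorboardX", "phonemizer", "librosa", "plotly"]
for mod in modules:
    importlib.import_module(mod)
    print(f"{mod}: ok")
```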