Support pitch contour and dB plotting
Browse files- app.py +10 -65
- local/pitch_contour.py +107 -0
app.py
CHANGED
@@ -8,7 +8,7 @@ import pdb
|
|
8 |
import jiwer
|
9 |
from local.convert_metrics import nat2avaMOS, WER2INTELI
|
10 |
from local.indicator_plot import Intelligibility_Plot, Naturalness_Plot
|
11 |
-
|
12 |
# ASR part
|
13 |
from transformers import pipeline
|
14 |
p = pipeline("automatic-speech-recognition")
|
@@ -85,77 +85,21 @@ def calc_mos(audio_path, ref):
|
|
85 |
phone_transcription = processor.batch_decode(phone_predicted_ids)
|
86 |
lst_phonemes = phone_transcription[0].split(" ")
|
87 |
wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
|
88 |
-
import matplotlib.pyplot as plt
|
89 |
-
|
90 |
-
fig = plt.figure(figsize=(30, 10))
|
91 |
-
# ax = fig.subplots(1, 1)
|
92 |
-
# pdb.set_trace()
|
93 |
-
|
94 |
-
# time_x = torch.arange(wav.shape[-1]) / sr
|
95 |
-
# # ax.plot(time_x, wav_vad.squeeze())
|
96 |
-
# pdb.set_trace()
|
97 |
-
# ax.plot(time_x, wav.squeeze(), alpha=0.5)
|
98 |
-
# get f0
|
99 |
-
f0 = torchaudio.functional.compute_kaldi_pitch(wav, frame_length=25, frame_shift=20, min_f0=20, max_f0=600, sample_rate=sr)[0, :, 1]
|
100 |
-
# # get f0 time x axis
|
101 |
-
# time_x_f0 = torch.arange(f0.shape[-1]) * 20 / 1000
|
102 |
-
# plot f0 with x axis as time
|
103 |
-
|
104 |
-
# spectrogram with x axis as time
|
105 |
-
# pdb.set_trace()
|
106 |
-
spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate=sr, n_fft=400, hop_length=160, n_mels=80)(wav)
|
107 |
-
|
108 |
-
spectrogram = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)(spectrogram)
|
109 |
-
|
110 |
-
# plot spectrogram with x axis as time, y axis as frequency bins
|
111 |
-
ax2 = fig.add_subplot(212)
|
112 |
-
ax2.set_xlabel("Time (s)")
|
113 |
-
ax2.set_ylabel("Frequency (Hz)")
|
114 |
-
ax2.set_title("Spectrogram")
|
115 |
-
ax2.set_xticks(torch.arange(0, spectrogram.shape[-1], 100))
|
116 |
-
ax2.set_xticklabels(torch.arange(0, spectrogram.shape[-1], 100) * 20 / 1000)
|
117 |
-
ax2.set_yticks(torch.arange(0, spectrogram.shape[1], 10))
|
118 |
-
ax2.set_yticklabels(torch.arange(0, spectrogram.shape[1], 10) * 800 / 80)
|
119 |
-
|
120 |
-
# add colorbar to spectrogram with limitation from -80 to 0
|
121 |
-
cbar = plt.colorbar(ax2.imshow(spectrogram.squeeze().numpy(), aspect='auto', origin='lower'))
|
122 |
-
cbar.set_label("dB")
|
123 |
-
ax2.grid()
|
124 |
-
|
125 |
-
# plot f0 with x axis as time, y axis as frequency bins, y is limited from 0 to 600
|
126 |
-
ax1 = fig.add_subplot(211)
|
127 |
-
ax1.set_xlabel("Time (s)")
|
128 |
-
ax1.set_ylabel("Frequency (Hz)")
|
129 |
-
ax1.set_title("F0")
|
130 |
-
ax1.set_xticks(torch.arange(0, f0.shape[-1], 100))
|
131 |
-
ax1.set_xticklabels(torch.arange(0, f0.shape[-1], 100) * 20 / 1000)
|
132 |
-
ax1.set_yticks(torch.arange(0, 600, 50))
|
133 |
-
ax1.set_yticklabels(torch.arange(0, 600, 50))
|
134 |
|
135 |
-
#
|
136 |
-
|
137 |
-
|
138 |
-
ax1.grid()
|
139 |
-
|
140 |
-
# remove unvoiced part based on vad
|
141 |
-
|
142 |
-
# plot f0 with x axis as time
|
143 |
-
|
144 |
-
# time_x = torch.arange(f0.shape[-1]) * 20 / 1000
|
145 |
-
# plt.plot(time_x, f0.squeeze())
|
146 |
-
# fig.savefig("vad.png")
|
147 |
-
# pdb.set_trace()
|
148 |
ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
|
149 |
|
150 |
# pdb.set_trace()
|
151 |
-
return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, phone_transcription, ppm
|
152 |
|
153 |
|
154 |
with open("local/description.md") as f:
|
155 |
description = f.read()
|
156 |
|
157 |
-
# calc_mos("JOHN1.wav", "he would answer in a soft voice, 'I don't know.'")
|
158 |
-
|
159 |
|
160 |
examples = [
|
161 |
["local/Julianna_Set1_Author_01.wav", "Once upon a time, there was a young rat named Arthur who couldn't make up his mind."],
|
@@ -171,11 +115,12 @@ iface = gr.Interface(
|
|
171 |
gr.Plot(label="Intelligibility Score, range from 0 to 100, the higher the better", show_label=True, container=True),
|
172 |
gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
|
173 |
gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes", visible=False),
|
174 |
-
gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="Speaking Rate, Phonemes per minutes", visible=False)
|
|
|
175 |
title="Speech Analysis by Laronix AI",
|
176 |
description=description,
|
177 |
allow_flagging="auto",
|
178 |
examples=examples,
|
179 |
)
|
180 |
# add password to protect the interface
|
181 |
-
iface.launch(share=False, auth=['Laronix', 'LaronixSLP'], auth_message="Authentication Required, ask kevin@laronix.com for password
|
|
|
8 |
import jiwer
|
9 |
from local.convert_metrics import nat2avaMOS, WER2INTELI
|
10 |
from local.indicator_plot import Intelligibility_Plot, Naturalness_Plot
|
11 |
+
from local.pitch_contour import draw_spec_db_pitch
|
12 |
# ASR part
|
13 |
from transformers import pipeline
|
14 |
p = pipeline("automatic-speech-recognition")
|
|
|
85 |
phone_transcription = processor.batch_decode(phone_predicted_ids)
|
86 |
lst_phonemes = phone_transcription[0].split(" ")
|
87 |
wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
|
89 |
+
# draw f0 and db analysis plot
|
90 |
+
f0_db_fig = draw_spec_db_pitch(audio_path, save_fig_path=None)
|
91 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
|
93 |
|
94 |
# pdb.set_trace()
|
95 |
+
return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, phone_transcription, ppm , f0_db_fig
|
96 |
|
97 |
|
98 |
with open("local/description.md") as f:
|
99 |
description = f.read()
|
100 |
|
101 |
+
# x = calc_mos("JOHN1.wav", "he would answer in a soft voice, 'I don't know.'")
|
102 |
+
# pdb.set_trace()
|
103 |
|
104 |
examples = [
|
105 |
["local/Julianna_Set1_Author_01.wav", "Once upon a time, there was a young rat named Arthur who couldn't make up his mind."],
|
|
|
115 |
gr.Plot(label="Intelligibility Score, range from 0 to 100, the higher the better", show_label=True, container=True),
|
116 |
gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
|
117 |
gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes", visible=False),
|
118 |
+
gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="Speaking Rate, Phonemes per minutes", visible=False),
|
119 |
+
gr.Plot(label="Pitch Contour and dB Analysis", show_label=True, container=True)],
|
120 |
title="Speech Analysis by Laronix AI",
|
121 |
description=description,
|
122 |
allow_flagging="auto",
|
123 |
examples=examples,
|
124 |
)
|
125 |
# add password to protect the interface
|
126 |
+
iface.launch(share=False, auth=['Laronix', 'LaronixSLP'], auth_message="Authentication Required, ask kevin@laronix.com for password.\n Thanks for your cooperation!")
|
local/pitch_contour.py
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### Kevin @ Laronix
|
2 |
+
|
3 |
+
from glob import glob
|
4 |
+
from pathlib import Path
|
5 |
+
import matplotlib
|
6 |
+
from matplotlib.transforms import Bbox
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
|
9 |
+
import numpy as np
|
10 |
+
import pdb
|
11 |
+
import parselmouth
|
12 |
+
|
13 |
+
def draw_spectrogram(spectrogram, dynamic_range=80):
    """Render a parselmouth Spectrogram onto the current matplotlib axes.

    Power values are converted to dB; everything more than ``dynamic_range``
    dB below the peak is pushed to the colormap floor via ``vmin``.
    """
    time_grid = spectrogram.x_grid()
    freq_grid = spectrogram.y_grid()
    power_db = 10 * np.log10(spectrogram.values)
    db_floor = power_db.max() - dynamic_range
    plt.pcolormesh(time_grid, freq_grid, power_db, vmin=db_floor)
    plt.ylim([spectrogram.ymin, spectrogram.ymax])
    # TODO add colorbar to spectrogram with limitation from -40 to 0
    plt.ylabel("frequency [Hz]")
|
22 |
+
|
23 |
+
def draw_intensity(intensity):
    """Overlay a parselmouth Intensity contour (dB) on the current axes.

    Draws the contour as a red line with a thin white core, then marks the
    minimum and maximum intensity with dotted projection lines to the right
    edge and their rounded values as red text.
    """
    times = intensity.xs()
    values = intensity.values.T

    # contour: thick red stroke with a thin white line on top
    plt.plot(times, values, linewidth=3, color='r')
    plt.plot(times, values, linewidth=1, color="w")

    # extremes of the contour (NaN-safe)
    db_min = np.nanmin(values)
    db_max = np.nanmax(values)

    # time at which each extreme occurs (first match along the time axis)
    db_min_time = times[np.where(values == db_min)[0]][0]
    db_max_time = times[np.where(values == db_max)[0]][0]

    # dotted horizontal guides from each extreme out to the right edge
    plt.plot([intensity.xmax, db_min_time], [db_min, db_min],
             linewidth=1, linestyle='dotted', color='red')
    plt.plot([intensity.xmax, db_max_time], [db_max, db_max],
             linewidth=1, linestyle='dotted', color='red')

    # annotate the extreme values at the right edge
    plt.text(intensity.xmax, db_min, str(round(db_min, 1)), color='red')
    plt.text(intensity.xmax, db_max, str(round(db_max, 1)), color='red')

    plt.grid(False)
    plt.ylim(0)
    plt.ylabel("intensity [dB]")
|
47 |
+
|
48 |
+
def draw_pitch(pitch):
    """Plot a parselmouth Pitch contour (Hz) on the current axes.

    Unvoiced frames (frequency == 0) are replaced with NaN so matplotlib
    leaves gaps instead of drawing a line to zero. The f0 minimum and
    maximum are highlighted with dotted guides, filled markers, and labels.

    NOTE(review): the zero->NaN replacement writes into the array returned
    by ``pitch.selected_array`` — presumably a copy; confirm parselmouth
    does not hand back a live view.
    """
    freqs = pitch.selected_array['frequency']
    freqs[freqs == 0] = np.nan  # hide unvoiced frames

    f0_min = np.nanmin(freqs)
    f0_max = np.nanmax(freqs)

    times = pitch.xs()
    plt.plot(times, freqs, markersize=5, color='blue')

    # locate the first frame at which each extreme occurs
    f0_min_time = times[np.where(freqs == f0_min)[0]][0]
    f0_max_time = times[np.where(freqs == f0_max)[0]][0]

    # dotted guides projecting the extremes back to the left edge
    plt.plot([pitch.xmin, f0_min_time], [f0_min, f0_min],
             linewidth=1, linestyle='dotted', color='blue')
    plt.plot([pitch.xmin, f0_max_time], [f0_max, f0_max],
             linewidth=1, linestyle='dotted', color='blue')

    # emphasize the extreme points themselves
    plt.scatter(f0_min_time, f0_min, color='blue', s=100)
    plt.scatter(f0_max_time, f0_max, color='blue', s=100)

    # labels offset away from the markers so they stay readable
    plt.text(f0_min_time - 0.2, f0_min - 30,
             "f0min = " + str(round(f0_min, 1)), color='blue', fontsize=12)
    plt.text(f0_max_time - 0.2, f0_max + 30,
             "f0max = " + str(round(f0_max, 1)), color='blue', fontsize=12)

    plt.grid(False)
    # clamp the lower bound at 0 Hz, pad both ends by 50 Hz
    plt.ylim(max([0, f0_min - 50]), f0_max + 50)
    plt.ylabel("fundamental frequency [Hz]")
|
76 |
+
|
77 |
+
def draw_spec_db_pitch(wav, save_fig_path=None):
    """Build the combined spectrogram/intensity + pitch-contour figure.

    Top panel: pre-emphasized spectrogram with the intensity contour (dB)
    overlaid on a twin y-axis. Bottom panel: pitch (f0) contour.

    Args:
        wav: path to an audio file readable by ``parselmouth.Sound``.
        save_fig_path: optional path; when given, the figure is also written
            to disk. (Previously this argument was accepted but ignored.)

    Returns:
        The matplotlib Figure (e.g. for display via ``gr.Plot``).
    """
    fig = plt.figure(figsize=(10, 5))

    # acoustic analyses
    snd = parselmouth.Sound(str(wav))
    pitch = snd.to_pitch()
    intensity = snd.to_intensity()
    # pre-emphasis flattens the spectral tilt so high frequencies are visible
    pre_emphasized_snd = snd.copy()
    pre_emphasized_snd.pre_emphasize()
    spectrogram = pre_emphasized_snd.to_spectrogram(window_length=0.1)

    # top panel: spectrogram with intensity on a twin y-axis
    plt.subplot(2, 1, 1)
    draw_spectrogram(spectrogram)
    plt.twinx()
    draw_intensity(intensity)
    plt.xlim([snd.xmin, snd.xmax])

    # bottom panel: pitch contour
    plt.subplot(2, 1, 2)
    draw_pitch(pitch)
    plt.xlim([snd.xmin, snd.xmax])
    plt.xlabel("time [s]")

    # call tight_layout AFTER the subplots exist — the original called it
    # right after plt.figure(), where there were no axes to lay out
    fig.tight_layout()

    # honor save_fig_path, which was previously a dead parameter
    if save_fig_path is not None:
        fig.savefig(save_fig_path)

    return fig
|
104 |
+
|
105 |
+
# f = draw_spec_db_pitch("./test.wav")
|
106 |
+
# plt.savefig("y.png")
|
107 |
+
|