KevinGeng commited on
Commit
73426f6
1 Parent(s): 11a6db4

add password protection

Browse files
Files changed (1) hide show
  1. app.py +69 -4
app.py CHANGED
@@ -48,10 +48,13 @@ def calc_mos(audio_path, ref):
48
  wav, sr = torchaudio.load(audio_path, channels_first=True)
49
  if wav.shape[0] > 1:
50
  wav = wav.mean(dim=0, keepdim=True) # Mono channel
 
 
51
  osr = 16_000
52
  batch = wav.unsqueeze(0).repeat(10, 1, 1)
53
  csr = ChangeSampleRate(sr, osr)
54
  out_wavs = csr(wav)
 
55
  # ASR
56
  trans = p(audio_path)["text"]
57
  # WER
@@ -82,7 +85,68 @@ def calc_mos(audio_path, ref):
82
  phone_transcription = processor.batch_decode(phone_predicted_ids)
83
  lst_phonemes = phone_transcription[0].split(" ")
84
  wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
 
86
  # pdb.set_trace()
87
  return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, phone_transcription, ppm
88
 
@@ -90,8 +154,9 @@ def calc_mos(audio_path, ref):
90
  with open("local/description.md") as f:
91
  description = f.read()
92
 
93
- # calc_mos("audio_2023-11-01_15-57-39.wav", "hello world")
94
- # pdb.set_trace()
 
95
  examples = [
96
  ["local/Julianna_Set1_Author_01.wav", "Once upon a time, there was a young rat named Arthur who couldn't make up his mind."],
97
  ["local/Patient_Arthur_set1_002_noisy.wav", "Whenever the other rats asked him if he would like to go hunting with them, he would answer in a soft voice, 'I don't know.'"],
@@ -112,5 +177,5 @@ iface = gr.Interface(
112
  allow_flagging="auto",
113
  examples=examples,
114
  )
115
-
116
- iface.launch()
 
48
  wav, sr = torchaudio.load(audio_path, channels_first=True)
49
  if wav.shape[0] > 1:
50
  wav = wav.mean(dim=0, keepdim=True) # Mono channel
51
+ # get decibel
52
+
53
  osr = 16_000
54
  batch = wav.unsqueeze(0).repeat(10, 1, 1)
55
  csr = ChangeSampleRate(sr, osr)
56
  out_wavs = csr(wav)
57
+ db = torchaudio.transforms.AmplitudeToDB(stype="amplitude", top_db=80)(wav)
58
  # ASR
59
  trans = p(audio_path)["text"]
60
  # WER
 
85
  phone_transcription = processor.batch_decode(phone_predicted_ids)
86
  lst_phonemes = phone_transcription[0].split(" ")
87
  wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
88
+ import matplotlib.pyplot as plt
89
+
90
+ fig = plt.figure(figsize=(30, 10))
91
+ # ax = fig.subplots(1, 1)
92
+ # pdb.set_trace()
93
+
94
+ # time_x = torch.arange(wav.shape[-1]) / sr
95
+ # # ax.plot(time_x, wav_vad.squeeze())
96
+ # pdb.set_trace()
97
+ # ax.plot(time_x, wav.squeeze(), alpha=0.5)
98
+ # get f0
99
+ f0 = torchaudio.functional.compute_kaldi_pitch(wav, frame_length=25, frame_shift=20, min_f0=20, max_f0=600, sample_rate=sr)[0, :, 1]
100
+ # # get f0 time x axis
101
+ # time_x_f0 = torch.arange(f0.shape[-1]) * 20 / 1000
102
+ # plot f0 with x axis as time
103
+
104
+ # spectrogram with x axis as time
105
+ pdb.set_trace()
106
+ spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate=sr, n_fft=400, hop_length=160, n_mels=80)(wav)
107
+
108
+ spectrogram = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)(spectrogram)
109
+
110
+ # plot spectrogram with x axis as time, y axis as frequency bins
111
+ ax2 = fig.add_subplot(212)
112
+ ax2.set_xlabel("Time (s)")
113
+ ax2.set_ylabel("Frequency (Hz)")
114
+ ax2.set_title("Spectrogram")
115
+ ax2.set_xticks(torch.arange(0, spectrogram.shape[-1], 100))
116
+ ax2.set_xticklabels(torch.arange(0, spectrogram.shape[-1], 100) * 20 / 1000)
117
+ ax2.set_yticks(torch.arange(0, spectrogram.shape[1], 10))
118
+ ax2.set_yticklabels(torch.arange(0, spectrogram.shape[1], 10) * 800 / 80)
119
+
120
+ # add colorbar to spectrogram with limitation from -80 to 0
121
+ cbar = plt.colorbar(ax2.imshow(spectrogram.squeeze().numpy(), aspect='auto', origin='lower'))
122
+ cbar.set_label("dB")
123
+ ax2.grid()
124
+
125
+ # plot f0 with x axis as time, y axis as frequency bins, y is limited from 0 to 600
126
+ ax1 = fig.add_subplot(211)
127
+ ax1.set_xlabel("Time (s)")
128
+ ax1.set_ylabel("Frequency (Hz)")
129
+ ax1.set_title("F0")
130
+ ax1.set_xticks(torch.arange(0, f0.shape[-1], 100))
131
+ ax1.set_xticklabels(torch.arange(0, f0.shape[-1], 100) * 20 / 1000)
132
+ ax1.set_yticks(torch.arange(0, 600, 50))
133
+ ax1.set_yticklabels(torch.arange(0, 600, 50))
134
+
135
+ # add colorbar to f0 with limitation from 0 to 600
136
+ # cbar = plt.colorbar(ax1.imshow(f0.squeeze().numpy(), aspect='auto', origin='lower'))
137
+ # cbar.set_label("Hz")
138
+ ax1.grid()
139
+
140
+ # remove unvoiced part based on vad
141
+
142
+ # plot f0 with x axis as time
143
+
144
+ # time_x = torch.arange(f0.shape[-1]) * 20 / 1000
145
+ # plt.plot(time_x, f0.squeeze())
146
+ # fig.savefig("vad.png")
147
+ # pdb.set_trace()
148
  ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
149
+
150
  # pdb.set_trace()
151
  return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, phone_transcription, ppm
152
 
 
154
  with open("local/description.md") as f:
155
  description = f.read()
156
 
157
+ calc_mos("JOHN1.wav", "he would answer in a soft voice, 'I don't know.'")
158
+
159
+ pdb.set_trace()
160
  examples = [
161
  ["local/Julianna_Set1_Author_01.wav", "Once upon a time, there was a young rat named Arthur who couldn't make up his mind."],
162
  ["local/Patient_Arthur_set1_002_noisy.wav", "Whenever the other rats asked him if he would like to go hunting with them, he would answer in a soft voice, 'I don't know.'"],
 
177
  allow_flagging="auto",
178
  examples=examples,
179
  )
180
+ # add password to protect the interface
181
+ iface.launch(share=True, auth=['Laronix', 'LaronixSLP'], auth_message="Authentication Required, ask kevin@laronix.com for password,\n Thanks for your cooperation!")