add files
- app.py +227 -0
- config/samples.yaml +3 -0
- config/template.yaml +12 -0
- data/samples/Fary_PAL_p326_len10_p326_013.wav +0 -0
- data/samples/Fary_PAL_p326_len10_p326_020.wav +0 -0
- data/samples/Fary_PAL_p326_len10_p326_025.wav +0 -0
- data/samples/Fary_PAL_p326_len10_p326_036.wav +0 -0
- data/samples/Fary_PAL_p326_len10_p326_040.wav +0 -0
- data/samples/Fary_PAL_p326_len10_p326_043.wav +0 -0
- data/samples/Fary_PAL_p326_len10_p326_044.wav +0 -0
- data/samples/Fary_PAL_p326_len10_p326_045.wav +0 -0
- data/samples/Fary_PAL_p326_len10_p326_048.wav +0 -0
- data/samples/Fary_PAL_p326_len10_p326_049.wav +0 -0
- data/samples/John_p326_020.wav +0 -0
- data/samples/p326_020.wav +0 -0
- data/samples/ref.txt +10 -0
- local/__pycache__/vis.cpython-39.pyc +0 -0
- local/token.py +55 -0
- local/vis.py +40 -0
- packages.txt +2 -0
- requirements.txt +103 -0
- src/__pycache__/lightning_module.cpython-39.pyc +0 -0
- src/__pycache__/model.cpython-39.pyc +0 -0
- src/description.html +42 -0
- src/font/Muli-Bold.ttf +0 -0
- src/font/Muli.ttf +0 -0
- src/font/OFL.txt +93 -0
- src/font/Poppins-Bold.ttf +0 -0
- src/font/Poppins-Regular.ttf +0 -0
- src/font/SIL Open Font License.txt +44 -0
app.py
ADDED
@@ -0,0 +1,227 @@
"""
TODO:
+ [x] Load Configuration
+ [ ] Multi ASR Engine
+ [ ] Batch / Real Time support
"""
import numpy as np
from pathlib import Path
import jiwer
import pdb
import torch.nn as nn
import torch
import torchaudio
import gradio as gr
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2Model, Wav2Vec2CTCTokenizer
from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC
from datasets import load_dataset
import datasets
import yaml
from transformers import pipeline
import librosa
import librosa.display
import matplotlib.pyplot as plt
import soundfile as sf

# local import
import sys

from local.vis import token_plot
sys.path.append("src")

# Load config
config_yaml = "config/samples.yaml"
with open(config_yaml, "r") as f:
    # pdb.set_trace()
    try:
        config = yaml.safe_load(f)
    except yaml.YAMLError:
        print("Config file Loading Error")
        exit()

# Auto load examples
refs = np.loadtxt(config["ref_txt"], delimiter="\n", dtype="str")
refs_ids = [x.split()[0] for x in refs]
refs_txt = [" ".join(x.split()[1:]) for x in refs]
ref_wavs = [str(x) for x in sorted(Path(config["ref_wavs"]).glob("**/*.wav"))]

with open("src/description.html", "r", encoding="utf-8") as f:
    description = f.read()

reference_id = gr.Textbox(
    value="ID", placeholder="Utter ID", label="Reference_ID"
)
reference_textbox = gr.Textbox(
    value="Input reference here",
    placeholder="Input reference here",
    label="Reference",
)
reference_PPM = gr.Textbox(
    placeholder="Pneumatic Voice's PPM", label="Ref PPM"
)

examples = [
    [x, y] for x, y in zip(ref_wavs, refs_txt)
]

# def map_to_array(batch):
#     speech, _ = sf.read(batch["file"])
#     batch["speech"] = speech
#     return batch

# ASR part
p = pipeline("automatic-speech-recognition")


# Tokenlizer part
# import model, feature extractor, tokenizer
def TOKENLIZER(audio_path, activate_plot=False):
    token_model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
    feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

    # # load first sample of English common_voice
    # dataset = load_dataset("common_voice", "en", split="train", streaming=True)
    # dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))
    # dataset_iter = iter(dataset)
    # sample = next(dataset_iter)

    # # forward sample through model to get greedily predicted transcription ids
    # input_values = feature_extractor(sample["audio"]["array"], return_tensors="pt").input_values
    # pdb.set_trace()

    # load the sample and resample to the model's rate if needed
    input_values, sr = torchaudio.load(audio_path)
    if sr != feature_extractor.sampling_rate:
        input_values = torchaudio.functional.resample(input_values, sr, feature_extractor.sampling_rate)

    logits = token_model(input_values).logits[0]
    pred_ids = torch.argmax(logits, axis=-1)

    # retrieve word stamps (analogous commands for `output_char_offsets`)
    outputs = tokenizer.decode(pred_ids, output_word_offsets=True)
    # compute `time_offset` in seconds as product of downsampling ratio and sampling_rate
    time_offset = token_model.config.inputs_to_logits_ratio / feature_extractor.sampling_rate

    word_offsets = [
        {
            "word": d["word"],
            "start_time": round(d["start_offset"] * time_offset, 2),
            "end_time": round(d["end_offset"] * time_offset, 2),
        }
        for d in outputs.word_offsets
    ]
    if activate_plot:
        token_fig = token_plot(input_values, feature_extractor.sampling_rate, word_offsets)
        return word_offsets, token_fig
    return word_offsets
# TOKENLIZER("data/samples/p326_020.wav")

# Load dataset
# dataset = load_dataset("common_voice", "en", split="train", streaming=True)
# dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))
# dataset_iter = iter(dataset)
# sample = next(dataset_iter)
# input_values = feature_extractor(sample["audio"]["array"], return_tensors="pt").input_values

# WER part
transformation = jiwer.Compose(
    [
        jiwer.RemovePunctuation(),
        jiwer.ToUpperCase(),
        jiwer.RemoveWhiteSpace(replace_by_space=True),
        jiwer.RemoveMultipleSpaces(),
        jiwer.ReduceToListOfListOfWords(word_delimiter=" "),
    ]
)


class ChangeSampleRate(nn.Module):
    def __init__(self, input_rate: int, output_rate: int):
        super().__init__()
        self.output_rate = output_rate
        self.input_rate = input_rate

    def forward(self, wav: torch.Tensor) -> torch.Tensor:
        # Only accepts 1-channel waveform input
        wav = wav.view(wav.size(0), -1)
        new_length = wav.size(-1) * self.output_rate // self.input_rate
        indices = torch.arange(new_length) * (
            self.input_rate / self.output_rate
        )
        # Linear interpolation between the two neighbouring input samples
        round_down = wav[:, indices.long()]
        round_up = wav[:, (indices.long() + 1).clamp(max=wav.size(-1) - 1)]
        output = round_down * (1.0 - indices.fmod(1.0)).unsqueeze(0) + (
            round_up * indices.fmod(1.0).unsqueeze(0)
        )
        return output


# Flagging setup

def calc_wer(audio_path, ref):
    wav, sr = torchaudio.load(audio_path)
    if wav.shape[0] != 1:
        # keep the first channel only
        wav = wav[0, :].unsqueeze(0)
        print(wav.shape)
    osr = 16000
    batch = wav.unsqueeze(0).repeat(10, 1, 1)
    csr = ChangeSampleRate(sr, osr)
    out_wavs = csr(wav)
    # ASR
    # trans = jiwer.ToUpperCase()(p(audio_path)["text"])

    # Tokenlizer
    tokens, token_wav_plot = TOKENLIZER(audio_path, activate_plot=True)

    trans_cnt = []
    for i in tokens:
        word, start_time, end_time = i.values()
        trans_cnt.append(word)
    trans = " ".join(x for x in trans_cnt)
    trans = jiwer.ToUpperCase()(trans)
    # WER
    ref = jiwer.ToUpperCase()(ref)
    wer = jiwer.wer(
        ref,
        trans,
        truth_transform=transformation,
        hypothesis_transform=transformation,
    )
    return [trans, wer, token_wav_plot]
# calc_wer(examples[1][0], examples[1][1])

iface = gr.Interface(
    fn=calc_wer,
    inputs=[
        gr.Audio(
            source="upload",
            type="filepath",
            label="Audio_to_evaluate",
        ),
        reference_textbox,
    ],
    outputs=[
        gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
        gr.Textbox(placeholder="Word Error Rate", label="WER"),
        gr.Plot(label="waveform"),
    ],
    title="Laronix Automatic Speech Recognition",
    description=description,
    examples=examples,
    css=".body {background-color: green}",
)

print("Launch examples")

iface.launch(
    share=False,
)
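
For quick sanity-checking outside the Gradio UI, here is a minimal standalone sketch of the jiwer scoring that calc_wer performs above. The reference/hypothesis strings are illustrative only; it assumes the jiwer API as used in app.py and listed (unpinned) in requirements.txt.

import jiwer

# Same normalization chain as `transformation` in app.py.
transformation = jiwer.Compose(
    [
        jiwer.RemovePunctuation(),
        jiwer.ToUpperCase(),
        jiwer.RemoveWhiteSpace(replace_by_space=True),
        jiwer.RemoveMultipleSpaces(),
        jiwer.ReduceToListOfListOfWords(word_delimiter=" "),
    ]
)

ref = "Many complicated ideas about the rainbow have been formed."
hyp = "many complicated ideas about a rainbow were formed"

# 2 substitutions + 1 deletion against 9 reference words -> WER = 3/9.
wer = jiwer.wer(ref, hyp, truth_transform=transformation, hypothesis_transform=transformation)
print(f"WER: {wer:.2f}")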
config/samples.yaml
ADDED
@@ -0,0 +1,3 @@
exp_id: NULL
ref_txt: data/samples/ref.txt
ref_wavs: data/samples
config/template.yaml
ADDED
@@ -0,0 +1,12 @@
exp_id: NULL
ref_txt: data/p326_split.txt
ref_feature: data/p326_split_ref.csv
ref_wavs: data/p326_split
thre:
  minppm: 0
  maxppm: 2000
  WER: 1.0
  AUTOMOS: 3.0
auth:
  username: NULL
  password: NULL
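
A small sketch of how these thresholds could be read back with PyYAML. The nested thre/auth layout reflects my reading of the flattened diff above, and YAML parses NULL as None.

import yaml

with open("config/template.yaml", "r") as f:
    cfg = yaml.safe_load(f)

print(cfg["exp_id"])            # None (YAML NULL)
print(cfg["thre"]["minppm"])    # 0
print(cfg["thre"]["WER"])       # 1.0
print(cfg["auth"]["username"])  # None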
data/samples/Fary_PAL_p326_len10_p326_013.wav
ADDED
Binary file (667 kB).
data/samples/Fary_PAL_p326_len10_p326_020.wav
ADDED
Binary file (584 kB).
data/samples/Fary_PAL_p326_len10_p326_025.wav
ADDED
Binary file (576 kB).
data/samples/Fary_PAL_p326_len10_p326_036.wav
ADDED
Binary file (249 kB).
data/samples/Fary_PAL_p326_len10_p326_040.wav
ADDED
Binary file (396 kB).
data/samples/Fary_PAL_p326_len10_p326_043.wav
ADDED
Binary file (274 kB).
data/samples/Fary_PAL_p326_len10_p326_044.wav
ADDED
Binary file (350 kB).
data/samples/Fary_PAL_p326_len10_p326_045.wav
ADDED
Binary file (282 kB).
data/samples/Fary_PAL_p326_len10_p326_048.wav
ADDED
Binary file (399 kB).
data/samples/Fary_PAL_p326_len10_p326_049.wav
ADDED
Binary file (417 kB).
data/samples/John_p326_020.wav
ADDED
Binary file (339 kB).
data/samples/p326_020.wav
ADDED
Binary file (449 kB).
data/samples/ref.txt
ADDED
@@ -0,0 +1,10 @@
Fary_PAL_p326_len10_p326_013 SOME HAVE ACCEPTED IT AS A MIRACLE WITHOUT PHYSICAL EXPLANATION
Fary_PAL_p326_len10_p326_020 MANY COMPLICATED IDEAS ABOUT THE RAINBOW HAVE BEEN FORMED
Fary_PAL_p326_len10_p326_025 JIMMY RETURNED TO ABERDEEN, BUT SHE REMAINED WITH GEORGE
Fary_PAL_p326_len10_p326_036 IT WAS MY SHORTEST AUDITION
Fary_PAL_p326_len10_p326_040 THE POLICE BELIEVE THE MAN WAS NOT INJURED
Fary_PAL_p326_len10_p326_043 I READ ABOUT IT IN THE NEWSPAPERS
Fary_PAL_p326_len10_p326_044 WHAT DID THEY THINK OF HER VOICE
Fary_PAL_p326_len10_p326_045 THEY SAY THEY FEEL DIFFERENT
Fary_PAL_p326_len10_p326_048 WE WILL CONTINUE TO PRESSURE THE GOVERNMENT
Fary_PAL_p326_len10_p326_049 HIS ART WAS NOT JUST ABOUT EMOTIONS
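
Each line of ref.txt is "<utterance-id> <transcript>". This is the same parsing app.py uses, shown standalone for clarity (run from the repo root):

import numpy as np

refs = np.loadtxt("data/samples/ref.txt", delimiter="\n", dtype="str")
refs_ids = [x.split()[0] for x in refs]             # e.g. "Fary_PAL_p326_len10_p326_013"
refs_txt = [" ".join(x.split()[1:]) for x in refs]  # the transcript text
print(refs_ids[0], "->", refs_txt[0])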
local/__pycache__/vis.cpython-39.pyc
ADDED
Binary file (1.38 kB).
local/token.py
ADDED
@@ -0,0 +1,55 @@
import numpy as np
from pathlib import Path
import jiwer
import pdb
import torch.nn as nn
import torch
import torchaudio
from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC

import yaml
import librosa
import librosa.display
import matplotlib.pyplot as plt
import soundfile as sf


def TOKENLIZER(audio_path):
    token_model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
    feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

    # # load first sample of English common_voice
    # dataset = load_dataset("common_voice", "en", split="train", streaming=True)
    # dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))
    # dataset_iter = iter(dataset)
    # sample = next(dataset_iter)

    # # forward sample through model to get greedily predicted transcription ids
    # input_values = feature_extractor(sample["audio"]["array"], return_tensors="pt").input_values
    # pdb.set_trace()

    # load samples
    input_values, sr = torchaudio.load(audio_path)
    # resample
    if sr != feature_extractor.sampling_rate:
        input_values = torchaudio.functional.resample(input_values, sr, feature_extractor.sampling_rate)

    logits = token_model(input_values).logits[0]
    # Get predict IDs
    pred_ids = torch.argmax(logits, axis=-1)

    # retrieve word stamps (analogous commands for `output_char_offsets`)
    outputs = tokenizer.decode(pred_ids, output_word_offsets=True)
    # compute `time_offset` in seconds as product of downsampling ratio and sampling_rate
    time_offset = token_model.config.inputs_to_logits_ratio / feature_extractor.sampling_rate

    word_offsets = [
        {
            "word": d["word"],
            "start_time": round(d["start_offset"] * time_offset, 2),
            "end_time": round(d["end_offset"] * time_offset, 2),
        }
        for d in outputs.word_offsets
    ]
    return word_offsets
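
A minimal usage sketch for local/token.py, run from the repo root (it assumes `local` is importable as a package from there; facebook/wav2vec2-base-960h is downloaded on first use):

from local.token import TOKENLIZER

# Word-level timestamps for one of the committed samples.
word_offsets = TOKENLIZER("data/samples/p326_020.wav")
for w in word_offsets:
    print(f"{w['start_time']:6.2f}s - {w['end_time']:6.2f}s  {w['word']}")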
local/vis.py
ADDED
@@ -0,0 +1,40 @@
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

import torchaudio
import numpy as np

import pdb

test_token = [{'word': 'MANY', 'start_time': 1.3, 'end_time': 1.5}, {'word': 'COMPLICATED', 'start_time': 1.56, 'end_time': 2.14}, {'word': 'IDEAS', 'start_time': 2.24, 'end_time': 2.56}, {'word': 'ABOUT', 'start_time': 2.66, 'end_time': 2.9}, {'word': 'THE', 'start_time': 3.0, 'end_time': 3.06}, {'word': 'RAINBOW', 'start_time': 3.14, 'end_time': 3.42}, {'word': 'HAVE', 'start_time': 3.48, 'end_time': 3.58}, {'word': 'BEEN', 'start_time': 3.62, 'end_time': 3.74}, {'word': 'FORMED', 'start_time': 3.84, 'end_time': 4.16}]
laronix_green = [120, 189, 145]


def token_plot(audio, sr, token):
    # Get X axis: one time stamp per sample so it always matches the waveform length
    x = np.arange(audio.squeeze().shape[0]) / sr
    # Wave plot
    fig, ax = plt.subplots(figsize=(20, 4))
    ax.plot(x, audio.squeeze(), color="#78bd91")
    ax.set_xlabel("Time / s")
    ax.set_ylabel("Amplitude")

    y_limit = np.max(audio.numpy())
    # load token
    for i in token:
        word, start_time, end_time = i.values()
        # plot tokens
        ax.text(x=start_time, y=y_limit, s=word, ha="left", fontsize="large", fontstretch="ultra-condensed")
        # plot token boundaries
        ax.vlines(x=start_time, ymin=np.min(audio.numpy()), ymax=y_limit, colors="black")
        # ax.vlines(x=end_time, ymin=np.min(audio.numpy()), ymax=y_limit, colors="red")

    plt.tight_layout()
    # fig.savefig("1.png")
    return fig
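
A quick way to exercise token_plot with the built-in test_token, whose word sequence matches the transcript of Fary_PAL_p326_len10_p326_020 in ref.txt (the output filename is illustrative):

import torchaudio
from local.vis import token_plot, test_token

audio, sr = torchaudio.load("data/samples/Fary_PAL_p326_len10_p326_020.wav")
fig = token_plot(audio, sr, test_token)
fig.savefig("token_plot_p326_020.png")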
packages.txt
ADDED
@@ -0,0 +1,2 @@
festival
espeak
requirements.txt
ADDED
@@ -0,0 +1,103 @@
absl-py==1.0.0
aiohttp==3.8.1
aiosignal==1.2.0
analytics-python==1.4.0
antlr4-python3-runtime==4.8
anyio==3.5.0
asgiref==3.5.0
async-timeout==4.0.2
attrs==21.4.0
backoff==1.10.0
bcrypt==3.2.0
bitarray==2.4.0
cachetools==5.0.0
certifi==2021.10.8
cffi==1.15.0
charset-normalizer==2.0.12
click==8.0.4
colorama==0.4.4
cryptography==36.0.1
cycler==0.11.0
Cython==0.29.28
fairseq @ git+https://github.com/pytorch/fairseq.git@d03f4e771484a433f025f47744017c2eb6e9c6bc
fastapi==0.75.0
ffmpy==0.3.0
fonttools==4.30.0
frozenlist==1.3.0
fsspec==2022.2.0
future==0.18.2
google-auth==2.6.0
google-auth-oauthlib==0.4.6
gradio==3.2
grpcio==1.44.0
h11==0.12.0
hydra-core==1.0.7
idna==3.3
importlib-metadata==4.11.3
Jinja2==3.0.3
kiwisolver==1.3.2
linkify-it-py==1.0.3
Markdown==3.3.6
markdown-it-py==2.0.1
MarkupSafe==2.1.0
matplotlib==3.5.1
mdit-py-plugins==0.3.0
mdurl==0.1.0
monotonic==1.6
multidict==6.0.2
numpy==1.22.3
oauthlib==3.2.0
omegaconf==2.0.6
orjson==3.6.7
packaging==21.3
pandas==1.4.1
paramiko==2.10.1
Pillow==9.0.1
portalocker==2.4.0
protobuf==3.19.4
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycparser==2.21
pycryptodome==3.14.1
pydantic==1.9.0
pyDeprecate==0.3.1
pydub==0.25.1
PyNaCl==1.5.0
pyparsing==3.0.7
python-dateutil==2.8.2
python-multipart==0.0.5
pytorch-lightning==1.5.10
pytz==2021.3
PyYAML==6.0
regex==2022.3.2
requests==2.27.1
requests-oauthlib==1.3.1
rsa==4.8
sacrebleu==2.0.0
six==1.16.0
sniffio==1.2.0
starlette==0.17.1
tabulate==0.8.9
tensorboard==2.8.0
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.1
torch==1.11.0
torchaudio==0.11.0
torchmetrics==0.7.2
tqdm==4.63.0
typing-extensions==4.1.1
uc-micro-py==1.0.1
urllib3==1.26.8
uvicorn==0.17.6
Werkzeug==2.0.3
yarl==1.7.2
zipp==3.7.0

transformers
deepspeech
tensorboardX
jiwer
phonemizer
librosa

rich
src/__pycache__/lightning_module.cpython-39.pyc
ADDED
Binary file (1.86 kB).
src/__pycache__/model.cpython-39.pyc
ADDED
Binary file (6.46 kB).
src/description.html
ADDED
@@ -0,0 +1,42 @@
<!DOCTYPE html>

<html>

<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width">
    <title style="text-align: center;"> Laronix Naturalness Test </title>
    <!-- CSS only -->
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.1/dist/css/bootstrap.min.css" rel="stylesheet"
        integrity="sha384-F3w7mX95PdgyTmZZMECAngseQB83DfGTowi0iMjiWaeVhAn4FJkqJByhZMI3AhiU" crossorigin="anonymous">
</head>
<style>
    @font-face {
        font-family: Poppins;
        src: url(font/Poppins-Regular.ttf);
    }
    @font-face {
        font-family: Poppins-Bold;
        src: url(font/Poppins-Bold.ttf);
    }
    @font-face {
        font-family: Muli;
        src: url(font/Muli.ttf);
    }
    @font-face {
        font-family: Muli-Bold;
        src: url(font/Muli-Bold.ttf);
    }
</style>
<body>
    <p style="font-family: Muli;">This is a prototype of the Laronix Automatic Speech Recognition platform.</p>

    <img
        src="https://static.wixstatic.com/media/e7e144_93e98148d06147828031797eb4525b80~mv2.png/v1/crop/x_0,y_25,w_2606,h_882/fill/w_396,h_142,al_c,q_85,usm_0.66_1.00_0.01,enc_auto/newlogo.png"
        align="right"
        height="20%"
        width="20%"
    />
</body>
</html>
src/font/Muli-Bold.ttf
ADDED
Binary file (52.8 kB).
src/font/Muli.ttf
ADDED
Binary file (49 kB).
src/font/OFL.txt
ADDED
@@ -0,0 +1,93 @@
Copyright 2020 The Poppins Project Authors (https://github.com/itfoundry/Poppins)

This Font Software is licensed under the SIL Open Font License, Version 1.1.
This license is copied below, and is also available with a FAQ at:
http://scripts.sil.org/OFL


-----------------------------------------------------------
SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
-----------------------------------------------------------

PREAMBLE
The goals of the Open Font License (OFL) are to stimulate worldwide
development of collaborative font projects, to support the font creation
efforts of academic and linguistic communities, and to provide a free and
open framework in which fonts may be shared and improved in partnership
with others.

The OFL allows the licensed fonts to be used, studied, modified and
redistributed freely as long as they are not sold by themselves. The
fonts, including any derivative works, can be bundled, embedded,
redistributed and/or sold with any software provided that any reserved
names are not used by derivative works. The fonts and derivatives,
however, cannot be released under any other type of license. The
requirement for fonts to remain under this license does not apply
to any document created using the fonts or their derivatives.

DEFINITIONS
"Font Software" refers to the set of files released by the Copyright
Holder(s) under this license and clearly marked as such. This may
include source files, build scripts and documentation.

"Reserved Font Name" refers to any names specified as such after the
copyright statement(s).

"Original Version" refers to the collection of Font Software components as
distributed by the Copyright Holder(s).

"Modified Version" refers to any derivative made by adding to, deleting,
or substituting -- in part or in whole -- any of the components of the
Original Version, by changing formats or by porting the Font Software to a
new environment.

"Author" refers to any designer, engineer, programmer, technical
writer or other person who contributed to the Font Software.

PERMISSION & CONDITIONS
Permission is hereby granted, free of charge, to any person obtaining
a copy of the Font Software, to use, study, copy, merge, embed, modify,
redistribute, and sell modified and unmodified copies of the Font
Software, subject to the following conditions:

1) Neither the Font Software nor any of its individual components,
in Original or Modified Versions, may be sold by itself.

2) Original or Modified Versions of the Font Software may be bundled,
redistributed and/or sold with any software, provided that each copy
contains the above copyright notice and this license. These can be
included either as stand-alone text files, human-readable headers or
in the appropriate machine-readable metadata fields within text or
binary files as long as those fields can be easily viewed by the user.

3) No Modified Version of the Font Software may use the Reserved Font
Name(s) unless explicit written permission is granted by the corresponding
Copyright Holder. This restriction only applies to the primary font name as
presented to the users.

4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
Software shall not be used to promote, endorse or advertise any
Modified Version, except to acknowledge the contribution(s) of the
Copyright Holder(s) and the Author(s) or with their explicit written
permission.

5) The Font Software, modified or unmodified, in part or in whole,
must be distributed entirely under this license, and must not be
distributed under any other license. The requirement for fonts to
remain under this license does not apply to any document created
using the Font Software.

TERMINATION
This license becomes null and void if any of the above conditions are
not met.

DISCLAIMER
THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
OTHER DEALINGS IN THE FONT SOFTWARE.
src/font/Poppins-Bold.ttf
ADDED
Binary file (154 kB).
src/font/Poppins-Regular.ttf
ADDED
Binary file (158 kB).
src/font/SIL Open Font License.txt
ADDED
@@ -0,0 +1,44 @@
Copyright (c) 2011 by vernon adams (vern@newtypography.co.uk),
with Reserved Font Name "Muli".

This Font Software is licensed under the SIL Open Font License, Version 1.1.
This license is copied below, and is also available with a FAQ at: http://scripts.sil.org/OFL

-----------------------------------------------------------
SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
-----------------------------------------------------------

PREAMBLE
The goals of the Open Font License (OFL) are to stimulate worldwide development of collaborative font projects, to support the font creation efforts of academic and linguistic communities, and to provide a free and open framework in which fonts may be shared and improved in partnership with others.

The OFL allows the licensed fonts to be used, studied, modified and redistributed freely as long as they are not sold by themselves. The fonts, including any derivative works, can be bundled, embedded, redistributed and/or sold with any software provided that any reserved names are not used by derivative works. The fonts and derivatives, however, cannot be released under any other type of license. The requirement for fonts to remain under this license does not apply to any document created using the fonts or their derivatives.

DEFINITIONS
"Font Software" refers to the set of files released by the Copyright Holder(s) under this license and clearly marked as such. This may include source files, build scripts and documentation.

"Reserved Font Name" refers to any names specified as such after the copyright statement(s).

"Original Version" refers to the collection of Font Software components as distributed by the Copyright Holder(s).

"Modified Version" refers to any derivative made by adding to, deleting, or substituting -- in part or in whole -- any of the components of the Original Version, by changing formats or by porting the Font Software to a new environment.

"Author" refers to any designer, engineer, programmer, technical writer or other person who contributed to the Font Software.

PERMISSION & CONDITIONS
Permission is hereby granted, free of charge, to any person obtaining a copy of the Font Software, to use, study, copy, merge, embed, modify, redistribute, and sell modified and unmodified copies of the Font Software, subject to the following conditions:

1) Neither the Font Software nor any of its individual components, in Original or Modified Versions, may be sold by itself.

2) Original or Modified Versions of the Font Software may be bundled, redistributed and/or sold with any software, provided that each copy contains the above copyright notice and this license. These can be included either as stand-alone text files, human-readable headers or in the appropriate machine-readable metadata fields within text or binary files as long as those fields can be easily viewed by the user.

3) No Modified Version of the Font Software may use the Reserved Font Name(s) unless explicit written permission is granted by the corresponding Copyright Holder. This restriction only applies to the primary font name as presented to the users.

4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font Software shall not be used to promote, endorse or advertise any Modified Version, except to acknowledge the contribution(s) of the Copyright Holder(s) and the Author(s) or with their explicit written permission.

5) The Font Software, modified or unmodified, in part or in whole, must be distributed entirely under this license, and must not be distributed under any other license. The requirement for fonts to remain under this license does not apply to any document created using the Font Software.

TERMINATION
This license becomes null and void if any of the above conditions are not met.

DISCLAIMER
THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM OTHER DEALINGS IN THE FONT SOFTWARE.