Spaces: Running

Vineel Pratap committed • Commit f138a14
Parent(s): a7567f9

resampling fix

Browse files:
- app.py +2 -1
- requirements.txt +2 -2
- utils/lm.py +9 -11
- utils/norm_config.py +8 -9
- utils/text_norm.py +15 -6
- zeroshot.py +9 -8
app.py CHANGED
@@ -53,7 +53,8 @@ with gr.Blocks(css="style.css") as demo:
             )
         with gr.Column():
             autolm = gr.Checkbox(
-                label="Automatically create Unigram LM from text data",
+                label="Automatically create Unigram LM from text data",
+                value=True,
             )
             btn = gr.Button("Submit", elem_id="submit")
 
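The only functional change in app.py is that the "Automatically create Unigram LM from text data" checkbox now defaults to checked. A minimal sketch, not part of this commit and with the surrounding layout simplified, of how a Gradio checkbox default reaches a handler:

import gradio as gr

def show_flag(autolm):
    # autolm arrives as True on first submit because of value=True
    return f"autolm={autolm}"

with gr.Blocks() as demo:
    autolm = gr.Checkbox(
        label="Automatically create Unigram LM from text data",
        value=True,  # checked by default, as in this commit
    )
    out = gr.Textbox()
    btn = gr.Button("Submit")
    btn.click(show_flag, inputs=autolm, outputs=out)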
requirements.txt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:41b8b278a5c4d2fc182c7893bcc683ad261ab0612cea1da58aaed1b358fd9649
+size 164
utils/lm.py CHANGED
@@ -1,14 +1,15 @@
-# Creates unigram LM following KenLM
-import math
+# Creates unigram LM following KenLM
+import math
 import shutil, tempfile
 
+
 def calculate_log_probabilities(word_counts, num_sentences, n_smoothing=0.01):
     """
     Calculate log probabilities for each word in the corpus,
     including a special <unk> token for unknown words.
     """
-    total_words = sum(word_counts.values())
-    total_words += 2 * num_sentences
+    total_words = sum(word_counts.values())
+    total_words += 2 * num_sentences  # add counts for <s> and </s>
     # Adjust total for <unk>
     total_words_with_unk = total_words + 1  # Adding 1 for <unk>
     total_words_with_unk = total_words_with_unk + total_words_with_unk * n_smoothing
@@ -25,6 +26,7 @@ def calculate_log_probabilities(word_counts, num_sentences, n_smoothing=0.01):
     # Convert to log probabilities
     return {word: math.log10(prob) for word, prob in probabilities.items()}
 
+
 def maybe_generate_pseudo_bigram_arpa(arpa_fpath):
     with open(arpa_fpath, "r") as file:
         lines = file.readlines()
@@ -46,6 +48,7 @@ def maybe_generate_pseudo_bigram_arpa(arpa_fpath):
 
         file.write(line)
 
+
 def save_log_probabilities(log_probabilities, file_path):
     with open(file_path, "w") as file:
         file.write(f"\data\\")
@@ -59,13 +62,8 @@ def save_log_probabilities(log_probabilities, file_path):
         file.write(f"{log_prob}\t{word}\n")
         file.write(f"\n")
         file.write(f"\end\\")
-
+
+
 def create_unigram_lm(word_counts, num_sentences, file_path, n_smoothing=0.01):
     log_probs = calculate_log_probabilities(word_counts, num_sentences, n_smoothing)
     save_log_probabilities(log_probs, file_path)
-
-
-
-
-
-
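This module builds a KenLM-style unigram ARPA file from word counts; the hunks here only add the <s>/</s> comment and blank lines between functions. A hedged usage sketch of the visible entry points, assuming they are imported from utils.lm as in this repo; the exact log10 probabilities depend on the smoothing code outside these hunks:

from collections import Counter
from utils.lm import create_unigram_lm, maybe_generate_pseudo_bigram_arpa

sentences = ["hello world", "hello again"]
word_counts = Counter(w for s in sentences for w in s.split())

# Writes an ARPA-format unigram LM: a \data\ header, one
# "log10_prob<TAB>word" line per word (plus <unk>), then \end\.
create_unigram_lm(word_counts, len(sentences), "unigram.arpa")

# zeroshot.py then pads this into a pseudo-bigram ARPA so decoders
# that reject unigram-only LMs will accept it.
maybe_generate_pseudo_bigram_arpa("unigram.arpa")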
utils/norm_config.py CHANGED
@@ -42,7 +42,7 @@ inverted_question_mark = r"\u00BF"
 
 
 # Hindi
-hindi_danda =
+hindi_danda = "\u0964"
 
 # Egyptian Arabic
 # arabic_percent = r"\u066A"
@@ -175,7 +175,7 @@ nominal_digit_shapes = r"\u206f"
 with open(f"{os.path.dirname(__file__)}/punctuations.lst", "r") as punc_f:
     punc_list = punc_f.readlines()
 
-punct_pattern = r""
+punct_pattern = r""
 for punc in punc_list:
     # the first character in the tab separated line is the punc to be removed
     punct_pattern += re.escape(punc.split("\t")[0])
@@ -213,7 +213,6 @@ shared_punc_list = (
     + arabic_question_mark
     + chinese_punc
     + punct_pattern
-
 )
 
 shared_mappping = {
@@ -242,11 +241,11 @@ norm_config = {
         "mapping": shared_mappping,
         "digit_set": shared_digits,
         "unicode_norm": "NFKC",
-        "rm_diacritics"
+        "rm_diacritics": False,
     }
 }
 
-
+# =============== Mongolian ===============#
 
 norm_config["mon"] = norm_config["*"].copy()
 # add soft hyphen to punc list to match with fleurs
@@ -254,23 +253,23 @@ norm_config["mon"]["del_set"] += r"\u00AD"
 
 norm_config["khk"] = norm_config["mon"].copy()
 
-
+# =============== Hebrew ===============#
 
 norm_config["heb"] = norm_config["*"].copy()
 # add "HEBREW POINT" symbols to match with fleurs
 norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"
 
-
+# =============== Thai ===============#
 
 norm_config["tha"] = norm_config["*"].copy()
 # add "Zero width joiner" symbols to match with fleurs
 norm_config["tha"]["punc_set"] += r"\u200D"
 
-
+# =============== Arabic ===============#
 norm_config["ara"] = norm_config["*"].copy()
 norm_config["ara"]["mapping"]["ٱ"] = "ا"
 norm_config["arb"] = norm_config["ara"].copy()
 
-
+# =============== Javanese ===============#
 norm_config["jav"] = norm_config["*"].copy()
 norm_config["jav"]["rm_diacritics"] = True
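The new "=== Language ===" dividers make the override pattern easier to scan: every language entry starts from a copy of the default "*" config and then adjusts one field. An illustrative sketch of extending the file the same way; the "nld" code and the extra character here are hypothetical, not part of this commit:

from utils.norm_config import norm_config

# =============== Dutch (hypothetical example) ===============#
norm_config["nld"] = norm_config["*"].copy()
# e.g. also delete soft hyphens, mirroring the Mongolian entry
norm_config["nld"]["del_set"] += r"\u00AD"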
utils/text_norm.py CHANGED
@@ -5,7 +5,9 @@ import unicodedata
 from utils.norm_config import norm_config
 
 
-def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False):
+def text_normalize(
+    text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False
+):
 
     """Given a text, normalize it by changing to lower case, removing punctuations, removing words that only contain digits and removing extra spaces
 
@@ -15,17 +17,23 @@ def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_
     remove_numbers : Boolean flag to specify if words containing only digits should be removed
 
     Returns:
-    normalized_text : the string after all normalization
+        normalized_text : the string after all normalization
 
     """
 
     config = norm_config.get(iso_code, norm_config["*"])
 
-    for field in ["lower_case", "punc_set", "del_set", "mapping", "digit_set", "unicode_norm"]:
+    for field in [
+        "lower_case",
+        "punc_set",
+        "del_set",
+        "mapping",
+        "digit_set",
+        "unicode_norm",
+    ]:
         if field not in config:
             config[field] = norm_config["*"][field]
 
-
     text = unicodedata.normalize(config["unicode_norm"], text)
 
     # Convert to lower case
@@ -34,7 +42,7 @@ def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_
         text = text.lower()
 
     # brackets
-
+
     # always text inside brackets with numbers in them. Usually corresponds to "(Sam 23:17)"
     text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
     if remove_brackets:
@@ -84,9 +92,10 @@ def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_
 
     if config["rm_diacritics"]:
         from unidecode import unidecode
+
         normalized_text = unidecode(normalized_text)
 
     # Remove extra spaces
     normalized_text = re.sub(r"\s+", " ", normalized_text).strip()
 
-    return normalized_text
+    return normalized_text
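These hunks are formatting only; text_normalize still lowercases, strips punctuation and digit-only words, drops bracketed spans that contain digits (verse references like "(Sam 23:17)"), and collapses whitespace. A hedged usage sketch; the exact output depends on the punctuation lists shipped with the repo, so the result shown is only indicative:

from utils.text_norm import text_normalize

text = "Hello, World! (Sam 23:17) He said 42 words."
normalized = text_normalize(text, iso_code="eng")
# Indicative result: "hello world he said words"
print(normalized)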
zeroshot.py CHANGED
@@ -34,7 +34,7 @@ class MY_LOG:
     def __init__(self):
         self.text = "[START]"
 
-    def add(self, new_log, new_line=
+    def add(self, new_log, new_line=True):
         self.text = self.text + ("\n" if new_line else " ") + new_log
         self.text = self.text.strip()
         return self.text
@@ -127,7 +127,9 @@ def process(
         audio_samples = (audio_samples / 32768.0).astype(float)
 
         if sr != ASR_SAMPLING_RATE:
-            audio_samples = librosa.resample(
+            audio_samples = librosa.resample(
+                audio_samples, orig_sr=sr, target_sr=ASR_SAMPLING_RATE
+            )
     else:
         # file upload
         assert isinstance(audio_data, str)
@@ -179,15 +181,14 @@ def process(
         # print(k, v)
     yield transcription, logs.add(f"Leixcon size: {len(lexicon)}")
 
-    # Input could be sentences OR list of words. Check if atleast one word has a count > 1 to diffentiate
-    tmp_file = tempfile.NamedTemporaryFile()
+    # Input could be sentences OR list of words. Check if atleast one word has a count > 1 to diffentiate
+    tmp_file = tempfile.NamedTemporaryFile()  # could be used for LM
     if autolm and any([cnt > 2 for cnt in word_counts.values()]):
         yield transcription, logs.add(f"Creating unigram LM...", False)
-        lm_path = tmp_file.name
+        lm_path = tmp_file.name
         create_unigram_lm(word_counts, num_sentences, lm_path)
         yield transcription, logs.add(f"OK")
 
-
    if lm_path is None:
         yield transcription, logs.add(f"Filtering lexicon....")
         lexicon = filter_lexicon(lexicon, word_counts)
@@ -195,8 +196,8 @@ def process(
             f"Ok. Leixcon size after filtering: {len(lexicon)}"
         )
     else:
-        # kenlm throws an error if unigram LM is being used
-        # HACK: generate a bigram LM from unigram LM and a dummy bigram to trick it
+        # kenlm throws an error if unigram LM is being used
+        # HACK: generate a bigram LM from unigram LM and a dummy bigram to trick it
         maybe_generate_pseudo_bigram_arpa(lm_path)
 
     # for k, v in lexicon.items():
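This hunk is the "resampling fix" the commit is named for: the librosa.resample call now passes both sample rates as keyword arguments across multiple lines. In librosa 0.10 and later, orig_sr and target_sr are keyword-only, so passing them positionally fails. A minimal standalone sketch of the fixed call, using a synthetic signal in place of the uploaded audio; ASR_SAMPLING_RATE = 16000 is an assumption here, not quoted from this diff:

import numpy as np
import librosa

ASR_SAMPLING_RATE = 16_000  # assumed target rate for the ASR model
sr = 44_100                 # e.g. sample rate of an uploaded recording
audio_samples = np.random.randn(sr * 2).astype(np.float32)  # 2 s of fake audio

if sr != ASR_SAMPLING_RATE:
    # librosa >= 0.10 requires keyword arguments here; a positional call
    # like librosa.resample(y, sr, target) raises a TypeError.
    audio_samples = librosa.resample(
        audio_samples, orig_sr=sr, target_sr=ASR_SAMPLING_RATE
    )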