Yurii Paniv committed • Commit 0f120d1 • Parent(s): 72475af

Add instructions for scripts

Files changed:
- scripts/README.md  +19 -0
- scripts/extract_text_corpus.py  +13 -3
- scripts/wiki_import.py  +22 -5
scripts/README.md CHANGED

@@ -10,4 +10,23 @@
 8. Put CV files into dataset files folder
 9. Put dev.csv and test.csv into folder
 
+Note: you can also specify several datasets separated by ",", e.g. dataset1/train.csv,dataset2/train.csv.
+
 You have a reproducible dataset!
+
+
+# Scorer
+
+1. Refer to the DeepSpeech guide for further explanations.
+
+2. Generate the scorer package:
+```
+python3 generate_lm.py --input_txt ../../../voice-recognition-ua/data/all_text.txt --output_dir . \
+  --top_k 500000 --kenlm_bins ../../../voice-recognition-ua/kenlm/build/bin \
+  --arpa_order 5 --max_arpa_memory "85%" --arpa_prune "0|0|1" \
+  --binary_a_bits 255 --binary_q_bits 8 --binary_type trie
+```
+3. Run lm_optimizer to find the best scorer parameters.
+4. Rerun step 2 to generate a new scorer.
+
+Caution: the scorer is very model-dependent, so you'll likely need to adjust it for each model.
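Since the README note above allows a comma-separated list of dataset CSVs, a typo in one path is easy to miss until training fails. Below is a small hypothetical helper, not part of this repository; the script name, the function name, and the assumption that each file uses the usual DeepSpeech CSV header (wav_filename, wav_filesize, transcript) are mine, shown only as a sketch of how such a spec could be sanity-checked first.

```
# Hypothetical helper: validate a comma-separated dataset spec such as
# "dataset1/train.csv,dataset2/train.csv" before starting a long training run.
import csv
import os
import sys

EXPECTED_COLUMNS = {"wav_filename", "wav_filesize", "transcript"}  # assumed standard DeepSpeech header


def check_dataset_spec(spec):
    for path in spec.split(","):
        path = path.strip()
        if not os.path.isfile(path):
            sys.exit(f"missing dataset file: {path}")
        with open(path, newline="", encoding="utf-8") as f:
            header = next(csv.reader(f), [])
        if not EXPECTED_COLUMNS.issubset(header):
            sys.exit(f"{path}: unexpected header {header}")
        print(f"ok: {path}")


if __name__ == "__main__":
    check_dataset_spec(sys.argv[1])
```

Usage would be along the lines of `python3 check_datasets.py dataset1/train.csv,dataset2/train.csv` (the file name is made up for this example).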
scripts/extract_text_corpus.py CHANGED

@@ -1,3 +1,4 @@
+# this script is used for importing random texts from folder and converting it for scorer
 import os
 import nltk
 import re

@@ -32,10 +33,19 @@ for subdir, dirs, files in os.walk(FOLDER):
     text = text.strip()
 
     words = tokenizer.tokenize(text)
-    words = [i for i in words if i.isalnum()]
     words = [i for i in words if not i.isdigit()]
-
-
+    new_words = []
+    for word in words:
+        include = True
+        for letter in word:
+            if word.startswith("-"):
+                word = word[1:]
+            if letter not in allowed_chars:
+                include = False
+        if include:
+            new_words.append(word)
+    words = new_words
+    if all([len(i) <= 1 for i in words]):
         continue
     if len(words) == 0:
         continue
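Both updated scripts add the same token filter before a sentence is written out. As a reading aid, here is a condensed, functionally equivalent sketch of that filter, not the verbatim script code: the real code strips leading hyphens one character at a time inside the letter loop, and its allowed_chars holds the full Ukrainian alphabet plus "-" and "'"; the short alphabet below only keeps the example compact.

```
# Condensed sketch of the filtering added in this commit (illustrative, not the verbatim script code).
ALLOWED_CHARS = set("абвгґдеєжз" + "-'")  # abbreviated: the scripts list the whole Ukrainian alphabet


def filter_tokens(words):
    """Keep tokens made only of allowed characters, with leading hyphens stripped."""
    kept = []
    for word in words:
        if all(letter in ALLOWED_CHARS for letter in word):  # "-" itself is allowed
            kept.append(word.lstrip("-"))
    return kept


def keep_sentence(words):
    """A sentence is skipped when it has no tokens or only single-character tokens."""
    return bool(words) and not all(len(w) <= 1 for w in words)
```

In the scripts this runs after the isdigit() filter, so purely numeric tokens have already been dropped.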
scripts/wiki_import.py CHANGED

@@ -1,12 +1,16 @@
+# this script is used for importing wiki text into scorer format
 from wiki_dump_reader import Cleaner, iterate
 from os import remove
+from os.path import exists
 import nltk
 import re
 nltk.download("punkt")
 
+OUT_PATH = "../data/wiki_text.txt"
 
-
-
+if exists(OUT_PATH):
+    remove(OUT_PATH)
+text_file = open(OUT_PATH, mode="a")
 
 tokenizer = nltk.SpaceTokenizer()
 paranthesis_regex = re.compile(r'\(.*\)')

@@ -14,6 +18,7 @@ allowed_chars = ["а", "б", "в", "г", "ґ", "д", "е", "є", "ж", "з", "и
                  "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ь", "ю", "я", "-", "'"]
 
 cleaner = Cleaner()
+# iter = 0
 for title, text in iterate('../data/ukwiki-20210320-pages-articles-multistream.xml'):
     text = cleaner.clean_text(text)
     cleaned_text, _ = cleaner.build_links(text)

@@ -34,10 +39,19 @@ for title, text in iterate('../data/ukwiki-20210320-pages-articles-multistream.xml'):
         continue
 
     words = tokenizer.tokenize(text)
-    words = [i for i in words if i.isalnum()]
     words = [i for i in words if not i.isdigit()]
-
-
+    new_words = []
+    for word in words:
+        include = True
+        for letter in word:
+            if word.startswith("-"):
+                word = word[1:]
+            if letter not in allowed_chars:
+                include = False
+        if include:
+            new_words.append(word)
+    words = new_words
+    if all([len(i) <= 1 for i in words]):
         continue
     if len(words) == 0:
         continue

@@ -47,5 +61,8 @@ for title, text in iterate('../data/ukwiki-20210320-pages-articles-multistream.xml'):
     if cleaned_text == "":
         continue
     text_file.write(cleaned_text + "\n")
+    # iter += 1
+    # if iter > 5:
+    #     break
 
 text_file.close()
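wiki_import.py now wipes OUT_PATH at startup and appends cleaned sentences as it streams through the dump, with a commented-out iter counter as a quick way to stop after a few articles while debugging. Below is a minimal sketch of that output pattern under those assumptions, using a with-block in place of the script's module-level open()/close() pair; the function name and the limit parameter are mine.

```
# Minimal sketch of the output handling added to wiki_import.py (illustrative, not the verbatim code).
from os import remove
from os.path import exists

OUT_PATH = "../data/wiki_text.txt"

# start every run from an empty corpus file
if exists(OUT_PATH):
    remove(OUT_PATH)


def append_cleaned(sentences, limit=None):
    """Append non-empty cleaned sentences to OUT_PATH; `limit` mirrors the commented-out iter/break debug idiom."""
    written = 0
    with open(OUT_PATH, mode="a") as text_file:
        for cleaned_text in sentences:
            if cleaned_text == "":
                continue
            text_file.write(cleaned_text + "\n")
            written += 1
            if limit is not None and written >= limit:
                break
```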