Yurii Paniv committed
Commit: 8451e68
Parent: b3b12c1

Add text converting scripts for scorer

Files changed:
- .gitignore +3 -1
- scripts/extract_text_corpus.py +51 -0
- scripts/wiki_import.py +51 -0
.gitignore
CHANGED
@@ -129,4 +129,6 @@ dmypy.json
 .pyre/
 
 *.tflite
-.DS_Store
+.DS_Store
+
+/data
scripts/extract_text_corpus.py
ADDED
@@ -0,0 +1,51 @@
# Walk a folder of text files and append cleaned, one-sentence-per-line
# Ukrainian text to a corpus file for scorer training.
import os
import nltk
import re

nltk.download("punkt")

FOLDER = "../data/текст/"
OUT_FILE = "../data/texts.txt"
text_file = open(OUT_FILE, mode="a")

tokenizer = nltk.SpaceTokenizer()
paranthesis_regex = re.compile(r'\(.*\)')
allowed_chars = ["а", "б", "в", "г", "ґ", "д", "е", "є", "ж", "з", "и", "і", "ї", "й", "к", "л",
                 "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ь", "ю", "я", "-", "'"]

for subdir, dirs, files in os.walk(FOLDER):
    for file in files:
        file_path = os.path.join(subdir, file)
        print(file_path)
        # Read as UTF-8 first; fall back to cp1251 for legacy-encoded files.
        input_file = open(file_path)
        try:
            cleaned_text = input_file.read()
        except UnicodeDecodeError:
            input_file.close()
            input_file = open(file_path, encoding="cp1251")
            cleaned_text = input_file.read()
        cleaned_text = cleaned_text.lower()
        cleaned_text = paranthesis_regex.sub('', cleaned_text)  # drop parenthesized asides
        cleaned_text = cleaned_text.strip()
        cleaned_text = cleaned_text.split(".")
        out_text = []
        for text in cleaned_text:
            text = text.strip()

            words = tokenizer.tokenize(text)
            words = [i for i in words if i.isalnum()]
            words = [i for i in words if not i.isdigit()]
            words = [i for i in words if len(i) > 1]
            # Reject the whole sentence if any word contains a character
            # outside the Ukrainian whitelist.
            if any(any(j not in allowed_chars for j in i) for i in words):
                continue
            if len(words) == 0:
                continue
            out_text.append(" ".join(words))
        cleaned_text = "\n".join(out_text)
        if cleaned_text == "":
            continue
        text_file.write(cleaned_text + "\n")
        input_file.close()


text_file.close()
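For reference, the sentence filter above keeps a sentence only when every
character of every word is in allowed_chars. A minimal standalone sketch of
that check (the sample words are illustrative, not from the corpus):

allowed_chars = set("абвгґдеєжзиіїйклмнопрстуфхцчшщьюя-'")

def passes_filter(words):
    # True only if every character of every word is whitelisted.
    return all(all(ch in allowed_chars for ch in word) for word in words)

print(passes_filter(["тестове", "речення"]))  # True
print(passes_filter(["mixed", "речення"]))    # False: Latin letters are rejected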
scripts/wiki_import.py
ADDED
@@ -0,0 +1,51 @@
# Extract cleaned, one-sentence-per-line Ukrainian text from a Wikipedia dump.
from wiki_dump_reader import Cleaner, iterate
from os import remove
import nltk
import re

nltk.download("punkt")


# Start from a fresh output file (raises FileNotFoundError if it is absent).
remove("../data/wiki_text.txt")
text_file = open("../data/wiki_text.txt", mode="a")

tokenizer = nltk.SpaceTokenizer()
paranthesis_regex = re.compile(r'\(.*\)')
allowed_chars = ["а", "б", "в", "г", "ґ", "д", "е", "є", "ж", "з", "и", "і", "ї", "й", "к", "л",
                 "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ь", "ю", "я", "-", "'"]

cleaner = Cleaner()
for title, text in iterate('../data/ukwiki-20210320-pages-articles-multistream.xml'):
    text = cleaner.clean_text(text)
    cleaned_text, _ = cleaner.build_links(text)
    cleaned_text = cleaned_text.lower()
    # The diff renders both arguments of this replace identically; presumably
    # the original normalizes a non-breaking space to a regular space.
    cleaned_text = cleaned_text.replace("\u00a0", " ")
    # Expand common abbreviations so their trailing "." does not end a sentence:
    cleaned_text = cleaned_text.replace("н. е.", "нашої ери")  # "of our era" (AD)
    cleaned_text = cleaned_text.replace("ім.", "імені")  # "named after"
    cleaned_text = cleaned_text.replace("див.", "дивись")  # "see"
    cleaned_text = paranthesis_regex.sub('', cleaned_text)
    cleaned_text = cleaned_text.strip()
    cleaned_text = cleaned_text.split(".")
    out_text = []
    for text in cleaned_text:
        text = text.strip()
        # Skip sentences left dangling by the cleanup (e.g. "…, що вивчає")
        # as well as redirect pages.
        if text.endswith(", що вивчає"):
            continue
        if text.startswith("redirect") or text.startswith("перенаправлення"):
            continue

        words = tokenizer.tokenize(text)
        words = [i for i in words if i.isalnum()]
        words = [i for i in words if not i.isdigit()]
        words = [i for i in words if len(i) > 1]
        if any(any(j not in allowed_chars for j in i) for i in words):
            continue
        if len(words) == 0:
            continue
        out_text.append(" ".join(words))
    cleaned_text = "\n".join(out_text)
    if cleaned_text == "":
        continue
    text_file.write(cleaned_text + "\n")

text_file.close()
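Worth noting: sentence boundaries come from a plain split on ".", so the
abbreviation expansion above is what keeps "ім." and similar forms from
producing bogus fragments. A small illustration (the sentence is invented):

text = "парк ім. шевченка відкрито у 1900 році."
print(text.split("."))
# ['парк ім', ' шевченка відкрито у 1900 році', '']
print(text.replace("ім.", "імені").split("."))
# ['парк імені шевченка відкрито у 1900 році', '']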