Yurii Paniv commited on
Commit
8451e68
1 Parent(s): b3b12c1

Add text converting scripts for scorer

Browse files
.gitignore CHANGED
@@ -129,4 +129,6 @@ dmypy.json
129
  .pyre/
130
 
131
  *.tflite
132
- .DS_Store
 
 
 
129
  .pyre/
130
 
131
  *.tflite
132
+ .DS_Store
133
+
134
+ /data
scripts/extract_text_corpus.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Extract and normalize Ukrainian sentences from a folder of text files.

Walks FOLDER recursively, reads each file as UTF-8 (falling back to cp1251
for legacy Windows-encoded files), splits the text into '.'-delimited
sentences, keeps only sentences whose words consist entirely of allowed
Ukrainian characters, and appends one cleaned sentence per line to OUT_FILE.
"""
import os
import re

FOLDER = "../data/текст/"
OUT_FILE = "../data/texts.txt"

paranthesis_regex = re.compile(r'\(.*\)')

# Characters permitted inside output words (lowercase Ukrainian letters,
# hyphen, apostrophe).  A set gives O(1) membership tests in the
# per-character filter below (the original used a list: O(n) per char).
allowed_chars = {
    "а", "б", "в", "г", "ґ", "д", "е", "є", "ж", "з", "и", "і", "ї", "й",
    "к", "л", "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч",
    "ш", "щ", "ь", "ю", "я", "-", "'",
}


def clean_sentences(raw_text):
    """Return a list of cleaned sentences extracted from *raw_text*.

    Lowercases the text, strips parenthesized spans, splits on '.', and for
    each sentence keeps only space-separated tokens that are alphanumeric,
    not pure digits, and longer than one character.  A sentence is dropped
    entirely if any surviving word contains a character outside
    ``allowed_chars`` (i.e. non-Ukrainian text) or if no words survive.

    :param raw_text: full text of one input file
    :return: list of cleaned sentence strings (may be empty)
    """
    text = paranthesis_regex.sub('', raw_text.lower()).strip()
    out_text = []
    for sentence in text.split("."):
        # Splitting on a single space matches nltk.SpaceTokenizer, which is
        # documented as equivalent to s.split(' ') — the nltk dependency of
        # the original script is therefore unnecessary.
        words = sentence.strip().split(" ")
        words = [w for w in words if w.isalnum() and not w.isdigit() and len(w) > 1]
        if not words:
            continue
        if any(any(ch not in allowed_chars for ch in w) for w in words):
            continue
        out_text.append(" ".join(words))
    return out_text


def read_text(file_path):
    """Read *file_path* as UTF-8, falling back to cp1251 on decode errors.

    The original caught every exception with a bare ``except:``; only a
    decoding failure should trigger the legacy-encoding retry.
    """
    try:
        with open(file_path, encoding="utf-8") as input_file:
            return input_file.read()
    except UnicodeDecodeError:
        with open(file_path, encoding="cp1251") as input_file:
            return input_file.read()


def main():
    """Walk FOLDER and append cleaned sentences from every file to OUT_FILE."""
    # ``with`` guarantees the output handle is closed even if a read fails.
    with open(OUT_FILE, mode="a") as text_file:
        for subdir, dirs, files in os.walk(FOLDER):
            for file in files:
                file_path = os.path.join(subdir, file)
                print(file_path)  # progress indicator
                sentences = clean_sentences(read_text(file_path))
                if not sentences:
                    continue
                text_file.write("\n".join(sentences) + "\n")


if __name__ == "__main__":
    main()
scripts/wiki_import.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Convert a Ukrainian Wikipedia XML dump into a plain-text scorer corpus.

Iterates over the articles of a local ukwiki dump, cleans the wiki markup,
expands a few common abbreviations, splits articles into '.'-delimited
sentences, and writes one cleaned sentence per line to
``../data/wiki_text.txt`` (recreated on every run).
"""
import re
from contextlib import suppress
from os import remove

paranthesis_regex = re.compile(r'\(.*\)')

# Characters permitted inside output words (lowercase Ukrainian letters,
# hyphen, apostrophe).  A set gives O(1) membership tests in the
# per-character filter below (the original used a list: O(n) per char).
allowed_chars = {
    "а", "б", "в", "г", "ґ", "д", "е", "є", "ж", "з", "и", "і", "ї", "й",
    "к", "л", "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч",
    "ш", "щ", "ь", "ю", "я", "-", "'",
}


def clean_article(raw_text):
    """Return a list of cleaned sentences extracted from one article's text.

    Lowercases the text, normalizes non-breaking spaces, expands common
    abbreviations, strips parenthesized spans, splits on '.', and filters
    sentences the same way the corpus extractor does.  Redirect stubs and
    "…, що вивчає"-style truncated definitions are dropped.

    :param raw_text: markup-cleaned article text
    :return: list of cleaned sentence strings (may be empty)
    """
    text = raw_text.lower()
    # NOTE(review): the original replaced what appears to be a non-breaking
    # space (U+00A0) with a plain space — confirm against the source file.
    text = text.replace("\u00a0", " ")
    text = text.replace("н. е.", "нашої ери")
    text = text.replace("ім.", "імені")
    text = text.replace("див.", "дивись")
    text = paranthesis_regex.sub('', text).strip()
    out_text = []
    for sentence in text.split("."):
        sentence = sentence.strip()
        # Truncated "X is a science that studies"-style fragments.
        if sentence.endswith(", що вивчає"):
            continue
        # Redirect pages carry no corpus-worthy text.
        if sentence.startswith(("redirect", "перенаправлення")):
            continue
        # Splitting on a single space matches nltk.SpaceTokenizer, which is
        # documented as equivalent to s.split(' ') — the nltk dependency of
        # the original script is therefore unnecessary.
        words = sentence.split(" ")
        words = [w for w in words if w.isalnum() and not w.isdigit() and len(w) > 1]
        if not words:
            continue
        if any(any(ch not in allowed_chars for ch in w) for w in words):
            continue
        out_text.append(" ".join(words))
    return out_text


def main():
    """Stream the dump, clean every article, and write the corpus file."""
    # Imported lazily so the pure cleaning helper above can be used (and
    # tested) without the third-party wiki_dump_reader package installed.
    from wiki_dump_reader import Cleaner, iterate

    # Fix: the original ``remove(...)`` crashed with FileNotFoundError on
    # the first run, when the output file does not exist yet.
    with suppress(FileNotFoundError):
        remove("../data/wiki_text.txt")

    cleaner = Cleaner()
    # ``with`` guarantees the output handle is closed even on failure.
    with open("../data/wiki_text.txt", mode="a") as text_file:
        for title, text in iterate('../data/ukwiki-20210320-pages-articles-multistream.xml'):
            text = cleaner.clean_text(text)
            linked_text, _ = cleaner.build_links(text)
            sentences = clean_article(linked_text)
            if not sentences:
                continue
            text_file.write("\n".join(sentences) + "\n")


if __name__ == "__main__":
    main()