Spaces:

robinhad
/

ukrainian-stt

Running

App Files Files Community

Yurii Paniv committed on Mar 31, 2021

Commit

8451e68

•

1 Parent(s): b3b12c1

Add text converting scripts for scorer

Browse files

Files changed (3) hide show

.gitignore +3 -1
scripts/extract_text_corpus.py +51 -0
scripts/wiki_import.py +51 -0

.gitignore CHANGED Viewed

@@ -129,4 +129,6 @@ dmypy.json
 .pyre/
 *.tflite
-.DS_Store

 .pyre/
 *.tflite
+.DS_Store
+/data

scripts/extract_text_corpus.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import os
+import nltk
+import re
+# Build a plain-text corpus from every file under FOLDER: one cleaned,
+# lower-cased sentence per line, appended to OUT_FILE.
+nltk.download("punkt")
+FOLDER = "../data/текст/"
+OUT_FILE = "../data/texts.txt"
+tokenizer = nltk.SpaceTokenizer()
+# "." does not match newlines here, so this removes everything from the first
+# "(" to the last ")" on each line.
+paranthesis_regex = re.compile(r'\(.*\)')
+# Characters a kept word may consist of; a frozenset makes the per-character
+# membership test O(1) instead of a list scan.
+allowed_chars = frozenset(["а", "б", "в", "г", "ґ", "д", "е", "є", "ж", "з", "и", "і", "ї", "й", "к", "л",
+                           "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ь", "ю", "я", "-", "'"])
+# The context managers guarantee that both the output file and every input
+# file are closed, including when a file is skipped or an error is raised.
+with open(OUT_FILE, mode="a", encoding="utf-8") as text_file:
+    for subdir, dirs, files in os.walk(FOLDER):
+        for file in files:
+            file_path = os.path.join(subdir, file)
+            print(file_path)
+            try:
+                with open(file_path, encoding="utf-8") as input_file:
+                    cleaned_text = input_file.read()
+            except UnicodeDecodeError:
+                # Not valid UTF-8: retry with the cp1251 fallback encoding.
+                with open(file_path, encoding="cp1251") as input_file:
+                    cleaned_text = input_file.read()
+            cleaned_text = cleaned_text.lower()
+            cleaned_text = paranthesis_regex.sub('', cleaned_text)
+            out_text = []
+            # Sentences are approximated by splitting on full stops.
+            for text in cleaned_text.strip().split("."):
+                words = tokenizer.tokenize(text.strip())
+                # Keep purely alphanumeric tokens that are not numbers and are
+                # longer than one character.
+                words = [i for i in words
+                         if i.isalnum() and not i.isdigit() and len(i) > 1]
+                # Drop the whole sentence if nothing is left or if any word
+                # contains a character outside the allowed alphabet.
+                if not words:
+                    continue
+                if any(j not in allowed_chars for i in words for j in i):
+                    continue
+                out_text.append(" ".join(words))
+            if not out_text:
+                continue
+            text_file.write("\n".join(out_text) + "\n")

scripts/wiki_import.py ADDED Viewed

	@@ -0,0 +1,51 @@

+from wiki_dump_reader import Cleaner, iterate
+from os import remove
+import nltk
+import re
+# Convert the Wikipedia XML dump into a plain-text corpus: one cleaned,
+# lower-cased sentence per line, written to ../data/wiki_text.txt.
+nltk.download("punkt")
+try:
+    # Start from a clean output file on every run.
+    remove("../data/wiki_text.txt")
+except FileNotFoundError:
+    # First run: there is no previous output to discard.
+    pass
+tokenizer = nltk.SpaceTokenizer()
+# "." does not match newlines here, so this removes everything from the first
+# "(" to the last ")" on each line.
+paranthesis_regex = re.compile(r'\(.*\)')
+# Characters a kept word may consist of; a frozenset makes the per-character
+# membership test O(1) instead of a list scan.
+allowed_chars = frozenset(["а", "б", "в", "г", "ґ", "д", "е", "є", "ж", "з", "и", "і", "ї", "й", "к", "л",
+                           "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ь", "ю", "я", "-", "'"])
+cleaner = Cleaner()
+# The context manager guarantees the output file is flushed and closed even
+# if processing the dump raises part-way through.
+with open("../data/wiki_text.txt", mode="a", encoding="utf-8") as text_file:
+    for title, text in iterate('../data/ukwiki-20210320-pages-articles-multistream.xml'):
+        text = cleaner.clean_text(text)
+        cleaned_text, _ = cleaner.build_links(text)
+        cleaned_text = cleaned_text.lower()
+        cleaned_text = cleaned_text.replace("&nbsp;", " ")
+        # Expand common abbreviations before splitting on ".", otherwise their
+        # full stops would be treated as sentence boundaries.
+        cleaned_text = cleaned_text.replace("н. е.", "нашої ери")
+        cleaned_text = cleaned_text.replace("ім.", "імені")
+        cleaned_text = cleaned_text.replace("див.", "дивись")
+        cleaned_text = paranthesis_regex.sub('', cleaned_text)
+        out_text = []
+        # Sentences are approximated by splitting on full stops.
+        for sentence in cleaned_text.strip().split("."):
+            sentence = sentence.strip()
+            if sentence.endswith(", що вивчає"):
+                continue
+            if sentence.startswith(("redirect", "перенаправлення")):
+                continue
+            words = tokenizer.tokenize(sentence)
+            # Keep purely alphanumeric tokens that are not numbers and are
+            # longer than one character.
+            words = [i for i in words
+                     if i.isalnum() and not i.isdigit() and len(i) > 1]
+            # Drop the whole sentence if nothing is left or if any word
+            # contains a character outside the allowed alphabet.
+            if not words:
+                continue
+            if any(j not in allowed_chars for i in words for j in i):
+                continue
+            out_text.append(" ".join(words))
+        if not out_text:
+            continue
+        text_file.write("\n".join(out_text) + "\n")