import numpy as np

# Characters treated as separators: each one ends the current token, and every
# separator except the space is also emitted as a standalone one-character token.
s = " `1234567890-=~!@#$%^&*()_+[;,{:<];.}:>\\'/|\"?\n–№…«»→"


def split(text):
    """Split ``text`` into word tokens and single-character separator tokens."""
    o = []
    t = ""
    # The appended space acts as a sentinel so the final word gets flushed.
    for i in text + " ":
        if i in s:
            # Flush the word accumulated so far, if any.
            if t != "":
                o.append(t)
                t = ""
            # Separators other than the space become tokens themselves.
            if i != " ":
                o.append(i)
        else:
            t += i
    return o


def tokenize_2str(text: str):
    """Tokenize ``text`` into strings, splitting an "es" suffix off as the
    special "<es>" token."""
    tokens = split(text)
    o = []
    for i in tokens:
        # The length guard keeps a bare "es" token intact instead of
        # emitting an empty stem (a latent bug in the unguarded check).
        if len(i) > 2 and i[-2:] == "es":
            o.append(i[:-2])
            o.append("<es>")
        else:
            o.append(i)
    return o


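# Illustrative behaviour (example inputs of my own, not from the original):
#
#   tokenize_2str("boxes, cats")  ->  ["box", "<es>", ",", "cats"]
#
# "boxes" is reduced to its stem plus the "<es>" marker, and the comma
# becomes a standalone token.

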
# Vocabulary tables, kept in sync: index -> token and token -> index.
# Index 0 is padding, 1 is the unknown-token fallback, 2 is the "es" suffix.
ind2text = ["<NULL>", "<UNK>", "<es>"]
text2ind = {"<NULL>": 0, "<UNK>": 1, "<es>": 2}


def fit_on_text(text: str):
    """Add every previously unseen token of ``text`` to the vocabulary.

    Mutates the module-level ``ind2text``/``text2ind`` tables in place,
    so no ``global`` declarations are needed.
    """
    for i in tokenize_2str(text):
        # Membership is checked against the dict (O(1)) rather than the list.
        if i not in text2ind:
            ind2text.append(i)
            text2ind[i] = len(ind2text) - 1


def fit_on_texts(texts):
    """Fit the vocabulary on an iterable of texts."""
    for text in texts:
        fit_on_text(text)


def tokenize(text: str):
    """Encode ``text`` as an array of vocabulary indices."""
    o = []
    for i in tokenize_2str(text):
        # Look up the index directly in the dict; a miss falls back to <UNK>.
        o.append(text2ind.get(i, text2ind["<UNK>"]))
    return np.array(o)
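

# Minimal usage sketch (my addition; the corpus and sentence are made-up
# examples, not part of the original module).
if __name__ == "__main__":
    fit_on_texts(["the cat sees boxes", "the dog"])
    print(ind2text)                        # vocabulary grown from the corpus
    print(tokenize("the cat sees a dog"))  # "a" was never fitted -> <UNK> (1)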