# basicchatbot-kel / tokenizer.py
import numpy as np
s = " `1234567890-=~!@#$%^&*()_+[;,{:<];.}:>\\'/|\"?\n–№…«»→"


def split(text):
    """Split text into word and punctuation tokens using the separator set `s`."""
    o = []   # output tokens
    t = ""   # word currently being accumulated
    for i in text + " ":  # trailing space flushes the final word
        if i in s:
            if t != "":
                o.append(t)
                t = ""
            if i != " ":
                # Separators other than a plain space are kept as their own tokens.
                o.append(i)
        else:
            t += i
    return o
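
# Illustrative only (not in the original file):
#   split("Hello, worlds!")  ->  ["Hello", ",", "worlds", "!"]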


def tokenize_2str(text: str):
    """Tokenize into strings, splitting a trailing "es" into a separate <es> token."""
    tokens = split(text)
    o = []
    for i in tokens:
        # Length guard so the bare token "es" is not reduced to an empty string.
        if len(i) > 2 and i.endswith("es"):
            o.append(i[:-2])
            o.append("<es>")
        else:
            o.append(i)
    return o
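
# Illustrative only (not in the original file):
#   tokenize_2str("She boxes daily")  ->  ["She", "box", "<es>", "daily"]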


# Vocabulary maps, seeded with the special tokens <NULL>, <UNK> (out-of-vocabulary)
# and <es> (the split "es" suffix).
ind2text = ["<NULL>", "<UNK>", "<es>"]
text2ind = {"<NULL>": 0, "<UNK>": 1, "<es>": 2}


def fit_on_text(text: str):
    """Add every previously unseen token from `text` to the vocabulary."""
    global ind2text
    global text2ind
    tokens = tokenize_2str(text)
    for i in tokens:
        if i not in text2ind:  # dict lookup instead of scanning the list
            ind2text.append(i)
            text2ind[i] = len(ind2text) - 1
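
# Illustrative only (not in the original file): starting from the three seeded special
# tokens, fit_on_text("Hello, worlds!") grows the vocabulary to
#   ["<NULL>", "<UNK>", "<es>", "Hello", ",", "worlds", "!"]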


def fit_on_texts(texts):
    """Fit the vocabulary on an iterable of texts."""
    for text in texts:
        fit_on_text(text)


def tokenize(text: str):
    """Convert text to an array of token ids, mapping unknown tokens to <UNK>."""
    tokens = tokenize_2str(text)
    o = []
    for i in tokens:
        if i in text2ind:  # known token
            o.append(text2ind[i])
        else:
            o.append(text2ind["<UNK>"])
    return np.array(o)
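

# A minimal usage sketch, not part of the original file; the sentences below are made up
# purely to show how the vocabulary is built and how unseen words map to <UNK>.
if __name__ == "__main__":
    fit_on_texts(["Hello, worlds!", "She boxes daily."])
    print(ind2text)                        # vocabulary after fitting
    print(tokenize("Hello, new worlds!"))  # "new" is unseen -> <UNK> id 1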