AlterM ierhon committed on
Commit
16d5400
0 Parent(s):

Duplicate from RisticksAI/Zaglyt2-transformer-test

Co-authored-by: - - - <ierhon@users.noreply.huggingface.co>

Files changed (8)
  1. .gitattributes +34 -0
  2. README.md +13 -0
  3. app.py +14 -0
  4. m_conf.py +3 -0
  5. net.py +82 -0
  6. requirements.txt +5 -0
  7. train.txt +0 -0
  8. word_emb.py +15 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Zaglyt2 Transformer Test
+ emoji: 🚀
+ colorFrom: pink
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 3.33.1
+ app_file: app.py
+ pinned: false
+ duplicated_from: RisticksAI/Zaglyt2-transformer-test
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,14 @@
+ import gradio as gr
+ import net
+
+ def generate(text):
+     o = text
+     r = []
+     for i in range(5):
+         t = net.gen(o)
+         o += " " + t
+         r.append(t)
+     return text + " *" + ' '.join(r) + "*"
+
+ iface = gr.Interface(fn=generate, inputs="text", outputs="text")
+ iface.launch()
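For reference, a minimal sketch (not part of the commit) of what a single request does: importing net runs the dataset preparation and model.fit defined at module level in net.py, and each call to generate then extends the prompt by five tokens, one net.gen call per token, returning the continuation wrapped in asterisks.

# Hypothetical smoke test of the same loop, without starting the Gradio server.
# Assumes train.txt and w2v.model are available so that `import net` succeeds.
import net   # note: this triggers the data load and model training at import time

o = "hello world"
r = []
for _ in range(5):
    t = net.gen(o)   # predict one more token from the current context
    o += " " + t
    r.append(t)
print("hello world *" + " ".join(r) + "*")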
m_conf.py ADDED
@@ -0,0 +1,3 @@
+ input_length = 20
+ emb_dim = 128
+ emb_o_dim = 256
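For context (an observation, not part of the commit), these three values tie net.py and word_emb.py together:

# input_length -> how many token ids are fed to the Keras model per example (pad_sequences / Embedding in net.py)
# emb_dim      -> width of the Keras Embedding layer in net.py
# emb_o_dim    -> both the Word2Vec vector_size in word_emb.py and the width of the final Dense layer in net.py,
#                 which is what lets the model's output be matched against Word2Vec vectors in find_closest_token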
net.py ADDED
@@ -0,0 +1,82 @@
+ import word_emb
+ from m_conf import *
+ import numpy as np
+ from gensim.models import Word2Vec
+ from tensorflow.keras.models import Sequential
+ from tensorflow.keras.layers import Dense, Dropout, Flatten, Embedding
+ from keras_self_attention import SeqSelfAttention, SeqWeightedAttention
+ from tensorflow.keras.optimizers import Adam
+ from tensorflow.keras.preprocessing.text import Tokenizer
+ from tensorflow.keras.losses import MeanSquaredError
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+ w2v = Word2Vec.load("w2v.model")
+
+ # load the dataset
+ with open('train.txt', 'r') as file:
+     text = file.readlines()
+
+ # create the Tokenizer
+ tokenizer = Tokenizer()
+ # fit the Tokenizer on the text from train.txt
+ tokenizer.fit_on_texts(text)
+
+ # convert the text data into sequences of integer ids with the tokenizer
+ tt = tokenizer.texts_to_sequences(text)
+
+ t_sw = [[line[i:i+input_length] for i in range(len(line))] for line in tt]
+
+ combined_list = []
+
+ for line in t_sw:
+     combined_list.extend(line)
+
+ y_t = [[w2v.wv[str(token)] for token in line] for line in tt]
+
+ y = []
+ for line in y_t:
+     y.extend(line)
+
+ # pad the inputs to input_length, filling the gaps with zeros
+ X = pad_sequences(combined_list, maxlen=input_length, padding='pre')
+
+ # get the number of tokens in the text
+ vocab_size = len(tokenizer.word_index)
+
+ # build the model and set its parameters
+ model = Sequential()
+ emb = Embedding(input_dim=vocab_size+1, output_dim=emb_dim, input_length=input_length)
+ model.add(emb)
+ model.add(SeqWeightedAttention())
+ model.add(Flatten())
+ model.add(Dense(512, activation="tanh"))
+ model.add(Dropout(0.5))
+ model.add(Dense(256, activation="tanh"))
+ model.add(Dropout(0.5))
+ model.add(Dense(128, activation="tanh"))
+ model.add(Dense(emb_o_dim, activation="tanh"))
+
+ # compile the model with MSE loss and report accuracy
+ model.compile(optimizer=Adam(learning_rate=0.001), loss="mse", metrics=["accuracy"])
+
+ # train the model
+ set_limit = 2000
+ model.fit(np.array(X[:set_limit]), np.array(y[:set_limit]), epochs=10, batch_size=4)
+
+ def find_closest_token(o, temperature=0.0, top_p=1):
+     token_distances = []
+     for token in w2v.wv.index_to_key:
+         vector = w2v.wv[token]
+         distance = np.sum((o - vector)**2)
+         token_distances.append((token, distance))
+
+     token_distances = sorted(token_distances, key=lambda x: x[1])
+     closest_token = token_distances[0][0]
+
+     return closest_token
+
+ def gen(text):
+     # convert the text into the form the network expects
+     inp = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=input_length, padding='pre')
+     # make a prediction and return it
+     return str(tokenizer.index_word[int(find_closest_token(model.predict(inp)[0]))])
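Worth noting: find_closest_token accepts temperature and top_p but never uses them, so decoding is always greedy, and it does a brute-force squared-Euclidean scan over the whole Word2Vec vocabulary. A sketch (not part of the commit) of the same greedy lookup using gensim's built-in search, which ranks by cosine similarity rather than Euclidean distance:

# Hypothetical alternative to the manual loop above; uses gensim's
# KeyedVectors.similar_by_vector, which returns [(token, similarity), ...].
def find_closest_token_gensim(o):
    return w2v.wv.similar_by_vector(o, topn=1)[0][0]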
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ numpy
+ gensim
+ tensorflow
+ keras
+ keras_self_attention
train.txt ADDED
The diff for this file is too large to render. See raw diff
 
word_emb.py ADDED
@@ -0,0 +1,15 @@
+ from m_conf import *
+ from keras.preprocessing.text import Tokenizer
+ from gensim.models import Word2Vec
+
+ with open('train.txt', 'r') as file:
+     lines = file.readlines()
+
+ tokenizer = Tokenizer()
+ tokenizer.fit_on_texts(lines)
+ sequences = tokenizer.texts_to_sequences(lines)
+ tokens = [[str(i) for i in seq] for seq in sequences]
+
+ model = Word2Vec(tokens, window=3, min_count=1, vector_size=emb_o_dim)
+
+ model.save("w2v.model")
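Note that w2v.model is not among the eight files in this commit, while net.py loads it at import time, so word_emb.py presumably has to be run once before the app starts. Both scripts fit a fresh Tokenizer on the same train.txt, so the token ids line up between them. A hedged sketch of the expected bootstrap order (an assumption, not part of the commit):

# Hypothetical bootstrap: build the embeddings first, then let the app train and serve.
import subprocess
subprocess.run(["python", "word_emb.py"], check=True)  # writes w2v.model next to train.txt

import net            # loads w2v.model, re-fits the Tokenizer on train.txt, trains the Keras model
print(net.gen("hello"))  # hypothetical smoke test of a single prediction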