Danil committed on
Commit
c05a5b4
1 Parent(s): 6a3e9c9

Upload indexer.py

Browse files
Files changed (1) hide show
  1. indexer.py +64 -0
indexer.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import faiss
3
+ import numpy as np
4
+ # from grammar import remove_verbs, clean_text
5
+ from utils import *
6
+ from sentence_transformers import SentenceTransformer
7
+
8
+
9
class FAISS:
    """Flat L2 FAISS index paired with a metadata store and a sentence encoder.

    ``index`` holds the raw vectors. ``vectors`` is keyed by the ids that
    ``index.search`` returns and stores ``(idx, text, pop, text_vec)`` for each
    entry, so search hits can be mapped back to their source records.
    """

    def __init__(self, dimensions: int):
        """Create an empty index for vectors of size ``dimensions``.

        Also instantiates the sentence-transformers model named in
        ``model_name``.
        """
        self.dimensions = dimensions
        self.index = faiss.IndexFlatL2(dimensions)
        self.vectors = {}
        # Key under which the next add() stores its metadata.
        self.counter = 0
        self.model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
        self.sentence_encoder = SentenceTransformer(self.model_name)

    def init_vectors(self, path):
        """Load the metadata store from the pickle file at ``path``.

        ``counter`` is moved past the loaded entries so that a subsequent
        ``add()`` appends instead of overwriting entry 0.
        """
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(path, 'rb') as pkl_file:
            self.vectors = pickle.load(pkl_file)
        # Assumes the stored keys are 0..n-1, as produced by add().
        self.counter = len(self.vectors)

    def init_index(self, path):
        """Replace the current index with the one read from ``path``."""
        self.index = faiss.read_index(path)

    def add(self, text, idx, pop, emb=None):
        """Add one entry to the index and record its metadata.

        Args:
            text: Text of the entry; encoded with the sentence encoder when
                ``emb`` is not supplied.
            idx: Caller-defined identifier stored with the entry.
            pop: Caller-defined value stored with the entry
                (presumably a popularity score; verify against caller).
            emb: Optional precomputed embedding for a single entry. A flat
                1-D vector is promoted to shape ``(1, d)``.
        """
        if emb is None:
            text_vec = self.sentence_encoder.encode([text])
        else:
            # The index expects a 2-D batch; accept a bare 1-D embedding too.
            text_vec = np.atleast_2d(emb)
        self.index.add(text_vec)
        self.vectors[self.counter] = (idx, text, pop, text_vec)
        self.counter += 1

    def search(self, v: np.ndarray, k: int = 10):
        """Return up to ``k`` nearest entries for the first query row of ``v``.

        Args:
            v: Query batch handed to ``index.search``; only the first row's
                results are used.
            k: Number of neighbours requested from the index.

        Returns:
            A list of ``(idx, text, pop, distance)`` tuples in the order the
            index returned them.
        """
        result = []
        distance, item_index = self.index.search(v, k)
        for dist, i in zip(distance[0], item_index[0]):
            # An id of -1 means the index had no further hits; stop there.
            if i == -1:
                break
            entry = self.vectors[i]
            result.append((entry[0], entry[1], entry[2], dist))

        return result

    def suggest_tags(self, query, top_n=10, k=30) -> list:
        """Suggest up to ``top_n`` stored texts for ``query``.

        The lower-cased query is encoded and its ``k`` nearest entries are
        fetched. Hits are filtered with ``check``, ranked, and then a hit is
        kept only if ``sweet_check`` accepts it against every lower-ranked hit.
        ``check`` and ``sweet_check`` come from ``utils`` and are not visible
        here.

        Returns:
            The texts of the surviving hits, best-ranked first.
        """
        emb = self.sentence_encoder.encode([query.lower()])
        hits = self.search(emb, k)

        result = [hit for hit in hits if check(query, hit[1])]
        # TODO: add a weight relative to length.
        # NOTE(review): the key weights x[0] (idx), not x[2] (pop) - confirm
        # that ranking by idx is intended.
        result = sorted(result, key=lambda x: x[0] * 0.3 - x[-1], reverse=True)
        total_result = []
        for pos, hit in enumerate(result):
            flag = True
            for other in result[pos + 1:]:
                flag &= sweet_check(hit[1], other[1])
            if flag:
                total_result.append(hit[1])

        return total_result[:top_n]