fruitpicker01 committed on
Commit
6a6025e
1 Parent(s): 677b493

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +116 -0
utils.py CHANGED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import numpy as np
3
+ import pandas as pd
4
+ import pymorphy2
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from transformers import AutoTokenizer, AutoModel
8
+
9
# Module-level singletons, loaded once at import time.
# Russian morphological analyzer used for lemmatization in preprocess_text().
morph = pymorphy2.MorphAnalyzer()
# NOTE(review): from_pretrained() downloads weights on first use — this module
# performs network I/O at import time.
tokenizer = AutoTokenizer.from_pretrained("ai-forever/ru-en-RoSBERTa")
model = AutoModel.from_pretrained("ai-forever/ru-en-RoSBERTa")
12
+
13
def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        embedding1: First vector (any sequence convertible by ``np.asarray``).
        embedding2: Second vector of the same length.

    Returns:
        float: dot(a, b) / (||a|| * ||b||), or 0.0 if either vector has
        zero norm (the original implementation divided by zero here,
        producing nan/inf and a RuntimeWarning).
    """
    embedding1 = np.asarray(embedding1)
    embedding2 = np.asarray(embedding2)

    norm_a = np.linalg.norm(embedding1)
    norm_b = np.linalg.norm(embedding2)
    # Guard against zero-length vectors to avoid division by zero.
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0

    return np.dot(embedding1, embedding2) / (norm_a * norm_b)
22
+
23
def pool(hidden_state, mask, pooling_method="cls"):
    """Pool a transformer's token embeddings into one vector per sequence.

    Args:
        hidden_state: Tensor of shape (batch, seq_len, dim) — the model's
            last hidden state.
        mask: Attention mask of shape (batch, seq_len); 1 for real tokens.
        pooling_method: "cls" to take the first token's embedding,
            "mean" for a mask-weighted average over tokens.

    Returns:
        Tensor of shape (batch, dim).

    Raises:
        ValueError: for an unknown ``pooling_method`` (the original
        implementation silently returned ``None`` here).
    """
    if pooling_method == "mean":
        weights = mask.unsqueeze(-1).float()
        summed = torch.sum(hidden_state * weights, dim=1)
        # Use dim= consistently (original mixed axis= and dim=); clamp so an
        # all-padding row yields zeros instead of nan from 0/0.
        counts = torch.clamp(mask.sum(dim=1, keepdim=True).float(), min=1e-9)
        return summed / counts
    if pooling_method == "cls":
        return hidden_state[:, 0]
    raise ValueError(f"unknown pooling_method: {pooling_method!r}")
30
+
31
def text_to_embedding(text, tokenizer, model):
    """Encode ``text`` into an L2-normalized sentence embedding.

    Args:
        text: Input string (or batch of strings) to embed.
        tokenizer: Hugging Face tokenizer matching ``model``.
        model: Hugging Face encoder model.

    Returns:
        numpy.ndarray of shape (batch, dim) with unit-norm rows.
    """
    # Tokenize, truncating to the model's 512-token limit.
    encoded = tokenizer(
        text, max_length=512, padding=True, truncation=True, return_tensors="pt"
    )

    # Inference only — no gradients needed.
    with torch.no_grad():
        model_output = model(**encoded)

    # CLS pooling by default ("mean" is the alternative).
    pooled = pool(
        model_output.last_hidden_state,
        encoded["attention_mask"],
        pooling_method="cls",
    )

    return F.normalize(pooled, p=2, dim=1).numpy()
46
+
47
def preprocess_text(text):
    """Lemmatize whitespace-separated tokens with pymorphy2.

    Args:
        text: Raw input string.

    Returns:
        str: Space-joined lemmas (empty string when nothing survives).
    """
    lemmas = []
    for word in text.split():
        # Take the most probable morphological parse and its normal form.
        normal = morph.parse(word)[0].normal_form
        # Skip empty/whitespace-only lemmas.
        if normal and normal.strip():
            lemmas.append(normal)
    return " ".join(lemmas) if lemmas else ""
57
+
58
def product_extraction(text):
    """Classify ``text`` into a product label via keyword lookup on its lemmas.

    The rules are checked in order; the first match wins (more specific
    phrases come first), falling back to 'прочее'.

    Args:
        text: Raw input string; it is lemmatized before matching.

    Returns:
        str: The matched product label.
    """
    lemmas = preprocess_text(text)
    # (keywords, label) pairs — any keyword hit selects the label.
    rules = (
        (('кредитный бизнес-',), 'кредитная бизнес-карта'),
        (('выпустить бизнес-карта',), 'бизнес-карта'),
        (('расчётный счёт', 'открыть счёт'), 'расчетный счет'),
        (('бизнес-карта',), 'бизнес-карта'),
        (('бизнес-кешбэк',), 'cashback'),
        (('перевод',), 'переводы'),
        (('кредит',), 'кредит'),
        (('эквайринг',), 'эквайринг'),
        (('зарплатный проект',), 'зарплатный проект'),
        (('вклад',), 'вклад'),
        (('депозит',), 'депозит'),
    )
    for keywords, label in rules:
        if any(keyword in lemmas for keyword in keywords):
            return label
    return 'прочее'
83
+
84
def best_text_choice(texts, core_df, tokenizer, model, coef=1):
    """Rank candidate texts by semantic similarity to a reference corpus.

    Each text is embedded and compared against ``core_df`` rows; a text's
    score blends the top-3 value-weighted similarities within its own
    product type (scaled by ``coef``) with the top-3 across other product
    types, then is normalized by the product's maximum ``value``.

    Args:
        texts: Iterable of candidate strings.
        core_df: DataFrame with columns 'embedding', 'value', 'product_type'.
        tokenizer: Hugging Face tokenizer for ``text_to_embedding``.
        model: Hugging Face encoder for ``text_to_embedding``.
        coef: Weight applied to the same-product component (default 1).

    Returns:
        DataFrame with columns ['Место', 'text'], best text first
        ('Место' is the 1-based rank).
    """
    scoring_list = []
    embeddings_df = core_df.copy()

    texts_df = pd.DataFrame(texts, columns=['texts'])
    texts_df['texts_lower'] = texts_df['texts'].apply(lambda x: x.lower())
    # The model expects the "search_query: " prefix for query-side inputs.
    texts_df['texts_'] = 'search_query: ' + texts_df['texts_lower']
    texts_df['embeddings'] = texts_df['texts_'].apply(
        lambda x: text_to_embedding(x, tokenizer, model)[0]
    )
    texts_df['product'] = texts_df['texts'].apply(product_extraction)

    # NOTE: the original also tracked a running best_text/score pair that was
    # never returned or used — dead code, removed here.
    for _, row in texts_df.iterrows():
        product = row['product']
        embeddings_df['similarity'] = embeddings_df['embedding'].apply(
            lambda x: cosine_similarity(x, row['embeddings'])
        )
        embeddings_df['score'] = embeddings_df['value'] * embeddings_df['similarity']

        # Mean of the top-3 scores among rows of the same product type.
        same_product = (
            embeddings_df
            .sort_values(by=['product_type', 'score'], ascending=[True, False])
            .query('product_type == @product')['score'][:3].mean() * coef
        )
        # Mean of the 3 most-similar rows among other product types.
        other_products = (
            embeddings_df
            .sort_values(by='similarity', ascending=False)
            .query('product_type != @product')['score'][:3].mean()
        )
        score_ = np.mean([same_product, other_products])

        # Normalize by the best attainable value within this product type.
        max_value = embeddings_df.query('product_type == @product')['value'].max()
        scoring_list.append([row['texts'], 100 * score_ / max_value])

    scoring_df = pd.DataFrame(scoring_list, columns=['text', 'score'])
    scoring_df = scoring_df.sort_values(by='score', ascending=False).reset_index(drop=True)
    scoring_df.index += 1  # 1-based ranking
    return scoring_df.reset_index().rename(columns={'index': 'Место'})[['Место', 'text']]