File size: 1,681 Bytes
eaf119d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from pymilvus import connections, utility, DataType, FieldSchema, CollectionSchema, Collection
from sentence_transformers import SentenceTransformer
from pyvi import ViTokenizer
import string
import json

def load_json(path):
    """Read the UTF-8 encoded JSON file at *path* and return the parsed object."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

def convert_query(query):
    """Lower-case *query* and return the result of ViTokenizer.tokenize on it."""
    lowered = query.lower()
    return ViTokenizer.tokenize(lowered)

def load_stopword(path):
    """Return the lines of the UTF-8 text file at *path* as a list.

    Each line is stripped of leading and trailing whitespace; blank lines
    are kept and show up as empty strings.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]

def remove_stop_words(path, split_prompts):
    """Return the words of *split_prompts* that are not stop words.

    Args:
        path: Path of the stop-word file, read with ``load_stopword``.
        split_prompts: Iterable of word strings to filter.

    Returns:
        A list, in the original order, of every word not found in the
        stop-word file. Membership is tested on the word exactly as given;
        the surviving words are whitespace-stripped before being returned.
    """
    # Build the lookup once: hashing makes each membership test O(1)
    # instead of scanning the whole stop-word list for every word.
    stop_words = frozenset(load_stopword(path))
    return [word.strip() for word in split_prompts if word not in stop_words]

def clean_query(path, query):
    """Turn a raw query string into a de-duplicated list of content words.

    The query is lower-cased, tokenized with ViTokenizer, split on single
    spaces, stripped of punctuation-only tokens, filtered through the
    stop-word file at *path*, and finally de-duplicated.

    Args:
        path: Path of the stop-word file passed on to ``remove_stop_words``.
        query: The raw query string.

    Returns:
        A list of the remaining words, unique, in first-occurrence order.
    """
    vi_query = ViTokenizer.tokenize(query.lower())
    punctuation_chars = set(string.punctuation)
    # Drop tokens made up solely of punctuation characters. A plain
    # `token in string.punctuation` is a substring test, so it lets
    # multi-character runs such as "..." or "->" slip through. The empty
    # token produced by consecutive spaces is dropped as well.
    # Underscores inside kept tokens become spaces (presumably pyvi joins
    # the syllables of a compound word with '_' - confirm against pyvi).
    words = [
        token.replace('_', ' ')
        for token in vi_query.split(' ')
        if not punctuation_chars.issuperset(token)
    ]
    content_words = remove_stop_words(path, words)
    # dict.fromkeys removes duplicates while preserving insertion order.
    return list(dict.fromkeys(content_words))

def load_model(model_name):
    """Construct and return a SentenceTransformer for *model_name*."""
    return SentenceTransformer(model_name)

def connect_vector_db(host='localhost', port='19530'):
    """Register the 'default' pymilvus connection and print a confirmation.

    Args:
        host: Address of the Milvus server. Defaults to 'localhost', the
            value that used to be hard-coded.
        port: Port of the Milvus server. Defaults to '19530', the value
            that used to be hard-coded.
    """
    # The alias stays 'default' so Collection objects created without an
    # explicit `using` argument pick this connection up.
    connections.connect('default', host=host, port=port)
    print("Connect finished!")

def load_collection(collection_name):
    """Open the collection called *collection_name*, load it, and return it.

    Creates a pymilvus Collection for the name, calls its ``load()`` method,
    and prints a confirmation line before returning the Collection object.
    """
    loaded = Collection(collection_name)
    loaded.load()
    print(f"{collection_name} load complete!")
    return loaded