from pymilvus import connections, utility, DataType, FieldSchema, CollectionSchema, Collection from sentence_transformers import SentenceTransformer from pyvi import ViTokenizer import string import json def load_json(path): with open(path, 'r', encoding='utf-8') as file: data = json.load(file) return data def convert_query(query): tokenized_query = ViTokenizer.tokenize(query.lower()) return tokenized_query def load_stopword(path): stop_words = [] with open(path, 'r', encoding='utf-8') as file: for line in file: stop_words.append(line.strip()) return stop_words def remove_stop_words(path, split_prompts): stop_words = load_stopword(path) clean_words = [] for ele in split_prompts: if ele not in stop_words: clean_words.append(ele.strip()) return clean_words def clean_query(path, query): vi_query = ViTokenizer.tokenize(query.lower()) word_query = vi_query.split(' ') #print("word query: ", word_query) query_remove_punc = [word.replace('_', ' ') for word in word_query if word not in string.punctuation] removed_stop_words = remove_stop_words(path, query_remove_punc) removed_stop_words = list(dict.fromkeys(removed_stop_words)) return removed_stop_words def load_model(model_name): model = SentenceTransformer(model_name) return model def connect_vector_db(): connections.connect('default', host='localhost', port='19530') print("Connect finished!") def load_collection(collection_name): collection = Collection(collection_name) collection.load() print(f"{collection_name} load complete!") return collection