# %%
import nltk
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter, NLTKTextSplitter
from langchain.document_loaders import OnlinePDFLoader, UnstructuredPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import LlamaCppEmbeddings, HuggingFaceInstructEmbeddings
from chromadb.config import Settings
import chromadb
from chromadb.utils import embedding_functions
from hashlib import sha256
import cloudpickle
import logging
import os
from load_model import load_embedding
import torch
import re
import pathlib
import tempfile

current_path = str(pathlib.Path(__file__).parent.resolve())
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
nltk.download('punkt')
persist_directory = current_path + "/VectorStore"
logger = logging.getLogger()


# %%
def create_collection(collection_name, model_name, client):
    """Create (or fetch) a Chroma collection with an Instructor embedding function. Not used at the moment."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    ef = embedding_functions.InstructorEmbeddingFunction(
        model_name=model_name, device=device
    )
    client.get_or_create_collection(collection_name, embedding_function=ef)
    return True


def create_and_add(collection_name, sub_docs, model_name):
    """Embed the given document chunks and persist them into a Chroma collection."""
    client_settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=persist_directory,
        anonymized_telemetry=False,
    )
    client = chromadb.Client(client_settings)
    # collection_name = collection_name + "_" + re.sub('[^A-Za-z0-9]+', '', model_name)
    embeddings = load_embedding(model_name)
    logger.info(f"Adding documents to {collection_name}")
    vectorstore = Chroma(
        collection_name=collection_name,
        embedding_function=embeddings,
        client_settings=client_settings,
        persist_directory=persist_directory,
    )
    vectorstore.add_documents(documents=sub_docs, embedding=embeddings)
    vectorstore.persist()

    # Sanity check: reopen the persisted store and run a test query against it.
    vectorstore2 = Chroma(
        collection_name=collection_name,
        embedding_function=embeddings,
        client_settings=client_settings,
        persist_directory=persist_directory,
    )
    print(
        vectorstore2.similarity_search_with_score(query="What are AXAs green Goals?", k=4)
    )
    return vectorstore


def load_from_file(files):
    """Write uploaded file objects to a temporary directory and load them as PDF documents."""
    saved_files = []
    docs = []
    with tempfile.TemporaryDirectory() as tmpdirname:
        temp_dir = pathlib.Path(tmpdirname)
        for file in files:
            file_name = os.path.join(temp_dir, file.name)
            saved_files.append(file_name)
            with open(file_name, mode='wb') as w:
                w.write(file.read())
        logger.debug(saved_files)
        # Parse the PDFs while the temporary files still exist.
        loaders = [UnstructuredPDFLoader(pdf) for pdf in saved_files]
        logger.debug(loaders)
        for loader in loaders:
            docs.extend(loader.load())
    return docs


def load_from_web(urls, cache=True):
    """Download PDFs from the given URLs, caching the parsed documents on disk."""
    docs_list = urls
    filename = f"./{sha256(str(urls).encode('utf-8')).hexdigest()}.pkl"
    is_file = os.path.isfile(filename)
    if cache and is_file:
        logger.info("Using Cache")
        with open(filename, "rb") as pikd:
            docs = cloudpickle.load(pikd)
    else:
        loaders = [OnlinePDFLoader(pdf) for pdf in docs_list]
        docs = []
        for loader in loaders:
            docs.extend(loader.load())
        with open(filename, 'wb') as output:
            cloudpickle.dump(docs, output)
    # Update metadata so each document keeps a reference to its source URL.
    for i, doc in enumerate(docs):
        doc.metadata = {'source': docs_list[i], 'url': docs_list[i], 'company': 'AXA'}
    return docs


def load_and_split(docs, chunk_size=700):
    """Split documents into sentence-aware chunks using the NLTK text splitter."""
    text_splitter = NLTKTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    sub_docs = text_splitter.split_documents(docs)
    return sub_docs
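

# %%
# Usage sketch (not part of the original module): illustrates the intended
# pipeline of load_from_web -> load_and_split -> create_and_add. The URL,
# collection name, and model name below are hypothetical placeholders; pass
# whatever model identifier your load_embedding helper expects.
if __name__ == "__main__":
    example_urls = ["https://example.com/sustainability-report.pdf"]  # hypothetical URL
    docs = load_from_web(example_urls)
    sub_docs = load_and_split(docs, chunk_size=700)
    create_and_add(
        collection_name="example_collection",   # hypothetical collection name
        sub_docs=sub_docs,
        model_name="hkunlp/instructor-large",   # assumption: any model accepted by load_embedding
    )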