# GPT-Docker / app / load_vectors.py
# File-viewer residue (not part of the program): heikowagner's picture ·
# commit "remove data" (4f0dc21) · raw · history blame · 3.81 kB
# %%
import nltk
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter, NLTKTextSplitter
from langchain.document_loaders import OnlinePDFLoader, UnstructuredPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import LlamaCppEmbeddings, HuggingFaceInstructEmbeddings
from chromadb.config import Settings
import chromadb
from chromadb.utils import embedding_functions
from hashlib import sha256
import cloudpickle
import logging
import os
from load_model import load_embedding
import torch
import re
import pathlib
import tempfile
# Directory that contains this module, as an absolute path string.
current_path = str(pathlib.Path(__file__).parent.resolve())
# On-disk location handed to Chroma as its persist directory.
persist_directory = f"{current_path}/VectorStore"

# NOTE(review): presumably set so duplicate OpenMP runtimes do not abort the
# process -- confirm before removing.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Fetch the NLTK "punkt" resource at import time.
nltk.download("punkt")

# Root logger used by the functions in this module.
logger = logging.getLogger()
# %%
def create_collection(collection_name, model_name, client):
    """Fetch or create a collection that uses an Instructor embedding function.

    Flagged by the original author as not used at the moment.

    Args:
        collection_name: Name of the collection to fetch or create.
        model_name: Model identifier passed to ``InstructorEmbeddingFunction``.
        client: Object exposing ``get_or_create_collection``
            (presumably a chromadb client -- verify against callers).

    Returns:
        Always ``True``.
    """
    # Use the GPU only when CUDA is actually available.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    embedder = embedding_functions.InstructorEmbeddingFunction(
        model_name=model_name,
        device=device,
    )
    client.get_or_create_collection(collection_name, embedding_function=embedder)
    return True
def create_and_add(collection_name, sub_docs, model_name):
    """Embed documents into a persisted Chroma collection and return the store.

    The documents are added to the collection, the store is persisted to
    ``persist_directory``, and the collection is then reopened to run one
    sample similarity query whose result is printed.

    Args:
        collection_name: Name of the Chroma collection to write to.
        sub_docs: Documents passed to ``Chroma.add_documents``.
        model_name: Identifier handed to ``load_embedding`` to obtain the
            embedding object.

    Returns:
        The ``Chroma`` vector store the documents were added to.
    """
    client_settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=persist_directory,
        anonymized_telemetry=False,
    )
    embeddings = load_embedding(model_name)

    # Both store handles below are opened with exactly the same arguments;
    # Chroma builds its own client from ``client_settings``, so no separate
    # chromadb client is created here.
    store_kwargs = {
        "collection_name": collection_name,
        "embedding_function": embeddings,
        "client_settings": client_settings,
        "persist_directory": persist_directory,
    }

    logger.info("Adding documents to %s", collection_name)
    vectorstore = Chroma(**store_kwargs)
    vectorstore.add_documents(documents=sub_docs, embedding=embeddings)
    vectorstore.persist()

    # Sanity check: reopen the collection and run a sample query against it.
    reopened = Chroma(**store_kwargs)
    print(reopened.similarity_search_with_score(query="What are AXAs green Goals?", k=4))
    return vectorstore
def load_from_file(files):
    """Load uploaded PDF files into documents.

    Every upload is copied into a temporary directory and parsed with
    ``UnstructuredPDFLoader``. The temporary copies are deleted when the
    function returns.

    Args:
        files: Iterable of binary file-like objects exposing ``name`` and
            ``read()``.

    Returns:
        list: The documents produced by the loaders, in upload order. Empty
        when ``files`` is empty.
    """
    docs = []
    with tempfile.TemporaryDirectory() as tmpdirname:
        saved_files = []
        for file in files:
            # Keep only the final path component: an absolute or nested
            # ``file.name`` would otherwise make os.path.join point outside
            # (or at a missing sub-directory of) the temporary directory.
            file_name = os.path.join(tmpdirname, os.path.basename(file.name))
            saved_files.append(file_name)
            with open(file_name, mode='wb') as w:
                w.write(file.read())
        print(saved_files)
        loaders = [UnstructuredPDFLoader(pdf) for pdf in saved_files]
        print(loaders)
        # Parse inside the ``with`` block, while the temporary files exist.
        for loader in loaders:
            docs.extend(loader.load())
    return docs
def load_from_web(urls, cache=True):
    """Load PDFs from URLs into documents, backed by an on-disk pickle cache.

    The cache file lives under ``./.cache`` (relative to the current working
    directory) and is named after the SHA-256 of ``str(urls)``. When
    ``cache`` is true and that file exists, the documents are unpickled from
    it; otherwise they are fetched with ``OnlinePDFLoader`` and the cache
    file is (re)written.

    Args:
        urls: Indexable sequence of PDF URLs.
        cache: When true, reuse an existing cache file instead of downloading.

    Returns:
        list: The loaded documents. The metadata of the i-th document is
        replaced with source/url taken from ``urls[i]``, so one document per
        URL is assumed.

    Raises:
        IndexError: If more documents than URLs are produced.
    """
    docs_list = urls
    filename = f"./.cache/{sha256(str(urls).encode('utf-8')).hexdigest()}.pkl"
    if cache and os.path.isfile(filename):
        logger.info("Using Cache")
        # NOTE: unpickling runs arbitrary code; only safe while the cache
        # directory is written exclusively by this application.
        with open(filename, "rb") as pikd:
            docs = cloudpickle.load(pikd)
    else:
        loaders = [OnlinePDFLoader(pdf) for pdf in docs_list]
        docs = []
        for loader in loaders:
            docs.extend(loader.load())
        # The cache directory may not exist yet on a fresh checkout.
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, 'wb') as output:
            cloudpickle.dump(docs, output)
    # Update metadata for cached and freshly loaded documents alike; the
    # pickle is written before this step, so cached documents need it too.
    for i, doc in enumerate(docs):
        doc.metadata = {'source': docs_list[i], 'url': docs_list[i], 'company': 'AXA'}
    return docs
def load_and_split(docs, chunk_size=700, chunk_overlap=0):
    """Split documents into smaller chunks with ``NLTKTextSplitter``.

    Args:
        docs: Documents to split.
        chunk_size: ``chunk_size`` passed to the splitter.
        chunk_overlap: ``chunk_overlap`` passed to the splitter; the default
            of 0 keeps the previous non-overlapping behaviour.

    Returns:
        The chunked documents returned by ``split_documents``.
    """
    text_splitter = NLTKTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)