# Chatbot / vectorize_documents.py
# 0504ankitsharma's picture
# Upload 5 files
# 52794ee verified
# raw
# history blame contribute delete
# 850 Bytes
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
# Build a persistent Chroma vector store from the PDF files in "data".
#
# Steps: load the embedding model, load the PDFs, split them into
# overlapping chunks, embed the chunks, and persist them to "vector_db_dir".

# Loading the embedding model (HuggingFaceEmbeddings with its default settings).
embeddings = HuggingFaceEmbeddings()

# Load every file under "data" that matches the "./*.pdf" glob, parsing each
# one with UnstructuredFileLoader.
loader = DirectoryLoader(
    path="data",
    glob="./*.pdf",
    loader_cls=UnstructuredFileLoader,
)
documents = loader.load()

# Split the documents into chunks (chunk_size=2000) with chunk_overlap=500 so
# that neighbouring chunks share some context.
text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=500)
text_chunks = text_splitter.split_documents(documents)

# Fail fast with a clear message when there is nothing to index (no matching
# PDFs, or PDFs that yielded no text). Without this guard an empty list is
# handed to Chroma, which is either an obscure error or a silently empty store
# followed by a misleading success message.
if not text_chunks:
    raise SystemExit(
        "No text chunks were produced from the PDF files in 'data'; "
        "nothing to vectorize."
    )

# Embed the chunks and persist the resulting collection to "vector_db_dir".
vectordb = Chroma.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    persist_directory="vector_db_dir",
)

print("Documents Vectorized")