"""Build a persistent Chroma vector store from the PDF files in ``data``.

Running this script loads the PDFs, splits them into overlapping chunks,
embeds each chunk, and writes the resulting index to ``vector_db_dir``.
"""

from langchain_community.document_loaders import (
    DirectoryLoader,
    UnstructuredFileLoader,
)
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Load the embedding model. No arguments are passed, so whatever model
# HuggingFaceEmbeddings selects by default is the one used.
embeddings = HuggingFaceEmbeddings()

# Collect the PDFs matched by the glob under "data"; each matched file is
# parsed with UnstructuredFileLoader.
loader = DirectoryLoader(
    path="data",
    glob="./*.pdf",
    loader_cls=UnstructuredFileLoader,
)
documents = loader.load()

# Split the loaded documents into chunks, with overlap between consecutive
# chunks so text cut at a boundary still appears whole in one of them.
# NOTE(review): sizes are presumably in characters (the splitter's default
# length function) - confirm against the installed version.
text_splitter = CharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=500,
)
text_chunks = text_splitter.split_documents(documents)

# Embed every chunk and store the vectors in Chroma, persisted on disk
# under "vector_db_dir".
vectordb = Chroma.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    persist_directory="vector_db_dir",
)

print("Documents Vectorized")