# Spaces:
# Sleeping
# Sleeping
# Ingestion script: loads the PDF files matched under the "data" directory,
# splits them into overlapping text chunks, embeds the chunks, and stores
# them in a Chroma vector store configured with "vector_db_dir" as its
# persist directory.
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Load the embedding model (constructed with the library's default settings).
embeddings = HuggingFaceEmbeddings()

# Gather every file under "data" that matches the PDF glob; each match is
# parsed with UnstructuredFileLoader.
loader = DirectoryLoader(
    path="data",
    glob="./*.pdf",
    loader_cls=UnstructuredFileLoader,
)
documents = loader.load()

# Split the loaded documents into chunks (chunk_size=2000, chunk_overlap=500)
# so that neighbouring chunks share some context.
text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=500)
text_chunks = text_splitter.split_documents(documents)

# Embed the chunks and build the vector store, passing "vector_db_dir" as the
# persist directory.
vectordb = Chroma.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    persist_directory="vector_db_dir",
)

print("Documents Vectorized")