Update app.py
app.py CHANGED
@@ -18,57 +18,56 @@ from langchain_community.document_loaders.directory import DirectoryLoader
 from HTML_templates import css, bot_template, user_template
 
 
-
-def retriever_from_chroma(docs, search_type, k):
+data_path = "data"
+
+def create_retriever_from_chroma(data_path, vectorstore_path="docs/chroma/", search_type='mmr', k=7, chunk_size=250, chunk_overlap=20):
     model_name = "sentence-transformers/all-mpnet-base-v2"
     model_kwargs = {'device': 'cpu'}
     encode_kwargs = {'normalize_embeddings': True}
+
+    # Initialize embeddings
     embeddings = HuggingFaceEmbeddings(
         model_name=model_name,
         model_kwargs=model_kwargs,
         encode_kwargs=encode_kwargs
     )
-
-    if
-
-
-
+
+    # Check if vectorstore exists
+    if os.path.exists(vectorstore_path) and os.listdir(vectorstore_path):
+        # Load the existing vectorstore
+        vectorstore = Chroma(persist_directory=vectorstore_path, embedding=embeddings)
+    else:
+        # Load documents from the specified data path
+        documents = []
+        for filename in os.listdir(data_path):
+            if filename.endswith('.txt'):
+                file_path = os.path.join(data_path, filename)
+                loaded_docs = TextLoader(file_path).load()
+                documents.extend(loaded_docs)
+
+        # Split documents into chunks
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+        split_docs = text_splitter.split_documents(documents)
+
+        # Ensure the directory for storing vectorstore exists
+        if not os.path.exists(vectorstore_path):
+            os.makedirs(vectorstore_path)
+
+        # Create the vectorstore
+        vectorstore = Chroma.from_documents(
+            documents=split_docs, embedding=embeddings, persist_directory=vectorstore_path
+        )
+
+    # Create and return the retriever
     retriever = vectorstore.as_retriever(search_type=search_type, search_kwargs={"k": k})
     return retriever
-
-
-data_path = "data"
-
-documents = []
-
-for filename in os.listdir(data_path):
-
-    if filename.endswith('.txt'):
-
-        file_path = os.path.join(data_path, filename)
-
-        documents = TextLoader(file_path).load()
-
-        documents.extend(documents)
-
-
-docs = split_docs(documents, 250, 20)
-
-retriever = retriever_from_chroma(docs,'mmr',7)
 
 
 
-def main(retriever):
+def main():
 
     st.set_page_config(page_title="Chat with multiple PDFs",
                        page_icon=":books:")
@@ -90,7 +89,8 @@ def main(retriever):
     handle_userinput(user_question,vectorstore)
 
 
-def handle_userinput(user_question
+def handle_userinput(user_question):
+    retriever = create_retriever_from_chroma(data_path, vectorstore_path="docs/chroma/", search_type='mmr', k=7, chunk_size=250, chunk_overlap=20)
     docs = retriever.invoke(question)
 
     doc_txt = [doc.page_content for doc in docs]
@@ -135,4 +135,4 @@ def create_conversational_rag_chain(retriever):
 
 
 if __name__ == "__main__":
-    main(
+    main()
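For context, a minimal sketch (not part of the commit) of how the new create_retriever_from_chroma helper could be exercised on its own. It assumes app.py is importable as a module, that a "data" folder of .txt files sits next to it, and the query string is purely illustrative:

# Minimal usage sketch; assumptions: app.py is importable and a "data" folder
# with .txt files exists. The query below is only an example.
from app import create_retriever_from_chroma

# Build (or reload) the Chroma-backed retriever with the same defaults the
# commit introduces: MMR search returning 7 chunks of ~250 characters.
retriever = create_retriever_from_chroma(
    data_path="data",
    vectorstore_path="docs/chroma/",
    search_type="mmr",
    k=7,
    chunk_size=250,
    chunk_overlap=20,
)

# LangChain retrievers follow the Runnable interface, so invoke() returns a
# list of Document objects for the query.
docs = retriever.invoke("example question about the indexed documents")
for doc in docs:
    print(doc.page_content[:200])

Because handle_userinput now calls create_retriever_from_chroma on every question, the first call builds and persists the index under docs/chroma/, and subsequent calls reload it from disk instead of re-embedding the documents.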