Spaces:
Running
Running
modify word doc chunking
Browse files
- .gitattributes +3 -0
- .gitignore +7 -0
- app.py +49 -16
- combined_recursive_keyword_retriever.pkl +3 -0
- data_loader.ipynb +0 -0
- faiss_recursive_split_word_doc_index/index.faiss +3 -0
- faiss_recursive_split_word_doc_index/index.pkl +3 -0
.gitattributes
CHANGED
@@ -38,3 +38,6 @@ faiss_word_doc_index/index.faiss filter=lfs diff=lfs merge=lfs -text
|
|
38 |
faiss_excel_doc_index/index.pkl filter=lfs diff=lfs merge=lfs -text
|
39 |
faiss_excel_doc_index/index.faiss filter=lfs diff=lfs merge=lfs -text
|
40 |
combined_keyword_retriever.pkl filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
38 |
faiss_excel_doc_index/index.pkl filter=lfs diff=lfs merge=lfs -text
|
39 |
faiss_excel_doc_index/index.faiss filter=lfs diff=lfs merge=lfs -text
|
40 |
combined_keyword_retriever.pkl filter=lfs diff=lfs merge=lfs -text
|
41 |
+
faiss_recursive_split_word_doc_index/index.faiss filter=lfs diff=lfs merge=lfs -text
|
42 |
+
faiss_recursive_split_word_doc_index/index.pkl filter=lfs diff=lfs merge=lfs -text
|
43 |
+
combined_recursive_keyword_retriever.pkl filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
faiss_word_doc_index/*
|
2 |
+
recursice_word_keyword_retriever.pkl
|
3 |
+
word_keyword_retriever.pkl
|
4 |
+
excel_keyword_retriever.pkl
|
5 |
+
rag_pipeline.ipynb
|
6 |
+
data_loader.ipynb
|
7 |
+
combined_keyword_retriever.pkl
|
app.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
from langchain_core.output_parsers import StrOutputParser
|
|
|
2 |
from langchain_core.runnables import RunnablePassthrough
|
3 |
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
|
4 |
from langchain.retrievers.document_compressors import EmbeddingsFilter
|
@@ -16,7 +17,7 @@ GROQ_API_KEY="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"
|
|
16 |
os.environ['GROQ_API_KEY'] = GROQ_API_KEY
|
17 |
|
18 |
|
19 |
-
embed_model = HuggingFaceEmbeddings(model_name="Alibaba-NLP/gte-multilingual-base", model_kwargs={"trust_remote_code":True})
|
20 |
llm = ChatGroq(
|
21 |
model="llama-3.1-8b-instant",
|
22 |
temperature=0.0,
|
@@ -25,11 +26,11 @@ llm = ChatGroq(
|
|
25 |
)
|
26 |
|
27 |
excel_vectorstore = FAISS.load_local(folder_path="./faiss_excel_doc_index", embeddings=embed_model, allow_dangerous_deserialization=True)
|
28 |
-
word_vectorstore = FAISS.load_local(folder_path="./faiss_word_doc_index", embeddings=embed_model, allow_dangerous_deserialization=True)
|
29 |
excel_vectorstore.merge_from(word_vectorstore)
|
30 |
combined_vectorstore = excel_vectorstore
|
31 |
|
32 |
-
with open('combined_keyword_retriever.pkl', 'rb') as f:
|
33 |
combined_keyword_retriever = pickle.load(f)
|
34 |
combined_keyword_retriever.k = 10
|
35 |
|
@@ -47,18 +48,31 @@ compression_retriever = ContextualCompressionRetriever(
|
|
47 |
base_compressor=embeddings_filter, base_retriever=ensemble_retriever
|
48 |
)
|
49 |
|
50 |
-
|
|
|
|
|
51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
def format_docs(docs):
|
54 |
return "\n\n".join(doc.page_content for doc in docs)
|
55 |
|
56 |
|
57 |
rag_chain = (
|
58 |
-
{"context": compression_retriever | format_docs, "question": RunnablePassthrough()}
|
59 |
| prompt
|
60 |
| llm
|
61 |
-
|
|
62 |
)
|
63 |
|
64 |
|
@@ -67,19 +81,38 @@ rag_chain = (
|
|
67 |
|
68 |
# zero = torch.Tensor([0]).cuda()
|
69 |
|
70 |
-
@spaces.GPU
|
71 |
def get_response(question, history):
|
72 |
-
print(question)
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
with gr.Blocks() as demo:
|
|
|
|
|
|
|
|
|
|
|
81 |
chatbot = gr.Chatbot(placeholder="<strong>ADAFSA-RAG Chatbot</strong>")
|
82 |
-
gr.ChatInterface(
|
|
|
|
|
|
|
|
|
|
|
83 |
|
84 |
demo.launch()
|
85 |
|
|
|
1 |
from langchain_core.output_parsers import StrOutputParser
|
2 |
+
from langchain_core.prompts import ChatPromptTemplate,MessagesPlaceholder
|
3 |
from langchain_core.runnables import RunnablePassthrough
|
4 |
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
|
5 |
from langchain.retrievers.document_compressors import EmbeddingsFilter
|
|
|
17 |
os.environ['GROQ_API_KEY'] = GROQ_API_KEY
|
18 |
|
19 |
|
20 |
+
embed_model = HuggingFaceEmbeddings(model_name="Alibaba-NLP/gte-multilingual-base", model_kwargs={"trust_remote_code":True, "device": "cuda"})
|
21 |
llm = ChatGroq(
|
22 |
model="llama-3.1-8b-instant",
|
23 |
temperature=0.0,
|
|
|
26 |
)
|
27 |
|
28 |
excel_vectorstore = FAISS.load_local(folder_path="./faiss_excel_doc_index", embeddings=embed_model, allow_dangerous_deserialization=True)
|
29 |
+
word_vectorstore = FAISS.load_local(folder_path="./faiss_recursive_split_word_doc_index", embeddings=embed_model, allow_dangerous_deserialization=True)
|
30 |
excel_vectorstore.merge_from(word_vectorstore)
|
31 |
combined_vectorstore = excel_vectorstore
|
32 |
|
33 |
+
with open('combined_recursive_keyword_retriever.pkl', 'rb') as f:
|
34 |
combined_keyword_retriever = pickle.load(f)
|
35 |
combined_keyword_retriever.k = 10
|
36 |
|
|
|
48 |
base_compressor=embeddings_filter, base_retriever=ensemble_retriever
|
49 |
)
|
50 |
|
51 |
+
template = """
|
52 |
+
User: You are an AI Assistant that follows instructions extremely well.
|
53 |
+
Please be truthful and give direct answers. Please tell 'I don't know' if user query is not in CONTEXT
|
54 |
|
55 |
+
Keep in mind, you will lose the job, if you answer out of CONTEXT questions
|
56 |
+
|
57 |
+
CONTEXT: {context}
|
58 |
+
Query: {question}
|
59 |
+
|
60 |
+
Remember only return AI answer
|
61 |
+
Assistant:
|
62 |
+
"""
|
63 |
+
|
64 |
+
prompt = ChatPromptTemplate.from_template(template)
|
65 |
+
output_parser = StrOutputParser()
|
66 |
|
67 |
def format_docs(docs):
|
68 |
return "\n\n".join(doc.page_content for doc in docs)
|
69 |
|
70 |
|
71 |
rag_chain = (
|
72 |
+
{"context": compression_retriever.with_config(run_name="Docs") | format_docs, "question": RunnablePassthrough()}
|
73 |
| prompt
|
74 |
| llm
|
75 |
+
| output_parser
|
76 |
)
|
77 |
|
78 |
|
|
|
81 |
|
82 |
# zero = torch.Tensor([0]).cuda()
|
83 |
|
84 |
+
# @spaces.GPU
|
85 |
def get_response(question, history):
|
86 |
+
# print(question)
|
87 |
+
curr_ans = ""
|
88 |
+
for chunk in rag_chain.stream(question):
|
89 |
+
curr_ans += chunk
|
90 |
+
yield curr_ans
|
91 |
+
|
92 |
+
example_questions = [
|
93 |
+
"الموسم المناسب لزراعة الذرة العلفية ؟",
|
94 |
+
"ما هي الاحتياجات المائية لتربية الحيوانات؟",
|
95 |
+
"ما هي خطوات إنتاج الشتلات؟",
|
96 |
+
"الموسم المناسب لزراعة الطماطم في الحقل المكشوف بدولة الإمارات؟",
|
97 |
+
"شروط اختيار مكان منحل العسل؟",
|
98 |
+
"ما هو تقييم مطعم قصر نجد؟",
|
99 |
+
"ما كمية أعلاف الجت المستلمة في منطقة الظفرة عام 2022",
|
100 |
+
"ما مساحات المزارع المروية بالتنقيط في منطقة الرحبة عام 2020",
|
101 |
+
"في إمارة أبوظبي في عام 2022، هل نسبة العينات الغذائية الغير مطابقة من إجمالي العينات أعلى في العينات المحلية أم العينات المستوردة"
|
102 |
+
]
|
103 |
with gr.Blocks() as demo:
|
104 |
+
gr.Markdown(
|
105 |
+
"""
|
106 |
+
# ADAFSA RAG Chatbot Demo
|
107 |
+
"""
|
108 |
+
)
|
109 |
chatbot = gr.Chatbot(placeholder="<strong>ADAFSA-RAG Chatbot</strong>")
|
110 |
+
gr.ChatInterface(
|
111 |
+
title="",
|
112 |
+
fn=get_response,
|
113 |
+
chatbot=chatbot,
|
114 |
+
examples=example_questions,
|
115 |
+
)
|
116 |
|
117 |
demo.launch()
|
118 |
|
combined_recursive_keyword_retriever.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:71c816aa5e0cb849c3c9f36ca72ecf7b0968d0fd5ab5a63a3316223e68d5398d
|
3 |
+
size 8449174
|
data_loader.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
faiss_recursive_split_word_doc_index/index.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4e21d5d78d4acf373e94ae40d43fcad7b724207b7b4c18455cc1fc613b6c01f5
|
3 |
+
size 14736429
|
faiss_recursive_split_word_doc_index/index.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:27889ba1e7400d896ad677b1e545fd7a01ee16b8d2dbd3c2b9c6431d5b0ff50d
|
3 |
+
size 4029431
|