Ritesh-hf committed on
Commit
8ec78b5
1 Parent(s): 776a1a9

modify word doc chunking

Browse files
.gitattributes CHANGED
@@ -38,3 +38,6 @@ faiss_word_doc_index/index.faiss filter=lfs diff=lfs merge=lfs -text
38
  faiss_excel_doc_index/index.pkl filter=lfs diff=lfs merge=lfs -text
39
  faiss_excel_doc_index/index.faiss filter=lfs diff=lfs merge=lfs -text
40
  combined_keyword_retriever.pkl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
38
  faiss_excel_doc_index/index.pkl filter=lfs diff=lfs merge=lfs -text
39
  faiss_excel_doc_index/index.faiss filter=lfs diff=lfs merge=lfs -text
40
  combined_keyword_retriever.pkl filter=lfs diff=lfs merge=lfs -text
41
+ faiss_recursive_split_word_doc_index/index.faiss filter=lfs diff=lfs merge=lfs -text
42
+ faiss_recursive_split_word_doc_index/index.pkl filter=lfs diff=lfs merge=lfs -text
43
+ combined_recursive_keyword_retriever.pkl filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ faiss_word_doc_index/*
2
+ recursice_word_keyword_retriever.pkl
3
+ word_keyword_retriever.pkl
4
+ excel_keyword_retriever.pkl
5
+ rag_pipeline.ipynb
6
+ data_loader.ipynb
7
+ combined_keyword_retriever.pkl
app.py CHANGED
@@ -1,4 +1,5 @@
1
  from langchain_core.output_parsers import StrOutputParser
 
2
  from langchain_core.runnables import RunnablePassthrough
3
  from langchain_huggingface.embeddings import HuggingFaceEmbeddings
4
  from langchain.retrievers.document_compressors import EmbeddingsFilter
@@ -16,7 +17,7 @@ GROQ_API_KEY="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"
16
  os.environ['GROQ_API_KEY'] = GROQ_API_KEY
17
 
18
 
19
- embed_model = HuggingFaceEmbeddings(model_name="Alibaba-NLP/gte-multilingual-base", model_kwargs={"trust_remote_code":True})
20
  llm = ChatGroq(
21
  model="llama-3.1-8b-instant",
22
  temperature=0.0,
@@ -25,11 +26,11 @@ llm = ChatGroq(
25
  )
26
 
27
  excel_vectorstore = FAISS.load_local(folder_path="./faiss_excel_doc_index", embeddings=embed_model, allow_dangerous_deserialization=True)
28
- word_vectorstore = FAISS.load_local(folder_path="./faiss_word_doc_index", embeddings=embed_model, allow_dangerous_deserialization=True)
29
  excel_vectorstore.merge_from(word_vectorstore)
30
  combined_vectorstore = excel_vectorstore
31
 
32
- with open('combined_keyword_retriever.pkl', 'rb') as f:
33
  combined_keyword_retriever = pickle.load(f)
34
  combined_keyword_retriever.k = 10
35
 
@@ -47,18 +48,31 @@ compression_retriever = ContextualCompressionRetriever(
47
  base_compressor=embeddings_filter, base_retriever=ensemble_retriever
48
  )
49
 
50
- prompt = hub.pull("rlm/rag-prompt")
 
 
51
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  def format_docs(docs):
54
  return "\n\n".join(doc.page_content for doc in docs)
55
 
56
 
57
  rag_chain = (
58
- {"context": compression_retriever | format_docs, "question": RunnablePassthrough()}
59
  | prompt
60
  | llm
61
- | StrOutputParser()
62
  )
63
 
64
 
@@ -67,19 +81,38 @@ rag_chain = (
67
 
68
  # zero = torch.Tensor([0]).cuda()
69
 
70
- @spaces.GPU
71
  def get_response(question, history):
72
- print(question)
73
- # for chunk in rag_chain.stream(question):
74
- # yield chunk
75
- respose = rag_chain.invoke(question)
76
- print(respose)
77
-
78
- return respose
79
-
 
 
 
 
 
 
 
 
 
80
  with gr.Blocks() as demo:
 
 
 
 
 
81
  chatbot = gr.Chatbot(placeholder="<strong>ADAFSA-RAG Chatbot</strong>")
82
- gr.ChatInterface(fn=get_response, chatbot=chatbot)
 
 
 
 
 
83
 
84
  demo.launch()
85
 
 
1
  from langchain_core.output_parsers import StrOutputParser
2
+ from langchain_core.prompts import ChatPromptTemplate,MessagesPlaceholder
3
  from langchain_core.runnables import RunnablePassthrough
4
  from langchain_huggingface.embeddings import HuggingFaceEmbeddings
5
  from langchain.retrievers.document_compressors import EmbeddingsFilter
 
17
  os.environ['GROQ_API_KEY'] = GROQ_API_KEY
18
 
19
 
20
+ embed_model = HuggingFaceEmbeddings(model_name="Alibaba-NLP/gte-multilingual-base", model_kwargs={"trust_remote_code":True, "device": "cuda"})
21
  llm = ChatGroq(
22
  model="llama-3.1-8b-instant",
23
  temperature=0.0,
 
26
  )
27
 
28
  excel_vectorstore = FAISS.load_local(folder_path="./faiss_excel_doc_index", embeddings=embed_model, allow_dangerous_deserialization=True)
29
+ word_vectorstore = FAISS.load_local(folder_path="./faiss_recursive_split_word_doc_index", embeddings=embed_model, allow_dangerous_deserialization=True)
30
  excel_vectorstore.merge_from(word_vectorstore)
31
  combined_vectorstore = excel_vectorstore
32
 
33
+ with open('combined_recursive_keyword_retriever.pkl', 'rb') as f:
34
  combined_keyword_retriever = pickle.load(f)
35
  combined_keyword_retriever.k = 10
36
 
 
48
  base_compressor=embeddings_filter, base_retriever=ensemble_retriever
49
  )
50
 
51
+ template = """
52
+ User: You are an AI Assistant that follows instructions extremely well.
53
+ Please be truthful and give direct answers. Please tell 'I don't know' if user query is not in CONTEXT
54
 
55
+ Keep in mind, you will lose the job, if you answer out of CONTEXT questions
56
+
57
+ CONTEXT: {context}
58
+ Query: {question}
59
+
60
+ Remember only return AI answer
61
+ Assistant:
62
+ """
63
+
64
+ prompt = ChatPromptTemplate.from_template(template)
65
+ output_parser = StrOutputParser()
66
 
67
  def format_docs(docs):
68
  return "\n\n".join(doc.page_content for doc in docs)
69
 
70
 
71
  rag_chain = (
72
+ {"context": compression_retriever.with_config(run_name="Docs") | format_docs, "question": RunnablePassthrough()}
73
  | prompt
74
  | llm
75
+ | output_parser
76
  )
77
 
78
 
 
81
 
82
  # zero = torch.Tensor([0]).cuda()
83
 
84
+ # @spaces.GPU
85
  def get_response(question, history):
86
+ # print(question)
87
+ curr_ans = ""
88
+ for chunk in rag_chain.stream(question):
89
+ curr_ans += chunk
90
+ yield curr_ans
91
+
92
+ example_questions = [
93
+ "الموسم المناسب لزراعة الذرة العلفية ؟",
94
+ "ما هي الاحتياجات المائية لتربية الحيوانات؟",
95
+ "ما هي خطوات إنتاج الشتلات؟",
96
+ "الموسم المناسب لزراعة الطماطم في الحقل المكشوف بدولة الإمارات؟",
97
+ "شروط اختيار مكان منحل العسل؟",
98
+ "ما هو تقييم مطعم قصر نجد؟",
99
+ "ما كمية أعلاف الجت المستلمة في منطقة الظفرة عام 2022",
100
+ "ما مساحات المزارع المروية بالتنقيط في منطقة الرحبة عام 2020",
101
+ "في إمارة أبوظبي في عام 2022، هل نسبة العينات الغذائية الغير مطابقة من إجمالي العينات أعلى في العينات المحلية أم العينات المستوردة"
102
+ ]
103
  with gr.Blocks() as demo:
104
+ gr.Markdown(
105
+ """
106
+ # ADAFSA RAG Chatbot Demo
107
+ """
108
+ )
109
  chatbot = gr.Chatbot(placeholder="<strong>ADAFSA-RAG Chatbot</strong>")
110
+ gr.ChatInterface(
111
+ title="",
112
+ fn=get_response,
113
+ chatbot=chatbot,
114
+ examples=example_questions,
115
+ )
116
 
117
  demo.launch()
118
 
combined_recursive_keyword_retriever.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71c816aa5e0cb849c3c9f36ca72ecf7b0968d0fd5ab5a63a3316223e68d5398d
3
+ size 8449174
data_loader.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
faiss_recursive_split_word_doc_index/index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e21d5d78d4acf373e94ae40d43fcad7b724207b7b4c18455cc1fc613b6c01f5
3
+ size 14736429
faiss_recursive_split_word_doc_index/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27889ba1e7400d896ad677b1e545fd7a01ee16b8d2dbd3c2b9c6431d5b0ff50d
3
+ size 4029431