Files changed (6)
  1. main.py +19 -14
  2. poetry.lock +0 -0
  3. pyproject.toml +25 -0
  4. requirements.txt +2 -2
  5. static/chatbot.js +1 -1
  6. utils.py +219 -20
main.py CHANGED
@@ -1,22 +1,27 @@
- from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
- from langchain.embeddings.openai import OpenAIEmbeddings
- from langchain.llms import OpenAI
-
- from utils import generate_answer
- from utils import get_search_index
-
- open_ai_pkl = "open_ai.pkl"
- open_ai_index = "open_ai.index"
-
- gpt_3_5 = OpenAI(model_name='gpt-3.5-turbo', temperature=0)
-
- open_ai_embeddings = OpenAIEmbeddings()
-
- def run(question):
-
-     gpt_3_5_index = get_search_index(open_ai_pkl, open_ai_index, open_ai_embeddings)
-
-     gpt_3_5_chain = load_qa_with_sources_chain(gpt_3_5, chain_type="stuff", verbose=True)
-
-     answer = generate_answer(gpt_3_5_chain, gpt_3_5_index, question)
-     return answer
+ from utils import create_index, get_agent_chain, get_prompt_and_tools, get_search_index
+ from utils import get_custom_agent, get_prompt_and_tools_for_custom_agent
+ question_starters = ['who', 'why', 'what', 'how', 'where', 'when', 'which', 'whom', 'whose']
+
+ def run(question):
+
+     index = get_search_index()
+
+     # prompt, tools = get_prompt_and_tools()
+
+     # agent_chain = get_agent_chain(prompt, tools)
+
+     prompt, tools = get_prompt_and_tools_for_custom_agent()
+
+     agent_chain = get_custom_agent(prompt, tools)
+
+     result = None
+
+     try:
+         result = agent_chain.run(question)
+         print(result)
+     except ValueError as ve:
+         # Retry once with a trailing '?' when parsing fails and the input looks like a question
+         if "Could not parse LLM output:" in ve.args[0] and question.lower().startswith(tuple(question_starters)) and not question.lower().endswith('?'):
+             question = question + '?'
+             result = agent_chain.run(question)
+
+     return result
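
A minimal sketch of how the reworked entry point is exercised (hypothetical question; assumes OPENAI_API_KEY is set, and the actual caller, e.g. a Flask-SocketIO handler, is outside this diff):

from main import run

# Question-shaped inputs that fail to parse on the first pass are retried
# once with a trailing '?', per the ValueError handler above.
answer = run("what filament do the MakerLab printers use")
print(answer)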
 
poetry.lock ADDED
(diff too large to render)
 
pyproject.toml ADDED
@@ -0,0 +1,25 @@
+ [tool.poetry]
+ name = "makerlab-bot"
+ version = "0.1.0"
+ description = "Assistant Bot to Makerlab"
+ authors = ["rohan-uiuc <rohan13@illinois.edu>"]
+ readme = "README.md"
+ packages = [{include = "makerlab_bot"}]
+
+ [tool.poetry.dependencies]
+ python = "^3.9"
+ faiss-cpu = "^1.7.3"
+ langchain = "^0.0.131"
+ beautifulsoup4 = "^4.12.0"
+ pypdf2 = "^3.0.1"
+ openai = "^0.27.4"
+ flask = "^2.2.3"
+ flask-socketio = "^5.3.3"
+ flask-cors = "^3.0.10"
+ gevent = "^22.10.2"
+ gevent-websocket = "^0.10.1"
+
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
requirements.txt CHANGED
@@ -1,5 +1,5 @@
- faiss-cpu==1.7.3
- langchain==0.0.117
+ faiss-cpu
+ langchain
  beautifulsoup4
  PyPDF2
  openai
static/chatbot.js CHANGED
@@ -23,7 +23,7 @@ $(document).ready(function() {
      // Function to display message
      function displayMessage(message, isUser) {
          var $message = $('<div>').addClass('chat-message round');
-         var $messageText = $('<p>').html(message.replace(/(https?:\/\/[^\s]+)/g, '<a href="$1">$1</a>'));
+         var $messageText = $('<p>').html(message.replace(/(https?:\/\/[^\s,]+)/g, '<a href="$1">$1</a>'));

          $message.append($messageText);
          if (isUser) {
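
This one-character regex change matters because utils.py now joins multiple sources with ',\n' before appending them to the answer: under the old pattern [^\s]+, the trailing comma was swallowed into the href and broke the link. A sketch of the same substitution in Python (illustrative message string; the production code is the jQuery above):

import re

msg = "SOURCES: https://makerlab.illinois.edu/,\nhttps://g.co/kgs/2VFC7u"
# Excluding ',' stops each match at the separator, so both URLs linkify cleanly.
print(re.sub(r'(https?://[^\s,]+)', r'<a href="\1">\1</a>', msg))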
utils.py CHANGED
@@ -1,36 +1,111 @@
  import os
  import pickle
+ import re
  import time
+ from typing import List, Union
  from urllib.parse import urlparse, urljoin

  import faiss
  import requests
  from PyPDF2 import PdfReader
  from bs4 import BeautifulSoup
+ from langchain import OpenAI, LLMChain
+ from langchain.agents import ConversationalAgent
+ from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent, AgentOutputParser
+ from langchain.prompts import BaseChatPromptTemplate
+ from langchain.chains import ConversationalRetrievalChain
  from langchain.docstore.document import Document
+ from langchain.embeddings import OpenAIEmbeddings
+ from langchain.memory import ConversationBufferWindowMemory
+ from langchain.schema import AgentAction, AgentFinish, HumanMessage
  from langchain.text_splitter import CharacterTextSplitter
  from langchain.vectorstores.faiss import FAISS

  book_url = 'https://g.co/kgs/2VFC7u'
  book_file = "Book.pdf"
  url = 'https://makerlab.illinois.edu/'
- def get_search_index(pickle_file, index_file, embeddings):
+
+ pickle_file = "open_ai.pkl"
+ index_file = "open_ai.index"
+
+ gpt_3_5 = OpenAI(model_name='gpt-3.5-turbo', temperature=0)
+
+ embeddings = OpenAIEmbeddings()
+
+ chat_history = []
+
+ memory = ConversationBufferWindowMemory(memory_key="chat_history")
+
+ gpt_3_5_index = None
+
+ class CustomOutputParser(AgentOutputParser):
+
+     def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
+         # Check if agent replied without using tools
+         if "AI:" in llm_output:
+             return AgentFinish(return_values={"output": llm_output.split("AI:")[-1].strip()},
+                                log=llm_output)
+         # Check if agent should finish
+         if "Final Answer:" in llm_output:
+             return AgentFinish(
+                 # Return values is generally always a dictionary with a single `output` key
+                 # It is not recommended to try anything else at the moment :)
+                 return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
+                 log=llm_output,
+             )
+         # Parse out the action and action input
+         regex = r"Action: (.*?)[\n]*Action Input:[\s]*(.*)"
+         match = re.search(regex, llm_output, re.DOTALL)
+         if not match:
+             raise ValueError(f"Could not parse LLM output: `{llm_output}`")
+         action = match.group(1).strip()
+         action_input = match.group(2)
+         # Return the action and action input
+         return AgentAction(tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output)
+
+ # Set up a prompt template
+ class CustomPromptTemplate(BaseChatPromptTemplate):
+     # The template to use
+     template: str
+     # The list of tools available
+     tools: List[Tool]
+
+     def format_messages(self, **kwargs) -> str:
+         # Get the intermediate steps (AgentAction, Observation tuples)
+         # Format them in a particular way
+         intermediate_steps = kwargs.pop("intermediate_steps")
+         thoughts = ""
+         for action, observation in intermediate_steps:
+             thoughts += action.log
+             thoughts += f"\nObservation: {observation}\nThought: "
+         # Set the agent_scratchpad variable to that value
+         kwargs["agent_scratchpad"] = thoughts
+         # Create a tools variable from the list of tools provided
+         kwargs["tools"] = "\n".join([f"{tool.name}: {tool.description}" for tool in self.tools])
+         # Create a list of tool names for the tools provided
+         kwargs["tool_names"] = ", ".join([tool.name for tool in self.tools])
+         formatted = self.template.format(**kwargs)
+         return [HumanMessage(content=formatted)]
+
+ def get_search_index():
+     global gpt_3_5_index
      if os.path.isfile(pickle_file) and os.path.isfile(index_file) and os.path.getsize(pickle_file) > 0:
          # Load index from pickle file
          with open(pickle_file, "rb") as f:
              search_index = pickle.load(f)
      else:
-         source_chunks = create_chunk_documents()
-         search_index = search_index_from_docs(source_chunks, embeddings=embeddings)
-         faiss.write_index(search_index.index, index_file)
-
-         # Save index to pickle file
-         with open(pickle_file, "wb") as f:
-             pickle.dump(search_index, f)
+         search_index = create_index()
+
+     gpt_3_5_index = search_index
+
+
+ def create_index():
+     source_chunks = create_chunk_documents()
+     search_index = search_index_from_docs(source_chunks)
+     faiss.write_index(search_index.index, index_file)
+     # Save index to pickle file
+     with open(pickle_file, "wb") as f:
+         pickle.dump(search_index, f)
      return search_index
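
For reference, the three shapes of LLM output that CustomOutputParser distinguishes (made-up examples, not captured model output):

from utils import CustomOutputParser

parser = CustomOutputParser()

# Direct reply, no tool use -> AgentFinish
parser.parse("Thought: Do I need to use a tool? No\nAI: We are open 9am-5pm.")

# Tool invocation -> AgentAction routed to the Vectorstore tool
parser.parse('Action: Vectorstore\nAction Input: "3D printing cost"')

# Anything else raises ValueError("Could not parse LLM output: ..."),
# which main.run() catches and retries with a trailing '?'.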
@@ -118,19 +193,143 @@ def get_document_data(book_file, book_url):
      # print("document list" + str(len(document_list)))
      return document_list

- def search_index_from_docs(source_chunks, embeddings):
+ def search_index_from_docs(source_chunks):
      # Create index from chunk documents
      # print("Size of chunk" + str(len(source_chunks)))
      search_index = FAISS.from_texts([doc.page_content for doc in source_chunks], embeddings, metadatas=[doc.metadata for doc in source_chunks])
      return search_index
- def generate_answer(chain, index, question):
-     #Get answer
-     answer = chain(
-         {
-             "input_documents": index.similarity_search(question, k=4),
-             "question": question,
-         },
-         return_only_outputs=True,
-     )["output_text"]
-
-     return answer
+
+
+ def get_qa_chain(gpt_3_5_index):
+     global gpt_3_5
+     print("index: " + str(gpt_3_5_index))
+     return ConversationalRetrievalChain.from_llm(gpt_3_5, chain_type="stuff", get_chat_history=get_chat_history,
+                                                  retriever=gpt_3_5_index.as_retriever(), return_source_documents=True, verbose=True)
+
+ def get_chat_history(inputs) -> str:
+     res = []
+     for human, ai in inputs:
+         res.append(f"Human:{human}\nAI:{ai}")
+     return "\n".join(res)
+
+
+ def generate_answer(question) -> str:
+     global chat_history, gpt_3_5_index
+     gpt_3_5_chain = get_qa_chain(gpt_3_5_index)
+     result = gpt_3_5_chain(
+         {"question": question, "chat_history": chat_history, "vectordbkwargs": {"search_distance": 0.8}})
+     print("Result: " + str(result))
+     chat_history = [(question, result["answer"])]
+     sources = []
+
+     for document in result['source_documents']:
+         source = document.metadata['source']
+         sources.append(source)
+
+     source = ',\n'.join(set(sources))
+     return result['answer'] + '\nSOURCES: ' + source
+
+
+ def get_agent_chain(prompt, tools):
+     global gpt_3_5
+     # output_parser = CustomOutputParser()
+     llm_chain = LLMChain(llm=gpt_3_5, prompt=prompt)
+     agent = ConversationalAgent(llm_chain=llm_chain, tools=tools, verbose=True)
+     agent_chain = AgentExecutor.from_agent_and_tools(agent=agent, tools=tools, verbose=True, memory=memory,
+                                                      intermediate_steps=True)
+     return agent_chain
+
+
+ def get_prompt_and_tools():
+     tools = get_tools()
+
+     prefix = """Have a conversation with a human, answering the following questions as best you can.
+     Always try to use Vectorstore first.
+     Your name is Makerlab Bot because you are a personal assistant of Makerlab. You have access to the following tools:"""
+     suffix = """Begin! If you use any tool, ALWAYS return a "SOURCES" part in your answer"
+
+     {chat_history}
+     Question: {input}
+     {agent_scratchpad}
+     SOURCES:"""
+     prompt = ConversationalAgent.create_prompt(
+         tools,
+         prefix=prefix,
+         suffix=suffix,
+         input_variables=["input", "chat_history", "agent_scratchpad"]
+     )
+     # print("Template: " + prompt.template)
+     return prompt, tools
+
+
+ def get_tools():
+     tools = [
+         Tool(
+             name="Vectorstore",
+             func=generate_answer,
+             description="useful for when you need to answer questions about the Makerlab or 3D Printing.",
+             return_direct=True
+         )]
+     return tools
+
+ def get_custom_agent(prompt, tools):
+
+     llm_chain = LLMChain(llm=gpt_3_5, prompt=prompt)
+
+     output_parser = CustomOutputParser()
+     tool_names = [tool.name for tool in tools]
+     agent = LLMSingleActionAgent(
+         llm_chain=llm_chain,
+         output_parser=output_parser,
+         stop=["\nObservation:"],
+         allowed_tools=tool_names
+     )
+     agent_executor = AgentExecutor.from_agent_and_tools(agent=agent, tools=tools, verbose=True, memory=memory,
+                                                         intermediate_steps=True)
+     return agent_executor
+
+ def get_prompt_and_tools_for_custom_agent():
+     template = """
+ Have a conversation with a human, answering the following questions as best you can.
+ Always try to use Vectorstore first.
+ Your name is Makerlab Bot because you are a personal assistant of Makerlab. You have access to the following tools:
+
+ {tools}
+
+ To answer for the new input, use the following format:
+
+ New Input: the input question you must answer
+ Thought: Do I need to use a tool? Yes
+ Action: the action to take, should be one of [{tool_names}]
+ Action Input: the input to the action
+ Observation: the result of the action
+ ... (this Thought/Action/Action Input/Observation can repeat N times)
+ Thought: I now know the final answer
+ Final Answer: the final answer to the original input question. SOURCES: the sources referred to find the final answer
+
+
+ When you have a response to say to the Human and DO NOT need to use a tool:
+ 1. DO NOT return "SOURCES" if you did not use any tool.
+ 2. You MUST use this format:
+ ```
+ Thought: Do I need to use a tool? No
+ AI: [your response here]
+ ```
+
+ Begin! Remember to speak as a personal assistant when giving your final answer.
+ ALWAYS return a "SOURCES" part in your answer, if you used any tool.
+
+ Previous conversation history:
+ {chat_history}
+ New input: {input}
+ {agent_scratchpad}
+ SOURCES:"""
+     tools = get_tools()
+     prompt = CustomPromptTemplate(
+         template=template,
+         tools=tools,
+         # This omits the `agent_scratchpad`, `tools`, and `tool_names` variables because those are generated dynamically
+         # This includes the `intermediate_steps` variable because that is needed
+         input_variables=["input", "intermediate_steps", "chat_history"]
+     )
+     return prompt, tools
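
Taken together, a minimal sketch of how the new utils pieces compose outside Flask (mirrors main.run(); assumes OPENAI_API_KEY is set and the index can be built or loaded from open_ai.pkl / open_ai.index):

from utils import get_custom_agent, get_prompt_and_tools_for_custom_agent, get_search_index

get_search_index()  # loads or builds the FAISS index and sets the module-level gpt_3_5_index

prompt, tools = get_prompt_and_tools_for_custom_agent()
agent = get_custom_agent(prompt, tools)

# generate_answer() runs via the Vectorstore tool (return_direct=True) and
# appends 'SOURCES: ...', which static/chatbot.js then linkifies.
print(agent.run("Who can use the MakerLab?"))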