Spaces: pr3 (Runtime error)
#2 by rohan13 - opened

Files changed:
- main.py +19 -14
- poetry.lock +0 -0
- pyproject.toml +25 -0
- requirements.txt +2 -2
- static/chatbot.js +1 -1
- utils.py +219 -20
main.py
CHANGED
@@ -1,22 +1,27 @@
-from …
-from …
-…
-from utils import get_search_index
-…
-open_ai_index = "open_ai.index"
-…
-    return answer
+from utils import create_index, get_agent_chain, get_prompt_and_tools, get_search_index
+from utils import get_custom_agent, get_prompt_and_tools_for_custom_agent
+question_starters = ['who', 'why', 'what', 'how', 'where', 'when', 'which', 'whom', 'whose']
+
+def run(question):
+    index = get_search_index()
+
+    # prompt, tools = get_prompt_and_tools()
+    # agent_chain = get_agent_chain(prompt, tools)
+
+    prompt, tools = get_prompt_and_tools_for_custom_agent()
+
+    agent_chain = get_custom_agent(prompt, tools)
+
+    result = None
+    try:
+        result = agent_chain.run(question)
+        print(result)
+    except ValueError as ve:
+        if "Could not parse LLM output:" in ve.args[0] and question.lower().startswith(tuple(question_starters)) and not question.lower().endswith('?'):
+            question = question + '?'
+            result = agent_chain.run(question)
+
+    return result
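A minimal smoke test for the new entry point (hypothetical, not part of this PR; assumes OPENAI_API_KEY is set in the environment, since utils.py builds its OpenAI client and embeddings at import time):

# hypothetical driver for main.run(), not part of the PR
from main import run

# A question without a trailing '?' also exercises the retry branch:
# if the agent's output fails to parse, run() appends '?' and retries once.
print(run("What can I 3D print at Makerlab"))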
poetry.lock
ADDED
The diff for this file is too large to render. See raw diff.
pyproject.toml
ADDED
@@ -0,0 +1,25 @@
+[tool.poetry]
+name = "makerlab-bot"
+version = "0.1.0"
+description = "Assistant Bot to Makerlab"
+authors = ["rohan-uiuc <rohan13@illinois.edu>"]
+readme = "README.md"
+packages = [{include = "makerlab_bot"}]
+
+[tool.poetry.dependencies]
+python = "^3.9"
+faiss-cpu = "^1.7.3"
+langchain = "^0.0.131"
+beautifulsoup4 = "^4.12.0"
+pypdf2 = "^3.0.1"
+openai = "^0.27.4"
+flask = "^2.2.3"
+flask-socketio = "^5.3.3"
+flask-cors = "^3.0.10"
+gevent = "^22.10.2"
+gevent-websocket = "^0.10.1"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
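Review note: requirements.txt is kept alongside the new Poetry files, so the Space presumably still installs from requirements.txt, while pyproject.toml and poetry.lock pin the same dependencies for local development (poetry install).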
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
-faiss-cpu
-langchain
+faiss-cpu
+langchain
 beautifulsoup4
 PyPDF2
 openai
static/chatbot.js
CHANGED
@@ -23,7 +23,7 @@ $(document).ready(function() {
     // Function to display message
     function displayMessage(message, isUser) {
         var $message = $('<div>').addClass('chat-message round');
-        var $messageText = $('<p>').html(message.replace(/(https?:\/\/[^\s]+)/g, '<a href="$1">$1</a>'));
+        var $messageText = $('<p>').html(message.replace(/(https?:\/\/[^\s,]+)/g, '<a href="$1">$1</a>'));

        $message.append($messageText);
        if (isUser) {
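The only functional change is adding ',' to the negated character class, so a URL followed by a comma in chat text no longer pulls the comma into the generated href. The same pattern, demonstrated in Python for brevity (illustration only, not part of the PR):

# illustration of the regex change, not part of the PR
import re

old = re.compile(r'(https?://[^\s]+)')
new = re.compile(r'(https?://[^\s,]+)')

text = "See https://makerlab.illinois.edu/, then ask me more."
print(old.sub(r'<a href="\1">\1</a>', text))  # comma ends up inside the href
print(new.sub(r'<a href="\1">\1</a>', text))  # comma stays outside the link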
utils.py
CHANGED
@@ -1,36 +1,111 @@
 import os
 import pickle
+import re
 import time
+from typing import List, Union
 from urllib.parse import urlparse, urljoin

 import faiss
 import requests
 from PyPDF2 import PdfReader
 from bs4 import BeautifulSoup
+from langchain import OpenAI, LLMChain
+from langchain.agents import ConversationalAgent
+from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent, AgentOutputParser
+from langchain.prompts import BaseChatPromptTemplate
+from langchain.chains import ConversationalRetrievalChain
 from langchain.docstore.document import Document
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.memory import ConversationBufferWindowMemory
+from langchain.schema import AgentAction, AgentFinish, HumanMessage
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.vectorstores.faiss import FAISS

 book_url = 'https://g.co/kgs/2VFC7u'
 book_file = "Book.pdf"
 url = 'https://makerlab.illinois.edu/'
-def get_search_index(pickle_file, index_file, embeddings):
+
+pickle_file = "open_ai.pkl"
+index_file = "open_ai.index"
+
+gpt_3_5 = OpenAI(model_name='gpt-3.5-turbo', temperature=0)
+
+embeddings = OpenAIEmbeddings()
+
+chat_history = []
+
+memory = ConversationBufferWindowMemory(memory_key="chat_history")
+
+gpt_3_5_index = None
+
+
+class CustomOutputParser(AgentOutputParser):
+
+    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
+        # Check if agent replied without using tools
+        if "AI:" in llm_output:
+            return AgentFinish(return_values={"output": llm_output.split("AI:")[-1].strip()},
+                               log=llm_output)
+        # Check if agent should finish
+        if "Final Answer:" in llm_output:
+            return AgentFinish(
+                # Return values is generally always a dictionary with a single `output` key
+                # It is not recommended to try anything else at the moment :)
+                return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
+                log=llm_output,
+            )
+        # Parse out the action and action input
+        regex = r"Action: (.*?)[\n]*Action Input:[\s]*(.*)"
+        match = re.search(regex, llm_output, re.DOTALL)
+        if not match:
+            raise ValueError(f"Could not parse LLM output: `{llm_output}`")
+        action = match.group(1).strip()
+        action_input = match.group(2)
+        # Return the action and action input
+        return AgentAction(tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output)
+
+
+# Set up a prompt template
+class CustomPromptTemplate(BaseChatPromptTemplate):
+    # The template to use
+    template: str
+    # The list of tools available
+    tools: List[Tool]
+
+    def format_messages(self, **kwargs) -> str:
+        # Get the intermediate steps (AgentAction, Observation tuples)
+        # Format them in a particular way
+        intermediate_steps = kwargs.pop("intermediate_steps")
+        thoughts = ""
+        for action, observation in intermediate_steps:
+            thoughts += action.log
+            thoughts += f"\nObservation: {observation}\nThought: "
+        # Set the agent_scratchpad variable to that value
+        kwargs["agent_scratchpad"] = thoughts
+        # Create a tools variable from the list of tools provided
+        kwargs["tools"] = "\n".join([f"{tool.name}: {tool.description}" for tool in self.tools])
+        # Create a list of tool names for the tools provided
+        kwargs["tool_names"] = ", ".join([tool.name for tool in self.tools])
+        formatted = self.template.format(**kwargs)
+        return [HumanMessage(content=formatted)]
+
+
+def get_search_index():
+    global gpt_3_5_index
     if os.path.isfile(pickle_file) and os.path.isfile(index_file) and os.path.getsize(pickle_file) > 0:
         # Load index from pickle file
         with open(pickle_file, "rb") as f:
             search_index = pickle.load(f)
     else:
-        …
-        faiss.write_index(search_index.index, index_file)
-        # Save index to pickle file
-        with open(pickle_file, "wb") as f:
-            pickle.dump(search_index, f)
+        search_index = create_index()
+
+    gpt_3_5_index = search_index
+
+
+def create_index():
+    source_chunks = create_chunk_documents()
+    search_index = search_index_from_docs(source_chunks)
+    faiss.write_index(search_index.index, index_file)
+    # Save index to pickle file
+    with open(pickle_file, "wb") as f:
+        pickle.dump(search_index, f)
     return search_index


@@ -118,19 +193,143 @@ def get_document_data(book_file, book_url):
     # print("document list" + str(len(document_list)))
     return document_list

-def search_index_from_docs(source_chunks…
+def search_index_from_docs(source_chunks):
     # Create index from chunk documents
     # print("Size of chunk" + str(len(source_chunks)))
     search_index = FAISS.from_texts([doc.page_content for doc in source_chunks], embeddings, metadatas=[doc.metadata for doc in source_chunks])
     return search_index
-…
+
+
+def get_qa_chain(gpt_3_5_index):
+    global gpt_3_5
+    print("index: " + str(gpt_3_5_index))
+    return ConversationalRetrievalChain.from_llm(gpt_3_5, chain_type="stuff", get_chat_history=get_chat_history,
+                                                 retriever=gpt_3_5_index.as_retriever(), return_source_documents=True, verbose=True)
+
+
+def get_chat_history(inputs) -> str:
+    res = []
+    for human, ai in inputs:
+        res.append(f"Human:{human}\nAI:{ai}")
+    return "\n".join(res)
+
+
+def generate_answer(question) -> str:
+    global chat_history, gpt_3_5_index
+    gpt_3_5_chain = get_qa_chain(gpt_3_5_index)
+    result = gpt_3_5_chain(
+        {"question": question, "chat_history": chat_history, "vectordbkwargs": {"search_distance": 0.8}})
+    print("Result: " + str(result))
+    chat_history = [(question, result["answer"])]
+    sources = []
+
+    for document in result['source_documents']:
+        source = document.metadata['source']
+        sources.append(source)
+
+    source = ',\n'.join(set(sources))
+    return result['answer'] + '\nSOURCES: ' + source
+
+
+def get_agent_chain(prompt, tools):
+    global gpt_3_5
+    # output_parser = CustomOutputParser()
+    llm_chain = LLMChain(llm=gpt_3_5, prompt=prompt)
+    agent = ConversationalAgent(llm_chain=llm_chain, tools=tools, verbose=True)
+    agent_chain = AgentExecutor.from_agent_and_tools(agent=agent, tools=tools, verbose=True, memory=memory,
+                                                     intermediate_steps=True)
+    return agent_chain
+
+
+def get_prompt_and_tools():
+    tools = get_tools()
+
+    prefix = """Have a conversation with a human, answering the following questions as best you can.
+Always try to use Vectorstore first.
+Your name is Makerlab Bot because you are a personal assistant of Makerlab. You have access to the following tools:"""
+    suffix = """Begin! If you use any tool, ALWAYS return a "SOURCES" part in your answer"
+
+{chat_history}
+Question: {input}
+{agent_scratchpad}
+SOURCES:"""
+    prompt = ConversationalAgent.create_prompt(
+        tools,
+        prefix=prefix,
+        suffix=suffix,
+        input_variables=["input", "chat_history", "agent_scratchpad"]
+    )
+    # print("Template: " + prompt.template)
+    return prompt, tools
+
+
+def get_tools():
+    tools = [
+        Tool(
+            name="Vectorstore",
+            func=generate_answer,
+            description="useful for when you need to answer questions about the Makerlab or 3D Printing.",
+            return_direct=True
+        )]
+    return tools
+
+
+def get_custom_agent(prompt, tools):
+
+    llm_chain = LLMChain(llm=gpt_3_5, prompt=prompt)
+
+    output_parser = CustomOutputParser()
+    tool_names = [tool.name for tool in tools]
+    agent = LLMSingleActionAgent(
+        llm_chain=llm_chain,
+        output_parser=output_parser,
+        stop=["\nObservation:"],
+        allowed_tools=tool_names
+    )
+    agent_executor = AgentExecutor.from_agent_and_tools(agent=agent, tools=tools, verbose=True, memory=memory,
+                                                        intermediate_steps=True)
+    return agent_executor
+
+
+def get_prompt_and_tools_for_custom_agent():
+    template = """
+Have a conversation with a human, answering the following questions as best you can.
+Always try to use Vectorstore first.
+Your name is Makerlab Bot because you are a personal assistant of Makerlab. You have access to the following tools:
+
+{tools}
+
+To answer for the new input, use the following format:
+
+New Input: the input question you must answer
+Thought: Do I need to use a tool? Yes
+Action: the action to take, should be one of [{tool_names}]
+Action Input: the input to the action
+Observation: the result of the action
+... (this Thought/Action/Action Input/Observation can repeat N times)
+Thought: I now know the final answer
+Final Answer: the final answer to the original input question. SOURCES: the sources referred to find the final answer
+
+When you have a response to say to the Human and DO NOT need to use a tool:
+1. DO NOT return "SOURCES" if you did not use any tool.
+2. You MUST use this format:
+```
+Thought: Do I need to use a tool? No
+AI: [your response here]
+```
+
+Begin! Remember to speak as a personal assistant when giving your final answer.
+ALWAYS return a "SOURCES" part in your answer, if you used any tool.
+
+Previous conversation history:
+{chat_history}
+New input: {input}
+{agent_scratchpad}
+SOURCES:"""
+    tools = get_tools()
+    prompt = CustomPromptTemplate(
+        template=template,
+        tools=tools,
+        # This omits the `agent_scratchpad`, `tools`, and `tool_names` variables because those are generated dynamically
+        # This includes the `intermediate_steps` variable because that is needed
+        input_variables=["input", "intermediate_steps", "chat_history"]
+    )
+    return prompt, tools
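For review, the three outcomes of CustomOutputParser.parse() that main.run() relies on can be checked with a quick script (hypothetical, not part of this PR; importing utils requires OPENAI_API_KEY, since the OpenAI client and embeddings are created at module import):

# hypothetical smoke test for CustomOutputParser, not part of the PR
from langchain.schema import AgentAction, AgentFinish
from utils import CustomOutputParser

parser = CustomOutputParser()

# 1. A tool call parses into an AgentAction.
step = parser.parse("Thought: Do I need to use a tool? Yes\n"
                    "Action: Vectorstore\n"
                    "Action Input: What printers does Makerlab have?")
assert isinstance(step, AgentAction) and step.tool == "Vectorstore"

# 2. A direct reply ("AI: ...") parses into an AgentFinish.
done = parser.parse("Thought: Do I need to use a tool? No\nAI: Hello!")
assert isinstance(done, AgentFinish) and done.return_values["output"] == "Hello!"

# 3. Anything else raises the ValueError that main.run() catches and retries on.
try:
    parser.parse("I have no idea.")
except ValueError as ve:
    print(ve)  # Could not parse LLM output: `I have no idea.`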
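Similarly, the rendered prompt for the custom agent can be inspected without running the agent loop, since CustomPromptTemplate fills {tools}, {tool_names} and the scratchpad at call time (again hypothetical, same import caveat):

# hypothetical: render the custom agent prompt for one intermediate step
from langchain.schema import AgentAction
from utils import get_prompt_and_tools_for_custom_agent

prompt, tools = get_prompt_and_tools_for_custom_agent()
messages = prompt.format_messages(
    input="Where is Makerlab located?",
    chat_history="",
    intermediate_steps=[(
        AgentAction(tool="Vectorstore",
                    tool_input="Where is Makerlab located?",
                    log="Action: Vectorstore\nAction Input: Where is Makerlab located?"),
        "Makerlab is at the University of Illinois.",
    )],
)
print(messages[0].content)  # full prompt with tools, tool_names and scratchpad filled in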