fakezeta committed • commit 6feb027 • 1 parent: 0983982

first release
Files changed:
- app.py +113 -0
- ingest_data.py +42 -0
- query_data.py +43 -0
- requirements.txt +9 -0
- style.css +23 -0
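In short: this release adds a Streamlit app for chatting with an uploaded PDF entirely on local hardware. ingest_data.py loads and splits the PDF and indexes it in a Chroma vector store using TensorFlow Hub sentence embeddings; query_data.py downloads a quantized Vicuna-7B model and runs it through llama.cpp inside a LangChain ConversationalRetrievalChain; app.py ties both together in a chat UI. After pip install -r requirements.txt, the app starts with streamlit run app.py.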
app.py
ADDED
@@ -0,0 +1,113 @@
import streamlit as st
from streamlit_chat import message
from ingest_data import embed_doc
from query_data import get_chain
import os
import time

st.set_page_config(page_title="LangChain Local PDF Chat", page_icon=":robot:")

footer = """<style>
.footer {
    position: fixed;
    left: 0;
    bottom: 0;
    width: 100%;
    background-color: white;
    color: black;
    text-align: right;
}
</style>
<div class="footer">
<p>Adapted with ❤ and \U0001F916 by Fakezeta from the original Mobilefirst</p>
</div>
"""
st.markdown(footer, unsafe_allow_html=True)

def process_file(uploaded_file):
    # Persist the upload to disk so PyPDFLoader can read it, build the
    # vector store, then delete the temporary copy.
    with open(uploaded_file.name, "wb") as f:
        f.write(uploaded_file.getbuffer())
    st.write("File uploaded successfully")

    with st.spinner("Document is being vectorized...."):
        vectorstore = embed_doc(uploaded_file.name)
    os.remove(uploaded_file.name)
    return vectorstore

def get_text():
    # Currently unused helper; note that st.session_state.disabled is never set.
    input_text = st.text_input("You: ", value="", key="input", disabled=st.session_state.disabled)
    return input_text

def query(query):
    start = time.time()
    with st.spinner("Doing magic...."):
        # Pass only the most recent exchange as history to keep the prompt
        # within the model's 2048-token context window.
        if len(st.session_state.past) > 0 and len(st.session_state.generated) > 0:
            chat_history = [("HUMAN: " + st.session_state.past[-1],
                             "ASSISTANT: " + st.session_state.generated[-1])]
        else:
            chat_history = []
        print("chat_history:", chat_history)
        output = st.session_state.chain.run(input=query,
                                            question=query,
                                            vectorstore=st.session_state.vectorstore,
                                            chat_history=chat_history)
    end = time.time()
    print("Query time: " + str(round(end - start, 1)))
    return output


with open("style.css") as f:
    st.markdown('<style>{}</style>'.format(f.read()), unsafe_allow_html=True)

st.header("Local Chat with PDF")

# Initialize session state on first run.
if "uploaded_file_name" not in st.session_state:
    st.session_state.uploaded_file_name = ""

if "past" not in st.session_state:
    st.session_state.past = []

if "generated" not in st.session_state:
    st.session_state["generated"] = []

if "vectorstore" not in st.session_state:
    st.session_state.vectorstore = None

if "chain" not in st.session_state:
    st.session_state.chain = None

uploaded_file = st.file_uploader("Choose a file", type=['pdf'])

if uploaded_file:
    # A different file invalidates the previous index, chain, and chat history.
    if uploaded_file.name != st.session_state.uploaded_file_name:
        st.session_state.vectorstore = None
        st.session_state.chain = None
        st.session_state["generated"] = []
        st.session_state.past = []
        st.session_state.uploaded_file_name = uploaded_file.name
        st.session_state.all_messages = []
    print(st.session_state.uploaded_file_name)
    if not st.session_state.vectorstore:
        st.session_state.vectorstore = process_file(uploaded_file)

    if st.session_state.vectorstore and not st.session_state.chain:
        with st.spinner("Loading Large Language Model...."):
            st.session_state.chain = get_chain(st.session_state.vectorstore)

    searching = False
    user_input = st.text_input("You: ", value="", key="input", disabled=searching)
    send_button = st.button(label="Query")
    if send_button and user_input:
        searching = True
        output = query(user_input)
        searching = False
        st.session_state.past.append(user_input)
        st.session_state.generated.append(output)
    # Render the conversation, newest exchange on top.
    if st.session_state["generated"]:
        for i in range(len(st.session_state["generated"]) - 1, -1, -1):
            message(st.session_state["generated"][i], key=str(i))
            message(st.session_state.past[i], is_user=True, key=str(i) + "_user")
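For reference, the same pipeline can be driven without the UI. A minimal headless sketch, assuming "sample.pdf" is a placeholder for a real file and that the model download and llama.cpp load succeed (embed_doc and get_chain are defined in the files below):

from ingest_data import embed_doc
from query_data import get_chain

vectorstore = embed_doc("sample.pdf")  # placeholder filename: split, embed, and index the PDF
chain = get_chain(vectorstore)         # fetch and load the quantized Vicuna-7B model
answer = chain.run(question="What is this document about?", chat_history=[])
print(answer)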
ingest_data.py
ADDED
@@ -0,0 +1,42 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import TensorflowHubEmbeddings
import os
import time
import streamlit as st

def embed_doc(filename):
    if len(os.listdir(".")) > 0:
        # Load the PDF and split it into ~1000-character chunks.
        loader = PyPDFLoader(filename)
        start = time.time()
        raw_documents = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=0,
            length_function=len
        )
        documents = text_splitter.split_documents(raw_documents)
        end = time.time()
        st.text("Load and split text: " + str(round(end - start, 1)))

        # Load the embedding model; alternatives tried are kept for reference.
        start = time.time()
        # embeddings = LlamaCppEmbeddings(model_path="ggml-model.bin")
        # embeddings = HuggingFaceEmbeddings(model_name="diptanuc/all-mpnet-base-v2", model_kwargs={'device': 'cpu'})
        # embeddings = TensorflowHubEmbeddings(model_url="https://tfhub.dev/google/universal-sentence-encoder/4")
        # embeddings = HuggingFaceEmbeddings(model_name="obrizum/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'})
        embeddings = TensorflowHubEmbeddings(model_url="https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3")
        end = time.time()
        st.text("Embedding time: " + str(round(end - start, 1)))

        # Index the chunks in an in-memory Chroma vector store.
        start = time.time()
        vectorstore = Chroma.from_documents(documents, embeddings)
        end = time.time()
        st.text("Vectorizing time: " + str(round(end - start, 1)))
        return vectorstore
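Because embed_doc returns the Chroma store itself, the index can be sanity-checked before wiring it to the LLM. A small sketch using Chroma's standard similarity_search (the query string is a placeholder):

# Fetch the four chunks nearest to the query and show where each came from.
docs = vectorstore.similarity_search("main topic of the document", k=4)
for doc in docs:
    print(doc.metadata.get("page"), doc.page_content[:80])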
query_data.py
ADDED
@@ -0,0 +1,43 @@
from langchain.prompts.prompt import PromptTemplate
from langchain.llms import LlamaCpp
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from huggingface_hub import hf_hub_download

import psutil
import os

# Custom prompts, kept for reference; the chain's defaults are used instead.
#_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
#You can assume the question is about the uploaded document.
#
#Chat History:
#{chat_history}
#Follow Up Input: {question}
#Standalone question:"""
#CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

#template = """You are an AI assistant for answering questions about the uploaded document.
#You are given the following extracted parts of a long document and a question. Provide a conversational answer.
#If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
#If the question is not about the uploaded document, politely inform them that you are tuned to only answer questions about the uploaded document.
#=========
#{context}
#=========
#Question: {question}
#
#Answer in Markdown:"""
##QA_PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])
#QA_PROMPT = PromptTemplate(template=template, input_variables=["question"])


def get_chain(vectorstore):
    # Download the quantized Vicuna-7B weights on first run.
    if not os.path.exists("ggml-vic7b-q5_1.bin"):
        hf_hub_download(repo_id="eachadea/ggml-vicuna-7b-1.1", filename="ggml-vic7b-q5_1.bin", local_dir=".")
    # Use half of the physical cores; n_threads must be an integer.
    llm = LlamaCpp(model_path="ggml-vic7b-q5_1.bin", n_ctx=2048, n_threads=psutil.cpu_count(logical=False) // 2)
    qa_chain = ConversationalRetrievalChain.from_llm(
        llm,
        vectorstore.as_retriever(),
        # condense_question_prompt=CONDENSE_QUESTION_PROMPT,
    )
    return qa_chain
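ConversationalRetrievalChain first rewrites the follow-up question together with chat_history into a standalone question, then retrieves chunks and answers; the history is a list of (human, assistant) tuples, which is why app.py passes only the last exchange to stay inside the 2048-token context. A sketch of multi-turn use with placeholder questions:

history = []
first = qa_chain.run(question="Who wrote the document?", chat_history=history)
history.append(("Who wrote the document?", first))
# The follow-up is condensed into a standalone question using the history.
follow_up = qa_chain.run(question="When was it published?", chat_history=history)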
requirements.txt
ADDED
@@ -0,0 +1,9 @@
langchain
typing-extensions>=4.5.0
llama-cpp-python
streamlit_chat
pypdf
chromadb
tensorflow_text
psutil
huggingface-hub
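Note: tensorflow_text is listed even though no file imports it directly; the multilingual Universal Sentence Encoder used in ingest_data.py depends on custom ops that the package registers at import time.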
style.css
ADDED
@@ -0,0 +1,23 @@
.main {
    background-color: black; /* You can change the color to your preference */
    color: white;
}

/* Change the background color of the sidebar */
.sidebar .block-container {
    background-color: black; /* You can change the color to your preference */
}

.footer {
    position: fixed;
    left: 0;
    bottom: 0;
    width: 100%;
    background-color: black;
    color: white;
    text-align: right;
}

h1, h2, h3, h4, h5, h6, p, label, .stMarkdown, .sidebar .block-container {
    color: white;
}