# docuverse/app.py
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationTokenBufferMemory
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from langchain.document_loaders import (
    CSVLoader,
    NotebookLoader,
    PyPDFLoader,
    PythonLoader,
    TextLoader,
    UnstructuredEPubLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredODTLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
)
from langchain.llms import CTransformers
import os
import tempfile
import streamlit as st

FILE_LOADER_MAPPING = {
    "csv": (CSVLoader, {"encoding": "utf-8"}),
    "doc": (UnstructuredWordDocumentLoader, {}),
    "docx": (UnstructuredWordDocumentLoader, {}),
    "epub": (UnstructuredEPubLoader, {}),
    "html": (UnstructuredHTMLLoader, {}),
    "md": (UnstructuredMarkdownLoader, {}),
    "odt": (UnstructuredODTLoader, {}),
    "pdf": (PyPDFLoader, {}),
    "ppt": (UnstructuredPowerPointLoader, {}),
    "pptx": (UnstructuredPowerPointLoader, {}),
    "txt": (TextLoader, {"encoding": "utf8"}),
    "ipynb": (NotebookLoader, {}),
    "py": (PythonLoader, {}),
    # Add more mappings for other file extensions and loaders as needed
}
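
# Usage sketch for the mapping above (hypothetical file name): look up the
# loader class and its constructor kwargs by extension, then instantiate it
# on a concrete path.
#
#     loader_class, loader_args = FILE_LOADER_MAPPING["md"]
#     loader = loader_class("notes.md", **loader_args)  # UnstructuredMarkdownLoader
#     docs = loader.load()
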

def load_model():
    """
    Load the quantized Mistral-7B-Instruct GGUF model through CTransformers.

    Weights are fetched from the Hugging Face Hub on first use; generated
    tokens are streamed to stdout via the callback handler.
    """
    llm = CTransformers(
        model="TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
        model_file="mistral-7b-instruct-v0.1.Q8_0.gguf",
        callbacks=[StreamingStdOutCallbackHandler()],
    )
    return llm
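
# Smoke-test sketch (assumes the GGUF weights download successfully;
# CTransformers exposes the standard LangChain LLM call interface):
#
#     llm = load_model()
#     print(llm("Explain retrieval-augmented generation in one sentence."))
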

def create_vector_database(loaded_documents):
    """
    Build and persist a Chroma vector database from already-loaded documents.

    The documents are split into overlapping chunks, embedded with the
    BAAI/bge-large-en model, and persisted into a local Chroma store.
    """
    # Split loaded documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500, chunk_overlap=40, length_function=len
    )
    chunked_documents = text_splitter.split_documents(loaded_documents)
    # Initialize HuggingFace BGE embeddings
    embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en")
    # Create and persist a Chroma vector database from the chunked documents
    persist_directory = "db"
    db = Chroma.from_documents(
        documents=chunked_documents,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    db.persist()
    return db
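
# Query sketch against the store built above (hypothetical question text):
#
#     db = create_vector_database(loaded_documents)
#     hits = db.similarity_search("What does the document say about pricing?", k=3)
#     for doc in hits:
#         print(doc.metadata, doc.page_content[:80])
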

def set_custom_prompt_condense():
    """
    Prompt template that rewrites a follow-up question as a standalone question.
    """
    _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
    CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
    return CONDENSE_QUESTION_PROMPT
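
# Formatting sketch (hypothetical history): the condense prompt collapses the
# running chat plus a follow-up into a single retriever-friendly question.
#
#     condense = set_custom_prompt_condense()
#     print(condense.format(chat_history="Human: Who wrote it?\nAI: Jane Doe.",
#                           question="When?"))
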

def set_custom_prompt():
    """
    Prompt template for answering from the retrieved sources.
    """
    prompt_template = """<Instructions>
Important:
Answer using only the facts listed in the sources below. If there isn't enough information below, say you don't know.
If asking a clarifying question to the user would help, ask the question.
ALWAYS return a "SOURCES" part in your answer, except for small-talk conversations.
</Instructions>

{context}

Question: {question}
Helpful Answer:
---------------------------
Sources:
"""
    prompt = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )
    return prompt
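
# Rendering sketch: the retrieval chain fills {context} with the retrieved
# chunks and {question} with the (possibly condensed) user question.
#
#     prompt = set_custom_prompt()
#     print(prompt.format(context="Chunk 1 ...", question="What is Docuverse?"))
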

def create_chain(llm, prompt, CONDENSE_QUESTION_PROMPT, db):
    """
    Creates a conversational Retrieval Question-Answering (QA) chain.

    This function initializes a ConversationalRetrievalChain with a token-buffer
    memory and a "stuff" combine-documents chain. The retriever is set up to
    return the top 3 results (k=3).

    Args:
        llm (any): The language model to be used in the chain.
        prompt (PromptTemplate): The answer prompt for the combine-docs step.
        CONDENSE_QUESTION_PROMPT (PromptTemplate): The prompt used to condense
            the chat history and follow-up question into a standalone question.
        db (any): The vector database to be used as the retriever.

    Returns:
        ConversationalRetrievalChain: The initialized conversational chain.
    """
    memory = ConversationTokenBufferMemory(
        llm=llm,
        memory_key="chat_history",
        return_messages=True,
        input_key="question",
        output_key="answer",
    )
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=db.as_retriever(search_kwargs={"k": 3}),
        return_source_documents=True,
        max_tokens_limit=256,
        combine_docs_chain_kwargs={"prompt": prompt},
        condense_question_prompt=CONDENSE_QUESTION_PROMPT,
        memory=memory,
    )
    return chain
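
# Invocation sketch: the chain carries history in its memory object, so
# repeated calls with just {"question": ...} behave like a conversation.
#
#     chain = create_chain(llm, prompt, CONDENSE_QUESTION_PROMPT, db)
#     result = chain({"question": "Who is the author?"})
#     print(result["answer"], [d.metadata for d in result["source_documents"]])
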

def create_retrieval_qa_bot(loaded_documents):
    """
    Wires together the model, prompts, vector database, and retrieval chain.
    """
    try:
        llm = load_model()
    except Exception as e:
        raise Exception(f"Failed to load model: {str(e)}")
    try:
        prompt = set_custom_prompt()
    except Exception as e:
        raise Exception(f"Failed to get prompt: {str(e)}")
    try:
        CONDENSE_QUESTION_PROMPT = set_custom_prompt_condense()
    except Exception as e:
        raise Exception(f"Failed to get condense prompt: {str(e)}")
    try:
        db = create_vector_database(loaded_documents)
    except Exception as e:
        raise Exception(f"Failed to get database: {str(e)}")
    try:
        qa = create_chain(
            llm=llm,
            prompt=prompt,
            CONDENSE_QUESTION_PROMPT=CONDENSE_QUESTION_PROMPT,
            db=db,
        )
    except Exception as e:
        raise Exception(f"Failed to create retrieval QA chain: {str(e)}")
    return qa

def retrieve_bot_answer(query, loaded_documents):
    """
    Retrieves the answer to a given query using the retrieval QA bot.

    This function creates an instance of the QA bot over the supplied
    documents, passes the query to it, and returns the bot's response.

    Args:
        query (str): The question to be answered by the QA bot.
        loaded_documents (list): The documents to index and retrieve from.

    Returns:
        dict: The QA bot's response, typically a dictionary with response details.
    """
    qa_bot_instance = create_retrieval_qa_bot(loaded_documents)
    bot_response = qa_bot_instance({"question": query})
    # Check if the 'answer' key exists in the bot_response dictionary
    if "answer" in bot_response:
        return bot_response
    else:
        raise KeyError("Expected 'answer' key in bot_response, but it was not found.")
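
# End-to-end sketch without the Streamlit UI (hypothetical PDF path):
#
#     docs = PyPDFLoader("report.pdf").load()
#     print(retrieve_bot_answer("Summarise the report.", docs)["answer"])
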

def main():
    st.title("Docuverse")

    # Upload files
    uploaded_files = st.file_uploader(
        "Upload your documents",
        type=["pdf", "md", "txt", "csv", "py", "epub", "html", "ppt", "pptx", "doc", "docx", "odt", "ipynb"],
        accept_multiple_files=True,
    )
    loaded_documents = []
    if uploaded_files:
        # Save the uploaded files to a temporary directory and process them
        with tempfile.TemporaryDirectory() as td:
            for uploaded_file in uploaded_files:
                st.write(f"Uploaded: {uploaded_file.name}")
                ext = os.path.splitext(uploaded_file.name)[-1][1:].lower()
                # Check if the extension is in FILE_LOADER_MAPPING
                if ext in FILE_LOADER_MAPPING:
                    loader_class, loader_args = FILE_LOADER_MAPPING[ext]
                    file_path = os.path.join(td, uploaded_file.name)
                    with open(file_path, "wb") as temp_file:
                        temp_file.write(uploaded_file.read())
                    # Use the matching LangChain loader to process the file
                    loader = loader_class(file_path, **loader_args)
                    loaded_documents.extend(loader.load())
                else:
                    st.warning(f"Unsupported file extension: {ext}")

    st.write("Chat with the Document:")
    query = st.text_input("Ask a question:")

    if st.button("Get Answer"):
        if not query:
            st.warning("Please enter a question.")
        elif not loaded_documents:
            st.warning("Please upload at least one supported document first.")
        else:
            try:
                # retrieve_bot_answer loads the model, builds the vector
                # database, and runs the retrieval chain end to end, so the
                # components do not need to be constructed again here.
                response = retrieve_bot_answer(query, loaded_documents)
                # Display bot response
                st.write("Bot Response:")
                st.write(response["answer"])
            except Exception as e:
                st.error(f"An error occurred: {str(e)}")


if __name__ == "__main__":
    main()