# LongDocumentQuestioner / document_questioner_app.py
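"""Question-answering app for long PDF documents.

Loads a PDF, embeds each page with OpenAI embeddings into a Chroma vector
store, then answers questions about it through a LangChain
ConversationalRetrievalChain served by a Gradio interface.
"""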
import os

import gradio as gr
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate  # used by the commented-out custom prompts below
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI  # used by the commented-out davinci alternative below
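# NOTE: these import paths follow the legacy (pre-0.1) LangChain package
# layout; on current releases most of these classes live under
# langchain_community instead (e.g. langchain_community.document_loaders).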
def load_document(Document):
    """Loads a PDF document, embeds its pages, and indexes them in Chroma."""
    if not Document:
        return "Please provide a PDF document"
    if not Document.name.endswith('.pdf'):
        return "Please provide a PDF document"
    loader = PyPDFLoader(Document.name)
    docs = loader.load()  # one Document per PDF page
    global k
    k = len(docs)  # number of pages; used as the retriever's k at query time
    # Create embeddings
    embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OpenaiKey'])
    # Write the embedded pages to the vector store, one id per page
    global docsearch
    docsearch = Chroma.from_documents(docs, embeddings, ids=["page" + str(d.metadata["page"]) for d in docs])
    global chat_history
    chat_history = []
    return "Encoding created"
def get_chat_history(inputs) -> str:
    """Formats (question, answer) pairs into a single string for the prompt."""
    res = []
    for human, ai in inputs:
        res.append(f"Question: {human}\nAnswer: {ai}")
    return "\n".join(res)
def question_document(Question):
    if "docsearch" not in globals():
        return "Please encode a PDF document first"

    # Define the LLM
    turbo = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=os.environ['OpenaiKey'])
    # davinci = OpenAI(model_name="text-davinci-003", openai_api_key=os.environ['OpenaiKey'])  # unused alternative LLM

    # Customized map_reduce prompts (kept for reference)
    #question_template = """{context}
    #State the number at the beginning of the above text in your answer; it is the page number of this extract in the source document. Label this number as "page".
    #Also make sure to answer in the same language as the following question.
    #QUESTION: {question}
    #ANSWER:
    #"""
    #combine_template = """{summaries}
    #Note that the above text is based on transient extracts from a single source document.
    #So make sure not to mention different documents or extracts or passages or portions or texts. There is only one, entire document.
    #Also make sure to answer in the same language as the following question.
    #QUESTION: {question}.
    #ANSWER:
    #"""
    #question_prompt = PromptTemplate(template=question_template, input_variables=['context', 'question'])
    #combine_prompt = PromptTemplate(template=combine_template, input_variables=['summaries', 'question'])

    # Define the chain: condense the question with the chat history, retrieve
    # the pages, then map_reduce them into a single answer
    #chain_type_kwargs = {"combine_prompt": combine_prompt, "question_prompt": question_prompt}  #, "return_intermediate_steps": True}
    #qa = RetrievalQAWithSourcesChain.from_chain_type(llm=turbo, chain_type="map_reduce", chain_type_kwargs=chain_type_kwargs, retriever=docsearch.as_retriever(), return_source_documents=True)
    vectordbkwargs = {"search_distance": 10}
    search_kwargs = {"k": k}  # retrieve every page so map_reduce sees the whole document
    qa = ConversationalRetrievalChain.from_llm(llm=turbo, chain_type="map_reduce", retriever=docsearch.as_retriever(search_kwargs=search_kwargs), get_chat_history=get_chat_history, return_source_documents=True)
    answer = qa({"question": Question, "chat_history": chat_history, "vectordbkwargs": vectordbkwargs}, return_only_outputs=True)
    chat_history.append((Question, answer["answer"]))
    print(answer)  # log the full chain output, including source documents
    return get_chat_history(chat_history)
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # PDF Questioner
        by Nicolas and Alex
        """)
    with gr.Row():
        with gr.Column():
            input_file = gr.File(label="Load a document")
            encode_btn = gr.Button("Encode the document")
            output_words = gr.Textbox(label="Encoding")
            encode_btn.click(fn=load_document, inputs=input_file, outputs=output_words)
        with gr.Column():
            text = gr.Textbox(label="Question")
            ask_btn = gr.Button("Ask a question")
            answer = gr.Textbox(label="Answer", lines=8)
            ask_btn.click(fn=question_document, inputs=text, outputs=answer)

demo.launch()
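# To run locally (assuming an OpenAI API key and the legacy LangChain/Gradio
# versions these imports target):
#   export OpenaiKey=<your-openai-api-key>
#   python document_questioner_app.py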