LongDocumentQuestioner

Sleeping

App Files Files Community

LongDocumentQuestioner / document_questioner_app.py

NicolasGaudemet

Update document_questioner_app.py

d4b788f over 1 year ago

raw

history blame

2.73 kB

	import os
	import uuid

	import chromadb
	import gradio as gr
	import openai
	from langchain.chains import RetrievalQAWithSourcesChain
	from langchain.chat_models import ChatOpenAI
	from langchain.document_loaders import PyPDFLoader
	from langchain.embeddings.openai import OpenAIEmbeddings
	from langchain.indexes import VectorstoreIndexCreator
	from langchain.prompts import PromptTemplate
	from langchain.vectorstores import Chroma

	def question_document(Document, Question):
	    """Answer a question about an uploaded PDF with a retrieval QA chain.

	    The PDF is loaded with ``PyPDFLoader``, embedded with OpenAI embeddings
	    into a Chroma collection, and queried through a map-reduce
	    ``RetrievalQAWithSourcesChain`` driven by ``gpt-3.5-turbo``.

	    Args:
	        Document: The uploaded file. Either an object whose ``.name``
	            attribute is the file path (presumably the Gradio upload
	            wrapper), a plain path string, or ``None`` when nothing was
	            uploaded.
	        Question: The question to ask about the document.

	    Returns:
	        The ``"answer"`` entry of the chain output, or a fixed French error
	        message when the upload is missing or is not a ``.pdf`` file.

	    Raises:
	        KeyError: If the ``OpenaiKey`` environment variable is not set.
	    """
	    # Validate the upload first. A missing file (None) gets the same
	    # user-facing message as a wrong extension instead of an AttributeError,
	    # and the extension test is case-insensitive so "REPORT.PDF" is accepted.
	    path = Document if isinstance(Document, str) else getattr(Document, "name", None)
	    if not isinstance(path, str) or not path.lower().endswith('.pdf'):
	        return ("Le fichier doit être un document PDF")

	    # Read the key once; it is needed by both the embeddings and the LLM.
	    api_key = os.environ['OpenaiKey']

	    # Customize map_reduce prompts.
	    # NOTE: the continuation lines of both templates are kept exactly as
	    # they appear in the original file so the prompt text sent to the model
	    # is unchanged.
	    question_template = """{context}
	Precise the number starting the above text in your answer. It corresponds to its page number in the document it is from. Label this number as "page".
	Also make sure to answer in the same langage than the following question.
	QUESTION : {question}
	ANSWER :
	"""

	    combine_template = """{summaries}
	Note that the above text is based on transient extracts from one source document.
	So make sure to not mention different documents or extracts or passages or portions or texts. There is only one, entire document.
	Also make sure to answer in the same langage than the following question.
	QUESTION : {question}.
	ANSWER :
	"""

	    question_prompt = PromptTemplate(template=question_template, input_variables=['context', 'question'])
	    combine_prompt = PromptTemplate(template=combine_template, input_variables=['summaries', 'question'])

	    # Load the PDF; each loaded document carries its page number in
	    # metadata["page"], which is used below to build the record ids.
	    pages = PyPDFLoader(path).load()

	    embeddings = OpenAIEmbeddings(openai_api_key=api_key)

	    # Index the pages in a collection that belongs to this call only.
	    # With the default collection name, pages of a previously uploaded
	    # (longer) PDF could remain stored under ids this document does not
	    # overwrite and then be retrieved as context for the wrong document.
	    docsearch = Chroma.from_documents(
	        pages,
	        embeddings,
	        ids=["page" + str(d.metadata["page"]) for d in pages],
	        collection_name="doc-" + uuid.uuid4().hex,
	    )

	    try:
	        llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2, openai_api_key=api_key)

	        chain_type_kwargs = {"combine_prompt": combine_prompt, "question_prompt": question_prompt}
	        qa = RetrievalQAWithSourcesChain.from_chain_type(
	            llm=llm,
	            chain_type="map_reduce",
	            chain_type_kwargs=chain_type_kwargs,
	            retriever=docsearch.as_retriever(),
	            return_source_documents=True,
	        )

	        answer = qa({"question": Question}, return_only_outputs=True)
	        return answer["answer"]
	    finally:
	        # The per-call collection is never reused, so drop it to keep the
	        # vector store from growing with every request.
	        docsearch.delete_collection()

	# Build the web UI: a file upload and a free-text question go in, and the
	# text returned by question_document is shown in a box labelled "Réponse".
	iface = gr.Interface(
	    fn=question_document,
	    inputs=["file", "text"],
	    # The gr.outputs namespace is deprecated and was removed in Gradio 4;
	    # the top-level Textbox component is the supported equivalent.
	    outputs=gr.Textbox(label="Réponse"),
	    title="Interrogateur de PDF",
	    description="par Nicolas \nPermet d'interroger un document PDF",
	    allow_flagging="never",
	)

	# Start serving the interface as soon as the script is executed.
	iface.launch()