LongDocumentQuestioner

Sleeping

App Files Files Community

NicolasGaudemet committed on May 7, 2023

Commit

37bdd2c

•

1 Parent(s): 4803f16

Update document_questioner_app.py

Browse files

Files changed (1) hide show

document_questioner_app.py +43 -24

document_questioner_app.py CHANGED Viewed

@@ -1,50 +1,69 @@
 import openai
 import os
 import gradio as gr
-from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredFileLoader
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.vectorstores import Chroma
-from langchain.chains import RetrievalQA
 from langchain.chat_models import ChatOpenAI
 # SECURITY: an API key must never be hard-coded in a committed source file
 # (a key that was published this way should be treated as leaked and revoked).
 # The key is expected in the OPENAI_API_KEY environment variable instead,
 # e.g. configured as a secret of the hosting environment.
 if not os.environ.get("OPENAI_API_KEY"):
     raise RuntimeError("OPENAI_API_KEY environment variable is not set")
 def question_document(Document, Question):
     # Pre-commit version: answer `Question` about an uploaded text file.
     # Only `Document.name` is read from the upload; when that name does not
     # end with '.txt' the French error string below is returned instead of
     # an answer. Otherwise the result of the RetrievalQA run is returned.
-    # Load the document with TextLoader (read as ISO-8859-1)
-    if not Document.name.endswith('.txt'):
-        return ("Le document doit être un fichier texte (.txt)")
-    loader = TextLoader(Document.name, encoding = "ISO-8859-1")
-    #loader = DirectoryLoader("", glob="*.txt", loader_kwargs = {"encoding" : "ISO-8859-1"})
-    txt_docs = loader.load_and_split()
     # Create embeddings
     embeddings = OpenAIEmbeddings()
     # Write in DB
-    txt_docsearch = Chroma.from_documents(txt_docs, embeddings)
     # Define LLM
-    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3)
-    # Create Retriever
-    qa_txt = RetrievalQA.from_chain_type(llm=llm,
-                                        chain_type="map_reduce",
-                                        retriever=txt_docsearch.as_retriever()
-                                        )
-    answer = qa_txt.run(Question) #+ "If you don't find the answer in the document, don't answer, say you don't know, in the language of the question." )
     return answer
-#Interface definition
 # Wraps question_document in a Gradio Interface and launches it as soon as
 # the module is executed (there is no __main__ guard).
 iface = gr.Interface(
     fn = question_document,
-    inputs= ["file","text"],
     outputs = gr.outputs.Textbox(label="Réponse"),
-    title="Long Text Questioner",
-    description="par Nicolas \nPermet d'interroger un document texte",
     allow_flagging = "never")
 iface.launch()

 import openai
 import os
 import gradio as gr
+import chromadb
+from langchain.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader, UnstructuredFileLoader
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.vectorstores import Chroma
+from langchain.indexes import VectorstoreIndexCreator
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
+from langchain.prompts import PromptTemplate
 from langchain.chat_models import ChatOpenAI
+from langchain.chains.question_answering import load_qa_chain
 # SECURITY: an API key must never be hard-coded in a committed source file
 # (a key that was published this way should be treated as leaked and revoked).
 # The key is expected in the OPENAI_API_KEY environment variable instead,
 # e.g. configured as a secret of the hosting environment.
 if not os.environ.get("OPENAI_API_KEY"):
     raise RuntimeError("OPENAI_API_KEY environment variable is not set")
 def question_document(Document, Question):
     # Answer `Question` about the uploaded PDF `Document`.
     #
     # Document: uploaded-file object; only its `.name` (used as a path) is read.
     # Question: the question text handed to the QA chain.
     # Returns the French error string below when the file name does not end
     # with '.pdf'; otherwise returns whatever the chain call yields.
     # NOTE(review): because return_source_documents=True is requested, the
     # chain result is presumably a dict (answer, sources, source documents)
     # rather than a plain string -- confirm how the output Textbox renders it.
+    # loads a PDF document
     # The suffix test is case-sensitive, so a name ending in '.PDF' is rejected.
+    if not Document.name.endswith('.pdf'):
+      return ("Le fichier doit être un document PDF")
+    loader = PyPDFLoader(Document.name)
+    docs = loader.load()
     # Create embeddings
     embeddings = OpenAIEmbeddings()
     # Write in DB
     # Each document is stored under the id "page<N>", built from its "page"
     # metadata entry.
     # assumes every loaded document carries a distinct "page" value -- TODO confirm
+    docsearch = Chroma.from_documents(docs, embeddings, ids=["page" + str(d.metadata["page"]) for d in docs])
     # Define LLM
+    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.8)
+    # Customize map_reduce prompts
     # question_template is filled with {context} and {question};
     # combine_template is filled with {summaries} and {question}.
     # presumably the first is used per retrieved extract and the second to
     # merge the per-extract answers -- verify against the chain documentation.
+    question_template = """{context}
+    Precise the number starting the above text in your answer. It corresponds to its page number in the document it is from. Label this number as "page".
+    Also make sure to answer in the same langage than the following question.
+    QUESTION : {question}
+    ANSWER :
+    """
+    combine_template = """{summaries}
+    Note that the above text is based on transient extracts from one source document.
+    So make sure to not mention different documents or extracts or passages or portions or texts. There is only one, entire document.
+    Also make sure to answer in the same langage than the following question.
+    QUESTION : {question}.
+    ANSWER :
+    """
+    question_prompt = PromptTemplate(template = question_template, input_variables=['context', 'question'])
+    combine_prompt = PromptTemplate(template = combine_template, input_variables=['summaries', 'question'])
+    # Define chain
     # Both custom prompts are passed to the map_reduce chain via chain_type_kwargs;
     # the retriever is backed by the Chroma store built above.
+    chain_type_kwargs = { "combine_prompt" : combine_prompt, "question_prompt" : question_prompt} #, "return_intermediate_steps" : True}
+    qa = RetrievalQAWithSourcesChain.from_chain_type(llm = llm, chain_type = "map_reduce", chain_type_kwargs = chain_type_kwargs, retriever=docsearch.as_retriever(), return_source_documents = True)
+    answer = qa({"question" : Question}, return_only_outputs = True)
     return answer
 # Gradio front end: two inputs are passed to question_document and its result
 # is shown in a text box labelled "Réponse".
 iface = gr.Interface(
     fn = question_document,
     # NOTE(review): the previous revision passed "text" as the second input;
     # confirm that "question" is a component shortcut Gradio accepts.
+    inputs= ["file","question"],
     outputs = gr.outputs.Textbox(label="Réponse"),
+    title="Interrogateur de PDF",
+    description="par Nicolas \nPermet d'interroger un document PDF",
     allow_flagging = "never")
 # Launches the app as soon as the module is executed (no __main__ guard).
 iface.launch()