NicolasGaudemet committed on
Commit
37bdd2c
1 Parent(s): 4803f16

Update document_questioner_app.py

Browse files
Files changed (1) hide show
  1. document_questioner_app.py +43 -24
document_questioner_app.py CHANGED
@@ -1,50 +1,69 @@
1
  import openai
2
  import os
3
  import gradio as gr
4
- from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredFileLoader
 
5
  from langchain.embeddings.openai import OpenAIEmbeddings
6
  from langchain.vectorstores import Chroma
7
- from langchain.chains import RetrievalQA
 
 
 
8
  from langchain.chat_models import ChatOpenAI
 
9
 
10
  os.environ["OPENAI_API_KEY"] = "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"
11
 
12
  def question_document(Document, Question):
13
- # Load documents with DirectoryLoader
14
-
15
- if not Document.name.endswith('.txt'):
16
- return ("Le document doit être un fichier texte (.txt)")
17
 
18
- loader = TextLoader(Document.name, encoding = "ISO-8859-1")
19
-
20
- #loader = DirectoryLoader("", glob="*.txt", loader_kwargs = {"encoding" : "ISO-8859-1"})
21
- txt_docs = loader.load_and_split()
22
 
 
 
 
23
  # Create embeddings
24
  embeddings = OpenAIEmbeddings()
 
25
  # Write in DB
26
- txt_docsearch = Chroma.from_documents(txt_docs, embeddings)
27
 
28
  # Define LLM
29
- llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3)
30
-
31
- # Create Retriever
32
- qa_txt = RetrievalQA.from_chain_type(llm=llm,
33
- chain_type="map_reduce",
34
- retriever=txt_docsearch.as_retriever()
35
- )
36
 
37
- answer = qa_txt.run(Question) #+ "If you don't find the answer in the document, don't answer, say you don't know, in the language of the question." )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  return answer
39
 
40
- #Définition de l'interface
41
-
42
  iface = gr.Interface(
43
  fn = question_document,
44
- inputs= ["file","text"],
45
  outputs = gr.outputs.Textbox(label="Réponse"),
46
- title="Long Text Questioner",
47
- description="par Nicolas \nPermet d'interroger un document texte",
48
  allow_flagging = "never")
49
 
50
  iface.launch()
 
1
  import openai
2
  import os
3
  import gradio as gr
4
+ import chromadb
5
+ from langchain.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader, UnstructuredFileLoader
6
  from langchain.embeddings.openai import OpenAIEmbeddings
7
  from langchain.vectorstores import Chroma
8
+ from langchain.indexes import VectorstoreIndexCreator
9
+ from langchain.text_splitter import CharacterTextSplitter
10
+ from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
11
+ from langchain.prompts import PromptTemplate
12
  from langchain.chat_models import ChatOpenAI
13
+ from langchain.chains.question_answering import load_qa_chain
14
 
15
  os.environ["OPENAI_API_KEY"] = "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"
16
 
17
  def question_document(Document, Question):
 
 
 
 
18
 
19
+ # loads a PDF document
20
+ if not Document.name.endswith('.pdf'):
21
+ return ("Le fichier doit être un document PDF")
 
22
 
23
+ loader = PyPDFLoader(Document.name)
24
+ docs = loader.load()
25
+
26
  # Create embeddings
27
  embeddings = OpenAIEmbeddings()
28
+
29
  # Write in DB
30
+ docsearch = Chroma.from_documents(docs, embeddings, ids=["page" + str(d.metadata["page"]) for d in docs])
31
 
32
  # Define LLM
33
+ llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.8)
 
 
 
 
 
 
34
 
35
+ # Customize map_reduce prompts
36
+ question_template = """{context}
37
+ Precise the number starting the above text in your answer. It corresponds to its page number in the document it is from. Label this number as "page".
38
+ Also make sure to answer in the same langage than the following question.
39
+ QUESTION : {question}
40
+ ANSWER :
41
+ """
42
+
43
+ combine_template = """{summaries}
44
+ Note that the above text is based on transient extracts from one source document.
45
+ So make sure to not mention different documents or extracts or passages or portions or texts. There is only one, entire document.
46
+ Also make sure to answer in the same langage than the following question.
47
+ QUESTION : {question}.
48
+ ANSWER :
49
+ """
50
+
51
+ question_prompt = PromptTemplate(template = question_template, input_variables=['context', 'question'])
52
+ combine_prompt = PromptTemplate(template = combine_template, input_variables=['summaries', 'question'])
53
+
54
+ # Define chain
55
+ chain_type_kwargs = { "combine_prompt" : combine_prompt, "question_prompt" : question_prompt} #, "return_intermediate_steps" : True}
56
+ qa = RetrievalQAWithSourcesChain.from_chain_type(llm = llm, chain_type = "map_reduce", chain_type_kwargs = chain_type_kwargs, retriever=docsearch.as_retriever(), return_source_documents = True)
57
+
58
+ answer = qa({"question" : Question}, return_only_outputs = True)
59
  return answer
60
 
 
 
61
  iface = gr.Interface(
62
  fn = question_document,
63
+ inputs= ["file","question"],
64
  outputs = gr.outputs.Textbox(label="Réponse"),
65
+ title="Interrogateur de PDF",
66
+ description="par Nicolas \nPermet d'interroger un document PDF",
67
  allow_flagging = "never")
68
 
69
  iface.launch()