|
import openai |
|
import os |
|
import gradio as gr |
|
import chromadb |
|
from langchain.document_loaders import PyPDFLoader |
|
from langchain.embeddings.openai import OpenAIEmbeddings |
|
from langchain.vectorstores import Chroma |
|
from langchain.indexes import VectorstoreIndexCreator |
|
from langchain.chains import RetrievalQAWithSourcesChain |
|
from langchain.prompts import PromptTemplate |
|
from langchain.chat_models import ChatOpenAI |
|
|
|
def question_document(Document, Question):
    """Answer a question about an uploaded PDF with a map-reduce QA chain.

    The PDF is loaded page by page, each page is embedded with OpenAI
    embeddings and indexed in a Chroma collection, and the question is
    answered by a ``RetrievalQAWithSourcesChain`` running ``gpt-3.5-turbo``.

    Args:
        Document: Uploaded file object exposing a ``name`` attribute that
            holds the path of the file on disk, or ``None`` when nothing was
            uploaded.
        Question: The question to ask about the document.

    Returns:
        The answer text produced by the chain, or a French error message
        when no PDF file was supplied.

    Raises:
        KeyError: If the ``OpenaiKey`` environment variable is not set.
    """
    # Local import: only needed to name the per-call Chroma collection.
    import uuid

    # Guard against a missing upload (Document is None) and accept any
    # casing of the extension (".pdf", ".PDF", ...).
    if Document is None or not Document.name.lower().endswith('.pdf'):
        return "Le fichier doit être un document PDF"

    # Read the key once; it is needed by both the embeddings and the chat model.
    api_key = os.environ['OpenaiKey']

    loader = PyPDFLoader(Document.name)
    docs = loader.load()

    embeddings = OpenAIEmbeddings(openai_api_key=api_key)

    # Every call uses the same "pageN" ids. Without a dedicated collection
    # they all land in Chroma's default collection, so pages of a previously
    # uploaded PDF can collide with, or leak into, the current one whenever
    # the in-memory store is shared between calls (this depends on the
    # chromadb version). A unique collection per call keeps documents apart.
    docsearch = Chroma.from_documents(
        docs,
        embeddings,
        ids=["page" + str(d.metadata["page"]) for d in docs],
        collection_name=f"pdf-{uuid.uuid4().hex}",
    )

    try:
        llm = ChatOpenAI(
            model_name="gpt-3.5-turbo",
            temperature=0.2,
            openai_api_key=api_key,
        )

        # Prompt applied to each retrieved extract (the "map" step).
        # NOTE(review): the prompt refers to a number starting the text, but
        # nothing in this function prepends the page number to the page
        # content -- confirm the loader output really starts with it.
        question_template = (
            "{context}\n"
            "Precise the number starting the above text in your answer. "
            "It corresponds to its page number in the document it is from. "
            "Label this number as \"page\".\n"
            "Also make sure to answer in the same langage than the following question.\n"
            "QUESTION : {question}\n"
            "ANSWER :\n"
        )

        # Prompt that merges the per-extract answers (the "reduce" step).
        combine_template = (
            "{summaries}\n"
            "Note that the above text is based on transient extracts from one source document.\n"
            "So make sure to not mention different documents or extracts or passages "
            "or portions or texts. There is only one, entire document.\n"
            "Also make sure to answer in the same langage than the following question.\n"
            "QUESTION : {question}.\n"
            "ANSWER :\n"
        )

        question_prompt = PromptTemplate(
            template=question_template,
            input_variables=['context', 'question'],
        )
        combine_prompt = PromptTemplate(
            template=combine_template,
            input_variables=['summaries', 'question'],
        )

        chain_type_kwargs = {
            "combine_prompt": combine_prompt,
            "question_prompt": question_prompt,
        }
        qa = RetrievalQAWithSourcesChain.from_chain_type(
            llm=llm,
            chain_type="map_reduce",
            chain_type_kwargs=chain_type_kwargs,
            retriever=docsearch.as_retriever(),
            return_source_documents=True,
        )

        answer = qa({"question": Question}, return_only_outputs=True)
        return answer["answer"]
    finally:
        # Drop the per-call collection so indexed pages do not pile up in
        # the vector store over the lifetime of the process.
        docsearch.delete_collection()
|
|
|
# Gradio UI: a file upload and a free-text question are passed to
# question_document, and its return value is shown in a text box.
iface = gr.Interface(
    fn=question_document,
    inputs=["file", "text"],
    # gr.Textbox is the current component class; the legacy
    # gr.outputs.Textbox namespace is deprecated and removed in Gradio 4.
    outputs=gr.Textbox(label="Réponse"),
    title="Interrogateur de PDF",
    description="par Nicolas \nPermet d'interroger un document PDF",
    allow_flagging="never",
)

# Launch the interface as soon as the module is executed.
iface.launch()