Update main.py
main.py CHANGED
@@ -4,19 +4,14 @@ import bcrypt
 from typing import List
 from pathlib import Path
 from langchain_huggingface import HuggingFaceEmbeddings
-#from langchain_community.llms import HuggingFaceEndpoint
 from langchain_huggingface import HuggingFaceEndpoint
-
-from langchain.prompts import ChatPromptTemplate
+from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain.schema import StrOutputParser
-from langchain_community.document_loaders import (
-    PyMuPDFLoader,
-)
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.vectorstores import Chroma
 
-from
-from
+from operator import itemgetter
+from pinecone import Pinecone
+
+from langchain.memory import ConversationBufferMemory
 from langchain.schema.runnable import Runnable, RunnablePassthrough, RunnableConfig
 from langchain.callbacks.base import BaseCallbackHandler
 
@@ -39,47 +34,6 @@ def auth_callback(username: str, password: str):
         identifier=ident + " : 🧑🎓 User Datapcc", metadata={"role": "user", "provider": "credentials"}
     )
 
-chunk_size = 1024
-chunk_overlap = 50
-
-embeddings_model = HuggingFaceEmbeddings()
-
-PDF_STORAGE_PATH = "./public/pdfs"
-
-
-def process_pdfs(pdf_storage_path: str):
-    pdf_directory = Path(pdf_storage_path)
-    docs = []  # type: List[Document]
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
-
-    for pdf_path in pdf_directory.glob("*.pdf"):
-        loader = PyMuPDFLoader(str(pdf_path))
-        documents = loader.load()
-        docs += text_splitter.split_documents(documents)
-
-    doc_search = Chroma.from_documents(docs, embeddings_model)
-
-    namespace = "chromadb/my_documents"
-    record_manager = SQLRecordManager(
-        namespace, db_url="sqlite:///record_manager_cache.sql"
-    )
-    record_manager.create_schema()
-
-    index_result = index(
-        docs,
-        record_manager,
-        doc_search,
-        cleanup="incremental",
-        source_id_key="source",
-    )
-
-    print(f"Indexing stats: {index_result}")
-
-    return doc_search
-
-
-doc_search = process_pdfs(PDF_STORAGE_PATH)
-#model = ChatOpenAI(model_name="gpt-4", streaming=True)
 os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.environ['HUGGINGFACEHUB_API_TOKEN']
 repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 
@@ -114,31 +68,38 @@ async def on_chat_start():
     await cl.Message(f"Vous pouvez requêter sur la thématique : {res.get('value')}").send()
     cl.user_session.set("selectRequest", res.get("value"))
 
-
-
+    memory = ConversationBufferMemory(return_messages=True)
+    template = """<s>[INST] Vous êtes un chercheur de l'enseignement supérieur et vous êtes doué pour faire des analyses d'articles de recherche sur les thématiques liées à la pédagogie, en fonction des critères définis ci-avant.
+
+    En fonction des informations suivantes et du contexte suivant seulement et strictement, répondez en langue française strictement à la question ci-dessous à partir du contexte ci-dessous. Si vous ne pouvez pas répondre à la question sur la base des informations, dites que vous ne trouvez pas de réponse ou que vous ne parvenez pas à trouver de réponse. Essayez donc de comprendre en profondeur le contexte et répondez uniquement en vous basant sur les informations fournies. Ne générez pas de réponses non pertinentes.
     {context}
-
-    Question: {question}
+    {question} [/INST] </s>
     """
-    prompt = ChatPromptTemplate.
-
-
-
-
-
-
+    prompt = ChatPromptTemplate.from_messages(
+        [
+            (
+                "system",
+                f"Contexte : Vous êtes un chercheur de l'enseignement supérieur et vous êtes doué pour faire des analyses d'articles de recherche sur les thématiques liées à la pédagogie. En fonction des informations suivantes et du contexte suivant seulement et strictement.",
+            ),
+            MessagesPlaceholder(variable_name="history"),
+            ("human", "Contexte : {context}, réponds à la question suivante de la manière la plus pertinente, la plus exhaustive et la plus détaillée possible. {question}."),
+        ]
+    )
     runnable = (
-
+        RunnablePassthrough.assign(
+            history=RunnableLambda(memory.load_memory_variables) | itemgetter("history")
+        )
         | prompt
        | model
-        | StrOutputParser()
     )
 
+    cl.user_session.set("memory", memory)
     cl.user_session.set("runnable", runnable)
 
 
 @cl.on_message
 async def on_message(message: cl.Message):
+    runnable = cl.user_session.get("memory")
     runnable = cl.user_session.get("runnable")  # type: Runnable
     msg = cl.Message(content="")
 
@@ -159,21 +120,34 @@ async def on_message(message: cl.Message):
             self.sources.add(source_page_pair)  # Add unique pairs to the set
 
         def on_llm_end(self, response, *, run_id, parent_run_id, **kwargs):
-            cl.user_session.set("selectRequest","")
             if len(self.sources):
                 sources_text = "\n".join([f"{source}#page={page}" for source, page in self.sources])
                 self.msg.elements.append(
                     cl.Text(name="Sources", content=sources_text, display="inline")
                 )
 
+    os.environ['PINECONE_API_KEY'] = os.environ['PINECONE_API_KEY']
+    embeddings = HuggingFaceEmbeddings()
+    index_name = "all-venus"
+    pc = Pinecone(
+        api_key=os.environ['PINECONE_API_KEY']
+    )
+    index = pc.Index(index_name)
+    xq = embeddings.embed_query(message.content)
+    xc = index.query(vector=xq, filter={"categorie": {"$eq": "bibliographie-OPP-DGDIN"}},top_k=150, include_metadata=True)
+    context_p = ""
+    for result in xc['matches']:
+        context_p = context_p + result['metadata']['text']
+
     async with cl.Step(type="run", name="QA Assistant"):
         async for chunk in runnable.astream(
-
+            {"question": message.content, "context":context_p},
             config=RunnableConfig(callbacks=[
-                cl.
-                PostMessageHandler(msg)
+                cl.AsyncLangchainCallbackHandler(stream_final_answer=True)
            ]),
         ):
             await msg.stream_token(chunk)
 
-    await msg.send()
+    await msg.send()
+    memory.chat_memory.add_user_message(message.content)
+    memory.chat_memory.add_ai_message(msg.content)
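
Taken end to end, the commit swaps the old in-process PDF/Chroma indexing for a Pinecone lookup plus conversation memory around the Mixtral endpoint. The sketch below is a minimal standalone approximation of that flow, not the committed file: it adds the RunnableLambda import that the new code uses without importing, keeps the memory object under its own name (the diff stores it in the session but reads it back into runnable), and shortens the prompt text; the model id, index name, and metadata filter are taken from the diff, while the example question is made up.

# Minimal sketch (not the committed file): assumes RunnableLambda is imported
# explicitly and that PINECONE_API_KEY / HUGGINGFACEHUB_API_TOKEN are set.
import os
from operator import itemgetter

from pinecone import Pinecone
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
from langchain.memory import ConversationBufferMemory
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

# Same model and embeddings as the diff
model = HuggingFaceEndpoint(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1")
embeddings = HuggingFaceEmbeddings()

# Prompt with a history slot so earlier turns are re-injected on every call
prompt = ChatPromptTemplate.from_messages([
    ("system", "Vous êtes un chercheur de l'enseignement supérieur, spécialiste de l'analyse d'articles sur la pédagogie. Répondez uniquement à partir du contexte fourni."),
    MessagesPlaceholder(variable_name="history"),
    ("human", "Contexte : {context}\n\nQuestion : {question}"),
])

memory = ConversationBufferMemory(return_messages=True)

# history is loaded from memory, then prompt and model run as in the diff
chain = (
    RunnablePassthrough.assign(
        history=RunnableLambda(memory.load_memory_variables) | itemgetter("history")
    )
    | prompt
    | model
)


def retrieve_context(question: str) -> str:
    """Query the 'all-venus' Pinecone index used in the diff and concatenate the matched chunks."""
    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
    index = pc.Index("all-venus")
    res = index.query(
        vector=embeddings.embed_query(question),
        filter={"categorie": {"$eq": "bibliographie-OPP-DGDIN"}},
        top_k=150,
        include_metadata=True,
    )
    return "".join(m["metadata"]["text"] for m in res["matches"])


question = "Quels effets de la classe inversée sur la réussite étudiante ?"  # hypothetical example
answer = chain.invoke({"question": question, "context": retrieve_context(question)})

# Persist the turn so the next question sees it through MessagesPlaceholder("history")
memory.chat_memory.add_user_message(question)
memory.chat_memory.add_ai_message(answer)
print(answer)

In the Chainlit handler itself, astream replaces invoke for streaming, and the two add_*_message calls run after the final token has been sent, as in the new on_message above.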
|