from climateqa.engine.vectorstore import build_vectores_stores, get_PDF_Names_from_GCP, get_categories_files
from climateqa.engine.text_retriever import ClimateQARetriever
from climateqa.engine.rag import make_rag_chain
from climateqa.engine.llm import get_llm
from utils import create_user_id
from datetime import datetime
import json
import os
import re
import gradio as gr
from sentence_transformers import CrossEncoder
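
# Cross-encoder model loaded at startup; available for reranking retrieved
# passages (not currently wired into the retrieval chain below).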
reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")

# Load environment variables (e.g. API keys) from a local .env file if present.
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass

theme = gr.themes.Soft(
    primary_hue="yellow",
    secondary_hue="orange",
    font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif",
          "system-ui", "sans-serif"],
)
init_prompt = "" |
|
|
|
system_template = { |
|
"role": "system", |
|
"content": init_prompt, |
|
} |
|
|
|
user_id = create_user_id() |
|
|
|
list_categorie = get_categories_files() |
|
categories=list_categorie["AllCat"] |
|
|
|

def parse_output_llm_with_sources(output):
    """Replace [Doc i] citation markers in the LLM output with HTML links to the source cards."""
    content_parts = re.split(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', output)
    parts = []
    for part in content_parts:
        if part.startswith("Doc"):
            subparts = part.split(",")
            subparts = [subpart.lower().replace("doc", "").strip()
                        for subpart in subparts]
            subparts = [
                f"""<a href="#doc{subpart}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup style="color:#FFC000 !important;">({subpart})</sup></span></a>"""
                for subpart in subparts
            ]
            parts.append("".join(subparts))
        else:
            parts.append(part)
    return "".join(parts)
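
# Example: parse_output_llm_with_sources("see [Doc 1, Doc 2]") turns the marker
# into two superscript <a href="#doc1"> / <a href="#doc2"> reference links.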

def serialize_docs(docs):
    """Convert LangChain documents into JSON-serializable dicts for logging."""
    new_docs = []
    for doc in docs:
        new_docs.append({
            "page_content": doc.page_content,
            "metadata": doc.metadata,
        })
    return new_docs
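
# Build the vector store from the local ./sources folder and instantiate the
# LLM used for answer generation.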
vectorstore = build_vectores_stores("./sources")
llm = get_llm(provider="openai", max_tokens=1024, temperature=0.0)

async def chat(query, history, categories, src_nb_max, src_pertinence):
    """Take a query and the message history, run the pipeline
    (reformulation, retrieval, answering), and yield the updated chat
    history (gradio format) together with the rendered source documents."""

    print(f">> NEW QUESTION : {query} -> sources max: {src_nb_max} - pertinence: {src_pertinence}")

    # Build a metadata filter restricting retrieval to files of the selected categories.
    doc_filter = None
    if categories:
        doc_filter = {"$or": []}
        for cat in categories:
            for fich in list_categorie[cat]:
                doc_filter["$or"].append({"ax_name": fich})

    print(">> Filter : " + str(doc_filter))
    print(">> nb sources : " + str(src_nb_max))
    print(">> pertinence : " + str(src_pertinence))

    retriever = ClimateQARetriever(
        vectorstore=vectorstore, sources=["Custom"], reports=[],
        threshold=src_pertinence, k_total=src_nb_max, filter=doc_filter
    )
    rag_chain = make_rag_chain(retriever, llm)

    inputs = {"query": query, "audience": None}
    result = rag_chain.astream_log(inputs)

    # Paths of the intermediate outputs in the streamed run log.
    path_reformulation = "/logs/reformulation/final_output"
    path_keywords = "/logs/keywords/final_output"
    path_retriever = "/logs/find_documents/final_output"
    path_answer = "/logs/answer/streamed_output_str/-"

    docs = []  # keep defined even if the retriever step emits no documents
    docs_html = ""
    output_query = ""
    output_language = ""
    output_keywords = ""

    try:
        async for op in result:
            op = op.ops[0]

            if op['path'] == path_reformulation:  # reformulated question + detected language
                try:
                    output_language = op['value']["language"]
                    output_query = op["value"]["question"]
                except Exception as e:
                    raise gr.Error(f"ClimateQ&A Error: {e} - The error has been noted; try another question, and if it persists you can contact us :)")

            elif op["path"] == path_keywords:  # extracted search keywords
                try:
                    output_keywords = op['value']["keywords"]
                    output_keywords = " AND ".join(output_keywords)
                except Exception:
                    pass

            elif op['path'] == path_retriever:  # retrieved source documents
                try:
                    docs = op['value']['docs']
                    docs_html = []
                    for i, d in enumerate(docs, 1):
                        docs_html.append(make_html_source(d, i))
                    docs_html = "".join(docs_html)
                except TypeError:
                    print("No documents found")
                    print("op: ", op)
                    continue

            elif op['path'] == path_answer:  # streamed answer tokens
                new_token = op['value']
                previous_answer = history[-1][1]
                previous_answer = previous_answer if previous_answer is not None else ""
                answer_yet = previous_answer + new_token
                answer_yet = parse_output_llm_with_sources(answer_yet)
                history[-1] = (query, answer_yet)

            else:
                continue

            history = [tuple(x) for x in history]
            yield history, docs_html

    except Exception as e:
        raise gr.Error(f"{e}")
    # Persist the interaction locally as a timestamped JSON log.
    timestamp = str(datetime.now().timestamp())
    log_file = "logs/" + timestamp + ".json"
    prompt = history[-1][0]
    logs = {
        "user_id": str(user_id),
        "prompt": prompt,
        "query": prompt,
        "question": output_query,
        "sources": ["Custom"],
        "docs": serialize_docs(docs),
        "answer": history[-1][1],
        "time": timestamp,
    }
    log_locally(log_file, logs)

    yield history, docs_html
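
# Sketch (an assumption, not part of the current pipeline): the CrossEncoder
# loaded above could rerank the retrieved docs inside chat(), before
# make_html_source is called, e.g.:
#
#   pairs = [(query, d.page_content) for d in docs]
#   scores = reranker.predict(pairs)
#   docs = [d for _, d in sorted(zip(scores, docs), key=lambda t: t[0], reverse=True)]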
|
def make_html_source(source, i):
    """Render one retrieved document as an HTML card (score, name, page, download link)."""
    text_content = source.page_content.strip()
    meta = source.metadata

    name = f"<b>Document {i}</b>"

    card = f"""
    <div class="card" id="doc{i}">
        <div class="card-content">
            <div>
                <div style="float:right;width:10%;position:relative;top:0px">
                    <a href='{meta['ax_url']}' target='_blank'><img style="width:20px" src='/file/assets/download.png' /></a>
                </div>
                <div>
                    <h2>Extrait {i} (Score: {float(meta['similarity_score'])})</h2>
                    <h2>{meta['ax_name']} - Page {int(meta['ax_page'])}</h2>
                </div>
            </div>
            <p>{text_content}</p>
        </div>
        <!-- <div class="card-footer">
            <span>{name}</span>
        </div> -->
    </div>
    """

    return card

def log_locally(file, logs):
    """Write the logs dict as JSON to the given file, creating its directory if needed."""
    os.makedirs(os.path.dirname(file) or ".", exist_ok=True)
    logs_json = json.dumps(logs)
    with open(file, 'w') as f:
        f.write(logs_json)
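
# Welcome message shown as the assistant's first chat turn (reassigns the
# placeholder defined near the top of the file).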
init_prompt = """ |
|
Hello, I am Clara, an AI Assistant created by Axionable. My purpose is to answer your questions using the provided extracted passages, context, and guidelines. |
|
|
|
❓ How to interact with Clara |
|
|
|
Ask your question: You can ask me anything you want to know. I'll provide an answer based on the extracted passages and other relevant sources. |
|
Response structure: I aim to provide clear and structured answers using the given data. |
|
Guidelines: I follow specific guidelines to ensure that my responses are accurate and useful. |
|
⚠️ Limitations |
|
Though I do my best to help, there might be times when my responses are incorrect or incomplete. If that happens, please feel free to ask for more information or provide feedback to help improve my performance. |
|
|
|
What would you like to know today? |
|
""" |
|
|
|
|
|
with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-component", elem_classes="ax_background") as demo: |
|
|
|
gr.HTML(""" |
|
<img style="width:100px" src="file/assets/axionable.svg"/> |
|
""", elem_classes="logo-axio ") |
|
|
|
|
|
with gr.Tab("CLARA"): |
|
|
|
with gr.Row(elem_id="chatbot-row"): |
|
with gr.Column(scale=2): |
|
chatbot = gr.Chatbot( |
|
value=[(None, init_prompt)], |
|
show_copy_button=True, show_label=False, elem_id="chatbot", layout="panel", |
|
avatar_images=(None, "assets/logo4.png")) |
|
|
|
with gr.Row(elem_id="input-message"): |
|
textbox = gr.Textbox(placeholder="Posez votre question", show_label=False, |
|
scale=7, lines=1, interactive=True, elem_id="input-textbox") |
|
|
|
|
|

            with gr.Column(scale=1, variant="panel", elem_id="right-panel"):

                with gr.Tabs() as tabs:

                    with gr.Tab("sources"):
                        sources_textbox = gr.HTML(
                            show_label=False, elem_id="sources-textbox")
                        docs_textbox = gr.State("")

                    with gr.Tab("filtres"):
                        cat_sel = gr.CheckboxGroup(categories, label="Catégories")
                        slider = gr.Slider(1, 10, value=7, step=1, label="nb max",
                                           interactive=True, elem_id="source-nb-max")
                        slider_p = gr.Slider(0.0, 1.0, value=0.5, step=0.01, label="pertinence",
                                             interactive=True, elem_id="source-pertinence")
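
                        # cat_sel, slider and slider_p feed the categories, src_nb_max and
                        # src_pertinence parameters of chat() via the submit chain below.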
|
with gr.Tab("À propos", elem_classes="max-height other-tabs"): |
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
gr.Markdown( |
|
("CLARA (Climate LLM for Adaptation & Risks Answers) by [Axionable](https://www.axionable.com/)" |
|
"– Fork de [ClimateQ&A](https://huggingface.co/spaces/Ekimetrics/climate-question-answering/tree/main)"), elem_classes="a-propos") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|

    def start_chat(query, history):
        history = history + [(query, None)]
        history = [tuple(x) for x in history]
        return (gr.update(interactive=False), gr.update(selected=1), history)

    def finish_chat():
        return (gr.update(interactive=True, value=""))

    (textbox
        .submit(start_chat, [textbox, chatbot], [textbox, tabs, chatbot], queue=False, api_name="start_chat_textbox")
        .then(chat, [textbox, chatbot, cat_sel, slider, slider_p], [chatbot, sources_textbox], concurrency_limit=8, api_name="chat_textbox")
        .then(finish_chat, None, [textbox], api_name="finish_chat_textbox")
    )
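
    # Submit chain: start_chat appends the user turn and locks the textbox,
    # chat() streams (history, sources HTML) into the chatbot and sources tab,
    # and finish_chat re-enables and clears the textbox.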
|
demo.queue()

demo.launch(allowed_paths=["assets/download.png",
                           "assets/logo4.png",
                           "assets/axionable.svg"],
            favicon_path="assets/logo4.png")