Spaces:
Running
Running
some mods
Browse files- app.py +50 -40
- config.py +6 -17
- database/__init__.py +0 -0
- database/vaccine_coverage_db.py +0 -3
- initialize_db.py +0 -3
- rag/rag_pipeline.py +7 -27
- utils/prompts.py +22 -0
app.py
CHANGED
@@ -1,62 +1,72 @@
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
-
import
|
3 |
-
from database.vaccine_coverage_db import VaccineCoverageDB
|
4 |
from rag.rag_pipeline import RAGPipeline
|
5 |
-
from utils.
|
6 |
-
from config import
|
7 |
-
|
8 |
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
|
|
14 |
|
15 |
-
# Initialize database and RAG pipeline
|
16 |
-
db = VaccineCoverageDB(DB_PATH)
|
17 |
-
rag = RAGPipeline(METADATA_FILE, PDF_DIR, use_semantic_splitter=True)
|
18 |
|
|
|
|
|
19 |
|
20 |
-
def query_rag(question, prompt_type):
|
21 |
if prompt_type == "Highlight":
|
22 |
-
|
|
|
|
|
23 |
else:
|
24 |
-
|
25 |
|
26 |
-
|
27 |
-
return
|
28 |
|
29 |
|
30 |
-
def
|
31 |
-
|
32 |
-
if
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
|
39 |
|
40 |
-
# Gradio interface
|
41 |
with gr.Blocks() as demo:
|
42 |
-
gr.Markdown("#
|
43 |
|
44 |
-
with gr.
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
|
|
|
|
49 |
|
50 |
-
|
51 |
-
|
|
|
|
|
|
|
|
|
52 |
)
|
53 |
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
save_output = gr.Textbox(label="Save Result")
|
58 |
|
59 |
-
|
|
|
|
|
|
|
|
|
60 |
|
61 |
if __name__ == "__main__":
|
62 |
demo.launch()
|
|
|
1 |
+
# app.py
|
2 |
+
|
3 |
import gradio as gr
|
4 |
+
import json
|
|
|
5 |
from rag.rag_pipeline import RAGPipeline
|
6 |
+
from utils.prompts import highlight_prompt, evidence_based_prompt
|
7 |
+
from config import STUDY_FILES
|
8 |
+
|
9 |
|
10 |
+
def load_rag_pipeline(study_name):
|
11 |
+
study_file = STUDY_FILES.get(study_name)
|
12 |
+
if study_file:
|
13 |
+
return RAGPipeline(study_file)
|
14 |
+
else:
|
15 |
+
raise ValueError(f"Invalid study name: {study_name}")
|
16 |
|
|
|
|
|
|
|
17 |
|
18 |
+
def query_rag(study_name, question, prompt_type):
|
19 |
+
rag = load_rag_pipeline(study_name)
|
20 |
|
|
|
21 |
if prompt_type == "Highlight":
|
22 |
+
prompt = highlight_prompt
|
23 |
+
elif prompt_type == "Evidence-based":
|
24 |
+
prompt = evidence_based_prompt
|
25 |
else:
|
26 |
+
prompt = None
|
27 |
|
28 |
+
response = rag.query(question, prompt)
|
29 |
+
return response.response
|
30 |
|
31 |
|
32 |
+
def get_study_info(study_name):
|
33 |
+
study_file = STUDY_FILES.get(study_name)
|
34 |
+
if study_file:
|
35 |
+
with open(study_file, "r") as f:
|
36 |
+
data = json.load(f)
|
37 |
+
return f"Number of documents: {len(data)}\nFirst document title: {data[0]['title']}"
|
38 |
+
else:
|
39 |
+
return "Invalid study name"
|
40 |
|
41 |
|
|
|
42 |
with gr.Blocks() as demo:
|
43 |
+
gr.Markdown("# RAG Pipeline Demo")
|
44 |
|
45 |
+
with gr.Row():
|
46 |
+
study_dropdown = gr.Dropdown(
|
47 |
+
choices=list(STUDY_FILES.keys()), label="Select Study"
|
48 |
+
)
|
49 |
+
study_info = gr.Textbox(label="Study Information", interactive=False)
|
50 |
+
|
51 |
+
study_dropdown.change(get_study_info, inputs=[study_dropdown], outputs=[study_info])
|
52 |
|
53 |
+
with gr.Row():
|
54 |
+
question_input = gr.Textbox(label="Enter your question")
|
55 |
+
prompt_type = gr.Radio(
|
56 |
+
["Default", "Highlight", "Evidence-based"],
|
57 |
+
label="Prompt Type",
|
58 |
+
value="Default",
|
59 |
)
|
60 |
|
61 |
+
submit_button = gr.Button("Submit")
|
62 |
+
|
63 |
+
answer_output = gr.Textbox(label="Answer")
|
|
|
64 |
|
65 |
+
submit_button.click(
|
66 |
+
query_rag,
|
67 |
+
inputs=[study_dropdown, question_input, prompt_type],
|
68 |
+
outputs=[answer_output],
|
69 |
+
)
|
70 |
|
71 |
if __name__ == "__main__":
|
72 |
demo.launch()
|
config.py
CHANGED
@@ -1,20 +1,9 @@
|
|
1 |
import os
|
2 |
|
3 |
-
# Base directory
|
4 |
-
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
5 |
-
|
6 |
-
# Database configuration
|
7 |
-
DB_NAME = "vaccine_coverage_study.db"
|
8 |
-
DB_PATH = os.path.join(BASE_DIR, DB_NAME)
|
9 |
-
|
10 |
-
# RAG Pipeline configuration
|
11 |
-
DATA_DIR = os.path.join(BASE_DIR, "data")
|
12 |
-
METADATA_FILE = os.path.join(DATA_DIR, "metadata_map.json")
|
13 |
-
PDF_DIR = os.path.join(DATA_DIR, "pdfs")
|
14 |
-
|
15 |
-
# Create directories if they don't exist
|
16 |
-
os.makedirs(DATA_DIR, exist_ok=True)
|
17 |
-
os.makedirs(PDF_DIR, exist_ok=True)
|
18 |
-
|
19 |
-
# OpenAI configuration
|
20 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
4 |
+
|
5 |
+
STUDY_FILES = {
|
6 |
+
"Vaccine Coverage": "data/vaccine_coverage_zotero_items.json",
|
7 |
+
"Ebola Virus": "data/ebola_virus_zotero_items.json",
|
8 |
+
"Gene Xpert": "data/gene_xpert_zotero_items.json",
|
9 |
+
}
|
database/__init__.py
DELETED
File without changes
|
database/vaccine_coverage_db.py
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:42a0645cdd38f2d7ede525768eb21a4cbe08b4d86959cb4eb2349887f2bcf70e
|
3 |
-
size 1774
|
|
|
|
|
|
|
|
initialize_db.py
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:08030c4783a86d9a9afb9437b102dde959405b6b2857725eec02b6d9c2699e97
|
3 |
-
size 2346
|
|
|
|
|
|
|
|
rag/rag_pipeline.py
CHANGED
@@ -1,24 +1,13 @@
|
|
|
|
|
|
1 |
import json
|
2 |
-
import
|
3 |
-
from
|
4 |
-
from llama_index.core import (
|
5 |
-
VectorStoreIndex,
|
6 |
-
Document,
|
7 |
-
SentenceWindowNodeParser,
|
8 |
-
)
|
9 |
-
from llama_index.core.node_parser import (
|
10 |
-
SentenceSplitter,
|
11 |
-
)
|
12 |
from llama_index.core import PromptTemplate
|
13 |
|
14 |
|
15 |
class RAGPipeline:
|
16 |
-
def __init__(
|
17 |
-
self,
|
18 |
-
study_json,
|
19 |
-
use_semantic_splitter=False,
|
20 |
-
):
|
21 |
-
|
22 |
self.study_json = study_json
|
23 |
self.index = None
|
24 |
self.use_semantic_splitter = use_semantic_splitter
|
@@ -34,10 +23,7 @@ class RAGPipeline:
|
|
34 |
for index, doc_data in enumerate(self.data):
|
35 |
doc_content = (
|
36 |
f"Title: {doc_data['title']}\n"
|
37 |
-
f"Abstract: {doc_data['abstract']}\n"
|
38 |
f"Authors: {', '.join(doc_data['authors'])}\n"
|
39 |
-
f"Year: {doc_data['year']}\n"
|
40 |
-
f"DOI: {doc_data['doi']}\n"
|
41 |
f"Full Text: {doc_data['full_text']}"
|
42 |
)
|
43 |
|
@@ -50,11 +36,7 @@ class RAGPipeline:
|
|
50 |
}
|
51 |
|
52 |
self.documents.append(
|
53 |
-
Document(
|
54 |
-
text=doc_content,
|
55 |
-
id_=f"doc_{index}",
|
56 |
-
metadata=metadata,
|
57 |
-
)
|
58 |
)
|
59 |
|
60 |
def build_index(self):
|
@@ -71,7 +53,6 @@ class RAGPipeline:
|
|
71 |
)
|
72 |
|
73 |
nodes = node_parser.get_nodes_from_documents(self.documents)
|
74 |
-
|
75 |
self.index = VectorStoreIndex(nodes)
|
76 |
|
77 |
def query(self, question, prompt_template=None):
|
@@ -89,8 +70,7 @@ class RAGPipeline:
|
|
89 |
)
|
90 |
|
91 |
query_engine = self.index.as_query_engine(
|
92 |
-
text_qa_template=prompt_template,
|
93 |
-
similarity_top_k=5,
|
94 |
)
|
95 |
response = query_engine.query(question)
|
96 |
|
|
|
1 |
+
# rag/rag_pipeline.py
|
2 |
+
|
3 |
import json
|
4 |
+
from llama_index.core import Document, VectorStoreIndex
|
5 |
+
from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
from llama_index.core import PromptTemplate
|
7 |
|
8 |
|
9 |
class RAGPipeline:
|
10 |
+
def __init__(self, study_json, use_semantic_splitter=False):
|
|
|
|
|
|
|
|
|
|
|
11 |
self.study_json = study_json
|
12 |
self.index = None
|
13 |
self.use_semantic_splitter = use_semantic_splitter
|
|
|
23 |
for index, doc_data in enumerate(self.data):
|
24 |
doc_content = (
|
25 |
f"Title: {doc_data['title']}\n"
|
|
|
26 |
f"Authors: {', '.join(doc_data['authors'])}\n"
|
|
|
|
|
27 |
f"Full Text: {doc_data['full_text']}"
|
28 |
)
|
29 |
|
|
|
36 |
}
|
37 |
|
38 |
self.documents.append(
|
39 |
+
Document(text=doc_content, id_=f"doc_{index}", metadata=metadata)
|
|
|
|
|
|
|
|
|
40 |
)
|
41 |
|
42 |
def build_index(self):
|
|
|
53 |
)
|
54 |
|
55 |
nodes = node_parser.get_nodes_from_documents(self.documents)
|
|
|
56 |
self.index = VectorStoreIndex(nodes)
|
57 |
|
58 |
def query(self, question, prompt_template=None):
|
|
|
70 |
)
|
71 |
|
72 |
query_engine = self.index.as_query_engine(
|
73 |
+
text_qa_template=prompt_template, similarity_top_k=5
|
|
|
74 |
)
|
75 |
response = query_engine.query(question)
|
76 |
|
utils/prompts.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from llama_index.core import PromptTemplate
|
2 |
+
|
3 |
+
highlight_prompt = PromptTemplate(
|
4 |
+
"Context information is below.\n"
|
5 |
+
"---------------------\n"
|
6 |
+
"{context_str}\n"
|
7 |
+
"---------------------\n"
|
8 |
+
"Given this information, please answer the question: {query_str}\n"
|
9 |
+
"Include all relevant information from the provided context. "
|
10 |
+
"Highlight key information by enclosing it in **asterisks**. "
|
11 |
+
"When quoting specific information, please use square brackets to indicate the source, e.g. [1], [2], etc."
|
12 |
+
)
|
13 |
+
|
14 |
+
evidence_based_prompt = PromptTemplate(
|
15 |
+
"Context information is below.\n"
|
16 |
+
"---------------------\n"
|
17 |
+
"{context_str}\n"
|
18 |
+
"---------------------\n"
|
19 |
+
"Given this information, please answer the question: {query_str}\n"
|
20 |
+
"Provide an answer to the question using evidence from the context above. "
|
21 |
+
"Cite sources using square brackets."
|
22 |
+
)
|