Spaces:

Sunbird
/

acres

Running

App Files Community

ak3ra committed on Sep 4

Commit

cfb1a62

•

1 Parent(s): b117341

some mods

Browse files

Files changed (7) hide show

app.py +50 -40
config.py +6 -17
database/__init__.py +0 -0
database/vaccine_coverage_db.py +0 -3
initialize_db.py +0 -3
rag/rag_pipeline.py +7 -27
utils/prompts.py +22 -0

app.py CHANGED Viewed

@@ -1,62 +1,72 @@
 import gradio as gr
-import os
-from database.vaccine_coverage_db import VaccineCoverageDB
 from rag.rag_pipeline import RAGPipeline
-from utils.helpers import process_response
-from config import DB_PATH, METADATA_FILE, PDF_DIR
-from initialize_db import initialize_database, populate_database
-# Initialize database if it doesn't exist
-if not os.path.exists(DB_PATH):
-    print("Database not found. Initializing...")
-    initialize_database()
-    populate_database()
-# Initialize database and RAG pipeline
-db = VaccineCoverageDB(DB_PATH)
-rag = RAGPipeline(METADATA_FILE, PDF_DIR, use_semantic_splitter=True)
-def query_rag(question, prompt_type):
     if prompt_type == "Highlight":
-        response = rag.query(question, prompt_type="highlight")
     else:
-        response = rag.query(question, prompt_type="evidence_based")
-    processed = process_response(response)
-    return processed["markdown"]
-def save_pdf(item_key):
-    attachments = db.get_attachments_for_item(item_key)
-    if attachments:
-        attachment_key = attachments[0]["key"]
-        output_path = os.path.join(PDF_DIR, f"{attachment_key}.pdf")
-        if db.save_pdf_to_file(attachment_key, output_path):
-            return f"PDF saved successfully to {output_path}"
-    return "Failed to save PDF or no attachments found"
-# Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# Vaccine Coverage Study RAG System")
-    with gr.Tab("Query"):
-        question_input = gr.Textbox(label="Enter your question")
-        prompt_type = gr.Radio(["Highlight", "Evidence-based"], label="Prompt Type")
-        query_button = gr.Button("Submit Query")
-        output = gr.Markdown(label="Response")
-        query_button.click(
-            query_rag, inputs=[question_input, prompt_type], outputs=output
         )
-    with gr.Tab("Save PDF"):
-        item_key_input = gr.Textbox(label="Enter item key")
-        save_button = gr.Button("Save PDF")
-        save_output = gr.Textbox(label="Save Result")
-        save_button.click(save_pdf, inputs=item_key_input, outputs=save_output)
 if __name__ == "__main__":
     demo.launch()

+# app.py
 import gradio as gr
+import json
 from rag.rag_pipeline import RAGPipeline
+from utils.prompts import highlight_prompt, evidence_based_prompt
+from config import STUDY_FILES
+def load_rag_pipeline(study_name):
+    study_file = STUDY_FILES.get(study_name)
+    if study_file:
+        return RAGPipeline(study_file)
+    else:
+        raise ValueError(f"Invalid study name: {study_name}")
+def query_rag(study_name, question, prompt_type):
+    rag = load_rag_pipeline(study_name)
     if prompt_type == "Highlight":
+        prompt = highlight_prompt
+    elif prompt_type == "Evidence-based":
+        prompt = evidence_based_prompt
     else:
+        prompt = None
+    response = rag.query(question, prompt)
+    return response.response
+def get_study_info(study_name):
+    study_file = STUDY_FILES.get(study_name)
+    if study_file:
+        with open(study_file, "r") as f:
+            data = json.load(f)
+        return f"Number of documents: {len(data)}\nFirst document title: {data[0]['title']}"
+    else:
+        return "Invalid study name"
 with gr.Blocks() as demo:
+    gr.Markdown("# RAG Pipeline Demo")
+    with gr.Row():
+        study_dropdown = gr.Dropdown(
+            choices=list(STUDY_FILES.keys()), label="Select Study"
+        )
+        study_info = gr.Textbox(label="Study Information", interactive=False)
+    study_dropdown.change(get_study_info, inputs=[study_dropdown], outputs=[study_info])
+    with gr.Row():
+        question_input = gr.Textbox(label="Enter your question")
+        prompt_type = gr.Radio(
+            ["Default", "Highlight", "Evidence-based"],
+            label="Prompt Type",
+            value="Default",
         )
+    submit_button = gr.Button("Submit")
+    answer_output = gr.Textbox(label="Answer")
+    submit_button.click(
+        query_rag,
+        inputs=[study_dropdown, question_input, prompt_type],
+        outputs=[answer_output],
+    )
 if __name__ == "__main__":
     demo.launch()

config.py CHANGED Viewed

@@ -1,20 +1,9 @@
 import os
-# Base directory
-BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-# Database configuration
-DB_NAME = "vaccine_coverage_study.db"
-DB_PATH = os.path.join(BASE_DIR, DB_NAME)
-# RAG Pipeline configuration
-DATA_DIR = os.path.join(BASE_DIR, "data")
-METADATA_FILE = os.path.join(DATA_DIR, "metadata_map.json")
-PDF_DIR = os.path.join(DATA_DIR, "pdfs")
-# Create directories if they don't exist
-os.makedirs(DATA_DIR, exist_ok=True)
-os.makedirs(PDF_DIR, exist_ok=True)
-# OpenAI configuration
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

 import os
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+STUDY_FILES = {
+    "Vaccine Coverage": "data/vaccine_coverage_zotero_items.json",
+    "Ebola Virus": "data/ebola_virus_zotero_items.json",
+    "Gene Xpert": "data/gene_xpert_zotero_items.json",
+}

database/__init__.py DELETED Viewed

File without changes

database/vaccine_coverage_db.py DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:42a0645cdd38f2d7ede525768eb21a4cbe08b4d86959cb4eb2349887f2bcf70e
-size 1774

initialize_db.py DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:08030c4783a86d9a9afb9437b102dde959405b6b2857725eec02b6d9c2699e97
-size 2346

rag/rag_pipeline.py CHANGED Viewed

@@ -1,24 +1,13 @@
 import json
-import os
-from typing import Dict, Any
-from llama_index.core import (
-    VectorStoreIndex,
-    Document,
-    SentenceWindowNodeParser,
-)
-from llama_index.core.node_parser import (
-    SentenceSplitter,
-)
 from llama_index.core import PromptTemplate
 class RAGPipeline:
-    def __init__(
-        self,
-        study_json,
-        use_semantic_splitter=False,
-    ):
         self.study_json = study_json
         self.index = None
         self.use_semantic_splitter = use_semantic_splitter
@@ -34,10 +23,7 @@ class RAGPipeline:
         for index, doc_data in enumerate(self.data):
             doc_content = (
                 f"Title: {doc_data['title']}\n"
-                f"Abstract: {doc_data['abstract']}\n"
                 f"Authors: {', '.join(doc_data['authors'])}\n"
-                f"Year: {doc_data['year']}\n"
-                f"DOI: {doc_data['doi']}\n"
                 f"Full Text: {doc_data['full_text']}"
             )
@@ -50,11 +36,7 @@ class RAGPipeline:
             }
             self.documents.append(
-                Document(
-                    text=doc_content,
-                    id_=f"doc_{index}",
-                    metadata=metadata,
-                )
             )
     def build_index(self):
@@ -71,7 +53,6 @@ class RAGPipeline:
         )
         nodes = node_parser.get_nodes_from_documents(self.documents)
         self.index = VectorStoreIndex(nodes)
     def query(self, question, prompt_template=None):
@@ -89,8 +70,7 @@ class RAGPipeline:
             )
         query_engine = self.index.as_query_engine(
-            text_qa_template=prompt_template,
-            similarity_top_k=5,
         )
         response = query_engine.query(question)

+# rag/rag_pipeline.py
 import json
+from llama_index.core import Document, VectorStoreIndex
+from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter
 from llama_index.core import PromptTemplate
 class RAGPipeline:
+    def __init__(self, study_json, use_semantic_splitter=False):
         self.study_json = study_json
         self.index = None
         self.use_semantic_splitter = use_semantic_splitter
         for index, doc_data in enumerate(self.data):
             doc_content = (
                 f"Title: {doc_data['title']}\n"
                 f"Authors: {', '.join(doc_data['authors'])}\n"
                 f"Full Text: {doc_data['full_text']}"
             )
             }
             self.documents.append(
+                Document(text=doc_content, id_=f"doc_{index}", metadata=metadata)
             )
     def build_index(self):
         )
         nodes = node_parser.get_nodes_from_documents(self.documents)
         self.index = VectorStoreIndex(nodes)
     def query(self, question, prompt_template=None):
             )
         query_engine = self.index.as_query_engine(
+            text_qa_template=prompt_template, similarity_top_k=5
         )
         response = query_engine.query(question)

utils/prompts.py ADDED Viewed

@@ -0,0 +1,22 @@

+from llama_index.core import PromptTemplate
+highlight_prompt = PromptTemplate(
+    "Context information is below.\n"
+    "---------------------\n"
+    "{context_str}\n"
+    "---------------------\n"
+    "Given this information, please answer the question: {query_str}\n"
+    "Include all relevant information from the provided context. "
+    "Highlight key information by enclosing it in **asterisks**. "
+    "When quoting specific information, please use square brackets to indicate the source, e.g. [1], [2], etc."
+)
+evidence_based_prompt = PromptTemplate(
+    "Context information is below.\n"
+    "---------------------\n"
+    "{context_str}\n"
+    "---------------------\n"
+    "Given this information, please answer the question: {query_str}\n"
+    "Provide an answer to the question using evidence from the context above. "
+    "Cite sources using square brackets."
+)