Spaces:
Running
Running
added db file
Browse files- app.py +9 -1
- config.py +13 -4
- initialize_db.py +91 -0
- rag/rag_pipeline.py +5 -0
app.py
CHANGED
@@ -1,8 +1,16 @@
|
|
1 |
import gradio as gr
|
|
|
2 |
from database.vaccine_coverage_db import VaccineCoverageDB
|
3 |
from rag.rag_pipeline import RAGPipeline
|
4 |
from utils.helpers import process_response
|
5 |
from config import DB_PATH, METADATA_FILE, PDF_DIR
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
# Initialize database and RAG pipeline
|
8 |
db = VaccineCoverageDB(DB_PATH)
|
@@ -23,7 +31,7 @@ def save_pdf(item_key):
|
|
23 |
attachments = db.get_attachments_for_item(item_key)
|
24 |
if attachments:
|
25 |
attachment_key = attachments[0]["key"]
|
26 |
-
output_path = f"{attachment_key}.pdf"
|
27 |
if db.save_pdf_to_file(attachment_key, output_path):
|
28 |
return f"PDF saved successfully to {output_path}"
|
29 |
return "Failed to save PDF or no attachments found"
|
|
|
1 |
import gradio as gr
|
2 |
+
import os
|
3 |
from database.vaccine_coverage_db import VaccineCoverageDB
|
4 |
from rag.rag_pipeline import RAGPipeline
|
5 |
from utils.helpers import process_response
|
6 |
from config import DB_PATH, METADATA_FILE, PDF_DIR
|
7 |
+
from initialize_db import initialize_database, populate_database
|
8 |
+
|
9 |
+
# Initialize database if it doesn't exist
|
10 |
+
if not os.path.exists(DB_PATH):
|
11 |
+
print("Database not found. Initializing...")
|
12 |
+
initialize_database()
|
13 |
+
populate_database()
|
14 |
|
15 |
# Initialize database and RAG pipeline
|
16 |
db = VaccineCoverageDB(DB_PATH)
|
|
|
31 |
attachments = db.get_attachments_for_item(item_key)
|
32 |
if attachments:
|
33 |
attachment_key = attachments[0]["key"]
|
34 |
+
output_path = os.path.join(PDF_DIR, f"{attachment_key}.pdf")
|
35 |
if db.save_pdf_to_file(attachment_key, output_path):
|
36 |
return f"PDF saved successfully to {output_path}"
|
37 |
return "Failed to save PDF or no attachments found"
|
config.py
CHANGED
@@ -1,11 +1,20 @@
|
|
1 |
import os
|
2 |
|
|
|
|
|
|
|
3 |
# Database configuration
|
4 |
-
|
|
|
5 |
|
6 |
# RAG Pipeline configuration
|
7 |
-
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
# OpenAI configuration
|
11 |
-
OPENAI_API_KEY = os.getenv(
|
|
|
1 |
import os
|
2 |
|
3 |
+
# Project root
# Derived from this file's own location, so every path below is absolute and
# does not depend on the process's current working directory.
BASE_DIR = os.path.abspath(os.path.dirname(__file__))

# SQLite database location
DB_NAME = "vaccine_coverage_study.db"
DB_PATH = os.path.join(BASE_DIR, DB_NAME)

# Inputs read by the RAG pipeline
DATA_DIR = os.path.join(BASE_DIR, "data")
PDF_DIR = os.path.join(DATA_DIR, "pdfs")
METADATA_FILE = os.path.join(DATA_DIR, "metadata_map.json")

# Make sure the data folders exist as soon as this module is imported
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(PDF_DIR, exist_ok=True)

# OpenAI credentials; None when the environment variable is not set
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
|
initialize_db.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sqlite3
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
from config import DB_PATH, METADATA_FILE, PDF_DIR
|
5 |
+
|
6 |
+
|
7 |
+
def initialize_database(db_path=None):
    """Create the ``items`` and ``attachments`` tables if they are missing.

    Both statements use ``CREATE TABLE IF NOT EXISTS``, so running this
    against an already-initialized database leaves existing rows untouched.

    Args:
        db_path: Path of the SQLite file to create or open. When omitted,
            ``DB_PATH`` from ``config`` is used.

    Raises:
        sqlite3.Error: If the database cannot be opened or a statement fails.
    """
    from contextlib import closing

    if db_path is None:
        db_path = DB_PATH

    # closing() releases the handle even when a statement raises; the inner
    # ``with conn`` commits on success and rolls back on error.
    with closing(sqlite3.connect(db_path)) as conn:
        with conn:
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS items (
                    key TEXT PRIMARY KEY,
                    title TEXT,
                    abstract TEXT,
                    authors TEXT,
                    year INTEGER,
                    doi TEXT
                )
                """
            )

            # The foreign key is declared here, but SQLite only enforces it
            # on connections that run ``PRAGMA foreign_keys = ON``.
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS attachments (
                    key TEXT PRIMARY KEY,
                    parent_key TEXT,
                    content BLOB,
                    FOREIGN KEY (parent_key) REFERENCES items (key)
                )
                """
            )
|
38 |
+
|
39 |
+
|
40 |
+
def populate_database(metadata_file=None, pdf_dir=None, db_path=None):
    """Load item metadata and PDF attachments from disk into SQLite.

    Reads a JSON object mapping item keys to entries of the form
    ``{"metadata": {...}, "pdf_path": ...}``. Each entry is upserted into
    ``items``; when ``pdf_path`` is present and a file with the same base
    name exists in ``pdf_dir``, its bytes are upserted into ``attachments``
    keyed by that base name. All rows are written in a single transaction,
    so a failure part-way through leaves the database unchanged.

    Args:
        metadata_file: Path to the JSON metadata map. Defaults to
            ``METADATA_FILE`` from ``config``.
        pdf_dir: Directory searched for PDF files. Defaults to ``PDF_DIR``
            from ``config``.
        db_path: SQLite file to write to. Defaults to ``DB_PATH`` from
            ``config``. The tables must already exist.

    Returns:
        None. If the metadata file does not exist, a message is printed and
        nothing is written.

    Raises:
        KeyError: If an entry lacks ``metadata`` or one of the fields
            ``title``, ``abstract``, ``authors``, ``year``, ``doi``.
        sqlite3.Error: If a statement fails (for example, missing tables).
    """
    from contextlib import closing

    metadata_file = METADATA_FILE if metadata_file is None else metadata_file
    pdf_dir = PDF_DIR if pdf_dir is None else pdf_dir
    db_path = DB_PATH if db_path is None else db_path

    if not os.path.exists(metadata_file):
        print(f"Metadata file not found: {metadata_file}")
        return

    # JSON is UTF-8 by specification; do not rely on the locale default.
    with open(metadata_file, "r", encoding="utf-8") as f:
        metadata_map = json.load(f)

    # closing() releases the handle on any exit path; ``with conn`` commits
    # once at the end or rolls everything back if an entry fails.
    with closing(sqlite3.connect(db_path)) as conn:
        with conn:
            for item_key, item_data in metadata_map.items():
                # Separate name: do not rebind the dict being iterated.
                item_meta = item_data["metadata"]
                conn.execute(
                    """
                    INSERT OR REPLACE INTO items (key, title, abstract, authors, year, doi)
                    VALUES (?, ?, ?, ?, ?, ?)
                    """,
                    (
                        item_key,
                        item_meta["title"],
                        item_meta["abstract"],
                        item_meta["authors"],
                        item_meta["year"],
                        item_meta["doi"],
                    ),
                )

                pdf_path = item_data.get("pdf_path")
                if not pdf_path:
                    continue

                # Only the base name is used: PDFs are looked up in pdf_dir
                # regardless of the directory recorded in the metadata.
                pdf_name = os.path.basename(pdf_path)
                full_pdf_path = os.path.join(pdf_dir, pdf_name)
                if not os.path.exists(full_pdf_path):
                    print(f"PDF file not found: {full_pdf_path}")
                    continue

                with open(full_pdf_path, "rb") as pdf_file:
                    pdf_content = pdf_file.read()

                # NOTE(review): the attachment key keeps the file extension;
                # confirm consumers of this key do not append ".pdf" again.
                conn.execute(
                    """
                    INSERT OR REPLACE INTO attachments (key, parent_key, content)
                    VALUES (?, ?, ?)
                    """,
                    (pdf_name, item_key, pdf_content),
                )
|
86 |
+
|
87 |
+
|
88 |
+
# Script entry point: create the schema, then load the data. The guard keeps
# these calls from running when the module is merely imported.
if __name__ == "__main__":
    initialize_database()
    populate_database()
    # NOTE: populate_database() returns silently when the metadata file is
    # missing, so this message is printed in that case as well.
    print("Database initialized and populated.")
|
rag/rag_pipeline.py
CHANGED
@@ -25,6 +25,11 @@ class RAGPipeline:
|
|
25 |
self.build_index()
|
26 |
|
27 |
def load_documents(self):
|
|
|
|
|
|
|
|
|
|
|
28 |
with open(self.metadata_file, "r") as f:
|
29 |
self.metadata = json.load(f)
|
30 |
|
|
|
25 |
self.build_index()
|
26 |
|
27 |
def load_documents(self):
|
28 |
+
if not os.path.exists(self.metadata_file):
|
29 |
+
print(f"Metadata file not found: {self.metadata_file}")
|
30 |
+
self.documents = []
|
31 |
+
return
|
32 |
+
|
33 |
with open(self.metadata_file, "r") as f:
|
34 |
self.metadata = json.load(f)
|
35 |
|