ak3ra committed on
Commit
183168e
1 Parent(s): b160b8b

added db file

Browse files
Files changed (4) hide show
  1. app.py +9 -1
  2. config.py +13 -4
  3. initialize_db.py +91 -0
  4. rag/rag_pipeline.py +5 -0
app.py CHANGED
@@ -1,8 +1,16 @@
1
  import gradio as gr
 
2
  from database.vaccine_coverage_db import VaccineCoverageDB
3
  from rag.rag_pipeline import RAGPipeline
4
  from utils.helpers import process_response
5
  from config import DB_PATH, METADATA_FILE, PDF_DIR
 
 
 
 
 
 
 
6
 
7
  # Initialize database and RAG pipeline
8
  db = VaccineCoverageDB(DB_PATH)
@@ -23,7 +31,7 @@ def save_pdf(item_key):
23
  attachments = db.get_attachments_for_item(item_key)
24
  if attachments:
25
  attachment_key = attachments[0]["key"]
26
- output_path = f"{attachment_key}.pdf"
27
  if db.save_pdf_to_file(attachment_key, output_path):
28
  return f"PDF saved successfully to {output_path}"
29
  return "Failed to save PDF or no attachments found"
 
1
  import gradio as gr
2
+ import os
3
  from database.vaccine_coverage_db import VaccineCoverageDB
4
  from rag.rag_pipeline import RAGPipeline
5
  from utils.helpers import process_response
6
  from config import DB_PATH, METADATA_FILE, PDF_DIR
7
+ from initialize_db import initialize_database, populate_database
8
+
9
+ # Initialize database if it doesn't exist
10
+ if not os.path.exists(DB_PATH):
11
+ print("Database not found. Initializing...")
12
+ initialize_database()
13
+ populate_database()
14
 
15
  # Initialize database and RAG pipeline
16
  db = VaccineCoverageDB(DB_PATH)
 
31
  attachments = db.get_attachments_for_item(item_key)
32
  if attachments:
33
  attachment_key = attachments[0]["key"]
34
+ output_path = os.path.join(PDF_DIR, f"{attachment_key}.pdf")
35
  if db.save_pdf_to_file(attachment_key, output_path):
36
  return f"PDF saved successfully to {output_path}"
37
  return "Failed to save PDF or no attachments found"
config.py CHANGED
@@ -1,11 +1,20 @@
1
  import os
2
 
 
 
 
3
  # Database configuration
4
- DB_PATH = 'vaccine_coverage_study.db'
 
5
 
6
  # RAG Pipeline configuration
7
- METADATA_FILE = os.path.join('data', 'metadata_map.json')
8
- PDF_DIR = os.path.join('data', 'pdfs')
 
 
 
 
 
9
 
10
  # OpenAI configuration
11
- OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
 
1
  import os
2
 
3
+ # Base directory
4
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
5
+
6
  # Database configuration
7
+ DB_NAME = "vaccine_coverage_study.db"
8
+ DB_PATH = os.path.join(BASE_DIR, DB_NAME)
9
 
10
  # RAG Pipeline configuration
11
+ DATA_DIR = os.path.join(BASE_DIR, "data")
12
+ METADATA_FILE = os.path.join(DATA_DIR, "metadata_map.json")
13
+ PDF_DIR = os.path.join(DATA_DIR, "pdfs")
14
+
15
+ # Create directories if they don't exist
16
+ os.makedirs(DATA_DIR, exist_ok=True)
17
+ os.makedirs(PDF_DIR, exist_ok=True)
18
 
19
  # OpenAI configuration
20
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
initialize_db.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ import json
3
+ import os
4
+ from config import DB_PATH, METADATA_FILE, PDF_DIR
5
+
6
+
7
def initialize_database():
    """Create the ``items`` and ``attachments`` tables if they do not exist.

    Connects to the SQLite database at ``DB_PATH`` and issues two
    ``CREATE TABLE IF NOT EXISTS`` statements, so calling this function on
    an already-initialized database leaves existing tables in place.

    The connection is always closed, even when a statement fails, so a
    failed initialization does not leave an open handle on the file.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()

        # One row per item, keyed by the item key.
        cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS items (
                key TEXT PRIMARY KEY,
                title TEXT,
                abstract TEXT,
                authors TEXT,
                year INTEGER,
                doi TEXT
            )
            """
        )

        # Attachment content stored inline as a BLOB, linked to its parent item.
        cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS attachments (
                key TEXT PRIMARY KEY,
                parent_key TEXT,
                content BLOB,
                FOREIGN KEY (parent_key) REFERENCES items (key)
            )
            """
        )

        conn.commit()
    finally:
        conn.close()
38
+
39
+
40
def populate_database():
    """Load item metadata and PDF attachments into the SQLite database.

    Reads the JSON mapping at ``METADATA_FILE`` (item key -> record) and
    upserts one ``items`` row per entry. When a record has a ``pdf_path``,
    the file with the same base name inside ``PDF_DIR`` is read and stored
    as a BLOB in ``attachments``, keyed by that base name and linked to the
    item.

    If the metadata file is missing, a message is printed and the function
    returns without opening the database. A PDF that cannot be found on
    disk is reported and skipped; the item row is still written.

    All rows are committed together at the end; if any record fails, the
    connection is closed without committing.

    Raises:
        KeyError: if a record lacks ``metadata`` or one of the fields
            ``title``, ``abstract``, ``authors``, ``year``, ``doi``.
    """
    if not os.path.exists(METADATA_FILE):
        print(f"Metadata file not found: {METADATA_FILE}")
        return

    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(METADATA_FILE, "r", encoding="utf-8") as f:
        metadata_map = json.load(f)

    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()

        for item_key, item_data in metadata_map.items():
            # Distinct name: do not rebind the mapping being iterated.
            item_meta = item_data["metadata"]
            cursor.execute(
                """
                INSERT OR REPLACE INTO items (key, title, abstract, authors, year, doi)
                VALUES (?, ?, ?, ?, ?, ?)
                """,
                (
                    item_key,
                    item_meta["title"],
                    item_meta["abstract"],
                    item_meta["authors"],
                    item_meta["year"],
                    item_meta["doi"],
                ),
            )

            pdf_path = item_data.get("pdf_path")
            if not pdf_path:
                continue

            # Only the file name is trusted; PDFs are looked up in PDF_DIR.
            pdf_name = os.path.basename(pdf_path)
            full_pdf_path = os.path.join(PDF_DIR, pdf_name)
            if not os.path.exists(full_pdf_path):
                print(f"PDF file not found: {full_pdf_path}")
                continue

            with open(full_pdf_path, "rb") as pdf_file:
                pdf_content = pdf_file.read()
            cursor.execute(
                """
                INSERT OR REPLACE INTO attachments (key, parent_key, content)
                VALUES (?, ?, ?)
                """,
                (pdf_name, item_key, pdf_content),
            )

        conn.commit()
    finally:
        conn.close()
86
+
87
+
88
+ if __name__ == "__main__":
89
+ initialize_database()
90
+ populate_database()
91
+ print("Database initialized and populated.")
rag/rag_pipeline.py CHANGED
@@ -25,6 +25,11 @@ class RAGPipeline:
25
  self.build_index()
26
 
27
  def load_documents(self):
 
 
 
 
 
28
  with open(self.metadata_file, "r") as f:
29
  self.metadata = json.load(f)
30
 
 
25
  self.build_index()
26
 
27
  def load_documents(self):
28
+ if not os.path.exists(self.metadata_file):
29
+ print(f"Metadata file not found: {self.metadata_file}")
30
+ self.documents = []
31
+ return
32
+
33
  with open(self.metadata_file, "r") as f:
34
  self.metadata = json.load(f)
35