ak3ra committed on
Commit
1e89423
1 Parent(s): 1a429c2

Update requirements.txt with additional dependencies

Browse files
Files changed (5) hide show
  1. app.py +199 -93
  2. config/pdf_config.yaml +0 -0
  3. interface.py +46 -0
  4. requirements.txt +3 -1
  5. utils/pdf_processor.py +93 -0
app.py CHANGED
@@ -6,7 +6,7 @@ import io
6
  import json
7
  import logging
8
  import os
9
- from typing import Tuple
10
 
11
  import gradio as gr
12
  import openai
@@ -23,6 +23,9 @@ from utils.helpers import (
23
  from utils.prompts import highlight_prompt, evidence_based_prompt
24
  from utils.zotero_manager import ZoteroManager
25
 
 
 
 
26
  # Configure logging
27
  logging.basicConfig(level=logging.INFO)
28
  logger = logging.getLogger(__name__)
@@ -225,120 +228,223 @@ def download_as_csv(markdown_content):
225
  return temp_path
226
 
227
 
228
- def create_gr_interface() -> gr.Blocks:
229
- """
230
- Create and configure the Gradio interface for the RAG platform.
231
 
232
- This function sets up the entire user interface, including:
233
- - Chat interface with message input and display
234
- - Study selection dropdown
235
- - Sample and follow-up question buttons
236
- - Prompt type selection
237
- - Event handlers for user interactions
238
 
239
- Returns:
240
- gr.Blocks: The configured Gradio interface ready for launching.
241
- """
 
 
 
242
 
243
- with gr.Blocks() as demo:
244
- gr.Markdown("# ACRES RAG Platform")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
- with gr.Row():
247
- with gr.Column(scale=1):
248
- gr.Markdown("### Zotero Credentials")
249
- zotero_library_id = gr.Textbox(
250
- label="Zotero Library ID",
251
- type="password",
252
- placeholder="Enter Your Zotero Library ID here...",
253
- )
254
- zotero_api_access_key = gr.Textbox(
255
- label="Zotero API Access Key",
256
- type="password",
257
- placeholder="Enter Your Zotero API Access Key...",
258
- )
259
- process_zotero_btn = gr.Button("Process your Zotero Library")
260
- zotero_output = gr.Markdown(label="Zotero")
261
 
262
- gr.Markdown("### Study Information")
 
 
 
263
 
264
- # Query ChromaDB for all document IDs in the "study_files_collection" collection
265
- collection = chromadb_client.get_or_create_collection(
266
- "study_files_collection"
267
- )
268
- # Retrieve all documents by querying with an empty string and specifying a high n_results
269
- all_documents = collection.query(query_texts=[""], n_results=1000)
270
- logging.info(f"all_documents: =========> {all_documents}")
271
- # Extract document IDs as study names
272
- document_ids = all_documents.get("ids")
273
- study_choices = [
274
- doc_id for doc_id in document_ids[0] if document_ids
275
- ] # Get list of document IDs
276
- logging.info(f"study_choices: ======> {study_choices}")
277
-
278
- # Update the Dropdown with choices from ChromaDB
279
- study_dropdown = gr.Dropdown(
280
- choices=study_choices,
281
- label="Select Study",
282
- value=(
283
- study_choices[0] if study_choices else None
284
- ), # Set first choice as default, if available
285
- )
286
 
287
- study_info = gr.Markdown(label="Study Details")
288
 
289
- gr.Markdown("### Settings")
290
- prompt_type = gr.Radio(
291
- ["Default", "Highlight", "Evidence-based"],
292
- label="Prompt Type",
293
- value="Default",
294
- )
295
- # clear = gr.Button("Clear Chat")
296
 
297
- with gr.Column(scale=3):
298
- gr.Markdown("### Study Variables")
299
- with gr.Row():
300
- study_variables = gr.Textbox(
301
- show_label=False,
302
- placeholder="Type your variables separated by commas e.g (Study ID, Study Title, Authors etc)",
303
- scale=4,
304
- lines=1,
305
- autofocus=True,
306
- )
307
- submit_btn = gr.Button("Submit", scale=1)
308
- answer_output = gr.Markdown(label="Answer")
309
- # button to download_csv
310
- download_btn = gr.DownloadButton(
311
- "Download as CSV",
312
- variant="primary",
313
- size="sm",
314
- scale=1,
315
- visible=False,
316
- )
317
 
318
- study_dropdown.change(
319
- fn=get_study_info,
320
- inputs=study_dropdown,
321
- outputs=[study_info],
322
- )
323
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
  process_zotero_btn.click(
325
  process_zotero_library_items,
326
  inputs=[zotero_library_id, zotero_api_access_key],
327
  outputs=[zotero_output],
328
- queue=False,
329
  )
 
 
 
 
 
330
  submit_btn.click(
331
  process_multi_input,
332
  inputs=[study_variables, study_dropdown, prompt_type],
333
  outputs=[answer_output, download_btn],
334
- queue=False,
335
  )
 
336
  download_btn.click(
337
- fn=download_as_csv,
338
- inputs=[answer_output],
339
- outputs=[download_btn],
340
- ).then(
341
- fn=cleanup_temp_files, inputs=None, outputs=None # Clean up after download
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
  )
343
 
344
  return demo
 
6
  import json
7
  import logging
8
  import os
9
+ from typing import Tuple, List
10
 
11
  import gradio as gr
12
  import openai
 
23
  from utils.prompts import highlight_prompt, evidence_based_prompt
24
  from utils.zotero_manager import ZoteroManager
25
 
26
+ from interface import create_chat_interface
27
+ from utils.pdf_processor import PDFProcessor
28
+
29
  # Configure logging
30
  logging.basicConfig(level=logging.INFO)
31
  logger = logging.getLogger(__name__)
 
228
  return temp_path
229
 
230
 
231
+ # PDF Support
 
 
232
 
 
 
 
 
 
 
233
 
234
def process_pdf_uploads(
    files: List[str], collection_name: str
) -> Tuple[str, dict]:
    """Process uploaded PDF files and register them as a study collection.

    Each upload is copied into the processor's upload directory, the copies
    are converted to a JSON document file by ``PDFProcessor.process_pdfs``,
    and the result is appended to ``study_files.json`` and loaded into the
    ``study_files_collection`` ChromaDB collection.

    Args:
        files: Uploaded files. Each entry may be a filesystem path string or
            an object exposing the path through a ``name`` attribute.
        collection_name: User-supplied label, embedded in the collection ID.

    Returns:
        A ``(status_message, update)`` pair. On success the update carries the
        path of the generated JSON file; otherwise it is an empty update.
    """
    # Imported locally so the function does not depend on how (or whether)
    # the module header imports these names.
    import shutil
    from datetime import datetime

    if not files:
        return "Please upload PDF files", gr.update()

    file_paths = []
    try:
        processor = PDFProcessor()

        # Copy uploads into the working directory. Only the base name is kept:
        # joining an absolute upload path onto upload_dir would resolve back
        # to the source file, and opening that for writing would truncate it.
        for file in files:
            source_path = file if isinstance(file, str) else file.name
            temp_path = os.path.join(
                processor.upload_dir, os.path.basename(source_path)
            )
            shutil.copyfile(source_path, temp_path)
            file_paths.append(temp_path)

        # Process PDFs
        output_path = processor.process_pdfs(file_paths, collection_name)

        # Add to study files and ChromaDB
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        collection_id = f"PDF-{timestamp}-{collection_name}"
        append_to_study_files("study_files.json", collection_id, output_path)
        add_study_files_to_chromadb("study_files.json", "study_files_collection")

        return (
            f"Successfully processed {len(files)} PDF files into collection: {collection_id}",
            gr.update(value=output_path),
        )

    except Exception as e:
        # UI boundary: report the failure to the user, but keep the traceback
        # in the logs instead of discarding it.
        logger.exception("Error processing PDF files")
        return f"Error processing PDF files: {str(e)}", gr.update()

    finally:
        # Remove the working copies on success and on failure alike.
        for path in file_paths:
            try:
                os.remove(path)
            except OSError as e:
                logger.warning(f"Failed to remove temporary file {path}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
 
 
276
 
277
def chat_response(
    message: str, history: List[Tuple[str, str]], study_name: str
) -> Tuple[List[Tuple[str, str]], None]:
    """Generate a chat response and return the extended history.

    Args:
        message: The user's message. Blank or missing messages are ignored.
        history: Prior ``(user_message, response)`` pairs.
        study_name: Study identifier forwarded to ``chat_function``.

    Returns:
        A ``(history, None)`` pair. ``history`` is returned unchanged when the
        message is blank; otherwise it is a new list ending with the new
        ``(message, response)`` pair. The second element is always ``None``.
    """
    # Treat None the same as an empty/whitespace-only message.
    if not message or not message.strip():
        return history, None

    response = chat_function(message, study_name, "Default")
    # Build a new list rather than appending in place, so the caller's
    # history object is never modified behind its back.
    return [*(history or []), (message, response)], None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
 
 
 
 
 
 
288
 
289
def create_gr_interface() -> gr.Blocks:
    """Create and configure the Gradio interface for the RAG platform.

    The interface has two tabs:

    * "Study Analysis": Zotero credentials, study selection populated from
      the ``study_files_collection`` ChromaDB collection, prompt-type choice,
      variable extraction and CSV download.
    * "PDF Chat": a chatbot with PDF upload and a collection-name field.

    Returns:
        gr.Blocks: The configured interface, ready for launching.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# ACRES RAG Platform")

        with gr.Tabs():
            # Tab 1: Original Study Analysis Interface
            with gr.Tab("Study Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Zotero Credentials")
                        zotero_library_id = gr.Textbox(
                            label="Zotero Library ID",
                            type="password",
                            placeholder="Enter Your Zotero Library ID here...",
                        )
                        zotero_api_access_key = gr.Textbox(
                            label="Zotero API Access Key",
                            type="password",
                            placeholder="Enter Your Zotero API Access Key...",
                        )
                        process_zotero_btn = gr.Button("Process your Zotero Library")
                        zotero_output = gr.Markdown(label="Zotero")

                        gr.Markdown("### Study Information")
                        collection = chromadb_client.get_or_create_collection(
                            "study_files_collection"
                        )
                        all_documents = collection.query(
                            query_texts=[""], n_results=1000
                        )
                        # The result holds one ID list per query text. Check
                        # for a missing or empty result before indexing.
                        id_batches = (
                            all_documents.get("ids") if all_documents else None
                        )
                        study_choices = list(id_batches[0]) if id_batches else []

                        study_dropdown = gr.Dropdown(
                            choices=study_choices,
                            label="Select Study",
                            value=(study_choices[0] if study_choices else None),
                        )
                        study_info = gr.Markdown(label="Study Details")
                        prompt_type = gr.Radio(
                            ["Default", "Highlight", "Evidence-based"],
                            label="Prompt Type",
                            value="Default",
                        )

                    with gr.Column(scale=3):
                        gr.Markdown("### Study Variables")
                        with gr.Row():
                            study_variables = gr.Textbox(
                                show_label=False,
                                placeholder="Type your variables separated by commas e.g (Study ID, Study Title, Authors etc)",
                                scale=4,
                                lines=1,
                                autofocus=True,
                            )
                            submit_btn = gr.Button("Submit", scale=1)
                        answer_output = gr.Markdown(label="Answer")
                        download_btn = gr.DownloadButton(
                            "Download as CSV",
                            variant="primary",
                            size="sm",
                            scale=1,
                            visible=False,
                        )

            # Tab 2: PDF Chat Interface
            with gr.Tab("PDF Chat"):
                with gr.Row():
                    # Left column: Chat and Input
                    with gr.Column(scale=7):
                        chat_history = gr.Chatbot(
                            value=[], height=600, show_label=False
                        )
                        with gr.Row():
                            query_input = gr.Textbox(
                                show_label=False,
                                placeholder="Ask a question about your PDFs...",
                                scale=8,
                            )
                            chat_submit_btn = gr.Button(
                                "Send", scale=2, variant="primary"
                            )

                    # Right column: PDF Preview and Upload
                    with gr.Column(scale=3):
                        # NOTE(review): nothing in this function updates the
                        # preview image yet.
                        pdf_preview = gr.Image(label="Source Page", height=600)
                        with gr.Row():
                            pdf_files = gr.File(
                                file_count="multiple",
                                file_types=[".pdf"],
                                label="Upload PDFs",
                            )
                        with gr.Row():
                            collection_name = gr.Textbox(
                                label="Collection Name",
                                placeholder="Name this PDF collection...",
                            )
                        pdf_status = gr.Markdown()

        # Event handlers for Study Analysis tab
        process_zotero_btn.click(
            process_zotero_library_items,
            inputs=[zotero_library_id, zotero_api_access_key],
            outputs=[zotero_output],
        )

        study_dropdown.change(
            get_study_info, inputs=[study_dropdown], outputs=[study_info]
        )

        submit_btn.click(
            process_multi_input,
            inputs=[study_variables, study_dropdown, prompt_type],
            outputs=[answer_output, download_btn],
        )

        download_btn.click(
            fn=download_as_csv, inputs=[answer_output], outputs=[download_btn]
        ).then(fn=cleanup_temp_files, inputs=None, outputs=None)

        # Event handlers for PDF Chat tab
        def add_message(history, message):
            """Add user message to chat history and clear the input box."""
            if not message or not message.strip():
                raise gr.Error("Please enter a message")
            history = (history or []) + [(message, None)]
            return history, ""

        def generate_chat_response(history, collection):
            """Generate response for the last message in history."""
            if not history:
                return history

            last_message = history[-1][0]
            # NOTE(review): `collection` is the raw text of the Collection
            # Name box, while process_pdf_uploads registers the collection as
            # "PDF-<timestamp>-<name>" - confirm chat_function can resolve it.
            response = chat_function(last_message, collection, "Default")

            # Replace the pending (message, None) pair with the answered one.
            return [*history[:-1], (last_message, response)]

        def handle_pdf_upload(files, name):
            """Run the PDF pipeline and surface only its status message."""
            # process_pdf_uploads returns a (status, update) pair, but only
            # pdf_status is wired as an output, so keep just the status text.
            status, _ = process_pdf_uploads(files, name)
            return status

        pdf_files.upload(
            handle_pdf_upload,
            inputs=[pdf_files, collection_name],
            outputs=[pdf_status],
        )

        # Append the user message first; only generate a reply if that succeeded.
        chat_submit_btn.click(
            add_message,
            inputs=[chat_history, query_input],
            outputs=[chat_history, query_input],
        ).success(
            generate_chat_response,
            inputs=[chat_history, collection_name],
            outputs=[chat_history],
        )

    return demo
config/pdf_config.yaml ADDED
File without changes
interface.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio interface module for ACRES RAG Platform.
3
+ Defines the UI components and layout.
4
+ """
5
+
6
+ import gradio as gr
7
+
8
+
9
def create_chat_interface() -> tuple:
    """Create the chat interface and return it together with its components.

    Layout: a chatbot beside a source-page image, then a query box with a
    send button, then a PDF upload field beside a collection-name box.

    Returns:
        A 7-tuple ``(chat_interface, chat_history, pdf_preview, query_input,
        submit_btn, pdf_files, collection_name)``: the ``gr.Blocks`` container
        followed by the components created inside it, so the caller can attach
        event handlers to them.
    """
    with gr.Blocks() as chat_interface:
        with gr.Row():
            with gr.Column(scale=7):
                chat_history = gr.Chatbot(
                    value=[], elem_id="chatbot", height=600, show_label=False
                )
            with gr.Column(scale=3):
                pdf_preview = gr.Image(label="Source Page", height=600)

        with gr.Row():
            with gr.Column(scale=8):
                query_input = gr.Textbox(
                    show_label=False,
                    placeholder="Ask a question about your documents...",
                    container=False,
                )
            with gr.Column(scale=2):
                submit_btn = gr.Button("Send", variant="primary")

        with gr.Row():
            pdf_files = gr.File(
                file_count="multiple", file_types=[".pdf"], label="Upload PDF Files"
            )
            collection_name = gr.Textbox(
                label="Collection Name", placeholder="Name this collection of PDFs..."
            )

    return (
        chat_interface,
        chat_history,
        pdf_preview,
        query_input,
        submit_btn,
        pdf_files,
        collection_name,
    )
requirements.txt CHANGED
@@ -9,4 +9,6 @@ pandas
9
  pydantic
10
  python-dotenv
11
  pyzotero
12
- python-slugify
 
 
 
9
  pydantic
10
  python-dotenv
11
  pyzotero
12
+ python-slugify
13
+ PyMuPDF==1.23.8
14
+ Pillow==10.2.0
utils/pdf_processor.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PDF processing module for ACRES RAG Platform.
3
+ Handles PDF file processing, text extraction, and page rendering.
4
+ """
5
+
6
+ import os
7
+ import fitz
8
+ import logging
9
+ from typing import Dict, List, Optional
10
+ from datetime import datetime
11
+ from slugify import slugify
12
+ import json
13
+ from PIL import Image
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class PDFProcessor:
    """Extract text from PDFs, store it as JSON, and render pages as images."""

    def __init__(self, upload_dir: str = "data/uploads"):
        """Initialize PDFProcessor and make sure the upload directory exists.

        Args:
            upload_dir: Directory where uploaded PDFs are staged. Created if
                it does not exist.
        """
        self.upload_dir = upload_dir
        os.makedirs(upload_dir, exist_ok=True)
        self.current_page = 0

    def extract_text_from_pdf(self, file_path: str) -> Dict:
        """Extract text and metadata from a PDF file.

        Args:
            file_path: Path of the PDF to read.

        Returns:
            A dict with keys ``title``, ``authors``, ``date``, ``abstract``
            (the first 500 characters of the text), ``full_text``,
            ``source_file``, ``pages`` (page index -> page text) and
            ``page_count``.
        """
        # The context manager closes the document even if extraction fails.
        with fitz.open(file_path) as doc:
            page_texts = [page.get_text() for page in doc]
            # Metadata may be missing entirely; copy it so the rest of the
            # method works on a plain dict after the document is closed.
            metadata = dict(doc.metadata or {})
            page_count = len(doc)

        pages = dict(enumerate(page_texts))
        # Every page is followed by a newline, matching the per-page layout.
        text = "".join(f"{page_text}\n" for page_text in page_texts)

        # Individual metadata values may be None or empty; fall back safely.
        title = metadata.get("title") or os.path.basename(file_path)
        raw_authors = metadata.get("author") or ""
        authors = [name.strip() for name in raw_authors.split(";") if name.strip()]

        return {
            "title": title,
            "authors": authors,
            "date": metadata.get("creationDate") or "",
            "abstract": text[:500] + "..." if len(text) > 500 else text,
            "full_text": text,
            "source_file": file_path,
            "pages": pages,
            "page_count": page_count,
        }

    def process_pdfs(self, file_paths: List[str], collection_name: str) -> str:
        """Process multiple PDF files and store their content as one JSON file.

        Files that fail to process are logged and skipped.

        Args:
            file_paths: Paths of the PDFs to process.
            collection_name: Label used (slugified) in the output file name.

        Returns:
            Path of the written JSON file, under ``data/``.

        Raises:
            ValueError: If no document could be processed.
        """
        processed_docs = []

        for file_path in file_paths:
            try:
                processed_docs.append(self.extract_text_from_pdf(file_path))
            except Exception as e:
                # Best effort: one unreadable PDF must not abort the batch.
                logger.error("Error processing %s: %s", file_path, e)

        if not processed_docs:
            raise ValueError("No documents were successfully processed")

        # Save to JSON file
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_filename = f"{slugify(collection_name)}_{timestamp}_documents.json"
        output_path = f"data/{output_filename}"

        # The output directory is independent of upload_dir, so it is not
        # guaranteed to exist when a custom upload_dir is used.
        os.makedirs("data", exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(processed_docs, f, indent=2, ensure_ascii=False)

        return output_path

    def render_page(self, file_path: str, page_num: int) -> "Optional[Image.Image]":
        """Render a specific page from a PDF as an image.

        Args:
            file_path: Path of the PDF.
            page_num: Index of the page to render.

        Returns:
            The rendered page as an RGB image at 300 DPI, or ``None`` if the
            page could not be rendered (the error is logged).
        """
        try:
            # The context manager closes the document even if rendering fails.
            with fitz.open(file_path) as doc:
                # 72 points per inch is the PDF base resolution; scale to 300 DPI.
                zoom = 300 / 72
                pix = doc[page_num].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                return Image.frombytes(
                    "RGB", (pix.width, pix.height), pix.samples
                )
        except Exception as e:
            logger.error(
                "Error rendering page %s from %s: %s", page_num, file_path, e
            )
            return None