Spaces:
Running
Running
Update requirements.txt with additional dependencies
Browse files- app.py +199 -93
- config/pdf_config.yaml +0 -0
- interface.py +46 -0
- requirements.txt +3 -1
- utils/pdf_processor.py +93 -0
app.py
CHANGED
@@ -6,7 +6,7 @@ import io
|
|
6 |
import json
|
7 |
import logging
|
8 |
import os
|
9 |
-
from typing import Tuple
|
10 |
|
11 |
import gradio as gr
|
12 |
import openai
|
@@ -23,6 +23,9 @@ from utils.helpers import (
|
|
23 |
from utils.prompts import highlight_prompt, evidence_based_prompt
|
24 |
from utils.zotero_manager import ZoteroManager
|
25 |
|
|
|
|
|
|
|
26 |
# Configure logging
|
27 |
logging.basicConfig(level=logging.INFO)
|
28 |
logger = logging.getLogger(__name__)
|
@@ -225,120 +228,223 @@ def download_as_csv(markdown_content):
|
|
225 |
return temp_path
|
226 |
|
227 |
|
228 |
-
|
229 |
-
"""
|
230 |
-
Create and configure the Gradio interface for the RAG platform.
|
231 |
|
232 |
-
This function sets up the entire user interface, including:
|
233 |
-
- Chat interface with message input and display
|
234 |
-
- Study selection dropdown
|
235 |
-
- Sample and follow-up question buttons
|
236 |
-
- Prompt type selection
|
237 |
-
- Event handlers for user interactions
|
238 |
|
239 |
-
|
240 |
-
|
241 |
-
|
|
|
|
|
|
|
242 |
|
243 |
-
|
244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
245 |
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
placeholder="Enter Your Zotero Library ID here...",
|
253 |
-
)
|
254 |
-
zotero_api_access_key = gr.Textbox(
|
255 |
-
label="Zotero API Access Key",
|
256 |
-
type="password",
|
257 |
-
placeholder="Enter Your Zotero API Access Key...",
|
258 |
-
)
|
259 |
-
process_zotero_btn = gr.Button("Process your Zotero Library")
|
260 |
-
zotero_output = gr.Markdown(label="Zotero")
|
261 |
|
262 |
-
|
|
|
|
|
|
|
263 |
|
264 |
-
|
265 |
-
|
266 |
-
"study_files_collection"
|
267 |
-
)
|
268 |
-
# Retrieve all documents by querying with an empty string and specifying a high n_results
|
269 |
-
all_documents = collection.query(query_texts=[""], n_results=1000)
|
270 |
-
logging.info(f"all_documents: =========> {all_documents}")
|
271 |
-
# Extract document IDs as study names
|
272 |
-
document_ids = all_documents.get("ids")
|
273 |
-
study_choices = [
|
274 |
-
doc_id for doc_id in document_ids[0] if document_ids
|
275 |
-
] # Get list of document IDs
|
276 |
-
logging.info(f"study_choices: ======> {study_choices}")
|
277 |
-
|
278 |
-
# Update the Dropdown with choices from ChromaDB
|
279 |
-
study_dropdown = gr.Dropdown(
|
280 |
-
choices=study_choices,
|
281 |
-
label="Select Study",
|
282 |
-
value=(
|
283 |
-
study_choices[0] if study_choices else None
|
284 |
-
), # Set first choice as default, if available
|
285 |
-
)
|
286 |
|
287 |
-
study_info = gr.Markdown(label="Study Details")
|
288 |
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
# clear = gr.Button("Clear Chat")
|
296 |
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
study_variables = gr.Textbox(
|
301 |
-
show_label=False,
|
302 |
-
placeholder="Type your variables separated by commas e.g (Study ID, Study Title, Authors etc)",
|
303 |
-
scale=4,
|
304 |
-
lines=1,
|
305 |
-
autofocus=True,
|
306 |
-
)
|
307 |
-
submit_btn = gr.Button("Submit", scale=1)
|
308 |
-
answer_output = gr.Markdown(label="Answer")
|
309 |
-
# button to download_csv
|
310 |
-
download_btn = gr.DownloadButton(
|
311 |
-
"Download as CSV",
|
312 |
-
variant="primary",
|
313 |
-
size="sm",
|
314 |
-
scale=1,
|
315 |
-
visible=False,
|
316 |
-
)
|
317 |
|
318 |
-
study_dropdown.change(
|
319 |
-
fn=get_study_info,
|
320 |
-
inputs=study_dropdown,
|
321 |
-
outputs=[study_info],
|
322 |
-
)
|
323 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
324 |
process_zotero_btn.click(
|
325 |
process_zotero_library_items,
|
326 |
inputs=[zotero_library_id, zotero_api_access_key],
|
327 |
outputs=[zotero_output],
|
328 |
-
queue=False,
|
329 |
)
|
|
|
|
|
|
|
|
|
|
|
330 |
submit_btn.click(
|
331 |
process_multi_input,
|
332 |
inputs=[study_variables, study_dropdown, prompt_type],
|
333 |
outputs=[answer_output, download_btn],
|
334 |
-
queue=False,
|
335 |
)
|
|
|
336 |
download_btn.click(
|
337 |
-
fn=download_as_csv,
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
342 |
)
|
343 |
|
344 |
return demo
|
|
|
6 |
import json
|
7 |
import logging
|
8 |
import os
|
9 |
+
from typing import Tuple, List
|
10 |
|
11 |
import gradio as gr
|
12 |
import openai
|
|
|
23 |
from utils.prompts import highlight_prompt, evidence_based_prompt
|
24 |
from utils.zotero_manager import ZoteroManager
|
25 |
|
26 |
+
from interface import create_chat_interface
|
27 |
+
from utils.pdf_processor import PDFProcessor
|
28 |
+
|
29 |
# Configure logging
|
30 |
logging.basicConfig(level=logging.INFO)
|
31 |
logger = logging.getLogger(__name__)
|
|
|
228 |
return temp_path
|
229 |
|
230 |
|
231 |
+
# PDF Support
|
|
|
|
|
232 |
|
|
|
|
|
|
|
|
|
|
|
|
|
233 |
|
234 |
+
def process_pdf_uploads(
    files: List[str], collection_name: str
) -> Tuple[str, gr.update]:
    """Process uploaded PDF files and add them to the system.

    Each upload is copied into the processor's upload directory, the copies
    are converted to a JSON document file by ``PDFProcessor.process_pdfs``,
    and that file is registered in ``study_files.json`` and ChromaDB under an
    id of the form ``PDF-<timestamp>-<collection_name>``.

    Args:
        files: Uploaded files. Each entry may be a filesystem path string or
            an object exposing the path as ``.name``.
        collection_name: User-supplied label for this batch of PDFs.

    Returns:
        A ``(status_message, update)`` pair. On success the update carries the
        path of the generated JSON file; otherwise it is an empty update.
    """
    # Imported locally so this function does not depend on the module-level
    # import list providing these names.
    import shutil
    from datetime import datetime

    if not files:
        return "Please upload PDF files", gr.update()

    file_paths = []
    try:
        processor = PDFProcessor()

        # Stage the uploads inside the processor's upload directory.
        for file in files:
            source_path = file if isinstance(file, str) else file.name
            # Use only the base name: joining an absolute source path onto the
            # upload directory would resolve to the source path itself.
            temp_path = os.path.join(
                processor.upload_dir, os.path.basename(source_path)
            )
            if os.path.abspath(source_path) != os.path.abspath(temp_path):
                shutil.copyfile(source_path, temp_path)
            file_paths.append(temp_path)

        # Process PDFs
        output_path = processor.process_pdfs(file_paths, collection_name)

        # Add to study files and ChromaDB
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        collection_id = f"PDF-{timestamp}-{collection_name}"
        append_to_study_files("study_files.json", collection_id, output_path)
        add_study_files_to_chromadb("study_files.json", "study_files_collection")

        return (
            f"Successfully processed {len(files)} PDF files into collection: {collection_id}",
            gr.update(value=output_path),
        )

    except Exception as e:
        logger.exception("PDF upload processing failed")
        return f"Error processing PDF files: {str(e)}", gr.update()

    finally:
        # Cleanup temporary files on success and on failure alike.
        for path in file_paths:
            try:
                os.remove(path)
            except Exception as e:
                logger.warning(f"Failed to remove temporary file {path}: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
275 |
|
|
|
276 |
|
277 |
+
def chat_response(
    message: str, history: List[Tuple[str, str]], study_name: str
) -> Tuple[List[Tuple[str, str]], None]:
    """Generate a chat response and return the updated history.

    Args:
        message: The user's message. Empty, whitespace-only, or ``None``
            messages are ignored.
        history: Previous ``(user_message, response)`` pairs.
        study_name: Study identifier forwarded to ``chat_function``.

    Returns:
        ``(history, None)``. The second element is always ``None``.
    """
    # Nothing to answer: hand the history back untouched.
    if not message or not message.strip():
        return history, None

    response = chat_function(message, study_name, "Default")
    # Build a new list so a missing (None) history does not crash the append.
    updated_history = list(history or [])
    updated_history.append((message, response))
    return updated_history, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
287 |
|
|
|
|
|
|
|
|
|
|
|
288 |
|
289 |
+
def create_gr_interface() -> gr.Blocks:
    """Create and configure the Gradio interface for the RAG platform.

    The interface has two tabs:

    * "Study Analysis": Zotero credential inputs, a study dropdown populated
      from the ``study_files_collection`` ChromaDB collection, a prompt-type
      selector, a study-variables query box, and a CSV download button.
    * "PDF Chat": a chatbot with a message box, a page-preview image, a PDF
      upload widget, and a collection-name box.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks() as demo:
        gr.Markdown("# ACRES RAG Platform")

        with gr.Tabs():
            # Tab 1: Original Study Analysis Interface
            with gr.Tab("Study Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Zotero Credentials")
                        zotero_library_id = gr.Textbox(
                            label="Zotero Library ID",
                            type="password",
                            placeholder="Enter Your Zotero Library ID here...",
                        )
                        zotero_api_access_key = gr.Textbox(
                            label="Zotero API Access Key",
                            type="password",
                            placeholder="Enter Your Zotero API Access Key...",
                        )
                        process_zotero_btn = gr.Button("Process your Zotero Library")
                        zotero_output = gr.Markdown(label="Zotero")

                        gr.Markdown("### Study Information")
                        collection = chromadb_client.get_or_create_collection(
                            "study_files_collection"
                        )
                        all_documents = collection.query(
                            query_texts=[""], n_results=1000
                        )
                        # The ids are read from the first entry of "ids".
                        # Guard the lookup so a missing or empty result gives
                        # an empty dropdown instead of raising.
                        ids_per_query = (all_documents or {}).get("ids") or [[]]
                        study_choices = list(ids_per_query[0])

                        study_dropdown = gr.Dropdown(
                            choices=study_choices,
                            label="Select Study",
                            value=(study_choices[0] if study_choices else None),
                        )
                        study_info = gr.Markdown(label="Study Details")
                        prompt_type = gr.Radio(
                            ["Default", "Highlight", "Evidence-based"],
                            label="Prompt Type",
                            value="Default",
                        )

                    with gr.Column(scale=3):
                        gr.Markdown("### Study Variables")
                        with gr.Row():
                            study_variables = gr.Textbox(
                                show_label=False,
                                placeholder="Type your variables separated by commas e.g (Study ID, Study Title, Authors etc)",
                                scale=4,
                                lines=1,
                                autofocus=True,
                            )
                            submit_btn = gr.Button("Submit", scale=1)
                        answer_output = gr.Markdown(label="Answer")
                        download_btn = gr.DownloadButton(
                            "Download as CSV",
                            variant="primary",
                            size="sm",
                            scale=1,
                            visible=False,
                        )

            # Tab 2: PDF Chat Interface
            with gr.Tab("PDF Chat"):
                with gr.Row():
                    # Left column: Chat and Input
                    with gr.Column(scale=7):
                        chat_history = gr.Chatbot(
                            value=[], height=600, show_label=False
                        )
                        with gr.Row():
                            query_input = gr.Textbox(
                                show_label=False,
                                placeholder="Ask a question about your PDFs...",
                                scale=8,
                            )
                            chat_submit_btn = gr.Button(
                                "Send", scale=2, variant="primary"
                            )

                    # Right column: PDF Preview and Upload
                    with gr.Column(scale=3):
                        # No event handler in this function writes to the preview.
                        pdf_preview = gr.Image(label="Source Page", height=600)
                        with gr.Row():
                            pdf_files = gr.File(
                                file_count="multiple",
                                file_types=[".pdf"],
                                label="Upload PDFs",
                            )
                        with gr.Row():
                            collection_name = gr.Textbox(
                                label="Collection Name",
                                placeholder="Name this PDF collection...",
                            )
                        pdf_status = gr.Markdown()

        # Event handlers for Study Analysis tab
        process_zotero_btn.click(
            process_zotero_library_items,
            inputs=[zotero_library_id, zotero_api_access_key],
            outputs=[zotero_output],
        )

        study_dropdown.change(
            get_study_info, inputs=[study_dropdown], outputs=[study_info]
        )

        submit_btn.click(
            process_multi_input,
            inputs=[study_variables, study_dropdown, prompt_type],
            outputs=[answer_output, download_btn],
        )

        download_btn.click(
            fn=download_as_csv, inputs=[answer_output], outputs=[download_btn]
        ).then(fn=cleanup_temp_files, inputs=None, outputs=None)

        # Event handlers for PDF Chat tab
        def add_message(history, message):
            """Append the user's message to the chat history and clear the box."""
            if not message or not message.strip():
                raise gr.Error("Please enter a message")
            history = (history or []) + [(message, None)]
            return history, ""

        def generate_chat_response(history, collection):
            """Fill in the response for the last message in the history."""
            if not history:
                return history

            last_message = history[-1][0]
            # NOTE(review): `collection` is the raw "Collection Name" text,
            # while uploads are registered under "PDF-<timestamp>-<name>";
            # confirm chat_function can resolve the raw name.
            response = chat_function(last_message, collection, "Default")

            # Update the last message pair with the response
            history[-1] = (last_message, response)
            return history

        def handle_pdf_upload(files, name):
            """Run the upload pipeline and return only its status message.

            process_pdf_uploads returns a (status, update) pair, but only the
            status Markdown is wired as an output here.
            """
            status, _unused_update = process_pdf_uploads(files, name)
            return status

        pdf_files.upload(
            handle_pdf_upload,
            inputs=[pdf_files, collection_name],
            outputs=[pdf_status],
        )

        # Two-step chat: show the user's message first, then (only if that
        # step succeeded) generate and display the response.
        chat_submit_btn.click(
            add_message,
            inputs=[chat_history, query_input],
            outputs=[chat_history, query_input],
        ).success(
            generate_chat_response,
            inputs=[chat_history, collection_name],
            outputs=[chat_history],
        )

    return demo
|
config/pdf_config.yaml
ADDED
File without changes
|
interface.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Gradio interface module for ACRES RAG Platform.
|
3 |
+
Defines the UI components and layout.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import gradio as gr
|
7 |
+
|
8 |
+
|
9 |
+
def create_chat_interface() -> tuple:
    """Create the chat interface and return it together with its components.

    Layout, top to bottom:

    * a row with the chatbot and a "Source Page" image preview,
    * a row with the question textbox and the "Send" button,
    * a row with the PDF upload widget and the collection-name textbox.

    No event handlers are attached here; callers wire them to the returned
    components.

    Returns:
        A 7-tuple ``(chat_interface, chat_history, pdf_preview, query_input,
        submit_btn, pdf_files, collection_name)`` where ``chat_interface`` is
        the enclosing ``gr.Blocks`` and the rest are the components created
        inside it.
    """
    with gr.Blocks() as chat_interface:
        # Conversation on the left, source-page preview on the right.
        with gr.Row():
            with gr.Column(scale=7):
                chat_history = gr.Chatbot(
                    value=[], elem_id="chatbot", height=600, show_label=False
                )
            with gr.Column(scale=3):
                pdf_preview = gr.Image(label="Source Page", height=600)

        # Question input and send button.
        with gr.Row():
            with gr.Column(scale=8):
                query_input = gr.Textbox(
                    show_label=False,
                    placeholder="Ask a question about your documents...",
                    container=False,
                )
            with gr.Column(scale=2):
                submit_btn = gr.Button("Send", variant="primary")

        # PDF upload and collection naming.
        with gr.Row():
            pdf_files = gr.File(
                file_count="multiple", file_types=[".pdf"], label="Upload PDF Files"
            )
            collection_name = gr.Textbox(
                label="Collection Name", placeholder="Name this collection of PDFs..."
            )

    return (
        chat_interface,
        chat_history,
        pdf_preview,
        query_input,
        submit_btn,
        pdf_files,
        collection_name,
    )
|
requirements.txt
CHANGED
@@ -9,4 +9,6 @@ pandas
|
|
9 |
pydantic
|
10 |
python-dotenv
|
11 |
pyzotero
|
12 |
-
python-slugify
|
|
|
|
|
|
9 |
pydantic
|
10 |
python-dotenv
|
11 |
pyzotero
|
12 |
+
python-slugify
|
13 |
+
PyMuPDF==1.23.8
|
14 |
+
Pillow==10.2.0
|
utils/pdf_processor.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
PDF processing module for ACRES RAG Platform.
|
3 |
+
Handles PDF file processing, text extraction, and page rendering.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import os
|
7 |
+
import fitz
|
8 |
+
import logging
|
9 |
+
from typing import Dict, List, Optional
|
10 |
+
from datetime import datetime
|
11 |
+
from slugify import slugify
|
12 |
+
import json
|
13 |
+
from PIL import Image
|
14 |
+
|
15 |
+
logger = logging.getLogger(__name__)
|
16 |
+
|
17 |
+
|
18 |
+
class PDFProcessor:
    """Extract text from PDF files, persist it as JSON, and render pages."""

    def __init__(self, upload_dir: str = "data/uploads"):
        """Initialize PDFProcessor with upload directory.

        Args:
            upload_dir: Directory for uploaded files; created if missing.
        """
        self.upload_dir = upload_dir
        os.makedirs(upload_dir, exist_ok=True)
        self.current_page = 0

    def extract_text_from_pdf(self, file_path: str) -> Dict:
        """Extract text and metadata from a PDF file.

        Args:
            file_path: Path of the PDF to read.

        Returns:
            A dict with keys ``title``, ``authors``, ``date``, ``abstract``
            (first 500 characters of the text, with "..." appended when the
            text is longer), ``full_text``, ``source_file``, ``pages``
            (page index -> page text) and ``page_count``.
        """
        doc = fitz.open(file_path)
        try:
            # Extract text from all pages with page tracking
            pages = {}
            for page_num in range(len(doc)):
                pages[page_num] = doc[page_num].get_text()
            # Each page's text is followed by a newline in the full text.
            text = "".join(page_text + "\n" for page_text in pages.values())

            # The metadata dict, or any value in it, may be None; normalise
            # to strings before string methods are called on them.
            metadata = dict(doc.metadata or {})
            title = metadata.get("title") or os.path.basename(file_path)
            author = metadata.get("author") or ""
            created = metadata.get("creationDate") or ""
            page_count = len(doc)
        finally:
            # Release the file handle even when extraction fails midway.
            doc.close()

        # Create structured document
        return {
            "title": title,
            "authors": author.split(";"),
            "date": created,
            "abstract": text[:500] + "..." if len(text) > 500 else text,
            "full_text": text,
            "source_file": file_path,
            "pages": pages,
            "page_count": page_count,
        }

    def process_pdfs(self, file_paths: List[str], collection_name: str) -> str:
        """Process multiple PDF files and store their content.

        Files that fail to process are logged and skipped.

        Args:
            file_paths: Paths of the PDFs to process.
            collection_name: Label used (slugified) in the output file name.

        Returns:
            Path of the JSON file holding the extracted documents.

        Raises:
            ValueError: If no file could be processed.
        """
        processed_docs = []

        for file_path in file_paths:
            try:
                processed_docs.append(self.extract_text_from_pdf(file_path))
            except Exception as e:
                logger.error("Error processing %s: %s", file_path, e)

        if not processed_docs:
            raise ValueError("No documents were successfully processed")

        # Save to JSON file
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_filename = f"{slugify(collection_name)}_{timestamp}_documents.json"
        output_path = f"data/{output_filename}"

        # The output directory is fixed and independent of upload_dir, so it
        # may not exist when a custom upload_dir is used.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(processed_docs, f, indent=2, ensure_ascii=False)

        return output_path

    def render_page(self, file_path: str, page_num: int) -> "Optional[Image.Image]":
        """Render a specific page from a PDF as an image.

        Args:
            file_path: Path of the PDF.
            page_num: Index of the page to render.

        Returns:
            The rendered page as an RGB image, or ``None`` if opening or
            rendering fails (the error is logged).
        """
        try:
            doc = fitz.open(file_path)
            try:
                page = doc[page_num]
                # Scale by 300/72 on both axes.
                pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72))
                return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            finally:
                # Close the document on both the success and failure paths.
                doc.close()
        except Exception as e:
            logger.error(f"Error rendering page {page_num} from {file_path}: {str(e)}")
            return None
|