Spaces:
Running
Running
Refactor PDFProcessor to skip pages that appear to be part of the references section
Browse files- utils/pdf_processor.py +76 -2
utils/pdf_processor.py
CHANGED
@@ -12,6 +12,7 @@ import datetime
|
|
12 |
from slugify import slugify
|
13 |
import json
|
14 |
from PIL import Image
|
|
|
15 |
|
16 |
|
17 |
logger = logging.getLogger(__name__)
|
@@ -48,6 +49,58 @@ class PDFProcessor:
|
|
48 |
logger.error(f"Error rendering page {page_num} from {file_path}: {str(e)}")
|
49 |
return None
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
def process_pdfs(self, file_paths: List[str], collection_name: str) -> str:
|
52 |
"""Process multiple PDF files and store their content."""
|
53 |
processed_docs = []
|
@@ -56,7 +109,9 @@ class PDFProcessor:
|
|
56 |
try:
|
57 |
doc_data = self.extract_text_from_pdf(file_path)
|
58 |
processed_docs.append(doc_data)
|
59 |
-
logger.info(
|
|
|
|
|
60 |
except Exception as e:
|
61 |
logger.error(f"Error processing {file_path}: {str(e)}")
|
62 |
continue
|
@@ -83,12 +138,30 @@ class PDFProcessor:
|
|
83 |
try:
|
84 |
doc = fitz.open(file_path)
|
85 |
|
|
|
|
|
|
|
86 |
# Extract text from all pages with page tracking
|
87 |
text = ""
|
88 |
pages = {}
|
89 |
for page_num in range(len(doc)):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
page_text = doc[page_num].get_text()
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
text += page_text + "\n"
|
93 |
|
94 |
# Extract metadata
|
@@ -110,6 +183,7 @@ class PDFProcessor:
|
|
110 |
"source_file": file_path,
|
111 |
"pages": pages,
|
112 |
"page_count": len(doc),
|
|
|
113 |
}
|
114 |
|
115 |
doc.close()
|
|
|
12 |
from slugify import slugify
|
13 |
import json
|
14 |
from PIL import Image
|
15 |
+
import re
|
16 |
|
17 |
|
18 |
logger = logging.getLogger(__name__)
|
|
|
49 |
logger.error(f"Error rendering page {page_num} from {file_path}: {str(e)}")
|
50 |
return None
|
51 |
|
52 |
+
def is_references_page(self, text: str) -> bool:
    """
    Check if the page appears to be a references/bibliography page.

    Two heuristics are applied to the raw page text:

    1. One of the first three lines is, on its own, a known section
       header ("References", "Bibliography", ...), ignoring case and
       surrounding whitespace.
    2. At least three of the first ten lines start like a bibliography
       entry ("[1]", "1." or "Author, I.").

    Args:
        text: Plain text of a single page.

    Returns:
        True if either heuristic matches, False otherwise.
    """
    # Common section headers for references; must span the whole candidate.
    header_re = re.compile(
        r"^(?:references|bibliography|works cited|citations|cited literature)\s*$",
        re.IGNORECASE,
    )

    # Check first few lines of the page
    first_lines = text.lower().split("\n")[:3]

    # Test every line on its own: joining the lines first would glue the
    # header to the entries that follow it ("references [1] ...") and the
    # anchored pattern could then never match a real references page.
    candidates = [line.strip() for line in first_lines]
    # The joined block is still tested so a header that the extractor split
    # across lines (e.g. "works" / "cited") keeps being recognised.
    candidates.append(" ".join(first_lines))
    if any(header_re.search(candidate) for candidate in candidates):
        return True

    # Reference-like line starts: "[1]" style, "1." style, "Author, I." style.
    # NOTE(review): the "1." form also matches ordinary numbered lists and
    # headings; the >= 3 threshold below is the only guard against that.
    entry_re = re.compile(r"^(?:\[\d+\]|\d+\.|[A-Z][a-z]+,\s+[A-Z]\.)")

    # Check first 10 lines (case-sensitive, so use the original text)
    ref_pattern_count = sum(
        1 for line in text.split("\n")[:10] if entry_re.match(line.strip())
    )

    # If multiple reference-like patterns are found, likely a references page
    return ref_pattern_count >= 3
|
90 |
+
|
91 |
+
def detect_references_start(self, doc: fitz.Document) -> Optional[int]:
    """
    Locate the first page that looks like the start of the references.

    Pages are examined in order; each page's text is passed to
    ``is_references_page`` and scanning stops at the first match.

    Returns the zero-based index of that page, or None when no page matches.
    """
    # Lazy generator: text extraction stops as soon as one page matches.
    matching_pages = (
        index
        for index in range(len(doc))
        if self.is_references_page(doc[index].get_text())
    )
    page_num = next(matching_pages, None)

    if page_num is not None:
        logger.info(f"Detected references section starting at page {page_num}")
    return page_num
|
103 |
+
|
104 |
def process_pdfs(self, file_paths: List[str], collection_name: str) -> str:
|
105 |
"""Process multiple PDF files and store their content."""
|
106 |
processed_docs = []
|
|
|
109 |
try:
|
110 |
doc_data = self.extract_text_from_pdf(file_path)
|
111 |
processed_docs.append(doc_data)
|
112 |
+
logger.info(
|
113 |
+
f"Successfully processed {file_path} ({doc_data['content_pages']} content pages)"
|
114 |
+
)
|
115 |
except Exception as e:
|
116 |
logger.error(f"Error processing {file_path}: {str(e)}")
|
117 |
continue
|
|
|
138 |
try:
|
139 |
doc = fitz.open(file_path)
|
140 |
|
141 |
+
# Find references section start
|
142 |
+
refs_start = self.detect_references_start(doc)
|
143 |
+
|
144 |
# Extract text from all pages with page tracking
|
145 |
text = ""
|
146 |
pages = {}
|
147 |
for page_num in range(len(doc)):
|
148 |
+
# Skip if this is after references section starts
|
149 |
+
if refs_start is not None and page_num >= refs_start:
|
150 |
+
logger.info(
|
151 |
+
f"Skipping page {page_num} as it appears to be part of references"
|
152 |
+
)
|
153 |
+
continue
|
154 |
+
|
155 |
page_text = doc[page_num].get_text()
|
156 |
+
|
157 |
+
# Extra check to catch references if they weren't caught by the initial scan
|
158 |
+
if page_num > 0 and self.is_references_page(page_text):
|
159 |
+
logger.info(
|
160 |
+
f"Detected references content on page {page_num}, skipping"
|
161 |
+
)
|
162 |
+
continue
|
163 |
+
|
164 |
+
pages[str(page_num)] = page_text
|
165 |
text += page_text + "\n"
|
166 |
|
167 |
# Extract metadata
|
|
|
183 |
"source_file": file_path,
|
184 |
"pages": pages,
|
185 |
"page_count": len(doc),
|
186 |
+
"content_pages": len(pages), # Number of pages excluding references
|
187 |
}
|
188 |
|
189 |
doc.close()
|