ak3ra committed on
Commit
8f0a182
1 Parent(s): f4a9ada

Refactor PDFProcessor to skip pages that appear to be part of the references section

Browse files
Files changed (1) hide show
  1. utils/pdf_processor.py +76 -2
utils/pdf_processor.py CHANGED
@@ -12,6 +12,7 @@ import datetime
12
  from slugify import slugify
13
  import json
14
  from PIL import Image
 
15
 
16
 
17
  logger = logging.getLogger(__name__)
@@ -48,6 +49,58 @@ class PDFProcessor:
48
  logger.error(f"Error rendering page {page_num} from {file_path}: {str(e)}")
49
  return None
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def process_pdfs(self, file_paths: List[str], collection_name: str) -> str:
52
  """Process multiple PDF files and store their content."""
53
  processed_docs = []
@@ -56,7 +109,9 @@ class PDFProcessor:
56
  try:
57
  doc_data = self.extract_text_from_pdf(file_path)
58
  processed_docs.append(doc_data)
59
- logger.info(f"Successfully processed {file_path}")
 
 
60
  except Exception as e:
61
  logger.error(f"Error processing {file_path}: {str(e)}")
62
  continue
@@ -83,12 +138,30 @@ class PDFProcessor:
83
  try:
84
  doc = fitz.open(file_path)
85
 
 
 
 
86
  # Extract text from all pages with page tracking
87
  text = ""
88
  pages = {}
89
  for page_num in range(len(doc)):
 
 
 
 
 
 
 
90
  page_text = doc[page_num].get_text()
91
- pages[str(page_num)] = page_text # Convert page_num to string for JSON
 
 
 
 
 
 
 
 
92
  text += page_text + "\n"
93
 
94
  # Extract metadata
@@ -110,6 +183,7 @@ class PDFProcessor:
110
  "source_file": file_path,
111
  "pages": pages,
112
  "page_count": len(doc),
 
113
  }
114
 
115
  doc.close()
 
12
  from slugify import slugify
13
  import json
14
  from PIL import Image
15
+ import re
16
 
17
 
18
  logger = logging.getLogger(__name__)
 
49
  logger.error(f"Error rendering page {page_num} from {file_path}: {str(e)}")
50
  return None
51
 
52
def is_references_page(self, text: str) -> bool:
    """Return True if *text* looks like a references/bibliography page.

    Two heuristics are applied in order:

    1. One of the first three lines, once stripped and lower-cased, is
       exactly a known section header ("references", "bibliography", ...).
    2. At least three of the first ten lines start like a citation entry
       ("[1]", "1." or "Surname, I.").

    Args:
        text: Plain text of a single page.

    Returns:
        True when either heuristic fires, otherwise False.
    """
    # Common section headers for references (compared case-insensitively).
    ref_headers = {
        "references",
        "bibliography",
        "works cited",
        "citations",
        "cited literature",
    }

    lines = text.split("\n")

    # Test each leading line on its own. Joining the lines first and then
    # anchoring with ^...$ only matched when the header was the sole content
    # of those lines, so "References\n[1] ..." was never recognised.
    if any(line.strip().lower() in ref_headers for line in lines[:3]):
        return True

    # Reference-like line starts (e.g. "[1] Author, et al.").
    ref_patterns = (
        r"^\[\d+\]",  # [1] style
        r"^\d+\.",  # 1. style
        r"^[A-Z][a-z]+,\s+[A-Z]\.",  # Author, I. style
    )

    # Only the first 10 lines are inspected.
    ref_pattern_count = sum(
        1
        for line in lines[:10]
        if any(re.match(pattern, line.strip()) for pattern in ref_patterns)
    )

    # A single match is too weak a signal; require several.
    return ref_pattern_count >= 3
90
+
91
def detect_references_start(self, doc: "fitz.Document") -> Optional[int]:
    """Detect the page where the references section starts.

    Pages are scanned in order and the index of the first page for which
    ``is_references_page`` returns True is returned.

    Page 0 is never reported. The per-page check in the extraction loop
    is guarded by ``page_num > 0``; reporting page 0 here would bypass that
    guard and cause every page of the document to be skipped.

    Args:
        doc: An opened document supporting ``len(doc)`` and ``doc[i]``,
            whose pages provide ``get_text()``.

    Returns:
        The zero-based page index where references start, or None if no
        page after the first looks like a references page.
    """
    for page_num in range(1, len(doc)):
        if self.is_references_page(doc[page_num].get_text()):
            logger.info(f"Detected references section starting at page {page_num}")
            return page_num
    return None
103
+
104
  def process_pdfs(self, file_paths: List[str], collection_name: str) -> str:
105
  """Process multiple PDF files and store their content."""
106
  processed_docs = []
 
109
  try:
110
  doc_data = self.extract_text_from_pdf(file_path)
111
  processed_docs.append(doc_data)
112
+ logger.info(
113
+ f"Successfully processed {file_path} ({doc_data['content_pages']} content pages)"
114
+ )
115
  except Exception as e:
116
  logger.error(f"Error processing {file_path}: {str(e)}")
117
  continue
 
138
  try:
139
  doc = fitz.open(file_path)
140
 
141
+ # Find references section start
142
+ refs_start = self.detect_references_start(doc)
143
+
144
  # Extract text from all pages with page tracking
145
  text = ""
146
  pages = {}
147
  for page_num in range(len(doc)):
148
+ # Skip if this is after references section starts
149
+ if refs_start is not None and page_num >= refs_start:
150
+ logger.info(
151
+ f"Skipping page {page_num} as it appears to be part of references"
152
+ )
153
+ continue
154
+
155
  page_text = doc[page_num].get_text()
156
+
157
+ # Extra check to catch references if they weren't caught by the initial scan
158
+ if page_num > 0 and self.is_references_page(page_text):
159
+ logger.info(
160
+ f"Detected references content on page {page_num}, skipping"
161
+ )
162
+ continue
163
+
164
+ pages[str(page_num)] = page_text
165
  text += page_text + "\n"
166
 
167
  # Extract metadata
 
183
  "source_file": file_path,
184
  "pages": pages,
185
  "page_count": len(doc),
186
+ "content_pages": len(pages), # Number of pages excluding references
187
  }
188
 
189
  doc.close()