khrek committed on
Commit
547a2b6
1 Parent(s): d4229f8

Upload reader.py

Browse files
Files changed (1) hide show
  1. reader.py +35 -6
reader.py CHANGED
@@ -1,5 +1,11 @@
1
  import pypdfium2 as pdfium
2
  import re
 
 
 
 
 
 
3
  class ResumeReader:
4
 
5
  def clean_text(self, raw_text):
@@ -12,14 +18,37 @@ class ResumeReader:
12
  clean_text = re.sub(r'• ', " ", clean_text)
13
  return clean_text
14
 
15
- def read_pdf(self, path_file):
16
- raw_text = ""
17
- pdf = pdfium.PdfDocument(path_file)
18
- for page in pdf:
19
- raw_text += page.get_textpage().get_text_range()
 
20
  clean_text = self.clean_text(raw_text)
21
  resume_lines = clean_text.splitlines(True)
22
  resume_lines = [re.sub('\s+', ' ', line.strip()) for line in resume_lines if line.strip()]
 
 
23
  return resume_lines
24
 
25
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pypdfium2 as pdfium
2
  import re
3
+ import wordninja
4
+ from PIL import Image
5
+ from pytesseract import image_to_string
6
+ from utils import recover_text, get_average_line_len
7
+ import pdfplumber
8
+
9
  class ResumeReader:
10
 
11
  def clean_text(self, raw_text):
 
18
  clean_text = re.sub(r'• ', " ", clean_text)
19
  return clean_text
20
 
21
def recover_text(self, text_without_spaces):
    """Split run-together text into words and rejoin them with single spaces.

    Args:
        text_without_spaces: Text to pass to ``wordninja.split``.

    Returns:
        The pieces returned by ``wordninja.split`` joined by one space each.
    """
    words = wordninja.split(text_without_spaces)
    return " ".join(words)
24
+
25
def read_image(self, path_file):
    """Extract the text of an image file and return it as cleaned lines.

    Args:
        path_file: Path of the image to open with PIL.

    Returns:
        A list of non-blank lines, each stripped and with every run of
        whitespace collapsed to a single space.
    """
    # Open through a context manager so the underlying file handle is
    # released as soon as pytesseract has produced the text.
    with Image.open(path_file) as image:
        raw_text = str(image_to_string(image))

    clean_text = self.clean_text(raw_text)
    # NOTE: word-spacing recovery (utils.recover_text driven by
    # get_average_line_len) is currently disabled for this reader.
    return [
        re.sub(r'\s+', ' ', line.strip())
        for line in clean_text.splitlines()
        if line.strip()
    ]
33
 
34
def read_pdf(self, path_file):
    """Extract the text of every page of a PDF and return it as cleaned lines.

    Args:
        path_file: Path of the PDF to open with pdfplumber.

    Returns:
        A list of non-blank lines, each stripped and with every run of
        whitespace collapsed to a single space.
    """
    with pdfplumber.open(path_file) as pdf:
        # `or ""` guards against pages for which extract_text() yields
        # nothing (some pdfplumber versions return None for such pages).
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Join pages with a newline so the last line of one page is not fused
    # with the first line of the next page.
    raw_text = "\n".join(page_texts)

    clean_text = self.clean_text(raw_text)
    # NOTE: word-spacing recovery (utils.recover_text driven by
    # get_average_line_len) is currently disabled for this reader.
    return [
        re.sub(r'\s+', ' ', line.strip())
        for line in clean_text.splitlines()
        if line.strip()
    ]
47
def read(self, path_file):
    """Dispatch to the reader that matches the file extension.

    The extension check is case-insensitive, so ``resume.PDF`` and
    ``scan.JPG`` are handled like their lower-case counterparts.

    Args:
        path_file: Path of the resume file as a string.

    Returns:
        The list of lines produced by ``read_pdf`` or ``read_image``, or
        ``None`` (after printing a message) when the extension is not
        supported.
    """
    # Compare against a lower-cased copy only; the readers still receive
    # the path exactly as the caller supplied it.
    lowered = path_file.lower()
    if lowered.endswith('.pdf'):
        return self.read_pdf(path_file)
    if lowered.endswith(('.jpg', '.png', '.jpeg')):
        return self.read_image(path_file)
    print("Unsupported file format")
    return None