File size: 960 Bytes
0d375ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import pypdfium2 as pdfium
import re
class ResumeReader:
    """Extract the text of a PDF resume as a list of cleaned, non-empty lines."""

    # Patterns are compiled once at class-definition time and shared by all calls.
    _NEWLINE_RUN = re.compile(r"\n+")
    _PRIVATE_USE_BULLET = re.compile(r"\uf0b7")  # presumably a symbol-font bullet glyph
    _BULLET_WITH_SPACE = re.compile(r"• ")
    _NON_ASCII_RUN = re.compile(r"[^\x00-\x7F]+")
    _CID_MARKER = re.compile(r"\(cid:\d{0,3}\)")

    def clean_text(self, raw_text: str) -> str:
        """Normalize raw extracted text.

        Steps, in order:

        * carriage returns become newlines, then runs of newlines are
          collapsed into a single newline;
        * tabs become single spaces;
        * the U+F0B7 character and the "• " sequence become a single space;
        * every remaining non-ASCII character is removed;
        * ``(cid:N)`` markers (zero to three digits) become a single space.

        Args:
            raw_text: Text to clean.

        Returns:
            The cleaned text. Line structure is preserved; spaces inside a
            line are not collapsed here.
        """
        # Convert "\r" first so that "\r\n" pairs are collapsed together with
        # ordinary newline runs instead of leaving doubled newlines behind.
        text = raw_text.replace("\r", "\n")
        text = self._NEWLINE_RUN.sub("\n", text)
        text = text.replace("\t", " ")
        text = self._PRIVATE_USE_BULLET.sub(" ", text)
        # Bullets must be handled before the non-ASCII strip below, otherwise
        # the bullet character is already gone and this pattern never matches.
        text = self._BULLET_WITH_SPACE.sub(" ", text)
        text = self._NON_ASCII_RUN.sub("", text)
        text = self._CID_MARKER.sub(" ", text)
        return text

    @staticmethod
    def _extract_page_text(page):
        """Return the text of one page, closing the text page and the page."""
        try:
            textpage = page.get_textpage()
            try:
                return textpage.get_text_range()
            finally:
                textpage.close()
        finally:
            # Children are closed before their parent (text page, then page).
            page.close()

    def read_pdf(self, path_file) -> list:
        """Read a PDF and return its text as cleaned, non-empty lines.

        Args:
            path_file: Input passed unchanged to ``pdfium.PdfDocument``.

        Returns:
            A list of strings, one per non-blank line of the cleaned text,
            each with surrounding whitespace removed and internal whitespace
            runs collapsed to a single space.
        """
        pdf = pdfium.PdfDocument(path_file)
        try:
            page_texts = [self._extract_page_text(page) for page in pdf]
        finally:
            pdf.close()

        # Join pages with a newline so the last line of one page cannot fuse
        # with the first line of the next; blank lines are dropped below.
        cleaned = self.clean_text("\n".join(page_texts))

        # str.split() with no argument both trims and splits on whitespace
        # runs, so joining the pieces collapses internal whitespace.
        collapsed = (" ".join(line.split()) for line in cleaned.splitlines())
        return [line for line in collapsed if line]