Spaces:
Running
Running
import pypdfium2 as pdfium | |
import re | |
import wordninja | |
from PIL import Image | |
from pytesseract import image_to_string | |
from utils import recover_text, get_average_line_len | |
import pdfplumber | |
class ResumeReader:
    """Read a resume from a PDF or image file and return its cleaned text lines."""

    # Extensions are compared against the lower-cased path, so matching is
    # case-insensitive (".PDF" and ".Jpg" are accepted as well).
    _PDF_SUFFIXES = ('.pdf',)
    _IMAGE_SUFFIXES = ('.jpg', '.png', '.jpeg')

    def clean_text(self, raw_text: str) -> str:
        """Normalise raw extracted text.

        Steps, in order: carriage returns become newlines, runs of newlines
        collapse to one, tabs become spaces, bullet glyphs become a space,
        any remaining non-ASCII characters are dropped, and "(cid:N)"
        placeholders are replaced by a space.

        Args:
            raw_text: Text as returned by the PDF extractor or the OCR engine.

        Returns:
            The cleaned, ASCII-only text.
        """
        # Convert "\r" first so that "\r\n" line endings collapse to a single
        # newline instead of leaving an empty line behind.
        text = raw_text.replace("\r", "\n")
        text = re.sub(r'\n+', '\n', text)
        text = text.replace("\t", " ")
        # Bullets must be handled before the non-ASCII strip below; afterwards
        # they no longer exist and the substitutions could never match.
        text = re.sub(r"\uf0b7", " ", text)  # private-use glyph, commonly a Symbol-font bullet
        text = re.sub(r'• ', " ", text)
        text = re.sub(r'[^\x00-\x7F]+', '', text)  # remove non-ascii
        # "(cid:N)" stands in for glyphs the PDF extractor could not map to
        # text; N is not limited to three digits, so accept any digit count.
        text = re.sub(r"\(cid:\d*\)", " ", text)
        return text

    def recover_text(self, text_without_spaces: str) -> str:
        """Re-insert spaces into run-together text.

        Args:
            text_without_spaces: Text whose words are concatenated.

        Returns:
            The pieces produced by ``wordninja.split`` joined by single spaces.
        """
        return " ".join(wordninja.split(text_without_spaces))

    def _split_lines(self, text: str) -> list:
        """Split text into non-blank lines with whitespace runs collapsed to one space."""
        return [
            re.sub(r'\s+', ' ', line.strip())
            for line in text.splitlines()
            if line.strip()
        ]

    def read_image(self, path_file) -> list:
        """OCR an image file and return its cleaned, non-empty text lines.

        Args:
            path_file: Path of the image to read.

        Returns:
            A list of stripped, whitespace-normalised lines.
        """
        # The context manager closes the underlying file once OCR is done.
        with Image.open(path_file) as image:
            raw_text = str(image_to_string(image))
        return self._split_lines(self.clean_text(raw_text))

    def read_pdf(self, path_file) -> list:
        """Extract the text of every page of a PDF as cleaned, non-empty lines.

        Args:
            path_file: Path of the PDF to read.

        Returns:
            A list of stripped, whitespace-normalised lines, in page order.
        """
        with pdfplumber.open(path_file) as pdf:
            # ``or ""`` guards against a page whose extraction yields nothing
            # (e.g. a scanned page without a text layer).
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        # Join pages with a newline so the last line of one page does not fuse
        # with the first line of the next.
        raw_text = "\n".join(page_texts)
        return self._split_lines(self.clean_text(raw_text))

    def read(self, path_file):
        """Read a resume, dispatching on the file extension.

        Args:
            path_file: Path to a ``.pdf``, ``.jpg``, ``.jpeg`` or ``.png`` file
                (extension matched case-insensitively).

        Returns:
            The list of cleaned text lines, or ``None`` (after printing a
            message) when the extension is not supported.
        """
        lowered = str(path_file).lower()
        if lowered.endswith(self._PDF_SUFFIXES):
            return self.read_pdf(path_file)
        if lowered.endswith(self._IMAGE_SUFFIXES):
            return self.read_image(path_file)
        print("Unsupported file format")
        return None