# detailed-resume-parser / resume_parser.py
# khrek's picture
# Update resume_parser.py
# a3970e4
# raw
# history blame
# 5.21 kB
from itertools import chain
from models import Models
#from models.prototype.models import Models
#from output_model import OutputModel, WorkExperience
from segmenter import ResumeSegmenter
from flashtext import KeywordProcessor
from collections import defaultdict
class ResumeParser():
    """Turn the lines of a resume into structured work, education and basic info.

    Section boundaries come from ``ResumeSegmenter.get_parsed_sections`` and
    entity strings come from ``Models.get_ner``.  Both are project classes
    defined elsewhere.
    NOTE(review): ``get_ner(lines, label)`` is assumed to return a list of
    strings found in ``lines`` -- that is how ``parse_work_history`` consumes
    it; confirm against ``Models``.
    """

    def __init__(self) -> None:
        self.resumeSegmenter = ResumeSegmenter()
        self.models = Models()

    def get_date_index(self, clean_resume_lines, date):
        """Return the index of every line that contains ``date`` as a substring."""
        return [i for i, line in enumerate(clean_resume_lines) if date in line]

    # better suited to a utils file
    @staticmethod
    def sort_tokens_table(tokens_data):
        """Build a reverse lookup ``{token: key}`` from ``(key, tokens)`` pairs.

        Declared as a static method so it can be called both on an instance
        (as ``format_output`` does) and directly on the class.  When a token
        appears under several keys, the last pair wins.
        """
        return {token: key for key, tokens in tokens_data for token in tokens}

    def split_work_exp(self, resume_lines, start_index, end_index, work_dates):
        """Split the work section into one chunk of lines per dated entry.

        Args:
            resume_lines: all lines of the resume.
            start_index: index of the first line of the work section; the
                line after it is used as the first boundary.
            end_index: exclusive end of the work section.
            work_dates: date strings; every line containing one starts a
                new chunk.

        Returns:
            A list of non-empty line lists, in document order.
        """
        section_lines = resume_lines[start_index:end_index]
        # get_date_index works on the slice, so shift back to absolute positions.
        date_positions = [
            relative + start_index
            for work_date in work_dates
            for relative in self.get_date_index(section_lines, work_date)
        ]
        # Boundaries must be unique AND ordered: deduplicate first, sort last
        # (iterating a set gives no ordering guarantee).
        boundaries = sorted(set([start_index + 1] + date_positions + [end_index]))
        chunks = (
            resume_lines[begin:end]
            for begin, end in zip(boundaries, boundaries[1:])
        )
        return [chunk for chunk in chunks if chunk]

    def extract_section_text(self, resume_lines, section_header = "work_and_employment"):
        """Return ``(start_index, end_index)`` of the section named ``section_header``.

        The bounds are the first two items stored under ``section_header`` in
        the sections mapping returned by the segmenter; a lookup failure for
        an absent section is not handled here and propagates to the caller.
        """
        _, sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
        bounds = sections[section_header]
        return bounds[0], bounds[1]

    def format_output(self, keywords, work_section_list, isWorkExp=True):
        """Group the keywords found in each headline by their label.

        Args:
            keywords: list of ``(label, tokens)`` pairs, e.g.
                ``("date", ["2020", ...])``.
            work_section_list: when ``isWorkExp`` is true, a list of line
                lists whose first line is the headline; otherwise a flat list
                of lines, each treated as a headline.
            isWorkExp: whether to attach the remaining lines of each entry
                under the ``"description"`` key.

        Returns:
            One dict per headline mapping each label to the tokens found in
            that headline (plus ``"description"`` for work entries).
        """
        if isWorkExp:
            headlines = [text[0] for text in work_section_list]
        else:
            headlines = work_section_list
        table = self.sort_tokens_table(keywords)
        tokens_processor = KeywordProcessor()
        list_keywords = list(chain.from_iterable(tokens for _, tokens in keywords))
        tokens_processor.add_keywords_from_list(list_keywords)
        data = []
        for i, header in enumerate(headlines):
            current_data = defaultdict(list)
            for token in tokens_processor.extract_keywords(header):
                current_data[table[token]].append(token)
            if isWorkExp:
                current_data["description"] = work_section_list[i][1:]
            data.append(dict(current_data))
        return data

    def parse_work_history(self, resume_lines):
        """Return one dict per job with its dates, titles, companies and description."""
        start_index, end_index = self.extract_section_text(resume_lines)
        section_lines = resume_lines[start_index:end_index]
        work_dates = self.models.get_ner(section_lines, "date")
        single_work_experiences = self.split_work_exp(resume_lines, start_index, end_index, work_dates)
        job_positions = self.models.get_ner(section_lines, "job title")
        companies = self.models.get_ner(section_lines, "company")
        keywords = [("date", work_dates), ("title", job_positions), ("company", companies)]
        return self.format_output(keywords, single_work_experiences)

    def parse_education(self, resume_lines):
        """Return one dict per education line that contains a recognised entity."""
        start_index, end_index = self.extract_section_text(resume_lines, "education_and_training")
        tokens = ["degree", "university", "degree field", "date", "location"]
        # Entities are searched below the section header line.
        body_lines = resume_lines[start_index+1:end_index]
        # format_output needs (label, tokens) pairs covering every label.
        keywords = [(token, self.models.get_ner(body_lines, token)) for token in tokens]
        output = self.format_output(keywords, resume_lines[start_index:end_index], False)
        # Lines without any recognised entity yield an empty dict; drop them.
        return [res for res in output if res]

    def parse_basic_info(self, resume_lines):
        """Return ``{label: first match}`` for each basic-info label that was found."""
        start_index, end_index = self.extract_section_text(resume_lines, "basics_info")
        section_lines = resume_lines[start_index:end_index]
        #tokens = ["person", "email", "phone"]
        tokens = ["person"]
        output = {}
        for token in tokens:
            result = self.models.get_ner(section_lines, token)
            if result:
                output[token] = result[0]
        return output

    def parse(self, resume_lines):
        """Parse the whole resume into basic info, education and work experience."""
        jobs = self.parse_work_history(resume_lines)
        education = self.parse_education(resume_lines)
        basic_info = self.parse_basic_info(resume_lines)
        return {"basic_info":basic_info, "education":education, "work_experience":jobs}