from itertools import chain
from models import Models
#from models.prototype.models import Models
#from output_model import OutputModel, WorkExperience
from segmenter import ResumeSegmenter
from flashtext import KeywordProcessor
from collections import defaultdict


class ResumeParser():
    """Turn the lines of a resume into structured basic-info, education and work data.

    Section boundaries come from ``ResumeSegmenter.get_parsed_sections`` and
    entity extraction from ``Models.get_ner``; this class only combines the two.
    """

    def __init__(self) -> None:
        self.resumeSegmenter = ResumeSegmenter()
        self.models = Models()

    def get_date_index(self, clean_resume_lines, date):
        """Return the index of every line in ``clean_resume_lines`` containing ``date``.

        The match is a plain ``in`` test, so for string lines it is a
        substring match.
        """
        return [i for i, line in enumerate(clean_resume_lines) if date in line]

    # better suited to a utils file
    def sort_tokens_table(self, tokens_data):
        """Build a reverse lookup from each token to the key it was listed under.

        Args:
            tokens_data: iterable of ``(key, tokens)`` pairs.

        Returns:
            dict mapping every token to its key. When a token appears under
            several keys, the last pair in ``tokens_data`` wins.
        """
        # NOTE: the class used to define this method twice; the second
        # definition had no ``self`` and shadowed this one, which made
        # ``self.sort_tokens_table(...)`` raise TypeError.
        return {token: key for key, tokens in tokens_data for token in tokens}

    def split_work_exp(self, resume_lines, start_index, end_index, work_dates):
        """Split the work section into one chunk of lines per detected date.

        Args:
            resume_lines: all lines of the resume.
            start_index: index of the first line of the work section.
            end_index: index one past the last line of the work section.
            work_dates: date strings to look for inside the section.

        Returns:
            list of non-empty line lists, in document order. Each chunk runs
            from one boundary up to (not including) the next one.
        """
        section_lines = resume_lines[start_index:end_index]
        relative_hits = chain.from_iterable(
            self.get_date_index(section_lines, work_date) for work_date in work_dates
        )
        # Boundaries are absolute indexes into resume_lines. ``start_index + 1``
        # is always a boundary, as is ``end_index``.
        boundaries = {start_index + 1, end_index}
        boundaries.update(start_index + offset for offset in relative_hits)
        # De-duplicate first, then sort: converting to a set after sorting
        # (as the code did before) does not guarantee the order survives.
        ordered = sorted(boundaries)

        list_single_work_exp = []
        for lower, upper in zip(ordered, ordered[1:]):
            section = resume_lines[lower:upper]
            if len(section) == 0:
                continue
            list_single_work_exp.append(section)
        return list_single_work_exp

    def extract_section_text(self, resume_lines, section_header="work_and_employment"):
        """Return the ``(start_index, end_index)`` recorded for ``section_header``.

        Raises:
            KeyError: if the segmenter's section mapping has no entry for
                ``section_header``.
        """
        # Only the section mapping is needed; the first returned value is unused.
        _, sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
        bounds = sections[section_header]
        return bounds[0], bounds[1]

    def format_output(self, keywords, work_section_list, isWorkExp=True):
        """Group the keywords found in each headline under their label.

        Args:
            keywords: list of ``(label, tokens)`` pairs.
            work_section_list: when ``isWorkExp`` is true, a list of line lists
                whose first line is used as the headline; otherwise a list of
                lines, each used as a headline on its own.
            isWorkExp: selects which of the two input shapes above applies.

        Returns:
            list with one dict per headline, mapping each label to the list of
            tokens found in that headline. For work experience, the remaining
            lines of the section are stored under ``"description"``.
        """
        if isWorkExp:
            headlines = [text[0] for text in work_section_list]
        else:
            headlines = work_section_list

        table = self.sort_tokens_table(keywords)
        tokens_processor = KeywordProcessor()
        list_keywords = list(chain.from_iterable(tokens for _, tokens in keywords))
        tokens_processor.add_keywords_from_list(list_keywords)

        data = []
        for i, header in enumerate(headlines):
            current_data = defaultdict(list)
            for token in tokens_processor.extract_keywords(header):
                current_data[table[token]].append(token)
            if isWorkExp:
                current_data["description"] = work_section_list[i][1:]
            data.append(dict(current_data))
        return data

    def parse_work_history(self, resume_lines):
        """Extract one dict per job (date, title, company, description)."""
        start_index, end_index = self.extract_section_text(resume_lines)
        section_lines = resume_lines[start_index:end_index]

        work_dates = self.models.get_ner(section_lines, "date")
        single_work_experiences = self.split_work_exp(
            resume_lines, start_index, end_index, work_dates
        )
        job_positions = self.models.get_ner(section_lines, "job title")
        companies = self.models.get_ner(section_lines, "company")

        keywords = [("date", work_dates), ("title", job_positions), ("company", companies)]
        return self.format_output(keywords, single_work_experiences)

    def parse_education(self, resume_lines):
        """Extract the education entries, dropping lines where nothing was found."""
        start_index, end_index = self.extract_section_text(resume_lines, "education_and_training")
        tokens = ["degree", "university", "degree field", "date", "location"]

        # Entities are extracted from the lines after the section's first line.
        body_lines = resume_lines[start_index + 1:end_index]
        # format_output needs (label, tokens) pairs covering every label; the
        # previous loop kept only the last label's raw result and called a
        # non-existent self.get_ner.
        # Assumes get_ner returns an iterable of strings, as in parse_work_history.
        keywords = [(token, self.models.get_ner(body_lines, token)) for token in tokens]

        output = self.format_output(keywords, resume_lines[start_index:end_index], False)
        return [res for res in output if res]

    def parse_basic_info(self, resume_lines):
        """Return a dict with the first match of each basic-info label."""
        start_index, end_index = self.extract_section_text(resume_lines, "basics_info")
        #tokens = ["person", "email", "phone"]
        tokens = ["person"]

        section_lines = resume_lines[start_index:end_index]
        # Pair every label with its results so the loop below can unpack them;
        # previously ``keywords`` was the raw result of the last label only.
        keywords = [(token, self.models.get_ner(section_lines, token)) for token in tokens]

        output = {}
        for token, result in keywords:
            if len(result) > 0:
                output[token] = result[0]
        return output

    def parse(self, resume_lines):
        """Parse a whole resume into basic info, education and work experience."""
        jobs = self.parse_work_history(resume_lines)
        education = self.parse_education(resume_lines)
        basic_info = self.parse_basic_info(resume_lines)
        return {"basic_info": basic_info, "education": education, "work_experience": jobs}