from itertools import chain
from models import Models
#from output_model import OutputModel, WorkExperience
from segmenter import ResumeSegmenter
from flashtext import KeywordProcessor
from collections import defaultdict
import re
import wordninja
from utils import percentage_difference
from nltk import word_tokenize


class ResumeParser():
    """Parse a segmented resume into structured basic-info / education /
    work-experience data, using ResumeSegmenter for sectioning and Models
    for NER extraction."""

    def __init__(self) -> None:
        self.resumeSegmenter = ResumeSegmenter()
        self.models = Models()

    def get_date_index(self, clean_resume_lines, date):
        """Return the indexes of every line that contains the given date string."""
        return [i for i, line in enumerate(clean_resume_lines) if date in line]

    # better suited to a utils file
    def sort_tokens_table(self, tokens_data):
        """Invert (key, tokens) pairs into a flat token -> key lookup table.

        NOTE(review): this method was previously defined twice with identical
        bodies; the redundant duplicate definition has been removed.
        """
        table = {}
        for key, tokens in tokens_data:
            for token in tokens:
                table[token] = key
        return table

    def split_work_exp(self, resume_lines, start_index, end_index, work_dates):
        """Split resume_lines[start_index:end_index] into one section per
        work date, treating each line that contains a date as a boundary.

        Returns a list of non-empty groups of lines (list of lists).
        """
        # Find each date within the section slice, then shift the hits back
        # to absolute line indexes.
        hits = [self.get_date_index(resume_lines[start_index:end_index], work_date)
                for work_date in work_dates]
        absolute = [i + start_index for i in chain.from_iterable(hits)]
        # Deduplicate and order the boundaries in one pass (the original
        # sorted, converted to a set, then sorted again).
        boundaries = sorted(set([start_index + 1] + absolute + [end_index]))
        individual_sections = []
        for i, index in enumerate(boundaries):
            section = resume_lines[index:boundaries[min(i + 1, len(boundaries) - 1)]]
            if not section:
                continue
            individual_sections.append(section)
        return individual_sections

    def extract_section_text(self, resume_lines, section_header="work_and_employment"):
        """Return the (start_index, end_index) span of the named section, or
        None when segmentation fails or the section is absent.

        NOTE(review): removed a leftover debug print(sections) call.
        """
        _, sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
        if sections is None or section_header not in sections:
            return None
        start_index, end_index = sections[section_header]
        #on the bases dates would be unique
        return start_index, end_index
def format_output(self, keywords, headlines, isWorkExp=True): data = [] for section in headlines: extracted_data = {} paragraph = '\n'.join(section) if isWorkExp else ' '.join(section) extracted_data['description'] = paragraph recovered_headlines = ' '.join(wordninja.split(paragraph)) if percentage_difference(len(word_tokenize(paragraph)), len(word_tokenize(recovered_headlines))) > 50: extracted_data['description'] = recovered_headlines for attr in keywords: result = list(set([s for s in attr[1] if s in paragraph or s in recovered_headlines])) if len(result) > 0: extracted_data[attr[0]] = result data.append(extracted_data) return data def parse_work_history(self, resume_lines, sections): start_index, end_index = sections['work_and_employment'] text = ' '.join(resume_lines[start_index:end_index]) recovered_text = ' '.join(wordninja.split(text)) work_dates, companies, locations = self.models.get_ner(text, recovered_text) single_work_experiences = self.split_work_exp(resume_lines, start_index, end_index, work_dates) entity_dict = self.models.get_custom_ner(' '.join(resume_lines[start_index:end_index])) job_positions = entity_dict['job title'] keywords = [("date", work_dates), ("title", job_positions), ("company", companies), ("location", locations)] return self.format_output(keywords, single_work_experiences) def parse_education(self, resume_lines, sections): start_index, end_index = sections["education_and_training"] text = ' '.join(resume_lines[start_index:end_index]) dates, universities, locations = self.models.get_ner(text, ' '.join(wordninja.split(text))) single_education_experiences = self.split_work_exp(resume_lines, start_index, end_index, dates) entity_dict = self.models.get_custom_ner(' '.join(resume_lines[start_index+1:end_index])) degrees = entity_dict['degree'] majors = entity_dict['major'] keywords = [("date", dates), ("major", majors), ("degree", degrees), ("university", universities), ("location", locations)] output = self.format_output(keywords, 
single_education_experiences, False) output = [res for res in output if res] return output def parse_basic_info(self,resume_lines, sections): start_index, end_index = sections["basics_info"] text = ' '.join(resume_lines[start_index:end_index]) phone_pattern = r'(?:(?:\+?\d{1,2}[-.\s]?)?(?:\(\d{1,4}\)|\d{1,4})[-.\s]?)?(?:\d{1,5}[-.\s]?){1,4}\d{1,6}' email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b' entites = self.models.ner(text) if len(entites) == 0: entites = self.models.ner(' '.join(resume_lines)) output = {} score = 0 for entity in entites: if entity['entity_group'] == 'PER' and entity['score'] > score and ' ' in entity['word']: output['name']= entity['word'] score = entity['score'] email = re.findall(email_pattern, text) phone = re.findall(phone_pattern, text) if email == '': email = re.findall(email_pattern, ' '.join(resume_lines)) if phone == '': phone = re.findall(phone_pattern, ' '.join(resume_lines)) output['email'] = email[0] if len(email) > 0 else '' output['phone'] = phone[0] if len(phone) > 0 else '' return output def parse(self, resume_lines): self.resumeSegmenter.resume_segments = { 'objective': [], 'work_and_employment': [], 'education_and_training': [], 'skills': [], 'accomplishments': [], 'misc': [] } self.resumeSegmenter.resume_indices = [] sections = self.resumeSegmenter.segment(resume_lines) if sections is None: return {} jobs = self.parse_work_history(resume_lines, sections) if 'work_and_employment' in sections else {} education = self.parse_education(resume_lines, sections) if 'education_and_training' in sections else {} basic_info = self.parse_basic_info(resume_lines, sections) if 'basics_info' in sections else {} result = {"basic_info":basic_info, "education":education, "work_experience":jobs} for section in sections.keys(): if section not in ['work_and_employment', 'education_and_training', 'basics_info']: text = '\n'.join(resume_lines[sections[section][0]:sections[section][1]]) result[section] =' 
'.join(wordninja.split(text)) return result