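"""Resume parser.

Segments a resume into sections with ResumeSegmenter, runs NER models over
each section, and assembles the results (basic info, work history, education)
into a single structured dict.
"""
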
from itertools import chain
from models import Models
#from models.prototype.models import Models
#from output_model import OutputModel, WorkExperience
from segmenter import ResumeSegmenter
from flashtext import KeywordProcessor
from collections import defaultdict


class ResumeParser:
    def __init__(self) -> None:
        self.resumeSegmenter = ResumeSegmenter()
        self.models = Models()
        
        
    def get_date_index(self, clean_resume_lines, date):
        # all line indexes on which the given date string occurs
        indexes = [i for i, line in enumerate(clean_resume_lines) if date in line]
        return indexes
    
    #better suited to a utils file
    def sort_tokens_table(self, tokens_data):
        # invert [(label, tokens), ...] pairs into a token -> label lookup table
        table = {}
        for key, tokens in tokens_data:
            for token in tokens:
                table[token] = key
        return table
    
    def split_work_exp(self, resume_lines, start_index, end_index, work_dates):
        dates_indexes = [self.get_date_index(resume_lines[start_index:end_index], work_date) for work_date in work_dates]
        dates_indexes = list(chain.from_iterable(dates_indexes))
        dates_indexes = [i + start_index for i in dates_indexes]
        # boundaries must be unique and ordered: dedupe first, then sort
        dates_indexes = sorted(set([start_index + 1] + dates_indexes + [end_index]))

        list_single_work_exp = []
        for i in range(len(dates_indexes) - 1):
            index = dates_indexes[i]
            next_index = dates_indexes[i + 1]
            section = resume_lines[index:next_index]
            if len(section) == 0:
                continue
            list_single_work_exp.append(section)
        return list_single_work_exp
    
    def extract_section_text(self, resume_lines, section_header="work_and_employment"):
        # return the [start, end) line indexes of the requested section
        _, sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
        start_index = sections[section_header][0]
        end_index = sections[section_header][1]
        return start_index, end_index
    
    def format_output(self, keywords, work_section_list, isWorkExp=True):
        # keywords is a list of (label, entities) pairs, e.g.
        # [("date", [...]), ("title", [...]), ("company", [...])]
        if isWorkExp:
            headlines = [text[0] for text in work_section_list]
        else:
            headlines = work_section_list
        table = self.sort_tokens_table(keywords)
        tokens_processor = KeywordProcessor()
        list_keywords = list(chain.from_iterable([tokens[1] for tokens in keywords]))
        tokens_processor.add_keywords_from_list(list_keywords)
        data = []
        for i, header in enumerate(headlines):
            current_data = defaultdict(list)
            tokens = tokens_processor.extract_keywords(header)
            for token in tokens:
                current_data[table[token]].append(token)
            if isWorkExp:
                current_data["description"] = work_section_list[i][1:]
            data.append(dict(current_data))
        return data
      
    def parse_work_history(self, resume_lines):
        start_index, end_index = self.extract_section_text(resume_lines)
        section_lines = resume_lines[start_index:end_index]
        work_dates = self.models.get_ner(section_lines, "date")
        single_work_experiences = self.split_work_exp(resume_lines, start_index, end_index, work_dates)
        job_positions = self.models.get_ner(section_lines, "job title")
        companies = self.models.get_ner(section_lines, "company")
        keywords = [("date", work_dates), ("title", job_positions), ("company", companies)]
        return self.format_output(keywords, single_work_experiences)
    
    def parse_education(self, resume_lines):
        start_index, end_index = self.extract_section_text(resume_lines, "education_and_training")
        tokens = ["degree", "university", "degree field", "date", "location"]
        # one (label, entities) pair per token type, as format_output expects
        keywords = [(token, self.models.get_ner(resume_lines[start_index + 1:end_index], token))
                    for token in tokens]
        output = self.format_output(keywords, resume_lines[start_index:end_index], False)
        output = [res for res in output if res]
        return output
      
    def parse_basic_info(self, resume_lines):
        start_index, end_index = self.extract_section_text(resume_lines, "basics_info")
        #tokens = ["person", "email", "phone"]
        tokens = ["person"]
        # one (label, entities) pair per token type, as format_output expects
        keywords = [(token, self.models.get_ner(resume_lines[start_index:end_index], token))
                    for token in tokens]
        output = {}
        for token, result in keywords:
            if len(result) > 0:
                output[token] = result[0]
        return output
    
    def parse(self, resume_lines):
        jobs = self.parse_work_history(resume_lines)
        education = self.parse_education(resume_lines)
        basic_info = self.parse_basic_info(resume_lines)

        return {"basic_info": basic_info, "education": education, "work_experience": jobs}