from itertools import chain
from models import Models
# from models.prototype.models import Models
# from output_model import OutputModel, WorkExperience
from models.prototype.segmenter import ResumeSegmenter
from flashtext import KeywordProcessor
from collections import defaultdict


class ResumeParser():
    def __init__(self) -> None:
        self.resumeSegmenter = ResumeSegmenter()
        self.models = Models()
    def get_date_index(self, clean_resume_lines, date):
        indexes = [i for i, line in enumerate(clean_resume_lines) if date in line]
        return indexes
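    # e.g. get_date_index(["Jan 2019 - Dec 2019", "Acme Corp"], "Jan 2019") -> [0]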
    # better suited to a utils file
    def sort_tokens_table(self, tokens_data):
        table = {}
        for key, tokens in tokens_data:
            for token in tokens:
                table[token] = key
        return table
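    # Example: sort_tokens_table([("date", ["2019", "2020"]), ("title", ["Engineer"])])
    # returns {"2019": "date", "2020": "date", "Engineer": "title"}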
    def split_work_exp(self, resume_lines, start_index, end_index, work_dates):
        dates_indexes = [self.get_date_index(resume_lines[start_index:end_index], work_date) for work_date in work_dates]
        dates_indexes = list(chain.from_iterable(dates_indexes))
        dates_indexes = [i + start_index for i in dates_indexes]
        # the boundary list must be unique and ordered: dedupe first, then sort
        # (sorting before deduplicating into a set would discard the ordering)
        dates_indexes = sorted(set([start_index + 1] + dates_indexes + [end_index]))
        list_single_work_exp = []
        for i in range(len(dates_indexes) - 1):
            index = dates_indexes[i]
            next_index = dates_indexes[i + 1]
            section = resume_lines[index:next_index]
            if len(section) == 0:
                continue
            list_single_work_exp.append(section)
        return list_single_work_exp
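    # Example (hypothetical indexes): with start_index=3, end_index=9 and dates
    # found on lines 4 and 7, the boundaries become [4, 7, 9], so the section is
    # split into resume_lines[4:7] and resume_lines[7:9], one per work experience.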
    def extract_section_text(self, resume_lines, section_header="work_and_employment"):
        text_segments, sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
        start_index = sections[section_header][0]
        end_index = sections[section_header][1]
        # works on the basis that dates are unique within the section
        return start_index, end_index
    def format_output(self, keywords, work_section_list, isWorkExp=True):
        if isWorkExp:
            headlines = [text[0] for text in work_section_list]
        else:
            headlines = work_section_list
        # map every token back to its keyword category ("date", "title", ...)
        table = self.sort_tokens_table(keywords)
        tokens_processor = KeywordProcessor()
        list_keywords = list(chain.from_iterable([tokens[1] for tokens in keywords]))
        tokens_processor.add_keywords_from_list(list_keywords)
        data = []
        for i, header in enumerate(headlines):
            current_data = defaultdict(list)
            tokens = tokens_processor.extract_keywords(header)
            for token in tokens:
                current_data[table[token]].append(token)
            if isWorkExp:
                current_data["description"] = work_section_list[i][1:]
            data.append(dict(current_data))
        return data
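    # Example output entry (hypothetical values):
    # {"date": ["Jan 2019"], "title": ["Software Engineer"],
    #  "company": ["Acme Corp"], "description": ["Built internal tooling."]}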
    def parse_work_history(self, resume_lines):
        start_index, end_index = self.extract_section_text(resume_lines)
        work_dates = self.models.get_ner(resume_lines[start_index:end_index], "date")
        single_work_experiences = self.split_work_exp(resume_lines, start_index, end_index, work_dates)
        job_positions = self.models.get_ner(resume_lines[start_index:end_index], "job title")
        companies = self.models.get_ner(resume_lines[start_index:end_index], "company")
        keywords = [("date", work_dates), ("title", job_positions), ("company", companies)]
        return self.format_output(keywords, single_work_experiences)
    def parse_education(self, resume_lines):
        start_index, end_index = self.extract_section_text(resume_lines, "education_and_training")
        tokens = ["degree", "university", "degree field", "date", "location"]
        # collect a (label, entities) pair per token type instead of overwriting
        # keywords on every pass, matching the input shape format_output expects
        keywords = [(token, self.models.get_ner(resume_lines[start_index + 1:end_index], token)) for token in tokens]
        output = self.format_output(keywords, resume_lines[start_index:end_index], False)
        output = [res for res in output if res]
        return output
    def parse_basic_info(self, resume_lines):
        start_index, end_index = self.extract_section_text(resume_lines, "basics_info")
        # tokens = ["person", "email", "phone"]
        tokens = ["person"]
        # pair each label with its extracted entities instead of overwriting
        # keywords on every pass through the loop
        keywords = [(token, self.models.get_ner(resume_lines[start_index:end_index], token)) for token in tokens]
        output = {}
        for token, result in keywords:
            if len(result) > 0:
                output[token] = result[0]
        return output
    def parse(self, resume_lines):
        jobs = self.parse_work_history(resume_lines)
        education = self.parse_education(resume_lines)
        basic_info = self.parse_basic_info(resume_lines)
        return {"basic_info": basic_info, "education": education, "work_experience": jobs}