Spaces:

khrek
/

detailed-resume-parser

Running

App Files Files Community

detailed-resume-parser / segmenter.py

khrek

Update segmenter.py

15d0f45 12 months ago

raw

history blame

4.41 kB

	from flashtext import KeywordProcessor
	import json
	class ResumeSegmenter():
	    """Split a resume, given as a list of text lines, into named sections.

	    Header lines are detected by keyword matching against the section names
	    loaded from ``./sections.json``; the line ranges between detected headers
	    become the sections.

	    Attributes:
	        resume_segments: maps a section name to the indices of the lines
	            recognised as headers of that section.
	        resume_indices: indices of every detected header line, in the order
	            the lines were scanned (ascending).
	    """

	    def __init__(self):
	        self._reset_state()

	    def _reset_state(self):
	        """Clear the header indices collected from a previous resume."""
	        self.resume_segments = {
	            'objective': [],
	            'work_and_employment': [],
	            'education_and_training': [],
	            'skills': [],
	            'accomplishments': [],
	            'misc': [],
	        }
	        self.resume_indices = []

	    def get_average_line_len(self, lines):
	        """Return the mean number of characters per line (0.0 if no lines)."""
	        if not lines:
	            return 0.0
	        return sum(len(line) for line in lines) / len(lines)

	    def get_average_words_per_line(self, lines):
	        """Return the mean number of space-separated tokens per line.

	        Tokens are produced by ``line.split(' ')``, so consecutive spaces
	        yield empty tokens that are counted too. Returns 0.0 if no lines.
	        """
	        if not lines:
	            return 0.0
	        # TODO: other stopwords too?
	        return sum(len(line.split(' ')) for line in lines) / len(lines)

	    def find_segment_indices(self, text_list):
	        """Detect section-header lines and record their indices on ``self``.

	        Fills ``self.resume_indices`` and ``self.resume_segments``. State from
	        a previous call is discarded first, so one instance can be reused for
	        several resumes without mixing their indices.

	        Args:
	            text_list: the resume as a list of line strings.

	        Raises:
	            OSError: if ``./sections.json`` cannot be opened.
	            KeyError: if the JSON has no ``"section_headers"`` entry.
	        """
	        self._reset_state()

	        # NOTE: the path is resolved against the current working directory.
	        with open(r"./sections.json", encoding="utf-8") as f:
	            section_headers = json.load(f)["section_headers"]

	        keyword_processor = KeywordProcessor()
	        keyword_processor.add_keywords_from_dict(keyword_dict=section_headers)

	        # A header is expected to be clearly shorter than a typical line.
	        max_header_words = self.get_average_words_per_line(text_list) * 0.75

	        for i, line in enumerate(text_list):
	            # Blank lines cannot be headers (and line[0] would fail on them).
	            if not line:
	                continue
	            # Lines starting in lower case or ending a sentence are body text.
	            if line[0].islower() or line[-1] == '.':
	                continue
	            kys = keyword_processor.extract_keywords(line)
	            if not kys:
	                continue
	            # TODO: other stopwords? from where? nltk lib? pos tagger?
	            if len(line.split(" ")) > max_header_words:
	                continue
	            # TODO: is it necessary to keep the actual raw keyword?
	            self.resume_indices.append(i)
	            # setdefault tolerates section names that appear in the JSON but
	            # are not among the pre-seeded keys.
	            self.resume_segments.setdefault(kys[0], []).append(i)

	    def slice_segments(self, lines):
	        """Turn the recorded header indices into ``(start, end)`` line ranges.

	        Each section starts at its first header and ends at the header that
	        follows its last header, or at ``len(lines)`` when its last header is
	        the final header of the document. The lines before the first header
	        are reported as ``"basics_info"``.

	        Args:
	            lines: the resume lines the indices refer to.

	        Returns:
	            A dict mapping section name to a ``(start, end)`` tuple (end
	            exclusive), or ``None`` when no header has been recorded.
	        """
	        if not self.resume_indices:
	            return None

	        sections = {}
	        last_header = self.resume_indices[-1]
	        for section, points in self.resume_segments.items():
	            if not points:
	                continue
	            start_point = points[0]
	            final_header = points[-1]
	            if final_header == last_header:
	                # Nothing follows this section: it runs to the end of the
	                # document, even when it also has earlier headers.
	                end_point = len(lines)
	            else:
	                next_pos = self.resume_indices.index(final_header) + 1
	                end_point = self.resume_indices[next_pos]
	            sections[section] = (start_point, end_point)

	        sections["basics_info"] = (0, self.resume_indices[0])
	        return sections

	    def get_interval_intersection(self, sections, interval):
	        """Find the first section whose range overlaps ``interval``.

	        Args:
	            sections: iterable of ``(name, (start, end))`` pairs.
	            interval: a ``(start, end)`` pair to test against each section.

	        Returns:
	            ``([start, end], section)`` for the first overlapping section,
	            where ``[start, end]`` is the overlap and ``section`` is the
	            matching pair from ``sections``; ``None`` if nothing overlaps.
	        """
	        for section in sections:
	            s = section[1]
	            if s[0] >= interval[1] or interval[0] >= s[1]:
	                # Disjoint: keep looking at the remaining sections.
	                continue
	            start = max(s[0], interval[0])
	            end = min(s[1], interval[1])
	            return [start, end], section
	        return None

	    def segment(self, resume_lines):
	        """Detect headers in ``resume_lines`` and return the section ranges.

	        Overlapping sections are only reported on stdout; they are not
	        resolved.

	        Args:
	            resume_lines: the resume as a list of line strings.

	        Returns:
	            A dict mapping section name to ``(start, end)``; empty when no
	            section header was found.
	        """
	        self.find_segment_indices(resume_lines)
	        sections = self.slice_segments(resume_lines)
	        if sections is None:
	            return {}

	        sections_list = list(sections.items())
	        intersection_intervals = []

	        for i, (name, interval) in enumerate(sections_list[:-1]):
	            result = self.get_interval_intersection(sections_list[i + 1:], interval)
	            if result is None:
	                continue
	            a, b = result
	            print(a, b, name)
	            intersection_intervals.append((a, b, name))

	        if intersection_intervals:
	            print("there are intersections", intersection_intervals)
	            # TODO: needs a last step that cleans overlapping intervals with a
	            # zero-shot classifier + subtracts intervals.
	        return sections

	    def get_parsed_sections(self, resume_lines):
	        """Segment the resume and return the text of every section.

	        Args:
	            resume_lines: the resume as a list of line strings.

	        Returns:
	            ``(text_segments, sections)``: ``text_segments`` maps a section
	            name to its list of lines, ``sections`` maps it to the
	            ``(start, end)`` range those lines were sliced from.
	        """
	        sections = self.segment(resume_lines)
	        text_segments = {
	            header_title: resume_lines[start:end]
	            for header_title, (start, end) in sections.items()
	        }
	        return text_segments, sections