Spaces:
Running
Running
from flashtext import KeywordProcessor | |
import json | |
class ResumeSegmenter:
    """Split a resume, given as a list of text lines, into named sections.

    Header lines are detected with a flashtext ``KeywordProcessor`` loaded
    from the ``section_headers`` mapping in ``sections.json``. Each detected
    header is recorded by line index, and the indices are then turned into
    ``(start, end)`` line spans per section.

    Attributes:
        resume_segments: maps a section name to the line indices of the
            headers that were attributed to that section.
        resume_indices: line indices of every detected header, in the order
            the lines were scanned (ascending).
    """

    # Keyword dictionary location; resolved relative to the working directory.
    SECTIONS_FILE = r"./sections.json"
    # A keyword-bearing line with more words than this fraction of the average
    # words-per-line is treated as body text rather than as a header.
    HEADER_LENGTH_RATIO = 0.75

    def __init__(self):
        self.resume_segments = {
            'objective': [],
            'work_and_employment': [],
            'education_and_training': [],
            'skills': [],
            'accomplishments': [],
            'misc': [],
        }
        self.resume_indices = []

    def get_average_line_len(self, lines) -> float:
        """Return the mean number of characters per line (0.0 if no lines)."""
        if not lines:
            return 0.0
        return sum(len(line) for line in lines) / len(lines)

    def get_average_words_per_line(self, lines) -> float:
        """Return the mean number of space-separated tokens per line.

        Tokens are produced by ``line.split(' ')``, so consecutive spaces
        yield empty tokens that are counted too. Returns 0.0 if ``lines`` is
        empty.
        """
        if not lines:
            return 0.0
        # TODO: consider ignoring stopwords when counting words.
        return sum(len(line.split(' ')) for line in lines) / len(lines)

    def find_segment_indices(self, text_list) -> None:
        """Detect section-header lines and record their indices.

        Populates ``self.resume_indices`` and ``self.resume_segments``. Any
        results from a previous call are discarded first, so one instance can
        be reused for several resumes.

        A line is accepted as a header when it is non-empty, does not start
        with a lowercase character, does not end with ``'.'``, contains at
        least one known keyword, and is not longer (in words) than
        ``HEADER_LENGTH_RATIO`` times the average words-per-line.

        Args:
            text_list: the resume as a list of strings, one per line.

        Raises:
            OSError: if ``sections.json`` cannot be opened.
            KeyError: if the JSON document has no ``"section_headers"`` key.
        """
        # Indices are only meaningful for the resume being processed now.
        self.resume_indices = []
        self.resume_segments = {name: [] for name in self.resume_segments}

        with open(self.SECTIONS_FILE, encoding="utf-8") as f:
            section_headers = json.load(f)["section_headers"]

        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(keyword_dict=section_headers)

        average_words_per_line = self.get_average_words_per_line(text_list)
        max_header_words = average_words_per_line * self.HEADER_LENGTH_RATIO

        for i, line in enumerate(text_list):
            # Empty lines cannot be headers (and have no first/last char).
            if not line:
                continue
            if line[0].islower() or line[-1] == '.':
                continue
            kys = keyword_processor.extract_keywords(line)
            if not kys:
                continue
            # TODO: other stopwords? (nltk / POS tagger?)
            if len(line.split(" ")) > max_header_words:
                continue
            # TODO: is it necessary to keep the actual raw keyword?
            self.resume_indices.append(i)
            # The first extracted keyword names the section; tolerate section
            # names that exist in sections.json but were not pre-seeded here.
            self.resume_segments.setdefault(kys[0], []).append(i)

    def slice_segments(self, lines):
        """Turn the recorded header indices into ``(start, end)`` line spans.

        For each section with at least one header, the span starts at the
        section's first header and ends at the header that follows the
        section's last header, or at ``len(lines)`` when the section's last
        header is the final header in the resume. An extra ``"basics_info"``
        span covers the lines before the first header.

        Args:
            lines: the resume lines that were given to
                ``find_segment_indices``.

        Returns:
            A dict mapping section name to ``(start, end)`` (end exclusive),
            or ``None`` when no header has been recorded.
        """
        if not self.resume_indices:
            return None

        sections = {}
        last_header = self.resume_indices[-1]
        for section, points in self.resume_segments.items():
            if not points:
                continue
            start_point = points[0]
            tmp_end_point = points[-1]
            if tmp_end_point == last_header:
                # Nothing follows the last header: run to the end of the
                # resume so the trailing content is not dropped.
                end_point = len(lines)
            else:
                next_position = self.resume_indices.index(tmp_end_point) + 1
                end_point = self.resume_indices[next_position]
            sections[section] = (start_point, end_point)
        sections["basics_info"] = (0, self.resume_indices[0])
        return sections

    def get_interval_intersection(self, sections, interval):
        """Find the first section whose span overlaps ``interval``.

        Args:
            sections: iterable of ``(name, (start, end))`` pairs.
            interval: ``(start, end)`` pair to test against each section.

        Returns:
            ``([start, end], section)`` for the first overlapping entry of
            ``sections``, where ``[start, end]`` is the overlap, or ``None``
            when no entry overlaps. Spans that merely touch do not overlap.
        """
        for section in sections:
            s = section[1]
            if s[0] >= interval[1] or interval[0] >= s[1]:
                # Disjoint: keep looking at the remaining sections.
                continue
            start = max(s[0], interval[0])
            end = min(s[1], interval[1])
            return [start, end], section
        return None

    def segment(self, resume_lines):
        """Detect headers in ``resume_lines`` and return the section spans.

        Overlapping spans are reported on stdout but not resolved.

        Args:
            resume_lines: the resume as a list of strings, one per line.

        Returns:
            A dict mapping section name to ``(start, end)``; empty when no
            header was detected.
        """
        self.find_segment_indices(resume_lines)
        sections = self.slice_segments(resume_lines)
        if sections is None:
            return {}

        sections_list = list(sections.items())
        intersection_intervals = []
        for i, (name, span) in enumerate(sections_list[:-1]):
            result = self.get_interval_intersection(sections_list[i + 1:], span)
            if result is None:
                continue
            a, b = result
            print(a, b, name)
            intersection_intervals.append((a, b, name))
        if intersection_intervals:
            print("there are intersections", intersection_intervals)
        # TODO: clean up overlapping intervals (zero-shot classifier +
        # interval subtraction).
        return sections

    def get_parsed_sections(self, resume_lines):
        """Segment the resume and return the lines belonging to each section.

        Args:
            resume_lines: the resume as a list of strings, one per line.

        Returns:
            A ``(text_segments, sections)`` tuple: ``sections`` maps section
            name to its ``(start, end)`` span and ``text_segments`` maps the
            same names to ``resume_lines[start:end]``. Both are empty when no
            header was detected.
        """
        sections = self.segment(resume_lines)
        text_segments = {
            header_title: resume_lines[span[0]:span[1]]
            for header_title, span in sections.items()
        }
        return text_segments, sections