# Resume section segmentation utilities.
# (Web-viewer artifacts — page status lines, file-size banner, commit hash,
# and line-number gutter — removed from this scraped copy.)
from flashtext import KeywordProcessor
import json
class ResumeSegmenter:
    """Segment a resume, given as a list of text lines, into labeled sections.

    Section headers are detected with a flashtext ``KeywordProcessor`` built
    from the header aliases stored in ``./sections.json`` (key
    ``"section_headers"``); each detected header index is then turned into a
    half-open ``(start, end)`` line range per section.
    """

    def __init__(self):
        # canonical section name -> line indices where a header for that
        # section was detected
        self.resume_segments = {
            'objective': [],
            'work_and_employment': [],
            'education_and_training': [],
            'skills': [],
            'accomplishments': [],
            'misc': []
        }
        # all detected header line indices, in document order
        self.resume_indices = []

    def get_average_line_len(self, lines):
        """Return the mean character length of *lines* (0.0 for empty input)."""
        if not lines:  # guard: original raised ZeroDivisionError here
            return 0.0
        return sum(len(line) for line in lines) / len(lines)

    def get_average_words_per_line(self, lines):
        """Return the mean space-separated token count per line (0.0 if empty)."""
        if not lines:  # guard: original raised ZeroDivisionError here
            return 0.0
        # NOTE(review): split(' ') counts runs of spaces as extra "words";
        # confirm whether .split() (any whitespace) was intended.
        return sum(len(line.split(' ')) for line in lines) / len(lines)

    def find_segment_indices(self, text_list):
        """Scan *text_list* for section-header lines.

        Populates ``self.resume_indices`` (every header line number) and
        ``self.resume_segments`` (header line numbers grouped by section).
        Requires ``./sections.json`` to exist in the working directory.
        """
        with open(r"./sections.json") as f:
            section_headers = json.load(f)["section_headers"]
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(keyword_dict=section_headers)
        average_words_per_line = self.get_average_words_per_line(text_list)
        for i, line in enumerate(text_list):
            if not line:  # guard: original crashed on line[0] for empty lines
                continue
            # Heuristic: headers start uppercase and don't end a sentence.
            if line[0].islower() or line[-1] == '.':
                continue
            kys = keyword_processor.extract_keywords(line)
            if kys:
                # A line near/above average length is body text that merely
                # mentions a header keyword, not a header itself.
                if len(line.split(" ")) > average_words_per_line * 0.75:
                    continue
                self.resume_indices.append(i)
                self.resume_segments[kys[0]].append(i)

    def slice_segments(self, lines):
        """Convert detected header indices into per-section line ranges.

        Returns a dict ``section -> (start, end)`` (half-open), plus a
        ``"basics_info"`` entry covering everything before the first header.
        Returns ``None`` when no headers were detected.
        """
        if not self.resume_indices:
            return None
        sections = {}
        for section, points in self.resume_segments.items():
            if not points:
                continue
            start_point = points[0]
            tmp_end_point = points[-1]
            # A section runs until the next detected header in the document.
            end_point = self.resume_indices[min(
                self.resume_indices.index(tmp_end_point) + 1,
                len(self.resume_indices) - 1)]
            if start_point == self.resume_indices[-1]:
                # Last header in the document: section runs to end of file.
                end_point = len(lines)
            sections[section] = (start_point, end_point)
        sections["basics_info"] = (0, self.resume_indices[0])
        return sections

    def get_interval_intersection(self, sections, interval):
        """Return ``([start, end], section)`` for the first entry of *sections*
        whose ``(start, end)`` range overlaps *interval*, or ``None`` if none
        overlaps.
        """
        for section in sections:
            s = section[1]
            if s[0] >= interval[1] or interval[0] >= s[1]:
                # BUG FIX: the original did `return None` here, so only the
                # first candidate was ever examined; keep scanning instead.
                continue
            return [max(s[0], interval[0]), min(s[1], interval[1])], section
        return None

    def segment(self, resume_lines):
        """Segment *resume_lines*; return dict ``section -> (start, end)``.

        Also prints a diagnostic when any two section ranges overlap.
        Returns an empty dict when no section headers were detected
        (the original crashed with AttributeError on ``None.items()``).
        """
        self.find_segment_indices(resume_lines)
        sections = self.slice_segments(resume_lines)
        if sections is None:
            return {}
        sections_list = [(k, v) for k, v in sections.items() if len(v) > 0]
        intersection_intervals = []
        # Diagnostic pass: report any overlapping section ranges.
        for i, s in enumerate(sections_list[:-1]):
            result = self.get_interval_intersection(sections_list[i + 1:], s[1])
            if result is None:
                continue
            a, b = result
            print(a, b, s[0])
            intersection_intervals.append((a, b, s[0]))
        if len(intersection_intervals) > 0:
            print("there are intersections", intersection_intervals)
        # TODO: resolve overlapping intervals (zero-shot classifier +
        # interval subtraction), per the original author's note.
        return sections

    def get_parsed_sections(self, resume_lines):
        """Return ``({section: lines}, {section: (start, end)})`` for the resume."""
        text_segments = {}
        sections = self.segment(resume_lines)
        for header_title, section in sections.items():
            text_segments[header_title] = resume_lines[section[0]:section[1]]
        return text_segments, sections