import json
import logging

try:
    from flashtext import KeywordProcessor
except ImportError:  # keep the pure interval helpers importable without flashtext
    KeywordProcessor = None

logger = logging.getLogger(__name__)


class ResumeSegmenter:
    """Split a resume, given as a list of text lines, into named sections.

    Lines that look like section headers are detected with a flashtext
    ``KeywordProcessor`` loaded from a JSON file. Each section is then
    reported as a half-open ``(start, end)`` range of line indices.

    Attributes:
        resume_segments: maps a section name to the indices of the header
            lines detected for that section (in increasing order).
        resume_indices: indices of every detected header line, in increasing
            order.
        sections_path: path of the JSON file holding ``"section_headers"``.
    """

    # Section names pre-declared in ``resume_segments``.
    SECTION_NAMES = (
        'objective',
        'work_and_employment',
        'education_and_training',
        'skills',
        'accomplishments',
        'misc',
    )

    # A line with more words than this fraction of the average words per
    # line is treated as body text rather than a header.
    HEADER_MAX_WORDS_RATIO = 0.75

    # Default location of the header definitions (relative to the CWD).
    sections_path = "./sections.json"

    def __init__(self, sections_path=None):
        """Create an empty segmenter.

        Args:
            sections_path: optional override for the JSON file read by
                ``find_segment_indices``; defaults to ``./sections.json``.
        """
        if sections_path is not None:
            self.sections_path = sections_path
        self._reset_state()

    def _reset_state(self):
        """Forget the headers found by any previous run."""
        self.resume_segments = {name: [] for name in self.SECTION_NAMES}
        self.resume_indices = []

    def get_average_line_len(self, lines):
        """Return the mean number of characters per line (0.0 if no lines)."""
        if not lines:
            return 0.0
        return sum(len(line) for line in lines) / len(lines)

    def get_average_words_per_line(self, lines):
        """Return the mean number of space-separated tokens per line.

        Tokens come from ``line.split(' ')``, so consecutive spaces produce
        empty tokens that are counted too. Returns 0.0 when there are no lines.
        """
        if not lines:
            return 0.0
        # TODO: should stopwords be excluded from the count?
        return sum(len(line.split(' ')) for line in lines) / len(lines)

    def _build_keyword_processor(self):
        """Load the header keywords from ``sections_path`` into a KeywordProcessor."""
        if KeywordProcessor is None:
            raise ImportError(
                "flashtext is required to detect section headers"
            )
        with open(self.sections_path, encoding="utf-8") as f:
            data = json.load(f)
        keyword_processor = KeywordProcessor()
        # "section_headers" is handed to flashtext unchanged, so it is
        # expected to be in the shape add_keywords_from_dict accepts.
        keyword_processor.add_keywords_from_dict(
            keyword_dict=data["section_headers"]
        )
        return keyword_processor

    def find_segment_indices(self, text_list):
        """Detect header lines and record them on the instance.

        Results of any previous call are discarded first, so an instance can
        be reused for several resumes. A line is recorded as a header when it
        does not start with a lowercase letter, does not end with a period,
        contains at least one known header keyword and is short compared with
        the average line (see ``HEADER_MAX_WORDS_RATIO``).

        Args:
            text_list: the resume as a list of strings, one per line.

        Raises:
            ImportError: if flashtext is not installed.
            OSError, KeyError, ValueError: if the sections file is missing or
                malformed.
        """
        self._reset_state()
        if not text_list:
            return

        keyword_processor = self._build_keyword_processor()
        max_header_words = (
            self.get_average_words_per_line(text_list)
            * self.HEADER_MAX_WORDS_RATIO
        )

        for i, line in enumerate(text_list):
            # Empty lines cannot be headers (and would break the indexing below).
            if not line:
                continue
            if line[0].islower() or line[-1] == '.':
                continue
            kys = keyword_processor.extract_keywords(line)
            if not kys:
                continue
            # TODO: other stopwords? (nltk / POS tagger?)
            if len(line.split(" ")) > max_header_words:
                continue
            # TODO: is it necessary to keep the actual raw keyword?
            self.resume_indices.append(i)
            # setdefault tolerates section names present in the JSON file
            # but not pre-declared in SECTION_NAMES.
            self.resume_segments.setdefault(kys[0], []).append(i)

    def slice_segments(self, lines):
        """Turn the recorded header indices into per-section line ranges.

        Each section runs from its first header to the header that follows
        its last header, or to the end of ``lines`` when its last header is
        the final header of the resume. Everything before the first header
        is reported as ``"basics_info"``.

        Args:
            lines: the resume lines that were passed to
                ``find_segment_indices``.

        Returns:
            A dict mapping section name to a half-open ``(start, end)`` tuple,
            or ``None`` when no header was recorded.
        """
        if not self.resume_indices:
            return None

        sections = {}
        last_header = self.resume_indices[-1]
        for section, points in self.resume_segments.items():
            if not points:
                continue
            start_point, last_point = points[0], points[-1]
            if last_point == last_header:
                # No header follows: the section extends to the end of the
                # document, even when it also has earlier headers.
                end_point = len(lines)
            else:
                next_pos = self.resume_indices.index(last_point) + 1
                end_point = self.resume_indices[next_pos]
            sections[section] = (start_point, end_point)
        sections["basics_info"] = (0, self.resume_indices[0])
        return sections

    def get_interval_intersection(self, sections, interval):
        """Find the first section whose range overlaps ``interval``.

        Args:
            sections: iterable of ``(name, (start, end))`` pairs.
            interval: half-open ``(start, end)`` range to test.

        Returns:
            ``([start, end], section)`` for the first overlapping section,
            where ``[start, end]`` is the overlap, or ``None`` if no section
            overlaps.
        """
        for section in sections:
            s = section[1]
            if s[0] >= interval[1] or interval[0] >= s[1]:
                # Disjoint: keep looking at the remaining sections.
                continue
            start = max(s[0], interval[0])
            end = min(s[1], interval[1])
            return [start, end], section
        return None

    def segment(self, resume_lines):
        """Detect headers in ``resume_lines`` and return the section ranges.

        Overlapping section ranges are logged as a warning but are returned
        unchanged.

        Args:
            resume_lines: the resume as a list of strings, one per line.

        Returns:
            A dict mapping section name to ``(start, end)``; empty when no
            header was found.
        """
        self.find_segment_indices(resume_lines)
        sections = self.slice_segments(resume_lines)
        if sections is None:
            return {}

        sections_list = list(sections.items())
        intersection_intervals = []
        for i, (name, interval) in enumerate(sections_list[:-1]):
            result = self.get_interval_intersection(
                sections_list[i + 1:], interval
            )
            if result is None:
                continue
            overlap, other = result
            intersection_intervals.append((overlap, other, name))

        if intersection_intervals:
            logger.warning(
                "there are intersections %s", intersection_intervals
            )
            # TODO: resolve overlapping intervals (zero-shot classifier +
            # interval subtraction).
        return sections

    def get_parsed_sections(self, resume_lines):
        """Return the text of every section together with its line range.

        Args:
            resume_lines: the resume as a list of strings, one per line.

        Returns:
            ``(text_segments, sections)`` where ``text_segments`` maps section
            name to the list of lines in its range and ``sections`` is the
            mapping returned by ``segment``.
        """
        sections = self.segment(resume_lines)
        text_segments = {
            header_title: resume_lines[start:end]
            for header_title, (start, end) in sections.items()
        }
        return text_segments, sections