create cv parser app
- Main.py +23 -0
- Models.py +58 -0
- ResumeParser.py +237 -0
- ResumeReader.py +100 -0
- ResumeSegmenter.py +259 -0
- app.py +18 -0
Main.py
ADDED
@@ -0,0 +1,23 @@
from ResumeReader import ResumeReader
from ResumeParser import ResumeParser
from Models import Models
import json


class Main:
    def __init__(self):
        models = Models()
        ner, ner_dates, zero_shot_classifier, tagger = models.load_trained_models()
        self.reader = ResumeReader()
        self.parser = ResumeParser(ner, ner_dates, zero_shot_classifier, tagger)

    def parse_cv(self, file_path):
        resume_lines = self.reader.read_file(file_path)
        return self.parser.parse(resume_lines)

    def save_parse_as_json(self, data, file_name):
        print("Saving the parse...")
        with open(file_name, 'w', encoding="utf-8") as f:
            json.dump(data, f, indent=4, default=str, ensure_ascii=False)
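
A minimal usage sketch (not part of the commit; the sample path and output name are invented): Main wires the reader and parser together end to end.

# Hypothetical local usage of the Main class above.
from Main import Main

main = Main()  # loads all four models; slow on the first run
parsed = main.parse_cv("samples/jane_doe.pdf")  # example path, not shipped with the repo
main.save_parse_as_json(parsed, "jane_doe.json")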
Models.py
ADDED
@@ -0,0 +1,58 @@
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from flair.data import Sentence
from flair.models import SequenceTagger
import pickle


class Models:

    def pickle_it(self, obj, file_name):
        with open(f'{file_name}.pickle', 'wb') as f:
            pickle.dump(obj, f)

    def unpickle_it(self, file_name):
        with open(f'{file_name}.pickle', 'rb') as f:
            return pickle.load(f)

    def load_trained_models(self, save_pickles=False):
        # NER for dates
        tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner-with-dates")
        model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner-with-dates")
        self.ner_dates = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

        # Zero-shot classification
        self.zero_shot_classifier = pipeline("zero-shot-classification", model='facebook/bart-large-mnli')
        # Lighter alternative:
        # self.zero_shot_classifier = pipeline("zero-shot-classification", model='valhalla/distilbart-mnli-12-6')

        # NER for general entities; grouped_entities=True is the older spelling
        # of aggregation_strategy="simple"
        tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
        model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
        self.ner = pipeline('ner', model=model, tokenizer=tokenizer, grouped_entities=True)

        # POS tagging
        self.tagger = SequenceTagger.load("flair/pos-english-fast")

        if save_pickles:
            self.pickle_models()

        return self.ner, self.ner_dates, self.zero_shot_classifier, self.tagger

    def pickle_models(self):
        self.pickle_it(self.ner, "ner")
        self.pickle_it(self.zero_shot_classifier, "zero_shot_classifier_6")
        self.pickle_it(self.ner_dates, "ner_dates")
        self.pickle_it(self.tagger, "pos_tagger_fast")

    def load_pickled_models(self):
        # Note the order differs from load_trained_models: (ner_dates, ner, ...)
        ner_dates = self.unpickle_it('ner_dates')
        ner = self.unpickle_it('ner')
        zero_shot_classifier = self.unpickle_it('zero_shot_classifier_6')
        tagger = self.unpickle_it("pos_tagger_fast")
        return ner_dates, ner, zero_shot_classifier, tagger

    def get_flair_sentence(self, sent):
        return Sentence(sent)
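
Much of the parser and segmenter below leans on the zero-shot pipeline's output contract: a dict whose labels come back sorted by descending score. A minimal sketch of that contract (the input text and candidate labels here are illustrative):

from transformers import pipeline

clf = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
out = clf("Senior Software Engineer", ["job title", "person name", "address"])
# out has keys "sequence", "labels", "scores"; labels[i] pairs with scores[i],
# and both are sorted so labels[0] carries the highest score.
print(out["labels"][0], round(out["scores"][0], 3))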
ResumeParser.py
ADDED
@@ -0,0 +1,237 @@
from Models import Models
from ResumeSegmenter import ResumeSegmenter
from datetime import datetime
from dateutil import parser
import re
from string import punctuation

class ResumeParser:
    def __init__(self, ner, ner_dates, zero_shot_classifier, tagger):
        self.models = Models()
        self.segmenter = ResumeSegmenter(zero_shot_classifier)
        self.ner, self.ner_dates, self.zero_shot_classifier, self.tagger = ner, ner_dates, zero_shot_classifier, tagger
        self.parsed_cv = {}

    def parse(self, resume_lines):
        resume_segments = self.segmenter.segment(resume_lines)
        print("Parsing the Resume...")
        for segment_name in resume_segments:
            if segment_name == "contact_info":
                self.parse_contact_info(resume_segments[segment_name])
            elif segment_name == "work_and_employment":
                self.parse_job_history(resume_segments[segment_name])
        return self.parsed_cv

    def parse_contact_info(self, contact_info):
        contact_info_dict = {}
        name = self.find_person_name(contact_info)
        email = self.find_contact_email(contact_info)
        self.parsed_cv['Name'] = name
        contact_info_dict["Email"] = email
        self.parsed_cv['Contact Info'] = contact_info_dict

    def find_person_name(self, items):
        class_score = []
        # Split on punctuation, keeping '&' (common in company names)
        splitter = re.compile(r'[{}]+'.format(re.escape(punctuation.replace("&", ""))))
        classes = ["person name", "address", "email", "title"]
        for item in items:
            elements = splitter.split(item)
            for element in elements:
                element = ''.join(i for i in element.strip() if not i.isdigit())
                if not len(element.strip().split()) > 1: continue
                out = self.zero_shot_classifier(element, classes)
                highest = sorted(zip(out["labels"], out["scores"]), key=lambda x: x[1])[-1]
                if highest[0] == "person name":
                    class_score.append((element, highest[1]))
        if len(class_score):
            return sorted(class_score, key=lambda x: x[1], reverse=True)[0][0]
        return ""

    def find_contact_email(self, items):
        for item in items:
            match = re.search(r'[\w.+-]+@[\w-]+\.[\w.-]+', item)
            if match:
                return match.group(0)
        return ""

    def parse_job_history(self, resume_segment):
        idx_job_title = self.get_job_titles(resume_segment)
        current_and_below = False
        if not len(idx_job_title):
            self.parsed_cv["Job History"] = []
            return
        if idx_job_title[0][0] == 0: current_and_below = True
        job_history = []
        for ls_idx, (idx, job_title) in enumerate(idx_job_title):
            job_info = {}
            job_info["Job Title"] = self.filter_job_title(job_title)
            # Company: look on the job-title line and the adjacent line
            if current_and_below: line1, line2 = idx, idx + 1
            else: line1, line2 = idx, idx - 1
            job_info["Company"] = self.get_job_company(line1, line2, resume_segment)
            if current_and_below: st_span = idx
            else: st_span = idx - 1
            # Dates: search between this job title and the next one
            if ls_idx == len(idx_job_title) - 1: end_span = len(resume_segment)
            else: end_span = idx_job_title[ls_idx + 1][0]
            start, end = self.get_job_dates(st_span, end_span, resume_segment)
            job_info["Start Date"] = start
            job_info["End Date"] = end
            job_history.append(job_info)
        self.parsed_cv["Job History"] = job_history

    def get_job_titles(self, resume_segment):
        classes = ["organization", "institution", "job title", "role"]
        idx_line = []
        for idx, line in enumerate(resume_segment):
            # Lines containing a verb are treated as descriptions, not titles
            has_verb = False
            sentence = self.models.get_flair_sentence(line)
            self.tagger.predict(sentence)
            for entity in sentence.get_spans('pos'):
                if entity.tag.startswith("V"):
                    has_verb = True
                    break
            if not has_verb:
                out = self.zero_shot_classifier(line, classes)
                class_score = zip(out["labels"], out["scores"])
                highest = sorted(class_score, key=lambda x: x[1])[-1]
                if highest[0] == "job title":
                    idx_line.append((idx, line))
        return idx_line

    def get_job_dates(self, st, end, resume_segment):
        search_span = resume_segment[st:end]
        dates = []
        for line in search_span:
            for dt in self.get_ner_in_line(line, "DATE"):
                if self.isvalidyear(dt.strip()):
                    dates.append(dt)
        if not dates:
            return ("", "")
        first = dates[0]
        second = dates[1] if len(dates) > 1 else None

        # A single NER span may already contain both years ("2015 - 2018")
        if self.has_two_dates(first):
            d1, d2 = self.get_two_dates(first)
            return self.format_date(d1), self.format_date(d2)
        if second and self.has_two_dates(second):
            d1, d2 = self.get_two_dates(second)
            return self.format_date(d1), self.format_date(d2)
        if second:
            return self.format_date(first), self.format_date(second)
        return self.format_date(first), ""

    def filter_job_title(self, job_title):
        job_title_splitter = re.compile(r'[{}]+'.format(re.escape(punctuation.replace("&", ""))))
        job_title = ''.join(i for i in job_title if not i.isdigit())
        tokens = job_title_splitter.split(job_title)
        # Keep letters and spaces only
        tokens = [''.join([i for i in tok.strip() if (i.isalpha() or i.strip() == "")]) for tok in tokens if tok.strip()]
        classes = ["company", "organization", "institution", "job title", "responsibility", "details"]
        new_title = []
        for token in tokens:
            if not token: continue
            res = self.zero_shot_classifier(token, classes)
            class_score = zip(res["labels"], res["scores"])
            highest = sorted(class_score, key=lambda x: x[1])[-1]
            if highest[0] == "job title":
                new_title.append(token.strip())
        if len(new_title):
            return ', '.join(new_title)
        else: return ', '.join(tokens)

    def has_two_dates(self, date):
        years = self.get_valid_years()
        count = 0
        for year in years:
            if year in str(date):
                count += 1
        return count == 2

    def get_two_dates(self, date):
        years = self.get_valid_years()
        idxs = []
        for year in years:
            if year in date:
                idxs.append(date.index(year))
        # Split just after the first 4-digit year
        min_idx = min(idxs)
        first = date[:min_idx + 4]
        second = date[min_idx + 4:]
        return first, second

    def get_valid_years(self):
        # Years from a century ago up to and including the current year
        current_year = datetime.today().year
        return [str(i) for i in range(current_year - 100, current_year + 1)]

    def format_date(self, date):
        out = self.parse_date(date)
        if out:
            return out
        date = self.clean_date(date)
        out = self.parse_date(date)
        if out:
            return out
        return date

    def clean_date(self, date):
        try:
            return ''.join(i for i in date if i.isalnum() or i == '-' or i == '/')
        except Exception:
            return date

    def parse_date(self, date):
        try:
            date = parser.parse(date)
            return date.strftime("%m-%Y")
        except Exception:
            # Unparseable input: return a falsy sentinel so format_date can retry
            return 0

    def isvalidyear(self, date):
        # True if the string contains any plausible 4-digit year
        for year in self.get_valid_years():
            if year in str(date):
                return True
        return False

    def get_ner_in_line(self, line, entity_type):
        if entity_type == "DATE": ner = self.ner_dates
        else: ner = self.ner
        return [i['word'] for i in ner(line) if i['entity_group'] == entity_type]

    def get_job_company(self, idx, idx1, resume_segment):
        job_title = resume_segment[idx]
        if not idx1 <= len(resume_segment) - 1: context = ""
        else: context = resume_segment[idx1]
        candidate_companies = self.get_ner_in_line(job_title, "ORG") + self.get_ner_in_line(context, "ORG")
        classes = ["organization", "company", "institution", "not organization", "not company", "not institution"]
        scores = []
        for comp in candidate_companies:
            res = self.zero_shot_classifier(comp, classes)['scores']
            scores.append(max(res[:3]))
        sorted_cmps = sorted(zip(candidate_companies, scores), key=lambda x: x[1], reverse=True)
        if len(sorted_cmps): return sorted_cmps[0][0]
        return context
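
For intuition, a sketch of what the date helpers above are built around (the range string is made up): dateutil handles single-date formatting, and get_two_dates splits a range just after the first 4-digit year it finds.

from dateutil import parser

# parse_date defers to dateutil and reformats to "%m-%Y"
print(parser.parse("June 2015").strftime("%m-%Y"))  # 06-2015

# get_two_dates mirrors date[:min_idx + 4] / date[min_idx + 4:]
date = "June 2015 - August 2018"
cut = date.index("2015") + 4
print(date[:cut], "|", date[cut:])  # June 2015 |  - August 2018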
ResumeReader.py
ADDED
@@ -0,0 +1,100 @@
import re
import os
import logging
import docx
import pdfplumber

class ResumeReader:

    def convert_docx_to_txt(self, docx_file, docx_parser):
        """
        A utility function to convert Microsoft docx files to raw text.

        This code is largely borrowed from existing solutions, and does not match the style of the rest of this repo.
        :param docx_file: docx file which gets uploaded by the user
        :type docx_file: InMemoryUploadedFile
        :return: The text contents of the docx file
        :rtype: str
        """
        doc = docx.Document(docx_file)
        all_text = [para.text for para in doc.paragraphs]
        text = ' '.join(all_text)
        try:
            clean_text = re.sub(r'\n+', '\n', text)
            clean_text = clean_text.replace("\r", "\n").replace("\t", " ")  # normalize text blob
            resume_lines = clean_text.splitlines()  # split text blob into individual lines
            resume_lines = [re.sub(r'\s+', ' ', line.strip()) for line in resume_lines if
                            line.strip()]  # remove empty strings and whitespace
            return resume_lines, text
        except Exception as e:
            logging.error('Error in docx file: ' + str(e))
            return [], " "

    def convert_pdf_to_txt(self, pdf_file):
        """
        A utility function to convert a machine-readable PDF to raw text.

        This code is largely borrowed from existing solutions, and does not match the style of the rest of this repo.
        :param pdf_file: Path to the .pdf file which should be converted
        :type pdf_file: str
        :return: The text contents of the pdf
        :rtype: str
        """
        pdf = pdfplumber.open(pdf_file)
        raw_text = ""
        for page in pdf.pages:
            # extract_text() returns None for image-only pages
            raw_text += (page.extract_text() or "") + "\n"
        pdf.close()

        try:
            full_string = re.sub(r'\n+', '\n', raw_text)
            full_string = full_string.replace("\r", "\n")
            full_string = full_string.replace("\t", " ")

            # Remove awkward LaTeX bullet characters
            full_string = re.sub(r"\uf0b7", " ", full_string)
            full_string = re.sub(r"\(cid:\d{0,3}\)", " ", full_string)
            full_string = re.sub(r'• ', " ", full_string)

            # Split text blob into individual lines
            resume_lines = full_string.splitlines(True)

            # Remove empty strings and whitespace
            resume_lines = [re.sub(r'\s+', ' ', line.strip()) for line in resume_lines if line.strip()]

            return resume_lines, raw_text
        except Exception as e:
            logging.error('Error in pdf file: ' + str(e))
            return [], " "

    def read_file(self, file, docx_parser="tika"):
        """
        file : path of the resume file
        docx_parser : "docx2txt" or "tika"; defaults to "tika"
        """
        print("Reading the Resume...")
        file = os.path.join(file)
        if file.endswith('docx') or file.endswith('doc'):
            # if file.endswith('doc') and docx_parser == "docx2txt":
            #     docx_parser = "tika"
            #     logging.error("doc format not supported by docx2txt, falling back to tika")
            resume_lines, raw_text = self.convert_docx_to_txt(file, docx_parser)
        elif file.endswith('pdf'):
            resume_lines, raw_text = self.convert_pdf_to_txt(file)
        elif file.endswith('txt'):
            with open(file, 'r', encoding='utf-8') as f:
                resume_lines = f.readlines()
        else:
            resume_lines = None
        return resume_lines
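
A small round-trip sketch for the reader (the fixture file is invented; only the .txt branch is exercised, which returns readlines() output as-is, trailing newlines included):

from ResumeReader import ResumeReader

with open("resume.txt", "w", encoding="utf-8") as f:  # toy fixture
    f.write("Jane Doe\njane@example.com\nExperience\nSoftware Engineer\n")

lines = ResumeReader().read_file("resume.txt")
print(lines)  # one entry per input line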
ResumeSegmenter.py
ADDED
@@ -0,0 +1,259 @@
class ResumeSegmenter:

    def __init__(self, zero_shot_classifier):
        self.zero_shot_classifier = zero_shot_classifier

    objective = (
        'career goal',
        'objective',
        'career objective',
        'employment objective',
        'professional objective',
        'summary',
        'summary of qualifications',
        'digital'
    )

    work_and_employment = (
        'employment history',
        'employment data',
        'career summary',
        'work history',
        'work experience',
        'experience',
        'professional experience',
        'professional background',
        'professional employment',
        'additional experience',
        'career related experience',
        'professional employment history',
        'related experience',
        'programming experience',
        'freelance',
        'freelance experience',
        'army experience',
        'military experience',
        'military background',
    )

    education_and_training = (
        'academic background',
        'academic experience',
        'programs',
        'courses',
        'related courses',
        'education',
        'educational background',
        'educational qualifications',
        'educational training',
        'education and training',
        'training',
        'academic training',
        'professional training',
        'course project experience',
        'related course projects',
        'internship experience',
        'internships',
        'apprenticeships',
        'college activities',
        'certifications',
        'special training',
    )

    skills_header = (
        'credentials',
        'qualifications',
        'areas of experience',
        'areas of expertise',
        'areas of knowledge',
        'skills',
        'other skills',
        'other abilities',
        'career related skills',
        'professional skills',
        'specialized skills',
        'technical skills',
        'computer skills',
        'personal skills',
        'computer knowledge',
        'technologies',
        'technical experience',
        'proficiencies',
        'languages',
        'language competencies and skills',
        'programming languages',
        'competencies'
    )

    misc = (
        'activities and honors',
        'activities',
        'affiliations',
        'professional affiliations',
        'associations',
        'professional associations',
        'memberships',
        'professional memberships',
        'athletic involvement',
        'community involvement',
        'refere',  # prefix match for 'references' / 'referees'
        'civic activities',
        'extra-curricular activities',
        'professional activities',
        'volunteer work',
        'volunteer experience',
        'additional information',
        'interests'
    )

    accomplishments = (
        'achievement',
        'awards and achievements',
        'licenses',
        'presentations',
        'conference presentations',
        'conventions',
        'dissertations',
        'exhibits',
        'papers',
        'publications',
        'professional publications',
        'research experience',
        'research grants',
        'project',
        'research projects',
        'personal projects',
        'current research interests',
        'thesis',
        'theses',
    )

    def find_segment_indices(self, string_to_search, resume_segments, resume_indices):
        # One keyword table per section replaces the original repeated
        # try/except blocks; a header is recorded once per matched keyword.
        section_keywords = {
            'objective': self.objective,
            'work_and_employment': self.work_and_employment,
            'education_and_training': self.education_and_training,
            'skills': self.skills_header,
            'misc': self.misc,
            'accomplishments': self.accomplishments,
        }
        for i, line in enumerate(string_to_search):
            # Headers are assumed to start with an uppercase letter
            if line[0].islower():
                continue
            header = line.lower()
            for section, keywords in section_keywords.items():
                matches = [k for k in keywords if header.startswith(k)]
                if matches:
                    if matches[0] not in resume_segments[section]:
                        resume_indices.append(i)
                        resume_segments[section][matches[0]] = i
                    break

    def slice_segments(self, string_to_search, resume_segments, resume_indices):
        resume_segments['contact_info'] = string_to_search[:resume_indices[0]]
        sec_idxs = {}
        for section, value in resume_segments.items():
            if section == 'contact_info':
                continue
            for sub_section, start_idx in value.items():
                # A sub-section runs until the next detected header (or EOF)
                end_idx = len(string_to_search)
                if (resume_indices.index(start_idx) + 1) != len(resume_indices):
                    end_idx = resume_indices[resume_indices.index(start_idx) + 1]
                sec_idxs[section] = (start_idx, end_idx)
                resume_segments[section][sub_section] = string_to_search[start_idx:end_idx]
        return sec_idxs

    def find_true_segment(self, dict_of_segments, segment_name):
        segment_classes = {
            'objective': ["objective", "other"],
            'work_and_employment': ["employment history", "other"],
            'education_and_training': ["education", "other"],
            'skills': ["skills", "other"],
            'accomplishments': ["accomplishments", "other"],
            'misc': ["misc", "other"],
            'contact_info': ["contact information", "other"]
        }
        classes = segment_classes[segment_name]
        scores = []
        for seg in dict_of_segments:
            sequence = dict_of_segments[seg]
            score = self.zero_shot_classifier(' '.join(sequence), classes)["scores"][0]
            scores.append(score)
        res = sorted(zip(dict_of_segments.keys(), scores), key=lambda x: x[1], reverse=True)
        if len(res):
            return res[0][0]
        else: return 0

    def segment(self, string_to_search):
        print("Segmenting the Resume...")
        resume_segments = {
            'objective': {},
            'work_and_employment': {},
            'education_and_training': {},
            'skills': {},
            'accomplishments': {},
            'misc': {}
        }

        resume_indices = []

        self.find_segment_indices(string_to_search, resume_segments, resume_indices)
        if len(resume_indices) != 0:
            sec_idx = self.slice_segments(string_to_search, resume_segments, resume_indices)
        else:
            resume_segments['contact_info'] = []

        for segment in resume_segments:
            if segment == "contact_info": continue
            if not len(resume_segments[segment]) > 1:
                if len(resume_segments[segment]) == 1:
                    only_key = list(resume_segments[segment].keys())[0]
                    # Drop the header line itself
                    resume_segments[segment] = resume_segments[segment][only_key][1:]
                continue
            # Several candidate sub-sections: only work history is
            # disambiguated, using the zero-shot classifier
            if segment != "work_and_employment": continue
            true_seg = self.find_true_segment(resume_segments[segment], segment)
            if not true_seg:
                resume_segments[segment] = []
            else:
                resume_segments[segment] = resume_segments[segment][true_seg][1:]

        return resume_segments
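
A toy run of the header matching (fabricated lines; with at most one matched header per section, find_true_segment is never reached, so no classifier is needed and None can be passed):

from ResumeSegmenter import ResumeSegmenter

lines = [
    "Jane Doe",
    "jane@example.com",
    "Work Experience",
    "Software Engineer at Acme",
    "Education",
    "BSc Computer Science",
]
segments = ResumeSegmenter(zero_shot_classifier=None).segment(lines)
print(segments["contact_info"])         # ['Jane Doe', 'jane@example.com']
print(segments["work_and_employment"])  # ['Software Engineer at Acme']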
app.py
ADDED
@@ -0,0 +1,18 @@
import gradio as gr
from Main import Main


main = Main()

def parse_cv(cv):
    return main.parse_cv(cv.name)


description = """A demo for a CV parser built with HuggingFace's transformers."""
article = "Find the code on GitHub <a href='https://github.com/asimokby/cv-parser-huggingface'>here</a>."
# gr.inputs and allow_screenshot belong to the legacy (pre-3.0) Gradio API
file_input = gr.inputs.File(file_count="single", type="file", label="Upload a CV", optional=False)
iface = gr.Interface(fn=parse_cv, inputs=file_input, outputs="json", allow_flagging="never",
                     allow_screenshot=False, title="CV Parser", theme="dark", description=description, article=article)

iface.launch()
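
To try the Space locally, something like the following should work; the dependency list is an assumption (this commit ships no requirements file), and the Gradio pin reflects the legacy gr.inputs API used above:

# pip install "gradio<3" transformers flair pdfplumber python-docx python-dateutil torch
# python app.py
# Gradio then prints a local URL serving the "CV Parser" interface.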