Priyanka-Balivada committed
Commit 7a84be6
1 parent: e08847c

Upload main files

Files changed (2)
  1. app.py +273 -0
  2. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,273 @@
+ import streamlit as st
+ import nltk
+ from gensim.models.doc2vec import Doc2Vec, TaggedDocument
+ from nltk.tokenize import word_tokenize
+ import PyPDF2
+ import pandas as pd
+ import re
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import spacy
+
+ nltk.download('punkt')
+
+ # Custom spaCy NER model that tags resume entities such as SKILLS, QUALIFICATION and PHONE
+ nlp_model_path = "en_Resume_Matching_Keywords"
+ nlp = spacy.load(nlp_model_path)
+
+ # Regexes for CGPA-like floats, e-mail addresses and 10-digit phone numbers
+ float_regex = re.compile(r'^\d{1,2}(\.\d{1,2})?$')
+ email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
+ float_digit_regex = re.compile(r'^\d{10}$')
+ email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')
+
+
+ def extract_text_from_pdf(pdf_file):
+     # Concatenate the text extracted from every page of the uploaded PDF
+     pdf_reader = PyPDF2.PdfReader(pdf_file)
+     text = ""
+     for page_num in range(len(pdf_reader.pages)):
+         text += pdf_reader.pages[page_num].extract_text()
+     return text
+
+
+ def tokenize_text(text, nlp_model):
+     # Run the NER model and return (entity text, entity label) pairs
+     doc = nlp_model(text, disable=["tagger", "parser"])
+     tokens = [(token.text.lower(), token.label_) for token in doc.ents]
+     return tokens
+
+
+ def extract_cgpa(resume_text):
+     # Regular expression pattern for CGPA extraction
+     cgpa_pattern = r'\b(?:CGPA|GPA|C\.G\.PA|Cumulative GPA)\s*:?[\s-]([0-9]+(?:\.[0-9]+)?)\b|\b([0-9]+(?:\.[0-9]+)?)\s(?:CGPA|GPA)\b'
+
+     # Search for the CGPA pattern in the text
+     match = re.search(cgpa_pattern, resume_text, re.IGNORECASE)
+
+     # Check if a match is found
+     if match:
+         # Extract the CGPA value from whichever alternative matched
+         cgpa = match.group(1) if match.group(1) else match.group(2)
+         return float(cgpa)
+     else:
+         return None
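+
+ # Illustrative example (hypothetical sample text, not from a real resume):
+ # extract_cgpa("Graduated with CGPA: 8.7") would return 8.7; text with no CGPA/GPA figure returns None.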
+
+
+ def extract_skills(text, skills_keywords):
+     # Keep only the keywords that appear as whole words in the text
+     skills = [skill.lower()
+               for skill in skills_keywords if re.search(r'\b' + re.escape(skill.lower()) + r'\b', text.lower())]
+     return skills
+
+
+ def preprocess_text(text):
+     # Lowercase and tokenize a text for Doc2Vec
+     return word_tokenize(text.lower())
+
+
+ def train_doc2vec_model(documents):
+     # Train a small Doc2Vec model on the tagged resume documents
+     model = Doc2Vec(vector_size=20, min_count=2, epochs=50)
+     model.build_vocab(documents)
+     model.train(documents, total_examples=model.corpus_count,
+                 epochs=model.epochs)
+     return model
+
+
+ def calculate_similarity(model, text1, text2):
+     # Cosine similarity between the inferred vectors of two texts
+     vector1 = model.infer_vector(preprocess_text(text1))
+     vector2 = model.infer_vector(preprocess_text(text2))
+     return model.dv.cosine_similarities(vector1, [vector2])[0]
+
+
+ def accuracy_calculation(true_positives, false_positives, false_negatives):
+     total = true_positives + false_positives + false_negatives
+     accuracy = true_positives / total if total != 0 else 0
+     return accuracy
+
+
+ # Streamlit Frontend
+ st.markdown("# Resume Matching Tool 📃📃")
+ st.markdown("An application to match resumes with a job description.")
+
+ # Sidebar - File Upload for Resumes
+ st.sidebar.markdown("## Upload Resumes PDF")
+ resumes_files = st.sidebar.file_uploader(
+     "Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)
+
+ if resumes_files:
+     # Sidebar - File Upload for Job Description
+     st.sidebar.markdown("## Upload Job Description PDF")
+     job_descriptions_file = st.sidebar.file_uploader(
+         "Upload Job Description PDF", type=["pdf"])
+
+     if job_descriptions_file:
+
+         # Backend Processing
+         job_description_text = extract_text_from_pdf(job_descriptions_file)
+         resumes_texts = [extract_text_from_pdf(
+             resume_file) for resume_file in resumes_files]
+         job_description_tokens = tokenize_text(job_description_text, nlp)
+
+         # st.subheader("Matching Keywords")
+
+         # Initialize counters
+         overall_skill_matches = 0
+         overall_qualification_matches = 0
+
+         # Create a list to store individual results
+         results_list = []
+         job_skills = set()
+         job_qualifications = set()
+
+         # Collect the qualification and skill entities found in the job description
+         for job_token, job_label in job_description_tokens:
+             if job_label == 'QUALIFICATION':
+                 job_qualifications.add(job_token.replace('\n', ' '))
+             elif job_label == 'SKILLS':
+                 job_skills.add(job_token.replace('\n', ' '))
+
+         job_skills_number = len(job_skills)
+         job_qualifications_number = len(job_qualifications)
+
+         # List to store counts of matched skills for all resumes
+         skills_counts_all_resumes = []
+
+         # Iterate over all uploaded resumes
+         for uploaded_resume in resumes_files:
+             resume_text = extract_text_from_pdf(uploaded_resume)
+             resume_tokens = tokenize_text(resume_text, nlp)
+
+             # Initialize counters for the individual resume
+             skillMatch = 0
+             qualificationMatch = 0
+             cgpa = ""
+
+             # Sets to store matched skills, qualifications and contact details for each resume
+             matched_skills = set()
+             matched_qualifications = set()
+             email = set()
+             phone = set()
+             name = set()
+
+             # Compare the tokens in the resume with the job description
+             for resume_token, resume_label in resume_tokens:
+                 for job_token, job_label in job_description_tokens:
+                     if resume_token.lower().replace('\n', ' ') == job_token.lower().replace('\n', ' '):
+                         if resume_label == 'SKILLS':
+                             matched_skills.add(resume_token.replace('\n', ' '))
+                         elif resume_label == 'QUALIFICATION':
+                             matched_qualifications.add(resume_token.replace('\n', ' '))
+                         elif resume_label == 'PHONE' and bool(float_digit_regex.match(resume_token)):
+                             phone.add(resume_token)
+
+             skillMatch = len(matched_skills)
+             qualificationMatch = len(matched_qualifications)
+
+             # Collect e-mail addresses found anywhere in the resume text
+             email_set = set(re.findall(email_pattern, resume_text.replace('\n', ' ')))
+             email.update(email_set)
+
+             # If a 10-digit phone number is glued to an e-mail address, split it out
+             numberphone = ""
+             for email_str in list(email):  # iterate over a copy so the set can be modified safely
+                 numberphone = email_with_phone_regex.search(email_str)
+                 if numberphone:
+                     email.remove(email_str)
+                     val = numberphone.group(1) or numberphone.group(2)
+                     phone.add(val)
+                     email.add(email_str.strip(val))
+
+             # Increment overall counters based on matches
+             overall_skill_matches += skillMatch
+             overall_qualification_matches += qualificationMatch
+
+             # Add the count of matched skills for this resume to the list
+             skills_counts_all_resumes.append(
+                 [resume_text.count(skill.lower()) for skill in job_skills])
+
+             # Create a dictionary for the current resume and append it to the results list
+             result_dict = {
+                 "Resume": uploaded_resume.name,
+                 "Similarity Score": (skillMatch / job_skills_number) * 100 if job_skills_number else 0,
+                 "Skill Matches": skillMatch,
+                 "Matched Skills": matched_skills,
+                 "CGPA": extract_cgpa(resume_text),
+                 "Email": email,
+                 "Phone": phone,
+                 "Qualification Matches": qualificationMatch,
+                 "Matched Qualifications": matched_qualifications
+             }
+
+             results_list.append(result_dict)
+
+         # Display overall matches
+         st.subheader("Overall Matches")
+         st.write(f"Total Skill Matches: {overall_skill_matches}")
+         st.write(
+             f"Total Qualification Matches: {overall_qualification_matches}")
+         st.write(f"Job Qualifications: {job_qualifications}")
+         st.write(f"Job Skills: {job_skills}")
+
+         # Display individual results in a table
+         results_df = pd.DataFrame(results_list)
+         st.subheader("Individual Results")
+         st.dataframe(results_df)
+
+         # Train a Doc2Vec model on the uploaded resumes for the skills heatmap below
+         tagged_resumes = [TaggedDocument(words=preprocess_text(
+             text), tags=[str(i)]) for i, text in enumerate(resumes_texts)]
+         model_resumes = train_doc2vec_model(tagged_resumes)
+
+         st.subheader("\nHeatmap:")
+
+         # Get skills keywords from user input
+         skills_keywords_input = st.text_input(
+             "Enter skills keywords separated by commas (e.g., python, java, machine learning):")
+         skills_keywords = [skill.strip()
+                            for skill in skills_keywords_input.split(',') if skill.strip()]
+
+         if skills_keywords:
+             # Calculate the similarity score between each skill keyword and each resume text
+             skills_similarity_scores = []
+             for resume_text in resumes_texts:
+                 resume_text_similarity_scores = []
+                 for skill in skills_keywords:
+                     similarity_score = calculate_similarity(
+                         model_resumes, resume_text, skill)
+                     resume_text_similarity_scores.append(similarity_score)
+                 skills_similarity_scores.append(resume_text_similarity_scores)
+
+             # Create a DataFrame of the similarity scores, indexed by the names of the uploaded PDFs
+             skills_similarity_df = pd.DataFrame(
+                 skills_similarity_scores, columns=skills_keywords,
+                 index=[resume_file.name for resume_file in resumes_files])
+
+             # Plot the heatmap
+             fig, ax = plt.subplots(figsize=(12, 8))
+             sns.heatmap(skills_similarity_df,
+                         cmap='YlGnBu', annot=True, fmt=".2f", ax=ax)
+             ax.set_title('Heatmap for Skills Similarity')
+             ax.set_xlabel('Skills')
+             ax.set_ylabel('Resumes')
+
+             # Keep the y-axis labels horizontal for better readability
+             plt.yticks(rotation=0)
+
+             # Display the Matplotlib figure using st.pyplot()
+             st.pyplot(fig)
+         else:
+             st.write("Please enter at least one skill keyword.")
+
+     else:
+         st.warning("Please upload the Job Description PDF to proceed.")
+ else:
+     st.warning("Please upload Resumes PDF to proceed.")
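
For reference, the Doc2Vec scoring behind the heatmap can be exercised outside Streamlit. The snippet below is a minimal sketch that mirrors preprocess_text, train_doc2vec_model and calculate_similarity from app.py; the resume strings and the skill keyword are made-up sample inputs, not data from this commit.

import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

nltk.download('punkt')

# Hypothetical sample texts standing in for uploaded resumes and a user-entered skill keyword
resumes_texts = [
    "Python developer with machine learning and data analysis experience",
    "Data analyst skilled in Python, machine learning and visualization",
]
skill = "machine learning"

# Same preprocessing and Doc2Vec parameters as app.py
tagged = [TaggedDocument(words=word_tokenize(text.lower()), tags=[str(i)])
          for i, text in enumerate(resumes_texts)]
model = Doc2Vec(vector_size=20, min_count=2, epochs=50)
model.build_vocab(tagged)
model.train(tagged, total_examples=model.corpus_count, epochs=model.epochs)

# Cosine similarity between inferred vectors, as in calculate_similarity()
vector_resume = model.infer_vector(word_tokenize(resumes_texts[0].lower()))
vector_skill = model.infer_vector(word_tokenize(skill.lower()))
print(model.dv.cosine_similarities(vector_resume, [vector_skill])[0])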
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ streamlit
+ spacy
+ https://huggingface.co/Priyanka-Balivada/en_Resume_Matching_Keywords/resolve/main/en_Resume_Matching_Keywords-any-py3-none-any.whl
+ nltk
+ gensim
+ PyPDF2
+ pandas
+ matplotlib
+ seaborn
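
With these dependencies installed (for example via pip install -r requirements.txt, which also pulls the en_Resume_Matching_Keywords wheel from the Hugging Face URL above), the app would typically be started with streamlit run app.py.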