WeiJie422 committed
Commit • 070ea8a
0 Parent(s)

initial commit
Files changed:
- UpdatedResumeDataSet.csv +0 -0
- __pycache__/embedding.cpython-311.pyc +0 -0
- __pycache__/pdf_loader.cpython-311.pyc +0 -0
- __pycache__/preprocessing.cpython-311.pyc +0 -0
- documents/business.pdf +0 -0
- documents/data_science.pdf +0 -0
- embedding.py +18 -0
- main.py +31 -0
- pdf_loader.py +30 -0
- preprocessing.py +105 -0
- requirements.txt +6 -0
UpdatedResumeDataSet.csv
ADDED
The diff for this file is too large to render.
__pycache__/embedding.cpython-311.pyc
ADDED
Binary file (1.02 kB).
__pycache__/pdf_loader.cpython-311.pyc
ADDED
Binary file (2.12 kB).
__pycache__/preprocessing.cpython-311.pyc
ADDED
Binary file (4.74 kB).
documents/business.pdf
ADDED
Binary file (29 kB).
documents/data_science.pdf
ADDED
Binary file (27.8 kB).
embedding.py
ADDED
@@ -0,0 +1,18 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer


def embedding(documents, embedding='bert'):
    if embedding == 'bert':
        sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

        document_embeddings = sbert_model.encode(documents)
        return document_embeddings

    if embedding == 'tfidf':
        word_vectorizer = TfidfVectorizer(
            sublinear_tf=True, stop_words='english')
        word_vectorizer.fit(documents)
        word_features = word_vectorizer.transform(documents)

        return word_features
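For reference, a minimal usage sketch of embedding.py (the two example strings and variable names are made up for illustration): the 'bert' mode returns a dense NumPy array from Sentence-BERT, while the 'tfidf' mode returns a sparse scikit-learn matrix, and both work with cosine_similarity downstream.

from embedding import embedding

docs = ["experienced data scientist skilled in python and sql",
        "business analyst focused on market research"]

# Dense Sentence-BERT vectors; for 'bert-base-nli-mean-tokens' the shape is (2, 768).
bert_vectors = embedding(docs, embedding='bert')
print(bert_vectors.shape)

# Sparse TF-IDF matrix of shape (2, vocabulary size).
tfidf_vectors = embedding(docs, embedding='tfidf')
print(tfidf_vectors.shape)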
main.py
ADDED
@@ -0,0 +1,31 @@
from pdf_loader import load_documents
from embedding import embedding
from preprocessing import preprocess
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


def rank_documents(input_doc, documents):
    # Put the query document at index 0 so that row 0 of the similarity
    # matrix holds its similarity to every resume.
    documents = np.insert(documents, 0, input_doc)
    preprocessed_documents = preprocess(documents)
    print("Encoding with BERT...")
    documents_vectors = embedding(preprocessed_documents)
    print("Encoding finished")
    print(documents_vectors.shape)

    pairwise = cosine_similarity(documents_vectors)

    print('Resume ranking:')

    # Indices sorted by similarity to the query, highest first.
    sorted_idx = np.argsort(pairwise[0])[::-1]

    for idx in sorted_idx[:10]:
        if idx == 0:
            continue
        print(f'Resume of candidate {idx}')
        print(f'Cosine Similarity: {pairwise[0][idx]}\n')


if __name__ == '__main__':
    rank_documents('I want a data scientist',
                   load_documents('documents'))
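To make the ranking step concrete, here is a toy walk-through with made-up similarity values (not produced by the repository's data): row 0 of the pairwise cosine-similarity matrix compares the query, inserted at index 0, against every resume, and argsort()[::-1] orders the indices from most to least similar.

import numpy as np

# Hypothetical 4x4 similarity matrix: index 0 is the query, 1-3 are resumes.
pairwise = np.array([
    [1.00, 0.12, 0.83, 0.47],
    [0.12, 1.00, 0.09, 0.33],
    [0.83, 0.09, 1.00, 0.51],
    [0.47, 0.33, 0.51, 1.00],
])

ranking = np.argsort(pairwise[0])[::-1]   # [0, 2, 3, 1]
ranking = ranking[ranking != 0]           # drop the query itself
print(ranking)                            # resumes ranked by relevance: [2 3 1]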
pdf_loader.py
ADDED
@@ -0,0 +1,30 @@
import os
import PyPDF2


def load_single_document(file_path: str):
    # Loads a single document from file path
    if file_path[-4:] == '.txt':
        with open(file_path, 'r') as f:
            return f.read()

    elif file_path[-4:] == '.pdf':
        text = ''
        # Open in binary mode and close the handle when done.
        with open(file_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            for page in pdf_reader.pages:
                text += page.extract_text()
        return text

    elif file_path[-4:] == '.csv':
        with open(file_path, 'r') as f:
            return f.read()

    else:
        raise Exception('Invalid file type')


def load_documents(source_dir: str):
    # Loads all documents from source documents directory
    all_files = os.listdir(source_dir)
    return [load_single_document(f"{source_dir}/{file_path}")
            for file_path in all_files
            if file_path[-4:] in ['.txt', '.pdf', '.csv']]
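A quick usage sketch for pdf_loader.py, assuming the documents/ folder added in this commit (business.pdf and data_science.pdf):

from pdf_loader import load_documents, load_single_document

resumes = load_documents('documents')                      # list of raw text strings
print(len(resumes))

one_pdf = load_single_document('documents/business.pdf')
print(one_pdf[:200])                                       # first 200 extracted characters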
preprocessing.py
ADDED
@@ -0,0 +1,105 @@
import re
import unicodedata
import nltk
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

# 'punkt' is needed by nltk.word_tokenize in preprocess() below.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode(
            'ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words


def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words


def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words


def replace_numbers(words):
    """Replace all integer occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words


def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words


def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems


def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas


def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    words = stem_words(words)
    words = lemmatize_verbs(words)
    return words


def preprocess(documents):
    preprocessed_documents = []
    for document in documents:
        tokens = nltk.word_tokenize(document)
        preprocessed = normalize(tokens)
        preprocessed = ' '.join(map(str, preprocessed))
        preprocessed_documents.append(preprocessed)

    return preprocessed_documents
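An illustrative run of the preprocessing pipeline on a made-up sentence (the exact output depends on the Lancaster stemmer and WordNet data, so the result shown in the comment is only indicative):

from preprocessing import preprocess

docs = ["I worked 5 years as a Data Scientist, building ML pipelines!"]
print(preprocess(docs))
# Lowercased, punctuation- and stop-word-free, stemmed and lemmatized text,
# with '5' spelled out by inflect, e.g. roughly ['work five year dat sci build ml pipelin'].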
requirements.txt
ADDED
@@ -0,0 +1,6 @@
inflect==6.0.4
nltk==3.8.1
numpy==1.24.3
PyPDF2==3.0.1
scikit_learn==1.2.2
sentence_transformers==2.2.2
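The pinned dependencies install with pip install -r requirements.txt. NLTK additionally needs its corpus data at runtime; preprocessing.py fetches it at import time, but the optional sketch below (not part of the commit) pre-downloads the same resources, which can be useful when building the Space image ahead of the first run.

import nltk

# Tokenizer models plus the corpora used by remove_stopwords and lemmatize_verbs.
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)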