WeiJie422 committed
Commit: 070ea8a
Parents: 0

initial commit

UpdatedResumeDataSet.csv ADDED
The diff for this file is too large to render. See raw diff
 
__pycache__/embedding.cpython-311.pyc ADDED
Binary file (1.02 kB).
 
__pycache__/pdf_loader.cpython-311.pyc ADDED
Binary file (2.12 kB).
 
__pycache__/preprocessing.cpython-311.pyc ADDED
Binary file (4.74 kB).
 
documents/business.pdf ADDED
Binary file (29 kB).
 
documents/data_science.pdf ADDED
Binary file (27.8 kB).
 
embedding.py ADDED
@@ -0,0 +1,18 @@
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sentence_transformers import SentenceTransformer
+
+
+def embedding(documents, method='bert'):
+    """Encode documents as dense SBERT embeddings or sparse TF-IDF vectors."""
+    if method == 'bert':
+        sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
+        document_embeddings = sbert_model.encode(documents)
+        return document_embeddings
+
+    if method == 'tfidf':
+        word_vectorizer = TfidfVectorizer(
+            sublinear_tf=True, stop_words='english')
+        word_vectorizer.fit(documents)
+        word_features = word_vectorizer.transform(documents)
+        return word_features
+    raise ValueError(f"Unsupported embedding method: {method}")
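
As an aside, a minimal usage sketch of this helper (not part of the commit; the toy documents are made up, and `method` is the keyword argument defined above):

```python
# Hypothetical usage of embedding.py -- not part of this commit.
from embedding import embedding

docs = ["data scientist with python and machine learning experience",
        "business development manager with a sales background"]

# Dense SBERT vectors: one 768-dimensional row per document for this model.
bert_vectors = embedding(docs, method='bert')
print(bert_vectors.shape)        # (2, 768)

# Sparse TF-IDF matrix: one row per document, one column per vocabulary term.
tfidf_vectors = embedding(docs, method='tfidf')
print(tfidf_vectors.shape)
```
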
main.py ADDED
@@ -0,0 +1,34 @@
+from pdf_loader import load_documents
+from embedding import embedding
+from preprocessing import preprocess
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+
+
+def rank_documents(input_doc, documents):
+    # Prepend the query so that row 0 of the similarity matrix holds its
+    # similarity to every resume.
+    documents = np.insert(documents, 0, input_doc)
+    preprocessed_documents = preprocess(documents)
+    print("Encoding with BERT...")
+    documents_vectors = embedding(preprocessed_documents)
+    print("Encoding finished")
+    print(documents_vectors.shape)
+
+    pairwise = cosine_similarity(documents_vectors)
+
+    print('Resume ranking:')
+
+    # Rank resumes by descending cosine similarity to the query.
+    sorted_idx = np.argsort(pairwise[0])[::-1]
+
+    for idx in sorted_idx[:10]:
+        if idx == 0:
+            continue  # skip the query itself
+        print(f'Resume of candidate {idx}')
+        print(f'Cosine Similarity: {pairwise[0][idx]}\n')
+
+
+if __name__ == '__main__':
+    rank_documents('I want a data scientist',
+                   load_documents('documents'))
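
For context, a self-contained sketch of the ranking step in `rank_documents`, using made-up vectors so the cosine-similarity and argsort logic can be checked by hand:

```python
# Illustrative sketch of the ranking logic -- toy vectors, not real embeddings.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

vectors = np.array([
    [1.0, 0.0, 0.0],   # row 0: the query ("I want a data scientist")
    [0.9, 0.1, 0.0],   # row 1: a closely matching resume
    [0.0, 1.0, 0.0],   # row 2: an unrelated resume
])

pairwise = cosine_similarity(vectors)        # 3 x 3 similarity matrix
ranking = np.argsort(pairwise[0])[::-1]      # indices sorted by similarity to the query
print([int(i) for i in ranking if i != 0])   # -> [1, 2]
```
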
pdf_loader.py ADDED
@@ -0,0 +1,32 @@
+import os
+import PyPDF2
+
+
+def load_single_document(file_path: str):
+    # Loads a single document (plain text, PDF, or CSV) from a file path
+    if file_path.endswith('.txt'):
+        with open(file_path, 'r') as f:
+            return f.read()
+
+    elif file_path.endswith('.pdf'):
+        with open(file_path, 'rb') as pdf_file:
+            pdf_reader = PyPDF2.PdfReader(pdf_file)
+            text = ''
+            for page in pdf_reader.pages:
+                text += page.extract_text() or ''
+            return text
+
+    elif file_path.endswith('.csv'):
+        with open(file_path, 'r') as f:
+            return f.read()
+
+    else:
+        raise ValueError(f'Invalid file type: {file_path}')
+
+
+def load_documents(source_dir: str):
+    # Loads all supported documents from the source directory
+    all_files = os.listdir(source_dir)
+    return [load_single_document(os.path.join(source_dir, file_path))
+            for file_path in all_files
+            if file_path.endswith(('.txt', '.pdf', '.csv'))]
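
A minimal usage sketch, assuming the script is run from the repository root so the documents/ folder added in this commit is found:

```python
# Hypothetical usage of pdf_loader.py -- not part of this commit.
from pdf_loader import load_documents

# documents/ ships two sample PDFs (business.pdf and data_science.pdf).
docs = load_documents('documents')
print(f"Loaded {len(docs)} documents")
print(docs[0][:200])  # first 200 characters of extracted text
```
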
preprocessing.py ADDED
@@ -0,0 +1,106 @@
+import re
+import unicodedata
+import nltk
+import inflect
+from nltk.corpus import stopwords
+from nltk.stem import LancasterStemmer, WordNetLemmatizer
+
+nltk.download('punkt')
+nltk.download('wordnet')
+nltk.download('stopwords')
+
+
+def remove_non_ascii(words):
+    """Remove non-ASCII characters from list of tokenized words"""
+    new_words = []
+    for word in words:
+        new_word = unicodedata.normalize('NFKD', word).encode(
+            'ascii', 'ignore').decode('utf-8', 'ignore')
+        new_words.append(new_word)
+    return new_words
+
+
+def to_lowercase(words):
+    """Convert all characters to lowercase from list of tokenized words"""
+    new_words = []
+    for word in words:
+        new_word = word.lower()
+        new_words.append(new_word)
+    return new_words
+
+
+def remove_punctuation(words):
+    """Remove punctuation from list of tokenized words"""
+    new_words = []
+    for word in words:
+        new_word = re.sub(r'[^\w\s]', '', word)
+        if new_word != '':
+            new_words.append(new_word)
+    return new_words
+
+
+def replace_numbers(words):
+    """Replace all integer occurrences in list of tokenized words with textual representation"""
+    p = inflect.engine()
+    new_words = []
+    for word in words:
+        if word.isdigit():
+            new_word = p.number_to_words(word)
+            new_words.append(new_word)
+        else:
+            new_words.append(word)
+    return new_words
+
+
+def remove_stopwords(words):
+    """Remove stop words from list of tokenized words"""
+    stop_words = set(stopwords.words('english'))
+    new_words = []
+    for word in words:
+        if word not in stop_words:
+            new_words.append(word)
+    return new_words
+
+
+def stem_words(words):
+    """Stem words in list of tokenized words"""
+    stemmer = LancasterStemmer()
+    stems = []
+    for word in words:
+        stem = stemmer.stem(word)
+        stems.append(stem)
+    return stems
+
+
+def lemmatize_verbs(words):
+    """Lemmatize verbs in list of tokenized words"""
+    lemmatizer = WordNetLemmatizer()
+    lemmas = []
+    for word in words:
+        lemma = lemmatizer.lemmatize(word, pos='v')
+        lemmas.append(lemma)
+    return lemmas
+
+
+def normalize(words):
+    """Run the full normalization pipeline over a list of tokens."""
+    words = remove_non_ascii(words)
+    words = to_lowercase(words)
+    words = remove_punctuation(words)
+    words = replace_numbers(words)
+    words = remove_stopwords(words)
+    words = stem_words(words)
+    words = lemmatize_verbs(words)
+    return words
+
+
+def preprocess(documents):
+    """Tokenize and normalize each document, returning cleaned strings."""
+    preprocessed_documents = []
+    for document in documents:
+        tokens = nltk.word_tokenize(document)
+        preprocessed = normalize(tokens)
+        preprocessed = ' '.join(map(str, preprocessed))
+        preprocessed_documents.append(preprocessed)
+
+    return preprocessed_documents
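
A minimal usage sketch of the preprocessing pipeline (the sample sentences are invented):

```python
# Hypothetical usage of preprocessing.py -- not part of this commit.
from preprocessing import preprocess

raw = ["I worked as a Data Scientist for 3 years.",
       "Managed 12 business analysts across two régions."]
cleaned = preprocess(raw)
print(cleaned)
# Each string comes back ASCII-folded, lowercased, punctuation-free,
# with digits spelled out, stop words removed, and tokens stemmed.
```
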
requirements.txt ADDED
@@ -0,0 +1,6 @@
+inflect==6.0.4
+nltk==3.8.1
+numpy==1.24.3
+PyPDF2==3.0.1
+scikit_learn==1.2.2
+sentence_transformers==2.2.2