WeiJie422 committed
Commit • 070ea8a
0 Parent(s)

initial commit
Files changed:
- UpdatedResumeDataSet.csv +0 -0
- __pycache__/embedding.cpython-311.pyc +0 -0
- __pycache__/pdf_loader.cpython-311.pyc +0 -0
- __pycache__/preprocessing.cpython-311.pyc +0 -0
- documents/business.pdf +0 -0
- documents/data_science.pdf +0 -0
- embedding.py +18 -0
- main.py +31 -0
- pdf_loader.py +30 -0
- preprocessing.py +105 -0
- requirements.txt +6 -0
UpdatedResumeDataSet.csv
ADDED
The diff for this file is too large to render.
__pycache__/embedding.cpython-311.pyc
ADDED
Binary file (1.02 kB).
__pycache__/pdf_loader.cpython-311.pyc
ADDED
Binary file (2.12 kB).
__pycache__/preprocessing.cpython-311.pyc
ADDED
Binary file (4.74 kB).
documents/business.pdf
ADDED
Binary file (29 kB).
documents/data_science.pdf
ADDED
Binary file (27.8 kB).
embedding.py
ADDED
@@ -0,0 +1,18 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer


def embedding(documents, embedding='bert'):
    if embedding == 'bert':
        sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

        document_embeddings = sbert_model.encode(documents)
        return document_embeddings

    if embedding == 'tfidf':
        word_vectorizer = TfidfVectorizer(
            sublinear_tf=True, stop_words='english')
        word_vectorizer.fit(documents)
        word_features = word_vectorizer.transform(documents)

        return word_features
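For reference, a minimal usage sketch of embedding.py (the two example strings and variable names are made up for illustration): the 'bert' mode returns a dense NumPy array from Sentence-BERT, while the 'tfidf' mode returns a sparse scikit-learn matrix, and both work with cosine_similarity downstream.

from embedding import embedding

docs = ["experienced data scientist skilled in python and sql",
        "business analyst focused on market research"]

# Dense Sentence-BERT vectors; for 'bert-base-nli-mean-tokens' the shape is (2, 768).
bert_vectors = embedding(docs, embedding='bert')
print(bert_vectors.shape)

# Sparse TF-IDF matrix of shape (2, vocabulary size).
tfidf_vectors = embedding(docs, embedding='tfidf')
print(tfidf_vectors.shape)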
main.py
ADDED
@@ -0,0 +1,31 @@
from pdf_loader import load_documents
from embedding import embedding
from preprocessing import preprocess
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


def rank_documents(input_doc, documents):
    # Put the query document at index 0 so that row 0 of the similarity
    # matrix holds its similarity to every resume.
    documents = np.insert(documents, 0, input_doc)
    preprocessed_documents = preprocess(documents)
    print("Encoding with BERT...")
    documents_vectors = embedding(preprocessed_documents)
    print("Encoding finished")
    print(documents_vectors.shape)

    pairwise = cosine_similarity(documents_vectors)

    print('Resume ranking:')

    # Indices sorted by similarity to the query, highest first.
    sorted_idx = np.argsort(pairwise[0])[::-1]

    for idx in sorted_idx[:10]:
        if idx == 0:
            continue
        print(f'Resume of candidate {idx}')
        print(f'Cosine Similarity: {pairwise[0][idx]}\n')


if __name__ == '__main__':
    rank_documents('I want a data scientist',
                   load_documents('documents'))
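To make the ranking step concrete, here is a toy walk-through with made-up similarity values (not produced by the repository's data): row 0 of the pairwise cosine-similarity matrix compares the query, inserted at index 0, against every resume, and argsort()[::-1] orders the indices from most to least similar.

import numpy as np

# Hypothetical 4x4 similarity matrix: index 0 is the query, 1-3 are resumes.
pairwise = np.array([
    [1.00, 0.12, 0.83, 0.47],
    [0.12, 1.00, 0.09, 0.33],
    [0.83, 0.09, 1.00, 0.51],
    [0.47, 0.33, 0.51, 1.00],
])

ranking = np.argsort(pairwise[0])[::-1]   # [0, 2, 3, 1]
ranking = ranking[ranking != 0]           # drop the query itself
print(ranking)                            # resumes ranked by relevance: [2 3 1]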
pdf_loader.py
ADDED
@@ -0,0 +1,30 @@
import os
import PyPDF2


def load_single_document(file_path: str):
    # Loads a single document from file path
    if file_path[-4:] == '.txt':
        with open(file_path, 'r') as f:
            return f.read()

    elif file_path[-4:] == '.pdf':
        text = ''
        # Open in binary mode and close the handle when done.
        with open(file_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            for page in pdf_reader.pages:
                text += page.extract_text()
        return text

    elif file_path[-4:] == '.csv':
        with open(file_path, 'r') as f:
            return f.read()

    else:
        raise Exception('Invalid file type')


def load_documents(source_dir: str):
    # Loads all documents from source documents directory
    all_files = os.listdir(source_dir)
    return [load_single_document(f"{source_dir}/{file_path}")
            for file_path in all_files
            if file_path[-4:] in ['.txt', '.pdf', '.csv']]
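A quick usage sketch for pdf_loader.py, assuming the documents/ folder added in this commit (business.pdf and data_science.pdf):

from pdf_loader import load_documents, load_single_document

resumes = load_documents('documents')                      # list of raw text strings
print(len(resumes))

one_pdf = load_single_document('documents/business.pdf')
print(one_pdf[:200])                                       # first 200 extracted characters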
preprocessing.py
ADDED
@@ -0,0 +1,105 @@
import re
import unicodedata
import nltk
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

# 'punkt' is needed by nltk.word_tokenize in preprocess() below.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode(
            'ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words


def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words


def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words


def replace_numbers(words):
    """Replace all integer occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words


def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words


def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems


def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas


def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    words = stem_words(words)
    words = lemmatize_verbs(words)
    return words


def preprocess(documents):
    preprocessed_documents = []
    for document in documents:
        tokens = nltk.word_tokenize(document)
        preprocessed = normalize(tokens)
        preprocessed = ' '.join(map(str, preprocessed))
        preprocessed_documents.append(preprocessed)

    return preprocessed_documents
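An illustrative run of the preprocessing pipeline on a made-up sentence (the exact output depends on the Lancaster stemmer and WordNet data, so the result shown in the comment is only indicative):

from preprocessing import preprocess

docs = ["I worked 5 years as a Data Scientist, building ML pipelines!"]
print(preprocess(docs))
# Lowercased, punctuation- and stop-word-free, stemmed and lemmatized text,
# with '5' spelled out by inflect, e.g. roughly ['work five year dat sci build ml pipelin'].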
requirements.txt
ADDED
@@ -0,0 +1,6 @@
inflect==6.0.4
nltk==3.8.1
numpy==1.24.3
PyPDF2==3.0.1
scikit_learn==1.2.2
sentence_transformers==2.2.2
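The pinned dependencies install with pip install -r requirements.txt. NLTK additionally needs its corpus data at runtime; preprocessing.py fetches it at import time, but the optional sketch below (not part of the commit) pre-downloads the same resources, which can be useful when building the Space image ahead of the first run.

import nltk

# Tokenizer models plus the corpora used by remove_stopwords and lemmatize_verbs.
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)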