zhuolisam committed
Commit 0a6dedb
1 Parent(s): ee5a856

Feat: add pipeline and gradio

.gitignore ADDED
@@ -0,0 +1,4 @@
+ __pycache__
+ venv
+ nltk_packages
+ embedding
README.md CHANGED
@@ -1 +1,23 @@
  # resume-ranker
+
+ <hr>
+
+ ## How to Use?
+
+ Install all the dependencies with:
+
+ ```bash
+ ./install.sh
+ ```
+
+ Run the Gradio UI with:
+
+ ```bash
+ gradio app.py
+ ```
+
+ Or run the CLI demo with:
+
+ ```bash
+ python demo.py
+ ```
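Note: `./install.sh` creates a `venv` virtual environment and installs the pinned requirements, `gradio app.py` launches the web UI defined in app.py, and `python demo.py` ranks the sample resumes in the `documents/` folder against a job description hard-coded in demo.py.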
__pycache__/embedding.cpython-311.pyc DELETED
Binary file (1.02 kB)
 
__pycache__/pdf_loader.cpython-311.pyc DELETED
Binary file (2.12 kB)
 
__pycache__/preprocessing.cpython-311.pyc DELETED
Binary file (4.74 kB)
 
app.py ADDED
@@ -0,0 +1,24 @@
+ from pdf_loader import load_documents
+ from core import pipeline
+
+ import gradio as gr
+
+ def inference(query, files):
+     # get paths of uploaded files
+     files = [file.name for file in files]
+     results, _ = pipeline(query, load_documents(file_paths=files))
+
+     prob_per_documents = {result['name']: result['similarity'] for result in results}
+     return prob_per_documents
+
+ with gr.Blocks() as demo:
+     # write a header
+
+     job_desc = gr.inputs.Textbox(lines=5, label="Job Description")
+     files = gr.File(file_count="multiple", file_types=[".txt", ".pdf"], label="Upload Resume")
+     btn = gr.Button("Submit")
+     output = gr.Label(label="Results")
+     # output = gr.Number(label="Results")
+     btn.click(inference, inputs=[job_desc, files], outputs=output)
+
+ demo.launch(server_port=7800)
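Note on app.py: `inference` returns a plain `{file name: similarity}` dict, which the `gr.Label` output renders as a ranked list of scores, and `demo.launch(server_port=7800)` serves the UI at http://localhost:7800.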
core.py ADDED
@@ -0,0 +1,35 @@
+ from embedding import embedding
+ from preprocessing import preprocess
+ from sklearn.metrics.pairwise import cosine_similarity
+ import numpy as np
+
+ def pipeline(input_doc: str, ori_documents):
+     documents = np.array([doc['content'] for doc in ori_documents])
+     documents = np.insert(documents, 0, input_doc)
+     preprocessed_documents = preprocess(documents)
+     print("Encoding with BERT...")
+     documents_vectors = embedding(preprocessed_documents)
+     print("Encoding finished")
+
+     # compute cosine similarity
+     pairwise = cosine_similarity(documents_vectors)
+
+     # only keep row 0: similarity of the query against each resume
+     pairwise = pairwise[0, 1:]
+
+     sorted_idx = np.argsort(pairwise)[::-1]
+     result_pairwise = pairwise[sorted_idx]
+
+     results = []
+     print('Resume ranking:')
+     for idx in sorted_idx:
+         single_result = {
+             'rank': idx,
+             'name': ori_documents[idx]['name'],
+             'similarity': pairwise[idx].item()
+         }
+         results.append(single_result)
+         print(f'Resume of candidate {idx}')
+         print(f'Cosine Similarity: {pairwise[idx]}\n')
+
+     return results, result_pairwise
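For reference, a minimal sketch of driving the new `pipeline` directly; the query text is a placeholder and `documents/` is the same sample folder demo.py points at. It returns one result dict per resume plus the similarity scores sorted in descending order.

```python
from core import pipeline
from pdf_loader import load_documents

# Placeholder query; load_documents returns [{'name': ..., 'content': ...}, ...]
results, sorted_scores = pipeline(
    "Data analyst familiar with SQL, Python and Tableau",
    load_documents(source_dir="documents"),
)

# Each result: {'rank': index of the resume in the input list, 'name': ..., 'similarity': ...}
for r in results:
    print(r["name"], round(r["similarity"], 3))
```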
demo.py ADDED
@@ -0,0 +1,26 @@
+ from pdf_loader import load_documents
+ from core import pipeline
+
+ if __name__ == '__main__':
+     pipeline('''About Sleek
+
+ Sleek is on a mission to revolutionize how entrepreneurs operate their business. We want to give small business owners peace of mind and the power of online solutions to allow them to focus on what they do best - growing their business. As we work for our thousands of customers, we gather millions of data points about their business, and in turn we transform those into useful, actionable insights and recommendations to accelerate their growth through smart algorithms.
+
+ We are a team of 400 builders from 17 countries, with offices in Singapore, Philippines, Hong Kong, Australia and the UK committed to delivering a delightful experience to our clients!
+
+ You will be working in the Data & Analytics organization to solve a wide range of business problems leveraging advanced analytics. You will deploy a flexible analytical skill set to deliver insightful data and analysis and model business scenarios. Your principal goal will be to use data to drive better business decisions. This means translating data into meaningful insights and recommendations and, where relevant, proactively implement improvements. You will be developing the business reporting and analysis for our internal operations world-wide. The job will require working closely with the various Business Units to understand their business question as well as the whole data team to understand and access available data.
+
+ Position Duties
+ Drive analytical problem-solving and deep dives. Work with large, complex data sets. Solve difficult, non-routine problems, applying advanced quantitative methods.
+ Collaborate with a wide variety of cross-functional partners to determine business needs, drive analytical projects from start to finish.
+ Align with involved stakeholders to set up dashboards and reports to drive data driven decision across all departments
+ Working very closely with our Data team, Tech and Product team to understand the business logic to generate accurate reports and correct analysis
+
+ Requirements
+
+ Performance Standards
+ Able to commit for a period of at least 4 months
+ Currently pursuing a degree in Business Science, Engineering or relevant disciplines with a focus on data.
+ Good knowledge in SQL, R and Python.
+ Experience in data visualization tools (Tableau, PowerBI, Google DataStudio or equivalent) will be an added advantage.''',
+     load_documents(source_dir='documents'))
embedding.py CHANGED
@@ -1,10 +1,10 @@
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sentence_transformers import SentenceTransformer
-
+ import os

  def embedding(documents, embedding='bert'):
      if embedding == 'bert':
-         sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
+         sbert_model = SentenceTransformer('bert-base-nli-mean-tokens', cache_folder=os.path.join(os.getcwd(), 'embedding'))

          document_embeddings = sbert_model.encode(documents)
          return document_embeddings
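Note: the added `cache_folder` argument makes SentenceTransformer store the downloaded `bert-base-nli-mean-tokens` weights under `./embedding` in the working directory (now listed in .gitignore), so later runs reuse the local copy instead of re-downloading the model.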
install.sh ADDED
@@ -0,0 +1,12 @@
+ #!/bin/bash
+ find . \( -name __pycache__ -o -name "*.pyc" \) -delete
+ python3 -m venv venv
+ # Check the operating system
+ if [[ "$OSTYPE" == "msys" ]]; then
+     # Windows
+     source venv/Scripts/activate
+ else
+     # Unix-like systems (macOS, Linux)
+     source venv/bin/activate
+ fi
+ pip install --no-cache-dir -r requirements.txt
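Note: the `source venv/.../activate` call only affects the shell running install.sh, so the virtual environment is not active in your own shell afterwards; activate it again (for example `source venv/bin/activate`) before running `gradio app.py` or `python demo.py`.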
main.py CHANGED
@@ -1,31 +1,7 @@
- from pdf_loader import load_documents
- from embedding import embedding
- from preprocessing import preprocess
- from sklearn.metrics.pairwise import cosine_similarity
- import numpy as np
-
-
- def rank_documents(input_doc, documents):
-     documents = np.insert(documents, 0, input_doc)
-     preprocessed_documents = preprocess(documents)
-     print("Encoding with BERT...")
-     documents_vectors = embedding(preprocessed_documents)
-     print("Encoding finished")
-     print(documents_vectors.shape)
-
-     pairwise = cosine_similarity(documents_vectors)
-
-     print('Resume ranking:')
-
-     sorted_idx = np.argsort(pairwise[0])[::-1]
-
-     for idx in sorted_idx[:10]:
-         if idx == 0:
-             continue
-         print(f'Resume of candidite {idx}')
-         print(f'Cosine Similarity: {pairwise[0][idx]}\n')
-
-
- if __name__ == '__main__':
-     rank_documents('I want a data scientist',
-         load_documents('documents'))
+ # from fastapi import FastAPI
+
+ # app = FastAPI()
+
+ # @app.get("/")
+ # async def root():
+ #     return {"message": "Hello World"}
pdf_loader.py CHANGED
@@ -1,7 +1,6 @@
  import os
  import PyPDF2

-
  def load_single_document(file_path: str):
      # Loads a single document from file path
      if file_path[-4:] == '.txt':
@@ -24,7 +23,18 @@ def load_single_document(file_path: str):
      raise Exception('Invalid file type')


- def load_documents(source_dir: str):
+ def load_documents(file_paths: list[str] = None, source_dir: str = None):
      # Loads all documents from source documents directory
-     all_files = os.listdir(source_dir)
-     return [load_single_document(f"{source_dir}/{file_path}") for file_path in all_files if file_path[-4:] in ['.txt', '.pdf', '.csv']]
+     if file_paths:
+         all_files = file_paths
+     elif source_dir:
+         all_files = [os.path.abspath(os.path.join(source_dir, file)) for file in os.listdir(source_dir) if os.path.isfile(os.path.join(source_dir, file))]
+     else:
+         raise Exception('No file paths or source directory provided')
+
+     return [
+         {
+             'name': os.path.basename(file_path),
+             'content': load_single_document(f"{file_path}")
+         } for idx, file_path in enumerate(all_files) if file_path[-4:] in ['.txt', '.pdf', '.csv']
+     ]
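For context, `load_documents` now accepts either explicit file paths (how the Gradio app calls it) or a source directory (how demo.py calls it). A minimal sketch, with hypothetical paths:

```python
from pdf_loader import load_documents

# Explicit paths (hypothetical), e.g. the temp files Gradio hands to inference():
docs = load_documents(file_paths=["/tmp/resume_a.pdf", "/tmp/resume_b.txt"])

# Or scan a directory, as demo.py does; files other than .txt/.pdf/.csv are skipped:
docs = load_documents(source_dir="documents")

# Each entry is {'name': <basename>, 'content': <extracted text>}
print(docs[0]["name"])
```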
preprocessing.py CHANGED
@@ -1,5 +1,6 @@

  import re
+ import os
  import unicodedata
  import nltk
  import inflect
@@ -7,9 +8,10 @@ from nltk import word_tokenize, sent_tokenize
  from nltk.corpus import stopwords
  from nltk.stem import LancasterStemmer, WordNetLemmatizer

- nltk.download('wordnet')
- nltk.download('stopwords')
-
+ download_path = os.path.join(os.getcwd(), 'nltk_packages')
+ nltk.data.path.append(download_path)
+ nltk.download('wordnet', download_dir=download_path)
+ nltk.download('stopwords', download_dir=download_path)

  def remove_non_ascii(words):
      """Remove non-ASCII characters from list of tokenized words"""
requirements.txt CHANGED
@@ -4,3 +4,8 @@ numpy==1.24.3
  PyPDF2==3.0.1
  scikit_learn==1.2.2
  sentence_transformers==2.2.2
+ fastapi
+ uvicorn[standard]
+ python-multipart
+ python-dotenv
+ gradio