Feat: add pipeline and gradio
- .gitignore +4 -0
- README.md +22 -0
- __pycache__/embedding.cpython-311.pyc +0 -0
- __pycache__/pdf_loader.cpython-311.pyc +0 -0
- __pycache__/preprocessing.cpython-311.pyc +0 -0
- app.py +24 -0
- core.py +35 -0
- demo.py +26 -0
- embedding.py +2 -2
- install.sh +12 -0
- main.py +5 -29
- pdf_loader.py +14 -4
- preprocessing.py +5 -3
- requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
+__pycache__
+venv
+nltk_packages
+embedding
README.md CHANGED
@@ -1 +1,23 @@
 # resume-ranker
+
+<hr>
+
+## How to Use?
+
+Install all the dependencies with:
+
+```bash
+./install.sh
+```
+
+Run the Gradio UI with:
+
+```bash
+gradio app.py
+```
+
+Or run it directly with:
+
+```bash
+python demo.py
+```
__pycache__/embedding.cpython-311.pyc DELETED
Binary file (1.02 kB)

__pycache__/pdf_loader.cpython-311.pyc DELETED
Binary file (2.12 kB)

__pycache__/preprocessing.cpython-311.pyc DELETED
Binary file (4.74 kB)
app.py ADDED
@@ -0,0 +1,24 @@
+from pdf_loader import load_documents
+from core import pipeline
+
+import gradio as gr
+
+def inference(query, files):
+    # get the paths of the uploaded files
+    files = [file.name for file in files]
+    results, _ = pipeline(query, load_documents(file_paths=files))
+
+    prob_per_documents = {result['name']: result['similarity'] for result in results}
+    return prob_per_documents
+
+with gr.Blocks() as demo:
+    # write a header
+
+    job_desc = gr.Textbox(lines=5, label="Job Description")  # gr.inputs.* is removed in newer Gradio releases
+    files = gr.File(file_count="multiple", file_types=[".txt", ".pdf"], label="Upload Resume")
+    btn = gr.Button("Submit")
+    output = gr.Label(label="Results")
+    # output = gr.Number(label="Results")
+    btn.click(inference, inputs=[job_desc, files], outputs=output)
+
+demo.launch(server_port=7800)
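For reference, `inference` hands `gr.Label` a plain name-to-similarity dictionary, which Gradio renders as a ranked list of scores. A minimal sketch of that shape (file names and values below are made up):

```python
# Hypothetical output of inference(); gr.Label shows each key with its
# value as a confidence bar, highest score first.
example_output = {
    "resume_alice.pdf": 0.73,
    "resume_bob.txt": 0.58,
}
```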
core.py ADDED
@@ -0,0 +1,35 @@
+from embedding import embedding
+from preprocessing import preprocess
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+
+def pipeline(input_doc: str, ori_documents):
+    documents = np.array([doc['content'] for doc in ori_documents])
+    documents = np.insert(documents, 0, input_doc)
+    preprocessed_documents = preprocess(documents)
+    print("Encoding with BERT...")
+    documents_vectors = embedding(preprocessed_documents)
+    print("Encoding finished")
+
+    # compute cosine similarity
+    pairwise = cosine_similarity(documents_vectors)
+
+    # only retain the similarities between the query (row 0) and each resume
+    pairwise = pairwise[0, 1:]
+
+    sorted_idx = np.argsort(pairwise)[::-1]
+    result_pairwise = pairwise[sorted_idx]
+
+    results = []
+    print('Resume ranking:')
+    for idx in sorted_idx:
+        single_result = {
+            'rank': idx,
+            'name': ori_documents[idx]['name'],
+            'similarity': pairwise[idx].item()
+        }
+        results.append(single_result)
+        print(f'Resume of candidate {idx}')
+        print(f'Cosine Similarity: {pairwise[idx]}\n')
+
+    return results, result_pairwise
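`pipeline` expects the query text plus a list of `{'name', 'content'}` dicts and returns the ranked results together with the sorted similarity scores. A minimal sketch of calling it directly (document names and texts below are made up):

```python
from core import pipeline

# Made-up in-memory documents matching the {'name', 'content'} shape
# that load_documents() produces.
docs = [
    {"name": "resume_alice.pdf", "content": "Data analyst with SQL, Python and Tableau experience."},
    {"name": "resume_bob.txt", "content": "Frontend engineer focused on React and TypeScript."},
]

results, similarities = pipeline("Looking for a data analyst with SQL and Python", docs)
for r in results:
    print(r["name"], round(r["similarity"], 3))
```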
demo.py ADDED
@@ -0,0 +1,26 @@
+from pdf_loader import load_documents
+from core import pipeline
+
+if __name__ == '__main__':
+    pipeline('''About Sleek
+
+Sleek is on a mission to revolutionize how entrepreneurs operate their business. We want to give small business owners peace of mind and the power of online solutions to allow them to focus on what they do best - growing their business. As we work for our thousands of customers, we gather millions of data points about their business, and in turn we transform those into useful, actionable insights and recommendations to accelerate their growth through smart algorithms.
+
+We are a team of 400 builders from 17 countries, with offices in Singapore, Philippines, Hong Kong, Australia and the UK committed to delivering a delightful experience to our clients!
+
+You will be working in the Data & Analytics organization to solve a wide range of business problems leveraging advanced analytics. You will deploy a flexible analytical skill set to deliver insightful data and analysis and model business scenarios. Your principal goal will be to use data to drive better business decisions. This means translating data into meaningful insights and recommendations and, where relevant, proactively implement improvements. You will be developing the business reporting and analysis for our internal operations world-wide. The job will require working closely with the various Business Units to understand their business question as well as the whole data team to understand and access available data.
+
+Position Duties
+Drive analytical problem-solving and deep dives. Work with large, complex data sets. Solve difficult, non-routine problems, applying advanced quantitative methods.
+Collaborate with a wide variety of cross-functional partners to determine business needs, drive analytical projects from start to finish.
+Align with involved stakeholders to set up dashboards and reports to drive data driven decision across all departments
+Working very closely with our Data team, Tech and Product team to understand the business logic to generate accurate reports and correct analysis
+
+Requirements
+
+Performance Standards
+Able to commit for a period of at least 4 months
+Currently pursuing a degree in Business Science, Engineering or relevant disciplines with a focus on data.
+Good knowledge in SQL, R and Python.
+Experience in data visualization tools (Tableau, PowerBI, Google DataStudio or equivalent) will be an added advantage.''',
+    load_documents(source_dir='documents'))
embedding.py CHANGED
@@ -1,10 +1,10 @@
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sentence_transformers import SentenceTransformer
-
+import os
 
 def embedding(documents, embedding='bert'):
     if embedding == 'bert':
-        sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
+        sbert_model = SentenceTransformer('bert-base-nli-mean-tokens', cache_folder=os.path.join(os.getcwd(), 'embedding'))
 
         document_embeddings = sbert_model.encode(documents)
         return document_embeddings
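With the new `cache_folder`, the model weights are cached under `./embedding`, which the updated `.gitignore` excludes. A quick sketch of what `embedding()` returns, assuming the default BERT backend (example texts are made up):

```python
from embedding import embedding

# bert-base-nli-mean-tokens produces one 768-dimensional vector per document.
vectors = embedding(["senior data analyst with SQL", "frontend engineer"])
print(vectors.shape)  # (2, 768)
```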
install.sh ADDED
@@ -0,0 +1,12 @@
+#!/bin/bash
+find . \( -name __pycache__ -o -name "*.pyc" \) -delete
+python3 -m venv venv
+# Check the operating system
+if [[ "$OSTYPE" == "msys" ]]; then
+    # Windows
+    source venv/Scripts/activate
+else
+    # Unix-like systems (macOS, Linux)
+    source venv/bin/activate
+fi
+pip install --no-cache-dir -r requirements.txt
main.py CHANGED
@@ -1,31 +1,7 @@
-from
-from embedding import embedding
-from preprocessing import preprocess
-from sklearn.metrics.pairwise import cosine_similarity
-import numpy as np
-
-
-
-
-
-print("Encoding with BERT...")
-documents_vectors = embedding(preprocessed_documents)
-print("Encoding finished")
-print(documents_vectors.shape)
-
-pairwise = cosine_similarity(documents_vectors)
-
-print('Resume ranking:')
-
-sorted_idx = np.argsort(pairwise[0])[::-1]
-
-for idx in sorted_idx[:10]:
-    if idx == 0:
-        continue
-    print(f'Resume of candidite {idx}')
-    print(f'Cosine Similarity: {pairwise[0][idx]}\n')
-
-
-if __name__ == '__main__':
-    rank_documents('I want a data scientist',
-                   load_documents('documents'))
+# from fastapi import FastAPI
+
+# app = FastAPI()
+
+# @app.get("/")
+# async def root():
+#     return {"message": "Hello World"}
pdf_loader.py CHANGED
@@ -1,7 +1,6 @@
 import os
 import PyPDF2
 
-
 def load_single_document(file_path: str):
     # Loads a single document from file path
     if file_path[-4:] == '.txt':
@@ -24,7 +23,18 @@ def load_single_document(file_path: str):
         raise Exception('Invalid file type')
 
 
-def load_documents(source_dir: str):
+def load_documents(file_paths: list[str] = None, source_dir: str = None):
     # Loads all documents from source documents directory
-
-
+    if file_paths:
+        all_files = file_paths
+    elif source_dir:
+        all_files = [os.path.abspath(os.path.join(source_dir, file)) for file in os.listdir(source_dir) if os.path.isfile(os.path.join(source_dir, file))]
+    else:
+        raise Exception('No file paths or source directory provided')
+
+    return [
+        {
+            'name': os.path.basename(file_path),
+            'content': load_single_document(f"{file_path}")
+        } for idx, file_path in enumerate(all_files) if file_path[-4:] in ['.txt', '.pdf', '.csv']
+    ]
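The reworked `load_documents` accepts either explicit file paths (what `app.py` passes for Gradio uploads) or a directory to scan (what `demo.py` uses). A small sketch of both call styles, with made-up paths:

```python
from pdf_loader import load_documents

# 1) Explicit paths, e.g. the temporary files Gradio hands to inference()
docs = load_documents(file_paths=["documents/resume_alice.pdf", "documents/resume_bob.txt"])

# 2) Every .txt/.pdf/.csv file inside a directory
docs = load_documents(source_dir="documents")

# Each entry is a dict with the file's base name and its extracted text
print(docs[0]["name"], docs[0]["content"][:80])
```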
preprocessing.py CHANGED
@@ -1,5 +1,6 @@
 
 import re
+import os
 import unicodedata
 import nltk
 import inflect
@@ -7,9 +8,10 @@ from nltk import word_tokenize, sent_tokenize
 from nltk.corpus import stopwords
 from nltk.stem import LancasterStemmer, WordNetLemmatizer
 
-
-nltk.
-
+download_path = os.path.join(os.getcwd(), 'nltk_packages')
+nltk.data.path.append(download_path)
+nltk.download('wordnet', download_dir=download_path)
+nltk.download('stopwords', download_dir=download_path)
 
 def remove_non_ascii(words):
     """Remove non-ASCII characters from list of tokenized words"""
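Pointing `nltk.data.path` at a project-local `nltk_packages` folder keeps the downloads inside the repo (and out of git via the new `.gitignore`). A quick sketch of checking that the resources resolve from there once `preprocessing` has been imported:

```python
import preprocessing  # importing it downloads wordnet and stopwords into ./nltk_packages

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

print(len(stopwords.words("english")))        # stop word list now resolves locally
print(WordNetLemmatizer().lemmatize("cars"))  # -> "car", using the local wordnet copy
```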
requirements.txt CHANGED
@@ -4,3 +4,8 @@ numpy==1.24.3
 PyPDF2==3.0.1
 scikit_learn==1.2.2
 sentence_transformers==2.2.2
+fastapi
+uvicorn[standard]
+python-multipart
+python-dotenv
+gradio