feat:streamlit

- README.md +2 -2
- app.py +54 -15
- core.py +5 -3
- documents/Chia Wei Jie.txt +25 -0
- documents/Sam Zhuo Li.txt +27 -0
- embedding.py +6 -0
- gradio_app.py +24 -0
- main.py +4 -5
- pdf_loader.py +24 -0
- preprocessing.py +3 -4
- requirements.txt +1 -1
README.md
CHANGED
@@ -10,10 +10,10 @@ Install all the dependencies with:
 ./install.sh
 ```
 
-Run the
+Run the Streamlit app with:
 
 ```bash
-
+streamlit run app.py
 ```
 
 Or run it with:
app.py
CHANGED
@@ -1,24 +1,63 @@
-from pdf_loader import load_documents
+import streamlit as st
+from pdf_loader import load_btyes_io
 from core import pipeline
 
-import gradio as gr
+def inference(query, files, embedding_type):
 
-def inference(query, files):
-    #get path of uploaded files
-    files = [file.name for file in files]
-    results,_ = pipeline(query, load_documents(file_paths=files))
+    # pdfReader = PyPDF2.PdfReader(files[0])
+    # text = ''
+    # for page in pdfReader.pages:
+    #     text += page.extract_text()
+    # st.write(text)
 
+    results, _ = pipeline(query, load_btyes_io(files), embedding_type=embedding_type)
     prob_per_documents = {result['name']: result['similarity'] for result in results}
     return prob_per_documents
 
-with gr.Blocks() as demo:
-    #write a header
+def main():
+    sample_files = [
+        "documents/business.pdf",
+        "documents/data_science.pdf",
+    ]
 
-    job_desc = gr.inputs.Textbox(lines=5, label="Job Description")
-    files = gr.File(file_count="multiple", file_types=[".txt",".pdf"], label="Upload Resume")
-    btn = gr.Button("Submit")
-    output = gr.Label(label="Results")
-    # output = gr.Number(label="Results")
-    btn.click(inference, inputs=[job_desc, files], outputs=output)
+    sample_job_descriptions = {
+        "Software Engineer": """We are looking for a software engineer with experience in Python and web development. The ideal candidate should have a strong background in building scalable and robust applications. Knowledge of frameworks such as Flask and Django is a plus. Experience with front-end technologies like HTML, CSS, and JavaScript is desirable. The candidate should also have a good understanding of databases and SQL. Strong problem-solving and communication skills are required for this role.
+        """,
+        "Data Scientist": """We are seeking a data scientist with expertise in machine learning and statistical analysis. The candidate should have a solid understanding of data manipulation, feature engineering, and model development. Proficiency in Python and popular data science libraries such as NumPy, Pandas, and Scikit-learn is required. Experience with deep learning frameworks like TensorFlow or PyTorch is a plus. Strong analytical and problem-solving skills are essential for this position.
+        """
+    }
+    st.sidebar.header("Sample Files")
+    for sample_file in sample_files:
+        st.sidebar.markdown(f"[{sample_file}](./sample_files/{sample_file})")
 
-demo.launch()
+    st.sidebar.header("Sample Job Descriptions")
+    selected_job = st.sidebar.selectbox("Select a job description", list(sample_job_descriptions.keys()))
+    st.sidebar.markdown("```")
+    st.sidebar.code(sample_job_descriptions[selected_job])
+    st.title("👨🏼‍🎓Resume Ranker")
+
+    query = st.text_area("Job Description", height=200, value=sample_job_descriptions[selected_job])
+    uploaded_files = st.file_uploader("Upload Resume", accept_multiple_files=True, type=["txt", "pdf"])
+    embedding_type = st.selectbox("Embedding Type", ["bert", "minilm", "tfidf"])
+
+    if st.button("Submit"):
+        if not query:
+            st.warning("Please enter a job description.")
+        elif not uploaded_files:
+            st.warning("Please upload one or more resumes.")
+        else:
+            with st.spinner("Processing..."):
+                results = inference(query, uploaded_files, embedding_type)
+            st.subheader("Results")
+            for document, similarity in results.items():
+                # round similarity to 2 decimal places before display
+                if similarity >= 1:
+                    similarity = round(similarity, 2)
+                st.write(f"- {document}:")
+                st.progress(similarity, text=f"{similarity:.2%}")
+
+
+if __name__ == '__main__':
+    main()
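The new `inference` path can be smoke-tested without launching the UI, since `load_btyes_io` only touches a `.name` attribute and a `.read()` method on each upload. A minimal sketch (hypothetical, not part of the commit), assuming it runs from the repo root where `documents/` lives:

```python
# Hypothetical smoke test for app.inference(); an open binary file handle
# mimics Streamlit's UploadedFile (.name and .read() are all that is used).
from app import inference

with open("documents/Chia Wei Jie.txt", "rb") as resume:
    scores = inference(
        "Backend developer with Java, SpringBoot and MySQL experience",
        [resume],
        embedding_type="minilm",
    )

print(scores)  # {'documents/Chia Wei Jie.txt': <cosine similarity>}
```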
core.py
CHANGED
@@ -2,13 +2,16 @@ from embedding import embedding
 from preprocessing import preprocess
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
+import streamlit as st
 
-def pipeline(input_doc:str , ori_documents):
+def pipeline(input_doc:str , ori_documents, embedding_type='bert'):
     documents = np.array([doc['content'] for doc in ori_documents])
     documents = np.insert(documents, 0, input_doc)
+    # st.write(documents)
     preprocessed_documents = preprocess(documents)
+    # st.write(preprocessed_documents)
     print("Encoding with BERT...")
-    documents_vectors = embedding(preprocessed_documents)
+    documents_vectors = embedding(preprocessed_documents, embedding=embedding_type)
     print("Encoding finished")
 
     #compute cosine similarity
@@ -16,7 +19,6 @@ def pipeline(input_doc:str , ori_documents):
 
     #only retain useful information
     pairwise = pairwise[0,1:]
-
     sorted_idx = np.argsort(pairwise)[::-1]
     result_pairwise = pairwise[sorted_idx]
 
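Outside Streamlit, the reworked signature can be exercised directly. A minimal sketch, assuming each document dict carries the `name` and `content` keys the rest of the code expects (the sample texts are made up):

```python
from core import pipeline

docs = [
    {"name": "a.txt", "content": "Python backend developer, Flask, SQL, Docker"},
    {"name": "b.txt", "content": "Data scientist, TensorFlow, statistics, Pandas"},
]

# embedding_type now selects 'bert', 'minilm' or 'tfidf'; it defaults to
# 'bert', so existing callers such as gradio_app.py keep their old behavior.
results, _ = pipeline("Looking for a Python web developer", docs, embedding_type="tfidf")
# each result exposes 'name' and 'similarity', as consumed in app.py
```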
documents/Chia Wei Jie.txt
ADDED
@@ -0,0 +1,25 @@
+Java
+Python
+React
+JavaScript
+HTML + CSS
+TypeScript
+Flutter
+ML
+MongoDB
+MySQL
+SpringBoot
+Backend developer
+Cooperated with 4 members to complete a bug tracking
+website (https://bugslife-fe.vercel.app) assigned as SEM 2
+Data Structure assignment
+Frontend developer
+Cooperated with 2 members to complete an android
+mobile application called ReturnMed used to join Google
+Solution Challenge 2021.
+Full-stack developer
+Cooperated with 2 other members to develop a website
+(https://target-t2d.my) used by hospitals for data collection
+and analysis.
+Carried out user training and handled website maintenance
+after the website had been completely developed.
documents/Sam Zhuo Li.txt
ADDED
@@ -0,0 +1,27 @@
+WORKING / PROJECT EXPERIENCES
+DoctorOnCall - Software and Data Intern
+* Tech Stack: Next.js, Express.js, GraphQL, SQL, Python, GCP, Puppeteer.js
+* Conducted analysis on online consultation data, identified poor performance doctors
+* Automate workflow to access remote server, clean data, mailing and saved up 30 mins per day
+* Refactored core Shopify client REST API to GraphQL, improved web performance
+* Scraped more than 1k pages data on Yellow Pages, identified potential collaborator
+
+PORTFOLIO & PROJECTS
+* Dog Breed Classifier
+* Web app that classifies dog breed with image- this
+* Image Classifier of Pizza, Steak, Sushi
+* Fine-tuned EfficientNet model that classifies food - this
+* SupaVacation
+* Online marketplace for vacation rentals - this
+* AllSight
+* Analytics dashboard to improve website performance- this
+
+TECHNICAL SKILLS
+Frontend
+Backend
+JavaScript, TypeScript, React.js, Node.js, HTML, CSS
+Node.js, Firebase, MySQL, PostgreSQL, Rest API, GraphQL, Redis, FastAPI, Java
+Machine Learning
+Others
+Python, TensorFlow, PyTorch, Scikit-Learn, Numpy, Pandas, Scikit-Learn, spaCy, NLTK
+Git, Docker, Kubernetes, Nginx, GCP, BigQuery, Puppeteer.js, Figma
embedding.py
CHANGED
@@ -8,6 +8,12 @@ def embedding(documents, embedding='bert'):
 
         document_embeddings = sbert_model.encode(documents)
         return document_embeddings
+
+    if embedding == 'minilm':
+        sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', cache_folder=os.path.join(os.getcwd(), 'embedding'))
+
+        document_embeddings = sbert_model.encode(documents)
+        return document_embeddings
 
     if embedding == 'tfidf':
         word_vectorizer = TfidfVectorizer(
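The new branch mirrors the existing SBERT path with a smaller model. A sketch of the selector as it is presumably meant to be called:

```python
from embedding import embedding

docs = ["experienced python backend developer", "frontend engineer with react"]
vectors = embedding(docs, embedding="minilm")

# all-MiniLM-L6-v2 yields 384-dimensional sentence embeddings, versus 768 for
# BERT-base sized models, so encoding is noticeably cheaper on CPU-only Spaces.
print(vectors.shape)  # (2, 384)
```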
gradio_app.py
ADDED
@@ -0,0 +1,24 @@
+from pdf_loader import load_documents
+from core import pipeline
+
+import gradio as gr
+
+def inference(query, files):
+    #get path of uploaded files
+    files = [file.name for file in files]
+    results,_ = pipeline(query, load_documents(file_paths=files))
+
+    prob_per_documents = {result['name']: result['similarity'] for result in results}
+    return prob_per_documents
+
+with gr.Blocks() as demo:
+    #write a header
+
+    job_desc = gr.inputs.Textbox(lines=5, label="Job Description")
+    files = gr.File(file_count="multiple", file_types=[".txt",".pdf"], label="Upload Resume")
+    btn = gr.Button("Submit")
+    output = gr.Label(label="Results")
+    # output = gr.Number(label="Results")
+    btn.click(inference, inputs=[job_desc, files], outputs=output)
+
+demo.launch()
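One caveat worth flagging: `gr.inputs.Textbox` is the legacy pre-3.x input API, while `gr.Blocks` exists only in Gradio 3+; inside a Blocks context the idiomatic call would be `gr.Textbox(lines=5, label="Job Description")`. If the Space resolves a newer Gradio release, that mixed usage is a plausible source of failures.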
main.py
CHANGED
@@ -1,7 +1,6 @@
-# from fastapi import FastAPI
-
+# from fastapi import FastAPI, File, UploadFile
 # app = FastAPI()
 
-# @app.
-# async def root():
-#     return {"
+# @app.post("/resume")
+# async def root(name:str, email:str, about:str, file:UploadFile = File(...)):
+#     return {"name":name, "email":email, "about":about, "file_name":file.filename}
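For reference, the stub above uncommented would look roughly as follows; `fastapi`, `uvicorn[standard]` and `python-multipart` are already in requirements.txt. Behavior beyond echoing the inputs is not specified by the commit:

```python
from fastapi import FastAPI, File, UploadFile

app = FastAPI()

@app.post("/resume")
async def root(name: str, email: str, about: str, file: UploadFile = File(...)):
    # echo the submitted fields and the uploaded file's name
    return {"name": name, "email": email, "about": about, "file_name": file.filename}
```

Once uncommented, it would be served with `uvicorn main:app --reload`.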
pdf_loader.py
CHANGED
@@ -38,3 +38,27 @@ def load_documents(file_paths: list[str] = None, source_dir: str = None):
         'content': load_single_document(f"{file_path}")
     } for idx, file_path in enumerate(all_files) if file_path[-4:] in ['.txt', '.pdf', '.csv']
 ]
+
+def load_io(file_byte = None):
+    # Loads a single document from an in-memory file object (not a file path)
+    if file_byte.name[-3:] == 'txt':
+        return file_byte.read().decode("utf-8")
+
+    elif file_byte.name[-3:] == 'pdf':
+        pdfReader = PyPDF2.PdfReader(file_byte)
+        text = ''
+        for page in pdfReader.pages:
+            text += page.extract_text()
+        return text
+
+    else:
+        raise Exception('Invalid file type')
+
+def load_btyes_io(files = None):
+
+    return [
+        {
+            'name': file_btye.name,
+            'content': load_io(file_btye)
+        } for idx, file_btye in enumerate(files) if file_btye.name[-3:] in ['txt', 'pdf']
+    ]
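A quick check of the new loaders; a plain binary file handle exposes the same `.name`/`.read()` surface as a Streamlit upload (hypothetical snippet, not part of the commit):

```python
from pdf_loader import load_btyes_io

with open("documents/Sam Zhuo Li.txt", "rb") as f:
    docs = load_btyes_io([f])

print(docs[0]["name"])          # documents/Sam Zhuo Li.txt
print(docs[0]["content"][:40])  # first characters of the decoded text
```

Note that the `[-3:]` suffix check matches bare lowercase extensions only, so a file named `resume.PDF` would be silently skipped by `load_btyes_io` and rejected by `load_io`.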
preprocessing.py
CHANGED
@@ -84,15 +84,14 @@ def lemmatize_verbs(words):
         lemmas.append(lemma)
     return lemmas
 
-
 def normalize(words):
     words = remove_non_ascii(words)
     words = to_lowercase(words)
     words = remove_punctuation(words)
-    words = replace_numbers(words)
+    # words = replace_numbers(words)
     words = remove_stopwords(words)
-    words = stem_words(words)
-    words = lemmatize_verbs(words)
+    # words = stem_words(words)
+    # words = lemmatize_verbs(words)
     return words
 
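With number replacement, stemming and lemmatization switched off, `normalize` now only strips non-ASCII characters, lowercases, and drops punctuation and stopwords, presumably to keep surface forms intact for the sentence-transformer embeddings. A sketch of the trimmed chain on token input (the exact output depends on the module's stopword list, so it is indicative only):

```python
from preprocessing import normalize

tokens = ["Scraped", "1k", "pages", "of", "data", "!"]
print(normalize(tokens))
# roughly ['scraped', '1k', 'pages', 'data']: lowercased, with punctuation
# and stopwords removed and word forms left untouched
```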
requirements.txt
CHANGED
@@ -8,4 +8,4 @@ fastapi
 uvicorn[standard]
 python-multipart
 python-dotenv
-
+streamlit