Commit 38070e5 by zhuolisam
Parents: b068d77, 2c14023 (merge)

feat:streamlit

Files changed (12):
  1. .gitignore +2 -1
  2. README.md +2 -2
  3. app.py +54 -15
  4. core.py +5 -3
  5. documents/Chia Wei Jie.txt +25 -0
  6. documents/Sam Zhuo Li.txt +27 -0
  7. embedding.py +6 -0
  8. gradio_app.py +24 -0
  9. main.py +4 -5
  10. pdf_loader.py +24 -0
  11. preprocessing.py +3 -4
  12. requirements.txt +1 -1
.gitignore CHANGED
@@ -1,4 +1,5 @@
 __pycache__
 venv
 nltk_packages
-embedding
+embedding
+documents
README.md CHANGED
@@ -21,10 +21,10 @@ Install all the dependencies with:
 ./install.sh
 ```
 
-Run the Gradio UI with:
+Run the Streamlit app with:
 
 ```bash
-gradio app.py
+streamlit run app.py
 ```
 
 Or run it with:
app.py CHANGED
@@ -1,24 +1,63 @@
-from pdf_loader import load_documents
+import streamlit as st
+from pdf_loader import load_btyes_io
 from core import pipeline
 
-import gradio as gr
-
-def inference(query, files):
-    # get path of uploaded files
-    files = [file.name for file in files]
-    results, _ = pipeline(query, load_documents(file_paths=files))
+def inference(query, files, embedding_type):
+    # pdfReader = PyPDF2.PdfReader(files[0])
+    # text = ''
+    # for page in pdfReader.pages:
+    #     text += page.extract_text()
+    # st.write(text)
 
+    results, _ = pipeline(query, load_btyes_io(files), embedding_type=embedding_type)
     prob_per_documents = {result['name']: result['similarity'] for result in results}
     return prob_per_documents
 
-with gr.Blocks() as demo:
-    # write a header
-
-    job_desc = gr.inputs.Textbox(lines=5, label="Job Description")
-    files = gr.File(file_count="multiple", file_types=[".txt", ".pdf"], label="Upload Resume")
-    btn = gr.Button("Submit")
-    output = gr.Label(label="Results")
-    # output = gr.Number(label="Results")
-    btn.click(inference, inputs=[job_desc, files], outputs=output)
-
-demo.launch()
+def main():
+    sample_files = [
+        "documents/business.pdf",
+        "documents/data_science.pdf",
+    ]
+
+    sample_job_descriptions = {
+        "Software Engineer": """We are looking for a software engineer with experience in Python and web development. The ideal candidate should have a strong background in building scalable and robust applications. Knowledge of frameworks such as Flask and Django is a plus. Experience with front-end technologies like HTML, CSS, and JavaScript is desirable. The candidate should also have a good understanding of databases and SQL. Strong problem-solving and communication skills are required for this role.
+        """,
+        "Data Scientist": """We are seeking a data scientist with expertise in machine learning and statistical analysis. The candidate should have a solid understanding of data manipulation, feature engineering, and model development. Proficiency in Python and popular data science libraries such as NumPy, Pandas, and Scikit-learn is required. Experience with deep learning frameworks like TensorFlow or PyTorch is a plus. Strong analytical and problem-solving skills are essential for this position.
+        """
+    }
+
+    st.sidebar.header("Sample Files")
+    for sample_file in sample_files:
+        st.sidebar.markdown(f"[{sample_file}](./sample_files/{sample_file})")
+
+    st.sidebar.header("Sample Job Descriptions")
+    selected_job = st.sidebar.selectbox("Select a job description", list(sample_job_descriptions.keys()))
+    st.sidebar.code(sample_job_descriptions[selected_job])
+
+    st.title("👨🏼‍🎓 Resume Ranker")
+
+    query = st.text_area("Job Description", height=200, value=sample_job_descriptions[selected_job])
+    uploaded_files = st.file_uploader("Upload Resume", accept_multiple_files=True, type=["txt", "pdf"])
+    embedding_type = st.selectbox("Embedding Type", ["bert", "minilm", "tfidf"])
+
+    if st.button("Submit"):
+        if not query:
+            st.warning("Please enter a job description.")
+        elif not uploaded_files:
+            st.warning("Please upload one or more resumes.")
+        else:
+            with st.spinner("Processing..."):
+                results = inference(query, uploaded_files, embedding_type)
+            st.subheader("Results")
+            for document, similarity in results.items():
+                # round similarity to two decimal places before display
+                if similarity >= 1:
+                    similarity = round(similarity, 2)
+                st.write(f"- {document}:")
+                st.progress(similarity, text=f"{similarity:.2%}")
+
+if __name__ == '__main__':
+    main()
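One note on the results loop: `st.progress` only accepts floats in the range [0.0, 1.0], so a cosine-similarity score that drifts above 1.0 through floating-point error would raise an exception, which is presumably what the `if similarity >= 1` guard is reaching for. A more defensive clamp, as a minimal sketch (the `clamp01` helper is illustrative, not part of this commit):

```python
def clamp01(value: float) -> float:
    """Pin a similarity score into the [0.0, 1.0] range st.progress accepts."""
    return max(0.0, min(1.0, value))

# In the results loop, this would replace the `if similarity >= 1` guard:
# st.progress(clamp01(similarity), text=f"{similarity:.2%}")
```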
core.py CHANGED
@@ -2,13 +2,16 @@ from embedding import embedding
 from preprocessing import preprocess
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
+import streamlit as st
 
-def pipeline(input_doc: str, ori_documents):
+def pipeline(input_doc: str, ori_documents, embedding_type='bert'):
     documents = np.array([doc['content'] for doc in ori_documents])
     documents = np.insert(documents, 0, input_doc)
+    # st.write(documents)
     preprocessed_documents = preprocess(documents)
+    # st.write(preprocessed_documents)
     print("Encoding with BERT...")
-    documents_vectors = embedding(preprocessed_documents)
+    documents_vectors = embedding(preprocessed_documents, embedding=embedding_type)
     print("Encoding finished")
 
     # compute cosine similarity
@@ -16,7 +19,6 @@ def pipeline(input_doc: str, ori_documents):
 
     # only retain useful information
     pairwise = pairwise[0, 1:]
-
     sorted_idx = np.argsort(pairwise)[::-1]
     result_pairwise = pairwise[sorted_idx]
 
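For orientation, a sketch of how the extended `pipeline` signature gets called. The document dicts below are made-up examples shaped like the loaders' output (a `'name'` plus a `'content'` string), matching how app.py consumes the results:

```python
from core import pipeline

# Hypothetical resumes in the shape load_documents / load_btyes_io produce.
docs = [
    {'name': 'resume_a.txt', 'content': 'Python developer with Flask and SQL experience'},
    {'name': 'resume_b.txt', 'content': 'Accountant with audit and tax background'},
]

# embedding_type now selects the encoder: 'bert' (default), 'minilm', or 'tfidf'.
results, _ = pipeline('Software engineer, Python and web development', docs,
                      embedding_type='minilm')
for r in results:
    print(r['name'], r['similarity'])  # documents sorted by descending similarity
```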
documents/Chia Wei Jie.txt ADDED
@@ -0,0 +1,25 @@
+Java
+Python
+React
+JavaScript
+HTML + CSS
+TypeScript
+Flutter
+ML
+MongoDB
+MySQL
+SpringBoot
+Backend developer
+Cooperated with 4 members to complete a bug tracking
+website (https://bugslife-fe.vercel.app) assigned as SEM 2
+Data Structure assignment
+Frontend developer
+Cooperated with 2 members to complete an android
+mobile application called ReturnMed used to join Google
+Solution Challenge 2021.
+Full-stack developer
+Cooperated with 2 other members to develop a website
+(https://target-t2d.my) used by hospitals for data collection
+and analysis.
+Carried out user training and handled website maintenance
+after the website had been completely developed.
documents/Sam Zhuo Li.txt ADDED
@@ -0,0 +1,27 @@
+WORKING / PROJECT EXPERIENCES
+DoctorOnCall - Software and Data Intern
+* Tech Stack: Next.js, Express.js, GraphQL, SQL, Python, GCP, Puppeteer.js
+* Conducted analysis on online consultation data, identified poor performance doctors
+* Automate workflow to access remote server, clean data, mailing and saved up 30 mins per day
+* Refactored core Shopify client REST API to GraphQL, improved web performance
+* Scraped more than 1k pages data on Yellow Pages, identified potential collaborator
+
+PORTFOLIO & PROJECTS
+* Dog Breed Classifier
+* Web app that classifies dog breed with image- this
+* Image Classifier of Pizza, Steak, Sushi
+* Fine-tuned EfficientNet model that classifies food - this
+* SupaVacation
+* Online marketplace for vacation rentals - this
+* AllSight
+* Analytics dashboard to improve website performance- this
+
+TECHNICAL SKILLS
+Frontend
+Backend
+JavaScript, TypeScript, React.js, Node.js, HTML, CSS
+Node.js, Firebase, MySQL, PostgreSQL, Rest API, GraphQL, Redis, FastAPI, Java
+Machine Learning
+Others
+Python, TensorFlow, PyTorch, Scikit-Learn, Numpy, Pandas, Scikit-Learn, spaCy, NLTK
+Git, Docker, Kubernetes, Nginx, GCP, BigQuery, Puppeteer.js, Figma
embedding.py CHANGED
@@ -8,6 +8,12 @@ def embedding(documents, embedding='bert'):
 
         document_embeddings = sbert_model.encode(documents)
         return document_embeddings
+
+    if embedding == 'minilm':
+        sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', cache_folder=os.path.join(os.getcwd(), 'embedding'))
+
+        document_embeddings = sbert_model.encode(documents)
+        return document_embeddings
 
     if embedding == 'tfidf':
         word_vectorizer = TfidfVectorizer(
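A quick smoke test of the new branch, assuming the module-level imports (`SentenceTransformer`, `os`) that the existing 'bert' branch already relies on. The first call downloads the model into ./embedding — the same cache folder this commit adds to .gitignore:

```python
from embedding import embedding

docs = ['python backend developer', 'data scientist with pytorch']

# 'minilm' routes to sentence-transformers/all-MiniLM-L6-v2, cached under ./embedding.
vectors = embedding(docs, embedding='minilm')
print(vectors.shape)  # (2, 384): all-MiniLM-L6-v2 yields 384-dimensional embeddings
```

MiniLM-L6-v2 is a much smaller encoder than full BERT, so this option trades some accuracy for noticeably faster startup and inference on CPU-only hosts.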
gradio_app.py ADDED
@@ -0,0 +1,24 @@
+from pdf_loader import load_documents
+from core import pipeline
+
+import gradio as gr
+
+def inference(query, files):
+    # get path of uploaded files
+    files = [file.name for file in files]
+    results, _ = pipeline(query, load_documents(file_paths=files))
+
+    prob_per_documents = {result['name']: result['similarity'] for result in results}
+    return prob_per_documents
+
+with gr.Blocks() as demo:
+    # write a header
+
+    job_desc = gr.inputs.Textbox(lines=5, label="Job Description")
+    files = gr.File(file_count="multiple", file_types=[".txt", ".pdf"], label="Upload Resume")
+    btn = gr.Button("Submit")
+    output = gr.Label(label="Results")
+    # output = gr.Number(label="Results")
+    btn.click(inference, inputs=[job_desc, files], outputs=output)
+
+demo.launch()
main.py CHANGED
@@ -1,7 +1,6 @@
-# from fastapi import FastAPI
-
+# from fastapi import FastAPI, File, UploadFile
 # app = FastAPI()
 
-# @app.get("/")
-# async def root():
-#     return {"message": "Hello World"}
+# @app.post("/resume")
+# async def root(name: str, email: str, about: str, file: UploadFile = File(...)):
+#     return {"name": name, "email": email, "about": about, "file_name": file.filename}
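The whole file remains commented out; uncommented, the stub is a runnable FastAPI upload endpoint. A minimal sketch (not wired into the rest of the app — the requirements already include fastapi, uvicorn, and python-multipart):

```python
from fastapi import FastAPI, File, UploadFile

app = FastAPI()

@app.post("/resume")
async def root(name: str, email: str, about: str, file: UploadFile = File(...)):
    # Echo the submitted metadata and the uploaded file's name.
    return {"name": name, "email": email, "about": about, "file_name": file.filename}
```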
pdf_loader.py CHANGED
@@ -38,3 +38,27 @@ def load_documents(file_paths: list[str] = None, source_dir: str = None):
             'content': load_single_document(f"{file_path}")
         } for idx, file_path in enumerate(all_files) if file_path[-4:] in ['.txt', '.pdf', '.csv']
     ]
+
+def load_io(file_byte = None):
+    # Loads a single document from a file-like object
+    if file_byte.name[-3:] == 'txt':
+        return file_byte.read().decode("utf-8")
+
+    elif file_byte.name[-3:] == 'pdf':
+        pdfReader = PyPDF2.PdfReader(file_byte)
+        text = ''
+        for page in pdfReader.pages:
+            text += page.extract_text()
+        return text
+
+    else:
+        raise Exception('Invalid file type')
+
+def load_btyes_io(files = None):
+    return [
+        {
+            'name': file_btye.name,
+            'content': load_io(file_btye)
+        } for idx, file_btye in enumerate(files) if file_btye.name[-3:] in ['txt', 'pdf']
+    ]
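`load_io` only needs an object with a `.name` attribute and a `.read()` method, which is why Streamlit's uploaded files can be passed straight through. A self-contained check using an illustrative `NamedBytesIO` stand-in (not part of the repo):

```python
import io

from pdf_loader import load_btyes_io

class NamedBytesIO(io.BytesIO):
    """Minimal stand-in for Streamlit's UploadedFile: bytes plus a .name."""
    def __init__(self, data: bytes, name: str):
        super().__init__(data)
        self.name = name

files = [NamedBytesIO(b'Python developer, 3 years of Flask', 'resume.txt')]
print(load_btyes_io(files))
# [{'name': 'resume.txt', 'content': 'Python developer, 3 years of Flask'}]
```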
preprocessing.py CHANGED
@@ -85,15 +85,14 @@ def lemmatize_verbs(words):
         lemmas.append(lemma)
     return lemmas
 
-
 def normalize(words):
     words = remove_non_ascii(words)
     words = to_lowercase(words)
     words = remove_punctuation(words)
-    words = replace_numbers(words)
+    # words = replace_numbers(words)
     words = remove_stopwords(words)
-    words = stem_words(words)
-    words = lemmatize_verbs(words)
+    # words = stem_words(words)
+    # words = lemmatize_verbs(words)
     return words
 
 
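With stemming, lemmatization, and number replacement switched off, `normalize` now keeps surface forms. A sketch of the expected behavior, assuming the helpers do what their names say (exact output depends on the NLTK stopword list):

```python
from preprocessing import normalize

tokens = ['Senior', 'Python', 'Developer', 'with', '5', 'years']
print(normalize(tokens))
# e.g. ['senior', 'python', 'developer', '5', 'years']
# 'with' drops out as a stopword; '5' survives since replace_numbers is disabled,
# and 'developer' is no longer stemmed down to 'develop'.
```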
requirements.txt CHANGED
@@ -8,4 +8,4 @@ fastapi
 uvicorn[standard]
 python-multipart
 python-dotenv
-gradio
+streamlit