feat:streamlit

- README.md +2 -2
- app.py +54 -15
- core.py +5 -3
- documents/Chia Wei Jie.txt +25 -0
- documents/Sam Zhuo Li.txt +27 -0
- embedding.py +6 -0
- gradio_app.py +24 -0
- main.py +4 -5
- pdf_loader.py +24 -0
- preprocessing.py +3 -4
- requirements.txt +1 -1
README.md
CHANGED
@@ -10,10 +10,10 @@ Install all the dependencies with:
 ./install.sh
 ```
 
-Run the
+Run the Streamlit app with:
 
 ```bash
-
+streamlit run app.py
 ```
 
 Or run it with:
app.py
CHANGED
@@ -1,24 +1,63 @@
-from pdf_loader import load_documents
+import streamlit as st
+from pdf_loader import load_btyes_io
 from core import pipeline
 
-import gradio as gr
+def inference(query, files, embedding_type):
 
-def inference(query, files):
-    #get path of uploaded files
-    files = [file.name for file in files]
-    results,_ = pipeline(query, load_documents(file_paths=files))
+    # pdfReader = PyPDF2.PdfReader(files[0])
+    # text = ''
+    # for page in pdfReader.pages:
+    #     text += page.extract_text()
+    # st.write(text)
 
+    results, _ = pipeline(query, load_btyes_io(files), embedding_type=embedding_type)
     prob_per_documents = {result['name']: result['similarity'] for result in results}
     return prob_per_documents
 
-with gr.Blocks() as demo:
-    #write a header
+def main():
+    sample_files = [
+        "documents/business.pdf",
+        "documents/data_science.pdf",
+    ]
 
-    job_desc = gr.inputs.Textbox(lines=5, label="Job Description")
-    files = gr.File(file_count="multiple", file_types=[".txt",".pdf"], label="Upload Resume")
-    btn = gr.Button("Submit")
-    output = gr.Label(label="Results")
-    # output = gr.Number(label="Results")
-    btn.click(inference, inputs=[job_desc, files], outputs=output)
+    sample_job_descriptions = {
+        "Software Engineer": """We are looking for a software engineer with experience in Python and web development. The ideal candidate should have a strong background in building scalable and robust applications. Knowledge of frameworks such as Flask and Django is a plus. Experience with front-end technologies like HTML, CSS, and JavaScript is desirable. The candidate should also have a good understanding of databases and SQL. Strong problem-solving and communication skills are required for this role.
+        """,
+        "Data Scientist": """We are seeking a data scientist with expertise in machine learning and statistical analysis. The candidate should have a solid understanding of data manipulation, feature engineering, and model development. Proficiency in Python and popular data science libraries such as NumPy, Pandas, and Scikit-learn is required. Experience with deep learning frameworks like TensorFlow or PyTorch is a plus. Strong analytical and problem-solving skills are essential for this position.
+        """
+    }
+    st.sidebar.header("Sample Files")
+    for sample_file in sample_files:
+        st.sidebar.markdown(f"[{sample_file}](./sample_files/{sample_file})")
 
-demo.launch()
+    st.sidebar.header("Sample Job Descriptions")
+    selected_job = st.sidebar.selectbox("Select a job description", list(sample_job_descriptions.keys()))
+    st.sidebar.markdown("```")
+    st.sidebar.code(sample_job_descriptions[selected_job])
+    st.title("👨🏼‍🎓Resume Ranker")
+
+    query = st.text_area("Job Description", height=200, value=sample_job_descriptions[selected_job])
+    uploaded_files = st.file_uploader("Upload Resume", accept_multiple_files=True, type=["txt", "pdf"])
+    embedding_type = st.selectbox("Embedding Type", ["bert", "minilm", "tfidf"])
+
+    if st.button("Submit"):
+        if not query:
+            st.warning("Please enter a job description.")
+        elif not uploaded_files:
+            st.warning("Please upload one or more resumes.")
+        else:
+            with st.spinner("Processing..."):
+                results = inference(query, uploaded_files, embedding_type)
+            st.subheader("Results")
+            for document, similarity in results.items():
+                # round similarity to 2 decimal places before display
+                if similarity >= 1:
+                    similarity = round(similarity, 2)
+                st.write(f"- {document}:")
+                st.progress(similarity, text=f"{similarity:.2%}")
+
+
+if __name__ == '__main__':
+    main()
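The new `inference` path can be smoke-tested without launching the UI, since `load_btyes_io` only touches a `.name` attribute and a `.read()` method on each upload. A minimal sketch (hypothetical, not part of the commit), assuming it runs from the repo root where `documents/` lives:

```python
# Hypothetical smoke test for app.inference(); an open binary file handle
# mimics Streamlit's UploadedFile (.name and .read() are all that is used).
from app import inference

with open("documents/Chia Wei Jie.txt", "rb") as resume:
    scores = inference(
        "Backend developer with Java, SpringBoot and MySQL experience",
        [resume],
        embedding_type="minilm",
    )

print(scores)  # {'documents/Chia Wei Jie.txt': <cosine similarity>}
```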
core.py
CHANGED
@@ -2,13 +2,16 @@ from embedding import embedding
 from preprocessing import preprocess
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
+import streamlit as st
 
-def pipeline(input_doc:str , ori_documents):
+def pipeline(input_doc:str , ori_documents, embedding_type='bert'):
     documents = np.array([doc['content'] for doc in ori_documents])
     documents = np.insert(documents, 0, input_doc)
+    # st.write(documents)
     preprocessed_documents = preprocess(documents)
+    # st.write(preprocessed_documents)
     print("Encoding with BERT...")
-    documents_vectors = embedding(preprocessed_documents)
+    documents_vectors = embedding(preprocessed_documents, embedding=embedding_type)
     print("Encoding finished")
 
     #compute cosine similarity
@@ -16,7 +19,6 @@ def pipeline(input_doc:str , ori_documents):
 
     #only retain useful information
     pairwise = pairwise[0,1:]
-
     sorted_idx = np.argsort(pairwise)[::-1]
     result_pairwise = pairwise[sorted_idx]
 
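Outside Streamlit, the reworked signature can be exercised directly. A minimal sketch, assuming each document dict carries the `name` and `content` keys the rest of the code expects (the sample texts are made up):

```python
from core import pipeline

docs = [
    {"name": "a.txt", "content": "Python backend developer, Flask, SQL, Docker"},
    {"name": "b.txt", "content": "Data scientist, TensorFlow, statistics, Pandas"},
]

# embedding_type now selects 'bert', 'minilm' or 'tfidf'; it defaults to
# 'bert', so existing callers such as gradio_app.py keep their old behavior.
results, _ = pipeline("Looking for a Python web developer", docs, embedding_type="tfidf")
# each result exposes 'name' and 'similarity', as consumed in app.py
```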
documents/Chia Wei Jie.txt
ADDED
@@ -0,0 +1,25 @@
+Java
+Python
+React
+JavaScript
+HTML + CSS
+TypeScript
+Flutter
+ML
+MongoDB
+MySQL
+SpringBoot
+Backend developer
+Cooperated with 4 members to complete a bug tracking
+website (https://bugslife-fe.vercel.app) assigned as SEM 2
+Data Structure assignment
+Frontend developer
+Cooperated with 2 members to complete an android
+mobile application called ReturnMed used to join Google
+Solution Challenge 2021.
+Full-stack developer
+Cooperated with 2 other members to develop a website
+(https://target-t2d.my) used by hospitals for data collection
+and analysis.
+Carried out user training and handled website maintenance
+after the website had been completely developed.
documents/Sam Zhuo Li.txt
ADDED
@@ -0,0 +1,27 @@
+WORKING / PROJECT EXPERIENCES
+DoctorOnCall - Software and Data Intern
+* Tech Stack: Next.js, Express.js, GraphQL, SQL, Python, GCP, Puppeteer.js
+* Conducted analysis on online consultation data, identified poor performance doctors
+* Automate workflow to access remote server, clean data, mailing and saved up 30 mins per day
+* Refactored core Shopify client REST API to GraphQL, improved web performance
+* Scraped more than 1k pages data on Yellow Pages, identified potential collaborator
+
+PORTFOLIO & PROJECTS
+* Dog Breed Classifier
+* Web app that classifies dog breed with image- this
+* Image Classifier of Pizza, Steak, Sushi
+* Fine-tuned EfficientNet model that classifies food - this
+* SupaVacation
+* Online marketplace for vacation rentals - this
+* AllSight
+* Analytics dashboard to improve website performance- this
+
+TECHNICAL SKILLS
+Frontend
+Backend
+JavaScript, TypeScript, React.js, Node.js, HTML, CSS
+Node.js, Firebase, MySQL, PostgreSQL, Rest API, GraphQL, Redis, FastAPI, Java
+Machine Learning
+Others
+Python, TensorFlow, PyTorch, Scikit-Learn, Numpy, Pandas, Scikit-Learn, spaCy, NLTK
+Git, Docker, Kubernetes, Nginx, GCP, BigQuery, Puppeteer.js, Figma
embedding.py
CHANGED
@@ -8,6 +8,12 @@ def embedding(documents, embedding='bert'):
 
         document_embeddings = sbert_model.encode(documents)
         return document_embeddings
+
+    if embedding == 'minilm':
+        sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', cache_folder=os.path.join(os.getcwd(), 'embedding'))
+
+        document_embeddings = sbert_model.encode(documents)
+        return document_embeddings
 
     if embedding == 'tfidf':
         word_vectorizer = TfidfVectorizer(
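The new branch mirrors the existing SBERT path with a smaller model. A sketch of the selector as it is presumably meant to be called:

```python
from embedding import embedding

docs = ["experienced python backend developer", "frontend engineer with react"]
vectors = embedding(docs, embedding="minilm")

# all-MiniLM-L6-v2 yields 384-dimensional sentence embeddings, versus 768 for
# BERT-base sized models, so encoding is noticeably cheaper on CPU-only Spaces.
print(vectors.shape)  # (2, 384)
```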
gradio_app.py
ADDED
@@ -0,0 +1,24 @@
+from pdf_loader import load_documents
+from core import pipeline
+
+import gradio as gr
+
+def inference(query, files):
+    #get path of uploaded files
+    files = [file.name for file in files]
+    results,_ = pipeline(query, load_documents(file_paths=files))
+
+    prob_per_documents = {result['name']: result['similarity'] for result in results}
+    return prob_per_documents
+
+with gr.Blocks() as demo:
+    #write a header
+
+    job_desc = gr.inputs.Textbox(lines=5, label="Job Description")
+    files = gr.File(file_count="multiple", file_types=[".txt",".pdf"], label="Upload Resume")
+    btn = gr.Button("Submit")
+    output = gr.Label(label="Results")
+    # output = gr.Number(label="Results")
+    btn.click(inference, inputs=[job_desc, files], outputs=output)
+
+demo.launch()
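One caveat worth flagging: `gr.inputs.Textbox` is the legacy pre-3.x input API, while `gr.Blocks` exists only in Gradio 3+; inside a Blocks context the idiomatic call would be `gr.Textbox(lines=5, label="Job Description")`. If the Space resolves a newer Gradio release, that mixed usage is a plausible source of failures.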
main.py
CHANGED
@@ -1,7 +1,6 @@
-# from fastapi import FastAPI
-
+# from fastapi import FastAPI, File, UploadFile
 # app = FastAPI()
 
-# @app.
-# async def root():
-#     return {"
+# @app.post("/resume")
+# async def root(name:str, email:str, about:str, file:UploadFile = File(...)):
+#     return {"name":name, "email":email, "about":about, "file_name":file.filename}
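For reference, the stub above uncommented would look roughly as follows; `fastapi`, `uvicorn[standard]` and `python-multipart` are already in requirements.txt. Behavior beyond echoing the inputs is not specified by the commit:

```python
from fastapi import FastAPI, File, UploadFile

app = FastAPI()

@app.post("/resume")
async def root(name: str, email: str, about: str, file: UploadFile = File(...)):
    # echo the submitted fields and the uploaded file's name
    return {"name": name, "email": email, "about": about, "file_name": file.filename}
```

Once uncommented, it would be served with `uvicorn main:app --reload`.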
pdf_loader.py
CHANGED
@@ -38,3 +38,27 @@ def load_documents(file_paths: list[str] = None, source_dir: str = None):
         'content': load_single_document(f"{file_path}")
     } for idx, file_path in enumerate(all_files) if file_path[-4:] in ['.txt', '.pdf', '.csv']
 ]
+
+def load_io(file_byte = None):
+    # Loads a single document from an in-memory file object (not a file path)
+    if file_byte.name[-3:] == 'txt':
+        return file_byte.read().decode("utf-8")
+
+    elif file_byte.name[-3:] == 'pdf':
+        pdfReader = PyPDF2.PdfReader(file_byte)
+        text = ''
+        for page in pdfReader.pages:
+            text += page.extract_text()
+        return text
+
+    else:
+        raise Exception('Invalid file type')
+
+def load_btyes_io(files = None):
+
+    return [
+        {
+            'name': file_btye.name,
+            'content': load_io(file_btye)
+        } for idx, file_btye in enumerate(files) if file_btye.name[-3:] in ['txt', 'pdf']
+    ]
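A quick check of the new loaders; a plain binary file handle exposes the same `.name`/`.read()` surface as a Streamlit upload (hypothetical snippet, not part of the commit):

```python
from pdf_loader import load_btyes_io

with open("documents/Sam Zhuo Li.txt", "rb") as f:
    docs = load_btyes_io([f])

print(docs[0]["name"])          # documents/Sam Zhuo Li.txt
print(docs[0]["content"][:40])  # first characters of the decoded text
```

Note that the `[-3:]` suffix check matches bare lowercase extensions only, so a file named `resume.PDF` would be silently skipped by `load_btyes_io` and rejected by `load_io`.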
preprocessing.py
CHANGED
@@ -84,15 +84,14 @@ def lemmatize_verbs(words):
         lemmas.append(lemma)
     return lemmas
 
-
 def normalize(words):
     words = remove_non_ascii(words)
     words = to_lowercase(words)
     words = remove_punctuation(words)
-    words = replace_numbers(words)
+    # words = replace_numbers(words)
     words = remove_stopwords(words)
-    words = stem_words(words)
-    words = lemmatize_verbs(words)
+    # words = stem_words(words)
+    # words = lemmatize_verbs(words)
     return words
 
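With number replacement, stemming and lemmatization switched off, `normalize` now only strips non-ASCII characters, lowercases, and drops punctuation and stopwords, presumably to keep surface forms intact for the sentence-transformer embeddings. A sketch of the trimmed chain on token input (the exact output depends on the module's stopword list, so it is indicative only):

```python
from preprocessing import normalize

tokens = ["Scraped", "1k", "pages", "of", "data", "!"]
print(normalize(tokens))
# roughly ['scraped', '1k', 'pages', 'data']: lowercased, with punctuation
# and stopwords removed and word forms left untouched
```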
requirements.txt
CHANGED
@@ -8,4 +8,4 @@ fastapi
 uvicorn[standard]
 python-multipart
 python-dotenv
-
+streamlit