zhuolisam committed
Commit 0a6dedb
1 Parent(s): ee5a856

Feat: add pipeline and gradio

.gitignore ADDED
@@ -0,0 +1,4 @@
+ __pycache__
+ venv
+ nltk_packages
+ embedding
README.md CHANGED
@@ -1 +1,23 @@
  # resume-ranker
+
+ <hr>
+
+ ## How to Use?
+
+ Install all the dependencies with:
+
+ ```bash
+ ./install.sh
+ ```
+
+ Run the Gradio UI with:
+
+ ```bash
+ gradio app.py
+ ```
+
+ Or run the CLI demo with:
+
+ ```bash
+ python demo.py
+ ```
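Note: `./install.sh` creates a `venv` virtual environment and installs the pinned requirements, `gradio app.py` launches the web UI defined in app.py, and `python demo.py` ranks the sample resumes in the `documents/` folder against a job description hard-coded in demo.py.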
__pycache__/embedding.cpython-311.pyc DELETED
Binary file (1.02 kB)
 
__pycache__/pdf_loader.cpython-311.pyc DELETED
Binary file (2.12 kB)
 
__pycache__/preprocessing.cpython-311.pyc DELETED
Binary file (4.74 kB)
 
app.py ADDED
@@ -0,0 +1,24 @@
+ from pdf_loader import load_documents
+ from core import pipeline
+
+ import gradio as gr
+
+ def inference(query, files):
+     # get paths of uploaded files
+     files = [file.name for file in files]
+     results, _ = pipeline(query, load_documents(file_paths=files))
+
+     prob_per_documents = {result['name']: result['similarity'] for result in results}
+     return prob_per_documents
+
+ with gr.Blocks() as demo:
+     # write a header
+
+     job_desc = gr.inputs.Textbox(lines=5, label="Job Description")
+     files = gr.File(file_count="multiple", file_types=[".txt", ".pdf"], label="Upload Resume")
+     btn = gr.Button("Submit")
+     output = gr.Label(label="Results")
+     # output = gr.Number(label="Results")
+     btn.click(inference, inputs=[job_desc, files], outputs=output)
+
+ demo.launch(server_port=7800)
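Note on app.py: `inference` returns a plain `{file name: similarity}` dict, which the `gr.Label` output renders as a ranked list of scores, and `demo.launch(server_port=7800)` serves the UI at http://localhost:7800.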
core.py ADDED
@@ -0,0 +1,35 @@
+ from embedding import embedding
+ from preprocessing import preprocess
+ from sklearn.metrics.pairwise import cosine_similarity
+ import numpy as np
+
+ def pipeline(input_doc: str, ori_documents):
+     documents = np.array([doc['content'] for doc in ori_documents])
+     documents = np.insert(documents, 0, input_doc)
+     preprocessed_documents = preprocess(documents)
+     print("Encoding with BERT...")
+     documents_vectors = embedding(preprocessed_documents)
+     print("Encoding finished")
+
+     # compute cosine similarity
+     pairwise = cosine_similarity(documents_vectors)
+
+     # only keep row 0: similarity of the query against each resume
+     pairwise = pairwise[0, 1:]
+
+     sorted_idx = np.argsort(pairwise)[::-1]
+     result_pairwise = pairwise[sorted_idx]
+
+     results = []
+     print('Resume ranking:')
+     for idx in sorted_idx:
+         single_result = {
+             'rank': idx,
+             'name': ori_documents[idx]['name'],
+             'similarity': pairwise[idx].item()
+         }
+         results.append(single_result)
+         print(f'Resume of candidate {idx}')
+         print(f'Cosine Similarity: {pairwise[idx]}\n')
+
+     return results, result_pairwise
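For reference, a minimal sketch of driving the new `pipeline` directly; the query text is a placeholder and `documents/` is the same sample folder demo.py points at. It returns one result dict per resume plus the similarity scores sorted in descending order.

```python
from core import pipeline
from pdf_loader import load_documents

# Placeholder query; load_documents returns [{'name': ..., 'content': ...}, ...]
results, sorted_scores = pipeline(
    "Data analyst familiar with SQL, Python and Tableau",
    load_documents(source_dir="documents"),
)

# Each result: {'rank': index of the resume in the input list, 'name': ..., 'similarity': ...}
for r in results:
    print(r["name"], round(r["similarity"], 3))
```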
demo.py ADDED
@@ -0,0 +1,26 @@
+ from pdf_loader import load_documents
+ from core import pipeline
+
+ if __name__ == '__main__':
+     pipeline('''About Sleek
+
+ Sleek is on a mission to revolutionize how entrepreneurs operate their business. We want to give small business owners peace of mind and the power of online solutions to allow them to focus on what they do best - growing their business. As we work for our thousands of customers, we gather millions of data points about their business, and in turn we transform those into useful, actionable insights and recommendations to accelerate their growth through smart algorithms.
+
+ We are a team of 400 builders from 17 countries, with offices in Singapore, Philippines, Hong Kong, Australia and the UK committed to delivering a delightful experience to our clients!
+
+ You will be working in the Data & Analytics organization to solve a wide range of business problems leveraging advanced analytics. You will deploy a flexible analytical skill set to deliver insightful data and analysis and model business scenarios. Your principal goal will be to use data to drive better business decisions. This means translating data into meaningful insights and recommendations and, where relevant, proactively implement improvements. You will be developing the business reporting and analysis for our internal operations world-wide. The job will require working closely with the various Business Units to understand their business question as well as the whole data team to understand and access available data.
+
+ Position Duties
+ Drive analytical problem-solving and deep dives. Work with large, complex data sets. Solve difficult, non-routine problems, applying advanced quantitative methods.
+ Collaborate with a wide variety of cross-functional partners to determine business needs, drive analytical projects from start to finish.
+ Align with involved stakeholders to set up dashboards and reports to drive data driven decision across all departments
+ Working very closely with our Data team, Tech and Product team to understand the business logic to generate accurate reports and correct analysis
+
+ Requirements
+
+ Performance Standards
+ Able to commit for a period of at least 4 months
+ Currently pursuing a degree in Business Science, Engineering or relevant disciplines with a focus on data.
+ Good knowledge in SQL, R and Python.
+ Experience in data visualization tools (Tableau, PowerBI, Google DataStudio or equivalent) will be an added advantage.''',
+     load_documents(source_dir='documents'))
embedding.py CHANGED
@@ -1,10 +1,10 @@
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sentence_transformers import SentenceTransformer
-
+ import os

  def embedding(documents, embedding='bert'):
      if embedding == 'bert':
-         sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
+         sbert_model = SentenceTransformer('bert-base-nli-mean-tokens', cache_folder=os.path.join(os.getcwd(), 'embedding'))

          document_embeddings = sbert_model.encode(documents)
          return document_embeddings
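Note: the added `cache_folder` argument makes SentenceTransformer store the downloaded `bert-base-nli-mean-tokens` weights under `./embedding` in the working directory (now listed in .gitignore), so later runs reuse the local copy instead of re-downloading the model.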
install.sh ADDED
@@ -0,0 +1,12 @@
+ #!/bin/bash
+ find . \( -name __pycache__ -o -name "*.pyc" \) -delete
+ python3 -m venv venv
+ # Check the operating system
+ if [[ "$OSTYPE" == "msys" ]]; then
+     # Windows
+     source venv/Scripts/activate
+ else
+     # Unix-like systems (macOS, Linux)
+     source venv/bin/activate
+ fi
+ pip install --no-cache-dir -r requirements.txt
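Note: the `source venv/.../activate` call only affects the shell running install.sh, so the virtual environment is not active in your own shell afterwards; activate it again (for example `source venv/bin/activate`) before running `gradio app.py` or `python demo.py`.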
main.py CHANGED
@@ -1,31 +1,7 @@
- from pdf_loader import load_documents
- from embedding import embedding
- from preprocessing import preprocess
- from sklearn.metrics.pairwise import cosine_similarity
- import numpy as np
-
-
- def rank_documents(input_doc, documents):
-     documents = np.insert(documents, 0, input_doc)
-     preprocessed_documents = preprocess(documents)
-     print("Encoding with BERT...")
-     documents_vectors = embedding(preprocessed_documents)
-     print("Encoding finished")
-     print(documents_vectors.shape)
-
-     pairwise = cosine_similarity(documents_vectors)
-
-     print('Resume ranking:')
-
-     sorted_idx = np.argsort(pairwise[0])[::-1]
-
-     for idx in sorted_idx[:10]:
-         if idx == 0:
-             continue
-         print(f'Resume of candidite {idx}')
-         print(f'Cosine Similarity: {pairwise[0][idx]}\n')
-
-
- if __name__ == '__main__':
-     rank_documents('I want a data scientist',
-         load_documents('documents'))
+ # from fastapi import FastAPI
+
+ # app = FastAPI()
+
+ # @app.get("/")
+ # async def root():
+ #     return {"message": "Hello World"}
pdf_loader.py CHANGED
@@ -1,7 +1,6 @@
  import os
  import PyPDF2

-
  def load_single_document(file_path: str):
      # Loads a single document from file path
      if file_path[-4:] == '.txt':
@@ -24,7 +23,18 @@ def load_single_document(file_path: str):
      raise Exception('Invalid file type')


- def load_documents(source_dir: str):
+ def load_documents(file_paths: list[str] = None, source_dir: str = None):
      # Loads all documents from source documents directory
-     all_files = os.listdir(source_dir)
-     return [load_single_document(f"{source_dir}/{file_path}") for file_path in all_files if file_path[-4:] in ['.txt', '.pdf', '.csv']]
+     if file_paths:
+         all_files = file_paths
+     elif source_dir:
+         all_files = [os.path.abspath(os.path.join(source_dir, file)) for file in os.listdir(source_dir) if os.path.isfile(os.path.join(source_dir, file))]
+     else:
+         raise Exception('No file paths or source directory provided')
+
+     return [
+         {
+             'name': os.path.basename(file_path),
+             'content': load_single_document(f"{file_path}")
+         } for idx, file_path in enumerate(all_files) if file_path[-4:] in ['.txt', '.pdf', '.csv']
+     ]
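For context, `load_documents` now accepts either explicit file paths (how the Gradio app calls it) or a source directory (how demo.py calls it). A minimal sketch, with hypothetical paths:

```python
from pdf_loader import load_documents

# Explicit paths (hypothetical), e.g. the temp files Gradio hands to inference():
docs = load_documents(file_paths=["/tmp/resume_a.pdf", "/tmp/resume_b.txt"])

# Or scan a directory, as demo.py does; files other than .txt/.pdf/.csv are skipped:
docs = load_documents(source_dir="documents")

# Each entry is {'name': <basename>, 'content': <extracted text>}
print(docs[0]["name"])
```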
preprocessing.py CHANGED
@@ -1,5 +1,6 @@

  import re
+ import os
  import unicodedata
  import nltk
  import inflect
@@ -7,9 +8,10 @@ from nltk import word_tokenize, sent_tokenize
  from nltk.corpus import stopwords
  from nltk.stem import LancasterStemmer, WordNetLemmatizer

- nltk.download('wordnet')
- nltk.download('stopwords')
-
+ download_path = os.path.join(os.getcwd(), 'nltk_packages')
+ nltk.data.path.append(download_path)
+ nltk.download('wordnet', download_dir=download_path)
+ nltk.download('stopwords', download_dir=download_path)

  def remove_non_ascii(words):
      """Remove non-ASCII characters from list of tokenized words"""
requirements.txt CHANGED
@@ -4,3 +4,8 @@ numpy==1.24.3
  PyPDF2==3.0.1
  scikit_learn==1.2.2
  sentence_transformers==2.2.2
+ fastapi
+ uvicorn[standard]
+ python-multipart
+ python-dotenv
+ gradio