khrek committed on
Commit
0d375ed
1 Parent(s): ca25308

Upload 8 files

Files changed (8)
  1. app.py +15 -0
  2. models.py +52 -0
  3. output_model.py +32 -0
  4. parser.py +103 -0
  5. reader.py +24 -0
  6. requirements.txt +483 -0
  7. sections.json +126 -0
  8. segmenter.py +107 -0
app.py ADDED
@@ -0,0 +1,15 @@
+ import gradio
+ from main import Main
+
+ main = Main()
+
+ def parse(cv):
+     return main.parse(cv.name)
+
+ description = "This is a demo of the resume parser. \
+ Upload a resume and it will return a JSON object with detailed parsed resume data."
+ article = "Demo of detailed resume parser"
+ file_input = gradio.inputs.File(file_count="single", type="file", label="Upload your pdf resume (en)")
+ iface = gradio.Interface(fn=parse, inputs=file_input, outputs="json",
+                          title="Detailed Resume Parser", description=description, article=article)
+ iface.launch()
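Note: main.py is imported above but is not among the eight uploaded files. A minimal sketch of what Main might look like, assuming it only wires ResumeReader (reader.py) to ResumeParser (parser.py); the wrapper itself is hypothetical:

    # main.py -- hypothetical wrapper, not part of this commit
    from reader import ResumeReader
    from parser import ResumeParser

    class Main:
        def __init__(self):
            self.reader = ResumeReader()
            self.parser = ResumeParser()

        def parse(self, pdf_path):
            # read the PDF into cleaned lines, then run the section parsers
            resume_lines = self.reader.read_pdf(pdf_path)
            return self.parser.parse(resume_lines)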
models.py ADDED
@@ -0,0 +1,52 @@
+ import torch
+ import sentencepiece  # required by the slow, sentencepiece-based tokenizer
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+ from langchain import PromptTemplate, LLMChain, HuggingFacePipeline
+ import ast
+
+ class Models():
+     def __init__(self) -> None:
+         self.template = """
+         A virtual assistant answers questions from a user based on the provided text.
+         USER: Text: {input_text}
+         ASSISTANT: I've read this text.
+         USER: What describes {entity_type} in the text?
+         ASSISTANT:
+         """
+         self.load_trained_models()
+
+     def load_trained_models(self):
+         #is it best to keep these in memory? why not pickle them?
+         checkpoint = "Universal-NER/UniNER-7B-all"
+
+         ner_model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.float16, offload_folder="offload", offload_state_dict=True)
+         tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False, padding="max_length")
+         #use a distinct name so the imported pipeline() function is not shadowed
+         ner_pipeline = pipeline(
+             "text-generation",  #task
+             model=ner_model,
+             max_length=1000,
+             tokenizer=tokenizer,
+             trust_remote_code=True,
+             do_sample=True,
+             top_k=10,
+             num_return_sequences=1
+         )
+
+         self.llm = HuggingFacePipeline(pipeline=ner_pipeline, model_kwargs={'temperature': 0})
+         self.prompt = PromptTemplate(template=self.template, input_variables=["input_text", "entity_type"])
+         self.llm_chain = LLMChain(prompt=self.prompt, llm=self.llm)
+
+     def extract_ner(self, context, entity_type):
+         #the model is expected to reply with a Python list literal, hence literal_eval
+         return ast.literal_eval(self.llm_chain.run({"input_text": context, "entity_type": entity_type}))
+
+     def get_ner(self, clean_lines, entity):
+         tokens = []
+         try_num = 0
+         while try_num < 5 and tokens == []:
+             tokens = self.extract_ner(' '.join(clean_lines), entity)
+             try_num += 1  #count attempts so the retry loop terminates
+         if len(tokens) == 0:
+             raise ValueError(f"Couldn't extract {entity}")
+         return tokens
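The prompt template follows the UniNER conversation format, and extract_ner assumes the model answers with a Python list literal such as ["Acme Corp"], which is why the reply goes through ast.literal_eval. A hedged usage sketch with made-up resume lines:

    # hypothetical usage; entity labels are free-form strings fed into the prompt
    models = Models()
    lines = ["Software Engineer at Acme Corp", "Jan 2020 - Dec 2022"]
    dates = models.get_ner(lines, "date")        # e.g. ["Jan 2020 - Dec 2022"]
    titles = models.get_ner(lines, "job title")  # e.g. ["Software Engineer"]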
output_model.py ADDED
@@ -0,0 +1,32 @@
+ from __future__ import annotations
+
+ from typing import Any, Dict, List, Optional, Union
+
+ from pydantic import BaseModel, Field
+
+ class Work_experience(BaseModel):
+     position: List[str]
+     company: List[str]
+     start_date: Optional[str] = ""
+     end_date: Optional[str] = ""
+     description: Optional[str] = ""
+     location: Optional[List[str]] = []
+
+
+ class Education(BaseModel):
+     degree: str = ""
+     major: List[str] = []
+     university: List[str] = []
+     start_date: Optional[str] = ""
+     end_date: Optional[str] = ""
+     location: Optional[List[str]] = []
+
+ class Basic_info(BaseModel):
+     name: str
+     email: Optional[str] = ""
+     phone: Optional[str] = ""
+
+ class ModelOutput(BaseModel):
+     basic_info: Basic_info
+     education: Optional[List[Education]] = None
+     work_experience: Optional[List[Work_experience]] = None
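These models are not imported anywhere else in this commit (the import in parser.py is commented out), but they document the intended output shape. A minimal validation sketch, assuming the pydantic v1 API pinned in requirements.txt (pydantic==1.10.13); the field values are made up:

    output = ModelOutput(
        basic_info=Basic_info(name="Jane Doe", email="jane@example.com"),
        education=[Education(degree="BSc", major=["Computer Science"])],
        work_experience=[Work_experience(position=["Engineer"], company=["Acme Corp"])],
    )
    print(output.json())  # serialize the validated result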
parser.py ADDED
@@ -0,0 +1,103 @@
+ from itertools import chain
+ from models import Models
+ #from output_model import ModelOutput, Work_experience
+ from segmenter import ResumeSegmenter
+ from flashtext import KeywordProcessor
+ from collections import defaultdict
+
+ class ResumeParser():
+     def __init__(self) -> None:
+         self.resumeSegmenter = ResumeSegmenter()
+         self.models = Models()
+
+     def get_date_index(self, clean_resume_lines, date):
+         indexes = [i for i, line in enumerate(clean_resume_lines) if date in line]
+         return indexes
+
+     #better suited to a utils file
+     def sort_tokens_table(self, tokens_data):
+         table = {}
+         for key, tokens in tokens_data:
+             for token in tokens:
+                 table[token] = key
+         return table
+
+     def split_work_exp(self, resume_lines, start_index, end_index, work_dates):
+         dates_indexes = [self.get_date_index(resume_lines[start_index:end_index], work_date) for work_date in work_dates]
+         dates_indexes = list(chain.from_iterable(dates_indexes))
+         dates_indexes = [i + start_index for i in dates_indexes]
+         #this list should be unique and ordered
+         dates_indexes = sorted(set([start_index + 1] + dates_indexes + [end_index]))
+
+         list_single_work_exp = []
+         for i in range(len(dates_indexes) - 1):
+             index = dates_indexes[i]
+             next_index = dates_indexes[i + 1]
+             section = resume_lines[index:next_index]
+             if len(section) == 0:
+                 continue
+             list_single_work_exp.append(section)
+         return list_single_work_exp
+
+     def extract_section_text(self, resume_lines, section_header="work_and_employment"):
+         text_segments, sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
+         start_index = sections[section_header][0]
+         end_index = sections[section_header][1]
+         #on the basis that dates would be unique
+         return start_index, end_index
+
+     def format_output(self, keywords, work_section_list, isWorkExp=True):
+         if isWorkExp:
+             headlines = [text[0] for text in work_section_list]
+         else:
+             headlines = work_section_list
+         table = self.sort_tokens_table(keywords)
+         tokens_processor = KeywordProcessor()
+         list_keywords = list(chain.from_iterable([tokens[1] for tokens in keywords]))
+         tokens_processor.add_keywords_from_list(list_keywords)
+         data = []
+         for i, header in enumerate(headlines):
+             current_data = defaultdict(list)
+             tokens = tokens_processor.extract_keywords(header)
+             for token in tokens:
+                 current_data[table[token]].append(token)
+             if isWorkExp:
+                 current_data["description"] = work_section_list[i][1:]
+             data.append(dict(current_data))
+         return data
+
+     def parse_work_history(self, resume_lines):
+         start_index, end_index = self.extract_section_text(resume_lines)
+         work_dates = self.models.get_ner(resume_lines[start_index:end_index], "date")
+         single_work_experiences = self.split_work_exp(resume_lines, start_index, end_index, work_dates)
+         job_positions = self.models.get_ner(resume_lines[start_index:end_index], "job title")
+         companies = self.models.get_ner(resume_lines[start_index:end_index], "company")
+         keywords = [("date", work_dates), ("title", job_positions), ("company", companies)]
+         return self.format_output(keywords, single_work_experiences)
+
+     def parse_education(self, resume_lines):
+         start_index, end_index = self.extract_section_text(resume_lines, "education_and_training")
+         tokens = ["degree", "university", "degree field", "date", "location"]
+         #collect an (entity, matches) pair per entity type instead of overwriting keywords
+         keywords = [(token, self.models.get_ner(resume_lines[start_index + 1:end_index], token)) for token in tokens]
+         output = self.format_output(keywords, resume_lines[start_index:end_index], False)
+         output = [res for res in output if res]
+         return output
+
+     def parse_basic_info(self, resume_lines):
+         start_index, end_index = self.extract_section_text(resume_lines, "basics_info")
+         #tokens = ["person", "email", "phone"]
+         tokens = ["person"]
+         keywords = [(token, self.models.get_ner(resume_lines[start_index:end_index], token)) for token in tokens]
+
+         output = {}
+         for token, result in keywords:
+             if len(result) > 0:
+                 output[token] = result[0]
+         return output
+
+     def parse(self, resume_lines):
+         jobs = self.parse_work_history(resume_lines)
+         education = self.parse_education(resume_lines)
+         basic_info = self.parse_basic_info(resume_lines)
+         return {"basic_info": basic_info, "education": education, "work_experience": jobs}
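A sketch of the expected call path, assuming reader.py supplies the cleaned lines; the file path is hypothetical:

    # hypothetical end-to-end call; keys match the dict returned by parse()
    from reader import ResumeReader
    from parser import ResumeParser

    reader = ResumeReader()
    resume_parser = ResumeParser()
    result = resume_parser.parse(reader.read_pdf("resume.pdf"))
    # result -> {"basic_info": {...}, "education": [...], "work_experience": [...]}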
reader.py ADDED
@@ -0,0 +1,24 @@
+ import pypdfium2 as pdfium
+ import re
+
+ class ResumeReader:
+
+     def clean_text(self, raw_text):
+         clean_text = re.sub(r'\n+', '\n', raw_text)
+         clean_text = clean_text.replace("\r", "\n")
+         clean_text = clean_text.replace("\t", " ")
+         clean_text = re.sub(r"\uf0b7", " ", clean_text)  #private-use bullet glyph
+         clean_text = re.sub(r'[^\x00-\x7F]+', '', clean_text)  #remove non-ascii
+         clean_text = re.sub(r"\(cid:\d{0,3}\)", " ", clean_text)  #PDF cid artifacts
+         clean_text = re.sub(r'• ', " ", clean_text)
+         return clean_text
+
+     def read_pdf(self, path_file):
+         raw_text = ""
+         pdf = pdfium.PdfDocument(path_file)
+         for page in pdf:
+             raw_text += page.get_textpage().get_text_range()
+         clean_text = self.clean_text(raw_text)
+         resume_lines = clean_text.splitlines(True)
+         resume_lines = [re.sub(r'\s+', ' ', line.strip()) for line in resume_lines if line.strip()]
+         return resume_lines
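A short usage sketch; the path is hypothetical:

    from reader import ResumeReader

    reader = ResumeReader()
    lines = reader.read_pdf("resume.pdf")
    # lines is a list of cleaned, single-spaced, non-empty strings, one per PDF line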
requirements.txt ADDED
@@ -0,0 +1,483 @@
+ absl-py==1.4.0
+ accelerate==0.24.1
+ aiohttp==3.8.6
+ aiosignal==1.3.1
+ alabaster==0.7.13
+ albumentations==1.3.1
+ altair==4.2.2
+ anyio==3.7.1
+ appdirs==1.4.4
+ argon2-cffi==23.1.0
+ argon2-cffi-bindings==21.2.0
+ array-record==0.5.0
+ arviz==0.15.1
+ astropy==5.3.4
+ astunparse==1.6.3
+ async-timeout==4.0.3
+ atpublic==4.0
+ attrs==23.1.0
+ audioread==3.0.1
+ autograd==1.6.2
+ Babel==2.13.1
+ backcall==0.2.0
+ beautifulsoup4==4.11.2
+ bidict==0.22.1
+ bigframes==0.13.0
+ bleach==6.1.0
+ blinker==1.4
+ blis==0.7.11
+ blosc2==2.0.0
+ bokeh==3.3.0
+ bqplot==0.12.42
+ branca==0.7.0
+ build==1.0.3
+ CacheControl==0.13.1
+ cachetools==5.3.2
+ catalogue==2.0.10
+ certifi==2023.7.22
+ cffi==1.16.0
+ chardet==5.2.0
+ charset-normalizer==3.3.2
+ chex==0.1.7
+ click==8.1.7
+ click-plugins==1.1.1
+ cligj==0.7.2
+ cloudpickle==2.2.1
+ cmake==3.27.7
+ cmdstanpy==1.2.0
+ colorcet==3.0.1
+ colorlover==0.3.0
+ colour==0.1.5
+ community==1.0.0b1
+ confection==0.1.3
+ cons==0.4.6
+ contextlib2==21.6.0
+ contourpy==1.2.0
+ cryptography==41.0.5
+ cufflinks==0.17.3
+ cupy-cuda11x==11.0.0
+ cvxopt==1.3.2
+ cvxpy==1.3.2
+ cycler==0.12.1
+ cymem==2.0.8
+ Cython==3.0.5
+ dask==2023.8.1
+ dataclasses-json==0.6.2
+ datascience==0.17.6
+ db-dtypes==1.1.1
+ dbus-python==1.2.18
+ debugpy==1.6.6
+ decorator==4.4.2
+ defusedxml==0.7.1
+ diskcache==5.6.3
+ distributed==2023.8.1
+ distro==1.7.0
+ dlib==19.24.2
+ dm-tree==0.1.8
+ docutils==0.18.1
+ dopamine-rl==4.0.6
+ duckdb==0.9.1
+ earthengine-api==0.1.377
+ easydict==1.11
+ ecos==2.0.12
+ editdistance==0.6.2
+ eerepr==0.0.4
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl#sha256=83276fc78a70045627144786b52e1f2728ad5e29e5e43916ec37ea9c26a11212
+ entrypoints==0.4
+ et-xmlfile==1.1.0
+ etils==1.5.2
+ etuples==0.3.9
+ exceptiongroup==1.1.3
+ fastai==2.7.13
+ fastcore==1.5.29
+ fastdownload==0.0.7
+ fastjsonschema==2.18.1
+ fastprogress==1.0.3
+ fastrlock==0.8.2
+ filelock==3.13.1
+ fiona==1.9.5
+ firebase-admin==5.3.0
+ flashtext==2.7
+ Flask==2.2.5
+ flatbuffers==23.5.26
+ flax==0.7.5
+ folium==0.14.0
+ fonttools==4.44.0
+ frozendict==2.3.8
+ frozenlist==1.4.0
+ fsspec==2023.6.0
+ future==0.18.3
+ gast==0.5.4
+ gcsfs==2023.6.0
+ GDAL==3.4.3
+ gdown==4.6.6
+ geemap==0.28.2
+ gensim==4.3.2
+ geocoder==1.38.1
+ geographiclib==2.0
+ geopandas==0.13.2
+ geopy==2.3.0
+ gin-config==0.5.0
+ glob2==0.7
+ google==2.0.3
+ google-api-core==2.11.1
+ google-api-python-client==2.84.0
+ google-auth==2.17.3
+ google-auth-httplib2==0.1.1
+ google-auth-oauthlib==1.0.0
+ google-cloud-bigquery==3.12.0
+ google-cloud-bigquery-connection==1.12.1
+ google-cloud-bigquery-storage==2.22.0
+ google-cloud-core==2.3.3
+ google-cloud-datastore==2.15.2
+ google-cloud-firestore==2.11.1
+ google-cloud-functions==1.13.3
+ google-cloud-iam==2.12.2
+ google-cloud-language==2.9.1
+ google-cloud-resource-manager==1.10.4
+ google-cloud-storage==2.8.0
+ google-cloud-translate==3.11.3
+ google-colab @ file:///colabtools/dist/google-colab-1.0.0.tar.gz#sha256=a7913e00463ccd8df75a61e36d8582af57905f6b05b88aa768c70a0d631990ef
+ google-crc32c==1.5.0
+ google-pasta==0.2.0
+ google-resumable-media==2.6.0
+ googleapis-common-protos==1.61.0
+ googledrivedownloader==0.4
+ graphviz==0.20.1
+ greenlet==3.0.1
+ grpc-google-iam-v1==0.12.6
+ grpcio==1.59.2
+ grpcio-status==1.48.2
+ gspread==3.4.2
+ gspread-dataframe==3.3.1
+ gym==0.25.2
+ gym-notices==0.0.8
+ h5netcdf==1.3.0
+ h5py==3.9.0
+ holidays==0.36
+ holoviews==1.17.1
+ html5lib==1.1
+ httpimport==1.3.1
+ httplib2==0.22.0
+ huggingface-hub==0.17.3
+ humanize==4.7.0
+ hyperopt==0.2.7
+ ibis-framework==6.2.0
+ idna==3.4
+ imageio==2.31.6
+ imageio-ffmpeg==0.4.9
+ imagesize==1.4.1
+ imbalanced-learn==0.10.1
+ imgaug==0.4.0
+ importlib-metadata==6.8.0
+ importlib-resources==6.1.1
+ imutils==0.5.4
+ inflect==7.0.0
+ iniconfig==2.0.0
+ install==1.3.5
+ intel-openmp==2023.2.0
+ ipyevents==2.0.2
+ ipyfilechooser==0.6.0
+ ipykernel==5.5.6
+ ipyleaflet==0.17.4
+ ipython==7.34.0
+ ipython-genutils==0.2.0
+ ipython-sql==0.5.0
+ ipytree==0.2.2
+ ipywidgets==7.7.1
+ itsdangerous==2.1.2
+ jax==0.4.20
+ jaxlib @ https://storage.googleapis.com/jax-releases/cuda11/jaxlib-0.4.20+cuda11.cudnn86-cp310-cp310-manylinux2014_x86_64.whl#sha256=01be66238133f884bf5adf15cd7eaaf8445f9d4b056c5c64df28a997a6aff2fe
+ jeepney==0.7.1
+ jieba==0.42.1
+ Jinja2==3.1.2
+ joblib==1.3.2
+ jsonpatch==1.33
+ jsonpickle==3.0.2
+ jsonpointer==2.4
+ jsonschema==4.19.2
+ jsonschema-specifications==2023.7.1
+ jupyter-client==6.1.12
+ jupyter-console==6.1.0
+ jupyter-server==1.24.0
+ jupyter_core==5.5.0
+ jupyterlab-pygments==0.2.2
+ jupyterlab-widgets==3.0.9
+ kaggle==1.5.16
+ keras==2.14.0
+ keyring==23.5.0
+ kiwisolver==1.4.5
+ langchain==0.0.334
+ langcodes==3.3.0
+ langsmith==0.0.63
+ launchpadlib==1.10.16
+ lazr.restfulclient==0.14.4
+ lazr.uri==1.0.6
+ lazy_loader==0.3
+ libclang==16.0.6
+ librosa==0.10.1
+ lida==0.0.10
+ lightgbm==4.1.0
+ linkify-it-py==2.0.2
+ llmx==0.0.15a0
+ llvmlite==0.41.1
+ locket==1.0.0
+ logical-unification==0.4.6
+ lxml==4.9.3
+ malloy==2023.1064
+ Markdown==3.5.1
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.3
+ marshmallow==3.20.1
+ matplotlib==3.7.1
+ matplotlib-inline==0.1.6
+ matplotlib-venn==0.11.9
+ mdit-py-plugins==0.4.0
+ mdurl==0.1.2
+ miniKanren==1.0.3
+ missingno==0.5.2
+ mistune==0.8.4
+ mizani==0.9.3
+ mkl==2023.2.0
+ ml-dtypes==0.2.0
+ mlxtend==0.22.0
+ more-itertools==10.1.0
+ moviepy==1.0.3
+ mpmath==1.3.0
+ msgpack==1.0.7
+ multidict==6.0.4
+ multipledispatch==1.0.0
+ multitasking==0.0.11
+ murmurhash==1.0.10
+ music21==9.1.0
+ mypy-extensions==1.0.0
+ natsort==8.4.0
+ nbclassic==1.0.0
+ nbclient==0.9.0
+ nbconvert==6.5.4
+ nbformat==5.9.2
+ nest-asyncio==1.5.8
+ networkx==3.2.1
+ nibabel==4.0.2
+ nltk==3.8.1
+ notebook==6.5.5
+ notebook_shim==0.2.3
+ numba==0.58.1
+ numexpr==2.8.7
+ numpy==1.23.5
+ oauth2client==4.1.3
+ oauthlib==3.2.2
+ opencv-contrib-python==4.8.0.76
+ opencv-python==4.8.0.76
+ opencv-python-headless==4.8.1.78
+ openpyxl==3.1.2
+ opt-einsum==3.3.0
+ optax==0.1.7
+ orbax-checkpoint==0.4.2
+ osqp==0.6.2.post8
+ packaging==23.2
+ pandas==1.5.3
+ pandas-datareader==0.10.0
+ pandas-gbq==0.17.9
+ pandas-stubs==1.5.3.230304
+ pandocfilters==1.5.0
+ panel==1.3.1
+ param==2.0.0
+ parso==0.8.3
+ parsy==2.1
+ partd==1.4.1
+ pathlib==1.0.1
+ pathy==0.10.3
+ patsy==0.5.3
+ peewee==3.17.0
+ pexpect==4.8.0
+ pickleshare==0.7.5
+ Pillow==9.4.0
+ pip-tools==6.13.0
+ platformdirs==3.11.0
+ plotly==5.15.0
+ plotnine==0.12.4
+ pluggy==1.3.0
+ polars==0.17.3
+ pooch==1.8.0
+ portpicker==1.5.2
+ prefetch-generator==1.0.3
+ preshed==3.0.9
+ prettytable==3.9.0
+ proglog==0.1.10
+ progressbar2==4.2.0
+ prometheus-client==0.18.0
+ promise==2.3
+ prompt-toolkit==3.0.39
+ prophet==1.1.5
+ proto-plus==1.22.3
+ protobuf==3.20.3
+ psutil==5.9.5
+ psycopg2==2.9.9
+ ptyprocess==0.7.0
+ py-cpuinfo==9.0.0
+ py4j==0.10.9.7
+ pyarrow==9.0.0
+ pyasn1==0.5.0
+ pyasn1-modules==0.3.0
+ pycocotools==2.0.7
+ pycparser==2.21
+ pyct==0.5.0
+ pydantic==1.10.13
+ pydata-google-auth==1.8.2
+ pydot==1.4.2
+ pydot-ng==2.0.0
+ pydotplus==2.0.2
+ PyDrive==1.3.1
+ PyDrive2==1.6.3
+ pyerfa==2.0.1.1
+ pygame==2.5.2
+ Pygments==2.16.1
+ PyGObject==3.42.1
+ PyJWT==2.3.0
+ pymc==5.7.2
+ pymystem3==0.2.0
+ PyOpenGL==3.1.7
+ pyOpenSSL==23.3.0
+ pyparsing==3.1.1
+ pypdfium2==4.24.0
+ pyperclip==1.8.2
+ pyproj==3.6.1
+ pyproject_hooks==1.0.0
+ pyshp==2.3.1
+ PySocks==1.7.1
+ pytensor==2.14.2
+ pytest==7.4.3
+ python-apt==0.0.0
+ python-box==7.1.1
+ python-dateutil==2.8.2
+ python-louvain==0.16
+ python-slugify==8.0.1
+ python-utils==3.8.1
+ pytz==2023.3.post1
+ pyviz_comms==3.0.0
+ PyWavelets==1.4.1
+ PyYAML==6.0.1
+ pyzmq==23.2.1
+ qdldl==0.1.7.post0
+ qudida==0.0.4
+ ratelim==0.1.6
+ referencing==0.30.2
+ regex==2023.6.3
+ requests==2.31.0
+ requests-oauthlib==1.3.1
+ requirements-parser==0.5.0
+ rich==13.6.0
+ rpds-py==0.12.0
+ rpy2==3.4.2
+ rsa==4.9
+ safetensors==0.4.0
+ scikit-image==0.19.3
+ scikit-learn==1.2.2
+ scipy==1.11.3
+ scooby==0.9.2
+ scs==3.2.4
+ seaborn==0.12.2
+ SecretStorage==3.3.1
+ Send2Trash==1.8.2
+ sentencepiece==0.1.99
+ shapely==2.0.2
+ six==1.16.0
+ sklearn-pandas==2.2.0
+ smart-open==6.4.0
+ sniffio==1.3.0
+ snowballstemmer==2.2.0
+ sortedcontainers==2.4.0
+ soundfile==0.12.1
+ soupsieve==2.5
+ soxr==0.3.7
+ spacy==3.6.1
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ Sphinx==5.0.2
+ sphinxcontrib-applehelp==1.0.7
+ sphinxcontrib-devhelp==1.0.5
+ sphinxcontrib-htmlhelp==2.0.4
+ sphinxcontrib-jsmath==1.0.1
+ sphinxcontrib-qthelp==1.0.6
+ sphinxcontrib-serializinghtml==1.1.9
+ SQLAlchemy==2.0.23
+ sqlglot==17.16.2
+ sqlparse==0.4.4
+ srsly==2.4.8
+ stanio==0.3.0
+ statsmodels==0.14.0
+ sympy==1.12
+ tables==3.8.0
+ tabulate==0.9.0
+ tbb==2021.10.0
+ tblib==3.0.0
+ tenacity==8.2.3
+ tensorboard==2.14.1
+ tensorboard-data-server==0.7.2
+ tensorflow==2.14.0
+ tensorflow-datasets==4.9.3
+ tensorflow-estimator==2.14.0
+ tensorflow-gcs-config==2.14.0
+ tensorflow-hub==0.15.0
+ tensorflow-io-gcs-filesystem==0.34.0
+ tensorflow-metadata==1.14.0
+ tensorflow-probability==0.22.0
+ tensorstore==0.1.45
+ termcolor==2.3.0
+ terminado==0.17.1
+ text-unidecode==1.3
+ textblob==0.17.1
+ tf-slim==1.1.0
+ thinc==8.1.12
+ threadpoolctl==3.2.0
+ tifffile==2023.9.26
+ tinycss2==1.2.1
+ tokenizers==0.14.1
+ toml==0.10.2
+ tomli==2.0.1
+ toolz==0.12.0
+ torch @ https://download.pytorch.org/whl/cu118/torch-2.1.0%2Bcu118-cp310-cp310-linux_x86_64.whl#sha256=a81b554184492005543ddc32e96469f9369d778dedd195d73bda9bed407d6589
+ torchaudio @ https://download.pytorch.org/whl/cu118/torchaudio-2.1.0%2Bcu118-cp310-cp310-linux_x86_64.whl#sha256=cdfd0a129406155eee595f408cafbb92589652da4090d1d2040f5453d4cae71f
+ torchdata==0.7.0
+ torchsummary==1.5.1
+ torchtext==0.16.0
+ torchvision @ https://download.pytorch.org/whl/cu118/torchvision-0.16.0%2Bcu118-cp310-cp310-linux_x86_64.whl#sha256=033712f65d45afe806676c4129dfe601ad1321d9e092df62b15847c02d4061dc
+ tornado==6.3.2
+ tqdm==4.66.1
+ traitlets==5.7.1
+ traittypes==0.2.1
+ transformers==4.35.0
+ triton==2.1.0
+ tweepy==4.14.0
+ typer==0.9.0
+ types-pytz==2023.3.1.1
+ types-setuptools==68.2.0.1
+ typing-inspect==0.9.0
+ typing_extensions==4.5.0
+ tzlocal==5.2
+ uc-micro-py==1.0.2
+ uritemplate==4.1.1
+ urllib3==2.0.7
+ vega-datasets==0.9.0
+ wadllib==1.3.6
+ wasabi==1.1.2
+ wcwidth==0.2.9
+ webcolors==1.13
+ webencodings==0.5.1
+ websocket-client==1.6.4
+ Werkzeug==3.0.1
+ widgetsnbextension==3.6.6
+ wordcloud==1.9.2
+ wrapt==1.14.1
+ xarray==2023.7.0
+ xarray-einstats==0.6.0
+ xgboost==2.0.1
+ xlrd==2.0.1
+ xxhash==3.4.1
+ xyzservices==2023.10.1
+ yarl==1.9.2
+ yellowbrick==1.5
+ yfinance==0.2.31
+ zict==3.0.0
+ zipp==3.17.0
sections.json ADDED
@@ -0,0 +1,126 @@
+ {
+     "section_headers": {
+         "objective": [
+             "career goal",
+             "objective",
+             "career objective",
+             "employment objective",
+             "professional objective",
+             "summary",
+             "summary of qualifications"
+         ],
+         "work_and_employment": [
+             "employment history",
+             "employment data",
+             "career summary",
+             "work history",
+             "working history",
+             "work experience",
+             "experience",
+             "professional experience",
+             "professional background",
+             "professional employment",
+             "additional experience",
+             "career related experience",
+             "professional employment history",
+             "related experience",
+             "relevant experience",
+             "programming experience",
+             "freelance",
+             "freelance experience",
+             "army experience",
+             "military experience",
+             "military background"
+         ],
+         "education_and_training": [
+             "academic background",
+             "academic experience",
+             "programs",
+             "courses",
+             "related courses",
+             "education",
+             "educational background",
+             "educational qualifications",
+             "educational training",
+             "education and training",
+             "training",
+             "academic training",
+             "academic qualification",
+             "professional training",
+             "course project experience",
+             "related course projects",
+             "internship experience",
+             "internships",
+             "apprenticeships",
+             "college activities",
+             "certifications",
+             "special training"
+         ],
+         "skills": [
+             "credentials",
+             "qualifications",
+             "areas of experience",
+             "areas of expertise",
+             "areas of knowledge",
+             "skills",
+             "other skills",
+             "other abilities",
+             "career related skills",
+             "professional skills",
+             "specialized skills",
+             "technical skills",
+             "computer skills",
+             "personal skills",
+             "computer knowledge",
+             "technologies",
+             "technical experience",
+             "proficiencies",
+             "languages",
+             "language competencies and skills",
+             "programming languages",
+             "competencies"
+         ],
+         "misc": [
+             "activities and honors",
+             "activities",
+             "affiliations",
+             "professional affiliations",
+             "associations",
+             "professional associations",
+             "memberships",
+             "professional memberships",
+             "athletic involvement",
+             "community involvement",
+             "references",
+             "civic activities",
+             "extra-curricular activities",
+             "professional activities",
+             "volunteer work",
+             "volunteer experience",
+             "additional information",
+             "interests"
+         ],
+         "accomplishments": [
+             "awards",
+             "achievement",
+             "awards and achievements",
+             "licenses",
+             "presentations",
+             "conference presentations",
+             "conventions",
+             "dissertations",
+             "exhibits",
+             "papers",
+             "publications",
+             "professional publications",
+             "research experience",
+             "research grants",
+             "projects",
+             "research projects",
+             "personal projects",
+             "current research interests",
+             "thesis",
+             "theses"
+         ]
+     }
+ }
segmenter.py ADDED
@@ -0,0 +1,107 @@
+ from flashtext import KeywordProcessor
+ import json
+
+ class ResumeSegmenter():
+     def __init__(self):
+         self.resume_segments = {
+             'objective': [],
+             'work_and_employment': [],
+             'education_and_training': [],
+             'skills': [],
+             'accomplishments': [],
+             'misc': []
+         }
+         self.resume_indices = []
+
+     def get_average_line_len(self, lines):
+         total = 0  #avoid shadowing the built-in sum()
+         for line in lines:
+             total += len(line)
+         return total / len(lines)
+
+     def get_average_words_per_line(self, lines):
+         total = 0
+         for line in lines:
+             #other stopwords too?
+             total += len(line.split(' '))
+         return total / len(lines)
+
+     def find_segment_indices(self, text_list):
+         with open(r"./sections.json") as f:
+             data = json.load(f)
+         section_headers = data["section_headers"]
+         keyword_processor = KeywordProcessor()
+         keyword_processor.add_keywords_from_dict(keyword_dict=section_headers)
+         average_words_per_line = self.get_average_words_per_line(text_list)
+
+         for i, line in enumerate(text_list):
+             if line[0].islower() or line[-1] == '.':
+                 continue
+             kys = keyword_processor.extract_keywords(line)
+             if len(kys) > 0:
+                 #other stopwords? from where? the nltk lib? a pos tagger?
+                 if len(line.split(" ")) > average_words_per_line * 0.75:
+                     continue
+                 #is it necessary to keep the actual raw keyword?
+                 self.resume_indices.append(i)
+                 self.resume_segments[kys[0]].append(i)
+
+     def slice_segments(self, lines):
+         sections = {}
+         if len(self.resume_indices) == 0:
+             return None
+
+         for section, points in self.resume_segments.items():
+             if len(points) == 0:
+                 continue
+             start_point = points[0]
+             tmp_end_point = points[-1]
+             end_point = self.resume_indices[min(self.resume_indices.index(tmp_end_point) + 1,
+                                                 len(self.resume_indices) - 1)]
+             if start_point == self.resume_indices[-1]:
+                 end_point = len(lines)
+             sections[section] = (start_point, end_point)
+         sections["basics_info"] = (0, self.resume_indices[0])
+         return sections
+
+     def get_interval_intersection(self, sections, interval):
+         #check every candidate section instead of returning on the first non-overlap
+         for section in sections:
+             s = section[1]
+             if s[0] >= interval[1] or interval[0] >= s[1]:
+                 continue
+             start = max(s[0], interval[0])
+             end = min(s[1], interval[1])
+             return [start, end], section
+         return None
+
+     def segment(self, resume_lines):
+         self.find_segment_indices(resume_lines)
+         sections = self.slice_segments(resume_lines)
+         if sections is None:
+             return {}
+         #what's the naming convention here, sections_list or list_sections?
+         sections_list = [(k, v) for k, v in sections.items() if len(v) > 0]
+         intersection_intervals = []
+
+         for i, s in enumerate(sections_list[:-1]):
+             result = self.get_interval_intersection(sections_list[i + 1:], s[1])
+             if result is None:
+                 continue
+             a, b = result
+             print(a, b, s[0])
+             intersection_intervals.append((a, b, s[0]))
+
+         if len(intersection_intervals) > 0:
+             print("there are intersections", intersection_intervals)
+             #needs a last pass of cleaning overlapping intervals with a zero-shot
+             #classifier + subtracting intervals
+         return sections
+
+     def get_parsed_sections(self, resume_lines):
+         text_segments = {}
+         sections = self.segment(resume_lines)
+         for header_title, section in sections.items():
+             lines = resume_lines[section[0]:section[1]]
+             text_segments[header_title] = lines
+         return text_segments, sections
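Usage sketch; segmentation quality depends on the header keywords in sections.json and the line-length heuristics above, so a real multi-line resume is needed for meaningful output. The file path is hypothetical:

    # hypothetical usage; sections.json must sit in the working directory
    from reader import ResumeReader
    from segmenter import ResumeSegmenter

    resume_lines = ResumeReader().read_pdf("resume.pdf")
    segmenter = ResumeSegmenter()
    text_segments, sections = segmenter.get_parsed_sections(resume_lines)
    # sections maps each detected header to a (start_line, end_line) index pair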