File size: 1,745 Bytes
e062e72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from sklearn import preprocessing

# Load the fine-tuned sequence-classification model and its matching tokenizer
# by Hugging Face model id. Both objects are module-level and are shared by the
# classification functions defined below.
model_name = "ahmedheakl/bert-resume-classification"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# The dataset is loaded only to recover the category names that the label
# encoder needs to translate class indices back into names.
dataset_id = 'ahmedheakl/resume-atlas'
# NOTE(review): this import sits mid-file rather than with the imports at the
# top; consider moving it up with the others.
from datasets import load_dataset

# NOTE(review): trust_remote_code=True permits running code shipped with the
# dataset repository -- confirm that this is actually required here.
ds = load_dataset(dataset_id, trust_remote_code=True)
label_column = "Category"

# Fit the label encoder on the training split's category column so that
# le.inverse_transform can map an integer class index to a category name.
# NOTE(review): presumably the encoder's class ordering matches the index
# ordering of the model's output logits -- verify against the model config.
le = preprocessing.LabelEncoder()
le.fit(ds['train'][label_column])

def classify_text(text):
    """Return the single most likely category name for ``text``.

    The text is tokenized (with truncation and padding), passed through the
    module-level ``model``, and the index of the highest-scoring class is
    mapped to a category name with the module-level label encoder ``le``.

    Args:
        text: The text to classify. If a batch of texts is passed, only the
            first row of the model output is used.

    Returns:
        The category name produced by ``le.inverse_transform`` for the
        predicted class index.

    NOTE(review): assumes the label encoder's class order matches the model's
    output index order -- confirm against the model configuration.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inference only: disable autograd so no computation graph is built.
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax is monotonic, so the arg-max of the logits is the arg-max of the
    # probabilities. Reduce over the class dimension of row 0 rather than over
    # the flattened tensor, so the index always lies within the class range.
    predicted_class_index = torch.argmax(outputs.logits[0], dim=-1).item()

    # Convert predicted class index to category name
    predicted_category = le.inverse_transform([predicted_class_index])[0]
    return predicted_category

# Multi-label classification: every category whose score clears the threshold
def classify_text_multi(text, threshold=0.95):
    """Return every category whose sigmoid score for ``text`` exceeds ``threshold``.

    The text is tokenized (with truncation and padding) and passed through the
    module-level ``model``. A sigmoid is applied to each logit independently,
    and each class index whose score is strictly greater than ``threshold`` is
    mapped to a category name with the module-level label encoder ``le``.

    Args:
        text: The text to classify. Only the first row of the model output is
            inspected.
        threshold: Score a class must exceed to be reported. Defaults to 0.95.

    Returns:
        A list of category names in ascending class-index order, or
        ``["Uncertain Prediction"]`` when no class exceeds the threshold.
    """
    inputs = tokenizer(text, return_tensors="pt",
                       truncation=True, padding=True)

    # Inference only: disable autograd so no computation graph is built.
    with torch.no_grad():
        outputs = model(**inputs)

    probabilities = torch.nn.functional.sigmoid(outputs.logits)

    # Row 0 holds the scores for `text`; collect the indices that pass.
    above_threshold = (probabilities[0] > threshold).tolist()
    selected = [idx for idx, hit in enumerate(above_threshold) if hit]

    if not selected:
        return ["Uncertain Prediction"]

    # Decode all selected indices in one call instead of one call per index.
    return list(le.inverse_transform(selected))