# -*- coding: utf-8 -*-
"""xlm-roberta-large.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/18YiC93vkjig-o550pHFJSB3bCQ7rhb4M
"""

!pip install transformers datasets seqeval huggingface_hub

# Standard library imports
import os                 # Provides functions for interacting with the operating system
import warnings           # Used to handle or suppress warnings
import numpy as np        # Essential for numerical operations and array manipulation
import torch              # PyTorch library for tensor computations and model handling
import ast                # Used for safe evaluation of strings to Python objects (e.g., parsing tokens)

# Hugging Face and Transformers imports
from datasets import load_dataset                     # Loads datasets for model training and evaluation
from transformers import (
    AutoTokenizer,                                   # Initializes a tokenizer from a pre-trained model
    DataCollatorForTokenClassification,              # Handles padding and formatting of token classification data
    TrainingArguments,                               # Defines training parameters like batch size and learning rate
    Trainer,                                         # High-level API for managing training and evaluation
    AutoModelForTokenClassification,                 # Loads a pre-trained model for token classification tasks
    get_linear_schedule_with_warmup,                 # Learning rate scheduler for gradual warm-up and linear decay
    EarlyStoppingCallback                           # Callback to stop training if validation performance plateaus
)

# Hugging Face Hub
from huggingface_hub import login                   # Allows logging in to Hugging Face Hub to upload models

# seqeval metrics for NER evaluation
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
# Provides precision, recall, F1-score, and classification report for evaluating NER model performance

# Log in to Hugging Face Hub (replace the placeholder with your own access token)
login(token="YOUR_HF_TOKEN")

# Disable WandB (Weights & Biases) logging to avoid unwanted log outputs during training
os.environ["WANDB_DISABLED"] = "true"

# Suppress warning messages to keep output clean, especially during training and evaluation
warnings.filterwarnings("ignore")

# Load the Azerbaijani NER dataset from Hugging Face
dataset = load_dataset("LocalDoc/azerbaijani-ner-dataset")
print(dataset)  # Display dataset structure (e.g., train/validation splits)

# Preprocessing function to format tokens and NER tags correctly
def preprocess_example(example):
    try:
        # Convert string of tokens to a list and parse NER tags to integers
        example["tokens"] = ast.literal_eval(example["tokens"])
        example["ner_tags"] = list(map(int, ast.literal_eval(example["ner_tags"])))
    except (ValueError, SyntaxError) as e:
        # Skip and log malformed examples, ensuring error resilience
        print(f"Skipping malformed example: {example['index']} due to error: {e}")
        example["tokens"] = []
        example["ner_tags"] = []
    return example

# Apply preprocessing to each dataset entry, ensuring consistent formatting
dataset = dataset.map(preprocess_example)
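
# Optional sanity check (a minimal sketch; assumes the "train" split shown by
# the print above): confirm tokens and tags were parsed into aligned lists.
sample = dataset["train"][0]
print(sample["tokens"][:5], sample["ner_tags"][:5])
assert len(sample["tokens"]) == len(sample["ner_tags"]), "token/tag length mismatch"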

# Initialize the tokenizer for multilingual NER using xlm-roberta-large
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")

# Function to tokenize input and align labels with tokenized words
def tokenize_and_align_labels(example):
    # Tokenize the sentence while preserving word boundaries for correct NER tag alignment
    tokenized_inputs = tokenizer(
        example["tokens"],            # List of words (tokens) in the sentence
        truncation=True,               # Truncate sentences longer than max_length
        is_split_into_words=True,      # Specify that input is a list of words
        padding="max_length",          # Pad to maximum sequence length
        max_length=128,                # Set the maximum sequence length to 128 tokens
    )

    labels = []                        # List to store aligned NER labels
    word_ids = tokenized_inputs.word_ids()  # Get word IDs for each token
    previous_word_idx = None           # Initialize previous word index for tracking

    # Loop through word indices to align NER tags with subword tokens
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)        # Set padding token labels to -100 (ignored in loss)
        elif word_idx != previous_word_idx:
            # First subword of a new word: assign that word's NER tag, guarding against out-of-range indices
            labels.append(example["ner_tags"][word_idx] if word_idx < len(example["ner_tags"]) else -100)
        else:
            labels.append(-100)        # Label subword tokens with -100 to avoid redundant labels
        previous_word_idx = word_idx   # Update previous word index

    tokenized_inputs["labels"] = labels  # Add labels to tokenized inputs
    return tokenized_inputs

# Apply tokenization and label alignment function to the dataset
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=False)
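
# Optional check: after alignment, every sequence should carry exactly one
# label per token position (128, matching the max_length set above).
check = tokenized_datasets["train"][0]
assert len(check["input_ids"]) == len(check["labels"]) == 128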

# Create a 90-10 split of the dataset for training and validation
tokenized_datasets = tokenized_datasets["train"].train_test_split(test_size=0.1)
print(tokenized_datasets)  # Output structure of split datasets

# Define a list of entity labels for NER tagging with B- (beginning) and I- (inside) markers
label_list = [
    "O",                  # Outside of a named entity
    "B-PERSON", "I-PERSON",         # Person name (e.g., "John" in "John Doe")
    "B-LOCATION", "I-LOCATION",     # Geographical location (e.g., "Paris")
    "B-ORGANISATION", "I-ORGANISATION", # Organization name (e.g., "UNICEF")
    "B-DATE", "I-DATE",             # Date entity (e.g., "2024-11-05")
    "B-TIME", "I-TIME",             # Time (e.g., "12:00 PM")
    "B-MONEY", "I-MONEY",           # Monetary values (e.g., "$20")
    "B-PERCENTAGE", "I-PERCENTAGE", # Percentage values (e.g., "20%")
    "B-FACILITY", "I-FACILITY",     # Physical facilities (e.g., "Airport")
    "B-PRODUCT", "I-PRODUCT",       # Product names (e.g., "iPhone")
    "B-EVENT", "I-EVENT",           # Named events (e.g., "Olympics")
    "B-ART", "I-ART",               # Works of art (e.g., "Mona Lisa")
    "B-LAW", "I-LAW",               # Laws and legal documents (e.g., "Article 50")
    "B-LANGUAGE", "I-LANGUAGE",     # Languages (e.g., "Azerbaijani")
    "B-GPE", "I-GPE",               # Geopolitical entities (e.g., "Europe")
    "B-NORP", "I-NORP",             # Nationalities, religious groups, political groups
    "B-ORDINAL", "I-ORDINAL",       # Ordinal indicators (e.g., "first", "second")
    "B-CARDINAL", "I-CARDINAL",     # Cardinal numbers (e.g., "three")
    "B-DISEASE", "I-DISEASE",       # Diseases (e.g., "COVID-19")
    "B-CONTACT", "I-CONTACT",       # Contact info (e.g., email or phone number)
    "B-ADAGE", "I-ADAGE",           # Common sayings or adages
    "B-QUANTITY", "I-QUANTITY",     # Quantities (e.g., "5 km")
    "B-MISCELLANEOUS", "I-MISCELLANEOUS", # Miscellaneous entities not fitting other categories
    "B-POSITION", "I-POSITION",     # Job titles or positions (e.g., "CEO")
    "B-PROJECT", "I-PROJECT"        # Project names (e.g., "Project Apollo")
]
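
# Optional consistency check (assumes ner_tags were parsed by preprocess_example):
# every integer tag in the dataset must index into label_list.
max_tag = max(max(tags, default=0) for tags in dataset["train"]["ner_tags"])
assert max_tag < len(label_list), f"tag id {max_tag} has no entry in label_list"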

# Initialize a data collator to handle padding and formatting for token classification
data_collator = DataCollatorForTokenClassification(tokenizer)

# Load a pre-trained model for token classification, adapted for NER tasks
model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-large",               # Base model (multilingual XLM-RoBERTa) for NER
    num_labels=len(label_list)        # Set the number of output labels to match NER categories
)
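
# Optional sanity check: the freshly initialized classification head should
# have one output per entry in label_list.
assert model.config.num_labels == len(label_list)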

# Define a function to compute evaluation metrics for the model's predictions
def compute_metrics(p):
    predictions, labels = p  # Unpack predictions and true labels from the input

    # Convert logits to predicted label indices by taking the argmax along the last axis
    predictions = np.argmax(predictions, axis=2)

    # Filter out special padding labels (-100) and convert indices to label names
    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Print a detailed classification report for each label category
    print(classification_report(true_labels, true_predictions))

    # Calculate and return key evaluation metrics
    return {
        # Precision measures the accuracy of predicted positive instances
        # Important in NER to ensure entity predictions are correct and reduce false positives.
        "precision": precision_score(true_labels, true_predictions),

        # Recall measures the model's ability to capture all relevant entities
        # Essential in NER to ensure the model captures all entities, reducing false negatives.
        "recall": recall_score(true_labels, true_predictions),

        # F1-score is the harmonic mean of precision and recall, balancing both metrics
        # Useful in NER for providing an overall performance measure, especially when precision and recall are both important.
        "f1": f1_score(true_labels, true_predictions),
    }
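
# Illustration on toy sequences (not from the dataset): seqeval scores whole
# entities, so a partially matched span earns no credit at all.
_toy_true = [["B-PERSON", "I-PERSON", "O"]]
_toy_pred = [["B-PERSON", "O", "O"]]
print(f1_score(_toy_true, _toy_pred))  # 0.0, because the PERSON span is only partially predicted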

# Set up training arguments for model training, defining essential training configurations
training_args = TrainingArguments(
    output_dir="./results",               # Directory to save model checkpoints and final outputs
    evaluation_strategy="epoch",          # Evaluate model on the validation set at the end of each epoch
    save_strategy="epoch",                # Save model checkpoints at the end of each epoch
    learning_rate=2e-5,                   # Set a low learning rate to ensure stable training for fine-tuning
    per_device_train_batch_size=128,      # Number of examples per batch during training, balancing speed and memory
    per_device_eval_batch_size=128,       # Number of examples per batch during evaluation
    num_train_epochs=12,                  # Number of full training passes over the dataset
    weight_decay=0.005,                   # Regularization term to prevent overfitting by penalizing large weights
    fp16=True,                            # Use 16-bit floating point for faster and memory-efficient training
    logging_dir='./logs',                 # Directory to store training logs
    save_total_limit=2,                   # Keep only the 2 latest model checkpoints to save storage space
    load_best_model_at_end=True,          # Load the best model based on metrics at the end of training
    metric_for_best_model="f1",           # Use F1-score to determine the best model checkpoint
    report_to="none"                      # Disable reporting to external services (useful in local runs)
)

# Initialize the Trainer class to manage the training loop with all necessary components
trainer = Trainer(
    model=model,                         # The pre-trained model to be fine-tuned
    args=training_args,                  # Training configuration parameters defined in TrainingArguments
    train_dataset=tokenized_datasets["train"],  # Tokenized training dataset
    eval_dataset=tokenized_datasets["test"],    # Tokenized validation dataset
    tokenizer=tokenizer,                 # Tokenizer used for processing input text
    data_collator=data_collator,         # Data collator for padding and batching during training
    compute_metrics=compute_metrics,     # Function to calculate evaluation metrics like precision, recall, F1
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)] # Stop training early if the F1 score doesn't improve for 5 consecutive epochs
)

# Begin the training process and capture the training metrics
training_metrics = trainer.train()

# Evaluate the model on the validation set after training
eval_results = trainer.evaluate()

# Print evaluation results, including precision, recall, and F1-score
print(eval_results)

# Define the directory where the trained model and tokenizer will be saved
save_directory = "./xlm-roberta-large"

# Save the trained model to the specified directory
model.save_pretrained(save_directory)

# Save the tokenizer to the same directory for compatibility with the model
tokenizer.save_pretrained(save_directory)
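
# Optional: publish the fine-tuned model to the Hugging Face Hub. The repo id
# below is a placeholder; substitute your own namespace before running.
# model.push_to_hub("your-username/xlm-roberta-large-azerbaijani-ner")
# tokenizer.push_to_hub("your-username/xlm-roberta-large-azerbaijani-ner")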

from transformers import pipeline

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForTokenClassification.from_pretrained(save_directory)

# Initialize the NER pipeline
device = 0 if torch.cuda.is_available() else -1
nlp_ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=device)
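
# Quick smoke test of the pipeline. Entity groups come back as "LABEL_<id>"
# because id2label was left at its default when the model was loaded, which
# is why label_mapping below translates them into readable tags.
print(nlp_ner("Shahla Khuduyeva və Pasha Sığorta şirkəti haqqında məlumat."))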

label_mapping = {f"LABEL_{i}": label for i, label in enumerate(label_list) if label != "O"}

def evaluate_model(test_texts, true_labels):
    predictions = []
    for i, text in enumerate(test_texts):
        pred_entities = nlp_ner(text)
        pred_labels = [label_mapping.get(entity["entity_group"], "O") for entity in pred_entities if entity["entity_group"] in label_mapping]
        if len(pred_labels) != len(true_labels[i]):
            print(f"Warning: Inconsistent number of entities in sample {i+1}. Adjusting predicted entities.")
            pred_labels = pred_labels[:len(true_labels[i])]
        predictions.append(pred_labels)
    if all(len(true) == len(pred) for true, pred in zip(true_labels, predictions)):
        precision = precision_score(true_labels, predictions)
        recall = recall_score(true_labels, predictions)
        f1 = f1_score(true_labels, predictions)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1-Score:", f1)
        print(classification_report(true_labels, predictions))
    else:
        print("Error: Could not align all samples correctly for evaluation.")

test_texts = ["Shahla Khuduyeva və Pasha Sığorta şirkəti haqqında məlumat."]
true_labels = [["B-PERSON", "B-ORGANISATION"]]
evaluate_model(test_texts, true_labels)