# -*- coding: utf-8 -*-
"""xlm-roberta-large.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/18YiC93vkjig-o550pHFJSB3bCQ7rhb4M
"""
!pip install transformers datasets seqeval huggingface_hub  # Install required libraries (Colab shell command)
# Standard library imports
import os # Provides functions for interacting with the operating system
import warnings # Used to handle or suppress warnings
import numpy as np # Essential for numerical operations and array manipulation
import torch # PyTorch library for tensor computations and model handling
import ast # Used for safe evaluation of strings to Python objects (e.g., parsing tokens)
# Hugging Face and Transformers imports
from datasets import load_dataset # Loads datasets for model training and evaluation
from transformers import (
    AutoTokenizer,                       # Initializes a tokenizer from a pre-trained model
    DataCollatorForTokenClassification,  # Pads and formats token classification batches
    TrainingArguments,                   # Defines training parameters like batch size and learning rate
    Trainer,                             # High-level API for managing training and evaluation
    AutoModelForTokenClassification,     # Loads a pre-trained model for token classification tasks
    get_linear_schedule_with_warmup,     # Learning rate scheduler with warm-up and linear decay
    EarlyStoppingCallback                # Callback to stop training if validation performance plateaus
)
# Hugging Face Hub
from huggingface_hub import login # Allows logging in to Hugging Face Hub to upload models
# seqeval metrics for NER evaluation
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
# Provides precision, recall, F1-score, and classification report for evaluating NER model performance
# Log in to Hugging Face Hub
login(token="YOUR_HF_TOKEN_HERE")  # Use your own Hugging Face access token; never commit a real token to shared code
# Disable WandB (Weights & Biases) logging to avoid unwanted log outputs during training
os.environ["WANDB_DISABLED"] = "true"
# Suppress warning messages to keep output clean, especially during training and evaluation
warnings.filterwarnings("ignore")
# Load the Azerbaijani NER dataset from Hugging Face
dataset = load_dataset("LocalDoc/azerbaijani-ner-dataset")
print(dataset) # Display dataset structure (e.g., train/validation splits)
# Preprocessing function to format tokens and NER tags correctly
def preprocess_example(example):
    try:
        # Convert the serialized token string to a list and parse NER tags to integers
        example["tokens"] = ast.literal_eval(example["tokens"])
        example["ner_tags"] = list(map(int, ast.literal_eval(example["ner_tags"])))
    except (ValueError, SyntaxError) as e:
        # Log and empty out malformed examples instead of crashing the whole map
        print(f"Skipping malformed example: {example['index']} due to error: {e}")
        example["tokens"] = []
        example["ner_tags"] = []
    return example
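# Quick sanity check of the parsing step on a hypothetical record that mirrors the
# dataset's serialized format (the field values below are illustrative, not real data):
_demo_raw = {"index": -1, "tokens": "['Bakı', 'şəhəri']", "ner_tags": "[3, 4]"}
_demo_parsed = preprocess_example(dict(_demo_raw))
assert _demo_parsed["tokens"] == ["Bakı", "şəhəri"]
assert _demo_parsed["ner_tags"] == [3, 4]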
# Apply preprocessing to each dataset entry, ensuring consistent formatting
dataset = dataset.map(preprocess_example)
# Initialize the tokenizer for multilingual NER using xlm-roberta-large
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
# Function to tokenize input and align labels with tokenized words
def tokenize_and_align_labels(example):
    # Tokenize the sentence while preserving word boundaries for correct NER tag alignment
    tokenized_inputs = tokenizer(
        example["tokens"],         # List of words (tokens) in the sentence
        truncation=True,           # Truncate sentences longer than max_length
        is_split_into_words=True,  # The input is a list of words, not a raw string
        padding="max_length",      # Pad to the maximum sequence length
        max_length=128,            # Maximum sequence length of 128 tokens
    )
    labels = []                             # Aligned NER labels
    word_ids = tokenized_inputs.word_ids()  # Source word index for each token
    previous_word_idx = None                # Track the previous word index
    # Walk over the tokens and align NER tags with subword pieces
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)  # Special tokens and padding are ignored in the loss
        elif word_idx != previous_word_idx:
            # Label only the first subword of each word; guard against out-of-range indices
            labels.append(example["ner_tags"][word_idx] if word_idx < len(example["ner_tags"]) else -100)
        else:
            labels.append(-100)  # Remaining subwords of the same word are ignored
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
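# To see the alignment on a toy input: only the first subword piece of each word
# keeps its NER tag, and every other position gets -100. The exact subword split
# depends on the SentencePiece vocabulary, and the tag values here (3 = B-LOCATION,
# 0 = O under the label list defined below) are illustrative.
_demo_enc = tokenize_and_align_labels({"tokens": ["Azərbaycanın", "paytaxtı"], "ner_tags": [3, 0]})
print(list(zip(tokenizer.convert_ids_to_tokens(_demo_enc["input_ids"]), _demo_enc["labels"]))[:8])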
# Apply tokenization and label alignment function to the dataset
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=False)
# Create a 90-10 split of the dataset for training and validation
tokenized_datasets = tokenized_datasets["train"].train_test_split(test_size=0.1)
print(tokenized_datasets) # Output structure of split datasets
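# Note: train_test_split shuffles at random on each run; for a reproducible split,
# a seed can be passed to the call above (the value here is an arbitrary choice), e.g.:
#   tokenized_datasets["train"].train_test_split(test_size=0.1, seed=42)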
# Define a list of entity labels for NER tagging with B- (beginning) and I- (inside) markers
label_list = [
    "O",                                   # Outside of a named entity
    "B-PERSON", "I-PERSON",                # Person name (e.g., "John" in "John Doe")
    "B-LOCATION", "I-LOCATION",            # Geographical location (e.g., "Paris")
    "B-ORGANISATION", "I-ORGANISATION",    # Organization name (e.g., "UNICEF")
    "B-DATE", "I-DATE",                    # Date entity (e.g., "2024-11-05")
    "B-TIME", "I-TIME",                    # Time (e.g., "12:00 PM")
    "B-MONEY", "I-MONEY",                  # Monetary values (e.g., "$20")
    "B-PERCENTAGE", "I-PERCENTAGE",        # Percentage values (e.g., "20%")
    "B-FACILITY", "I-FACILITY",            # Physical facilities (e.g., "Airport")
    "B-PRODUCT", "I-PRODUCT",              # Product names (e.g., "iPhone")
    "B-EVENT", "I-EVENT",                  # Named events (e.g., "Olympics")
    "B-ART", "I-ART",                      # Works of art (e.g., "Mona Lisa")
    "B-LAW", "I-LAW",                      # Laws and legal documents (e.g., "Article 50")
    "B-LANGUAGE", "I-LANGUAGE",            # Languages (e.g., "Azerbaijani")
    "B-GPE", "I-GPE",                      # Geopolitical entities (e.g., "Europe")
    "B-NORP", "I-NORP",                    # Nationalities, religious, or political groups
    "B-ORDINAL", "I-ORDINAL",              # Ordinal indicators (e.g., "first", "second")
    "B-CARDINAL", "I-CARDINAL",            # Cardinal numbers (e.g., "three")
    "B-DISEASE", "I-DISEASE",              # Diseases (e.g., "COVID-19")
    "B-CONTACT", "I-CONTACT",              # Contact info (e.g., email or phone number)
    "B-ADAGE", "I-ADAGE",                  # Common sayings or adages
    "B-QUANTITY", "I-QUANTITY",            # Quantities (e.g., "5 km")
    "B-MISCELLANEOUS", "I-MISCELLANEOUS",  # Entities that fit no other category
    "B-POSITION", "I-POSITION",            # Job titles or positions (e.g., "CEO")
    "B-PROJECT", "I-PROJECT"               # Project names (e.g., "Project Apollo")
]
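# Note: the model below is created with num_labels only, so its config maps class
# indices to generic names (LABEL_0, LABEL_1, ...); that is why the pipeline output
# is remapped by hand further down. An optional refinement (an assumption, not part
# of the original run) is to pass explicit mappings at load time so checkpoints and
# pipelines report human-readable tags directly:
#
#   id2label = {i: label for i, label in enumerate(label_list)}
#   label2id = {label: i for i, label in enumerate(label_list)}
#   model = AutoModelForTokenClassification.from_pretrained(
#       "xlm-roberta-large", num_labels=len(label_list),
#       id2label=id2label, label2id=label2id,
#   )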
# Initialize a data collator to handle padding and formatting for token classification
data_collator = DataCollatorForTokenClassification(tokenizer)
# Load a pre-trained model for token classification, adapted for NER tasks
model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-large",        # Multilingual XLM-RoBERTa base checkpoint
    num_labels=len(label_list)  # One output class per NER label
)
# Define a function to compute evaluation metrics for the model's predictions
def compute_metrics(p):
    predictions, labels = p  # Unpack logits and true labels
    # Convert logits to predicted label indices along the last axis
    predictions = np.argmax(predictions, axis=2)
    # Drop ignored positions (-100) and convert indices to label names
    true_labels = [
        [label_list[lab] for lab in label if lab != -100]
        for label in labels
    ]
    true_predictions = [
        [label_list[pred] for (pred, lab) in zip(prediction, label) if lab != -100]
        for prediction, label in zip(predictions, labels)
    ]
    # Print a detailed per-category classification report
    print(classification_report(true_labels, true_predictions))
    # Return the standard entity-level NER metrics: precision penalizes spurious
    # entities, recall penalizes missed ones, and F1 is their harmonic mean.
    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }
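# seqeval scores at the entity level, not the token level: a predicted span only
# counts if it matches the gold span exactly in both type and boundaries. A tiny
# illustration with hypothetical label sequences:
_demo_true = [["B-PERSON", "I-PERSON", "O"]]
_demo_pred = [["B-PERSON", "O", "O"]]
print(f1_score(_demo_true, _demo_pred))  # 0.0: the truncated PERSON span does not count as a match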
# Set up training arguments for model training, defining essential training configurations
training_args = TrainingArguments(
    output_dir="./results",           # Directory for checkpoints and final outputs
    evaluation_strategy="epoch",      # Evaluate on the validation set after each epoch
    save_strategy="epoch",            # Save a checkpoint after each epoch
    learning_rate=2e-5,               # Low learning rate for stable fine-tuning
    per_device_train_batch_size=128,  # Training batch size per device
    per_device_eval_batch_size=128,   # Evaluation batch size per device
    num_train_epochs=12,              # Full passes over the training data
    weight_decay=0.005,               # Regularization that penalizes large weights
    fp16=True,                        # 16-bit floats for faster, memory-efficient training
    logging_dir='./logs',             # Directory for training logs
    save_total_limit=2,               # Keep only the 2 most recent checkpoints
    load_best_model_at_end=True,      # Reload the best checkpoint when training ends
    metric_for_best_model="f1",       # Select the best checkpoint by F1 score
    report_to="none"                  # Disable reporting to external services
)
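# Note: get_linear_schedule_with_warmup is imported above but never attached; the
# Trainer already applies a linear decay schedule by default (warm-up is controlled
# by warmup_steps/warmup_ratio). A minimal sketch of wiring it in explicitly; the
# step counts are illustrative assumptions, not values from the original run:
#
#   from torch.optim import AdamW
#   optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)
#   scheduler = get_linear_schedule_with_warmup(
#       optimizer, num_warmup_steps=500, num_training_steps=10_000
#   )
#   trainer = Trainer(..., optimizers=(optimizer, scheduler))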
# Initialize the Trainer class to manage the training loop with all necessary components
trainer = Trainer(
    model=model,                                # The pre-trained model to fine-tune
    args=training_args,                         # Training configuration defined above
    train_dataset=tokenized_datasets["train"],  # Tokenized training split
    eval_dataset=tokenized_datasets["test"],    # Tokenized validation split
    tokenizer=tokenizer,                        # Tokenizer used for input processing
    data_collator=data_collator,                # Pads and batches examples
    compute_metrics=compute_metrics,            # Computes precision, recall, and F1
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]  # Stop early if F1 does not improve for 5 consecutive evaluations
)
# Begin the training process and capture the training metrics
training_metrics = trainer.train()
# Evaluate the model on the validation set after training
eval_results = trainer.evaluate()
# Print evaluation results, including precision, recall, and F1-score
print(eval_results)
# Define the directory where the trained model and tokenizer will be saved
save_directory = "./xlm-roberta-large"
# Save the trained model to the specified directory
model.save_pretrained(save_directory)
# Save the tokenizer to the same directory for compatibility with the model
tokenizer.save_pretrained(save_directory)
from transformers import pipeline
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForTokenClassification.from_pretrained(save_directory)
# Initialize the NER pipeline
device = 0 if torch.cuda.is_available() else -1
nlp_ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=device)
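# Quick smoke test on a short Azerbaijani sentence ("Baku is the capital of
# Azerbaijan."); the exact spans depend on the fine-tuned weights, and the entity
# names come out as generic LABEL_i identifiers, which are remapped just below.
print(nlp_ner("Bakı Azərbaycanın paytaxtıdır."))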
# The model config was created without id2label, so the pipeline reports generic
# names such as "LABEL_1"; map them back to human-readable tags (dropping "O")
label_mapping = {f"LABEL_{i}": label for i, label in enumerate(label_list) if label != "O"}
def evaluate_model(test_texts, true_labels):
    predictions = []
    for i, text in enumerate(test_texts):
        # Run the NER pipeline and map generic LABEL_i names back to tag names
        pred_entities = nlp_ner(text)
        pred_labels = [
            label_mapping.get(entity["entity_group"], "O")
            for entity in pred_entities
            if entity["entity_group"] in label_mapping
        ]
        # seqeval needs equal-length sequences; truncate extra predicted entities
        if len(pred_labels) != len(true_labels[i]):
            print(f"Warning: Inconsistent number of entities in sample {i+1}. Adjusting predicted entities.")
            pred_labels = pred_labels[:len(true_labels[i])]
        predictions.append(pred_labels)
    # Only score when every sample could be aligned with its gold labels
    if all(len(true) == len(pred) for true, pred in zip(true_labels, predictions)):
        precision = precision_score(true_labels, predictions)
        recall = recall_score(true_labels, predictions)
        f1 = f1_score(true_labels, predictions)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1-Score:", f1)
        print(classification_report(true_labels, predictions))
    else:
        print("Error: Could not align all samples correctly for evaluation.")
# A single hand-labeled sample: "Information about Shahla Khuduyeva and the Pasha Sığorta company."
test_texts = ["Shahla Khuduyeva və Pasha Sığorta şirkəti haqqında məlumat."]
true_labels = [["B-PERSON", "B-ORGANISATION"]]
evaluate_model(test_texts, true_labels)