In [None]:
# Install necessary libraries
!pip install datasets

# Importing required libraries for dataset, model, tokenizer, training, and evaluation
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
from IPython import get_ipython
from IPython.display import display

In [None]:
# 1. Load Dataset
# Load the Bengali transliteration dataset
raw_dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Split once into 90% training / 10% validation with a fixed seed for
# reproducibility. Both subsets are taken from this single split, so they are
# guaranteed to be complementary and the data is only partitioned one time.
dataset_splits = raw_dataset['train'].train_test_split(test_size=0.1, seed=42)
train_data = dataset_splits['train']
val_data = dataset_splits['test']

# 2. Preprocessing
# Checkpoint identifier; the tokenizer is loaded from this checkpoint
model_name = "google/mt5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Tokenize function for preprocessing the data
def preprocess_data(examples):
 """Tokenize source/target text and attach loss-ready labels.

 Args:
  examples: Mapping with a 'bn' entry (model input text) and an 'rm'
   entry (target text). Each entry is either a list of strings
   (batched call) or a single string (un-batched call).

 Returns:
  The tokenizer output for the 'bn' text with an added 'labels' entry
  holding the token ids of the 'rm' text. Padding positions in the
  labels are set to -100 so they are ignored by the loss.
 """
 # Tokenize input and target sequences with padding and truncation to fixed length
 inputs = tokenizer(examples['bn'], padding="max_length", truncation=True, max_length=128)
 targets = tokenizer(examples['rm'], padding="max_length", truncation=True, max_length=128)

 # Targets are padded to max_length; without masking, the model would be
 # trained (and evaluated) mostly on predicting pad tokens.
 pad_id = tokenizer.pad_token_id
 label_ids = targets['input_ids']
 if isinstance(examples['rm'], str):
  # Un-batched call: the tokenizer returned one flat list of ids
  inputs['labels'] = [-100 if tok == pad_id else tok for tok in label_ids]
 else:
  # Batched call: one list of ids per example
  inputs['labels'] = [
   [-100 if tok == pad_id else tok for tok in seq]
   for seq in label_ids
  ]
 return inputs

# Apply the preprocessing function to the training and validation datasets
train_dataset = train_data.map(preprocess_data, batched=True)
val_dataset = val_data.map(preprocess_data, batched=True)

# 3. Model Selection
# Load the T5 model for conditional generation
model = T5ForConditionalGeneration.from_pretrained(model_name)

# 4. Training
# Hyperparameters and bookkeeping options shared by every Transformers version
training_kwargs = dict(
 output_dir="./results", # Directory to store the results
 learning_rate=5e-5, # Learning rate
 per_device_train_batch_size=16, # Batch size for training
 per_device_eval_batch_size=16, # Batch size for evaluation
 num_train_epochs=5, # Number of training epochs
 weight_decay=0.01, # Weight decay for regularization
 save_steps=1000, # Save model every 1000 steps
 save_total_limit=2, # Keep only 2 most recent checkpoints
 logging_dir="./logs", # Directory to store logs
 logging_steps=500, # Log every 500 steps
)

# Evaluate after each epoch. Newer Transformers releases name this argument
# `eval_strategy`; older ones only accept `evaluation_strategy`. An unknown
# keyword raises TypeError before any setup work happens, so fall back then.
try:
 training_args = TrainingArguments(eval_strategy="epoch", **training_kwargs)
except TypeError:
 training_args = TrainingArguments(evaluation_strategy="epoch", **training_kwargs)


In [None]:
# Wire the model, the training configuration, and both dataset splits into a Trainer
trainer = Trainer(
 model=model, args=training_args,
 train_dataset=train_dataset, eval_dataset=val_dataset,
)

# Launch fine-tuning
trainer.train()