# File size: 13,666 Bytes

import optuna
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, TrainingArguments,
    Trainer, DataCollatorForLanguageModeling
)
import torch
from datasets import load_dataset
import numpy as np
import gc
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, Matern
import matplotlib.pyplot as plt
from scipy.stats import norm
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

from transformers import TrainerCallback

import argparse

# Configuration parameters
num_trials = 10  # Adjust this value to control the number of optimization trials
# Only the first 1000 rows of the "train_sft" split are loaded; this happens at
# import time, before any trial runs.
DATASET = load_dataset("BramVanroy/dolly-15k-dutch", split="train_sft[:1000]")
# Maximum number of tokens kept per conversation (passed as max_length with
# truncation enabled in prepare_chat_format).
CONTEXT_WINDOW = 1024

# Initialize tokenizer once
tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba2-1.2B")
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Padding is added on the left of each sequence.
# NOTE(review): right padding is the more common choice for training - confirm
# left padding is intended here.
tokenizer.padding_side = "left"

def prepare_chat_format(examples):
    """Tokenize a batch of conversations for use with ``Dataset.map``.

    Every entry of ``examples['messages']`` is passed through the tokenizer's
    chat template with tokenization enabled and truncation at
    ``CONTEXT_WINDOW`` tokens.  When the template call raises, the error is
    printed and the conversation is instead flattened into plain text (one
    ``<|role|>`` header line, the message content and a ``</s>`` terminator
    per message) and tokenized directly.

    Args:
        examples: Batch mapping with a ``'messages'`` entry; each conversation
            is an iterable of dicts with ``"role"`` and ``"content"`` keys.

    Returns:
        dict with the single key ``"input_ids"`` holding one tokenized
        conversation per input conversation, in input order.
    """

    def _segment(message):
        # One message rendered in the hand-written fallback layout.
        role = message["role"]
        content = message["content"]
        return f"<|{role}|>\n{content}</s>\n"

    token_lists = []
    for conversation in examples['messages']:
        try:
            encoded = tokenizer.apply_chat_template(
                conversation,
                tokenize=True,
                max_length=CONTEXT_WINDOW,
                truncation=True,
                return_tensors=None,
            )
        except Exception as e:
            print(f"Error applying chat template: {e}")
            print("Fallback format if chat template fails")
            flat_text = "".join(_segment(message) for message in conversation)
            encoded = tokenizer(
                flat_text,
                max_length=CONTEXT_WINDOW,
                truncation=True,
                return_tensors=None,
            )["input_ids"]
        token_lists.append(encoded)

    return {"input_ids": token_lists}

# Prepare dataset once
# Tokenization runs a single time at import, so every trial trains on the same
# pre-tokenized examples. All original columns are removed, leaving only the
# "input_ids" column returned by prepare_chat_format.
tokenized_dataset = DATASET.map(
    prepare_chat_format,
    batched=True,
    remove_columns=DATASET.column_names
)

def clear_memory():
    """Clear GPU memory between trials.

    Runs the Python garbage collector first and only then asks PyTorch to
    release its unused cached CUDA blocks.  The order matters: tensors that
    are only reachable through reference cycles are freed by ``gc.collect()``,
    and their memory can be handed back by ``empty_cache()`` only after that.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

class LossCallback(TrainerCallback):
    """Trainer callback that records each training loss reported in the logs."""

    def __init__(self):
        # Logged "loss" values, in the order the log events arrived.
        self.losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Store the ``"loss"`` entry of a log event; ignore events without one."""
        if logs is None or "loss" not in logs:
            return
        self.losses.append(logs["loss"])

def objective(trial):
    """Optuna objective: train for one epoch at a sampled learning rate.

    A fresh copy of the model is loaded for every trial, trained for one epoch
    on ``tokenized_dataset`` with the sampled learning rate, and scored by the
    mean of the last ~20% of training losses logged by the Trainer.

    Args:
        trial: Optuna trial used to sample ``learning_rate`` (log-uniform in
            [1e-6, 1e-4]) and to name the output directory.

    Returns:
        float: the mean of the trailing logged losses, or ``inf`` when
        training raised, no loss was logged, or the loss was NaN.  NaN is
        mapped to ``inf`` on purpose: Optuna records a NaN objective as a
        failed trial whose ``value`` is ``None``, which the reporting and GPR
        code further down the script cannot format or compare.
    """
    # Clear memory from previous trial
    clear_memory()

    lr = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)

    # Initialize model with fresh state
    torch.manual_seed(42)
    model = AutoModelForCausalLM.from_pretrained(
        "Zyphra/Zamba2-1.2B",
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
    model.config.pad_token_id = tokenizer.pad_token_id

    # Optimizer steps in the single epoch, used to scale logging and warmup.
    batch_size = 4
    grad_accum_steps = 8
    effective_batch_size = batch_size * grad_accum_steps  # 32 sequences per step
    total_steps = len(tokenized_dataset) // effective_batch_size

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"./optuna_runs/trial_{trial.number}",
        num_train_epochs=1,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=grad_accum_steps,
        logging_steps=max(total_steps // 20, 1),  # aim for ~20 loss samples
        learning_rate=lr,
        weight_decay=0.01,
        fp16=False,
        bf16=True,
        warmup_steps=total_steps // 10,
        save_steps=1000000,  # far beyond total_steps: no checkpoints are written
        save_total_limit=None,
        report_to="none",
        seed=42,
        dataloader_num_workers=4,
        gradient_checkpointing=True,
        max_grad_norm=1.0
    )

    print(f"\nTrial {trial.number}:")
    print(f"Learning rate: {lr}")
    print(f"Total steps: {total_steps}")
    print(f"Logging every {training_args.logging_steps} steps")

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    class CustomTrainer(Trainer):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.model = model

        def _move_model_to_device(self, model, device):
            # No-op: presumably keeps the Trainer from relocating a model that
            # was already placed by device_map="auto" - confirm.
            pass

    # Collects the "loss" entries the Trainer logs during training.
    loss_callback = LossCallback()

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
        callbacks=[loss_callback]
    )

    try:
        trainer.train()

        # Calculate mean of last 20% of losses (at least one value)
        losses = loss_callback.losses
        n_losses = max(len(losses) // 5, 1)
        final_losses = losses[-n_losses:]
        mean_loss = float(np.mean(final_losses)) if final_losses else float('inf')
    except Exception as e:
        print(f"Trial failed with error: {e}")
        mean_loss = float('inf')
    finally:
        # Single cleanup path for success and failure.  It runs after the
        # except clause has finished, so the caught exception (and the frames
        # its traceback keeps alive) is already released.
        del model
        del trainer
        clear_memory()

    # A diverged run logs NaN losses; report it as the worst possible score
    # instead of letting Optuna turn it into a failed trial with value None.
    if np.isnan(mean_loss):
        return float('inf')
    return mean_loss

# Create and run the study
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
    study_name="learning_rate_optimization"
)

study.optimize(objective, n_trials=num_trials)

# Print results
print(f"\nOptimization Results ({num_trials} trials):")
print("Best learning rate:", study.best_params["learning_rate"])
print("Best loss:", study.best_value)
print("\nAll trials:")
for trial in study.trials:
    # A trial that did not complete has value None (and may lack the sampled
    # parameter); formatting None with ":.4f" would raise, so print "n/a".
    trial_lr = trial.params.get("learning_rate")
    lr_text = "n/a" if trial_lr is None else f"{trial_lr:.2e}"
    loss_text = "n/a" if trial.value is None else f"{trial.value:.4f}"
    print(f"Learning rate: {lr_text}, Loss: {loss_text}")

# Save results
import json
results = {
    "best_learning_rate": study.best_params["learning_rate"],
    "best_loss": study.best_value,
    # One (learning_rate, loss) pair per trial; missing entries are stored as null.
    "all_trials": [
        (trial.params.get("learning_rate"), trial.value) for trial in study.trials
    ]
}
with open("lr_optimization_results.json", "w") as f:
    json.dump(results, f, indent=4)

# Plot optimization history
try:
    fig = optuna.visualization.plot_optimization_history(study)
    fig.show()
except Exception as e:
    # Plotting is best-effort; a failure here must not stop the script.
    print(f"Could not create visualization: {e}")

# Add sophisticated final optimization using Gaussian Process Regression
def optimize_final_lr(study):
    """Estimate a refined learning rate from the study with a Gaussian process.

    The finite losses of the trials are modelled as a function of
    log10(learning rate) by a Matern(nu=2.5) Gaussian process fitted to
    standardised losses.  The model is evaluated on a 1000-point grid spanning
    the observed learning rates, and two candidates are reported: the grid
    point with the lowest predicted loss, and the grid point with the highest
    expected improvement over the best observed loss.

    Trials without a value (``value is None``) or without a sampled
    ``learning_rate`` are skipped, as are trials whose loss is not finite.

    Args:
        study: Object exposing ``trials``; each trial provides ``params`` (a
            mapping) and ``value``.

    Returns:
        dict with float entries ``gpr_optimal_lr``, ``ei_optimal_lr``,
        ``predicted_loss`` and ``uncertainty``.  With no usable trial, or on
        any unexpected error, both learning rates are the default 2e-5 and the
        other entries are ``inf``.  With fewer than two usable trials, or when
        the GP fit fails, the best observed trial is returned with
        ``uncertainty`` set to ``inf``.
    """
    default_lr = 2e-5

    def _result(gpr_lr, ei_lr, loss, uncertainty):
        # Build the result dict with plain Python floats.
        return {
            'gpr_optimal_lr': float(gpr_lr),
            'ei_optimal_lr': float(ei_lr),
            'predicted_loss': float(loss),
            'uncertainty': float(uncertainty),
        }

    def _best_observed(X, y):
        # Fallback result: the trial with the lowest observed loss.
        idx = np.argmin(y)
        return _result(X[idx][0], X[idx][0], y[idx], float('inf'))

    try:
        # Keep only trials that carry both a learning rate and a value; a
        # single None would turn the arrays into dtype=object and make
        # np.isfinite raise.
        observed = [
            (trial.params['learning_rate'], trial.value)
            for trial in study.trials
            if trial.value is not None and 'learning_rate' in trial.params
        ]
        X = np.array([lr for lr, _ in observed], dtype=float).reshape(-1, 1)
        y = np.array([loss for _, loss in observed], dtype=float)

        # Check if we have any valid results
        valid_mask = np.isfinite(y)
        if not np.any(valid_mask):
            print("No valid trials found. Returning default learning rate.")
            return _result(default_lr, default_lr, float('inf'), float('inf'))

        # Filter out infinite / NaN losses
        X = X[valid_mask]
        y = y[valid_mask]

        # Ensure we have enough points for fitting
        if len(X) < 2:
            print("Not enough valid trials for GPR. Returning best observed value.")
            return _best_observed(X, y)

        # Learning rates were sampled log-uniformly, so model them in log space.
        X_log = np.log10(X)

        # Standardise the losses; the GP below assumes a zero-mean prior.
        y_mean = np.mean(y)
        y_std = np.std(y)
        if y_std == 0:
            y_std = 1
        y_normalized = (y - y_mean) / y_std

        kernel = ConstantKernel(1.0) * Matern(length_scale=1.0, nu=2.5)
        gpr = GaussianProcessRegressor(
            kernel=kernel,
            n_restarts_optimizer=10,
            random_state=42,
            normalize_y=False  # we're manually normalizing
        )

        try:
            gpr.fit(X_log, y_normalized)
        except np.linalg.LinAlgError:
            print("GPR fitting failed. Returning best observed value.")
            return _best_observed(X, y)

        # Fine grid over the observed range only (no extrapolation).
        X_pred_log = np.linspace(
            np.log10(X.min()), np.log10(X.max()), 1000
        ).reshape(-1, 1)

        y_pred_normalized, sigma = gpr.predict(X_pred_log, return_std=True)

        # Undo the standardisation so results are in loss units.
        y_pred = y_pred_normalized * y_std + y_mean
        sigma = sigma * y_std

        # Candidate 1: lowest predicted loss.
        best_idx = np.argmin(y_pred)
        optimal_lr = 10 ** X_pred_log[best_idx, 0]

        # Candidate 2: highest expected improvement (minimisation form).
        best_f = np.min(y)
        Z = (best_f - y_pred) / (sigma + 1e-9)  # small constant avoids division by zero
        ei = sigma * (Z * norm.cdf(Z) + norm.pdf(Z))
        ei_best_idx = np.argmax(ei)
        ei_optimal_lr = 10 ** X_pred_log[ei_best_idx, 0]

        return _result(optimal_lr, ei_optimal_lr, y_pred[best_idx], sigma[best_idx])

    except Exception as e:
        print(f"Optimization failed with error: {e}")
        return _result(default_lr, default_lr, float('inf'), float('inf'))

# Run the GPR-based refinement. If it fails (or printing its results fails),
# fall back to a fixed default learning rate.
try:
    final_optimization = optimize_final_lr(study)
    print("\nAdvanced Optimization Results:")
    print(f"GPR Optimal Learning Rate: {final_optimization['gpr_optimal_lr']:.2e}")
    print(f"Expected Improvement Optimal Learning Rate: {final_optimization['ei_optimal_lr']:.2e}")
    print(f"Predicted Loss: {final_optimization['predicted_loss']:.4f}")
    print(f"Uncertainty: {final_optimization['uncertainty']:.4f}")
except Exception as e:
    print(f"Final optimization failed: {e}")
    final_optimization = dict(
        gpr_optimal_lr=2e-5,
        ei_optimal_lr=2e-5,
        predicted_loss=float('inf'),
        uncertainty=float('inf'),
    )

# Merge the refinement into the results that get written to disk, coercing
# every entry to a plain float.
results.update({
    key: float(final_optimization[key])
    for key in ("gpr_optimal_lr", "ei_optimal_lr", "predicted_loss", "uncertainty")
})

# Visualization of the GPR results
def plot_gpr_results(study, final_optimization):
    """Plot trial losses with a GPR fit and save it as lr_optimization_plot.png.

    The Gaussian process is fitted the same way as in ``optimize_final_lr``
    (log10 learning rate, standardised losses, same kernel and seed), so the
    curve drawn here is the model whose optimum is marked by the vertical
    lines.  Trials without a value or with a non-finite loss are left out.

    Args:
        study: Object exposing ``trials``; each trial provides ``params`` (a
            mapping) and ``value``.
        final_optimization: dict with ``gpr_optimal_lr`` and
            ``ei_optimal_lr``; each is drawn as a vertical line when finite.

    Returns:
        None.  With fewer than two usable trials a message is printed and no
        file is written.
    """
    # Skip trials that never produced a value: a None entry would make the
    # array dtype=object and np.isfinite would raise.
    observed = [
        (trial.params['learning_rate'], trial.value)
        for trial in study.trials
        if trial.value is not None and 'learning_rate' in trial.params
    ]
    X = np.array([lr for lr, _ in observed], dtype=float).reshape(-1, 1)
    y = np.array([loss for _, loss in observed], dtype=float)

    # Keep finite losses only
    finite_mask = np.isfinite(y)
    X = X[finite_mask]
    y = y[finite_mask]

    # Check if we have enough valid points
    if len(X) < 2:
        print("Not enough valid points for GPR visualization")
        return

    # Prediction grid, log-spaced over the observed learning rates
    X_pred = np.logspace(np.log10(X.min()), np.log10(X.max()), 100).reshape(-1, 1)
    X_pred_log = np.log10(X_pred)

    # Standardise the losses exactly as optimize_final_lr does, so both GPs
    # describe the same posterior.
    y_mean = np.mean(y)
    y_std = np.std(y)
    if y_std == 0:
        y_std = 1
    y_normalized = (y - y_mean) / y_std

    kernel = ConstantKernel(1.0) * Matern(length_scale=1.0, nu=2.5)
    gpr = GaussianProcessRegressor(
        kernel=kernel,
        n_restarts_optimizer=10,
        random_state=42,
        normalize_y=False  # normalised manually above
    )
    gpr.fit(np.log10(X), y_normalized)

    # Predict and convert back to loss units
    y_pred_normalized, sigma = gpr.predict(X_pred_log, return_std=True)
    y_pred = y_pred_normalized * y_std + y_mean
    sigma = sigma * y_std

    plt.figure(figsize=(12, 6))
    plt.semilogx(X, y, 'ko', label='Valid Trials', markersize=8)
    plt.semilogx(X_pred, y_pred, 'b-', label='GPR Mean')
    # Band of +/- 2 standard deviations around the mean
    plt.fill_between(X_pred.ravel(),
                     y_pred - 2 * sigma,
                     y_pred + 2 * sigma,
                     color='blue',
                     alpha=0.2,
                     label='95% Confidence')

    # Only plot optimal lines if they are finite
    if np.isfinite(final_optimization['gpr_optimal_lr']):
        plt.axvline(final_optimization['gpr_optimal_lr'], color='r', linestyle='--',
                    label='GPR Optimal LR')
    if np.isfinite(final_optimization['ei_optimal_lr']):
        plt.axvline(final_optimization['ei_optimal_lr'], color='g', linestyle='--',
                    label='EI Optimal LR')

    plt.xlabel('Learning Rate')
    plt.ylabel('Loss')
    plt.title('Learning Rate Optimization Results with GPR')
    plt.legend()
    plt.grid(True)
    plt.savefig('lr_optimization_plot.png', dpi=300, bbox_inches='tight')
    plt.close()

# Draw the GPR plot; on success it is written to lr_optimization_plot.png.
plot_gpr_results(study, final_optimization)

# Save all results
# This overwrites the file written right after the study finished, now with
# the GPR fields added to `results`.
with open("lr_optimization_results.json", "w") as f:
    json.dump(results, f, indent=4)

# Store best learning rate as a variable for finetune.py to use
# (this is the best value observed by Optuna, not the GPR/EI estimate).
best_lr = study.best_params["learning_rate"]