[ { "Name": "batch_size_grokking", "Title": "Batch Size Grokking: Assessing the impact of the training batch size on the grokking phenomenon", "Experiment": "Modify the experiments to dynamically adjust the batch size during training, starting with a small batch size and gradually increasing it. This could potentially lead to faster generalization on the validation set.", "Interestingness": 6, "Feasibility": 4, "Novelty": 4, "novel": true }, { "Name": "model_architecture_grokking", "Title": "Impact of Transformer Architecture on Grokking Behavior", "Experiment": "Modify the Transformer class to accept different architectures (layers: 1-4, dimensions: 64-256, heads: 2-8). Create 5 configurations with approximately equal parameter counts. Run experiments for each architecture across all datasets. Compare grokking behavior, final accuracies, steps to 99% validation accuracy, and analyze learning curves across architectures.", "Interestingness": 8, "Feasibility": 7, "Novelty": 7, "novel": true }, { "Name": "cyclic_lr_grokking", "Title": "Cyclic Learning Rates and Gradient Dynamics in Grokking", "Experiment": "Implement a cyclic learning rate schedule alongside the existing warmup schedule. Run experiments for both schedules across all datasets. Compare grokking behavior, final accuracies, and steps to 99% validation accuracy. Plot learning rate vs. training step for each schedule alongside validation accuracy curves. Track and plot gradient norms throughout training to understand how different schedules affect optimization dynamics, particularly during the transition from memorization to generalization.", "Interestingness": 9, "Feasibility": 8, "Novelty": 8, "novel": true }, { "Name": "initialization_impact_grokking", "Title": "Structured vs. Random Initialization: Effects on Grokking Dynamics", "Experiment": "Modify the Transformer class to accept two initialization methods: Xavier/Glorot (random) and orthogonal (structured). Implement these initialization methods. 
Run experiments for both initializations across all datasets. Compare grokking behavior, final accuracies, steps to 99% validation accuracy, and analyze learning curves. Track and plot weight distribution changes and analyze model representations (e.g., attention patterns, hidden states) at key points during training (initial, peak training accuracy, lowest validation loss, and final) to understand how different initializations affect the transition from memorization to generalization.", "Interestingness": 9, "Feasibility": 8, "Novelty": 8, "novel": true }, { "Name": "task_complexity_grokking", "Title": "Grokking Dynamics Across Task Complexity: From Simple to Compound Operations", "Experiment": "Modify the AbstractDataset class to include new datasets for compound operations: (a+b)-c, (a-b)/c, and (a/b)+c. Implement these new datasets in the operation_mod_p_data function. Run experiments across all datasets, including the new compound ones. Compare grokking behavior, final accuracies, steps to 99% validation accuracy, and analyze learning curves. Plot these metrics against the complexity of the operation (measured by the number of component operations) to visualize how task complexity affects grokking.", "Interestingness": 9, "Feasibility": 6, "Novelty": 8, "novel": true }, { "Name": "data_imbalance_grokking", "Title": "Impact of Data Imbalance on Grokking Dynamics", "Experiment": "Modify the AbstractDataset class to introduce controlled imbalances in the training data. Implement a new function create_imbalanced_dataset(dataset, imbalance_factor) where imbalance_factor ranges from 0 (balanced) to 1 (highly imbalanced). Run experiments on ModDivisionDataset and PermutationGroup with imbalance_factors [0, 0.25, 0.5, 0.75, 0.9]. Modify train() and evaluate() functions to track overall accuracy, per-class accuracy, steps to 99% validation accuracy, and analyze learning curves. 
Plot these metrics against the imbalance factor to visualize how data distribution affects grokking. Analyze how imbalance impacts the transition from memorization to generalization for different operation types.", "Interestingness": 9, "Feasibility": 8, "Novelty": 8, "novel": true }, { "Name": "scheduled_regularization_grokking", "Title": "Scheduled Regularization: A Dynamic Approach to Enhance Grokking", "Experiment": "Implement a ScheduledRegularization class that increases weight decay linearly from 0 to a maximum value of 0.1 over the course of training. Modify the Transformer class to use this scheduled regularization. Run experiments with two conditions across all datasets: no regularization (baseline) and scheduled regularization. Compare grokking behavior by tracking training accuracy, validation accuracy, and loss. Identify the 'grokking point' where validation accuracy suddenly improves. Analyze how scheduled regularization affects the timing and abruptness of this grokking point. Plot weight distributions before and after the grokking point to understand how regularization influences the transition from memorization to generalization.", "Interestingness": 9, "Feasibility": 8, "Novelty": 8, "novel": true }, { "Name": "attention_grokking_analysis", "Title": "Quantifying Attention Shifts During Grokking", "Experiment": "Modify the last DecoderBlock to output attention weights. Update the Transformer's forward method to return these weights. Implement functions to: 1) Visualize attention as heatmaps, 2) Compute attention entropy, 3) Calculate the Frobenius norm of attention weight changes. Run experiments across all datasets, collecting these metrics at regular intervals. Plot attention entropy and weight changes alongside validation accuracy. Identify the 'grokking point' where validation accuracy improves suddenly, and analyze how attention metrics change around this point. 
Compare these patterns across different operations to characterize attention behavior during grokking.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "input_encoding_grokking", "Title": "Impact of Input Encodings on Grokking Dynamics in Mathematical Operations", "Experiment": "Modify AbstractDataset to support two encoding schemes: one-hot and binary. Update the Transformer class to handle these new input formats and output attention weights. Implement a new function select_encoding(encoding_type) to switch between encodings. Run experiments for each encoding across all datasets. Track training accuracy, validation accuracy, and loss. Define the 'grokking point' as the step where validation accuracy exceeds 95%. Compare grokking points, final accuracies, and learning curve shapes between encodings. Plot learning curves with grokking points marked for each encoding scheme side by side. Analyze attention patterns at the grokking point for each encoding. Evaluate how different encodings affect grokking speed, final performance, and internal representations across operation types.", "Interestingness": 9, "Feasibility": 6, "Novelty": 8, "novel": true }, { "Name": "loss_landscape_grokking", "Title": "Quantifying Loss Landscape Evolution During Grokking", "Experiment": "Implement a function to compute 2D loss landscapes using filter-wise normalization. Define the 'grokking point' as the step where validation accuracy exceeds 95%. Modify the training loop to save model checkpoints and compute loss landscapes at five key points: initial, 50% training accuracy, grokking point, 99% validation accuracy, and final. Focus on ModDivisionDataset and PermutationGroup datasets. Compute quantitative metrics of landscape geometry (\u03b5-sharpness and \u03b5-flatness as defined in Keskar et al., 2017) at each point. Plot these metrics alongside training and validation accuracy curves. 
Compare the evolution of loss landscape metrics across different operations and relate changes to the grokking phenomenon.", "Interestingness": 9, "Feasibility": 8, "Novelty": 8, "novel": true }, { "Name": "optimizer_impact_grokking", "Title": "Optimizer Impact on Grokking Dynamics: SGD vs Adam", "Experiment": "Modify the run() function to accept an optimizer type (SGD or Adam) and learning rate as arguments. Implement a grid search over optimizer types and learning rates (e.g., [1e-4, 1e-3, 1e-2] for SGD and [1e-5, 1e-4, 1e-3] for Adam). Run experiments for each combination across all datasets. Track and compare grokking behavior, including steps to 99% validation accuracy, sharpness of grokking transition, final accuracies, and computational cost. Plot learning curves and gradient norm evolution for each optimizer-learning rate combination. Analyze how different optimizers and learning rates affect the memorization-to-generalization transition. Implement a function to visualize weight distributions at key points (initial, pre-grokking, post-grokking, final) for each optimizer to understand how optimization strategy influences weight dynamics during grokking.", "Interestingness": 9, "Feasibility": 9, "Novelty": 8, "novel": true }, { "Name": "data_augmentation_grokking", "Title": "Impact of Data Augmentation on Grokking Dynamics in Mathematical Operations", "Experiment": "Modify AbstractDataset to include methods for operand reversal (for addition and multiplication) and operand negation (for addition, subtraction, and division) augmentations. Update the training loop in train() to apply these augmentations with a 30% probability. Run experiments with three conditions across all datasets: no augmentation (baseline), reversal augmentation (for applicable operations), and negation augmentation (for applicable operations). 
Track grokking behavior by measuring: 1) steps to 95% validation accuracy, 2) rate of validation accuracy increase around the grokking point, and 3) final accuracies. Plot learning curves and gradient norm evolution for each condition. Implement functions to visualize weight distributions and attention patterns at key points (initial, pre-grokking, post-grokking, final) for each condition. Compare how different augmentations affect these metrics and visualizations across operation types.", "Interestingness": 9, "Feasibility": 9, "Novelty": 8, "novel": true }, { "Name": "hidden_representation_evolution", "Title": "Tracking the Evolution of Hidden Representations During Grokking", "Experiment": "Modify the Transformer class to output hidden states from each layer. Update the training loop to save these hidden states at five key points: initial, pre-grokking (50% training accuracy), grokking point (95% validation accuracy), post-grokking (99% validation accuracy), and final. Implement functions to: 1) Apply PCA to visualize hidden states in 2D, 2) Compute cosine similarities between hidden states of different examples, 3) Analyze correct vs. incorrect predictions separately. Run experiments on ModDivisionDataset and PermutationGroup. Generate visualizations of hidden state evolution and plot similarity metrics alongside validation accuracy. Compare how representations of correct and incorrect predictions evolve differently across the grokking process.", "Interestingness": 9, "Feasibility": 8, "Novelty": 8, "novel": true }, { "Name": "mutual_information_grokking", "Title": "Mutual Information Dynamics During Grokking: Quantifying Representation Changes", "Experiment": "Modify the Transformer class to output hidden states from the first and last layers. Implement a function to estimate mutual information using PCA for dimensionality reduction followed by the histogram method (binning). Update the training loop to compute and save mutual information every 100 steps. 
Run experiments on ModDivisionDataset and PermutationGroup. Plot mutual information evolution alongside validation accuracy. Define the 'MI change rate' as the slope of mutual information vs. training steps. Compare MI change rates before and after the grokking point (defined as 95% validation accuracy). Analyze how these rates differ between the first and last layers, and between different operations.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "curriculum_learning_grokking", "Title": "Curriculum Learning's Impact on Grokking Dynamics in Mathematical Operations", "Experiment": "Modify AbstractDataset to sort examples based on operand magnitude. Implement a CurriculumSampler class that linearly increases the range of accessible examples during training, starting from [0, p/4] and reaching [0, p] by the end of training. Update the training loop to use this sampler for the curriculum condition. Run experiments on ModDivisionDataset and ModSumDataset with two conditions: no curriculum (baseline) and linear curriculum. Track grokking behavior by measuring steps to 95% validation accuracy, rate of validation accuracy increase around the grokking point, and final accuracies. Plot learning curves and gradient norm evolution for each condition. Analyze attention patterns at key points (pre-grokking, grokking point, post-grokking) to compare how internal representations evolve between curriculum and non-curriculum approaches.", "Interestingness": 9, "Feasibility": 8, "Novelty": 8, "novel": true }, { "Name": "robustness_grokking_correlation", "Title": "Correlating Grokking with Input Robustness in Mathematical Operations", "Experiment": "Implement a function to generate perturbed inputs by randomly changing one element of the input sequence. Modify evaluate() to compute accuracy on both clean and perturbed inputs. Update the training loop to evaluate robustness every 100 steps. Run experiments on ModDivisionDataset and PermutationGroup. 
Plot clean accuracy, perturbed input accuracy, and their difference over training steps. Define the 'robustness point' as the step where perturbed input accuracy exceeds 90% of clean accuracy. Compare this point with the grokking point (95% clean validation accuracy) across different operations. Analyze how the accuracy gap changes before, during, and after grokking.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "noisy_grokking", "Title": "Grokking Under Noise: Effects of Input Perturbations on Learning Dynamics and Robustness", "Experiment": "Implement a function add_gaussian_noise(inputs, noise_level) to add controlled noise to inputs. Modify train() to include noisy inputs with probabilities [0, 0.1, 0.3]. Run experiments on ModDivisionDataset and PermutationGroup for each noise level. Track grokking behavior by measuring steps to 95% validation accuracy, rate of validation accuracy increase around the grokking point, and final accuracies. Plot learning curves and gradient norm evolution for each condition. Analyze attention patterns at key points (pre-grokking, grokking point, post-grokking) to understand how noise affects internal representations. After training, evaluate models on test sets with varying noise levels [0, 0.1, 0.3, 0.5] to assess robustness. Compare how different training noise levels affect grokking dynamics, final model performance, and robustness to unseen noise levels.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "grokking_extrapolation", "Title": "Grokking and Extrapolation: Assessing Out-of-Distribution Generalization in Mathematical Operations", "Experiment": "Modify AbstractDataset to limit the range of numbers used in training (e.g., 0-25, 0-50, 0-75 for a prime p=97). Implement an ExtrapolationDataset class that generates examples using numbers outside the training range. 
Update the evaluation function to test on both in-distribution and out-of-distribution data. Run experiments on ModDivisionDataset, ModSumDataset, and PermutationGroup for each training range. Track in-distribution accuracy, out-of-distribution accuracy, and their difference over training steps. Define the 'extrapolation point' as the step where out-of-distribution accuracy exceeds 90% of in-distribution accuracy. Compare this point with the grokking point (95% in-distribution validation accuracy). Analyze how the accuracy gap changes before, during, and after grokking. Plot learning curves for both in-distribution and out-of-distribution data. Visualize attention patterns and weight distributions at key points (pre-grokking, grokking point, extrapolation point) for each operation type and training range. Compare extrapolation abilities across different operation types to identify operation-specific patterns in grokking and generalization.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "pruning_grokking_dynamics", "Title": "Pruning and Grokking: Investigating the Impact of Sparsity on Generalization Dynamics", "Experiment": "Implement an iterative magnitude pruning function prune_model(model, sparsity) that sets the smallest weights to zero. Modify the training loop to apply pruning every 100 steps and track gradient norms. Run experiments with sparsity levels [0, 0.3, 0.45, 0.6, 0.9] on ModDivisionDataset and PermutationGroup. Track grokking behavior by measuring the 'grokking point' (defined as the step where validation accuracy suddenly improves), steps to 95% validation accuracy, and final accuracies. Plot learning curves, sparsity levels, gradient norms, and weight magnitude distributions over time. Analyze attention patterns at key points (pre-grokking, grokking point, post-grokking) for each sparsity level. 
Compare how different sparsity levels affect grokking speed, the sharpness of the transition from memorization to generalization, and final performance.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "perturbation_robustness_grokking", "Title": "Grokking and Input Robustness: Exploring the Connection Between Generalization and Resilience to Perturbations", "Experiment": "Implement a simple input perturbation function add_uniform_noise(inputs, epsilon) to add controlled noise to inputs. Modify evaluate() to compute accuracy on both clean and perturbed inputs. Update the training loop to evaluate perturbation robustness every 100 steps, using epsilon=0.1. Run experiments on ModDivisionDataset and PermutationGroup. Plot clean accuracy, perturbed accuracy, and their ratio over training steps. Define the 'grokking point' as the step where clean validation accuracy exceeds 95%. Analyze how the accuracy ratio changes before, during, and after grokking. Compare the evolution of perturbation robustness across different operations to identify operation-specific patterns. For ModDivisionDataset only, implement a function to visualize 2D decision boundaries at key points (pre-grokking, grokking point, post-grokking) to understand how perturbation robustness relates to the model's learned decision boundaries.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "adversarial_robustness_grokking", "Title": "Grokking and Adversarial Robustness: Investigating the Relationship Between Sudden Generalization and Resilience to Adversarial Attacks", "Experiment": "Implement a Fast Gradient Sign Method (FGSM) attack function. Modify evaluate() to compute accuracy on both clean and adversarial inputs. Update the training loop to evaluate adversarial robustness every 500 steps. Define an adversarial robustness metric as the ratio of adversarial accuracy to clean accuracy. 
Run experiments on ModDivisionDataset and PermutationGroup, including a comparison with models trained on larger datasets (no grokking). Plot clean accuracy, adversarial robustness metric, and their evolution over training steps. Define the 'grokking point' as the step where clean validation accuracy exceeds 95%. Analyze how adversarial robustness changes before, during, and after grokking. Compare the evolution of adversarial robustness across different operations and between grokking and non-grokking models. Implement a function to visualize the distribution of adversarial perturbations at key points (pre-grokking, grokking point, post-grokking) to understand how the nature of successful adversarial attacks changes as the model groks the task.", "Interestingness": 9, "Feasibility": 7, "Novelty": 9, "novel": true }, { "Name": "mdl_grokking_correlation", "Title": "Minimum Description Length and Grokking: An Information-Theoretic Perspective on Sudden Generalization", "Experiment": "Implement a function estimate_mdl(model) using weight pruning to approximate the model's description length. Prune weights below a threshold and count remaining non-zero weights. Modify the training loop to compute MDL every 500 steps. Run experiments on ModDivisionDataset and PermutationGroup, including a baseline without MDL tracking. Plot MDL estimates alongside validation accuracy. Define the 'MDL transition point' as the step with the steepest decrease in MDL. Compare this point with the grokking point (95% validation accuracy). Analyze the correlation between MDL reduction and improvement in validation accuracy. 
Compare MDL evolution between grokking and non-grokking (baseline) scenarios.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "critical_period_grokking", "Title": "Critical Learning Periods in Grokking: Investigating Component-Specific Freezing on Generalization", "Experiment": "Modify the Transformer class to allow freezing of specific components (embeddings, attention layers, feed-forward layers) during training. Implement a freeze_components(model, components) function to freeze specified parts of the network. Run experiments on ModDivisionDataset and PermutationGroup with three conditions: early freezing (at 25% of training steps), mid freezing (at 50%), and no freezing (baseline). For each condition, freeze embeddings, then attention layers, then feed-forward layers in separate runs. Track grokking behavior by measuring: 1) steps to 95% validation accuracy, 2) 'grokking speed' (rate of validation accuracy increase in a 1000-step window around the grokking point), and 3) final accuracies. Plot learning curves for each condition and component-freezing combination. Implement a function to visualize weight distributions of frozen vs. unfrozen components at key points (pre-grokking, grokking point, post-grokking). Compare how different freezing schedules and component choices affect the timing, speed, and magnitude of grokking across different operations.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "lottery_ticket_grokking", "Title": "Lottery Tickets in Grokking: Uncovering Sparse Subnetworks During Sudden Generalization", "Experiment": "Implement one-shot magnitude pruning function to find potential lottery tickets. Apply pruning at three key points: pre-grokking (50% training accuracy), grokking point (95% validation accuracy), and post-grokking (99% validation accuracy). For each pruned network, retrain from the original initialization for a fixed number of steps (e.g., 1000). 
Define a 'winning ticket' as a pruned network that reaches 95% validation accuracy within these steps. Analyze properties of winning tickets (sparsity, weight distributions) at each point. Run experiments on ModDivisionDataset and PermutationGroup. Plot the minimum sparsity level that still produces a winning ticket at each point. Implement a function to visualize and compare attention patterns in winning tickets vs. full network. Calculate the overlap of winning tickets found at different points to quantify stability. Compare the emergence and properties of winning tickets across different operations to identify operation-specific patterns in grokking.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "invariance_learning_grokking", "Title": "Invariance Learning in Grokking: Tracking Symmetry Awareness During Sudden Generalization", "Experiment": "1) Implement an invariance_score(model, dataset) function that measures how consistent the model's outputs are under task-specific transformations (e.g., for ModDivisionDataset, (a/b) mod p == ((a+p)/(b+p)) mod p). 2) Modify the training loop to compute and log this score every 100 steps. 3) Run experiments on ModDivisionDataset, ModSumDataset, and PermutationGroup. 4) Plot the invariance score alongside validation accuracy and training loss. 5) Define the 'invariance point' as the step where the invariance score exceeds 90% of its maximum value. 6) Compare this point with the grokking point (95% validation accuracy) for each operation type. 7) Analyze the correlation between improvements in invariance score and validation accuracy. 
8) Implement a function to visualize attention patterns for 5 random examples at key points (pre-grokking, invariance point, grokking point, post-grokking) to understand how learned invariances manifest in the model's attention mechanisms.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "multi_task_grokking", "Title": "Multi-Task Learning and Grokking: Investigating Cross-Task Influences on Sudden Generalization", "Experiment": "1) Modify AbstractDataset to support two operations simultaneously: ModSumDataset and ModSubtractDataset. 2) Update Transformer class to have two task-specific output layers. 3) Implement a multi-task loss function that equally weights both tasks. 4) Modify training loop to alternate between tasks for each batch. 5) Run experiments comparing multi-task learning to single-task baselines. 6) Track and compare grokking points (95% validation accuracy), grokking speeds (rate of accuracy increase around grokking point), learning curves, and final performances between multi-task and single-task setups. 7) Analyze potential transfer effects by freezing shared layers after grokking on one task and fine-tuning on the other. 8) Visualize attention patterns for both tasks at key points: pre-grokking, grokking point, and post-grokking.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "gradient_flow_grokking", "Title": "Gradient Flow Analysis During Grokking: Uncovering Internal Dynamics of Sudden Generalization", "Experiment": "1) Modify the training loop to compute and store gradient L2 norm and cosine similarity between consecutive gradient updates for each layer. 2) Implement these computations every 100 steps during training. 3) Run experiments on ModDivisionDataset and PermutationGroup. 4) Plot gradient L2 norm and cosine similarity alongside validation accuracy. 
5) Define the 'gradient shift point' as the step where the moving average of cosine similarity drops below 0.5 for the first time. 6) Compare this point with the grokking point (95% validation accuracy). 7) Analyze layer-wise differences in gradient flow before, during, and after grokking. 8) Create a heatmap visualization of gradient L2 norm across layers and training steps.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "information_bottleneck_grokking", "Title": "Information Bottlenecks and Grokking: Exploring the Role of Information Compression in Sudden Generalization", "Experiment": "1) Modify the Transformer class to include a fixed-size bottleneck layer between the self-attention and feed-forward layers. 2) Implement experiments with different bottleneck sizes (e.g., 16, 32, 64, 128) while keeping the total parameter count constant. 3) Run experiments on ModDivisionDataset and PermutationGroup for each bottleneck size. 4) Track and compare grokking points (95% validation accuracy), grokking speeds, and final performances across different bottleneck sizes. 5) Plot learning curves for each bottleneck size alongside validation accuracy. 6) Analyze the activation patterns of the bottleneck layer at key points: pre-grokking, grokking point, and post-grokking. 7) Visualize weight distributions and attention patterns for different bottleneck sizes to understand how information compression affects internal representations. 
8) Analyze the relationship between bottleneck size and grokking characteristics (e.g., grokking point, speed, and final performance) to determine optimal information compression for sudden generalization.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "calibration_grokking_dynamics", "Title": "Calibration Dynamics During Grokking: Exploring Probability Estimation in Sudden Generalization", "Experiment": "1) Implement functions compute_ece(model, dataloader) and compute_mce(model, dataloader) to calculate Expected and Maximum Calibration Error. 2) Modify evaluate() to compute and return ECE and MCE alongside accuracy. 3) Update the training loop to track these metrics every 100 steps. 4) Define the 'grokking point' as the step where validation accuracy exceeds 95%. 5) Run experiments on ModDivisionDataset and PermutationGroup. 6) Plot ECE, MCE, and validation accuracy over training steps. 7) Implement a function to generate and save reliability diagrams every 500 steps, creating an animation of calibration evolution. 8) Analyze the relationship between improvements in calibration metrics and validation accuracy, particularly during the rapid accuracy increase around the grokking point. 9) Compare calibration dynamics between different operations to identify operation-specific patterns in grokking.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "positional_encoding_grokking", "Title": "Impact of Positional Encoding Schemes on Grokking Dynamics", "Experiment": "1) Modify the Transformer class to support two positional encoding schemes: sinusoidal and learned. 2) Implement these encoding schemes. 3) Update the training loop to use the specified encoding scheme. 4) Run experiments on ModDivisionDataset and PermutationGroup for each encoding scheme. 
5) Track and compare: a) grokking points (95% validation accuracy), b) grokking speeds (rate of accuracy increase in 1000-step window around grokking point), c) final performances. 6) Plot learning curves and gradient norms for each encoding scheme. 7) Implement a function to visualize attention patterns at key points (pre-grokking, grokking point, post-grokking) for each encoding scheme. 8) Test trained models on sequences longer than those seen during training to assess extrapolation capabilities. 9) Analyze and compare how different encoding schemes affect the suddenness and completeness of generalization, relating findings to the grokking phenomenon.", "Interestingness": 9, "Feasibility": 8, "Novelty": 8, "novel": true }, { "Name": "gradient_accumulation_grokking", "Title": "Impact of Gradient Accumulation on Grokking Dynamics and Computational Efficiency", "Experiment": "1) Modify the train() function to support gradient accumulation with a specified number of steps. 2) Implement experiments with different accumulation steps (e.g., 1, 2, 4, 8, 16) while keeping the total number of weight updates constant. 3) For each accumulation step, test multiple learning rates (e.g., 1e-4, 5e-4, 1e-3, 5e-3). 4) Run experiments on ModDivisionDataset and PermutationGroup for each accumulation step and learning rate combination. 5) Track and compare grokking points (95% validation accuracy), grokking speeds, final performances, and training time across different configurations. 6) Plot learning curves, gradient norms, and gradient statistics (mean and variance) for each configuration alongside validation accuracy. 7) Analyze the relationship between accumulation steps, learning rates, and grokking characteristics to determine optimal settings for sudden generalization. 8) Implement a function to visualize weight distributions at key points (pre-grokking, grokking point, post-grokking) for different configurations. 
9) Evaluate the computational efficiency gains from gradient accumulation on the given hardware. 10) Analyze the trade-off between grokking speed and computational efficiency to provide insights for practical applications.", "Interestingness": 9, "Feasibility": 8, "Novelty": 8, "novel": true }, { "Name": "neural_collapse_grokking", "Title": "Neural Collapse and Grokking: Investigating Feature Convergence During Sudden Generalization", "Experiment": "1) Modify the Transformer class to output features from the penultimate layer. 2) Implement functions to compute simplified neural collapse metrics: within-class and between-class feature distances using cosine similarity. 3) Update the training loop to compute and log these metrics every 100 steps. 4) Run experiments on ModDivisionDataset and PermutationGroup. 5) Plot neural collapse metrics alongside validation accuracy. 6) Define the 'collapse point' as the step where the ratio of between-class to within-class distance exceeds a threshold. 7) Compare this point with the grokking point (95% validation accuracy). 8) Analyze the correlation between changes in neural collapse metrics and validation accuracy around the grokking point.", "Interestingness": 9, "Feasibility": 6, "Novelty": 9, "novel": true }, { "Name": "hidden_state_clustering_grokking", "Title": "Evolution of Hidden State Clusters During Grokking: A Window into Neural Network Generalization", "Experiment": "1) Modify Transformer class to output hidden states from first and last layers. 2) Apply PCA to reduce hidden state dimensionality to 50. 3) Compute silhouette score on PCA-reduced states every 500 steps. 4) Run experiments on ModDivisionDataset and PermutationGroup. 5) Plot silhouette scores alongside validation accuracy. 6) Define 'cluster quality point' as step where silhouette score stabilizes. 7) Compare this point with grokking point (95% validation accuracy). 
8) Visualize first 2 PCA components of hidden states at key points (initial, pre-grokking, grokking point, post-grokking). 9) Analyze relationship between cluster quality dynamics and grokking behavior.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "double_descent_grokking", "Title": "Double Descent and Grokking: Exploring the Interplay Between Model Depth and Sudden Generalization", "Experiment": "1) Modify the Transformer class to easily scale the number of layers. 2) Implement a range of model depths, from 1 to 8 layers, keeping other hyperparameters constant. 3) Run experiments on ModDivisionDataset and PermutationGroup for each model depth, using multiple random seeds. 4) Track and plot training loss and validation accuracy throughout training for each model depth (number of layers). 5) Identify the grokking point (95% validation accuracy) for each model depth. 6) Create a single, comprehensive plot showing training loss, validation accuracy, and model depth on the same graph, highlighting grokking points. 7) Analyze how the grokking point relates to the different regions of the double descent curve (under-parameterized, critical, over-parameterized). 8) Compare grokking characteristics (speed, completeness) across different model depths. 9) Investigate whether grokking occurs more frequently or rapidly in specific regions of the double descent curve.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": false }, { "Name": "attention_evolution_grokking", "Title": "Evolution of Attention Patterns During Grokking: Unveiling the Mechanisms of Sudden Generalization", "Experiment": "1) Modify Transformer class to output attention weights from all layers. 2) Implement functions to visualize attention weights as heatmaps and to compute attention entropy and the Frobenius norm of attention changes. 
3) Update training loop to save attention weights at key points: initial, pre-grokking (50% training accuracy), grokking point (95% validation accuracy), and post-grokking (99% validation accuracy). 4) Run experiments on ModDivisionDataset and PermutationGroup. 5) Plot attention entropy, Frobenius norm of attention changes, and validation accuracy. 6) Generate and analyze heatmap visualizations of attention patterns for different input types/classes at each key point. 7) Compare attention evolution across different operations to identify common patterns in grokking.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "critical_learning_periods_grokking", "Title": "Critical Learning Periods in Grokking: Temporal Dynamics of Generalization", "Experiment": "1) Modify Transformer class to allow freezing of specific layers (embeddings, attention layers, feed-forward layers) during training. 2) Implement a freeze_layers(model, layers_to_freeze) function. 3) Run baseline experiments to determine typical grokking point. 4) Define freezing schedules: pre-grokking (50% of steps to typical grokking point), mid-grokking (at typical grokking point), post-grokking (50% of steps after typical grokking point), and no freezing (baseline). 5) Run experiments on ModDivisionDataset and PermutationGroup for each schedule, freezing embeddings, then attention layers, then feed-forward layers in separate runs. Use 5 different random seeds for each configuration. 6) Track grokking point (95% validation accuracy), grokking speed (rate of accuracy increase in 1000-step window around grokking point), and final performance. 7) Plot learning curves for each freezing schedule and layer combination. 8) Implement function to visualize weight distributions and analyze gradient flow in frozen vs. unfrozen layers at key points (pre-grokking, grokking point, post-grokking). 
9) Analyze how different freezing schedules affect grokking timing, speed, and magnitude across operations.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "emergent_modularity_grokking", "Title": "Emergent Modularity During Grokking: Analyzing Functional Specialization in Transformer Networks", "Experiment": "1) Modify Transformer class to output attention weights and feed-forward layer activations. 2) Implement functions to analyze attention patterns and identify recurring motifs. 3) Update training loop to save attention weights and activations every 100 steps. 4) Run experiments on ModDivisionDataset and PermutationGroup. 5) Apply k-means clustering to attention patterns and activations to identify emergent functional modules. 6) Compute module stability using cosine similarity between attention patterns at consecutive saved checkpoints (every 100 steps). 7) Plot the number of stable modules and average module stability alongside validation accuracy. 8) Define 'module emergence point' as the step where the number of stable modules plateaus. 9) Compare this point with the grokking point (95% validation accuracy). 10) Analyze how different emergent modules specialize to different input features or operations. 11) Visualize module interactions at key points: pre-grokking, grokking point, and post-grokking. 12) Compare emergent modular dynamics between different operations to identify common patterns in functional specialization during grokking.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "grokking_trigger_samples", "Title": "Grokking Trigger Samples: Identifying Key Data Points in Sudden Generalization", "Experiment": "1) Modify evaluate() to track samples that transition from high-confidence incorrect to high-confidence correct predictions. 2) Update the training loop to identify and store these 'grokking trigger' samples every 100 steps. 3) Run experiments on ModDivisionDataset and PermutationGroup. 
4) Plot the number of grokking trigger samples and their average confidence scores alongside validation accuracy. 5) Analyze the properties of grokking trigger samples (e.g., their difficulty, their representation in hidden layers) before and after the grokking point. 6) Implement a function to visualize the attention patterns for grokking trigger samples at key points: pre-grokking, grokking point, and post-grokking. 7) Compare the characteristics of grokking trigger samples across different operations to identify common patterns in grokking. 8) Experiment with fine-tuning on only the grokking trigger samples post-grokking to test their importance in maintaining generalization. 9) Compare the distribution of grokking trigger samples with the overall dataset distribution to identify any biases or patterns.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "effective_capacity_grokking", "Title": "Effective Capacity Dynamics During Grokking: Unraveling the Utilization of Model Parameters", "Experiment": "1) Implement a function estimate_effective_capacity(model) that counts parameters with gradients above a threshold. 2) Modify the training loop to compute and log effective capacity every 100 steps. 3) Run experiments on ModDivisionDataset and PermutationGroup. 4) Plot effective capacity alongside validation accuracy and training loss. 5) Define the 'capacity shift point' as the step with the steepest increase in effective capacity. 6) Compare this point with the grokking point (95% validation accuracy). 7) Analyze the correlation between effective capacity and validation accuracy using Pearson correlation. 8) Visualize the distribution of parameter gradients at key points: pre-grokking, capacity shift point, grokking point, and post-grokking. 
9) Compare effective capacity dynamics between different operations to identify common patterns in parameter utilization during grokking.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "critical_samples_grokking", "Title": "Critical Samples in Grokking: Identifying Key Data Points that Trigger Generalization", "Experiment": "1) Modify train() to track per-sample loss changes between epochs. 2) Implement a function identify_critical_samples(model, dataloader) that ranks samples based on their loss change impact. 3) Update the training loop to identify critical samples every epoch. 4) Run experiments on ModDivisionDataset and PermutationGroup. 5) Plot the stability of critical sample set and their average loss change alongside validation accuracy. 6) Implement a function to visualize the evolution of critical samples throughout training. 7) Analyze properties of critical samples (e.g., difficulty, representation in hidden layers) before and after grokking point. 8) Test the impact of removing or upweighting critical samples on grokking speed and final performance. 9) Compare critical samples across different operations to identify common patterns in grokking triggers.", "Interestingness": 9, "Feasibility": 8, "Novelty": 8, "novel": true }, { "Name": "relational_complexity_grokking", "Title": "Impact of Input Relational Complexity on Grokking Dynamics", "Experiment": "1) Modify AbstractDataset to include a compute_complexity(a, b) method. 2) Implement this method for ModDivisionDataset as (max(a,b) % min(a,b)) / min(a,b), and for ModSumDataset as abs(a-b) / max(a,b), defining the complexity as 0 whenever the denominator is 0. 3) Update fetch_example() to store complexity alongside each example. 4) Modify train() and evaluate() to track performance for three complexity levels: low [0, 0.33), medium [0.33, 0.67), high [0.67, 1]. 5) Run experiments on ModDivisionDataset and ModSumDataset with the new complexity measure. 
6) Plot learning curves for different complexity levels alongside overall validation accuracy. 7) Define 'complexity-specific grokking points' as the step where validation accuracy for a specific complexity level reaches 95%. 8) Calculate and plot the average number of steps to reach grokking point for each complexity level. 9) Analyze the correlation between complexity and grokking speed using Spearman's rank correlation coefficient.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "simplicity_bias_grokking", "Title": "Simplicity Bias in Grokking: Analyzing Function Complexity Evolution During Sudden Generalization", "Experiment": "1) Implement a function estimate_function_complexity(model) that computes the L1 norm of the model's weights as a proxy for function complexity. 2) Modify the training loop to compute and log function complexity every 100 steps. 3) Run experiments on ModDivisionDataset and PermutationGroup. 4) Plot estimated function complexity alongside validation accuracy and training loss. 5) Define the 'simplification point' as the step with the steepest decrease in function complexity. 6) Compare this point with the grokking point (95% validation accuracy). 7) Compute the generalization gap as the difference between training and validation accuracy, and analyze its correlation with function complexity using Pearson correlation. 8) For ModDivisionDataset only, visualize the decision boundaries of the learned function at key points: initial, pre-grokking, simplification point, grokking point, and post-grokking. 9) Compare the observed complexity dynamics to theoretical predictions from simplicity bias literature. 
10) Compare function complexity dynamics between different operations to identify common patterns in simplicity bias during grokking.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "proto_lottery_tickets_grokking", "Title": "Evolution of Proto-Lottery Tickets During Grokking: Tracking Emergent Subnetworks in Sudden Generalization", "Experiment": "1) Implement a function track_active_subnetwork(model, threshold) to identify the most active neurons and connections. 2) Modify the training loop to track the active subnetwork every 100 steps. 3) Implement a function analyze_subnetwork_stability(current_subnetwork, previous_subnetwork) to measure subnetwork stability over time. 4) Run experiments on ModDivisionDataset and PermutationGroup. 5) Plot subnetwork stability and performance alongside the full model's validation accuracy. 6) Define the 'subnetwork emergence point' as the step where subnetwork stability significantly increases. 7) Compare this point with the grokking point (95% validation accuracy). 8) Analyze the evolution of subnetwork size and composition before, during, and after the grokking point. 9) Implement a function to visualize the active subnetwork at key points in training. 10) Compare subnetwork evolution patterns across different operations to identify common characteristics in relation to grokking.", "Interestingness": 9, "Feasibility": 8, "Novelty": 8, "novel": true }, { "Name": "positional_bias_grokking", "Title": "Impact of Positional Inductive Biases on Grokking Dynamics in Transformers", "Experiment": "1) Modify Transformer class to support three positional encoding schemes: a) Learned (baseline), b) Sinusoidal, c) Relative. 2) Implement these encoding schemes ensuring similar parameter counts. 3) Run experiments on ModDivisionDataset and PermutationGroup for each variant. 4) Track and compare: grokking points (95% validation accuracy), grokking speeds, final performances. 
5) Plot learning curves and gradient norms for each variant. 6) Implement function to visualize attention patterns at key points (pre-grokking, grokking point, post-grokking) for each variant. 7) Analyze how different positional biases affect the suddenness and completeness of generalization. 8) Implement a function to compute positional influence scores by masking out positional information and measuring performance drop. 9) Test trained models on sequences with larger numerical values to assess extrapolation capabilities. 10) Compare computational efficiency of different variants in terms of time to grokking point.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "curriculum_grokking", "Title": "Curriculum Learning in Grokking: Optimizing Operation Sequences for Efficient Generalization", "Experiment": "1) Modify AbstractDataset to support multiple operations simultaneously. 2) Implement a CurriculumScheduler class that introduces operations in a specified order. 3) Update the training loop to use the CurriculumScheduler. 4) Run experiments comparing: a) Curriculum learning (easy to hard: ModSum -> ModSubtract -> ModDivision), b) Reverse curriculum (hard to easy), c) Random ordering, d) Simultaneous learning of all operations. 5) Track grokking points (95% validation accuracy) for each operation under different schedules. 6) Plot learning curves for each operation under different learning schedules. 7) Implement a 'transfer metric' to quantify how learning one operation affects the learning speed of subsequent operations. 8) Analyze how the introduction of new operations affects the grokking of previously learned operations. 9) Visualize attention patterns for different operations at key curriculum points. 
10) Compare final performance, grokking speed, and transfer metrics across different curricula.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "mutual_information_grokking", "Title": "Mutual Information Dynamics During Grokking: Tracking Information Flow in Key Network Layers", "Experiment": "1) Modify Transformer class to output representations from input embedding, middle attention layer, and final layer. 2) Implement a function estimate_mutual_information(X, Y) using a binning approach. 3) Update the training loop to compute and log mutual information between these key layers every 100 steps. 4) Run experiments on ModDivisionDataset and PermutationGroup. 5) Plot mutual information between key layers alongside validation accuracy and training loss. 6) Define the 'information shift point' as the step with the steepest change in mutual information. 7) Compare this point with the grokking point (95% validation accuracy). 8) Analyze the correlation between changes in mutual information and validation accuracy using Pearson correlation. 9) Create heatmap visualizations of mutual information at key points: initial, pre-grokking, information shift point, grokking point, and post-grokking. 10) Test the trained model on a separate held-out test set to analyze how mutual information relates to generalization performance. 11) Compare mutual information dynamics between different operations to identify common patterns in information flow during grokking.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "loss_landscape_curvature_grokking", "Title": "Loss Landscape Curvature Evolution During Grokking: Geometric Insights into Sudden Generalization", "Experiment": "1) Implement a function estimate_hessian_top_eigenvalue(model, dataloader) using the power iteration method. 2) Modify the training loop to compute and log the top eigenvalue of the Hessian every 1000 steps. 
3) Run experiments on ModDivisionDataset with 5 different random seeds. 4) Plot the top eigenvalue of the Hessian alongside validation accuracy and training loss. 5) Define the 'curvature transition point' as the step with the most significant change in the top eigenvalue. 6) Compare this point with the grokking point (95% validation accuracy). 7) Analyze the correlation between changes in curvature and validation accuracy using Pearson correlation. 8) Implement a function to visualize 1D loss landscape slices at key points: initial, pre-grokking, curvature transition point, grokking point, and post-grokking. 9) Analyze how the shape of the loss landscape evolves during the grokking process.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "adaptive_grokking_critical_periods", "Title": "Adaptive Discovery of Critical Learning Periods in Grokking", "Experiment": "1) Modify Transformer class to allow freezing of specific components (embeddings, individual attention layers, feed-forward layers). 2) Implement freeze_component(model, component, step) function. 3) Create an AdaptiveFreezeScheduler class that starts with a coarse grid of freezing times (25%, 50%, 75% of typical grokking time) and refines based on results. 4) Implement analyze_frozen_component(model, component) to study weight distributions and gradients of frozen components. 5) Run experiments on ModDivisionDataset and PermutationGroup using the adaptive scheduler, including a baseline no-freezing condition. 6) Track grokking point (95% validation accuracy), grokking speed, and final performance for each schedule and the baseline. 7) Plot learning curves and component analysis results for each freezing schedule and the baseline. 8) Identify 'critical periods' for each component where freezing has the most significant impact compared to the baseline. 9) Compare critical periods across different operations. 
10) Analyze the characteristics of components during their critical periods to understand the mechanisms behind grokking.", "Interestingness": 9, "Feasibility": 9, "Novelty": 9, "novel": true }, { "Name": "adversarial_robustness_evolution_grokking", "Title": "Evolution of Adversarial Robustness During Grokking: Linking Sudden Generalization with Model Robustness", "Experiment": "1) Implement a simple adversarial perturbation function add_uniform_noise(inputs, epsilon=0.1). 2) Modify evaluate() to compute accuracy on both clean and perturbed inputs. 3) Update the training loop to evaluate adversarial robustness every 500 steps. 4) Run experiments on ModDivisionDataset and PermutationGroup. 5) Plot clean accuracy, perturbed accuracy, and their ratio over training steps. 6) Define the 'robustness grokking point' as the step where the accuracy ratio suddenly improves. 7) Compare this point with the standard grokking point (95% clean validation accuracy). 8) Analyze the correlation between changes in clean accuracy and perturbed accuracy using Pearson correlation. 9) Implement a function to visualize the distribution of perturbation impacts at key points: initial, pre-grokking, grokking point, robustness grokking point (if different), and post-grokking. 10) Compare robustness evolution patterns across different operations to identify common characteristics in relation to grokking.", "Interestingness": 9, "Feasibility": 9, "Novelty": 9, "novel": true }, { "Name": "adaptive_memory_dynamics_grokking", "Title": "Adaptive Memory Dynamics During Grokking: Tracking Forgetting Patterns in Relation to Generalization", "Experiment": "1) Implement select_probe_set(model, dataloader, size=100) to choose examples the model struggles with. 2) Modify train() to update the probe set and evaluate on it at key phases: initial, pre-grokking (50% training accuracy), grokking point (95% validation accuracy), and post-grokking. 
3) Implement compute_example_difficulty(probe_accuracies) to measure how often each example is forgotten. 4) Run experiments on ModDivisionDataset and PermutationGroup. 5) Plot probe set accuracy, average example difficulty, and validation accuracy over training phases. 6) Analyze correlation between changes in example difficulty and validation accuracy improvements. 7) Visualize attention patterns for high-difficulty probe examples at each key phase. 8) Compare adaptive memory dynamics patterns across different operations.", "Interestingness": 9, "Feasibility": 9, "Novelty": 9, "novel": true }, { "Name": "noisy_grokking", "Title": "Grokking Under Noise: Investigating the Impact of Input Perturbations on Sudden Generalization", "Experiment": "1) Implement add_uniform_noise(inputs, noise_level) and add_gaussian_noise(inputs, noise_level) functions. 2) Modify train() to apply noise with probability p during training. 3) Update evaluate() to test on both clean and noisy inputs. 4) Run experiments on ModDivisionDataset and PermutationGroup with noise levels [0, 0.1, 0.3, 0.5] and noise probabilities [0, 0.1, 0.3, 0.5]. 5) Track and plot clean accuracy and noisy accuracy alongside training steps. 6) Compare grokking points (95% validation accuracy) and speeds for clean and noisy inputs. 7) Analyze how different noise levels and probabilities affect the timing and sharpness of the grokking transition. 8) Implement a function to visualize the distribution of model predictions for clean and noisy inputs at key points (pre-grokking, grokking point, post-grokking).", "Interestingness": 9, "Feasibility": 9, "Novelty": 9, "novel": true } ]