[ { "Name": "batch_size_grokking", "Title": "Batch Size Grokking: Assessing the impact of the training batchsize on the grokking phenomenon", "Experiment": "Modify the experiments to dynamically adjust the batch size during training, starting with a small batch size and gradually increasing it. This could potentially lead to faster generalization on the validation set.", "Interestingness": 6, "Feasibility": 4, "Novelty": 4, "novel": true }, { "Name": "weight_initialization_grokking", "Title": "Weight Initialization Grokking: Assessing the impact of weight initialization strategies on the grokking phenomenon", "Experiment": "Modify the `run` function to include different weight initialization strategies (Xavier, He, orthogonal) for the Transformer model. Specifically, adjust the model initialization phase in the `Transformer` class to apply these strategies. Compare these against the baseline (PyTorch default) by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 8, "Feasibility": 7, "Novelty": 7, "novel": true }, { "Name": "learning_rate_schedule_grokking", "Title": "Learning Rate Schedule Grokking: Assessing the impact of learning rate schedules on the grokking phenomenon", "Experiment": "Modify the `run` function to include different learning rate schedules such as cosine annealing, step decay, and cyclical learning rates. Specifically, adjust the optimizer and scheduler instantiation in the `run` function to apply these strategies. Compare these against the baseline (warmup followed by constant learning rate) by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 9, "Feasibility": 8, "Novelty": 7, "novel": true }, { "Name": "architecture_grokking", "Title": "Architecture Grokking: Assessing the impact of Transformer architectural configurations on the grokking phenomenon", "Experiment": "Modify the `run` function to include three specific configurations for the number of attention heads and layers in the Transformer model: Small (2 layers, 2 heads), Medium (4 layers, 4 heads), and Large (6 layers, 8 heads). Specifically, adjust the instantiation of the `Transformer` class to apply these configurations. Compare these against the baseline by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 8, "Feasibility": 7, "Novelty": 8, "novel": true }, { "Name": "data_augmentation_grokking", "Title": "Data Augmentation Grokking: Assessing the impact of data augmentation techniques on the grokking phenomenon", "Experiment": "Modify the `AbstractDataset` class to include specific data augmentation techniques: 1) Adding noise to the operands (e.g., small random integers), 2) Shuffling the order of operands, 3) Adding irrelevant tokens (e.g., extra symbols that do not change the operation). Implement these augmentations in the `fetch_train_example` and `fetch_val_example` methods. Compare these against the baseline by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. 
Evaluate the results for each dataset and seed combination.", "Interestingness": 8, "Feasibility": 8, "Novelty": 8, "novel": true }, { "Name": "regularization_grokking", "Title": "Regularization Grokking: Assessing the impact of regularization techniques on the grokking phenomenon", "Experiment": "Modify the `Transformer` class to include dropout layers with different dropout rates (e.g., 0.1, 0.3, 0.5). Adjust the `run` function to add L2 weight decay (e.g., 0.01, 0.05) to the optimizer and apply gradient clipping (e.g., clip gradients with norms above 1.0). Compare these against the baseline by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 9, "Feasibility": 8, "Novelty": 8, "novel": true }, { "Name": "data_split_grokking", "Title": "Data Split Grokking: Assessing the impact of training-validation data splits on the grokking phenomenon", "Experiment": "Modify the `run` function to include different fractions of data used for training (e.g., 0.1, 0.3, 0.5, 0.7, 0.9). Specifically, adjust the `training_fraction` parameter in the `get_data` function call. Compare these against the baseline (0.5) by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 8, "Feasibility": 8, "Novelty": 8, "novel": true }, { "Name": "sequence_length_grokking", "Title": "Sequence Length Grokking: Assessing the impact of input sequence length on the grokking phenomenon", "Experiment": "Modify the `run` function and `Transformer` class to include different input sequence lengths (e.g., 3, 5, 7). Adjust the input equation generation in the `AbstractDataset` class to match the new sequence lengths by appropriately padding or truncating the equations to ensure uniform length. Compare these against the baseline by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 8, "Feasibility": 7, "Novelty": 8, "novel": true }, { "Name": "activation_function_grokking", "Title": "Activation Function Grokking: Assessing the impact of different activation functions on the grokking phenomenon", "Experiment": "Modify the `DecoderBlock` class to accept different activation functions (ReLU, GELU, Tanh). Implement a parameter in the `Transformer` class to select the activation function during initialization. Specifically, add this parameter to the `__init__` method and pass it to the `DecoderBlock`. Measure the final training and validation accuracy, loss, the number of steps to reach 99% validation accuracy, and the variance in loss/accuracy across epochs. Evaluate the results for each dataset and seed combination.", "Interestingness": 9, "Feasibility": 8, "Novelty": 8, "novel": true }, { "Name": "optimizer_grokking", "Title": "Optimizer Grokking: Assessing the impact of optimization algorithms on the grokking phenomenon", "Experiment": "Modify the `run` function to include different optimization algorithms such as Adam, SGD, RMSprop, and Adagrad. Adjust the optimizer instantiation in the `run` function to apply these strategies. Compare these against the baseline (AdamW) by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. 
Evaluate the results for each dataset and seed combination.", "Interestingness": 8, "Feasibility": 8, "Novelty": 8, "novel": true }, { "Name": "loss_function_grokking", "Title": "Loss Function Grokking: Assessing the impact of different loss functions on the grokking phenomenon", "Experiment": "Modify the `run` function to include different loss functions such as Focal Loss, Huber Loss, and a custom loss function that penalizes incorrect predictions more steeply. Implement these loss functions in the `train` and `evaluate` functions. Compare these against the baseline (Cross-Entropy Loss) by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 9, "Feasibility": 7, "Novelty": 9, "novel": true }, { "Name": "intermediate_evaluation_grokking", "Title": "Intermediate Evaluation Grokking: Assessing the impact of regular intermediate evaluations on understanding the grokking phenomenon", "Experiment": "Modify the `run` function to include periodic evaluations of the model's performance during training. Specifically, adjust the training loop to evaluate and log the model's accuracy and loss on both the training and validation sets every 50 steps. Store these intermediate results and analyze them to identify patterns leading up to the grokking phase. Visualize the results to understand the temporal dynamics of the grokking phenomenon.", "Interestingness": 9, "Feasibility": 9, "Novelty": 8, "novel": true }, { "Name": "attention_mechanism_grokking", "Title": "Attention Mechanism Grokking: Assessing the impact of different attention mechanisms on the grokking phenomenon", "Experiment": "Modify the `DecoderBlock` class to include different attention mechanisms (sparse attention, local attention). Implement a parameter in the `Transformer` class to select the attention mechanism during initialization. Specifically, add this parameter to the `__init__` method and pass it to the `DecoderBlock`. Measure the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "meta_learning_grokking", "Title": "Meta-Learning Grokking: Assessing the impact of meta-learning strategies on the grokking phenomenon", "Experiment": "Modify the `run` function to include a simpler meta-learning algorithm like Reptile. Specifically, adjust the training loop to include both inner-loop (task-specific) training and outer-loop (meta) training, where Reptile's update rule is applied. Compare these against the baseline by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 10, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "early_stopping_grokking", "Title": "Early Stopping Grokking: Assessing the impact of early stopping criteria on the grokking phenomenon", "Experiment": "Modify the `run` function to include different early stopping criteria: 1) Patience-based stopping where training stops if validation accuracy does not improve for a specified number of epochs (e.g., 10); 2) Threshold-based stopping where training stops if validation accuracy exceeds a certain threshold (e.g., 99%); 3) Combined stopping criteria using both patience and threshold. 
Compare these against the baseline (no early stopping) by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 9, "Feasibility": 9, "Novelty": 8, "novel": true }, { "Name": "sequence_complexity_grokking", "Title": "Sequence Complexity Grokking: Assessing the impact of input sequence complexity on the grokking phenomenon", "Experiment": "Modify the `AbstractDataset` class to include sequences with varying complexities: 1) Simple single operations (baseline), 2) Nested operations (e.g., (a + b) - c), 3) Multi-step arithmetic problems (e.g., (a + b) * c). Implement these complexities in the `fetch_example` method by creating additional methods to generate and encode these complex sequences. Adjust the `run` function to handle different sequence complexities and compare these against the baseline by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 9, "Feasibility": 7, "Novelty": 9, "novel": true }, { "Name": "data_diversity_grokking", "Title": "Data Diversity Grokking: Assessing the impact of data diversity on the grokking phenomenon", "Experiment": "Modify the AbstractDataset class to include a new dataset that combines multiple types of mathematical operations (addition, subtraction, and division). Implement a new dataset class (MixedOperationsDataset) that generates examples from different operations in equal proportions. Adjust the get_data function to handle this new dataset. Ensure the encoding and decoding processes accommodate the combined vocab and output sizes. Compare the results against the baseline individual datasets by measuring final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "curriculum_learning_grokking", "Title": "Curriculum Learning Grokking: Assessing the impact of training sample order on the grokking phenomenon", "Experiment": "Modify the `GroupDataset` class to include different training sample orders: 1) Baseline (random order), 2) Curriculum learning (increasing difficulty), 3) Anti-curriculum learning (decreasing difficulty). Implement a method to rank the difficulty of samples based on the operation and operands. Adjust the `__iter__` and `__next__` methods in `GroupDataset` to present samples in the specified order. Compare these against the baseline by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "normalization_grokking", "Title": "Normalization Grokking: Assessing the impact of different normalization techniques on the grokking phenomenon", "Experiment": "Modify the `DecoderBlock` class to include different normalization techniques (LayerNorm, BatchNorm, GroupNorm, InstanceNorm). Implement a parameter in the `Transformer` class to select the normalization method during initialization. Specifically, add this parameter to the `__init__` method and pass it to the `DecoderBlock`. Implement a fallback to LayerNorm if an invalid normalization technique is provided. 
Compare these against the baseline (LayerNorm) by measuring the final training and validation accuracy, loss, the number of steps to reach 99% validation accuracy, and the speed of reaching certain accuracy thresholds. Evaluate the results for each dataset and seed combination.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "connectivity_grokking", "Title": "Layer Connectivity Grokking: Assessing the impact of transformer layer connectivity patterns on the grokking phenomenon", "Experiment": "Modify the `Transformer` class to include different layer connectivity patterns: 1) Sequential (baseline), 2) Skip connections, 3) Dense connections. Implement these patterns by adjusting the forward pass method to alter how layers are connected and integrated. Measure the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 10, "Feasibility": 7, "Novelty": 9, "novel": true }, { "Name": "update_frequency_grokking", "Title": "Update Frequency Grokking: Assessing the impact of update frequency on the grokking phenomenon", "Experiment": "Modify the `run` function to include different frequencies of model updates (e.g., after every batch, after every 2 batches, after every 5 batches). Specifically, adjust the training loop to accumulate gradients over multiple batches before performing an update. Measure the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "layerwise_lr_grokking", "Title": "Layer-wise Learning Rate Grokking: Assessing the impact of layer-wise learning rates on the grokking phenomenon", "Experiment": "Modify the `run` function to implement layer-wise learning rates. Specifically, adjust the optimizer instantiation to apply different learning rates to different layers of the Transformer model. Define three groups: 1) Embedding layers with a small learning rate (e.g., 1e-4), 2) Lower Transformer layers with a moderate learning rate (e.g., 1e-3), 3) Higher Transformer layers with a larger learning rate (e.g., 1e-2). Use PyTorch's parameter groups feature to assign these learning rates. Compare these against the baseline (uniform learning rate) by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "noise_robustness_grokking", "Title": "Noise Robustness Grokking: Assessing the impact of noise on the grokking phenomenon", "Experiment": "Modify the run function and other relevant parts of the code to include three types of noise: 1) Input Noise: Add Gaussian noise to the token embeddings during the forward pass (the raw inputs are discrete token indices, so noise must be applied after the embedding layer). 2) Label Noise: Randomly flip a fraction (e.g., 10%) of the labels in the fetch_train_example method by changing the label to a random class. 3) Parameter Noise: Add Gaussian noise to the model parameters during the forward pass in the Transformer class. Compare these against the baseline by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy.
Evaluate the results for each dataset and seed combination.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "math_properties_grokking", "Title": "Math Properties Grokking: Assessing the impact of fundamental mathematical properties on the grokking phenomenon", "Experiment": "Introduce new dataset classes to test specific mathematical properties: 1) Commutative operations (e.g., addition mod p), 2) Non-commutative operations (e.g., matrix multiplication mod p), 3) Associative operations (e.g., multiplication mod p), 4) Non-associative operations (e.g., a custom operation defined as (a * b) - c mod p). Modify the AbstractDataset and its subclasses to define these new operations. Implement these operations in the fetch_output method and run experiments to compare the model's performance on these datasets. Measure final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy.", "Interestingness": 10, "Feasibility": 8, "Novelty": 10, "novel": true }, { "Name": "attention_visualization_grokking", "Title": "Attention Visualization Grokking: Understanding the role of attention weights in the grokking phenomenon", "Experiment": "Modify the `DecoderBlock` class to store attention weights during the forward pass. Implement a logging mechanism in the `run` function to save these weights at specified intervals (e.g., every 100 steps). Create a visualization script to plot attention weights over epochs using techniques like heatmaps and attention maps overlaid on input sequences. Compare attention weight patterns against performance metrics to identify correlations. Evaluate the results for each dataset and seed combination.", "Interestingness": 10, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "input_representation_grokking", "Title": "Input Representation Grokking: Assessing the impact of different input representations on the grokking phenomenon", "Experiment": "Modify the AbstractDataset class to include a parameter for selecting input encoding types (binary, one-hot, positional). Implement encoding methods for binary encoding, one-hot encoding, and positional encoding in the encode method. Ensure the decode method can handle these representations. Adjust the Transformer class to accept and process these different input representations. Compare these against the baseline by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "loss_landscape_modifications_grokking", "Title": "Loss Landscape Modifications: Assessing the impact of loss landscape modifications on the grokking phenomenon", "Experiment": "Modify the train function to introduce various loss landscape modifications: 1) Gradient Norm Regularization: Add a term to the loss function that penalizes large gradients (L2 norm of gradients). 2) Smoothness Regularization: Add a term to the loss function that penalizes large differences in loss between consecutive steps. Implement these modifications as toggleable options in the train function. Adjust the run function to accept parameters for enabling these modifications. Compare these against the baseline by measuring the final training and validation accuracy, loss, the number of steps to reach 99% validation accuracy, and stability of training.
Evaluate the results for each dataset and seed combination.", "Interestingness": 10, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "adversarial_training_grokking", "Title": "Adversarial Training Grokking: Assessing the impact of adversarial training on the grokking phenomenon", "Experiment": "Modify the train function to include adversarial example generation using the Fast Gradient Sign Method (FGSM) applied in embedding space, since the raw inputs are discrete tokens. Specifically, perturb the input embeddings during each training step and include the perturbed examples in the training batch. Adjust the loss calculation to account for both original and adversarial examples. Compare the results against the baseline by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 10, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "data_sparsity_grokking", "Title": "Data Sparsity and Irrelevant Features Grokking: Assessing the impact of data sparsity and irrelevant features on the grokking phenomenon", "Experiment": "Modify the AbstractDataset class to include parameters for sparsity and irrelevant features. Implement methods to randomly mask a specified percentage of input tokens (e.g., 10%, 20%) and add random irrelevant tokens (e.g., random integers from the same range as the original operands). Adjust the fetch_example, fetch_train_example, and fetch_val_example methods to apply these modifications during data generation. Compare the results against the baseline by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "tokenization_grokking", "Title": "Tokenization Grokking: Assessing the impact of different tokenization strategies on the grokking phenomenon", "Experiment": "Modify the `AbstractDataset` class to include different tokenization strategies: 1) Subword units using byte-pair encoding (BPE), 2) Character-level tokenization. Implement these tokenization strategies in the `encode` and `decode` methods. Specifically, add functions for BPE and character-level tokenization and modify the `form_equation` method to generate appropriate sequences. Adjust the `run` function and `Transformer` class to handle these different tokenization methods, ensuring compatibility with the existing vocabulary structure. Measure the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination, and compare them with the baseline tokenization strategy.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "training_duration_grokking", "Title": "Training Duration Grokking: Assessing the impact of training duration on the grokking phenomenon", "Experiment": "Modify the `run` function to include various training durations (e.g., 5000, 10000, 15000, 20000 steps). Specifically, adjust the `num_total_updates` parameter to reflect these different durations. Measure and compare the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy for each duration. Evaluate how training duration influences the grokking phenomenon across different datasets and seed combinations.
Analyze and visualize the results to identify optimal training durations and the relationship between training duration and model generalization.", "Interestingness": 9, "Feasibility": 9, "Novelty": 9, "novel": true }, { "Name": "arithmetic_features_grokking", "Title": "Arithmetic Features Grokking: Assessing the impact of embedding arithmetic properties on the grokking phenomenon", "Experiment": "Extend the AbstractDataset class to include methods for calculating additional arithmetic properties of the operands, such as parity (odd/even), prime status, and binary representation. Modify the fetch_example, fetch_train_example, and fetch_val_example methods to include these properties as additional features in the input sequences. Adjust the Transformer model to accept these expanded input sequences. Compare the model's performance on datasets with and without these additional features by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy.", "Interestingness": 10, "Feasibility": 8, "Novelty": 10, "novel": true }, { "Name": "multi_task_grokking", "Title": "Multi-Task Learning Grokking: Assessing the impact of multi-task learning on the grokking phenomenon", "Experiment": "Modify the `AbstractDataset` class to create a new dataset (MultiTaskDataset) that combines multiple operations (addition, subtraction, division, permutations). Implement methods to fetch examples from different operations and label them to indicate the operation type. Adjust the `Transformer` model to include a task-specific embedding that indicates the operation type. Modify the `run` function and training loop to handle multi-task learning with a multi-task objective. Specifically, modify the `fetch_example` method to return examples from different tasks and adjust the model's forward pass to handle multiple outputs. Compare the results against single-task training by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy, tracking both overall and per-task performance.", "Interestingness": 10, "Feasibility": 8, "Novelty": 10, "novel": true }, { "Name": "sparsity_grokking", "Title": "Sparsity Grokking: Assessing the impact of model sparsity on the grokking phenomenon", "Experiment": "Modify the `train` function to implement sparsity-inducing techniques like L1 regularization, pruning, and variational dropout. Implement these techniques as toggleable options in the training loop. Adjust the `run` function to accept parameters for enabling these techniques and their respective hyperparameters. Evaluate each sparsity technique separately to isolate their effects. Compare these against the baseline by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Tune hyperparameters associated with sparsity techniques and report their values. Evaluate the results for each dataset and seed combination.", "Interestingness": 10, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "encoder_decoder_grokking", "Title": "Encoder-Decoder Grokking: Assessing the impact of encoder-decoder architecture on the grokking phenomenon", "Experiment": "Modify the `Transformer` class to include an encoder module. Implement an `EncoderBlock` class similar to the `DecoderBlock` class, focusing on self-attention and feed-forward networks. 
Update the `forward` method in the `Transformer` class to first pass the input sequence through the encoder and then through the decoder. Adjust the `run` function to handle the new architecture by ensuring the encoder and decoder are properly trained. Compare the results against the baseline (decoder-only model) by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 10, "Feasibility": 7, "Novelty": 10, "novel": true }, { "Name": "embedding_strategy_grokking", "Title": "Embedding Strategy Grokking: Assessing the impact of different embedding strategies on the grokking phenomenon", "Experiment": "Modify the `Transformer` class to include different embedding strategies: 1) Static Embeddings: Use randomly initialized embeddings that are fixed during training, 2) Dynamic Embeddings: Learn embeddings during training, 3) Hybrid Embeddings: Combine static (randomly initialized) and dynamic embeddings. Implement these strategies by adjusting the `__init__` and `forward` methods in the `Transformer` class. Compare these against the baseline by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "parameter_efficiency_grokking", "Title": "Parameter Efficiency Grokking: Assessing the impact of parameter-efficient techniques on the grokking phenomenon", "Experiment": "Modify the `Transformer` class to include specific parameter-efficient techniques: 1) Parameter Sharing: Share weights between different layers of the Transformer. 2) Low-Rank Factorization: Factorize the weight matrices into lower-rank approximations. 3) Pruning: Implement weight pruning to remove less important weights during training. Adjust the `__init__` and `forward` methods in the `Transformer` class to apply these techniques. Implement a toggle mechanism to enable or disable each technique independently. Compare these against the baseline by measuring the final training and validation accuracy, loss, the number of steps to reach 99% validation accuracy, and the total number of parameters. Evaluate the results for each dataset and seed combination. Specifically, run experiments with each technique individually and in combination to assess their impact on parameter efficiency and performance.", "Interestingness": 10, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "embedding_types_grokking", "Title": "Embedding Types Grokking: Assessing the impact of pretrained embeddings on the grokking phenomenon", "Experiment": "Modify the `Transformer` class to include options for using pretrained embeddings such as Word2Vec, GloVe, or FastText. Load these embeddings during the initialization of the `Transformer` class. Adjust the `run` function to handle these new embeddings and compare them against the baseline by measuring final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. 
Evaluate the results for each dataset and seed combination.", "Interestingness": 10, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "transfer_learning_grokking", "Title": "Transfer Learning Grokking: Assessing the impact of transfer learning on the grokking phenomenon", "Experiment": "Modify the `run` function to include a two-phase training procedure: pre-training on one type of arithmetic operation (e.g., ModSumDataset) and fine-tuning on a different type (e.g., ModSubtractDataset). Specifically, adjust the training loop to first train a model on a source dataset until it reaches a certain accuracy threshold or number of epochs. Save the pre-trained model and then fine-tune it on a target dataset. Compare these against models trained from scratch by measuring final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Additionally, evaluate the rate of convergence during fine-tuning. Repeat experiments for various dataset pairs and seed combinations.", "Interestingness": 10, "Feasibility": 8, "Novelty": 10, "novel": true }, { "Name": "depth_width_tradeoff_grokking", "Title": "Depth-Width Tradeoff Grokking: Assessing the impact of Transformer model depth and width on the grokking phenomenon", "Experiment": "Modify the `Transformer` class to accept configurable numbers of layers and dimensions. Create four configurations for experimentation: 1) Baseline (2 layers, 128 dimensions), 2) Shallow and Wide (2 layers, 256 dimensions), 3) Deep and Narrow (6 layers, 64 dimensions), 4) Balanced (4 layers, 128 dimensions). Adjust the `run` function to train and evaluate these configurations on all datasets and seeds. Measure final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate and compare these metrics for each configuration and dataset.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "interpretability_grokking", "Title": "Interpretability Grokking: Assessing the impact of interpretability methods on the grokking phenomenon", "Experiment": "Modify the `run` function to include periodic extraction of interpretability metrics using Integrated Gradients initially. Integrate this method into the training loop to analyze model decisions at various points during training. Implement a logging mechanism to save interpretability metrics at specified intervals (e.g., every 100 steps). Create a basic visualization script to plot these metrics over epochs and correlate them with performance metrics like accuracy and loss. Compare the interpretability patterns against performance metrics to identify correlations and potential early indicators of generalization. Evaluate the results for each dataset and seed combination.", "Interestingness": 10, "Feasibility": 8, "Novelty": 10, "novel": true }, { "Name": "memory_mechanisms_grokking", "Title": "Memory Mechanisms Grokking: Assessing the impact of memory-augmented models on the grokking phenomenon", "Experiment": "Modify the `Transformer` class to integrate GRU and LSTM units by adding them as additional layers after the existing DecoderBlock or replacing the feed-forward network within the DecoderBlock. Adjust the `__init__` and `forward` methods to incorporate these units. 
Compare the performance against the baseline transformer by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy across different datasets.", "Interestingness": 10, "Feasibility": 8, "Novelty": 10, "novel": true }, { "Name": "optimization_strategies_grokking", "Title": "Optimization Strategies Grokking: Assessing the impact of modern optimization techniques on the grokking phenomenon", "Experiment": "Modify the `train` function to include three optimization techniques: 1) Gradient Accumulation: Accumulate gradients over multiple batches before performing an update by adjusting the optimizer step frequency (e.g., after every 2 or 5 batches). 2) Mixed Precision Training: Integrate PyTorch's AMP (Automatic Mixed Precision) to use both float16 and float32 during training, improving computation speed and reducing memory usage. 3) Adaptive Gradient Clipping: Implement adaptive gradient clipping to dynamically adjust the clipping threshold based on gradient norms. Adjust the `run` function to toggle these techniques independently and in combination. Include a fallback to the baseline training loop if any technique degrades performance. Compare these against the baseline by measuring final training and validation accuracy, loss, the number of steps to reach 99% validation accuracy, and training time. Evaluate the results for each dataset and seed combination.", "Interestingness": 10, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "parameter_importance_grokking", "Title": "Parameter Importance Grokking: Assessing the impact of parameter freezing and selective fine-tuning on the grokking phenomenon", "Experiment": "Modify the Transformer class to include methods for freezing and unfreezing layers. Adjust the train function to implement dynamic layer freezing and selective fine-tuning based on predefined conditions such as reaching specific accuracy thresholds or after a certain number of training steps. Compare the performance metrics against the baseline by measuring final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 10, "Feasibility": 7, "Novelty": 9, "novel": true }, { "Name": "incremental_learning_grokking", "Title": "Incremental Learning Grokking: Examining the impact of incremental task exposure on the grokking phenomenon", "Experiment": "Modify the `run` function to implement incremental learning. Train the model on one operation first (e.g., ModSumDataset) and introduce subsequent operations (e.g., ModSubtractDataset, ModDivisonDataset) incrementally. Track and log model performance metrics (accuracy, loss) at each task transition. Save intermediate model states for analysis. Compare results against a baseline where all tasks are learned simultaneously by measuring final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy.", "Interestingness": 10, "Feasibility": 7, "Novelty": 10, "novel": true }, { "Name": "zero_shot_grokking", "Title": "Zero-Shot Grokking: Assessing the impact of zero-shot learning on the grokking phenomenon", "Experiment": "Modify the `run` function to train the model on three out of the four provided datasets and evaluate its performance on the unseen fourth dataset. Specifically, adjust the training loop to exclude one dataset from training and use it exclusively for evaluation. 
Measure the model's performance on the unseen dataset by tracking its validation accuracy and loss throughout training on the seen datasets, and recording the number of steps, if any, at which the unseen dataset reaches 99% validation accuracy. Compare these metrics against the baseline performance on seen datasets to assess the model's ability to generalize to new tasks.", "Interestingness": 10, "Feasibility": 8, "Novelty": 10, "novel": true }, { "Name": "task_specificity_generalization_grokking", "Title": "Task Specificity and Generalization Grokking: Assessing the impact of specialized versus multi-task models on the grokking phenomenon", "Experiment": "Create a new dataset class (MultiTaskDataset) that combines multiple operations (addition, subtraction, division). Include an operation token in the input sequence to indicate the type of operation. Modify the Transformer model to include an embedding for the operation type. Adjust the run function and training loop to handle multi-task learning. Compare the results against single-task training by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy, tracking both overall and per-task performance.", "Interestingness": 10, "Feasibility": 8, "Novelty": 10, "novel": true }, { "Name": "cross_validation_grokking", "Title": "Cross-Validation Grokking: Assessing the impact of cross-validation on the grokking phenomenon", "Experiment": "Modify the `run` function to implement k-fold cross-validation (e.g., k=5). Adjust the data loading and splitting to create k subsets. Train the model k times, each with a different subset as the validation set and the remaining subsets as the training set. Aggregate the performance metrics (accuracy, loss, steps to reach 99% validation accuracy) by reporting the mean and standard deviation across all k folds. Compare these results against the baseline single train/validation split for all datasets.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "numerical_encoding_grokking", "Title": "Numerical Encoding Grokking: Assessing the impact of different numerical encoding schemes on the grokking phenomenon", "Experiment": "Modify the AbstractDataset class to include different numerical encoding schemes: 1) Binary encoding, 2) Positional encoding, 3) Prime factor encoding. Implement these encoding methods in the encode method. Ensure the decode method can handle these representations. Adjust the Transformer class to accept and process these different encoding schemes. Compare these against the baseline (integer encoding) by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy. Evaluate the results for each dataset and seed combination.", "Interestingness": 10, "Feasibility": 8, "Novelty": 10, "novel": true }, { "Name": "commutativity_integration_grokking", "Title": "Commutativity Integration Grokking: Assessing the impact of embedding commutativity on the grokking phenomenon", "Experiment": "Modify the dataset generation process to include both `a + b` and `b + a` for commutative operations. Specifically, adjust the `fetch_example` method in the `AbstractDataset` class to generate both permutations for each example. Compare the results against the baseline by measuring the final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy.
Evaluate the results for each dataset and seed combination.", "Interestingness": 9, "Feasibility": 8, "Novelty": 9, "novel": true }, { "Name": "multi_modal_grokking", "Title": "Multi-Modal Grokking: Assessing the impact of multi-modal input representations on the grokking phenomenon", "Experiment": "Modify the AbstractDataset class to include an additional modality, such as a textual description of the operation (e.g., 'The result of x plus y'). Adjust the Transformer model to handle multi-modal inputs by creating separate embedding layers for numerical and textual inputs and integrating them using concatenation. Measure performance against the baseline by evaluating final training and validation accuracy, loss, and the number of steps to reach 99% validation accuracy.", "Interestingness": 10, "Feasibility": 8, "Novelty": 10, "novel": true } ]