\begin{thebibliography}{11}
\providecommand{\natexlab}[1]{#1}
\providecommand{\url}[1]{\texttt{#1}}
\expandafter\ifx\csname urlstyle\endcsname\relax
  \providecommand{\doi}[1]{doi: #1}\else
  \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi

\bibitem[Ba et~al.(2016)Ba, Kiros, and Hinton]{ba2016layer}
Jimmy~Lei Ba, Jamie~Ryan Kiros, and Geoffrey~E. Hinton.
\newblock Layer normalization.
\newblock \emph{arXiv preprint arXiv:1607.06450}, 2016.

\bibitem[Bahamou \& Goldfarb(2023)Bahamou and Goldfarb]{Bahamou2023LayerwiseAS}
Achraf Bahamou and Donald Goldfarb.
\newblock Layer-wise adaptive step-sizes for stochastic first-order methods for deep learning.
\newblock \emph{arXiv preprint arXiv:2305.13664}, 2023.

\bibitem[Goodfellow et~al.(2016)Goodfellow, Bengio, and Courville]{goodfellow2016deep}
Ian Goodfellow, Yoshua Bengio, and Aaron Courville.
\newblock \emph{Deep Learning}.
\newblock MIT Press, 2016.

\bibitem[Hu et~al.(2021)Hu, Shen, Wallis, Allen-Zhu, Li, Wang, and Chen]{Hu2021LoRALA}
Edward~J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, and Weizhu Chen.
\newblock {LoRA}: Low-rank adaptation of large language models.
\newblock \emph{arXiv preprint arXiv:2106.09685}, 2021.

\bibitem[Kingma \& Ba(2014)Kingma and Ba]{kingma2014adam}
Diederik~P. Kingma and Jimmy Ba.
\newblock Adam: A method for stochastic optimization.
\newblock \emph{arXiv preprint arXiv:1412.6980}, 2014.

\bibitem[Ko et~al.(2022)Ko, Lee, and Kim]{Ko2022NotAL}
Yunyong Ko, Dongwon Lee, and Sang-Wook Kim.
\newblock Not all layers are equal: A layer-wise adaptive approach toward large-scale {DNN} training.
\newblock In \emph{Proceedings of the ACM Web Conference 2022}, 2022.

\bibitem[Loshchilov \& Hutter(2017)Loshchilov and Hutter]{loshchilov2017adamw}
Ilya Loshchilov and Frank Hutter.
\newblock Decoupled weight decay regularization.
\newblock \emph{arXiv preprint arXiv:1711.05101}, 2017.

\bibitem[Paszke et~al.(2019)Paszke, Gross, Massa, Lerer, Bradbury, Chanan, Killeen, Lin, Gimelshein, Antiga, et~al.]{paszke2019pytorch}
Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et~al.
\newblock {PyTorch}: An imperative style, high-performance deep learning library.
\newblock \emph{Advances in Neural Information Processing Systems}, 32, 2019.

\bibitem[Power et~al.(2022)Power, Burda, Edwards, Babuschkin, and Misra]{power2022grokking}
Alethea Power, Yuri Burda, Harri Edwards, Igor Babuschkin, and Vedant Misra.
\newblock Grokking: Generalization beyond overfitting on small algorithmic datasets.
\newblock \emph{arXiv preprint arXiv:2201.02177}, 2022.

\bibitem[Shea \& Schmidt(2024)Shea and Schmidt]{Shea2024WhyLS}
Betty Shea and Mark Schmidt.
\newblock Why line search when you can plane search? {SO}-friendly neural networks allow per-iteration optimization of learning and momentum rates for every layer.
\newblock \emph{arXiv preprint arXiv:2406.17954}, 2024.

\bibitem[Vaswani et~al.(2017)Vaswani, Shazeer, Parmar, Uszkoreit, Jones, Gomez, Kaiser, and Polosukhin]{vaswani2017attention}
Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan~N. Gomez, {\L}ukasz Kaiser, and Illia Polosukhin.
\newblock Attention is all you need.
\newblock \emph{Advances in Neural Information Processing Systems}, 30, 2017.

\end{thebibliography}