\begin{thebibliography}{9}
\providecommand{\natexlab}[1]{#1}
\providecommand{\url}[1]{\texttt{#1}}
\expandafter\ifx\csname urlstyle\endcsname\relax
  \providecommand{\doi}[1]{doi: #1}\else
  \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi

\bibitem[Bahdanau et~al.(2014)Bahdanau, Cho, and Bengio]{bahdanau2014neural}
Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio.
\newblock Neural machine translation by jointly learning to align and translate.
\newblock \emph{arXiv preprint arXiv:1409.0473}, 2014.

\bibitem[Goodfellow et~al.(2016)Goodfellow, Bengio, and Courville]{goodfellow2016deep}
Ian Goodfellow, Yoshua Bengio, and Aaron Courville.
\newblock \emph{Deep Learning}, volume~1.
\newblock MIT Press, 2016.

\bibitem[Kingma \& Ba(2014)Kingma and Ba]{kingma2014adam}
Diederik~P Kingma and Jimmy Ba.
\newblock Adam: A method for stochastic optimization.
\newblock \emph{arXiv preprint arXiv:1412.6980}, 2014.

\bibitem[Loshchilov \& Hutter(2017)Loshchilov and Hutter]{loshchilov2017adamw}
Ilya Loshchilov and Frank Hutter.
\newblock Decoupled weight decay regularization.
\newblock \emph{arXiv preprint arXiv:1711.05101}, 2017.

\bibitem[Paszke et~al.(2019)Paszke, Gross, Massa, Lerer, Bradbury, Chanan, Killeen, Lin, Gimelshein, Antiga, et~al.]{paszke2019pytorch}
Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et~al.
\newblock PyTorch: An imperative style, high-performance deep learning library.
\newblock \emph{Advances in Neural Information Processing Systems}, 32, 2019.

\bibitem[Power et~al.(2022)Power, Burda, Edwards, Babuschkin, and Misra]{power2022grokking}
Alethea Power, Yuri Burda, Harri Edwards, Igor Babuschkin, and Vedant Misra.
\newblock Grokking: Generalization beyond overfitting on small algorithmic datasets.
\newblock \emph{arXiv preprint arXiv:2201.02177}, 2022.

\bibitem[Radford et~al.(2019)Radford, Wu, Child, Luan, Amodei, and Sutskever]{radford2019language}
Alec Radford, Jeff Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever.
\newblock Language models are unsupervised multitask learners.
\newblock \emph{OpenAI Blog}, 1\penalty0 (8):\penalty0 9, 2019.

\bibitem[Vaswani et~al.(2017)Vaswani, Shazeer, Parmar, Uszkoreit, Jones, Gomez, Kaiser, and Polosukhin]{vaswani2017attention}
Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan~N Gomez, {\L}ukasz Kaiser, and Illia Polosukhin.
\newblock Attention is all you need.
\newblock \emph{Advances in Neural Information Processing Systems}, 30, 2017.

\end{thebibliography}