|
\begin{thebibliography}{9} |
|
\providecommand{\natexlab}[1]{#1}
\providecommand{\url}[1]{\texttt{#1}}
\expandafter\ifx\csname urlstyle\endcsname\relax
  \providecommand{\doi}[1]{doi: #1}\else
  \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi
|
|
|
\bibitem[Bahdanau et~al.(2014)Bahdanau, Cho, and Bengio]{bahdanau2014neural}
Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio.
\newblock Neural machine translation by jointly learning to align and
  translate.
\newblock \emph{arXiv preprint arXiv:1409.0473}, 2014.
|
|
|
\bibitem[Goodfellow et~al.(2016)Goodfellow, Bengio, and
  Courville]{goodfellow2016deep}
Ian Goodfellow, Yoshua Bengio, and Aaron Courville.
\newblock \emph{Deep learning}.
\newblock MIT Press, 2016.
|
|
|
\bibitem[Kingma \& Ba(2014)Kingma and Ba]{kingma2014adam}
Diederik~P Kingma and Jimmy Ba.
\newblock Adam: A method for stochastic optimization.
\newblock \emph{arXiv preprint arXiv:1412.6980}, 2014.
|
|
|
\bibitem[Loshchilov \& Hutter(2017)Loshchilov and Hutter]{loshchilov2017adamw}
Ilya Loshchilov and Frank Hutter.
\newblock Decoupled weight decay regularization.
\newblock \emph{arXiv preprint arXiv:1711.05101}, 2017.
|
|
|
\bibitem[Paszke et~al.(2019)Paszke, Gross, Massa, Lerer, Bradbury, Chanan,
  Killeen, Lin, Gimelshein, Antiga, et~al.]{paszke2019pytorch}
Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory
  Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et~al.
\newblock PyTorch: An imperative style, high-performance deep learning
  library.
\newblock \emph{Advances in neural information processing systems}, 32, 2019.
|
|
|
\bibitem[Power et~al.(2022)Power, Burda, Edwards, Babuschkin, and
  Misra]{power2022grokking}
Alethea Power, Yuri Burda, Harri Edwards, Igor Babuschkin, and Vedant Misra.
\newblock Grokking: Generalization beyond overfitting on small algorithmic
  datasets.
\newblock \emph{arXiv preprint arXiv:2201.02177}, 2022.
|
|
|
|
|
|
\bibitem[Radford et~al.(2019)Radford, Wu, Child, Luan, Amodei, and
  Sutskever]{radford2019language}
Alec Radford, Jeff Wu, Rewon Child, David Luan, Dario Amodei, and Ilya
  Sutskever.
\newblock Language models are unsupervised multitask learners.
\newblock \emph{OpenAI blog}, 1(8):9, 2019.
|
|
|
\bibitem[Vaswani et~al.(2017)Vaswani, Shazeer, Parmar, Uszkoreit, Jones, Gomez,
  Kaiser, and Polosukhin]{vaswani2017attention}
Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
  Aidan~N Gomez, {\L}ukasz Kaiser, and Illia Polosukhin.
\newblock Attention is all you need.
\newblock \emph{Advances in neural information processing systems}, 30, 2017.
|
|
|
\end{thebibliography} |
|
|