%% LaTeX2e file `references.bib'
%% generated by the `filecontents' environment
%% from source `template' on 2024/08/09.
%%
%% Conventions used in this file:
%%   - one field per line; order: author, title, venue fields, year
%%   - names in "Last, First" form, separated by " and "
%%   - words whose capitalisation must survive style recasing are braced
%%   - citation keys are part of the public interface: do not rename them

@book{goodfellow2016deep,
  author    = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
  title     = {Deep Learning},
  publisher = {MIT Press},
  year      = {2016},
}

@article{power2022grokking,
  author  = {Power, Alethea and Burda, Yuri and Edwards, Harri and Babuschkin, Igor and Misra, Vedant},
  title   = {Grokking: Generalization beyond overfitting on small algorithmic datasets},
  journal = {arXiv preprint arXiv:2201.02177},
  year    = {2022},
}

@inproceedings{vaswani2017attention,
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title     = {Attention is all you need},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {30},
  year      = {2017},
}

@article{kingma2014adam,
  author  = {Kingma, Diederik P. and Ba, Jimmy},
  title   = {Adam: A method for stochastic optimization},
  journal = {arXiv preprint arXiv:1412.6980},
  year    = {2014},
}

@article{ba2016layer,
  author  = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E.},
  title   = {Layer normalization},
  journal = {arXiv preprint arXiv:1607.06450},
  year    = {2016},
}

@article{loshchilov2017adamw,
  author  = {Loshchilov, Ilya and Hutter, Frank},
  title   = {Decoupled weight decay regularization},
  journal = {arXiv preprint arXiv:1711.05101},
  year    = {2017},
}

%% Technical report without a journal; typed as a report so that no
%% required field is missing.
@techreport{radford2019language,
  author      = {Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
  title       = {Language Models are Unsupervised Multitask Learners},
  institution = {OpenAI},
  year        = {2019},
}

@article{bahdanau2014neural,
  author  = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
  title   = {Neural machine translation by jointly learning to align and translate},
  journal = {arXiv preprint arXiv:1409.0473},
  year    = {2014},
}

@inproceedings{paszke2019pytorch,
  author    = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and others},
  title     = {{PyTorch}: An imperative style, high-performance deep learning library},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {32},
  year      = {2019},
}

%% NOTE(review): the entry below describes the same work as
%% power2022grokking. The key is kept only so that existing citations
%% keep resolving; cite just one of the two keys in any given document,
%% otherwise the paper is listed twice in the bibliography.
@article{Power2022GrokkingGB,
  author  = {Power, Alethea and Burda, Yuri and Edwards, Harri and Babuschkin, Igor and Misra, Vedant},
  title   = {Grokking: Generalization beyond overfitting on small algorithmic datasets},
  journal = {arXiv preprint arXiv:2201.02177},
  year    = {2022},
}