|
%% LaTeX2e file `references.bib'
%% generated by the `filecontents' environment
%% from source `template' on 2024/08/10.
%%
|
% Goodfellow, Bengio and Courville: "Deep Learning" textbook (MIT Press, 2016).
% The author list previously repeated "Bengio, Yoshua"; each author now appears once.
% NOTE(review): volume = {1} looks like an export artifact for a single-volume
% book -- confirm before removing.
@book{goodfellow2016deep,
  author    = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
  title     = {Deep learning},
  volume    = {1},
  publisher = {MIT Press},
  year      = {2016},
}
|
|
|
% Power et al. (2022), "Grokking", arXiv preprint 2201.02177.
% NOTE(review): the same work also appears in this file under the key
% `Power2022GrokkingGB`; cite only one of the two keys per document.
@article{power2022grokking,
  author  = {Power, Alethea and Burda, Yuri and Edwards, Harri and Babuschkin, Igor and Misra, Vedant},
  title   = {Grokking: Generalization beyond overfitting on small algorithmic datasets},
  journal = {arXiv preprint arXiv:2201.02177},
  year    = {2022},
}
|
|
|
% Vaswani et al. (2017), the Transformer paper.
% Published in conference proceedings (NeurIPS vol. 30), so it is typed as
% @inproceedings with the proceedings title in `booktitle` rather than as a
% journal article. The middle initial "N." now carries its period.
@inproceedings{vaswani2017attention,
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title     = {Attention is all you need},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {30},
  year      = {2017},
}
|
|
|
% Kingma and Ba (2014), the Adam optimizer, arXiv preprint 1412.6980.
% The middle initial "P." now carries its period (full-name styles would
% otherwise print "Diederik P Kingma"); {Adam} is brace-protected so styles
% that recase titles keep the algorithm name capitalised.
@article{kingma2014adam,
  author  = {Kingma, Diederik P. and Ba, Jimmy},
  title   = {{Adam}: A method for stochastic optimization},
  journal = {arXiv preprint arXiv:1412.6980},
  year    = {2014},
}
|
|
|
% Ba, Kiros and Hinton (2016), layer normalization, arXiv preprint 1607.06450.
% The middle initial "E." now carries its period (full-name styles would
% otherwise print "Geoffrey E Hinton").
@article{ba2016layer,
  author  = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E.},
  title   = {Layer normalization},
  journal = {arXiv preprint arXiv:1607.06450},
  year    = {2016},
}
|
|
|
% Loshchilov and Hutter (2017), decoupled weight decay, arXiv preprint 1711.05101.
@article{loshchilov2017adamw,
  author  = {Loshchilov, Ilya and Hutter, Frank},
  title   = {Decoupled weight decay regularization},
  journal = {arXiv preprint arXiv:1711.05101},
  year    = {2017},
}
|
|
|
% Radford et al. (2019), the GPT-2 report.
% The entry was typed @article but had no `journal` (a required field), which
% makes BibTeX emit an "empty journal" warning. It is a company technical
% report, so it is typed @techreport with the issuing institution.
% NOTE(review): institution assumed to be OpenAI -- confirm against the report.
@techreport{radford2019language,
  author      = {Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
  title       = {Language Models are Unsupervised Multitask Learners},
  institution = {OpenAI},
  year        = {2019},
}
|
|
|
% Bahdanau, Cho and Bengio (2014), attention for NMT, arXiv preprint 1409.0473.
@article{bahdanau2014neural,
  author  = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
  title   = {Neural machine translation by jointly learning to align and translate},
  journal = {arXiv preprint arXiv:1409.0473},
  year    = {2014},
}
|
|
|
% Paszke et al. (2019), the PyTorch library paper.
% The library name is spelled "PyTorch" and brace-protected so styles that
% recase titles cannot turn it into "Pytorch"/"pytorch". Published in
% conference proceedings (NeurIPS vol. 32), so it is typed as @inproceedings
% with the proceedings title in `booktitle`. The author list is truncated with
% the literal `and others`, which the style renders as "et al.".
@inproceedings{paszke2019pytorch,
  author    = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and others},
  title     = {{PyTorch}: An imperative style, high-performance deep learning library},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {32},
  year      = {2019},
}
|
|
|
% Power et al. (2022), "Grokking", arXiv preprint 2201.02177.
% NOTE(review): this is the same work as the entry keyed `power2022grokking`;
% the key is retained so existing \cite commands keep resolving, but only one
% of the two keys should be cited per document.
% Export artifacts were removed: `booktitle` is not an @article field, and the
% arXiv identifier had been stored as `volume = {abs/2201.02177}`. Author names
% use "Last, First" form and match the spelling in `power2022grokking`.
@article{Power2022GrokkingGB,
  author  = {Power, Alethea and Burda, Yuri and Edwards, Harri and Babuschkin, Igor and Misra, Vedant},
  title   = {Grokking: Generalization Beyond Overfitting on Small Algorithmic Datasets},
  journal = {arXiv preprint arXiv:2201.02177},
  year    = {2022},
}
|
|
|
|