File size: 2,655 Bytes
f71c233 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
%% LaTeX2e file `references.bib'
%% generated by the `filecontents' environment
%% from source `template' on 2024/08/10.
%%
@book{goodfellow2016deep,
title={Deep learning},
author={Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron and Bengio, Yoshua},
volume={1},
year={2016},
publisher={MIT Press}
}
@article{power2022grokking,
title={Grokking: Generalization beyond overfitting on small algorithmic datasets},
author={Power, Alethea and Burda, Yuri and Edwards, Harri and Babuschkin, Igor and Misra, Vedant},
journal={arXiv preprint arXiv:2201.02177},
year={2022}
}
@article{vaswani2017attention,
title={Attention is all you need},
author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
journal={Advances in neural information processing systems},
volume={30},
year={2017}
}
@article{kingma2014adam,
title={Adam: A method for stochastic optimization},
author={Kingma, Diederik P and Ba, Jimmy},
journal={arXiv preprint arXiv:1412.6980},
year={2014}
}
@article{ba2016layer,
title={Layer normalization},
author={Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
journal={arXiv preprint arXiv:1607.06450},
year={2016}
}
@article{loshchilov2017adamw,
title={Decoupled weight decay regularization},
author={Loshchilov, Ilya and Hutter, Frank},
journal={arXiv preprint arXiv:1711.05101},
year={2017}
}
@article{radford2019language,
title={Language Models are Unsupervised Multitask Learners},
author={Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
year={2019}
}
@article{bahdanau2014neural,
title={Neural machine translation by jointly learning to align and translate},
author={Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
journal={arXiv preprint arXiv:1409.0473},
year={2014}
}
@article{paszke2019pytorch,
title={Pytorch: An imperative style, high-performance deep learning library},
author={Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and others},
journal={Advances in neural information processing systems},
volume={32},
year={2019}
}
@Article{Power2022GrokkingGB,
author = {Alethea Power and Yuri Burda and Harrison Edwards and Igor Babuschkin and Vedant Misra},
booktitle = {arXiv.org},
journal = {ArXiv},
title = {Grokking: Generalization Beyond Overfitting on Small Algorithmic Datasets},
volume = {abs/2201.02177},
year = {2022}
}
|