pradachan's picture
Upload folder using huggingface_hub
f71c233 verified
%% LaTeX2e file `references.bib'
%% generated by the `filecontents' environment
%% from source `template' on 2024/08/08.
%%
% Goodfellow, Bengio and Courville (2016): the "Deep Learning" textbook.
% Each author is listed exactly once (a repeated name would be printed
% twice in the rendered reference).
% NOTE(review): volume = 1 looks like a citation-export artifact -- confirm
% against the publisher record; drop the field if the book is single-volume.
@book{goodfellow2016deep,
  author    = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
  title     = {Deep Learning},
  volume    = {1},
  publisher = {MIT Press},
  year      = {2016},
}
% Power et al. (2022): the "Grokking" paper, cited as an arXiv preprint.
@article{power2022grokking,
  author  = {Power, Alethea and Burda, Yuri and Edwards, Harri and Babuschkin, Igor and Misra, Vedant},
  title   = {Grokking: Generalization beyond overfitting on small algorithmic datasets},
  journal = {arXiv preprint arXiv:2201.02177},
  year    = {2022},
}
% Vaswani et al. (2017): the Transformer paper.
% Typed as a proceedings paper: the venue is a conference proceedings
% series, so it belongs in booktitle rather than journal.
@inproceedings{vaswani2017attention,
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title     = {Attention is all you need},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {30},
  year      = {2017},
}
% Kingma and Ba (2014): the Adam optimizer, cited as an arXiv preprint.
@article{kingma2014adam,
  author  = {Kingma, Diederik P and Ba, Jimmy},
  title   = {Adam: A method for stochastic optimization},
  journal = {arXiv preprint arXiv:1412.6980},
  year    = {2014},
}
% Ba, Kiros and Hinton (2016): layer normalization, cited as an arXiv preprint.
@article{ba2016layer,
  author  = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
  title   = {Layer normalization},
  journal = {arXiv preprint arXiv:1607.06450},
  year    = {2016},
}
% Loshchilov and Hutter (2017): decoupled weight decay, cited as an arXiv
% preprint.
@article{loshchilov2017adamw,
  author  = {Loshchilov, Ilya and Hutter, Frank},
  title   = {Decoupled weight decay regularization},
  journal = {arXiv preprint arXiv:1711.05101},
  year    = {2017},
}
% Radford et al. (2019): "Language Models are Unsupervised Multitask Learners".
% Typed as a technical report because no journal is recorded; an article
% entry without a journal field makes BibTeX emit a missing-field warning.
% NOTE(review): institution is taken to be OpenAI -- confirm against the
% report itself.
@techreport{radford2019language,
  author      = {Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
  title       = {Language Models are Unsupervised Multitask Learners},
  institution = {OpenAI},
  year        = {2019},
}
% Bahdanau, Cho and Bengio (2014): attention-based neural machine
% translation, cited as an arXiv preprint.
@article{bahdanau2014neural,
  author  = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
  title   = {Neural machine translation by jointly learning to align and translate},
  journal = {arXiv preprint arXiv:1409.0473},
  year    = {2014},
}
% Paszke et al. (2019): the PyTorch library paper.
% Typed as a proceedings paper with the venue in booktitle. The library
% name is brace-protected so sentence-casing styles keep its capitalisation.
% "and others" is the BibTeX token that styles render as "et al.".
@inproceedings{paszke2019pytorch,
  author    = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and others},
  title     = {{PyTorch}: An imperative style, high-performance deep learning library},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {32},
  year      = {2019},
}
% Glorot and Bengio (2010): weight-initialisation analysis for deep
% feedforward networks.
% Typed as a proceedings paper so the booktitle (conference name) is
% actually printed; article entries require a journal and ignore booktitle.
% The page range uses the double hyphen that BibTeX styles expect.
@inproceedings{Glorot2010UnderstandingTD,
  author    = {Glorot, Xavier and Bengio, Yoshua},
  title     = {Understanding the difficulty of training deep feedforward neural networks},
  booktitle = {International Conference on Artificial Intelligence and Statistics},
  pages     = {249--256},
  year      = {2010},
}
% He et al. (2015): "Delving Deep into Rectifiers".
% Typed as a proceedings paper with a single venue field; the conference
% name is kept once, in booktitle. ImageNet is brace-protected so
% sentence-casing styles keep its capitalisation.
% NOTE(review): the second author's given name was expanded from the
% initial "X." -- confirm against the paper.
@inproceedings{He2015DelvingDI,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Delving Deep into Rectifiers: Surpassing Human-Level Performance on {ImageNet} Classification},
  booktitle = {{IEEE} International Conference on Computer Vision ({ICCV})},
  pages     = {1026--1034},
  year      = {2015},
}
% Saxe, McClelland and Ganguli (2013): exact learning dynamics in deep
% linear networks, cited as an arXiv preprint (identifier 1312.6120) in the
% same form as the other arXiv entries in this file.
% NOTE(review): the source record also listed the International Conference
% on Learning Representations as a venue; switch to a proceedings entry if
% the conference version should be cited instead.
% NOTE(review): the third author's given name was expanded from the
% initial "S." -- confirm against the paper.
@article{Saxe2013ExactST,
  author  = {Saxe, Andrew M. and McClelland, James L. and Ganguli, Surya},
  title   = {Exact solutions to the nonlinear dynamics of learning in deep linear neural networks},
  journal = {arXiv preprint arXiv:1312.6120},
  year    = {2013},
}