pradachan
/

AI-Scientist

Model card Files Files and versions Community

AI-Scientist / example_papers /weight_initialization_grokking /latex /references.bib

pradachan's picture

Upload folder using huggingface_hub

f71c233 verified 19 days ago

history blame contribute delete

3.34 kB

	%% LaTeX2e file `references.bib'
	%% generated by the `filecontents' environment
	%% from source `template' on 2024/08/08.
	%%
	%% Deep learning textbook by Goodfellow, Bengio and Courville.
	%% Each author appears exactly once in the author list; a repeated name
	%% would be printed twice in the bibliography.
	@book{goodfellow2016deep,
	  author    = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
	  title     = {Deep Learning},
	  publisher = {MIT Press},
	  volume    = {1},
	  year      = {2016}
	}

	%% Grokking paper, cited as an arXiv preprint (identifier 2201.02177).
	@article{power2022grokking,
	  author  = {Power, Alethea and Burda, Yuri and Edwards, Harri and Babuschkin, Igor and Misra, Vedant},
	  title   = {Grokking: Generalization beyond overfitting on small algorithmic datasets},
	  journal = {arXiv preprint arXiv:2201.02177},
	  year    = {2022}
	}

	%% Transformer paper. The venue is a conference proceedings series, so it is
	%% recorded as the booktitle of an inproceedings entry rather than as a journal.
	%% The accented initial in Kaiser's given name is written as a braced control
	%% sequence so BibTeX treats it as a single letter.
	@inproceedings{vaswani2017attention,
	  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
	  title     = {Attention is all you need},
	  booktitle = {Advances in Neural Information Processing Systems},
	  volume    = {30},
	  year      = {2017}
	}

	%% Adam optimizer paper, cited as an arXiv preprint (identifier 1412.6980).
	@article{kingma2014adam,
	  author  = {Kingma, Diederik P and Ba, Jimmy},
	  title   = {Adam: A method for stochastic optimization},
	  journal = {arXiv preprint arXiv:1412.6980},
	  year    = {2014}
	}

	%% Layer normalization paper, cited as an arXiv preprint (identifier 1607.06450).
	@article{ba2016layer,
	  author  = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
	  title   = {Layer normalization},
	  journal = {arXiv preprint arXiv:1607.06450},
	  year    = {2016}
	}

	%% Decoupled weight decay paper, cited as an arXiv preprint
	%% (identifier 1711.05101).
	@article{loshchilov2017adamw,
	  author  = {Loshchilov, Ilya and Hutter, Frank},
	  title   = {Decoupled weight decay regularization},
	  journal = {arXiv preprint arXiv:1711.05101},
	  year    = {2017}
	}

	%% Radford et al. (2019). This record has no journal, which an article entry
	%% requires, so it is typed as a technical report with the issuing
	%% organisation in the institution field.
	%% NOTE(review): institution is taken to be OpenAI - confirm against the
	%% report itself.
	@techreport{radford2019language,
	  author      = {Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
	  title       = {Language Models are Unsupervised Multitask Learners},
	  institution = {OpenAI},
	  year        = {2019}
	}

	%% Attention-based neural machine translation paper, cited as an arXiv
	%% preprint (identifier 1409.0473).
	@article{bahdanau2014neural,
	  author  = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
	  title   = {Neural machine translation by jointly learning to align and translate},
	  journal = {arXiv preprint arXiv:1409.0473},
	  year    = {2014}
	}

	%% PyTorch library paper. The venue is a conference proceedings series, so it
	%% is recorded as the booktitle of an inproceedings entry. The library name is
	%% brace-protected so styles that sentence-case titles keep its capitalisation.
	%% The trailing "and others" is the BibTeX token for a truncated author list.
	@inproceedings{paszke2019pytorch,
	  author    = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and others},
	  title     = {{PyTorch}: An imperative style, high-performance deep learning library},
	  booktitle = {Advances in Neural Information Processing Systems},
	  volume    = {32},
	  year      = {2019}
	}

	%% Glorot and Bengio (2010), a conference paper. Typed as inproceedings so the
	%% booktitle (the only venue field present) is actually printed; an article
	%% entry would ignore it and warn about the missing journal.
	%% Page ranges use a double hyphen.
	@inproceedings{Glorot2010UnderstandingTD,
	  author    = {Glorot, Xavier and Bengio, Yoshua},
	  title     = {Understanding the difficulty of training deep feedforward neural networks},
	  booktitle = {International Conference on Artificial Intelligence and Statistics},
	  pages     = {249--256},
	  year      = {2010}
	}


	%% He et al. (2015), a conference paper. The conference is recorded once, as
	%% the booktitle of an inproceedings entry, instead of being repeated in both
	%% a journal and a booktitle field. The year lives only in the year field.
	%% Acronyms and the dataset name are brace-protected against case changes.
	@inproceedings{He2015DelvingDI,
	  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
	  title     = {Delving Deep into Rectifiers: Surpassing Human-Level Performance on {ImageNet} Classification},
	  booktitle = {{IEEE} International Conference on Computer Vision ({ICCV})},
	  pages     = {1026--1034},
	  year      = {2015}
	}


	%% Saxe, McClelland and Ganguli (2013), cited as an arXiv preprint
	%% (identifier 1312.6120) in the same form as the other preprints in this file.
	%% The source record also named the International Conference on Learning
	%% Representations in a booktitle field; article entries do not print
	%% booktitle, so that venue is kept in this comment only.
	@article{Saxe2013ExactST,
	  author  = {Saxe, Andrew M. and McClelland, James L. and Ganguli, Surya},
	  title   = {Exact solutions to the nonlinear dynamics of learning in deep linear neural networks},
	  journal = {arXiv preprint arXiv:1312.6120},
	  year    = {2013}
	}