pradachan
/

AI-Scientist

Model card Files Files and versions Community

AI-Scientist / example_papers /mdl_grokking_correlation /latex /references.bib

pradachan

Upload folder using huggingface_hub

f71c233 verified 19 days ago

raw

history blame contribute delete

2.66 kB

	%% LaTeX2e file `references.bib'
	%% generated by the `filecontents' environment
	%% from source `template' on 2024/08/10.
	%%
	% Goodfellow, Bengio and Courville: the Deep Learning textbook (MIT Press, 2016).
	% Each author is listed exactly once; the book is a single volume, so no volume field.
	@book{goodfellow2016deep,
	  author    = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
	  title     = {Deep Learning},
	  publisher = {MIT Press},
	  year      = {2016},
	}

	% Power et al. (2022): the grokking paper, arXiv preprint 2201.02177.
	@article{power2022grokking,
	  author  = {Power, Alethea and Burda, Yuri and Edwards, Harri and Babuschkin, Igor and Misra, Vedant},
	  title   = {Grokking: Generalization beyond overfitting on small algorithmic datasets},
	  journal = {arXiv preprint arXiv:2201.02177},
	  year    = {2022},
	}

	% Vaswani et al. (2017): the Transformer paper.
	% "Advances in Neural Information Processing Systems" is a conference proceedings
	% series, so the venue belongs in booktitle of an inproceedings entry, not in journal.
	@inproceedings{vaswani2017attention,
	  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
	  title     = {Attention is all you need},
	  booktitle = {Advances in Neural Information Processing Systems},
	  volume    = {30},
	  year      = {2017},
	}

	% Kingma and Ba (2014): the Adam optimizer, arXiv preprint 1412.6980.
	@article{kingma2014adam,
	  author  = {Kingma, Diederik P and Ba, Jimmy},
	  title   = {Adam: A method for stochastic optimization},
	  journal = {arXiv preprint arXiv:1412.6980},
	  year    = {2014},
	}

	% Ba, Kiros and Hinton (2016): layer normalization, arXiv preprint 1607.06450.
	@article{ba2016layer,
	  author  = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
	  title   = {Layer normalization},
	  journal = {arXiv preprint arXiv:1607.06450},
	  year    = {2016},
	}

	% Loshchilov and Hutter (2017): decoupled weight decay, arXiv preprint 1711.05101.
	@article{loshchilov2017adamw,
	  author  = {Loshchilov, Ilya and Hutter, Frank},
	  title   = {Decoupled weight decay regularization},
	  journal = {arXiv preprint arXiv:1711.05101},
	  year    = {2017},
	}

	% Radford et al. (2019): language models as unsupervised multitask learners.
	% An article entry requires a journal field, which this work does not have; it is
	% recorded as a technical report with the issuing institution instead.
	@techreport{radford2019language,
	  author      = {Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
	  title       = {Language Models are Unsupervised Multitask Learners},
	  institution = {OpenAI},
	  year        = {2019},
	}

	% Bahdanau, Cho and Bengio (2014): attention-based neural machine translation,
	% arXiv preprint 1409.0473.
	@article{bahdanau2014neural,
	  author  = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
	  title   = {Neural machine translation by jointly learning to align and translate},
	  journal = {arXiv preprint arXiv:1409.0473},
	  year    = {2014},
	}

	% Paszke et al. (2019): the PyTorch library paper.
	% The library name is brace-protected so bibliography styles that lowercase titles
	% keep its capitalisation; the proceedings series goes in booktitle, not journal.
	@inproceedings{paszke2019pytorch,
	  author    = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and others},
	  title     = {{PyTorch}: An imperative style, high-performance deep learning library},
	  booktitle = {Advances in Neural Information Processing Systems},
	  volume    = {32},
	  year      = {2019},
	}

	% Second key for the same arXiv preprint as power2022grokking; the key is kept
	% unchanged in case the document cites it. The arXiv identifier is given in the
	% journal field, matching the other arXiv entries in this file, rather than in a
	% volume field, and booktitle is omitted because it is not an article field.
	@article{Power2022GrokkingGB,
	  author  = {Power, Alethea and Burda, Yuri and Edwards, Harrison and Babuschkin, Igor and Misra, Vedant},
	  title   = {Grokking: Generalization Beyond Overfitting on Small Algorithmic Datasets},
	  journal = {arXiv preprint arXiv:2201.02177},
	  year    = {2022},
	}