pradachan
/

AI-Scientist

Model card Files Files and versions Community

AI-Scientist / example_papers /weight_initialization_grokking /latex /template.bbl

pradachan's picture

Upload folder using huggingface_hub

f71c233 verified 19 days ago

2.71 kB

	\begin{thebibliography}{9}
	% Fallback definitions emitted ahead of the entries. \providecommand only
	% defines a macro that is not already defined, so definitions made
	% elsewhere take precedence over the ones below.
	% \natexlab: passes its single argument through unchanged.
	\providecommand{\natexlab}[1]{#1}
	% \url: fallback that typesets its argument with \texttt.
	\providecommand{\url}[1]{\texttt{#1}}
	% \csname urlstyle\endcsname compares equal to \relax when \urlstyle is
	% undefined, so the first branch below is the "no \urlstyle" case.
	\expandafter\ifx\csname urlstyle\endcsname\relax
	% No \urlstyle: \doi prints "doi: " followed by its argument as plain text.
	\providecommand{\doi}[1]{doi: #1}\else
	% \urlstyle available: \doi prints "doi: ", opens a group, selects the rm
	% URL style and hands over to \Url. \Url is not defined in this file;
	% presumably it reads the DOI argument and closes the group -- confirm
	% against the url package.
	\providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi

	\bibitem[Bahdanau et~al.(2014)Bahdanau, Cho, and Bengio]{bahdanau2014neural}
	Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio.
	\newblock Neural machine translation by jointly learning to align and
	translate.
	\newblock \emph{arXiv preprint arXiv:1409.0473}, 2014.

	\bibitem[Glorot \& Bengio(2010)Glorot and Bengio]{Glorot2010UnderstandingTD}
	Xavier Glorot and Yoshua Bengio.
	\newblock Understanding the difficulty of training deep feedforward neural
	networks.
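	\newblock In \emph{Proceedings of the Thirteenth International Conference on
	Artificial Intelligence and Statistics}, pp.\ 249--256, 2010.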

	\bibitem[Goodfellow et~al.(2016)Goodfellow, Bengio, and
	Courville]{goodfellow2016deep}
	Ian Goodfellow, Yoshua Bengio, and Aaron Courville.
	\newblock \emph{Deep learning}, volume~1.
	\newblock MIT Press, 2016.

	\bibitem[He et~al.(2015)He, Zhang, Ren, and Sun]{He2015DelvingDI}
	Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun.
	\newblock Delving deep into rectifiers: Surpassing human-level performance on
	ImageNet classification.
	\newblock \emph{2015 IEEE International Conference on Computer Vision (ICCV)},
	pp.\ 1026--1034, 2015.

	\bibitem[Kingma \& Ba(2014)Kingma and Ba]{kingma2014adam}
	Diederik~P. Kingma and Jimmy Ba.
	\newblock Adam: A method for stochastic optimization.
	\newblock \emph{arXiv preprint arXiv:1412.6980}, 2014.

	\bibitem[Loshchilov \& Hutter(2017)Loshchilov and Hutter]{loshchilov2017adamw}
	Ilya Loshchilov and Frank Hutter.
	\newblock Decoupled weight decay regularization.
	\newblock \emph{arXiv preprint arXiv:1711.05101}, 2017.

	\bibitem[Power et~al.(2022)Power, Burda, Edwards, Babuschkin, and
	Misra]{power2022grokking}
	Alethea Power, Yuri Burda, Harri Edwards, Igor Babuschkin, and Vedant Misra.
	\newblock Grokking: Generalization beyond overfitting on small algorithmic
	datasets.
	\newblock \emph{arXiv preprint arXiv:2201.02177}, 2022.

	\bibitem[Saxe et~al.(2013)Saxe, McClelland, and Ganguli]{Saxe2013ExactST}
	Andrew~M. Saxe, James~L. McClelland, and Surya Ganguli.
	\newblock Exact solutions to the nonlinear dynamics of learning in deep linear
	neural networks.
	\newblock \emph{CoRR}, abs/1312.6120, 2013.

	\bibitem[Vaswani et~al.(2017)Vaswani, Shazeer, Parmar, Uszkoreit, Jones, Gomez,
	Kaiser, and Polosukhin]{vaswani2017attention}
	Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
	Aidan~N. Gomez, {\L}ukasz Kaiser, and Illia Polosukhin.
	\newblock Attention is all you need.
	\newblock \emph{Advances in neural information processing systems}, 30, 2017.

	\end{thebibliography}