pradachan
/

AI-Scientist

Model card Files Files and versions Community

AI-Scientist / example_papers /layerwise_lr_grokking /latex /template.bbl

pradachan

Upload folder using huggingface_hub

f71c233 verified 19 days ago

raw

history blame contribute delete

3.4 kB

	% Bibliography in natbib label format. The path shown with this file names it
	% template.bbl, so it is presumably BibTeX output -- hand edits may be
	% overwritten on regeneration; verify against the .bib source.
	% The {11} argument is the widest-label sample; the list holds 11 entries.
	\begin{thebibliography}{11}
	% Fallback: \natexlab passes its argument through unchanged, unless a
	% definition already exists (\providecommand never overrides one).
	\providecommand{\natexlab}[1]{#1}
	% Fallback: \url sets its argument in typewriter type if nothing defined it.
	\providecommand{\url}[1]{\texttt{#1}}
	% \csname urlstyle\endcsname equals \relax when \urlstyle is undefined, so
	% the first branch below runs when no \urlstyle is available.
	\expandafter\ifx\csname urlstyle\endcsname\relax
	% No \urlstyle: \doi prints "doi: " followed by its argument as ordinary text.
	\providecommand{\doi}[1]{doi: #1}\else
	% \urlstyle exists: select the rm URL style inside a group and hand the DOI
	% to \Url. The matching \endgroup is not visible here -- presumably \Url
	% closes the group; confirm against the package that defines it.
	\providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi

	% ba2016layer -- layer normalization preprint.
	% Hand-corrected: added the period after the middle initial "E".
	% If this file is regenerated by BibTeX, apply the same fix to the .bib entry.
	\bibitem[Ba et~al.(2016)Ba, Kiros, and Hinton]{ba2016layer}
	Jimmy~Lei Ba, Jamie~Ryan Kiros, and Geoffrey~E. Hinton.
	\newblock Layer normalization.
	\newblock \emph{arXiv preprint arXiv:1607.06450}, 2016.

	% Bahamou2023LayerwiseAS -- layer-wise adaptive step sizes.
	% Hand-corrected: second author's given name spelled out to match the first,
	% and the venue rewritten in the "arXiv preprint arXiv:<id>" form used by the
	% other arXiv entries in this list.
	% If this file is regenerated by BibTeX, apply the same fix to the .bib entry.
	\bibitem[Bahamou \& Goldfarb(2023)Bahamou and Goldfarb]{Bahamou2023LayerwiseAS}
	Achraf Bahamou and Donald Goldfarb.
	\newblock Layer-wise adaptive step-sizes for stochastic first-order methods for
	deep learning.
	\newblock \emph{arXiv preprint arXiv:2305.13664}, 2023.

	% goodfellow2016deep -- the "Deep Learning" textbook.
	% Hand-corrected: Yoshua Bengio was listed twice, both in the natbib
	% long-author label and in the printed author line; the book has three authors.
	% If this file is regenerated by BibTeX, apply the same fix to the .bib entry.
	\bibitem[Goodfellow et~al.(2016)Goodfellow, Bengio, and
	Courville]{goodfellow2016deep}
	Ian Goodfellow, Yoshua Bengio, and Aaron Courville.
	\newblock \emph{Deep learning}, volume~1.
	\newblock MIT Press, 2016.

	% Hu2021LoRALA -- LoRA low-rank adaptation paper.
	% Hand-corrected: first author's name restored ("Edward J. Hu", previously
	% the garbled "J. E. Hu"), acronym capitalised as "LoRA", and the venue
	% rewritten in the "arXiv preprint arXiv:<id>" form used by the other arXiv
	% entries in this list.
	% If this file is regenerated by BibTeX, apply the same fix to the .bib entry.
	\bibitem[Hu et~al.(2021)Hu, Shen, Wallis, Allen-Zhu, Li, Wang, and
	Chen]{Hu2021LoRALA}
	Edward~J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean
	Wang, and Weizhu Chen.
	\newblock LoRA: Low-rank adaptation of large language models.
	\newblock \emph{arXiv preprint arXiv:2106.09685}, 2021.

	% kingma2014adam -- the Adam optimizer preprint.
	% Hand-corrected: added the period after the middle initial "P".
	% If this file is regenerated by BibTeX, apply the same fix to the .bib entry.
	\bibitem[Kingma \& Ba(2014)Kingma and Ba]{kingma2014adam}
	Diederik~P. Kingma and Jimmy Ba.
	\newblock Adam: A method for stochastic optimization.
	\newblock \emph{arXiv preprint arXiv:1412.6980}, 2014.

	% Ko2022NotAL -- layer-wise adaptive large-scale training.
	% Hand-corrected: the acronym "DNN" had been lower-cased to "dnn".
	% If this file is regenerated by BibTeX, brace the acronym in the .bib title
	% (e.g. {DNN}) so the style does not lower-case it again.
	\bibitem[Ko et~al.(2022)Ko, Lee, and Kim]{Ko2022NotAL}
	Yunyong Ko, Dongwon Lee, and Sang-Wook Kim.
	\newblock Not all layers are equal: A layer-wise adaptive approach toward
	large-scale DNN training.
	\newblock \emph{Proceedings of the ACM Web Conference 2022}, 2022.

	% loshchilov2017adamw -- decoupled weight decay preprint.
	% Source laid out one clause per line; the typeset entry is unchanged
	% (line ends act as single spaces, and the trailing % joins label and key).
	\bibitem[Loshchilov \& Hutter(2017)Loshchilov and Hutter]%
	  {loshchilov2017adamw}
	Ilya Loshchilov and
	Frank Hutter.
	\newblock
	Decoupled weight decay regularization.
	\newblock
	\emph{arXiv preprint arXiv:1711.05101},
	2017.

	% paszke2019pytorch -- the PyTorch library paper (NeurIPS 32).
	% Hand-corrected: the proper name "PyTorch" had been lower-cased to "Pytorch".
	% If this file is regenerated by BibTeX, brace the name in the .bib title
	% (e.g. {PyTorch}) so the style does not lower-case it again.
	\bibitem[Paszke et~al.(2019)Paszke, Gross, Massa, Lerer, Bradbury, Chanan,
	Killeen, Lin, Gimelshein, Antiga, et~al.]{paszke2019pytorch}
	Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory
	Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et~al.
	\newblock PyTorch: An imperative style, high-performance deep learning library.
	\newblock \emph{Advances in neural information processing systems}, 32, 2019.

	% power2022grokking -- the grokking preprint.
	% Source laid out one author / one clause per line; the typeset entry is
	% unchanged (line ends act as single spaces, and the trailing % joins label
	% and key).
	\bibitem[Power et~al.(2022)Power, Burda, Edwards, Babuschkin, and Misra]%
	  {power2022grokking}
	Alethea Power,
	Yuri Burda,
	Harri Edwards,
	Igor Babuschkin, and
	Vedant Misra.
	\newblock
	Grokking: Generalization beyond overfitting on small algorithmic datasets.
	\newblock
	\emph{arXiv preprint arXiv:2201.02177},
	2022.

	% Shea2024WhyLS -- plane search / per-layer learning and momentum rates.
	% Hand-corrected: the acronym in "SO-friendly" had been lower-cased to
	% "so-friendly", and the venue is rewritten in the "arXiv preprint arXiv:<id>"
	% form used by the other arXiv entries in this list.
	% If this file is regenerated by BibTeX, apply the same fix to the .bib entry
	% (brace {SO} in the title).
	\bibitem[Shea \& Schmidt(2024)Shea and Schmidt]{Shea2024WhyLS}
	Betty Shea and Mark Schmidt.
	\newblock Why line search when you can plane search? SO-friendly neural
	networks allow per-iteration optimization of learning and momentum rates for
	every layer.
	\newblock \emph{arXiv preprint arXiv:2406.17954}, 2024.

	% vaswani2017attention -- the Transformer paper (NeurIPS 30).
	% Hand-corrected: added the period after the middle initial "N".
	% If this file is regenerated by BibTeX, apply the same fix to the .bib entry.
	\bibitem[Vaswani et~al.(2017)Vaswani, Shazeer, Parmar, Uszkoreit, Jones, Gomez,
	Kaiser, and Polosukhin]{vaswani2017attention}
	Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
	Aidan~N. Gomez, {\L}ukasz Kaiser, and Illia Polosukhin.
	\newblock Attention is all you need.
	\newblock \emph{Advances in neural information processing systems}, 30, 2017.

	\end{thebibliography}