|
\relax |
|
\providecommand\hyper@newdestlabel[2]{} |
|
\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument} |
|
\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined |
|
\global\let\oldcontentsline\contentsline |
|
\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}} |
|
\global\let\oldnewlabel\newlabel |
|
\gdef\newlabel#1#2{\newlabelxx{#1}#2} |
|
\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}} |
|
\AtEndDocument{\ifx\hyper@anchor\@undefined |
|
\let\contentsline\oldcontentsline |
|
\let\newlabel\oldnewlabel |
|
\fi} |
|
\fi} |
|
\global\let\hyper@last\relax |
|
\gdef\HyperFirstAtBeginDocument#1{#1} |
|
\providecommand\HyField@AuxAddToFields[1]{} |
|
\providecommand\HyField@AuxAddToCoFields[2]{} |
|
\citation{goodfellow2016deep} |
|
\citation{power2022grokking} |
|
\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}\protected@file@percent } |
|
\newlabel{sec:intro}{{1}{1}{Introduction}{section.1}{}} |
|
\newlabel{sec:intro@cref}{{[section][1][]1}{[1][1][]1}} |
|
\citation{Power2022GrokkingGB} |
|
\citation{goodfellow2016deep} |
|
\citation{bahdanau2014neural} |
|
\citation{paszke2019pytorch} |
|
\@writefile{toc}{\contentsline {section}{\numberline {2}Related Work}{2}{section.2}\protected@file@percent } |
|
\newlabel{sec:related}{{2}{2}{Related Work}{section.2}{}} |
|
\newlabel{sec:related@cref}{{[section][2][]2}{[1][2][]2}} |
|
\citation{radford2019language} |
|
\citation{kingma2014adam} |
|
\citation{loshchilov2017adamw} |
|
\citation{vaswani2017attention} |
|
\citation{goodfellow2016deep} |
|
\citation{power2022grokking} |
|
\citation{goodfellow2016deep} |
|
\@writefile{toc}{\contentsline {section}{\numberline {3}Background}{3}{section.3}\protected@file@percent } |
|
\newlabel{sec:background}{{3}{3}{Background}{section.3}{}} |
|
\newlabel{sec:background@cref}{{[section][3][]3}{[1][3][]3}} |
|
\citation{vaswani2017attention} |
|
\citation{loshchilov2017adamw} |
|
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Problem Setting}{4}{subsection.3.1}\protected@file@percent } |
|
\@writefile{toc}{\contentsline {section}{\numberline {4}Method}{4}{section.4}\protected@file@percent } |
|
\newlabel{sec:method}{{4}{4}{Method}{section.4}{}} |
|
\newlabel{sec:method@cref}{{[section][4][]4}{[1][4][]4}} |
|
\@writefile{toc}{\contentsline {subsection}{\numberline {4.1}MDL Estimation Technique}{4}{subsection.4.1}\protected@file@percent } |
|
\citation{vaswani2017attention} |
|
\citation{paszke2019pytorch} |
|
\citation{loshchilov2017adamw} |
|
\@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Experimental Setup}{5}{subsection.4.2}\protected@file@percent } |
|
\@writefile{toc}{\contentsline {subsection}{\numberline {4.3}Analysis of MDL and Grokking Relationship}{5}{subsection.4.3}\protected@file@percent } |
|
\@writefile{toc}{\contentsline {subsection}{\numberline {4.4}Visualization and Comparative Analysis}{5}{subsection.4.4}\protected@file@percent } |
|
\@writefile{toc}{\contentsline {section}{\numberline {5}Experimental Setup}{5}{section.5}\protected@file@percent } |
|
\newlabel{sec:experimental}{{5}{5}{Experimental Setup}{section.5}{}} |
|
\newlabel{sec:experimental@cref}{{[section][5][]5}{[1][5][]5}} |
|
\citation{loshchilov2017adamw} |
|
\@writefile{toc}{\contentsline {section}{\numberline {6}Results}{6}{section.6}\protected@file@percent } |
|
\newlabel{sec:results}{{6}{6}{Results}{section.6}{}} |
|
\newlabel{sec:results@cref}{{[section][6][]6}{[1][6][]6}} |
|
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Final performance metrics across datasets (mean values over 3 runs)\relax }}{6}{table.caption.1}\protected@file@percent } |
|
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}} |
|
\newlabel{tab:final_performance}{{1}{6}{Final performance metrics across datasets (mean values over 3 runs)\relax }{table.caption.1}{}} |
|
\newlabel{tab:final_performance@cref}{{[table][1][]1}{[1][6][]6}} |
|
\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces Grokking points (steps to reach 95\% and 99\% validation accuracy)\relax }}{6}{table.caption.3}\protected@file@percent } |
|
\newlabel{tab:grokking_points}{{2}{6}{Grokking points (steps to reach 95\% and 99\% validation accuracy)\relax }{table.caption.3}{}} |
|
\newlabel{tab:grokking_points@cref}{{[table][2][]2}{[1][6][]6}} |
|
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Validation accuracy and normalized MDL for x\_div\_y task\relax }}{7}{figure.caption.2}\protected@file@percent } |
|
\newlabel{fig:val_acc_mdl_x_div_y}{{1}{7}{Validation accuracy and normalized MDL for x\_div\_y task\relax }{figure.caption.2}{}} |
|
\newlabel{fig:val_acc_mdl_x_div_y@cref}{{[figure][1][]1}{[1][6][]7}} |
|
\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces MDL transition points vs.\ grokking points across datasets\relax }}{7}{figure.caption.4}\protected@file@percent } |
|
\newlabel{fig:mdl_transition_vs_grokking}{{2}{7}{MDL transition points vs.\ grokking points across datasets\relax }{figure.caption.4}{}} |
|
\newlabel{fig:mdl_transition_vs_grokking@cref}{{[figure][2][]2}{[1][6][]7}} |
|
\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Correlation between MDL reduction and validation accuracy improvement\relax }}{8}{figure.caption.5}\protected@file@percent } |
|
\newlabel{fig:mdl_val_acc_correlation}{{3}{8}{Correlation between MDL reduction and validation accuracy improvement\relax }{figure.caption.5}{}} |
|
\newlabel{fig:mdl_val_acc_correlation@cref}{{[figure][3][]3}{[1][7][]8}} |
|
\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces MDL evolution and generalization gap for x\_div\_y task\relax }}{8}{figure.caption.6}\protected@file@percent } |
|
\newlabel{fig:mdl_gen_gap_x_div_y}{{4}{8}{MDL evolution and generalization gap for x\_div\_y task\relax }{figure.caption.6}{}} |
|
\newlabel{fig:mdl_gen_gap_x_div_y@cref}{{[figure][4][]4}{[1][7][]8}} |
|
\@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces MDL transition rate vs.\ grokking speed across datasets\relax }}{9}{figure.caption.7}\protected@file@percent } |
|
\newlabel{fig:mdl_transition_rate_vs_grokking_speed}{{5}{9}{MDL transition rate vs.\ grokking speed across datasets\relax }{figure.caption.7}{}} |
|
\newlabel{fig:mdl_transition_rate_vs_grokking_speed@cref}{{[figure][5][]5}{[1][7][]9}} |
|
\newlabel{fig:train_acc_x_div_y}{{6a}{9}{Training accuracy for x\_div\_y task\relax }{figure.caption.8}{}} |
|
\newlabel{fig:train_acc_x_div_y@cref}{{[subfigure][1][6]6a}{[1][8][]9}} |
|
\newlabel{sub@fig:train_acc_x_div_y}{{a}{9}{Training accuracy for x\_div\_y task\relax }{figure.caption.8}{}} |
|
\newlabel{sub@fig:train_acc_x_div_y@cref}{{[subfigure][1][6]6a}{[1][8][]9}} |
|
\newlabel{fig:train_loss_x_div_y}{{6b}{9}{Training loss for x\_div\_y task\relax }{figure.caption.8}{}} |
|
\newlabel{fig:train_loss_x_div_y@cref}{{[subfigure][2][6]6b}{[1][8][]9}} |
|
\newlabel{sub@fig:train_loss_x_div_y}{{b}{9}{Training loss for x\_div\_y task\relax }{figure.caption.8}{}} |
|
\newlabel{sub@fig:train_loss_x_div_y@cref}{{[subfigure][2][6]6b}{[1][8][]9}} |
|
\@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces Training metrics for x\_div\_y task\relax }}{9}{figure.caption.8}\protected@file@percent } |
|
\newlabel{fig:training_metrics_x_div_y}{{6}{9}{Training metrics for x\_div\_y task\relax }{figure.caption.8}{}} |
|
\newlabel{fig:training_metrics_x_div_y@cref}{{[figure][6][]6}{[1][8][]9}} |
|
\@writefile{toc}{\contentsline {section}{\numberline {7}Conclusion}{9}{section.7}\protected@file@percent } |
|
\newlabel{sec:conclusion}{{7}{9}{Conclusion}{section.7}{}} |
|
\newlabel{sec:conclusion@cref}{{[section][7][]7}{[1][8][]9}} |
|
\bibstyle{iclr2024_conference} |
|
\bibdata{references} |
|
\bibcite{bahdanau2014neural}{{1}{2014}{{Bahdanau et~al.}}{{Bahdanau, Cho, and Bengio}}} |
|
\bibcite{goodfellow2016deep}{{2}{2016}{{Goodfellow et~al.}}{{Goodfellow, Bengio, Courville, and Bengio}}} |
|
\bibcite{kingma2014adam}{{3}{2014}{{Kingma \& Ba}}{{Kingma and Ba}}} |
|
\bibcite{loshchilov2017adamw}{{4}{2017}{{Loshchilov \& Hutter}}{{Loshchilov and Hutter}}} |
|
\bibcite{paszke2019pytorch}{{5}{2019}{{Paszke et~al.}}{{Paszke, Gross, Massa, Lerer, Bradbury, Chanan, Killeen, Lin, Gimelshein, Antiga, et~al.}}} |
|
\bibcite{power2022grokking}{{6}{2022{a}}{{Power et~al.}}{{Power, Burda, Edwards, Babuschkin, and Misra}}} |
|
\bibcite{Power2022GrokkingGB}{{7}{2022{b}}{{Power et~al.}}{{Power, Burda, Edwards, Babuschkin, and Misra}}} |
|
\bibcite{radford2019language}{{8}{2019}{{Radford et~al.}}{{Radford, Wu, Child, Luan, Amodei, and Sutskever}}} |
|
\bibcite{vaswani2017attention}{{9}{2017}{{Vaswani et~al.}}{{Vaswani, Shazeer, Parmar, Uszkoreit, Jones, Gomez, Kaiser, and Polosukhin}}} |
|
\@writefile{toc}{\contentsline {section}{\numberline {8}Related Work}{10}{section.8}\protected@file@percent } |
|
\newlabel{sec:related}{{8}{10}{Related Work}{section.8}{}} |
|
\newlabel{sec:related@cref}{{[section][8][]8}{[1][10][]10}} |
|
\ttl@finishall |
|
\gdef \@abspage@last{10} |
|
|