% File size: 9,991 Bytes
% f71c233 |
% 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
% Compatibility preamble. The \hyper@... / \Hy... names suggest it was emitted
% by hyperref (the package itself is not visible from this file).
% Each definition below is guarded by \providecommand or an \ifx test, so it
% only takes effect when the corresponding macro is not already defined.
\relax
% Fallback: two-argument no-op for \hyper@newdestlabel.
\providecommand\hyper@newdestlabel[2]{}
% Fallback: \HyperFirstAtBeginDocument hands its argument to \AtBeginDocument.
\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
% Hook body: if \hyper@anchor is undefined when the hook runs, install readers
% that accept the extended record formats used in this file and discard the
% extra fields.
\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
% Save the current \contentsline; the replacement takes a 4th argument and drops it.
\global\let\oldcontentsline\contentsline
\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}}
% Save the current \newlabel; the replacement unpacks the 2nd argument into
% five fields (via \newlabelxx) and forwards only the first two of them.
\global\let\oldnewlabel\newlabel
\gdef\newlabel#1#2{\newlabelxx{#1}#2}
\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
% At end of document, restore both saved macros under the same condition.
\AtEndDocument{\ifx\hyper@anchor\@undefined
\let\contentsline\oldcontentsline
\let\newlabel\oldnewlabel
\fi}
\fi}
% Reset \hyper@last, then make any later \HyperFirstAtBeginDocument call
% execute its argument immediately instead of deferring it.
\global\let\hyper@last\relax
\gdef\HyperFirstAtBeginDocument#1{#1}
% Fallbacks: no-op \HyField@... macros taking one and two arguments.
\providecommand\HyField@AuxAddToFields[1]{}
\providecommand\HyField@AuxAddToCoFields[2]{}
% Records for sections 1-3.
% \citation{<key>}: one record per citation occurrence; keys may repeat
% (vaswani2017attention appears on consecutive lines below).
% \@writefile{toc}{...}: a \contentsline entry destined for the table of contents,
% with arguments {level}{\numberline{number}title}{page}{anchor}.
% \newlabel{<key>}{{number}{page}{title}{anchor}{}}: five fields, the fifth empty here.
% \newlabel{<key>@cref}{...}: companion record with bracketed metadata --
% presumably written by cleveref; confirm against the document preamble.
\citation{goodfellow2016deep}
\citation{power2022grokking}
\citation{vaswani2017attention}
% Section 1 "Introduction", page 1.
\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}\protected@file@percent }
\newlabel{sec:intro}{{1}{1}{Introduction}{section.1}{}}
\newlabel{sec:intro@cref}{{[section][1][]1}{[1][1][]1}}
\citation{Glorot2010UnderstandingTD}
% Section 2 "Related Work", page 2.
\@writefile{toc}{\contentsline {section}{\numberline {2}Related Work}{2}{section.2}\protected@file@percent }
\newlabel{sec:related}{{2}{2}{Related Work}{section.2}{}}
\newlabel{sec:related@cref}{{[section][2][]2}{[1][2][]2}}
\citation{He2015DelvingDI}
\citation{Saxe2013ExactST}
\citation{power2022grokking}
\citation{vaswani2017attention}
\citation{vaswani2017attention}
\citation{bahdanau2014neural}
% Subsections 2.1-2.3 are all recorded on page 3; none of them has a \newlabel.
\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Weight Initialization Strategies}{3}{subsection.2.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Grokking Phenomenon}{3}{subsection.2.2}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Transformer Training Dynamics}{3}{subsection.2.3}\protected@file@percent }
% Section 3 "Background", page 3.
\@writefile{toc}{\contentsline {section}{\numberline {3}Background}{3}{section.3}\protected@file@percent }
\newlabel{sec:background}{{3}{3}{Background}{section.3}{}}
\newlabel{sec:background@cref}{{[section][3][]3}{[1][3][]3}}
% Records for subsection 3.1 through section 6.
% \citation{<key>} lines record cited keys; \@writefile{toc|loa}{...} lines carry
% \contentsline entries for the table of contents / list of algorithms;
% \newlabel{<key>}{{number}{page}{title}{anchor}{}} stores a cross-reference target,
% and each <key>@cref twin holds bracketed metadata (presumably for cleveref -- confirm).
\citation{goodfellow2016deep}
\citation{Glorot2010UnderstandingTD}
\citation{He2015DelvingDI}
\citation{Saxe2013ExactST}
\citation{power2022grokking}
\citation{loshchilov2017adamw}
\citation{kingma2014adam}
% Subsection 3.1 "Problem Setting", page 4.
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Problem Setting}{4}{subsection.3.1}\protected@file@percent }
% Section 4 "Method", page 5.
\@writefile{toc}{\contentsline {section}{\numberline {4}Method}{5}{section.4}\protected@file@percent }
\newlabel{sec:method}{{4}{5}{Method}{section.4}{}}
% NOTE(review): the bracketed page value below is 4 while the printed page is 5;
% looks like the value was captured before the page was shipped out -- confirm it is harmless.
\newlabel{sec:method@cref}{{[section][4][]4}{[1][4][]5}}
% Section 5 "Experimental Setup", page 5, with subsections 5.1-5.6 (pages 5-7).
\@writefile{toc}{\contentsline {section}{\numberline {5}Experimental Setup}{5}{section.5}\protected@file@percent }
\newlabel{sec:experimental}{{5}{5}{Experimental Setup}{section.5}{}}
\newlabel{sec:experimental@cref}{{[section][5][]5}{[1][5][]5}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Dataset}{5}{subsection.5.1}\protected@file@percent }
% List-of-algorithms entry: Algorithm 1 "Experimental Procedure", page 6.
\@writefile{loa}{\contentsline {algorithm}{\numberline {1}{\ignorespaces Experimental Procedure\relax }}{6}{algorithm.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Model Architecture}{6}{subsection.5.2}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.3}Training Details}{6}{subsection.5.3}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.4}Initialization Strategies}{6}{subsection.5.4}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.5}Evaluation Metrics}{7}{subsection.5.5}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.6}Implementation Details}{7}{subsection.5.6}\protected@file@percent }
% Section 6 "Results", page 7.
\@writefile{toc}{\contentsline {section}{\numberline {6}Results}{7}{section.6}\protected@file@percent }
\newlabel{sec:results}{{6}{7}{Results}{section.6}{}}
\newlabel{sec:results@cref}{{[section][6][]6}{[1][7][]7}}
% Fallback for \caption@xref when it is not already defined: takes two
% arguments and passes only the first on to \@setref\relax\@undefined.
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
% Figure 1 (page 7, anchor figure.caption.1).
% Each subfigure gets four records: <key>, <key>@cref, sub@<key>, sub@<key>@cref.
% The plain key stores the combined number (1a/1b); the sub@ key stores the letter only.
% Subfigure (a): "Training Accuracy".
\newlabel{fig:train_acc_x_plus_y}{{1a}{7}{Training Accuracy\relax }{figure.caption.1}{}}
\newlabel{fig:train_acc_x_plus_y@cref}{{[subfigure][1][1]1a}{[1][7][]7}}
\newlabel{sub@fig:train_acc_x_plus_y}{{a}{7}{Training Accuracy\relax }{figure.caption.1}{}}
\newlabel{sub@fig:train_acc_x_plus_y@cref}{{[subfigure][1][1]1a}{[1][7][]7}}
% Subfigure (b): "Validation Accuracy".
\newlabel{fig:val_acc_x_plus_y}{{1b}{7}{Validation Accuracy\relax }{figure.caption.1}{}}
\newlabel{fig:val_acc_x_plus_y@cref}{{[subfigure][2][1]1b}{[1][7][]7}}
\newlabel{sub@fig:val_acc_x_plus_y}{{b}{7}{Validation Accuracy\relax }{figure.caption.1}{}}
\newlabel{sub@fig:val_acc_x_plus_y@cref}{{[subfigure][2][1]1b}{[1][7][]7}}
% List-of-figures entry and whole-figure label; both repeat the full caption text.
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Training and Validation Accuracy for x\_plus\_y task across different initialization methods\relax }}{7}{figure.caption.1}\protected@file@percent }
\newlabel{fig:acc_x_plus_y}{{1}{7}{Training and Validation Accuracy for x\_plus\_y task across different initialization methods\relax }{figure.caption.1}{}}
\newlabel{fig:acc_x_plus_y@cref}{{[figure][1][]1}{[1][7][]7}}
% Figure 2 (page 8, anchor figure.caption.2).
% Each subfigure gets four records: <key>, <key>@cref, sub@<key>, sub@<key>@cref.
% The @cref records carry bracketed page value 7 next to printed page 8
% (NOTE(review): presumably captured before shipout -- confirm it is harmless).
% Subfigure (a): "Training Loss".
\newlabel{fig:train_loss_x_minus_y}{{2a}{8}{Training Loss\relax }{figure.caption.2}{}}
\newlabel{fig:train_loss_x_minus_y@cref}{{[subfigure][1][2]2a}{[1][7][]8}}
\newlabel{sub@fig:train_loss_x_minus_y}{{a}{8}{Training Loss\relax }{figure.caption.2}{}}
\newlabel{sub@fig:train_loss_x_minus_y@cref}{{[subfigure][1][2]2a}{[1][7][]8}}
% Subfigure (b): "Validation Loss".
\newlabel{fig:val_loss_x_minus_y}{{2b}{8}{Validation Loss\relax }{figure.caption.2}{}}
\newlabel{fig:val_loss_x_minus_y@cref}{{[subfigure][2][2]2b}{[1][7][]8}}
\newlabel{sub@fig:val_loss_x_minus_y}{{b}{8}{Validation Loss\relax }{figure.caption.2}{}}
\newlabel{sub@fig:val_loss_x_minus_y@cref}{{[subfigure][2][2]2b}{[1][7][]8}}
% List-of-figures entry and whole-figure label; both repeat the full caption text.
\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Training and Validation Loss for x\_minus\_y task across different initialization methods\relax }}{8}{figure.caption.2}\protected@file@percent }
\newlabel{fig:loss_x_minus_y}{{2}{8}{Training and Validation Loss for x\_minus\_y task across different initialization methods\relax }{figure.caption.2}{}}
\newlabel{fig:loss_x_minus_y@cref}{{[figure][2][]2}{[1][7][]8}}
% Figure 3 (page 8, anchor figure.caption.3).
% Each subfigure gets four records: <key>, <key>@cref, sub@<key>, sub@<key>@cref.
% The plain key stores the combined number (3a/3b); the sub@ key stores the letter only.
% Subfigure (a): "Training Accuracy".
\newlabel{fig:train_acc_x_div_y}{{3a}{8}{Training Accuracy\relax }{figure.caption.3}{}}
\newlabel{fig:train_acc_x_div_y@cref}{{[subfigure][1][3]3a}{[1][7][]8}}
\newlabel{sub@fig:train_acc_x_div_y}{{a}{8}{Training Accuracy\relax }{figure.caption.3}{}}
\newlabel{sub@fig:train_acc_x_div_y@cref}{{[subfigure][1][3]3a}{[1][7][]8}}
% Subfigure (b): "Validation Accuracy".
\newlabel{fig:val_acc_x_div_y}{{3b}{8}{Validation Accuracy\relax }{figure.caption.3}{}}
\newlabel{fig:val_acc_x_div_y@cref}{{[subfigure][2][3]3b}{[1][7][]8}}
\newlabel{sub@fig:val_acc_x_div_y}{{b}{8}{Validation Accuracy\relax }{figure.caption.3}{}}
\newlabel{sub@fig:val_acc_x_div_y@cref}{{[subfigure][2][3]3b}{[1][7][]8}}
% List-of-figures entry and whole-figure label; both repeat the full caption text.
\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Training and Validation Accuracy for x\_div\_y task across different initialization methods\relax }}{8}{figure.caption.3}\protected@file@percent }
\newlabel{fig:acc_x_div_y}{{3}{8}{Training and Validation Accuracy for x\_div\_y task across different initialization methods\relax }{figure.caption.3}{}}
\newlabel{fig:acc_x_div_y@cref}{{[figure][3][]3}{[1][7][]8}}
% Figure 4 (page 8, anchor figure.caption.4).
% Each subfigure gets four records: <key>, <key>@cref, sub@<key>, sub@<key>@cref.
% The plain key stores the combined number (4a/4b); the sub@ key stores the letter only.
% Subfigure (a): "Training Loss".
\newlabel{fig:train_loss_permutation}{{4a}{8}{Training Loss\relax }{figure.caption.4}{}}
\newlabel{fig:train_loss_permutation@cref}{{[subfigure][1][4]4a}{[1][8][]8}}
\newlabel{sub@fig:train_loss_permutation}{{a}{8}{Training Loss\relax }{figure.caption.4}{}}
\newlabel{sub@fig:train_loss_permutation@cref}{{[subfigure][1][4]4a}{[1][8][]8}}
% Subfigure (b): "Validation Loss".
\newlabel{fig:val_loss_permutation}{{4b}{8}{Validation Loss\relax }{figure.caption.4}{}}
\newlabel{fig:val_loss_permutation@cref}{{[subfigure][2][4]4b}{[1][8][]8}}
\newlabel{sub@fig:val_loss_permutation}{{b}{8}{Validation Loss\relax }{figure.caption.4}{}}
\newlabel{sub@fig:val_loss_permutation@cref}{{[subfigure][2][4]4b}{[1][8][]8}}
% List-of-figures entry and whole-figure label; both repeat the full caption text.
\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Training and Validation Loss for permutation task across different initialization methods\relax }}{8}{figure.caption.4}\protected@file@percent }
\newlabel{fig:loss_permutation}{{4}{8}{Training and Validation Loss for permutation task across different initialization methods\relax }{figure.caption.4}{}}
\newlabel{fig:loss_permutation@cref}{{[figure][4][]4}{[1][8][]8}}
% Records for Table 1 and Section 7.
\citation{power2022grokking}
% Table 1 (page 9, anchor table.caption.5): list-of-tables entry, label, and @cref twin.
% The caption contains inline math ($S_{99}$) and escaped percent signs, stored verbatim.
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces 95\% Confidence Intervals for Steps to 99\% Validation Accuracy ($S_{99}$)\relax }}{9}{table.caption.5}\protected@file@percent }
\newlabel{tab:confidence_intervals}{{1}{9}{95\% Confidence Intervals for Steps to 99\% Validation Accuracy ($S_{99}$)\relax }{table.caption.5}{}}
% NOTE(review): bracketed page value is 8 while the printed page is 9 -- presumably
% captured before shipout; confirm it is harmless.
\newlabel{tab:confidence_intervals@cref}{{[table][1][]1}{[1][8][]9}}
% Section 7 "Conclusions", page 9.
\@writefile{toc}{\contentsline {section}{\numberline {7}Conclusions}{9}{section.7}\protected@file@percent }
\newlabel{sec:conclusion}{{7}{9}{Conclusions}{section.7}{}}
\newlabel{sec:conclusion@cref}{{[section][7][]7}{[1][9][]9}}
% Bibliography records.
% \bibstyle / \bibdata name the bibliography style (iclr2024_conference) and the
% database (references).
\bibstyle{iclr2024_conference}
\bibdata{references}
% \bibcite{<key>}{{number}{year}{{short author list}}{{full author list}}}
% One record per bibliography entry, numbered 1-9 in the order listed here.
\bibcite{bahdanau2014neural}{{1}{2014}{{Bahdanau et~al.}}{{Bahdanau, Cho, and Bengio}}}
\bibcite{Glorot2010UnderstandingTD}{{2}{2010}{{Glorot \& Bengio}}{{Glorot and Bengio}}}
% Fixed: the full author list previously read "Goodfellow, Bengio, Courville, and Bengio".
% The book has three authors, so the duplicated "Bengio" was removed.
% NOTE(review): the duplicate presumably originates in the `references` database named by
% \bibdata above; correct the author field there too, or it will return when this file is regenerated.
\bibcite{goodfellow2016deep}{{3}{2016}{{Goodfellow et~al.}}{{Goodfellow, Bengio, and Courville}}}
\bibcite{He2015DelvingDI}{{4}{2015}{{He et~al.}}{{He, Zhang, Ren, and Sun}}}
\bibcite{kingma2014adam}{{5}{2014}{{Kingma \& Ba}}{{Kingma and Ba}}}
\bibcite{loshchilov2017adamw}{{6}{2017}{{Loshchilov \& Hutter}}{{Loshchilov and Hutter}}}
\bibcite{power2022grokking}{{7}{2022}{{Power et~al.}}{{Power, Burda, Edwards, Babuschkin, and Misra}}}
\bibcite{Saxe2013ExactST}{{8}{2013}{{Saxe et~al.}}{{Saxe, McClelland, and Ganguli}}}
\bibcite{vaswani2017attention}{{9}{2017}{{Vaswani et~al.}}{{Vaswani, Shazeer, Parmar, Uszkoreit, Jones, Gomez, Kaiser, and Polosukhin}}}
% Trailer records.
% \ttl@finishall: defined outside this file -- presumably a titlesec/titletoc
% end-of-file hook (ttl@ prefix); confirm against the loaded packages.
\ttl@finishall
% Globally defines \@abspage@last as 10. The last page recorded in the entries
% of this file is 9, so this presumably counts one further page -- confirm.
\gdef \@abspage@last{10}
% |