File size: 9,991 Bytes
f71c233
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
\relax 
\providecommand\hyper@newdestlabel[2]{}
\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
\global\let\oldcontentsline\contentsline
\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}}
\global\let\oldnewlabel\newlabel
\gdef\newlabel#1#2{\newlabelxx{#1}#2}
\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
\AtEndDocument{\ifx\hyper@anchor\@undefined
\let\contentsline\oldcontentsline
\let\newlabel\oldnewlabel
\fi}
\fi}
\global\let\hyper@last\relax 
\gdef\HyperFirstAtBeginDocument#1{#1}
\providecommand\HyField@AuxAddToFields[1]{}
\providecommand\HyField@AuxAddToCoFields[2]{}
\citation{goodfellow2016deep}
\citation{power2022grokking}
\citation{vaswani2017attention}
\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}\protected@file@percent }
\newlabel{sec:intro}{{1}{1}{Introduction}{section.1}{}}
\newlabel{sec:intro@cref}{{[section][1][]1}{[1][1][]1}}
\citation{Glorot2010UnderstandingTD}
\@writefile{toc}{\contentsline {section}{\numberline {2}Related Work}{2}{section.2}\protected@file@percent }
\newlabel{sec:related}{{2}{2}{Related Work}{section.2}{}}
\newlabel{sec:related@cref}{{[section][2][]2}{[1][2][]2}}
\citation{He2015DelvingDI}
\citation{Saxe2013ExactST}
\citation{power2022grokking}
\citation{vaswani2017attention}
\citation{vaswani2017attention}
\citation{bahdanau2014neural}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Weight Initialization Strategies}{3}{subsection.2.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Grokking Phenomenon}{3}{subsection.2.2}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Transformer Training Dynamics}{3}{subsection.2.3}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {3}Background}{3}{section.3}\protected@file@percent }
\newlabel{sec:background}{{3}{3}{Background}{section.3}{}}
\newlabel{sec:background@cref}{{[section][3][]3}{[1][3][]3}}
\citation{goodfellow2016deep}
\citation{Glorot2010UnderstandingTD}
\citation{He2015DelvingDI}
\citation{Saxe2013ExactST}
\citation{power2022grokking}
\citation{loshchilov2017adamw}
\citation{kingma2014adam}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Problem Setting}{4}{subsection.3.1}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {4}Method}{5}{section.4}\protected@file@percent }
\newlabel{sec:method}{{4}{5}{Method}{section.4}{}}
\newlabel{sec:method@cref}{{[section][4][]4}{[1][4][]5}}
\@writefile{toc}{\contentsline {section}{\numberline {5}Experimental Setup}{5}{section.5}\protected@file@percent }
\newlabel{sec:experimental}{{5}{5}{Experimental Setup}{section.5}{}}
\newlabel{sec:experimental@cref}{{[section][5][]5}{[1][5][]5}}
\@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Dataset}{5}{subsection.5.1}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {1}{\ignorespaces Experimental Procedure\relax }}{6}{algorithm.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Model Architecture}{6}{subsection.5.2}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.3}Training Details}{6}{subsection.5.3}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.4}Initialization Strategies}{6}{subsection.5.4}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.5}Evaluation Metrics}{7}{subsection.5.5}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {5.6}Implementation Details}{7}{subsection.5.6}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {6}Results}{7}{section.6}\protected@file@percent }
\newlabel{sec:results}{{6}{7}{Results}{section.6}{}}
\newlabel{sec:results@cref}{{[section][6][]6}{[1][7][]7}}
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
\newlabel{fig:train_acc_x_plus_y}{{1a}{7}{Training Accuracy\relax }{figure.caption.1}{}}
\newlabel{fig:train_acc_x_plus_y@cref}{{[subfigure][1][1]1a}{[1][7][]7}}
\newlabel{sub@fig:train_acc_x_plus_y}{{a}{7}{Training Accuracy\relax }{figure.caption.1}{}}
\newlabel{sub@fig:train_acc_x_plus_y@cref}{{[subfigure][1][1]1a}{[1][7][]7}}
\newlabel{fig:val_acc_x_plus_y}{{1b}{7}{Validation Accuracy\relax }{figure.caption.1}{}}
\newlabel{fig:val_acc_x_plus_y@cref}{{[subfigure][2][1]1b}{[1][7][]7}}
\newlabel{sub@fig:val_acc_x_plus_y}{{b}{7}{Validation Accuracy\relax }{figure.caption.1}{}}
\newlabel{sub@fig:val_acc_x_plus_y@cref}{{[subfigure][2][1]1b}{[1][7][]7}}
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Training and Validation Accuracy for x\_plus\_y task across different initialization methods\relax }}{7}{figure.caption.1}\protected@file@percent }
\newlabel{fig:acc_x_plus_y}{{1}{7}{Training and Validation Accuracy for x\_plus\_y task across different initialization methods\relax }{figure.caption.1}{}}
\newlabel{fig:acc_x_plus_y@cref}{{[figure][1][]1}{[1][7][]7}}
\newlabel{fig:train_loss_x_minus_y}{{2a}{8}{Training Loss\relax }{figure.caption.2}{}}
\newlabel{fig:train_loss_x_minus_y@cref}{{[subfigure][1][2]2a}{[1][7][]8}}
\newlabel{sub@fig:train_loss_x_minus_y}{{a}{8}{Training Loss\relax }{figure.caption.2}{}}
\newlabel{sub@fig:train_loss_x_minus_y@cref}{{[subfigure][1][2]2a}{[1][7][]8}}
\newlabel{fig:val_loss_x_minus_y}{{2b}{8}{Validation Loss\relax }{figure.caption.2}{}}
\newlabel{fig:val_loss_x_minus_y@cref}{{[subfigure][2][2]2b}{[1][7][]8}}
\newlabel{sub@fig:val_loss_x_minus_y}{{b}{8}{Validation Loss\relax }{figure.caption.2}{}}
\newlabel{sub@fig:val_loss_x_minus_y@cref}{{[subfigure][2][2]2b}{[1][7][]8}}
\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Training and Validation Loss for x\_minus\_y task across different initialization methods\relax }}{8}{figure.caption.2}\protected@file@percent }
\newlabel{fig:loss_x_minus_y}{{2}{8}{Training and Validation Loss for x\_minus\_y task across different initialization methods\relax }{figure.caption.2}{}}
\newlabel{fig:loss_x_minus_y@cref}{{[figure][2][]2}{[1][7][]8}}
\newlabel{fig:train_acc_x_div_y}{{3a}{8}{Training Accuracy\relax }{figure.caption.3}{}}
\newlabel{fig:train_acc_x_div_y@cref}{{[subfigure][1][3]3a}{[1][7][]8}}
\newlabel{sub@fig:train_acc_x_div_y}{{a}{8}{Training Accuracy\relax }{figure.caption.3}{}}
\newlabel{sub@fig:train_acc_x_div_y@cref}{{[subfigure][1][3]3a}{[1][7][]8}}
\newlabel{fig:val_acc_x_div_y}{{3b}{8}{Validation Accuracy\relax }{figure.caption.3}{}}
\newlabel{fig:val_acc_x_div_y@cref}{{[subfigure][2][3]3b}{[1][7][]8}}
\newlabel{sub@fig:val_acc_x_div_y}{{b}{8}{Validation Accuracy\relax }{figure.caption.3}{}}
\newlabel{sub@fig:val_acc_x_div_y@cref}{{[subfigure][2][3]3b}{[1][7][]8}}
\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Training and Validation Accuracy for x\_div\_y task across different initialization methods\relax }}{8}{figure.caption.3}\protected@file@percent }
\newlabel{fig:acc_x_div_y}{{3}{8}{Training and Validation Accuracy for x\_div\_y task across different initialization methods\relax }{figure.caption.3}{}}
\newlabel{fig:acc_x_div_y@cref}{{[figure][3][]3}{[1][7][]8}}
\newlabel{fig:train_loss_permutation}{{4a}{8}{Training Loss\relax }{figure.caption.4}{}}
\newlabel{fig:train_loss_permutation@cref}{{[subfigure][1][4]4a}{[1][8][]8}}
\newlabel{sub@fig:train_loss_permutation}{{a}{8}{Training Loss\relax }{figure.caption.4}{}}
\newlabel{sub@fig:train_loss_permutation@cref}{{[subfigure][1][4]4a}{[1][8][]8}}
\newlabel{fig:val_loss_permutation}{{4b}{8}{Validation Loss\relax }{figure.caption.4}{}}
\newlabel{fig:val_loss_permutation@cref}{{[subfigure][2][4]4b}{[1][8][]8}}
\newlabel{sub@fig:val_loss_permutation}{{b}{8}{Validation Loss\relax }{figure.caption.4}{}}
\newlabel{sub@fig:val_loss_permutation@cref}{{[subfigure][2][4]4b}{[1][8][]8}}
\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Training and Validation Loss for permutation task across different initialization methods\relax }}{8}{figure.caption.4}\protected@file@percent }
\newlabel{fig:loss_permutation}{{4}{8}{Training and Validation Loss for permutation task across different initialization methods\relax }{figure.caption.4}{}}
\newlabel{fig:loss_permutation@cref}{{[figure][4][]4}{[1][8][]8}}
\citation{power2022grokking}
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces 95\% Confidence Intervals for Steps to 99\% Validation Accuracy ($S_{99}$)\relax }}{9}{table.caption.5}\protected@file@percent }
\newlabel{tab:confidence_intervals}{{1}{9}{95\% Confidence Intervals for Steps to 99\% Validation Accuracy ($S_{99}$)\relax }{table.caption.5}{}}
\newlabel{tab:confidence_intervals@cref}{{[table][1][]1}{[1][8][]9}}
\@writefile{toc}{\contentsline {section}{\numberline {7}Conclusions}{9}{section.7}\protected@file@percent }
\newlabel{sec:conclusion}{{7}{9}{Conclusions}{section.7}{}}
\newlabel{sec:conclusion@cref}{{[section][7][]7}{[1][9][]9}}
\bibstyle{iclr2024_conference}
\bibdata{references}
\bibcite{bahdanau2014neural}{{1}{2014}{{Bahdanau et~al.}}{{Bahdanau, Cho, and Bengio}}}
\bibcite{Glorot2010UnderstandingTD}{{2}{2010}{{Glorot \& Bengio}}{{Glorot and Bengio}}}
\bibcite{goodfellow2016deep}{{3}{2016}{{Goodfellow et~al.}}{{Goodfellow, Bengio, Courville, and Bengio}}}
\bibcite{He2015DelvingDI}{{4}{2015}{{He et~al.}}{{He, Zhang, Ren, and Sun}}}
\bibcite{kingma2014adam}{{5}{2014}{{Kingma \& Ba}}{{Kingma and Ba}}}
\bibcite{loshchilov2017adamw}{{6}{2017}{{Loshchilov \& Hutter}}{{Loshchilov and Hutter}}}
\bibcite{power2022grokking}{{7}{2022}{{Power et~al.}}{{Power, Burda, Edwards, Babuschkin, and Misra}}}
\bibcite{Saxe2013ExactST}{{8}{2013}{{Saxe et~al.}}{{Saxe, McClelland, and Ganguli}}}
\bibcite{vaswani2017attention}{{9}{2017}{{Vaswani et~al.}}{{Vaswani, Shazeer, Parmar, Uszkoreit, Jones, Gomez, Kaiser, and Polosukhin}}}
\ttl@finishall
\gdef \@abspage@last{10}