diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,4927 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.982608695652174,
+  "eval_steps": 87,
+  "global_step": 690,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.002898550724637681,
+      "grad_norm": 0.44052618741989136,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 1.4473,
+      "step": 1
+    },
+    {
+      "epoch": 0.002898550724637681,
+      "eval_loss": 1.4117156267166138,
+      "eval_runtime": 46.1446,
+      "eval_samples_per_second": 5.548,
+      "eval_steps_per_second": 0.693,
+      "step": 1
+    },
+    {
+      "epoch": 0.005797101449275362,
+      "grad_norm": 0.4932183027267456,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 1.3923,
+      "step": 2
+    },
+    {
+      "epoch": 0.008695652173913044,
+      "grad_norm": 0.4844379723072052,
+      "learning_rate": 3e-06,
+      "loss": 1.4468,
+      "step": 3
+    },
+    {
+      "epoch": 0.011594202898550725,
+      "grad_norm": 0.5023930668830872,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 1.3773,
+      "step": 4
+    },
+    {
+      "epoch": 0.014492753623188406,
+      "grad_norm": 0.483876496553421,
+      "learning_rate": 5e-06,
+      "loss": 1.4103,
+      "step": 5
+    },
+    {
+      "epoch": 0.017391304347826087,
+      "grad_norm": 0.4460753798484802,
+      "learning_rate": 6e-06,
+      "loss": 1.4707,
+      "step": 6
+    },
+    {
+      "epoch": 0.020289855072463767,
+      "grad_norm": 0.4342319369316101,
+      "learning_rate": 7e-06,
+      "loss": 1.3563,
+      "step": 7
+    },
+    {
+      "epoch": 0.02318840579710145,
+      "grad_norm": 0.479257196187973,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 1.414,
+      "step": 8
+    },
+    {
+      "epoch": 0.02608695652173913,
+      "grad_norm": 0.5028970241546631,
+      "learning_rate": 9e-06,
+      "loss": 1.4601,
+      "step": 9
+    },
+    {
+      "epoch": 0.028985507246376812,
+      "grad_norm": 0.49131453037261963,
+      "learning_rate": 1e-05,
+      "loss": 1.4364,
+      "step": 10
+    },
+    {
+      "epoch": 0.03188405797101449,
+      "grad_norm": 0.5517832040786743,
+      "learning_rate": 9.999946639344475e-06,
+      "loss": 1.4873,
+      "step": 11
+    },
+    {
+      "epoch": 0.034782608695652174,
+      "grad_norm": 0.5310211181640625,
+      "learning_rate": 9.99978655851684e-06,
+      "loss": 1.4346,
+      "step": 12
+    },
+    {
+      "epoch": 0.03768115942028986,
+      "grad_norm": 0.4639141857624054,
+      "learning_rate": 9.999519760933905e-06,
+      "loss": 1.4402,
+      "step": 13
+    },
+    {
+      "epoch": 0.04057971014492753,
+      "grad_norm": 0.47811073064804077,
+      "learning_rate": 9.999146252290264e-06,
+      "loss": 1.4106,
+      "step": 14
+    },
+    {
+      "epoch": 0.043478260869565216,
+      "grad_norm": 0.5223386883735657,
+      "learning_rate": 9.998666040558187e-06,
+      "loss": 1.3732,
+      "step": 15
+    },
+    {
+      "epoch": 0.0463768115942029,
+      "grad_norm": 0.5601791143417358,
+      "learning_rate": 9.998079135987437e-06,
+      "loss": 1.4166,
+      "step": 16
+    },
+    {
+      "epoch": 0.04927536231884058,
+      "grad_norm": 0.5459745526313782,
+      "learning_rate": 9.997385551105061e-06,
+      "loss": 1.4501,
+      "step": 17
+    },
+    {
+      "epoch": 0.05217391304347826,
+      "grad_norm": 0.6155043244361877,
+      "learning_rate": 9.996585300715117e-06,
+      "loss": 1.3987,
+      "step": 18
+    },
+    {
+      "epoch": 0.05507246376811594,
+      "grad_norm": 0.539135754108429,
+      "learning_rate": 9.995678401898354e-06,
+      "loss": 1.3943,
+      "step": 19
+    },
+    {
+      "epoch": 0.057971014492753624,
+      "grad_norm": 0.5232663154602051,
+      "learning_rate": 9.994664874011864e-06,
+      "loss": 1.3742,
+      "step": 20
+    },
+    {
+      "epoch": 0.06086956521739131,
+      "grad_norm": 0.4995758533477783,
+      "learning_rate": 9.993544738688647e-06,
+      "loss": 1.3969,
+      "step": 21
+    },
+    {
+      "epoch": 0.06376811594202898,
+      "grad_norm": 0.5397970080375671,
+      "learning_rate": 9.992318019837171e-06,
+      "loss": 1.3238,
+      "step": 22
+    },
+    {
+      "epoch": 0.06666666666666667,
+      "grad_norm": 0.5533668994903564,
+      "learning_rate": 9.990984743640839e-06,
+      "loss": 1.3717,
+      "step": 23
+    },
+    {
+      "epoch": 0.06956521739130435,
+      "grad_norm": 0.5304050445556641,
+      "learning_rate": 9.989544938557453e-06,
+      "loss": 1.3565,
+      "step": 24
+    },
+    {
+      "epoch": 0.07246376811594203,
+      "grad_norm": 0.5658550262451172,
+      "learning_rate": 9.987998635318586e-06,
+      "loss": 1.3075,
+      "step": 25
+    },
+    {
+      "epoch": 0.07536231884057971,
+      "grad_norm": 0.5798805952072144,
+      "learning_rate": 9.98634586692894e-06,
+      "loss": 1.4202,
+      "step": 26
+    },
+    {
+      "epoch": 0.0782608695652174,
+      "grad_norm": 0.49352607131004333,
+      "learning_rate": 9.984586668665641e-06,
+      "loss": 1.3172,
+      "step": 27
+    },
+    {
+      "epoch": 0.08115942028985507,
+      "grad_norm": 0.576454222202301,
+      "learning_rate": 9.982721078077474e-06,
+      "loss": 1.3633,
+      "step": 28
+    },
+    {
+      "epoch": 0.08405797101449275,
+      "grad_norm": 0.5843266248703003,
+      "learning_rate": 9.980749134984094e-06,
+      "loss": 1.3031,
+      "step": 29
+    },
+    {
+      "epoch": 0.08695652173913043,
+      "grad_norm": 0.5863199234008789,
+      "learning_rate": 9.978670881475173e-06,
+      "loss": 1.3228,
+      "step": 30
+    },
+    {
+      "epoch": 0.08985507246376812,
+      "grad_norm": 0.6071418523788452,
+      "learning_rate": 9.9764863619095e-06,
+      "loss": 1.3277,
+      "step": 31
+    },
+    {
+      "epoch": 0.0927536231884058,
+      "grad_norm": 0.5361754298210144,
+      "learning_rate": 9.97419562291403e-06,
+      "loss": 1.3189,
+      "step": 32
+    },
+    {
+      "epoch": 0.09565217391304348,
+      "grad_norm": 0.6043053865432739,
+      "learning_rate": 9.971798713382896e-06,
+      "loss": 1.2567,
+      "step": 33
+    },
+    {
+      "epoch": 0.09855072463768116,
+      "grad_norm": 0.4795907139778137,
+      "learning_rate": 9.96929568447637e-06,
+      "loss": 1.33,
+      "step": 34
+    },
+    {
+      "epoch": 0.10144927536231885,
+      "grad_norm": 0.5752019882202148,
+      "learning_rate": 9.96668658961975e-06,
+      "loss": 1.1915,
+      "step": 35
+    },
+    {
+      "epoch": 0.10434782608695652,
+      "grad_norm": 0.47888195514678955,
+      "learning_rate": 9.963971484502247e-06,
+      "loss": 1.2753,
+      "step": 36
+    },
+    {
+      "epoch": 0.1072463768115942,
+      "grad_norm": 0.5371452569961548,
+      "learning_rate": 9.96115042707577e-06,
+      "loss": 1.2659,
+      "step": 37
+    },
+    {
+      "epoch": 0.11014492753623188,
+      "grad_norm": 0.6198606491088867,
+      "learning_rate": 9.958223477553715e-06,
+      "loss": 1.2166,
+      "step": 38
+    },
+    {
+      "epoch": 0.11304347826086956,
+      "grad_norm": 0.4718591272830963,
+      "learning_rate": 9.955190698409656e-06,
+      "loss": 1.2708,
+      "step": 39
+    },
+    {
+      "epoch": 0.11594202898550725,
+      "grad_norm": 0.5691114068031311,
+      "learning_rate": 9.952052154376027e-06,
+      "loss": 1.2074,
+      "step": 40
+    },
+    {
+      "epoch": 0.11884057971014493,
+      "grad_norm": 0.515771210193634,
+      "learning_rate": 9.948807912442735e-06,
+      "loss": 1.1958,
+      "step": 41
+    },
+    {
+      "epoch": 0.12173913043478261,
+      "grad_norm": 0.6830301880836487,
+      "learning_rate": 9.945458041855732e-06,
+      "loss": 1.2992,
+      "step": 42
+    },
+    {
+      "epoch": 0.1246376811594203,
+      "grad_norm": 0.5583641529083252,
+      "learning_rate": 9.94200261411553e-06,
+      "loss": 1.2654,
+      "step": 43
+    },
+    {
+      "epoch": 0.12753623188405797,
+      "grad_norm": 0.5985351800918579,
+      "learning_rate": 9.938441702975689e-06,
+      "loss": 1.2064,
+      "step": 44
+    },
+    {
+      "epoch": 0.13043478260869565,
+      "grad_norm": 0.5092725157737732,
+      "learning_rate": 9.93477538444123e-06,
+      "loss": 1.1477,
+      "step": 45
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.5719948410987854,
+      "learning_rate": 9.931003736767013e-06,
+      "loss": 1.3045,
+      "step": 46
+    },
+    {
+      "epoch": 0.13623188405797101,
+      "grad_norm": 0.5000984072685242,
+      "learning_rate": 9.92712684045608e-06,
+      "loss": 1.2954,
+      "step": 47
+    },
+    {
+      "epoch": 0.1391304347826087,
+      "grad_norm": 0.6268609762191772,
+      "learning_rate": 9.923144778257918e-06,
+      "loss": 1.2742,
+      "step": 48
+    },
+    {
+      "epoch": 0.14202898550724638,
+      "grad_norm": 0.5395749807357788,
+      "learning_rate": 9.91905763516671e-06,
+      "loss": 1.1651,
+      "step": 49
+    },
+    {
+      "epoch": 0.14492753623188406,
+      "grad_norm": 0.6797102689743042,
+      "learning_rate": 9.91486549841951e-06,
+      "loss": 1.2083,
+      "step": 50
+    },
+    {
+      "epoch": 0.14782608695652175,
+      "grad_norm": 0.554821252822876,
+      "learning_rate": 9.91056845749438e-06,
+      "loss": 1.1623,
+      "step": 51
+    },
+    {
+      "epoch": 0.15072463768115943,
+      "grad_norm": 0.6033896803855896,
+      "learning_rate": 9.906166604108494e-06,
+      "loss": 1.2135,
+      "step": 52
+    },
+    {
+      "epoch": 0.1536231884057971,
+      "grad_norm": 0.568701446056366,
+      "learning_rate": 9.901660032216159e-06,
+      "loss": 1.1956,
+      "step": 53
+    },
+    {
+      "epoch": 0.1565217391304348,
+      "grad_norm": 0.6862343549728394,
+      "learning_rate": 9.89704883800683e-06,
+      "loss": 1.1992,
+      "step": 54
+    },
+    {
+      "epoch": 0.15942028985507245,
+      "grad_norm": 0.49399352073669434,
+      "learning_rate": 9.892333119903045e-06,
+      "loss": 1.1711,
+      "step": 55
+    },
+    {
+      "epoch": 0.16231884057971013,
+      "grad_norm": 0.5683416724205017,
+      "learning_rate": 9.887512978558329e-06,
+      "loss": 1.2608,
+      "step": 56
+    },
+    {
+      "epoch": 0.16521739130434782,
+      "grad_norm": 0.4855175018310547,
+      "learning_rate": 9.88258851685504e-06,
+      "loss": 1.1652,
+      "step": 57
+    },
+    {
+      "epoch": 0.1681159420289855,
+      "grad_norm": 0.5765471458435059,
+      "learning_rate": 9.877559839902185e-06,
+      "loss": 1.2653,
+      "step": 58
+    },
+    {
+      "epoch": 0.17101449275362318,
+      "grad_norm": 0.5921582579612732,
+      "learning_rate": 9.872427055033156e-06,
+      "loss": 1.1191,
+      "step": 59
+    },
+    {
+      "epoch": 0.17391304347826086,
+      "grad_norm": 0.5046260356903076,
+      "learning_rate": 9.867190271803466e-06,
+      "loss": 1.1824,
+      "step": 60
+    },
+    {
+      "epoch": 0.17681159420289855,
+      "grad_norm": 0.5180432796478271,
+      "learning_rate": 9.861849601988384e-06,
+      "loss": 1.1736,
+      "step": 61
+    },
+    {
+      "epoch": 0.17971014492753623,
+      "grad_norm": 0.65400230884552,
+      "learning_rate": 9.85640515958057e-06,
+      "loss": 1.1129,
+      "step": 62
+    },
+    {
+      "epoch": 0.1826086956521739,
+      "grad_norm": 0.5726003646850586,
+      "learning_rate": 9.85085706078763e-06,
+      "loss": 1.1567,
+      "step": 63
+    },
+    {
+      "epoch": 0.1855072463768116,
+      "grad_norm": 0.5297178030014038,
+      "learning_rate": 9.845205424029639e-06,
+      "loss": 1.101,
+      "step": 64
+    },
+    {
+      "epoch": 0.18840579710144928,
+      "grad_norm": 0.5242377519607544,
+      "learning_rate": 9.839450369936615e-06,
+      "loss": 1.174,
+      "step": 65
+    },
+    {
+      "epoch": 0.19130434782608696,
+      "grad_norm": 0.5277882218360901,
+      "learning_rate": 9.833592021345938e-06,
+      "loss": 1.1772,
+      "step": 66
+    },
+    {
+      "epoch": 0.19420289855072465,
+      "grad_norm": 0.5334244966506958,
+      "learning_rate": 9.827630503299741e-06,
+      "loss": 1.1722,
+      "step": 67
+    },
+    {
+      "epoch": 0.19710144927536233,
+      "grad_norm": 0.6054286360740662,
+      "learning_rate": 9.821565943042225e-06,
+      "loss": 1.2022,
+      "step": 68
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5691675543785095,
+      "learning_rate": 9.815398470016957e-06,
+      "loss": 1.1256,
+      "step": 69
+    },
+    {
+      "epoch": 0.2028985507246377,
+      "grad_norm": 0.4579974114894867,
+      "learning_rate": 9.809128215864096e-06,
+      "loss": 1.1548,
+      "step": 70
+    },
+    {
+      "epoch": 0.20579710144927535,
+      "grad_norm": 0.605627715587616,
+      "learning_rate": 9.802755314417592e-06,
+      "loss": 1.0972,
+      "step": 71
+    },
+    {
+      "epoch": 0.20869565217391303,
+      "grad_norm": 0.5655208826065063,
+      "learning_rate": 9.796279901702326e-06,
+      "loss": 1.0902,
+      "step": 72
+    },
+    {
+      "epoch": 0.21159420289855072,
+      "grad_norm": 0.570743978023529,
+      "learning_rate": 9.789702115931202e-06,
+      "loss": 1.0654,
+      "step": 73
+    },
+    {
+      "epoch": 0.2144927536231884,
+      "grad_norm": 0.7513704895973206,
+      "learning_rate": 9.783022097502204e-06,
+      "loss": 1.1348,
+      "step": 74
+    },
+    {
+      "epoch": 0.21739130434782608,
+      "grad_norm": 0.592363715171814,
+      "learning_rate": 9.776239988995401e-06,
+      "loss": 1.1733,
+      "step": 75
+    },
+    {
+      "epoch": 0.22028985507246376,
+      "grad_norm": 0.5394357442855835,
+      "learning_rate": 9.76935593516989e-06,
+      "loss": 1.1313,
+      "step": 76
+    },
+    {
+      "epoch": 0.22318840579710145,
+      "grad_norm": 0.598983108997345,
+      "learning_rate": 9.762370082960727e-06,
+      "loss": 1.1077,
+      "step": 77
+    },
+    {
+      "epoch": 0.22608695652173913,
+      "grad_norm": 0.5635719895362854,
+      "learning_rate": 9.755282581475769e-06,
+      "loss": 1.0393,
+      "step": 78
+    },
+    {
+      "epoch": 0.2289855072463768,
+      "grad_norm": 0.5638449788093567,
+      "learning_rate": 9.748093581992506e-06,
+      "loss": 1.1126,
+      "step": 79
+    },
+    {
+      "epoch": 0.2318840579710145,
+      "grad_norm": 0.5267054438591003,
+      "learning_rate": 9.74080323795483e-06,
+      "loss": 1.108,
+      "step": 80
+    },
+    {
+      "epoch": 0.23478260869565218,
+      "grad_norm": 0.69565749168396,
+      "learning_rate": 9.733411704969754e-06,
+      "loss": 1.1065,
+      "step": 81
+    },
+    {
+      "epoch": 0.23768115942028986,
+      "grad_norm": 0.5769387483596802,
+      "learning_rate": 9.7259191408041e-06,
+      "loss": 1.0892,
+      "step": 82
+    },
+    {
+      "epoch": 0.24057971014492754,
+      "grad_norm": 0.4646681845188141,
+      "learning_rate": 9.718325705381115e-06,
+      "loss": 1.0984,
+      "step": 83
+    },
+    {
+      "epoch": 0.24347826086956523,
+      "grad_norm": 0.5441101789474487,
+      "learning_rate": 9.710631560777082e-06,
+      "loss": 1.134,
+      "step": 84
+    },
+    {
+      "epoch": 0.2463768115942029,
+      "grad_norm": 0.6711792349815369,
+      "learning_rate": 9.702836871217838e-06,
+      "loss": 1.118,
+      "step": 85
+    },
+    {
+      "epoch": 0.2492753623188406,
+      "grad_norm": 0.6086435914039612,
+      "learning_rate": 9.694941803075285e-06,
+      "loss": 1.1332,
+      "step": 86
+    },
+    {
+      "epoch": 0.25217391304347825,
+      "grad_norm": 0.6047069430351257,
+      "learning_rate": 9.686946524863821e-06,
+      "loss": 1.0948,
+      "step": 87
+    },
+    {
+      "epoch": 0.25217391304347825,
+      "eval_loss": 1.093648910522461,
+      "eval_runtime": 46.2827,
+      "eval_samples_per_second": 5.531,
+      "eval_steps_per_second": 0.691,
+      "step": 87
+    },
+    {
+      "epoch": 0.25507246376811593,
+      "grad_norm": 0.5494099259376526,
+      "learning_rate": 9.678851207236764e-06,
+      "loss": 1.0677,
+      "step": 88
+    },
+    {
+      "epoch": 0.2579710144927536,
+      "grad_norm": 0.6029177308082581,
+      "learning_rate": 9.670656022982696e-06,
+      "loss": 1.1122,
+      "step": 89
+    },
+    {
+      "epoch": 0.2608695652173913,
+      "grad_norm": 0.6882422566413879,
+      "learning_rate": 9.66236114702178e-06,
+      "loss": 1.131,
+      "step": 90
+    },
+    {
+      "epoch": 0.263768115942029,
+      "grad_norm": 0.5858222246170044,
+      "learning_rate": 9.65396675640202e-06,
+      "loss": 1.0904,
+      "step": 91
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.6096974611282349,
+      "learning_rate": 9.645473030295496e-06,
+      "loss": 1.1001,
+      "step": 92
+    },
+    {
+      "epoch": 0.26956521739130435,
+      "grad_norm": 0.5705183148384094,
+      "learning_rate": 9.636880149994518e-06,
+      "loss": 1.1159,
+      "step": 93
+    },
+    {
+      "epoch": 0.27246376811594203,
+      "grad_norm": 0.5896604061126709,
+      "learning_rate": 9.628188298907782e-06,
+      "loss": 1.0236,
+      "step": 94
+    },
+    {
+      "epoch": 0.2753623188405797,
+      "grad_norm": 0.6060263514518738,
+      "learning_rate": 9.619397662556434e-06,
+      "loss": 1.0991,
+      "step": 95
+    },
+    {
+      "epoch": 0.2782608695652174,
+      "grad_norm": 0.6302357316017151,
+      "learning_rate": 9.610508428570122e-06,
+      "loss": 1.073,
+      "step": 96
+    },
+    {
+      "epoch": 0.2811594202898551,
+      "grad_norm": 0.6086059212684631,
+      "learning_rate": 9.601520786682989e-06,
+      "loss": 1.1556,
+      "step": 97
+    },
+    {
+      "epoch": 0.28405797101449276,
+      "grad_norm": 0.5601389408111572,
+      "learning_rate": 9.592434928729617e-06,
+      "loss": 1.0691,
+      "step": 98
+    },
+    {
+      "epoch": 0.28695652173913044,
+      "grad_norm": 0.6236623525619507,
+      "learning_rate": 9.583251048640941e-06,
+      "loss": 1.0293,
+      "step": 99
+    },
+    {
+      "epoch": 0.2898550724637681,
+      "grad_norm": 0.661264181137085,
+      "learning_rate": 9.573969342440107e-06,
+      "loss": 1.0597,
+      "step": 100
+    },
+    {
+      "epoch": 0.2927536231884058,
+      "grad_norm": 0.5187559127807617,
+      "learning_rate": 9.564590008238284e-06,
+      "loss": 1.0152,
+      "step": 101
+    },
+    {
+      "epoch": 0.2956521739130435,
+      "grad_norm": 0.7033849358558655,
+      "learning_rate": 9.555113246230443e-06,
+      "loss": 1.0583,
+      "step": 102
+    },
+    {
+      "epoch": 0.2985507246376812,
+      "grad_norm": 0.6243430376052856,
+      "learning_rate": 9.545539258691076e-06,
+      "loss": 1.0415,
+      "step": 103
+    },
+    {
+      "epoch": 0.30144927536231886,
+      "grad_norm": 0.7448285222053528,
+      "learning_rate": 9.535868249969882e-06,
+      "loss": 1.1665,
+      "step": 104
+    },
+    {
+      "epoch": 0.30434782608695654,
+      "grad_norm": 0.7407688498497009,
+      "learning_rate": 9.52610042648741e-06,
+      "loss": 1.0805,
+      "step": 105
+    },
+    {
+      "epoch": 0.3072463768115942,
+      "grad_norm": 0.6399569511413574,
+      "learning_rate": 9.516235996730645e-06,
+      "loss": 1.0622,
+      "step": 106
+    },
+    {
+      "epoch": 0.3101449275362319,
+      "grad_norm": 0.6391183733940125,
+      "learning_rate": 9.50627517124856e-06,
+      "loss": 1.0988,
+      "step": 107
+    },
+    {
+      "epoch": 0.3130434782608696,
+      "grad_norm": 0.6799684166908264,
+      "learning_rate": 9.496218162647629e-06,
+      "loss": 1.0667,
+      "step": 108
+    },
+    {
+      "epoch": 0.3159420289855073,
+      "grad_norm": 0.6955932378768921,
+      "learning_rate": 9.486065185587278e-06,
+      "loss": 1.0475,
+      "step": 109
+    },
+    {
+      "epoch": 0.3188405797101449,
+      "grad_norm": 0.6768685579299927,
+      "learning_rate": 9.475816456775313e-06,
+      "loss": 1.0906,
+      "step": 110
+    },
+    {
+      "epoch": 0.3217391304347826,
+      "grad_norm": 0.6448860168457031,
+      "learning_rate": 9.465472194963287e-06,
+      "loss": 1.0725,
+      "step": 111
+    },
+    {
+      "epoch": 0.32463768115942027,
+      "grad_norm": 0.654137909412384,
+      "learning_rate": 9.45503262094184e-06,
+      "loss": 1.0477,
+      "step": 112
+    },
+    {
+      "epoch": 0.32753623188405795,
+      "grad_norm": 0.5668336749076843,
+      "learning_rate": 9.444497957535975e-06,
+      "loss": 1.0419,
+      "step": 113
+    },
+    {
+      "epoch": 0.33043478260869563,
+      "grad_norm": 0.8345162868499756,
+      "learning_rate": 9.43386842960031e-06,
+      "loss": 1.1125,
+      "step": 114
+    },
+    {
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.5995410084724426,
+      "learning_rate": 9.423144264014278e-06,
+      "loss": 1.048,
+      "step": 115
+    },
+    {
+      "epoch": 0.336231884057971,
+      "grad_norm": 0.6526032090187073,
+      "learning_rate": 9.41232568967728e-06,
+      "loss": 1.0868,
+      "step": 116
+    },
+    {
+      "epoch": 0.3391304347826087,
+      "grad_norm": 0.7131723165512085,
+      "learning_rate": 9.401412937503802e-06,
+      "loss": 1.0154,
+      "step": 117
+    },
+    {
+      "epoch": 0.34202898550724636,
+      "grad_norm": 0.7425084114074707,
+      "learning_rate": 9.39040624041849e-06,
+      "loss": 1.1046,
+      "step": 118
+    },
+    {
+      "epoch": 0.34492753623188405,
+      "grad_norm": 0.6741538643836975,
+      "learning_rate": 9.379305833351174e-06,
+      "loss": 1.0884,
+      "step": 119
+    },
+    {
+      "epoch": 0.34782608695652173,
+      "grad_norm": 0.6611533164978027,
+      "learning_rate": 9.368111953231849e-06,
+      "loss": 1.1291,
+      "step": 120
+    },
+    {
+      "epoch": 0.3507246376811594,
+      "grad_norm": 0.6605979204177856,
+      "learning_rate": 9.35682483898563e-06,
+      "loss": 1.0354,
+      "step": 121
+    },
+    {
+      "epoch": 0.3536231884057971,
+      "grad_norm": 0.7649601101875305,
+      "learning_rate": 9.345444731527642e-06,
+      "loss": 1.0705,
+      "step": 122
+    },
+    {
+      "epoch": 0.3565217391304348,
+      "grad_norm": 0.6104558110237122,
+      "learning_rate": 9.333971873757885e-06,
+      "loss": 1.0221,
+      "step": 123
+    },
+    {
+      "epoch": 0.35942028985507246,
+      "grad_norm": 0.5945985913276672,
+      "learning_rate": 9.32240651055604e-06,
+      "loss": 1.0352,
+      "step": 124
+    },
+    {
+      "epoch": 0.36231884057971014,
+      "grad_norm": 0.7351408004760742,
+      "learning_rate": 9.310748888776254e-06,
+      "loss": 1.0283,
+      "step": 125
+    },
+    {
+      "epoch": 0.3652173913043478,
+      "grad_norm": 0.6751654148101807,
+      "learning_rate": 9.298999257241862e-06,
+      "loss": 1.1355,
+      "step": 126
+    },
+    {
+      "epoch": 0.3681159420289855,
+      "grad_norm": 0.6744984984397888,
+      "learning_rate": 9.287157866740082e-06,
+      "loss": 1.097,
+      "step": 127
+    },
+    {
+      "epoch": 0.3710144927536232,
+      "grad_norm": 0.6096031665802002,
+      "learning_rate": 9.275224970016656e-06,
+      "loss": 0.9879,
+      "step": 128
+    },
+    {
+      "epoch": 0.3739130434782609,
+      "grad_norm": 0.6282311081886292,
+      "learning_rate": 9.263200821770462e-06,
+      "loss": 1.0088,
+      "step": 129
+    },
+    {
+      "epoch": 0.37681159420289856,
+      "grad_norm": 0.6340439319610596,
+      "learning_rate": 9.251085678648072e-06,
+      "loss": 1.0314,
+      "step": 130
+    },
+    {
+      "epoch": 0.37971014492753624,
+      "grad_norm": 0.6008773446083069,
+      "learning_rate": 9.238879799238278e-06,
+      "loss": 1.0304,
+      "step": 131
+    },
+    {
+      "epoch": 0.3826086956521739,
+      "grad_norm": 0.83261638879776,
+      "learning_rate": 9.22658344406657e-06,
+      "loss": 1.0767,
+      "step": 132
+    },
+    {
+      "epoch": 0.3855072463768116,
+      "grad_norm": 0.6942703127861023,
+      "learning_rate": 9.214196875589577e-06,
+      "loss": 1.0238,
+      "step": 133
+    },
+    {
+      "epoch": 0.3884057971014493,
+      "grad_norm": 0.6649532914161682,
+      "learning_rate": 9.201720358189464e-06,
+      "loss": 1.0353,
+      "step": 134
+    },
+    {
+      "epoch": 0.391304347826087,
+      "grad_norm": 0.6827482581138611,
+      "learning_rate": 9.189154158168293e-06,
+      "loss": 1.0123,
+      "step": 135
+    },
+    {
+      "epoch": 0.39420289855072466,
+      "grad_norm": 0.8225923776626587,
+      "learning_rate": 9.176498543742328e-06,
+      "loss": 1.0894,
+      "step": 136
+    },
+    {
+      "epoch": 0.39710144927536234,
+      "grad_norm": 0.7622413635253906,
+      "learning_rate": 9.163753785036324e-06,
+      "loss": 1.0987,
+      "step": 137
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.729880690574646,
+      "learning_rate": 9.150920154077753e-06,
+      "loss": 1.0686,
+      "step": 138
+    },
+    {
+      "epoch": 0.4028985507246377,
+      "grad_norm": 0.5569338798522949,
+      "learning_rate": 9.137997924791e-06,
+      "loss": 1.0554,
+      "step": 139
+    },
+    {
+      "epoch": 0.4057971014492754,
+      "grad_norm": 0.7127766013145447,
+      "learning_rate": 9.124987372991512e-06,
+      "loss": 1.0878,
+      "step": 140
+    },
+    {
+      "epoch": 0.40869565217391307,
+      "grad_norm": 0.6865119338035583,
+      "learning_rate": 9.11188877637992e-06,
+      "loss": 1.078,
+      "step": 141
+    },
+    {
+      "epoch": 0.4115942028985507,
+      "grad_norm": 0.7496594786643982,
+      "learning_rate": 9.098702414536107e-06,
+      "loss": 1.1678,
+      "step": 142
+    },
+    {
+      "epoch": 0.4144927536231884,
+      "grad_norm": 0.7547608017921448,
+      "learning_rate": 9.085428568913233e-06,
+      "loss": 1.0282,
+      "step": 143
+    },
+    {
+      "epoch": 0.41739130434782606,
+      "grad_norm": 0.6696781516075134,
+      "learning_rate": 9.072067522831743e-06,
+      "loss": 1.0529,
+      "step": 144
+    },
+    {
+      "epoch": 0.42028985507246375,
+      "grad_norm": 0.6223747134208679,
+      "learning_rate": 9.058619561473308e-06,
+      "loss": 1.0101,
+      "step": 145
+    },
+    {
+      "epoch": 0.42318840579710143,
+      "grad_norm": 0.6682969331741333,
+      "learning_rate": 9.045084971874738e-06,
+      "loss": 1.0669,
+      "step": 146
+    },
+    {
+      "epoch": 0.4260869565217391,
+      "grad_norm": 0.702489972114563,
+      "learning_rate": 9.031464042921866e-06,
+      "loss": 1.0696,
+      "step": 147
+    },
+    {
+      "epoch": 0.4289855072463768,
+      "grad_norm": 0.6877920031547546,
+      "learning_rate": 9.017757065343368e-06,
+      "loss": 1.0181,
+      "step": 148
+    },
+    {
+      "epoch": 0.4318840579710145,
+      "grad_norm": 0.7262343168258667,
+      "learning_rate": 9.003964331704574e-06,
+      "loss": 1.0869,
+      "step": 149
+    },
+    {
+      "epoch": 0.43478260869565216,
+      "grad_norm": 0.6435033082962036,
+      "learning_rate": 8.990086136401199e-06,
+      "loss": 1.0943,
+      "step": 150
+    },
+    {
+      "epoch": 0.43768115942028984,
+      "grad_norm": 0.8294116854667664,
+      "learning_rate": 8.976122775653087e-06,
+      "loss": 1.0053,
+      "step": 151
+    },
+    {
+      "epoch": 0.4405797101449275,
+      "grad_norm": 0.7582129240036011,
+      "learning_rate": 8.96207454749787e-06,
+      "loss": 1.0255,
+      "step": 152
+    },
+    {
+      "epoch": 0.4434782608695652,
+      "grad_norm": 0.7421862483024597,
+      "learning_rate": 8.947941751784614e-06,
+      "loss": 0.995,
+      "step": 153
+    },
+    {
+      "epoch": 0.4463768115942029,
+      "grad_norm": 0.6562067866325378,
+      "learning_rate": 8.933724690167417e-06,
+      "loss": 1.0051,
+      "step": 154
+    },
+    {
+      "epoch": 0.4492753623188406,
+      "grad_norm": 0.7008780241012573,
+      "learning_rate": 8.91942366609897e-06,
+      "loss": 1.0224,
+      "step": 155
+    },
+    {
+      "epoch": 0.45217391304347826,
+      "grad_norm": 0.8320948481559753,
+      "learning_rate": 8.905038984824079e-06,
+      "loss": 1.0867,
+      "step": 156
+    },
+    {
+      "epoch": 0.45507246376811594,
+      "grad_norm": 0.7078688740730286,
+      "learning_rate": 8.890570953373152e-06,
+      "loss": 1.0233,
+      "step": 157
+    },
+    {
+      "epoch": 0.4579710144927536,
+      "grad_norm": 0.602080225944519,
+      "learning_rate": 8.87601988055565e-06,
+      "loss": 1.033,
+      "step": 158
+    },
+    {
+      "epoch": 0.4608695652173913,
+      "grad_norm": 0.6947946548461914,
+      "learning_rate": 8.861386076953485e-06,
+      "loss": 1.0056,
+      "step": 159
+    },
+    {
+      "epoch": 0.463768115942029,
+      "grad_norm": 0.7520703673362732,
+      "learning_rate": 8.846669854914395e-06,
+      "loss": 1.0129,
+      "step": 160
+    },
+    {
+      "epoch": 0.4666666666666667,
+      "grad_norm": 0.8198053240776062,
+      "learning_rate": 8.831871528545286e-06,
+      "loss": 1.0554,
+      "step": 161
+    },
+    {
+      "epoch": 0.46956521739130436,
+      "grad_norm": 0.8595309257507324,
+      "learning_rate": 8.816991413705515e-06,
+      "loss": 0.9769,
+      "step": 162
+    },
+    {
+      "epoch": 0.47246376811594204,
+      "grad_norm": 0.7658084034919739,
+      "learning_rate": 8.802029828000157e-06,
+      "loss": 1.0942,
+      "step": 163
+    },
+    {
+      "epoch": 0.4753623188405797,
+      "grad_norm": 0.779561460018158,
+      "learning_rate": 8.786987090773214e-06,
+      "loss": 1.0526,
+      "step": 164
+    },
+    {
+      "epoch": 0.4782608695652174,
+      "grad_norm": 0.7491458654403687,
+      "learning_rate": 8.771863523100821e-06,
+      "loss": 1.076,
+      "step": 165
+    },
+    {
+      "epoch": 0.4811594202898551,
+      "grad_norm": 0.7698597311973572,
+      "learning_rate": 8.756659447784367e-06,
+      "loss": 1.0513,
+      "step": 166
+    },
+    {
+      "epoch": 0.48405797101449277,
+      "grad_norm": 0.7076740860939026,
+      "learning_rate": 8.741375189343625e-06,
+      "loss": 0.952,
+      "step": 167
+    },
+    {
+      "epoch": 0.48695652173913045,
+      "grad_norm": 0.8549159169197083,
+      "learning_rate": 8.726011074009813e-06,
+      "loss": 1.0062,
+      "step": 168
+    },
+    {
+      "epoch": 0.48985507246376814,
+      "grad_norm": 0.7257103323936462,
+      "learning_rate": 8.71056742971864e-06,
+      "loss": 1.0124,
+      "step": 169
+    },
+    {
+      "epoch": 0.4927536231884058,
+      "grad_norm": 0.6643837094306946,
+      "learning_rate": 8.695044586103297e-06,
+      "loss": 1.0646,
+      "step": 170
+    },
+    {
+      "epoch": 0.4956521739130435,
+      "grad_norm": 0.6454336643218994,
+      "learning_rate": 8.679442874487427e-06,
+      "loss": 1.0482,
+      "step": 171
+    },
+    {
+      "epoch": 0.4985507246376812,
+      "grad_norm": 0.6484606266021729,
+      "learning_rate": 8.663762627878059e-06,
+      "loss": 1.0361,
+      "step": 172
+    },
+    {
+      "epoch": 0.5014492753623189,
+      "grad_norm": 0.8437646627426147,
+      "learning_rate": 8.64800418095848e-06,
+      "loss": 1.1064,
+      "step": 173
+    },
+    {
+      "epoch": 0.5043478260869565,
+      "grad_norm": 0.8865697979927063,
+      "learning_rate": 8.632167870081122e-06,
+      "loss": 1.0187,
+      "step": 174
+    },
+    {
+      "epoch": 0.5043478260869565,
+      "eval_loss": 1.0253716707229614,
+      "eval_runtime": 46.4716,
+      "eval_samples_per_second": 5.509,
+      "eval_steps_per_second": 0.689,
+      "step": 174
+    },
+    {
+      "epoch": 0.5072463768115942,
+      "grad_norm": 0.6522702574729919,
+      "learning_rate": 8.616254033260351e-06,
+      "loss": 1.0466,
+      "step": 175
+    },
+    {
+      "epoch": 0.5101449275362319,
+      "grad_norm": 0.7485548257827759,
+      "learning_rate": 8.600263010165275e-06,
+      "loss": 1.051,
+      "step": 176
+    },
+    {
+      "epoch": 0.5130434782608696,
+      "grad_norm": 0.7864269614219666,
+      "learning_rate": 8.584195142112482e-06,
+      "loss": 0.9823,
+      "step": 177
+    },
+    {
+      "epoch": 0.5159420289855072,
+      "grad_norm": 0.669228732585907,
+      "learning_rate": 8.568050772058763e-06,
+      "loss": 0.9959,
+      "step": 178
+    },
+    {
+      "epoch": 0.518840579710145,
+      "grad_norm": 0.7351509928703308,
+      "learning_rate": 8.551830244593785e-06,
+      "loss": 1.0523,
+      "step": 179
+    },
+    {
+      "epoch": 0.5217391304347826,
+      "grad_norm": 0.6464654207229614,
+      "learning_rate": 8.535533905932739e-06,
+      "loss": 1.0576,
+      "step": 180
+    },
+    {
+      "epoch": 0.5246376811594203,
+      "grad_norm": 0.6708983182907104,
+      "learning_rate": 8.519162103908951e-06,
+      "loss": 1.0036,
+      "step": 181
+    },
+    {
+      "epoch": 0.527536231884058,
+      "grad_norm": 0.6712408661842346,
+      "learning_rate": 8.502715187966455e-06,
+      "loss": 0.9567,
+      "step": 182
+    },
+    {
+      "epoch": 0.5304347826086957,
+      "grad_norm": 0.8165604472160339,
+      "learning_rate": 8.48619350915254e-06,
+      "loss": 1.0074,
+      "step": 183
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.8015124797821045,
+      "learning_rate": 8.469597420110249e-06,
+      "loss": 1.04,
+      "step": 184
+    },
+    {
+      "epoch": 0.5362318840579711,
+      "grad_norm": 0.6764898896217346,
+      "learning_rate": 8.452927275070858e-06,
+      "loss": 1.0259,
+      "step": 185
+    },
+    {
+      "epoch": 0.5391304347826087,
+      "grad_norm": 0.7508796453475952,
+      "learning_rate": 8.436183429846314e-06,
+      "loss": 1.0153,
+      "step": 186
+    },
+    {
+      "epoch": 0.5420289855072464,
+      "grad_norm": 0.7400704026222229,
+      "learning_rate": 8.41936624182164e-06,
+      "loss": 1.0302,
+      "step": 187
+    },
+    {
+      "epoch": 0.5449275362318841,
+      "grad_norm": 0.7747941017150879,
+      "learning_rate": 8.402476069947309e-06,
+      "loss": 1.0516,
+      "step": 188
+    },
+    {
+      "epoch": 0.5478260869565217,
+      "grad_norm": 0.6391712427139282,
+      "learning_rate": 8.385513274731574e-06,
+      "loss": 0.9144,
+      "step": 189
+    },
+    {
+      "epoch": 0.5507246376811594,
+      "grad_norm": 0.7723587155342102,
+      "learning_rate": 8.368478218232787e-06,
+      "loss": 1.038,
+      "step": 190
+    },
+    {
+      "epoch": 0.553623188405797,
+      "grad_norm": 0.6703996062278748,
+      "learning_rate": 8.351371264051659e-06,
+      "loss": 0.9767,
+      "step": 191
+    },
+    {
+      "epoch": 0.5565217391304348,
+      "grad_norm": 0.6496030688285828,
+      "learning_rate": 8.334192777323508e-06,
+      "loss": 1.0139,
+      "step": 192
+    },
+    {
+      "epoch": 0.5594202898550724,
+      "grad_norm": 0.9179766178131104,
+      "learning_rate": 8.316943124710457e-06,
+      "loss": 1.0217,
+      "step": 193
+    },
+    {
+      "epoch": 0.5623188405797102,
+      "grad_norm": 0.739105761051178,
+      "learning_rate": 8.299622674393615e-06,
+      "loss": 1.0097,
+      "step": 194
+    },
+    {
+      "epoch": 0.5652173913043478,
+      "grad_norm": 0.6799715757369995,
+      "learning_rate": 8.282231796065215e-06,
+      "loss": 0.9814,
+      "step": 195
+    },
+    {
+      "epoch": 0.5681159420289855,
+      "grad_norm": 0.7482266426086426,
+      "learning_rate": 8.264770860920722e-06,
+      "loss": 0.9651,
+      "step": 196
+    },
+    {
+      "epoch": 0.5710144927536231,
+      "grad_norm": 0.7226840853691101,
+      "learning_rate": 8.247240241650918e-06,
+      "loss": 1.0257,
+      "step": 197
+    },
+    {
+      "epoch": 0.5739130434782609,
+      "grad_norm": 0.8682334423065186,
+      "learning_rate": 8.229640312433938e-06,
+      "loss": 0.9359,
+      "step": 198
+    },
+    {
+      "epoch": 0.5768115942028985,
+      "grad_norm": 0.7574880123138428,
+      "learning_rate": 8.21197144892728e-06,
+      "loss": 1.0316,
+      "step": 199
+    },
+    {
+      "epoch": 0.5797101449275363,
+      "grad_norm": 0.6719037890434265,
+      "learning_rate": 8.194234028259806e-06,
+      "loss": 0.9718,
+      "step": 200
+    },
+    {
+      "epoch": 0.5826086956521739,
+      "grad_norm": 0.7872765064239502,
+      "learning_rate": 8.176428429023674e-06,
+      "loss": 1.0055,
+      "step": 201
+    },
+    {
+      "epoch": 0.5855072463768116,
+      "grad_norm": 0.8982404470443726,
+      "learning_rate": 8.158555031266255e-06,
+      "loss": 1.0763,
+      "step": 202
+    },
+    {
+      "epoch": 0.5884057971014492,
+      "grad_norm": 0.7265183925628662,
+      "learning_rate": 8.140614216482046e-06,
+      "loss": 0.9921,
+      "step": 203
+    },
+    {
+      "epoch": 0.591304347826087,
+      "grad_norm": 0.7971622943878174,
+      "learning_rate": 8.122606367604497e-06,
+      "loss": 0.9986,
+      "step": 204
+    },
+    {
+      "epoch": 0.5942028985507246,
+      "grad_norm": 0.689160943031311,
+      "learning_rate": 8.104531868997858e-06,
+      "loss": 0.9896,
+      "step": 205
+    },
+    {
+      "epoch": 0.5971014492753624,
+      "grad_norm": 0.8191243410110474,
+      "learning_rate": 8.086391106448965e-06,
+      "loss": 1.0141,
+      "step": 206
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.860882043838501,
+      "learning_rate": 8.068184467159014e-06,
+      "loss": 0.9608,
+      "step": 207
+    },
+    {
+      "epoch": 0.6028985507246377,
+      "grad_norm": 0.7216934561729431,
+      "learning_rate": 8.049912339735284e-06,
+      "loss": 0.9898,
+      "step": 208
+    },
+    {
+      "epoch": 0.6057971014492753,
+      "grad_norm": 0.685965359210968,
+      "learning_rate": 8.031575114182856e-06,
+      "loss": 0.9532,
+      "step": 209
+    },
+    {
+      "epoch": 0.6086956521739131,
+      "grad_norm": 0.6752814054489136,
+      "learning_rate": 8.013173181896283e-06,
+      "loss": 1.0043,
+      "step": 210
+    },
+    {
+      "epoch": 0.6115942028985507,
+      "grad_norm": 0.815260112285614,
+      "learning_rate": 7.994706935651228e-06,
+      "loss": 1.0049,
+      "step": 211
+    },
+    {
+      "epoch": 0.6144927536231884,
+      "grad_norm": 0.729771077632904,
+      "learning_rate": 7.976176769596095e-06,
+      "loss": 1.0003,
+      "step": 212
+    },
+    {
+      "epoch": 0.6173913043478261,
+      "grad_norm": 0.6407178044319153,
+      "learning_rate": 7.957583079243607e-06,
+      "loss": 1.0197,
+      "step": 213
+    },
+    {
+      "epoch": 0.6202898550724638,
+      "grad_norm": 0.6758530735969543,
+      "learning_rate": 7.938926261462366e-06,
+      "loss": 1.0632,
+      "step": 214
+    },
+    {
+      "epoch": 0.6231884057971014,
+      "grad_norm": 0.7678017616271973,
+      "learning_rate": 7.920206714468383e-06,
+      "loss": 1.004,
+      "step": 215
+    },
+    {
+      "epoch": 0.6260869565217392,
+      "grad_norm": 0.6864491105079651,
+      "learning_rate": 7.90142483781658e-06,
+      "loss": 0.9798,
+      "step": 216
+    },
+    {
+      "epoch": 0.6289855072463768,
+      "grad_norm": 0.7141516804695129,
+      "learning_rate": 7.882581032392252e-06,
+      "loss": 0.9969,
+      "step": 217
+    },
+    {
+      "epoch": 0.6318840579710145,
+      "grad_norm": 0.7497020363807678,
+      "learning_rate": 7.863675700402527e-06,
+      "loss": 0.9951,
+      "step": 218
+    },
+    {
+      "epoch": 0.6347826086956522,
+      "grad_norm": 0.7010701894760132,
+      "learning_rate": 7.844709245367766e-06,
+      "loss": 1.0164,
+      "step": 219
+    },
+    {
+      "epoch": 0.6376811594202898,
+      "grad_norm": 0.8556409478187561,
+      "learning_rate": 7.82568207211296e-06,
+      "loss": 1.0079,
+      "step": 220
+    },
+    {
+      "epoch": 0.6405797101449275,
+      "grad_norm": 0.8755605816841125,
+      "learning_rate": 7.806594586759083e-06,
+      "loss": 1.0401,
+      "step": 221
+    },
+    {
+      "epoch": 0.6434782608695652,
+      "grad_norm": 0.7478286623954773,
+      "learning_rate": 7.787447196714428e-06,
+      "loss": 0.9966,
+      "step": 222
+    },
+    {
+      "epoch": 0.6463768115942029,
+      "grad_norm": 0.6972207427024841,
+      "learning_rate": 7.768240310665909e-06,
+      "loss": 1.0277,
+      "step": 223
+    },
+    {
+      "epoch": 0.6492753623188405,
+      "grad_norm": 0.7753648161888123,
+      "learning_rate": 7.748974338570337e-06,
+      "loss": 1.0531,
+      "step": 224
+    },
+    {
+      "epoch": 0.6521739130434783,
+      "grad_norm": 0.8420187830924988,
+      "learning_rate": 7.729649691645673e-06,
+      "loss": 1.0101,
+      "step": 225
+    },
+    {
+      "epoch": 0.6550724637681159,
+      "grad_norm": 0.7467186450958252,
+      "learning_rate": 7.710266782362248e-06,
+      "loss": 1.086,
+      "step": 226
+    },
+    {
+      "epoch": 0.6579710144927536,
+      "grad_norm": 0.679282009601593,
+      "learning_rate": 7.69082602443396e-06,
+      "loss": 1.0756,
+      "step": 227
+    },
+    {
+      "epoch": 0.6608695652173913,
+      "grad_norm": 0.8682421445846558,
+      "learning_rate": 7.671327832809442e-06,
+      "loss": 1.0337,
+      "step": 228
+    },
+    {
+      "epoch": 0.663768115942029,
+      "grad_norm": 0.9190111756324768,
+      "learning_rate": 7.651772623663212e-06,
+      "loss": 1.0412,
+      "step": 229
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.7419721484184265,
+      "learning_rate": 7.63216081438678e-06,
+      "loss": 0.9895,
+      "step": 230
+    },
+    {
+      "epoch": 0.6695652173913044,
+      "grad_norm": 0.7735477685928345,
+      "learning_rate": 7.612492823579744e-06,
+      "loss": 1.0109,
+      "step": 231
+    },
+    {
+      "epoch": 0.672463768115942,
+      "grad_norm": 0.6718391180038452,
+      "learning_rate": 7.5927690710408606e-06,
+      "loss": 1.0699,
+      "step": 232
+    },
+    {
+      "epoch": 0.6753623188405797,
+      "grad_norm": 0.8104904890060425,
+      "learning_rate": 7.572989977759073e-06,
+      "loss": 0.9957,
+      "step": 233
+    },
+    {
+      "epoch": 0.6782608695652174,
+      "grad_norm": 0.8718286752700806,
+      "learning_rate": 7.553155965904535e-06,
+      "loss": 0.9674,
+      "step": 234
+    },
+    {
+      "epoch": 0.6811594202898551,
+      "grad_norm": 0.727627158164978,
+      "learning_rate": 7.533267458819597e-06,
+      "loss": 1.0256,
+      "step": 235
+    },
+    {
+      "epoch": 0.6840579710144927,
+      "grad_norm": 0.6747854948043823,
+      "learning_rate": 7.513324881009769e-06,
+      "loss": 0.9956,
+      "step": 236
+    },
+    {
+      "epoch": 0.6869565217391305,
+      "grad_norm": 0.8896199464797974,
+      "learning_rate": 7.49332865813466e-06,
+      "loss": 1.052,
+      "step": 237
+    },
+    {
+      "epoch": 0.6898550724637681,
+      "grad_norm": 0.8011343479156494,
+      "learning_rate": 7.473279216998896e-06,
+      "loss": 0.9809,
+      "step": 238
+    },
+    {
+      "epoch": 0.6927536231884058,
+      "grad_norm": 0.7936311960220337,
+      "learning_rate": 7.453176985543002e-06,
+      "loss": 0.9491,
+      "step": 239
+    },
+    {
+      "epoch": 0.6956521739130435,
+      "grad_norm": 0.783686101436615,
+      "learning_rate": 7.4330223928342814e-06,
+      "loss": 1.0627,
+      "step": 240
+    },
+    {
+      "epoch": 0.6985507246376812,
+      "grad_norm": 0.6777355670928955,
+      "learning_rate": 7.412815869057644e-06,
+      "loss": 0.9836,
+      "step": 241
+    },
+    {
+      "epoch": 0.7014492753623188,
+      "grad_norm": 0.8609856367111206,
+      "learning_rate": 7.392557845506433e-06,
+      "loss": 1.0383,
+      "step": 242
+    },
+    {
+      "epoch": 0.7043478260869566,
+      "grad_norm": 0.7346140146255493,
+      "learning_rate": 7.372248754573213e-06,
+      "loss": 1.0237,
+      "step": 243
+    },
+    {
+      "epoch": 0.7072463768115942,
+      "grad_norm": 0.8134037852287292,
+      "learning_rate": 7.351889029740548e-06,
+      "loss": 1.0051,
+      "step": 244
+    },
+    {
+      "epoch": 0.7101449275362319,
+      "grad_norm": 0.7623313069343567,
+      "learning_rate": 7.33147910557174e-06,
+      "loss": 0.966,
+      "step": 245
+    },
+    {
+      "epoch": 0.7130434782608696,
+      "grad_norm": 0.8289423584938049,
+      "learning_rate": 7.311019417701567e-06,
+      "loss": 1.0162,
+      "step": 246
+    },
+    {
+      "epoch": 0.7159420289855073,
+      "grad_norm": 0.6778679490089417,
+      "learning_rate": 7.290510402826967e-06,
+      "loss": 1.042,
+      "step": 247
+    },
+    {
+      "epoch": 0.7188405797101449,
+      "grad_norm": 0.7705609798431396,
+      "learning_rate": 7.269952498697734e-06,
+      "loss": 0.9979,
+      "step": 248
+    },
+    {
+      "epoch": 0.7217391304347827,
+      "grad_norm": 0.8417146801948547,
+      "learning_rate": 7.249346144107165e-06,
+      "loss": 0.9937,
+      "step": 249
+    },
+    {
+      "epoch": 0.7246376811594203,
+      "grad_norm": 0.6634312868118286,
+      "learning_rate": 7.2286917788826926e-06,
+      "loss": 1.0299,
+      "step": 250
+    },
+    {
+      "epoch": 0.7275362318840579,
+      "grad_norm": 0.7162610292434692,
+      "learning_rate": 7.207989843876505e-06,
+      "loss": 0.9627,
+      "step": 251
+    },
+    {
+      "epoch": 0.7304347826086957,
+      "grad_norm": 0.886674165725708,
+      "learning_rate": 7.187240780956133e-06,
+      "loss": 0.9804,
+      "step": 252
+    },
+    {
+      "epoch": 0.7333333333333333,
+      "grad_norm": 0.8589048385620117,
+      "learning_rate": 7.166445032995013e-06,
+      "loss": 0.9972,
+      "step": 253
+    },
+    {
+      "epoch": 0.736231884057971,
+      "grad_norm": 0.792225182056427,
+      "learning_rate": 7.145603043863045e-06,
+      "loss": 1.0047,
+      "step": 254
+    },
+    {
+      "epoch": 0.7391304347826086,
+      "grad_norm": 0.7787736654281616,
+      "learning_rate": 7.124715258417111e-06,
+      "loss": 0.974,
+      "step": 255
+    },
+    {
+      "epoch": 0.7420289855072464,
+      "grad_norm": 0.7716973423957825,
+      "learning_rate": 7.103782122491577e-06,
+      "loss": 0.9476,
+      "step": 256
+    },
+    {
+      "epoch": 0.744927536231884,
+      "grad_norm": 0.8235695958137512,
+      "learning_rate": 7.082804082888787e-06,
+      "loss": 1.0303,
+      "step": 257
+    },
+    {
+      "epoch": 0.7478260869565218,
+      "grad_norm": 0.8061054944992065,
+      "learning_rate": 7.061781587369518e-06,
+      "loss": 1.0254,
+      "step": 258
+    },
+    {
+      "epoch": 0.7507246376811594,
+      "grad_norm": 0.8522235751152039,
+      "learning_rate": 7.040715084643429e-06,
+      "loss": 1.0196,
+      "step": 259
+    },
+    {
+      "epoch": 0.7536231884057971,
+      "grad_norm": 0.8005476593971252,
+      "learning_rate": 7.019605024359475e-06,
+      "loss": 1.052,
+      "step": 260
+    },
+    {
+      "epoch": 0.7565217391304347,
+      "grad_norm": 0.9044481515884399,
+      "learning_rate": 6.998451857096321e-06,
+      "loss": 1.04,
+      "step": 261
+    },
+    {
+      "epoch": 0.7565217391304347,
+      "eval_loss": 0.9999631643295288,
+      "eval_runtime": 46.2792,
+      "eval_samples_per_second": 5.532,
+      "eval_steps_per_second": 0.691,
+      "step": 261
+    },
+    {
+      "epoch": 0.7594202898550725,
+      "grad_norm": 0.6946824193000793,
+      "learning_rate": 6.977256034352713e-06,
+      "loss": 0.9869,
+      "step": 262
+    },
+    {
+      "epoch": 0.7623188405797101,
+      "grad_norm": 0.8048357963562012,
+      "learning_rate": 6.956018008537852e-06,
+      "loss": 0.9773,
+      "step": 263
+    },
+    {
+      "epoch": 0.7652173913043478,
+      "grad_norm": 0.7211609482765198,
+      "learning_rate": 6.934738232961728e-06,
+      "loss": 0.9727,
+      "step": 264
+    },
+    {
+      "epoch": 0.7681159420289855,
+      "grad_norm": 0.7225235104560852,
+      "learning_rate": 6.913417161825449e-06,
+      "loss": 1.0209,
+      "step": 265
+    },
+    {
+      "epoch": 0.7710144927536232,
+      "grad_norm": 0.6443622708320618,
+      "learning_rate": 6.892055250211552e-06,
+      "loss": 1.0398,
+      "step": 266
+    },
+    {
+      "epoch": 0.7739130434782608,
+      "grad_norm": 0.8570783138275146,
+      "learning_rate": 6.8706529540742775e-06,
+      "loss": 0.9883,
+      "step": 267
+    },
+    {
+      "epoch": 0.7768115942028986,
+      "grad_norm": 0.9808831810951233,
+      "learning_rate": 6.849210730229846e-06,
+      "loss": 1.0847,
+      "step": 268
+    },
+    {
+      "epoch": 0.7797101449275362,
+      "grad_norm": 0.8551820516586304,
+      "learning_rate": 6.827729036346706e-06,
+      "loss": 0.9621,
+      "step": 269
+    },
+    {
+      "epoch": 0.782608695652174,
+      "grad_norm": 0.8964309692382812,
+      "learning_rate": 6.806208330935766e-06,
+      "loss": 0.9886,
+      "step": 270
+    },
+    {
+      "epoch": 0.7855072463768116,
+      "grad_norm": 0.8737574219703674,
+      "learning_rate": 6.784649073340601e-06,
+      "loss": 1.0019,
+      "step": 271
+    },
+    {
+      "epoch": 0.7884057971014493,
+      "grad_norm": 0.7480164170265198,
+      "learning_rate": 6.763051723727663e-06,
+      "loss": 0.9987,
+      "step": 272
+    },
+    {
+      "epoch": 0.7913043478260869,
+      "grad_norm": 0.7155961990356445,
+      "learning_rate": 6.741416743076443e-06,
+      "loss": 1.0043,
+      "step": 273
+    },
+    {
+      "epoch": 0.7942028985507247,
+      "grad_norm": 0.8288201093673706,
+      "learning_rate": 6.719744593169642e-06,
+      "loss": 0.9703,
+      "step": 274
+    },
+    {
+      "epoch": 0.7971014492753623,
+      "grad_norm": 0.7403139472007751,
+      "learning_rate": 6.698035736583307e-06,
+      "loss": 0.9453,
+      "step": 275
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.7977936863899231,
+      "learning_rate": 6.67629063667697e-06,
+      "loss": 1.0091,
+      "step": 276
+    },
+    {
+      "epoch": 0.8028985507246377,
+      "grad_norm": 0.8381959795951843,
+      "learning_rate": 6.6545097575837405e-06,
+      "loss": 1.0001,
+      "step": 277
+    },
+    {
+      "epoch": 0.8057971014492754,
+      "grad_norm": 0.7988629937171936,
+      "learning_rate": 6.6326935642004165e-06,
+      "loss": 1.0053,
+      "step": 278
+    },
+    {
+      "epoch": 0.808695652173913,
+      "grad_norm": 0.8848451375961304,
+      "learning_rate": 6.610842522177549e-06,
+      "loss": 1.021,
+      "step": 279
+    },
+    {
+      "epoch": 0.8115942028985508,
+      "grad_norm": 0.8423268795013428,
+      "learning_rate": 6.588957097909509e-06,
+      "loss": 1.0245,
+      "step": 280
+    },
+    {
+      "epoch": 0.8144927536231884,
+      "grad_norm": 0.6828733682632446,
+      "learning_rate": 6.567037758524529e-06,
+      "loss": 0.9966,
+      "step": 281
+    },
+    {
+      "epoch": 0.8173913043478261,
+      "grad_norm": 0.8118813633918762,
+      "learning_rate": 6.545084971874738e-06,
+      "loss": 0.9777,
+      "step": 282
+    },
+    {
+      "epoch": 0.8202898550724638,
+      "grad_norm": 0.8288912773132324,
+      "learning_rate": 6.5230992065261685e-06,
+      "loss": 1.0158,
+      "step": 283
+    },
+    {
+      "epoch": 0.8231884057971014,
+      "grad_norm": 0.7110708951950073,
+      "learning_rate": 6.501080931748764e-06,
+      "loss": 0.9331,
+      "step": 284
+    },
+    {
+      "epoch": 0.8260869565217391,
+      "grad_norm": 0.767749011516571,
+      "learning_rate": 6.4790306175063535e-06,
+      "loss": 0.8917,
+      "step": 285
+    },
+    {
+      "epoch": 0.8289855072463768,
+      "grad_norm": 0.8519418835639954,
+      "learning_rate": 6.456948734446624e-06,
+      "loss": 1.0296,
+      "step": 286
+    },
+    {
+      "epoch": 0.8318840579710145,
+      "grad_norm": 0.7988749742507935,
+      "learning_rate": 6.43483575389108e-06,
+      "loss": 0.9296,
+      "step": 287
+    },
+    {
+      "epoch": 0.8347826086956521,
+      "grad_norm": 0.8312949538230896,
+      "learning_rate": 6.412692147824976e-06,
+      "loss": 1.0632,
+      "step": 288
+    },
+    {
+      "epoch": 0.8376811594202899,
+      "grad_norm": 0.9024953246116638,
+      "learning_rate": 6.390518388887246e-06,
+      "loss": 1.0013,
+      "step": 289
+    },
+    {
+      "epoch": 0.8405797101449275,
+      "grad_norm": 0.6774289011955261,
+      "learning_rate": 6.368314950360416e-06,
+      "loss": 0.954,
+      "step": 290
+    },
+    {
+      "epoch": 0.8434782608695652,
+      "grad_norm": 0.739329993724823,
+      "learning_rate": 6.3460823061604984e-06,
+      "loss": 0.9453,
+      "step": 291
+    },
+    {
+      "epoch": 0.8463768115942029,
+      "grad_norm": 0.7888621687889099,
+      "learning_rate": 6.323820930826879e-06,
+      "loss": 0.9672,
+      "step": 292
+    },
+    {
+      "epoch": 0.8492753623188406,
+      "grad_norm": 0.7777626514434814,
+      "learning_rate": 6.301531299512195e-06,
+      "loss": 1.0118,
+      "step": 293
+    },
+    {
+      "epoch": 0.8521739130434782,
+      "grad_norm": 0.8532302975654602,
+      "learning_rate": 6.279213887972179e-06,
+      "loss": 0.9837,
+      "step": 294
+    },
+    {
+      "epoch": 0.855072463768116,
+      "grad_norm": 0.8223821520805359,
+      "learning_rate": 6.2568691725555144e-06,
+      "loss": 0.9786,
+      "step": 295
+    },
+    {
+      "epoch": 0.8579710144927536,
+      "grad_norm": 0.7102084755897522,
+      "learning_rate": 6.234497630193666e-06,
+      "loss": 0.9634,
+      "step": 296
+    },
+    {
+      "epoch": 0.8608695652173913,
+      "grad_norm": 0.7488099932670593,
+      "learning_rate": 6.2120997383907015e-06,
+      "loss": 1.0271,
+      "step": 297
+    },
+    {
+      "epoch": 0.863768115942029,
+      "grad_norm": 0.755387008190155,
+      "learning_rate": 6.189675975213094e-06,
+      "loss": 1.0068,
+      "step": 298
+    },
+    {
+      "epoch": 0.8666666666666667,
+      "grad_norm": 0.7323296666145325,
+      "learning_rate": 6.1672268192795285e-06,
+      "loss": 1.0177,
+      "step": 299
+    },
+    {
+      "epoch": 0.8695652173913043,
+      "grad_norm": 0.7505559325218201,
+      "learning_rate": 6.144752749750671e-06,
+      "loss": 1.0031,
+      "step": 300
+    },
+    {
+      "epoch": 0.8724637681159421,
+      "grad_norm": 0.8251679539680481,
+      "learning_rate": 6.122254246318957e-06,
+      "loss": 1.0281,
+      "step": 301
+    },
+    {
+      "epoch": 0.8753623188405797,
+      "grad_norm": 0.7030305862426758,
+      "learning_rate": 6.099731789198344e-06,
+      "loss": 0.977,
+      "step": 302
+    },
+    {
+      "epoch": 0.8782608695652174,
+      "grad_norm": 0.872175931930542,
+      "learning_rate": 6.077185859114059e-06,
+      "loss": 1.0279,
+      "step": 303
+    },
+    {
+      "epoch": 0.881159420289855,
+      "grad_norm": 0.6906105279922485,
+      "learning_rate": 6.05461693729235e-06,
+      "loss": 0.9747,
+      "step": 304
+    },
+    {
+      "epoch": 0.8840579710144928,
+      "grad_norm": 0.8041731119155884,
+      "learning_rate": 6.0320255054501985e-06,
+      "loss": 0.9706,
+      "step": 305
+    },
+    {
+      "epoch": 0.8869565217391304,
+      "grad_norm": 0.9219099283218384,
+      "learning_rate": 6.009412045785051e-06,
+      "loss": 1.0192,
+      "step": 306
+    },
+    {
+      "epoch": 0.8898550724637682,
+      "grad_norm": 0.5931650996208191,
+      "learning_rate": 5.986777040964521e-06,
+      "loss": 1.0064,
+      "step": 307
+    },
+    {
+      "epoch": 0.8927536231884058,
+      "grad_norm": 0.9496859908103943,
+      "learning_rate": 5.964120974116085e-06,
+      "loss": 1.0138,
+      "step": 308
+    },
+    {
+      "epoch": 0.8956521739130435,
+      "grad_norm": 0.719667375087738,
+      "learning_rate": 5.941444328816775e-06,
+      "loss": 1.0213,
+      "step": 309
+    },
+    {
+      "epoch": 0.8985507246376812,
+      "grad_norm": 0.8299076557159424,
+      "learning_rate": 5.918747589082853e-06,
+      "loss": 0.9931,
+      "step": 310
+    },
+    {
+      "epoch": 0.9014492753623189,
+      "grad_norm": 0.8233078718185425,
+      "learning_rate": 5.896031239359485e-06,
+      "loss": 0.9789,
+      "step": 311
+    },
+    {
+      "epoch": 0.9043478260869565,
+      "grad_norm": 0.6814295649528503,
+      "learning_rate": 5.8732957645103946e-06,
+      "loss": 1.0711,
+      "step": 312
+    },
+    {
+      "epoch": 0.9072463768115943,
+      "grad_norm": 0.786590039730072,
+      "learning_rate": 5.85054164980752e-06,
+      "loss": 1.0282,
+      "step": 313
+    },
+    {
+      "epoch": 0.9101449275362319,
+      "grad_norm": 0.7114934921264648,
+      "learning_rate": 5.82776938092065e-06,
+      "loss": 1.0125,
+      "step": 314
+    },
+    {
+      "epoch": 0.9130434782608695,
+      "grad_norm": 0.8856657147407532,
+      "learning_rate": 5.804979443907065e-06,
+      "loss": 1.0325,
+      "step": 315
+    },
+    {
+      "epoch": 0.9159420289855073,
+      "grad_norm": 0.9123273491859436,
+      "learning_rate": 5.782172325201155e-06,
+      "loss": 1.0696,
+      "step": 316
+    },
+    {
+      "epoch": 0.9188405797101449,
+      "grad_norm": 0.7296032905578613,
+      "learning_rate": 5.7593485116040425e-06,
+      "loss": 1.0004,
+      "step": 317
+    },
+    {
+      "epoch": 0.9217391304347826,
+      "grad_norm": 0.8410807847976685,
+      "learning_rate": 5.736508490273189e-06,
+      "loss": 0.9547,
+      "step": 318
+    },
+    {
+      "epoch": 0.9246376811594202,
+      "grad_norm": 1.0709190368652344,
+      "learning_rate": 5.713652748711997e-06,
+      "loss": 0.9583,
+      "step": 319
+    },
+    {
+      "epoch": 0.927536231884058,
+      "grad_norm": 0.6270896196365356,
+      "learning_rate": 5.690781774759412e-06,
+      "loss": 1.0024,
+      "step": 320
+    },
+    {
+      "epoch": 0.9304347826086956,
+      "grad_norm": 0.7849041223526001,
+      "learning_rate": 5.667896056579495e-06,
+      "loss": 0.9477,
+      "step": 321
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.7513189315795898,
+      "learning_rate": 5.644996082651018e-06,
+      "loss": 0.9937,
+      "step": 322
+    },
+    {
+      "epoch": 0.936231884057971,
+      "grad_norm": 0.8150386214256287,
+      "learning_rate": 5.622082341757027e-06,
+      "loss": 1.0589,
+      "step": 323
+    },
+    {
+      "epoch": 0.9391304347826087,
+      "grad_norm": 0.8518944978713989,
+      "learning_rate": 5.5991553229744166e-06,
+      "loss": 1.0393,
+      "step": 324
+    },
+    {
+      "epoch": 0.9420289855072463,
+      "grad_norm": 0.814802885055542,
+      "learning_rate": 5.576215515663489e-06,
+      "loss": 1.0186,
+      "step": 325
+    },
+    {
+      "epoch": 0.9449275362318841,
+      "grad_norm": 0.9456635117530823,
+      "learning_rate": 5.553263409457504e-06,
+      "loss": 0.9657,
+      "step": 326
+    },
+    {
+      "epoch": 0.9478260869565217,
+      "grad_norm": 0.7259712815284729,
+      "learning_rate": 5.530299494252238e-06,
+      "loss": 1.0066,
+      "step": 327
+    },
+    {
+      "epoch": 0.9507246376811594,
+      "grad_norm": 0.7462155818939209,
+      "learning_rate": 5.507324260195516e-06,
+      "loss": 0.9246,
+      "step": 328
+    },
+    {
+      "epoch": 0.9536231884057971,
+      "grad_norm": 0.9022188782691956,
+      "learning_rate": 5.484338197676757e-06,
+      "loss": 0.9624,
+      "step": 329
+    },
+    {
+      "epoch": 0.9565217391304348,
+      "grad_norm": 0.8874835968017578,
+      "learning_rate": 5.46134179731651e-06,
+      "loss": 0.9851,
+      "step": 330
+    },
+    {
+      "epoch": 0.9594202898550724,
+      "grad_norm": 0.7534209489822388,
+      "learning_rate": 5.4383355499559734e-06,
+      "loss": 0.9761,
+      "step": 331
+    },
+    {
+      "epoch": 0.9623188405797102,
+      "grad_norm": 0.9121699929237366,
+      "learning_rate": 5.41531994664652e-06,
+      "loss": 0.9994,
+      "step": 332
+    },
+    {
+      "epoch": 0.9652173913043478,
+      "grad_norm": 0.774753212928772,
+      "learning_rate": 5.392295478639226e-06,
+      "loss": 1.0218,
+      "step": 333
+    },
+    {
+      "epoch": 0.9681159420289855,
+      "grad_norm": 0.7575943470001221,
+      "learning_rate": 5.36926263737437e-06,
+      "loss": 0.9855,
+      "step": 334
+    },
+    {
+      "epoch": 0.9710144927536232,
+      "grad_norm": 0.8202754259109497,
+      "learning_rate": 5.346221914470959e-06,
+      "loss": 1.0112,
+      "step": 335
+    },
+    {
+      "epoch": 0.9739130434782609,
+      "grad_norm": 0.8952569961547852,
+      "learning_rate": 5.323173801716222e-06,
+      "loss": 0.9722,
+      "step": 336
+    },
+    {
+      "epoch": 0.9768115942028985,
+      "grad_norm": 0.7153046727180481,
+      "learning_rate": 5.300118791055122e-06,
+      "loss": 0.9847,
+      "step": 337
+    },
+    {
+      "epoch": 0.9797101449275363,
+      "grad_norm": 0.7900391221046448,
+      "learning_rate": 5.27705737457985e-06,
+      "loss": 1.0324,
+      "step": 338
+    },
+    {
+      "epoch": 0.9826086956521739,
+      "grad_norm": 0.8250629305839539,
+      "learning_rate": 5.253990044519329e-06,
+      "loss": 0.9764,
+      "step": 339
+    },
+    {
+      "epoch": 0.9855072463768116,
+      "grad_norm": 0.8809992671012878,
+      "learning_rate": 5.230917293228699e-06,
+      "loss": 1.0198,
+      "step": 340
+    },
+    {
+      "epoch": 0.9884057971014493,
+      "grad_norm": 0.7209755778312683,
+      "learning_rate": 5.207839613178814e-06,
+      "loss": 1.0253,
+      "step": 341
+    },
+    {
+      "epoch": 0.991304347826087,
+      "grad_norm": 0.8488002419471741,
+      "learning_rate": 5.184757496945726e-06,
+      "loss": 0.9333,
+      "step": 342
+    },
+    {
+      "epoch": 0.9942028985507246,
+      "grad_norm": 0.8114776611328125,
+      "learning_rate": 5.161671437200179e-06,
+      "loss": 1.0026,
+      "step": 343
+    },
+    {
+      "epoch": 0.9971014492753624,
+      "grad_norm": 0.8550688028335571,
+      "learning_rate": 5.138581926697083e-06,
+      "loss": 1.0057,
+      "step": 344
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.9187963008880615,
+      "learning_rate": 5.115489458265006e-06,
+      "loss": 1.0037,
+      "step": 345
+    },
+    {
+      "epoch": 1.0028985507246377,
+      "grad_norm": 0.8499656915664673,
+      "learning_rate": 5.09239452479565e-06,
+      "loss": 0.9793,
+      "step": 346
+    },
+    {
+      "epoch": 1.0057971014492753,
+      "grad_norm": 0.9663048982620239,
+      "learning_rate": 5.0692976192333295e-06,
+      "loss": 0.9337,
+      "step": 347
+    },
+    {
+      "epoch": 1.008695652173913,
+      "grad_norm": 0.8095614910125732,
+      "learning_rate": 5.046199234564455e-06,
+      "loss": 0.9461,
+      "step": 348
+    },
+    {
+      "epoch": 1.008695652173913,
+      "eval_loss": 0.9858289361000061,
+      "eval_runtime": 46.4396,
+      "eval_samples_per_second": 5.513,
+      "eval_steps_per_second": 0.689,
+      "step": 348
+    },
+    {
+      "epoch": 1.0115942028985507,
+      "grad_norm": 0.839413046836853,
+      "learning_rate": 5.0230998638070024e-06,
+      "loss": 0.9702,
+      "step": 349
+    },
+    {
+      "epoch": 1.0144927536231885,
+      "grad_norm": 0.8220239877700806,
+      "learning_rate": 5e-06,
+      "loss": 0.9403,
+      "step": 350
+    },
+    {
+      "epoch": 1.017391304347826,
+      "grad_norm": 0.8942255973815918,
+      "learning_rate": 4.976900136192998e-06,
+      "loss": 0.9763,
+      "step": 351
+    },
+    {
+      "epoch": 1.0028985507246377,
+      "grad_norm": 0.785389244556427,
+      "learning_rate": 4.953800765435547e-06,
+      "loss": 1.0033,
+      "step": 352
+    },
+    {
+      "epoch": 1.0057971014492753,
+      "grad_norm": 0.9310470223426819,
+      "learning_rate": 4.930702380766671e-06,
+      "loss": 0.9569,
+      "step": 353
+    },
+    {
+      "epoch": 1.008695652173913,
+      "grad_norm": 0.9420292377471924,
+      "learning_rate": 4.907605475204352e-06,
+      "loss": 1.0085,
+      "step": 354
+    },
+    {
+      "epoch": 1.0115942028985507,
+      "grad_norm": 0.8762017488479614,
+      "learning_rate": 4.8845105417349955e-06,
+      "loss": 1.0225,
+      "step": 355
+    },
+    {
+      "epoch": 1.0144927536231885,
+      "grad_norm": 0.8962522149085999,
+      "learning_rate": 4.861418073302919e-06,
+      "loss": 0.9543,
+      "step": 356
+    },
+    {
+      "epoch": 1.017391304347826,
+      "grad_norm": 0.8070088028907776,
+      "learning_rate": 4.838328562799824e-06,
+      "loss": 0.9334,
+      "step": 357
+    },
+    {
+      "epoch": 1.0202898550724637,
+      "grad_norm": 0.8407843708992004,
+      "learning_rate": 4.815242503054277e-06,
+      "loss": 0.9499,
+      "step": 358
+    },
+    {
+      "epoch": 1.0231884057971015,
+      "grad_norm": 0.8197099566459656,
+      "learning_rate": 4.79216038682119e-06,
+      "loss": 1.0039,
+      "step": 359
+    },
+    {
+      "epoch": 1.0260869565217392,
+      "grad_norm": 0.7919727563858032,
+      "learning_rate": 4.7690827067713035e-06,
+      "loss": 0.9731,
+      "step": 360
+    },
+    {
+      "epoch": 1.0289855072463767,
+      "grad_norm": 0.7514965534210205,
+      "learning_rate": 4.746009955480672e-06,
+      "loss": 0.9124,
+      "step": 361
+    },
+    {
+      "epoch": 1.0318840579710145,
+      "grad_norm": 0.7958142757415771,
+      "learning_rate": 4.7229426254201504e-06,
+      "loss": 0.9836,
+      "step": 362
+    },
+    {
+      "epoch": 1.0347826086956522,
+      "grad_norm": 0.9223296642303467,
+      "learning_rate": 4.69988120894488e-06,
+      "loss": 1.0372,
+      "step": 363
+    },
+    {
+      "epoch": 1.03768115942029,
+      "grad_norm": 0.7448701858520508,
+      "learning_rate": 4.676826198283779e-06,
+      "loss": 0.9189,
+      "step": 364
+    },
+    {
+      "epoch": 1.0405797101449274,
+      "grad_norm": 0.731107771396637,
+      "learning_rate": 4.653778085529043e-06,
+      "loss": 0.9632,
+      "step": 365
+    },
+    {
+      "epoch": 1.0434782608695652,
+      "grad_norm": 0.8460220694541931,
+      "learning_rate": 4.630737362625631e-06,
+      "loss": 0.9794,
+      "step": 366
+    },
+    {
+      "epoch": 1.046376811594203,
+      "grad_norm": 0.8166036605834961,
+      "learning_rate": 4.6077045213607765e-06,
+      "loss": 0.9976,
+      "step": 367
+    },
+    {
+      "epoch": 1.0492753623188407,
+      "grad_norm": 0.6962491869926453,
+      "learning_rate": 4.584680053353481e-06,
+      "loss": 0.9374,
+      "step": 368
+    },
+    {
+      "epoch": 1.0521739130434782,
+      "grad_norm": 0.8353239893913269,
+      "learning_rate": 4.561664450044029e-06,
+      "loss": 0.991,
+      "step": 369
+    },
+    {
+      "epoch": 1.055072463768116,
+      "grad_norm": 0.8190463781356812,
+      "learning_rate": 4.53865820268349e-06,
+      "loss": 0.9971,
+      "step": 370
+    },
+    {
+      "epoch": 1.0579710144927537,
+      "grad_norm": 0.904393196105957,
+      "learning_rate": 4.515661802323244e-06,
+      "loss": 0.9548,
+      "step": 371
+    },
+    {
+      "epoch": 1.0608695652173914,
+      "grad_norm": 0.7582879066467285,
+      "learning_rate": 4.492675739804486e-06,
+      "loss": 0.934,
+      "step": 372
+    },
+    {
+      "epoch": 1.063768115942029,
+      "grad_norm": 0.7787836194038391,
+      "learning_rate": 4.4697005057477634e-06,
+      "loss": 0.973,
+      "step": 373
+    },
+    {
+      "epoch": 1.0666666666666667,
+      "grad_norm": 0.7273504137992859,
+      "learning_rate": 4.446736590542497e-06,
+      "loss": 1.0166,
+      "step": 374
+    },
+    {
+      "epoch": 1.0695652173913044,
+      "grad_norm": 0.7512848377227783,
+      "learning_rate": 4.4237844843365126e-06,
+      "loss": 0.9951,
+      "step": 375
+    },
+    {
+      "epoch": 1.0724637681159421,
+      "grad_norm": 0.8715952038764954,
+      "learning_rate": 4.400844677025585e-06,
+      "loss": 1.0384,
+      "step": 376
+    },
+    {
+      "epoch": 1.0753623188405796,
+      "grad_norm": 1.1643601655960083,
+      "learning_rate": 4.377917658242975e-06,
+      "loss": 0.9725,
+      "step": 377
+    },
+    {
+      "epoch": 1.0782608695652174,
+      "grad_norm": 1.0170421600341797,
+      "learning_rate": 4.355003917348985e-06,
+      "loss": 0.9877,
+      "step": 378
+    },
+    {
+      "epoch": 1.0811594202898551,
+      "grad_norm": 0.8441584706306458,
+      "learning_rate": 4.332103943420507e-06,
+      "loss": 0.9795,
+      "step": 379
+    },
+    {
+      "epoch": 1.0840579710144929,
+      "grad_norm": 0.9508838057518005,
+      "learning_rate": 4.309218225240591e-06,
+      "loss": 1.0274,
+      "step": 380
+    },
+    {
+      "epoch": 1.0869565217391304,
+      "grad_norm": 0.9078054428100586,
+      "learning_rate": 4.286347251288004e-06,
+      "loss": 1.0117,
+      "step": 381
+    },
+    {
+      "epoch": 1.0898550724637681,
+      "grad_norm": 1.056804895401001,
+      "learning_rate": 4.263491509726812e-06,
+      "loss": 0.9588,
+      "step": 382
+    },
+    {
+      "epoch": 1.0927536231884059,
+      "grad_norm": 0.8957586288452148,
+      "learning_rate": 4.240651488395958e-06,
+      "loss": 0.9644,
+      "step": 383
+    },
+    {
+      "epoch": 1.0956521739130434,
+      "grad_norm": 0.9251319169998169,
+      "learning_rate": 4.217827674798845e-06,
+      "loss": 0.9764,
+      "step": 384
+    },
+    {
+      "epoch": 1.098550724637681,
+      "grad_norm": 0.8325505256652832,
+      "learning_rate": 4.195020556092935e-06,
+      "loss": 0.987,
+      "step": 385
+    },
+    {
+      "epoch": 1.1014492753623188,
+      "grad_norm": 0.8144704699516296,
+      "learning_rate": 4.17223061907935e-06,
+      "loss": 0.9898,
+      "step": 386
+    },
+    {
+      "epoch": 1.1043478260869566,
+      "grad_norm": 0.8545647859573364,
+      "learning_rate": 4.14945835019248e-06,
+      "loss": 0.9214,
+      "step": 387
+    },
+    {
+      "epoch": 1.107246376811594,
+      "grad_norm": 0.8896581530570984,
+      "learning_rate": 4.126704235489606e-06,
+      "loss": 0.9432,
+      "step": 388
+    },
+    {
+      "epoch": 1.1101449275362318,
+      "grad_norm": 0.8762820959091187,
+      "learning_rate": 4.103968760640516e-06,
+      "loss": 0.9754,
+      "step": 389
+    },
+    {
+      "epoch": 1.1130434782608696,
+      "grad_norm": 0.7869084477424622,
+      "learning_rate": 4.081252410917148e-06,
+      "loss": 0.9655,
+      "step": 390
+    },
+    {
+      "epoch": 1.1159420289855073,
+      "grad_norm": 0.9484694600105286,
+      "learning_rate": 4.058555671183227e-06,
+      "loss": 0.9461,
+      "step": 391
+    },
+    {
+      "epoch": 1.1188405797101448,
+      "grad_norm": 0.8366033434867859,
+      "learning_rate": 4.035879025883916e-06,
+      "loss": 0.9745,
+      "step": 392
+    },
+    {
+      "epoch": 1.1217391304347826,
+      "grad_norm": 0.8974631428718567,
+      "learning_rate": 4.013222959035481e-06,
+      "loss": 1.003,
+      "step": 393
+    },
+    {
+      "epoch": 1.1246376811594203,
+      "grad_norm": 0.9970961809158325,
+      "learning_rate": 3.99058795421495e-06,
+      "loss": 0.9548,
+      "step": 394
+    },
+    {
+      "epoch": 1.127536231884058,
+      "grad_norm": 0.8342113494873047,
+      "learning_rate": 3.967974494549803e-06,
+      "loss": 0.8879,
+      "step": 395
+    },
+    {
+      "epoch": 1.1304347826086956,
+      "grad_norm": 0.7740679383277893,
+      "learning_rate": 3.945383062707652e-06,
+      "loss": 1.0181,
+      "step": 396
+    },
+    {
+      "epoch": 1.1333333333333333,
+      "grad_norm": 0.8080225586891174,
+      "learning_rate": 3.922814140885942e-06,
+      "loss": 0.9629,
+      "step": 397
+    },
+    {
+      "epoch": 1.136231884057971,
+      "grad_norm": 0.745694637298584,
+      "learning_rate": 3.9002682108016585e-06,
+      "loss": 0.9725,
+      "step": 398
+    },
+    {
+      "epoch": 1.1391304347826088,
+      "grad_norm": 0.93767249584198,
+      "learning_rate": 3.8777457536810446e-06,
+      "loss": 0.9411,
+      "step": 399
+    },
+    {
+      "epoch": 1.1420289855072463,
+      "grad_norm": 0.7331735491752625,
+      "learning_rate": 3.855247250249331e-06,
+      "loss": 0.9187,
+      "step": 400
+    },
+    {
+      "epoch": 1.144927536231884,
+      "grad_norm": 1.1504460573196411,
+      "learning_rate": 3.832773180720475e-06,
+      "loss": 1.0038,
+      "step": 401
+    },
+    {
+      "epoch": 1.1478260869565218,
+      "grad_norm": 0.7792490124702454,
+      "learning_rate": 3.8103240247869077e-06,
+      "loss": 0.9583,
+      "step": 402
+    },
+    {
+      "epoch": 1.1507246376811595,
+      "grad_norm": 0.8607194423675537,
+      "learning_rate": 3.7879002616093015e-06,
+      "loss": 0.9608,
+      "step": 403
+    },
+    {
+      "epoch": 1.153623188405797,
+      "grad_norm": 0.7470278143882751,
+      "learning_rate": 3.765502369806334e-06,
+      "loss": 1.0097,
+      "step": 404
+    },
+    {
+      "epoch": 1.1565217391304348,
+      "grad_norm": 0.8549491763114929,
+      "learning_rate": 3.743130827444487e-06,
+      "loss": 0.9707,
+      "step": 405
+    },
+    {
+      "epoch": 1.1594202898550725,
+      "grad_norm": 0.8472537398338318,
+      "learning_rate": 3.720786112027822e-06,
+      "loss": 0.9746,
+      "step": 406
+    },
+    {
+      "epoch": 1.1623188405797102,
+      "grad_norm": 0.7988584637641907,
+      "learning_rate": 3.6984687004878052e-06,
+      "loss": 0.9883,
+      "step": 407
+    },
+    {
+      "epoch": 1.1652173913043478,
+      "grad_norm": 0.823165774345398,
+      "learning_rate": 3.6761790691731207e-06,
+      "loss": 1.013,
+      "step": 408
+    },
+    {
+      "epoch": 1.1681159420289855,
+      "grad_norm": 0.7537344694137573,
+      "learning_rate": 3.6539176938395037e-06,
+      "loss": 1.0081,
+      "step": 409
+    },
+    {
+      "epoch": 1.1710144927536232,
+      "grad_norm": 0.7858260273933411,
+      "learning_rate": 3.6316850496395863e-06,
+      "loss": 0.9688,
+      "step": 410
+    },
+    {
+      "epoch": 1.1739130434782608,
+      "grad_norm": 0.8715892434120178,
+      "learning_rate": 3.609481611112755e-06,
+      "loss": 1.0181,
+      "step": 411
+    },
+    {
+      "epoch": 1.1768115942028985,
+      "grad_norm": 0.816693127155304,
+      "learning_rate": 3.587307852175025e-06,
+      "loss": 0.9505,
+      "step": 412
+    },
+    {
+      "epoch": 1.1797101449275362,
+      "grad_norm": 0.9773905277252197,
+      "learning_rate": 3.5651642461089207e-06,
+      "loss": 0.9745,
+      "step": 413
+    },
+    {
+      "epoch": 1.182608695652174,
+      "grad_norm": 0.7822540998458862,
+      "learning_rate": 3.5430512655533774e-06,
+      "loss": 0.9977,
+      "step": 414
+    },
+    {
+      "epoch": 1.1855072463768117,
+      "grad_norm": 0.9197254180908203,
+      "learning_rate": 3.5209693824936486e-06,
+      "loss": 0.9955,
+      "step": 415
+    },
+    {
+      "epoch": 1.1884057971014492,
+      "grad_norm": 0.8545462489128113,
+      "learning_rate": 3.498919068251237e-06,
+      "loss": 1.0544,
+      "step": 416
+    },
+    {
+      "epoch": 1.191304347826087,
+      "grad_norm": 0.8395746350288391,
+      "learning_rate": 3.476900793473832e-06,
+      "loss": 0.9757,
+      "step": 417
+    },
+    {
+      "epoch": 1.1942028985507247,
+      "grad_norm": 0.8740842938423157,
+      "learning_rate": 3.4549150281252635e-06,
+      "loss": 0.9468,
+      "step": 418
+    },
+    {
+      "epoch": 1.1971014492753622,
+      "grad_norm": 0.7521042823791504,
+      "learning_rate": 3.4329622414754728e-06,
+      "loss": 0.9432,
+      "step": 419
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.713711142539978,
+      "learning_rate": 3.4110429020904924e-06,
+      "loss": 0.9838,
+      "step": 420
+    },
+    {
+      "epoch": 1.2028985507246377,
+      "grad_norm": 0.8481893539428711,
+      "learning_rate": 3.3891574778224524e-06,
+      "loss": 0.9489,
+      "step": 421
+    },
+    {
+      "epoch": 1.2057971014492754,
+      "grad_norm": 0.863029420375824,
+      "learning_rate": 3.3673064357995844e-06,
+      "loss": 1.0462,
+      "step": 422
+    },
+    {
+      "epoch": 1.208695652173913,
+      "grad_norm": 0.8649914860725403,
+      "learning_rate": 3.3454902424162603e-06,
+      "loss": 1.0085,
+      "step": 423
+    },
+    {
+      "epoch": 1.2115942028985507,
+      "grad_norm": 0.8374588489532471,
+      "learning_rate": 3.3237093633230323e-06,
+      "loss": 1.0425,
+      "step": 424
+    },
+    {
+      "epoch": 1.2144927536231884,
+      "grad_norm": 0.9396947026252747,
+      "learning_rate": 3.301964263416693e-06,
+      "loss": 1.0303,
+      "step": 425
+    },
+    {
+      "epoch": 1.2173913043478262,
+      "grad_norm": 0.8101410865783691,
+      "learning_rate": 3.2802554068303595e-06,
+      "loss": 0.9747,
+      "step": 426
+    },
+    {
+      "epoch": 1.2202898550724637,
+      "grad_norm": 0.9860018491744995,
+      "learning_rate": 3.2585832569235576e-06,
+      "loss": 0.9533,
+      "step": 427
+    },
+    {
+      "epoch": 1.2231884057971014,
+      "grad_norm": 0.950383186340332,
+      "learning_rate": 3.236948276272337e-06,
+      "loss": 0.9562,
+      "step": 428
+    },
+    {
+      "epoch": 1.2260869565217392,
+      "grad_norm": 0.8197913765907288,
+      "learning_rate": 3.2153509266593984e-06,
+      "loss": 0.9588,
+      "step": 429
+    },
+    {
+      "epoch": 1.228985507246377,
+      "grad_norm": 0.8033617734909058,
+      "learning_rate": 3.1937916690642356e-06,
+      "loss": 1.0014,
+      "step": 430
+    },
+    {
+      "epoch": 1.2318840579710144,
+      "grad_norm": 0.8451259732246399,
+      "learning_rate": 3.1722709636532944e-06,
+      "loss": 0.9428,
+      "step": 431
+    },
+    {
+      "epoch": 1.2347826086956522,
+      "grad_norm": 0.7560276985168457,
+      "learning_rate": 3.150789269770155e-06,
+      "loss": 1.002,
+      "step": 432
+    },
+    {
+      "epoch": 1.23768115942029,
+      "grad_norm": 0.918804943561554,
+      "learning_rate": 3.1293470459257237e-06,
+      "loss": 0.9653,
+      "step": 433
+    },
+    {
+      "epoch": 1.2405797101449276,
+      "grad_norm": 0.8339065313339233,
+      "learning_rate": 3.107944749788449e-06,
+      "loss": 0.9407,
+      "step": 434
+    },
+    {
+      "epoch": 1.2434782608695651,
+      "grad_norm": 0.7564199566841125,
+      "learning_rate": 3.0865828381745515e-06,
+      "loss": 1.012,
+      "step": 435
+    },
+    {
+      "epoch": 1.2434782608695651,
+      "eval_loss": 0.9773865938186646,
+      "eval_runtime": 46.2701,
+      "eval_samples_per_second": 5.533,
+      "eval_steps_per_second": 0.692,
+      "step": 435
+    },
+    {
+      "epoch": 1.2463768115942029,
+      "grad_norm": 0.7768362164497375,
+      "learning_rate": 3.0652617670382745e-06,
+      "loss": 0.9642,
+      "step": 436
+    },
+    {
+      "epoch": 1.2492753623188406,
+      "grad_norm": 0.8295703530311584,
+      "learning_rate": 3.04398199146215e-06,
+      "loss": 1.0002,
+      "step": 437
+    },
+    {
+      "epoch": 1.2521739130434781,
+      "grad_norm": 0.8403414487838745,
+      "learning_rate": 3.0227439656472878e-06,
+      "loss": 0.9772,
+      "step": 438
+    },
+    {
+      "epoch": 1.2550724637681159,
+      "grad_norm": 0.8178934454917908,
+      "learning_rate": 3.0015481429036807e-06,
+      "loss": 1.0126,
+      "step": 439
+    },
+    {
+      "epoch": 1.2579710144927536,
+      "grad_norm": 0.8231812119483948,
+      "learning_rate": 2.980394975640526e-06,
+      "loss": 0.9118,
+      "step": 440
+    },
+    {
+      "epoch": 1.2608695652173914,
+      "grad_norm": 0.8780835270881653,
+      "learning_rate": 2.9592849153565727e-06,
+      "loss": 0.9549,
+      "step": 441
+    },
+    {
+      "epoch": 1.263768115942029,
+      "grad_norm": 1.000675916671753,
+      "learning_rate": 2.9382184126304834e-06,
+      "loss": 1.0483,
+      "step": 442
+    },
+    {
+      "epoch": 1.2666666666666666,
+      "grad_norm": 0.8840986490249634,
+      "learning_rate": 2.917195917111215e-06,
+      "loss": 0.9931,
+      "step": 443
+    },
+    {
+      "epoch": 1.2695652173913043,
+      "grad_norm": 0.8707259297370911,
+      "learning_rate": 2.8962178775084267e-06,
+      "loss": 0.8975,
+      "step": 444
+    },
+    {
+      "epoch": 1.272463768115942,
+      "grad_norm": 0.7439221739768982,
+      "learning_rate": 2.8752847415828923e-06,
+      "loss": 0.9453,
+      "step": 445
+    },
+    {
+      "epoch": 1.2753623188405796,
+      "grad_norm": 0.9899610280990601,
+      "learning_rate": 2.8543969561369556e-06,
+      "loss": 0.9426,
+      "step": 446
+    },
+    {
+      "epoch": 1.2782608695652173,
+      "grad_norm": 0.9144057035446167,
+      "learning_rate": 2.8335549670049866e-06,
+      "loss": 0.9453,
+      "step": 447
+    },
+    {
+      "epoch": 1.281159420289855,
+      "grad_norm": 0.9034680128097534,
+      "learning_rate": 2.812759219043869e-06,
+      "loss": 0.9258,
+      "step": 448
+    },
+    {
+      "epoch": 1.2840579710144928,
+      "grad_norm": 0.9689735174179077,
+      "learning_rate": 2.7920101561234954e-06,
+      "loss": 0.993,
+      "step": 449
+    },
+    {
+      "epoch": 1.2869565217391306,
+      "grad_norm": 0.6610868573188782,
+      "learning_rate": 2.771308221117309e-06,
+      "loss": 0.9506,
+      "step": 450
+    },
+    {
+      "epoch": 1.289855072463768,
+      "grad_norm": 0.829849362373352,
+      "learning_rate": 2.750653855892836e-06,
+      "loss": 0.9609,
+      "step": 451
+    },
+    {
+      "epoch": 1.2927536231884058,
+      "grad_norm": 0.7730438709259033,
+      "learning_rate": 2.7300475013022666e-06,
+      "loss": 0.9859,
+      "step": 452
+    },
+    {
+      "epoch": 1.2956521739130435,
+      "grad_norm": 0.925363302230835,
+      "learning_rate": 2.7094895971730326e-06,
+      "loss": 1.0286,
+      "step": 453
+    },
+    {
+      "epoch": 1.298550724637681,
+      "grad_norm": 0.886048436164856,
+      "learning_rate": 2.6889805822984348e-06,
+      "loss": 0.952,
+      "step": 454
+    },
+    {
+      "epoch": 1.3014492753623188,
+      "grad_norm": 1.1092323064804077,
+      "learning_rate": 2.668520894428259e-06,
+      "loss": 1.0032,
+      "step": 455
+    },
+    {
+      "epoch": 1.3043478260869565,
+      "grad_norm": 0.7811794877052307,
+      "learning_rate": 2.648110970259454e-06,
+      "loss": 0.9296,
+      "step": 456
+    },
+    {
+      "epoch": 1.3072463768115943,
+      "grad_norm": 0.8023120164871216,
+      "learning_rate": 2.6277512454267874e-06,
+      "loss": 0.9304,
+      "step": 457
+    },
+    {
+      "epoch": 1.310144927536232,
+      "grad_norm": 0.7649518251419067,
+      "learning_rate": 2.607442154493568e-06,
+      "loss": 0.9441,
+      "step": 458
+    },
+    {
+      "epoch": 1.3130434782608695,
+      "grad_norm": 0.8725413680076599,
+      "learning_rate": 2.5871841309423557e-06,
+      "loss": 0.9637,
+      "step": 459
+    },
+    {
+      "epoch": 1.3159420289855073,
+      "grad_norm": 0.7210862636566162,
+      "learning_rate": 2.5669776071657194e-06,
+      "loss": 0.9869,
+      "step": 460
+    },
+    {
+      "epoch": 1.318840579710145,
+      "grad_norm": 0.8270391821861267,
+      "learning_rate": 2.546823014456998e-06,
+      "loss": 0.9164,
+      "step": 461
+    },
+    {
+      "epoch": 1.3217391304347825,
+      "grad_norm": 0.829223096370697,
+      "learning_rate": 2.526720783001107e-06,
+      "loss": 1.0128,
+      "step": 462
+    },
+    {
+      "epoch": 1.3246376811594203,
+      "grad_norm": 0.9681026935577393,
+      "learning_rate": 2.506671341865341e-06,
+      "loss": 0.9768,
+      "step": 463
+    },
+    {
+      "epoch": 1.327536231884058,
+      "grad_norm": 0.840314507484436,
+      "learning_rate": 2.486675118990233e-06,
+      "loss": 0.9359,
+      "step": 464
+    },
+    {
+      "epoch": 1.3304347826086955,
+      "grad_norm": 0.659677267074585,
+      "learning_rate": 2.466732541180404e-06,
+      "loss": 0.965,
+      "step": 465
+    },
+    {
+      "epoch": 1.3333333333333333,
+      "grad_norm": 0.9055850505828857,
+      "learning_rate": 2.4468440340954664e-06,
+      "loss": 0.9557,
+      "step": 466
+    },
+    {
+      "epoch": 1.336231884057971,
+      "grad_norm": 0.8318009972572327,
+      "learning_rate": 2.4270100222409275e-06,
+      "loss": 0.9111,
+      "step": 467
+    },
+    {
+      "epoch": 1.3391304347826087,
+      "grad_norm": 0.9112004041671753,
+      "learning_rate": 2.4072309289591394e-06,
+      "loss": 0.9243,
+      "step": 468
+    },
+    {
+      "epoch": 1.3420289855072465,
+      "grad_norm": 0.8032493591308594,
+      "learning_rate": 2.387507176420256e-06,
+      "loss": 0.9228,
+      "step": 469
+    },
+    {
+      "epoch": 1.344927536231884,
+      "grad_norm": 0.662981390953064,
+      "learning_rate": 2.3678391856132203e-06,
+      "loss": 0.9778,
+      "step": 470
+    },
+    {
+      "epoch": 1.3478260869565217,
+      "grad_norm": 0.8368533849716187,
+      "learning_rate": 2.348227376336789e-06,
+      "loss": 1.0145,
+      "step": 471
+    },
+    {
+      "epoch": 1.3507246376811595,
+      "grad_norm": 0.9046915769577026,
+      "learning_rate": 2.328672167190558e-06,
+      "loss": 0.9393,
+      "step": 472
+    },
+    {
+      "epoch": 1.353623188405797,
+      "grad_norm": 0.9030489921569824,
+      "learning_rate": 2.3091739755660425e-06,
+      "loss": 0.9636,
+      "step": 473
+    },
+    {
+      "epoch": 1.3565217391304347,
+      "grad_norm": 0.8339246511459351,
+      "learning_rate": 2.289733217637753e-06,
+      "loss": 0.9395,
+      "step": 474
+    },
+    {
+      "epoch": 1.3594202898550725,
+      "grad_norm": 0.7877910733222961,
+      "learning_rate": 2.2703503083543288e-06,
+      "loss": 0.9454,
+      "step": 475
+    },
+    {
+      "epoch": 1.3623188405797102,
+      "grad_norm": 0.9808143377304077,
+      "learning_rate": 2.2510256614296638e-06,
+      "loss": 0.9968,
+      "step": 476
+    },
+    {
+      "epoch": 1.365217391304348,
+      "grad_norm": 1.2518080472946167,
+      "learning_rate": 2.2317596893340924e-06,
+      "loss": 0.9732,
+      "step": 477
+    },
+    {
+      "epoch": 1.3681159420289855,
+      "grad_norm": 0.8053367137908936,
+      "learning_rate": 2.2125528032855727e-06,
+      "loss": 0.9803,
+      "step": 478
+    },
+    {
+      "epoch": 1.3710144927536232,
+      "grad_norm": 0.9491231441497803,
+      "learning_rate": 2.1934054132409183e-06,
+      "loss": 0.9332,
+      "step": 479
+    },
+    {
+      "epoch": 1.373913043478261,
+      "grad_norm": 0.7503049373626709,
+      "learning_rate": 2.174317927887041e-06,
+      "loss": 0.9591,
+      "step": 480
+    },
+    {
+      "epoch": 1.3768115942028984,
+      "grad_norm": 0.819608211517334,
+      "learning_rate": 2.1552907546322356e-06,
+      "loss": 0.9795,
+      "step": 481
+    },
+    {
+      "epoch": 1.3797101449275362,
+      "grad_norm": 0.8053436279296875,
+      "learning_rate": 2.136324299597474e-06,
+      "loss": 1.0053,
+      "step": 482
+    },
+    {
+      "epoch": 1.382608695652174,
+      "grad_norm": 0.7377948760986328,
+      "learning_rate": 2.11741896760775e-06,
+      "loss": 1.0277,
+      "step": 483
+    },
+    {
+      "epoch": 1.3855072463768117,
+      "grad_norm": 0.865705668926239,
+      "learning_rate": 2.098575162183422e-06,
+      "loss": 0.9952,
+      "step": 484
+    },
+    {
+      "epoch": 1.3884057971014494,
+      "grad_norm": 0.8623892664909363,
+      "learning_rate": 2.0797932855316183e-06,
+      "loss": 1.0304,
+      "step": 485
+    },
+    {
+      "epoch": 1.391304347826087,
+      "grad_norm": 0.803113579750061,
+      "learning_rate": 2.061073738537635e-06,
+      "loss": 0.993,
+      "step": 486
+    },
+    {
+      "epoch": 1.3942028985507247,
+      "grad_norm": 0.7748633623123169,
+      "learning_rate": 2.0424169207563954e-06,
+      "loss": 0.9103,
+      "step": 487
+    },
+    {
+      "epoch": 1.3971014492753624,
+      "grad_norm": 0.9022510051727295,
+      "learning_rate": 2.023823230403907e-06,
+      "loss": 0.9125,
+      "step": 488
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.8588757514953613,
+      "learning_rate": 2.005293064348773e-06,
+      "loss": 1.0259,
+      "step": 489
+    },
+    {
+      "epoch": 1.4028985507246376,
+      "grad_norm": 0.8985849618911743,
+      "learning_rate": 1.9868268181037186e-06,
+      "loss": 0.9839,
+      "step": 490
+    },
+    {
+      "epoch": 1.4057971014492754,
+      "grad_norm": 0.8959106802940369,
+      "learning_rate": 1.968424885817143e-06,
+      "loss": 0.9752,
+      "step": 491
+    },
+    {
+      "epoch": 1.4086956521739131,
+      "grad_norm": 0.9213183522224426,
+      "learning_rate": 1.9500876602647167e-06,
+      "loss": 0.9053,
+      "step": 492
+    },
+    {
+      "epoch": 1.4115942028985506,
+      "grad_norm": 0.8219558596611023,
+      "learning_rate": 1.931815532840987e-06,
+      "loss": 0.9522,
+      "step": 493
+    },
+    {
+      "epoch": 1.4144927536231884,
+      "grad_norm": 0.8716898560523987,
+      "learning_rate": 1.913608893551036e-06,
+      "loss": 0.9858,
+      "step": 494
+    },
+    {
+      "epoch": 1.4173913043478261,
+      "grad_norm": 0.9072102904319763,
+      "learning_rate": 1.8954681310021434e-06,
+      "loss": 0.9382,
+      "step": 495
+    },
+    {
+      "epoch": 1.4202898550724639,
+      "grad_norm": 0.8592570424079895,
+      "learning_rate": 1.8773936323955055e-06,
+      "loss": 1.0004,
+      "step": 496
+    },
+    {
+      "epoch": 1.4231884057971014,
+      "grad_norm": 0.8882102966308594,
+      "learning_rate": 1.8593857835179557e-06,
+      "loss": 0.9862,
+      "step": 497
+    },
+    {
+      "epoch": 1.4260869565217391,
+      "grad_norm": 0.851216197013855,
+      "learning_rate": 1.8414449687337467e-06,
+      "loss": 1.0109,
+      "step": 498
+    },
+    {
+      "epoch": 1.4289855072463769,
+      "grad_norm": 0.7851223349571228,
+      "learning_rate": 1.8235715709763285e-06,
+      "loss": 0.9404,
+      "step": 499
+    },
+    {
+      "epoch": 1.4318840579710144,
+      "grad_norm": 0.7435230612754822,
+      "learning_rate": 1.8057659717401948e-06,
+      "loss": 1.0388,
+      "step": 500
+    },
+    {
+      "epoch": 1.434782608695652,
+      "grad_norm": 0.795467734336853,
+      "learning_rate": 1.7880285510727197e-06,
+      "loss": 1.0,
+      "step": 501
+    },
+    {
+      "epoch": 1.4376811594202898,
+      "grad_norm": 0.8847975730895996,
+      "learning_rate": 1.7703596875660645e-06,
+      "loss": 1.0182,
+      "step": 502
+    },
+    {
+      "epoch": 1.4405797101449276,
+      "grad_norm": 1.0256052017211914,
+      "learning_rate": 1.7527597583490825e-06,
+      "loss": 0.9573,
+      "step": 503
+    },
+    {
+      "epoch": 1.4434782608695653,
+      "grad_norm": 0.7743212580680847,
+      "learning_rate": 1.7352291390792798e-06,
+      "loss": 0.9831,
+      "step": 504
+    },
+    {
+      "epoch": 1.4463768115942028,
+      "grad_norm": 0.9608955979347229,
+      "learning_rate": 1.7177682039347875e-06,
+      "loss": 0.9683,
+      "step": 505
+    },
+    {
+      "epoch": 1.4492753623188406,
+      "grad_norm": 0.899786651134491,
+      "learning_rate": 1.7003773256063882e-06,
+      "loss": 1.0373,
+      "step": 506
+    },
+    {
+      "epoch": 1.4521739130434783,
+      "grad_norm": 0.933459997177124,
+      "learning_rate": 1.6830568752895455e-06,
+      "loss": 1.0065,
+      "step": 507
+    },
+    {
+      "epoch": 1.4550724637681158,
+      "grad_norm": 0.7607547640800476,
+      "learning_rate": 1.6658072226764949e-06,
+      "loss": 0.9652,
+      "step": 508
+    },
+    {
+      "epoch": 1.4579710144927536,
+      "grad_norm": 0.7857306599617004,
+      "learning_rate": 1.6486287359483422e-06,
+      "loss": 0.9943,
+      "step": 509
+    },
+    {
+      "epoch": 1.4608695652173913,
+      "grad_norm": 0.9342886209487915,
+      "learning_rate": 1.6315217817672142e-06,
+      "loss": 1.028,
+      "step": 510
+    },
+    {
+      "epoch": 1.463768115942029,
+      "grad_norm": 1.0333482027053833,
+      "learning_rate": 1.614486725268426e-06,
+      "loss": 0.9296,
+      "step": 511
+    },
+    {
+      "epoch": 1.4666666666666668,
+      "grad_norm": 0.7788994908332825,
+      "learning_rate": 1.5975239300526924e-06,
+      "loss": 0.9871,
+      "step": 512
+    },
+    {
+      "epoch": 1.4695652173913043,
+      "grad_norm": 0.764268159866333,
+      "learning_rate": 1.5806337581783593e-06,
+      "loss": 0.9603,
+      "step": 513
+    },
+    {
+      "epoch": 1.472463768115942,
+      "grad_norm": 0.9053126573562622,
+      "learning_rate": 1.5638165701536866e-06,
+      "loss": 1.003,
+      "step": 514
+    },
+    {
+      "epoch": 1.4753623188405798,
+      "grad_norm": 0.890696108341217,
+      "learning_rate": 1.5470727249291423e-06,
+      "loss": 0.9894,
+      "step": 515
+    },
+    {
+      "epoch": 1.4782608695652173,
+      "grad_norm": 0.755885124206543,
+      "learning_rate": 1.5304025798897521e-06,
+      "loss": 0.9355,
+      "step": 516
+    },
+    {
+      "epoch": 1.481159420289855,
+      "grad_norm": 0.8839924931526184,
+      "learning_rate": 1.5138064908474603e-06,
+      "loss": 0.9879,
+      "step": 517
+    },
+    {
+      "epoch": 1.4840579710144928,
+      "grad_norm": 0.919336199760437,
+      "learning_rate": 1.4972848120335453e-06,
+      "loss": 1.042,
+      "step": 518
+    },
+    {
+      "epoch": 1.4869565217391305,
+      "grad_norm": 1.0073022842407227,
+      "learning_rate": 1.4808378960910502e-06,
+      "loss": 1.0537,
+      "step": 519
+    },
+    {
+      "epoch": 1.4898550724637682,
+      "grad_norm": 0.9994317293167114,
+      "learning_rate": 1.4644660940672628e-06,
+      "loss": 1.042,
+      "step": 520
+    },
+    {
+      "epoch": 1.4927536231884058,
+      "grad_norm": 0.8237168788909912,
+      "learning_rate": 1.448169755406218e-06,
+      "loss": 0.9449,
+      "step": 521
+    },
+    {
+      "epoch": 1.4956521739130435,
+      "grad_norm": 0.8838447332382202,
+      "learning_rate": 1.4319492279412388e-06,
+      "loss": 0.9789,
+      "step": 522
+    },
+    {
+      "epoch": 1.4956521739130435,
+      "eval_loss": 0.9736447334289551,
+      "eval_runtime": 46.3906,
+      "eval_samples_per_second": 5.518,
+      "eval_steps_per_second": 0.69,
+      "step": 522
+    },
+    {
+      "epoch": 1.4985507246376812,
+      "grad_norm": 0.7661985754966736,
+      "learning_rate": 1.4158048578875211e-06,
+      "loss": 0.9991,
+      "step": 523
+    },
+    {
+      "epoch": 1.5014492753623188,
+      "grad_norm": 0.8049348592758179,
+      "learning_rate": 1.399736989834728e-06,
+      "loss": 0.9455,
+      "step": 524
+    },
+    {
+      "epoch": 1.5043478260869565,
+      "grad_norm": 0.8575480580329895,
+      "learning_rate": 1.383745966739652e-06,
+      "loss": 0.9764,
+      "step": 525
+    },
+    {
+      "epoch": 1.5072463768115942,
+      "grad_norm": 0.7336897253990173,
+      "learning_rate": 1.3678321299188802e-06,
+      "loss": 0.9613,
+      "step": 526
+    },
+    {
+      "epoch": 1.5101449275362318,
+      "grad_norm": 0.8718299865722656,
+      "learning_rate": 1.351995819041521e-06,
+      "loss": 0.9923,
+      "step": 527
+    },
+    {
+      "epoch": 1.5130434782608697,
+      "grad_norm": 0.9166209101676941,
+      "learning_rate": 1.336237372121944e-06,
+      "loss": 1.069,
+      "step": 528
+    },
+    {
+      "epoch": 1.5159420289855072,
+      "grad_norm": 0.9382581114768982,
+      "learning_rate": 1.320557125512575e-06,
+      "loss": 0.9671,
+      "step": 529
+    },
+    {
+      "epoch": 1.518840579710145,
+      "grad_norm": 0.8037452101707458,
+      "learning_rate": 1.3049554138967052e-06,
+      "loss": 0.9395,
+      "step": 530
+    },
+    {
+      "epoch": 1.5217391304347827,
+      "grad_norm": 0.6627395749092102,
+      "learning_rate": 1.289432570281361e-06,
+      "loss": 0.9025,
+      "step": 531
+    },
+    {
+      "epoch": 1.5246376811594202,
+      "grad_norm": 0.7865214943885803,
+      "learning_rate": 1.2739889259901866e-06,
+      "loss": 0.9021,
+      "step": 532
+    },
+    {
+      "epoch": 1.527536231884058,
+      "grad_norm": 0.8900570273399353,
+      "learning_rate": 1.258624810656376e-06,
+      "loss": 0.946,
+      "step": 533
+    },
+    {
+      "epoch": 1.5304347826086957,
+      "grad_norm": 0.8942597508430481,
+      "learning_rate": 1.2433405522156334e-06,
+      "loss": 1.0141,
+      "step": 534
+    },
+    {
+      "epoch": 1.5333333333333332,
+      "grad_norm": 0.8667037487030029,
+      "learning_rate": 1.2281364768991804e-06,
+      "loss": 1.0092,
+      "step": 535
+    },
+    {
+      "epoch": 1.5362318840579712,
+      "grad_norm": 0.7895119190216064,
+      "learning_rate": 1.213012909226786e-06,
+      "loss": 0.9251,
+      "step": 536
+    },
+    {
+      "epoch": 1.5391304347826087,
+      "grad_norm": 0.8225801587104797,
+      "learning_rate": 1.1979701719998454e-06,
+      "loss": 0.9449,
+      "step": 537
+    },
+    {
+      "epoch": 1.5420289855072464,
+      "grad_norm": 0.8342156410217285,
+      "learning_rate": 1.1830085862944851e-06,
+      "loss": 0.9676,
+      "step": 538
+    },
+    {
+      "epoch": 1.5449275362318842,
+      "grad_norm": 0.7941964864730835,
+      "learning_rate": 1.1681284714547147e-06,
+      "loss": 0.9907,
+      "step": 539
+    },
+    {
+      "epoch": 1.5478260869565217,
+      "grad_norm": 0.9655299782752991,
+      "learning_rate": 1.1533301450856054e-06,
+      "loss": 1.0126,
+      "step": 540
+    },
+    {
+      "epoch": 1.5507246376811594,
+      "grad_norm": 0.8632703423500061,
+      "learning_rate": 1.1386139230465176e-06,
+      "loss": 0.9452,
+      "step": 541
+    },
+    {
+      "epoch": 1.5536231884057972,
+      "grad_norm": 0.8908371329307556,
+      "learning_rate": 1.1239801194443507e-06,
+      "loss": 0.9821,
+      "step": 542
+    },
+    {
+      "epoch": 1.5565217391304347,
+      "grad_norm": 0.873409628868103,
+      "learning_rate": 1.1094290466268493e-06,
+      "loss": 0.969,
+      "step": 543
+    },
+    {
+      "epoch": 1.5594202898550724,
+      "grad_norm": 0.8888543844223022,
+      "learning_rate": 1.0949610151759233e-06,
+      "loss": 0.9593,
+      "step": 544
+    },
+    {
+      "epoch": 1.5623188405797102,
+      "grad_norm": 0.7646573781967163,
+      "learning_rate": 1.0805763339010329e-06,
+      "loss": 0.9287,
+      "step": 545
+    },
+    {
+      "epoch": 1.5652173913043477,
+      "grad_norm": 0.835421085357666,
+      "learning_rate": 1.066275309832584e-06,
+      "loss": 0.9732,
+      "step": 546
+    },
+    {
+      "epoch": 1.5681159420289856,
+      "grad_norm": 0.9228112697601318,
+      "learning_rate": 1.0520582482153874e-06,
+      "loss": 0.9675,
+      "step": 547
+    },
+    {
+      "epoch": 1.5710144927536231,
+      "grad_norm": 0.7750451564788818,
+      "learning_rate": 1.037925452502131e-06,
+      "loss": 0.9938,
+      "step": 548
+    },
+    {
+      "epoch": 1.5739130434782609,
+      "grad_norm": 0.8366883397102356,
+      "learning_rate": 1.0238772243469153e-06,
+      "loss": 0.962,
+      "step": 549
+    },
+    {
+      "epoch": 1.5768115942028986,
+      "grad_norm": 0.933855414390564,
+      "learning_rate": 1.0099138635988026e-06,
+      "loss": 0.9732,
+      "step": 550
+    },
+    {
+      "epoch": 1.5797101449275361,
+      "grad_norm": 0.9288073778152466,
+      "learning_rate": 9.960356682954293e-07,
+      "loss": 0.9958,
+      "step": 551
+    },
+    {
+      "epoch": 1.5826086956521739,
+      "grad_norm": 0.7197360992431641,
+      "learning_rate": 9.822429346566314e-07,
+      "loss": 0.9266,
+      "step": 552
+    },
+    {
+      "epoch": 1.5855072463768116,
+      "grad_norm": 0.8900216817855835,
+      "learning_rate": 9.685359570781344e-07,
+      "loss": 1.0006,
+      "step": 553
+    },
+    {
+      "epoch": 1.5884057971014491,
+      "grad_norm": 0.7970424294471741,
+      "learning_rate": 9.549150281252633e-07,
+      "loss": 0.968,
+      "step": 554
+    },
+    {
+      "epoch": 1.591304347826087,
+      "grad_norm": 0.9357386231422424,
+      "learning_rate": 9.41380438526694e-07,
+      "loss": 1.0361,
+      "step": 555
+    },
+    {
+      "epoch": 1.5942028985507246,
+      "grad_norm": 0.740880012512207,
+      "learning_rate": 9.279324771682586e-07,
+      "loss": 0.9492,
+      "step": 556
+    },
+    {
+      "epoch": 1.5971014492753624,
+      "grad_norm": 0.9611430764198303,
+      "learning_rate": 9.145714310867676e-07,
+      "loss": 0.9559,
+      "step": 557
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.9163907170295715,
+      "learning_rate": 9.01297585463895e-07,
+      "loss": 1.0112,
+      "step": 558
+    },
+    {
+      "epoch": 1.6028985507246376,
+      "grad_norm": 0.9926815032958984,
+      "learning_rate": 8.881112236200795e-07,
+      "loss": 1.0813,
+      "step": 559
+    },
+    {
+      "epoch": 1.6057971014492753,
+      "grad_norm": 0.8820666074752808,
+      "learning_rate": 8.750126270084891e-07,
+      "loss": 0.9911,
+      "step": 560
+    },
+    {
+      "epoch": 1.608695652173913,
+      "grad_norm": 0.817694365978241,
+      "learning_rate": 8.620020752090008e-07,
+      "loss": 0.9162,
+      "step": 561
+    },
+    {
+      "epoch": 1.6115942028985506,
+      "grad_norm": 0.9005435109138489,
+      "learning_rate": 8.490798459222477e-07,
+      "loss": 1.015,
+      "step": 562
+    },
+    {
+      "epoch": 1.6144927536231886,
+      "grad_norm": 0.8248128890991211,
+      "learning_rate": 8.362462149636757e-07,
+      "loss": 0.9976,
+      "step": 563
+    },
+    {
+      "epoch": 1.617391304347826,
+      "grad_norm": 0.8286884427070618,
+      "learning_rate": 8.235014562576732e-07,
+      "loss": 0.992,
+      "step": 564
+    },
+    {
+      "epoch": 1.6202898550724638,
+      "grad_norm": 0.8723387718200684,
+      "learning_rate": 8.108458418317089e-07,
+      "loss": 0.9381,
+      "step": 565
+    },
+    {
+      "epoch": 1.6231884057971016,
+      "grad_norm": 0.9833754897117615,
+      "learning_rate": 7.98279641810537e-07,
+      "loss": 0.9435,
+      "step": 566
+    },
+    {
+      "epoch": 1.626086956521739,
+      "grad_norm": 0.9212725162506104,
+      "learning_rate": 7.858031244104247e-07,
+      "loss": 0.9611,
+      "step": 567
+    },
+    {
+      "epoch": 1.6289855072463768,
+      "grad_norm": 0.852350115776062,
+      "learning_rate": 7.734165559334327e-07,
+      "loss": 0.9064,
+      "step": 568
+    },
+    {
+      "epoch": 1.6318840579710145,
+      "grad_norm": 0.8955137729644775,
+      "learning_rate": 7.611202007617241e-07,
+      "loss": 0.9547,
+      "step": 569
+    },
+    {
+      "epoch": 1.634782608695652,
+      "grad_norm": 0.8889902830123901,
+      "learning_rate": 7.489143213519301e-07,
+      "loss": 0.9533,
+      "step": 570
+    },
+    {
+      "epoch": 1.6376811594202898,
+      "grad_norm": 0.9037710428237915,
+      "learning_rate": 7.367991782295392e-07,
+      "loss": 0.9213,
+      "step": 571
+    },
+    {
+      "epoch": 1.6405797101449275,
+      "grad_norm": 0.8594886064529419,
+      "learning_rate": 7.24775029983345e-07,
+      "loss": 0.9765,
+      "step": 572
+    },
+    {
+      "epoch": 1.643478260869565,
+      "grad_norm": 0.7082343101501465,
+      "learning_rate": 7.128421332599189e-07,
+      "loss": 0.9871,
+      "step": 573
+    },
+    {
+      "epoch": 1.646376811594203,
+      "grad_norm": 0.878217339515686,
+      "learning_rate": 7.010007427581378e-07,
+      "loss": 0.9366,
+      "step": 574
+    },
+    {
+      "epoch": 1.6492753623188405,
+      "grad_norm": 0.9462459087371826,
+      "learning_rate": 6.892511112237472e-07,
+      "loss": 0.9505,
+      "step": 575
+    },
+    {
+      "epoch": 1.6521739130434783,
+      "grad_norm": 0.7900387644767761,
+      "learning_rate": 6.775934894439606e-07,
+      "loss": 0.9554,
+      "step": 576
+    },
+    {
+      "epoch": 1.655072463768116,
+      "grad_norm": 0.8542242050170898,
+      "learning_rate": 6.66028126242117e-07,
+      "loss": 0.9331,
+      "step": 577
+    },
+    {
+      "epoch": 1.6579710144927535,
+      "grad_norm": 0.9795560836791992,
+      "learning_rate": 6.545552684723583e-07,
+      "loss": 0.9203,
+      "step": 578
+    },
+    {
+      "epoch": 1.6608695652173913,
+      "grad_norm": 0.7833444476127625,
+      "learning_rate": 6.431751610143716e-07,
+      "loss": 0.9977,
+      "step": 579
+    },
+    {
+      "epoch": 1.663768115942029,
+      "grad_norm": 0.8404137492179871,
+      "learning_rate": 6.318880467681527e-07,
+      "loss": 0.9981,
+      "step": 580
+    },
+    {
+      "epoch": 1.6666666666666665,
+      "grad_norm": 0.9158584475517273,
+      "learning_rate": 6.206941666488287e-07,
+      "loss": 0.9584,
+      "step": 581
+    },
+    {
+      "epoch": 1.6695652173913045,
+      "grad_norm": 0.7720228433609009,
+      "learning_rate": 6.095937595815104e-07,
+      "loss": 0.9284,
+      "step": 582
+    },
+    {
+      "epoch": 1.672463768115942,
+      "grad_norm": 0.9077423214912415,
+      "learning_rate": 5.985870624961993e-07,
+      "loss": 1.0104,
+      "step": 583
+    },
+    {
+      "epoch": 1.6753623188405797,
+      "grad_norm": 0.7142834663391113,
+      "learning_rate": 5.876743103227217e-07,
+      "loss": 0.9617,
+      "step": 584
+    },
+    {
+      "epoch": 1.6782608695652175,
+      "grad_norm": 0.9244917035102844,
+      "learning_rate": 5.768557359857241e-07,
+      "loss": 0.9534,
+      "step": 585
+    },
+    {
+      "epoch": 1.681159420289855,
+      "grad_norm": 0.8961134552955627,
+      "learning_rate": 5.661315703996905e-07,
+      "loss": 0.9462,
+      "step": 586
+    },
+    {
+      "epoch": 1.6840579710144927,
+      "grad_norm": 0.9584707021713257,
+      "learning_rate": 5.555020424640267e-07,
+      "loss": 0.9483,
+      "step": 587
+    },
+    {
+      "epoch": 1.6869565217391305,
+      "grad_norm": 0.8094743490219116,
+      "learning_rate": 5.449673790581611e-07,
+      "loss": 0.9564,
+      "step": 588
+    },
+    {
+      "epoch": 1.689855072463768,
+      "grad_norm": 0.886703610420227,
+      "learning_rate": 5.345278050367142e-07,
+      "loss": 1.0153,
+      "step": 589
+    },
+    {
+      "epoch": 1.692753623188406,
+      "grad_norm": 0.9125918745994568,
+      "learning_rate": 5.241835432246888e-07,
+      "loss": 0.9749,
+      "step": 590
+    },
+    {
+      "epoch": 1.6956521739130435,
+      "grad_norm": 0.8972467184066772,
+      "learning_rate": 5.139348144127237e-07,
+      "loss": 1.0084,
+      "step": 591
+    },
+    {
+      "epoch": 1.6985507246376812,
+      "grad_norm": 0.7566870450973511,
+      "learning_rate": 5.037818373523723e-07,
+      "loss": 0.9932,
+      "step": 592
+    },
+    {
+      "epoch": 1.701449275362319,
+      "grad_norm": 0.8601511716842651,
+      "learning_rate": 4.937248287514407e-07,
+      "loss": 0.9747,
+      "step": 593
+    },
+    {
+      "epoch": 1.7043478260869565,
+      "grad_norm": 0.8272446393966675,
+      "learning_rate": 4.837640032693558e-07,
+      "loss": 1.0065,
+      "step": 594
+    },
+    {
+      "epoch": 1.7072463768115942,
+      "grad_norm": 0.7029653191566467,
+      "learning_rate": 4.738995735125895e-07,
+      "loss": 0.9384,
+      "step": 595
+    },
+    {
+      "epoch": 1.710144927536232,
+      "grad_norm": 0.913718044757843,
+      "learning_rate": 4.641317500301173e-07,
+      "loss": 0.9563,
+      "step": 596
+    },
+    {
+      "epoch": 1.7130434782608694,
+      "grad_norm": 0.9736040830612183,
+      "learning_rate": 4.5446074130892525e-07,
+      "loss": 0.9455,
+      "step": 597
+    },
+    {
+      "epoch": 1.7159420289855074,
+      "grad_norm": 0.8182763457298279,
+      "learning_rate": 4.448867537695578e-07,
+      "loss": 0.944,
+      "step": 598
+    },
+    {
+      "epoch": 1.718840579710145,
+      "grad_norm": 0.8536428213119507,
+      "learning_rate": 4.3540999176171717e-07,
+      "loss": 0.9029,
+      "step": 599
+    },
+    {
+      "epoch": 1.7217391304347827,
+      "grad_norm": 0.8713299036026001,
+      "learning_rate": 4.2603065755989493e-07,
+      "loss": 0.9448,
+      "step": 600
+    },
+    {
+      "epoch": 1.7246376811594204,
+      "grad_norm": 0.9857087135314941,
+      "learning_rate": 4.167489513590611e-07,
+      "loss": 1.0004,
+      "step": 601
+    },
+    {
+      "epoch": 1.727536231884058,
+      "grad_norm": 0.9195379018783569,
+      "learning_rate": 4.0756507127038494e-07,
+      "loss": 1.0247,
+      "step": 602
+    },
+    {
+      "epoch": 1.7304347826086957,
+      "grad_norm": 0.8422645926475525,
+      "learning_rate": 3.984792133170129e-07,
+      "loss": 1.0087,
+      "step": 603
+    },
+    {
+      "epoch": 1.7333333333333334,
+      "grad_norm": 0.8902682662010193,
+      "learning_rate": 3.894915714298775e-07,
+      "loss": 0.8793,
+      "step": 604
+    },
+    {
+      "epoch": 1.736231884057971,
+      "grad_norm": 0.8859000205993652,
+      "learning_rate": 3.8060233744356634e-07,
+      "loss": 1.0018,
+      "step": 605
+    },
+    {
+      "epoch": 1.7391304347826086,
+      "grad_norm": 0.8340051174163818,
+      "learning_rate": 3.71811701092219e-07,
+      "loss": 0.9534,
+      "step": 606
+    },
+    {
+      "epoch": 1.7420289855072464,
+      "grad_norm": 0.8677003979682922,
+      "learning_rate": 3.6311985000548223e-07,
+      "loss": 0.9525,
+      "step": 607
+    },
+    {
+      "epoch": 1.744927536231884,
+      "grad_norm": 0.932613730430603,
+      "learning_rate": 3.5452696970450674e-07,
+      "loss": 0.9257,
+      "step": 608
+    },
+    {
+      "epoch": 1.7478260869565219,
+      "grad_norm": 0.9657606482505798,
+      "learning_rate": 3.4603324359798016e-07,
+      "loss": 1.0033,
+      "step": 609
+    },
+    {
+      "epoch": 1.7478260869565219,
+      "eval_loss": 0.9723503589630127,
+      "eval_runtime": 46.2237,
+      "eval_samples_per_second": 5.538,
+      "eval_steps_per_second": 0.692,
+      "step": 609
+    },
+    {
+      "epoch": 1.7507246376811594,
+      "grad_norm": 0.860346257686615,
+      "learning_rate": 3.3763885297822153e-07,
+      "loss": 0.986,
+      "step": 610
+    },
+    {
+      "epoch": 1.7536231884057971,
+      "grad_norm": 0.8614711165428162,
+      "learning_rate": 3.293439770173046e-07,
+      "loss": 0.9976,
+      "step": 611
+    },
+    {
+      "epoch": 1.7565217391304349,
+      "grad_norm": 0.7311533689498901,
+      "learning_rate": 3.2114879276323783e-07,
+      "loss": 0.908,
+      "step": 612
+    },
+    {
+      "epoch": 1.7594202898550724,
+      "grad_norm": 0.9412534236907959,
+      "learning_rate": 3.130534751361808e-07,
+      "loss": 0.977,
+      "step": 613
+    },
+    {
+      "epoch": 1.76231884057971,
+      "grad_norm": 0.911098062992096,
+      "learning_rate": 3.0505819692471797e-07,
+      "loss": 0.9387,
+      "step": 614
+    },
+    {
+      "epoch": 1.7652173913043478,
+      "grad_norm": 0.8363705277442932,
+      "learning_rate": 2.9716312878216194e-07,
+      "loss": 0.9538,
+      "step": 615
+    },
+    {
+      "epoch": 1.7681159420289854,
+      "grad_norm": 0.9569475650787354,
+      "learning_rate": 2.893684392229185e-07,
+      "loss": 0.998,
+      "step": 616
+    },
+    {
+      "epoch": 1.7710144927536233,
+      "grad_norm": 0.8830727338790894,
+      "learning_rate": 2.8167429461888496e-07,
+      "loss": 0.9277,
+      "step": 617
+    },
+    {
+      "epoch": 1.7739130434782608,
+      "grad_norm": 0.9968934059143066,
+      "learning_rate": 2.7408085919590265e-07,
+      "loss": 1.0167,
+      "step": 618
+    },
+    {
+      "epoch": 1.7768115942028986,
+      "grad_norm": 0.7348361611366272,
+      "learning_rate": 2.6658829503024566e-07,
+      "loss": 0.9224,
+      "step": 619
+    },
+    {
+      "epoch": 1.7797101449275363,
+      "grad_norm": 0.9676991701126099,
+      "learning_rate": 2.5919676204517073e-07,
+      "loss": 0.9808,
+      "step": 620
+    },
+    {
+      "epoch": 1.7826086956521738,
+      "grad_norm": 0.8737136125564575,
+      "learning_rate": 2.5190641800749424e-07,
+      "loss": 0.9436,
+      "step": 621
+    },
+    {
+      "epoch": 1.7855072463768116,
+      "grad_norm": 0.8523948192596436,
+      "learning_rate": 2.447174185242324e-07,
+      "loss": 0.952,
+      "step": 622
+    },
+    {
+      "epoch": 1.7884057971014493,
+      "grad_norm": 0.7342602610588074,
+      "learning_rate": 2.3762991703927375e-07,
+      "loss": 0.9682,
+      "step": 623
+    },
+    {
+      "epoch": 1.7913043478260868,
+      "grad_norm": 1.044270634651184,
+      "learning_rate": 2.3064406483010947e-07,
+      "loss": 0.9725,
+      "step": 624
+    },
+    {
+      "epoch": 1.7942028985507248,
+      "grad_norm": 0.9236974120140076,
+      "learning_rate": 2.237600110046001e-07,
+      "loss": 0.951,
+      "step": 625
+    },
+    {
+      "epoch": 1.7971014492753623,
+      "grad_norm": 0.7988727688789368,
+      "learning_rate": 2.1697790249779638e-07,
+      "loss": 0.8851,
+      "step": 626
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.7906875014305115,
+      "learning_rate": 2.102978840687997e-07,
+      "loss": 0.9162,
+      "step": 627
+    },
+    {
+      "epoch": 1.8028985507246378,
+      "grad_norm": 0.7702775001525879,
+      "learning_rate": 2.0372009829767558e-07,
+      "loss": 0.9614,
+      "step": 628
+    },
+    {
+      "epoch": 1.8057971014492753,
+      "grad_norm": 0.9317652583122253,
+      "learning_rate": 1.9724468558240838e-07,
+      "loss": 0.9105,
+      "step": 629
+    },
+    {
+      "epoch": 1.808695652173913,
+      "grad_norm": 0.855368435382843,
+      "learning_rate": 1.908717841359048e-07,
+      "loss": 1.0019,
+      "step": 630
+    },
+    {
+      "epoch": 1.8115942028985508,
+      "grad_norm": 0.761951744556427,
+      "learning_rate": 1.8460152998304393e-07,
+      "loss": 0.9267,
+      "step": 631
+    },
+    {
+      "epoch": 1.8144927536231883,
+      "grad_norm": 0.8468912839889526,
+      "learning_rate": 1.7843405695777582e-07,
+      "loss": 1.0065,
+      "step": 632
+    },
+    {
+      "epoch": 1.8173913043478263,
+      "grad_norm": 0.889159619808197,
+      "learning_rate": 1.7236949670026037e-07,
+      "loss": 0.9332,
+      "step": 633
+    },
+    {
+      "epoch": 1.8202898550724638,
+      "grad_norm": 0.8339653015136719,
+      "learning_rate": 1.664079786540629e-07,
+      "loss": 0.9851,
+      "step": 634
+    },
+    {
+      "epoch": 1.8231884057971013,
+      "grad_norm": 0.7670577764511108,
+      "learning_rate": 1.6054963006338742e-07,
+      "loss": 0.9354,
+      "step": 635
+    },
+    {
+      "epoch": 1.8260869565217392,
+      "grad_norm": 0.8923590183258057,
+      "learning_rate": 1.547945759703623e-07,
+      "loss": 1.0162,
+      "step": 636
+    },
+    {
+      "epoch": 1.8289855072463768,
+      "grad_norm": 0.7903847098350525,
+      "learning_rate": 1.491429392123711e-07,
+      "loss": 0.979,
+      "step": 637
+    },
+    {
+      "epoch": 1.8318840579710145,
+      "grad_norm": 0.9351047873497009,
+      "learning_rate": 1.435948404194304e-07,
+      "loss": 0.9458,
+      "step": 638
+    },
+    {
+      "epoch": 1.8347826086956522,
+      "grad_norm": 0.8081286549568176,
+      "learning_rate": 1.3815039801161723e-07,
+      "loss": 0.9246,
+      "step": 639
+    },
+    {
+      "epoch": 1.8376811594202898,
+      "grad_norm": 0.752216100692749,
+      "learning_rate": 1.328097281965357e-07,
+      "loss": 0.9758,
+      "step": 640
+    },
+    {
+      "epoch": 1.8405797101449275,
+      "grad_norm": 0.9659929871559143,
+      "learning_rate": 1.2757294496684447e-07,
+      "loss": 1.0107,
+      "step": 641
+    },
+    {
+      "epoch": 1.8434782608695652,
+      "grad_norm": 1.0376217365264893,
+      "learning_rate": 1.22440160097817e-07,
+      "loss": 0.9631,
+      "step": 642
+    },
+    {
+      "epoch": 1.8463768115942027,
+      "grad_norm": 0.9361832141876221,
+      "learning_rate": 1.1741148314495965e-07,
+      "loss": 0.9867,
+      "step": 643
+    },
+    {
+      "epoch": 1.8492753623188407,
+      "grad_norm": 0.8664498329162598,
+      "learning_rate": 1.1248702144167123e-07,
+      "loss": 0.9703,
+      "step": 644
+    },
+    {
+      "epoch": 1.8521739130434782,
+      "grad_norm": 0.9653159379959106,
+      "learning_rate": 1.0766688009695548e-07,
+      "loss": 0.9662,
+      "step": 645
+    },
+    {
+      "epoch": 1.855072463768116,
+      "grad_norm": 1.0553069114685059,
+      "learning_rate": 1.0295116199317057e-07,
+      "loss": 0.9745,
+      "step": 646
+    },
+    {
+      "epoch": 1.8579710144927537,
+      "grad_norm": 0.9453853964805603,
+      "learning_rate": 9.833996778384259e-08,
+      "loss": 0.9802,
+      "step": 647
+    },
+    {
+      "epoch": 1.8608695652173912,
+      "grad_norm": 0.7949392795562744,
+      "learning_rate": 9.383339589150776e-08,
+      "loss": 0.9173,
+      "step": 648
+    },
+    {
+      "epoch": 1.863768115942029,
+      "grad_norm": 0.7941511273384094,
+      "learning_rate": 8.943154250562025e-08,
+      "loss": 0.9633,
+      "step": 649
+    },
+    {
+      "epoch": 1.8666666666666667,
+      "grad_norm": 0.8360518217086792,
+      "learning_rate": 8.513450158049109e-08,
+      "loss": 0.9565,
+      "step": 650
+    },
+    {
+      "epoch": 1.8695652173913042,
+      "grad_norm": 0.9996237754821777,
+      "learning_rate": 8.094236483329022e-08,
+      "loss": 0.9999,
+      "step": 651
+    },
+    {
+      "epoch": 1.8724637681159422,
+      "grad_norm": 0.7493065595626831,
+      "learning_rate": 7.685522174208205e-08,
+      "loss": 0.9733,
+      "step": 652
+    },
+    {
+      "epoch": 1.8753623188405797,
+      "grad_norm": 0.8603729605674744,
+      "learning_rate": 7.287315954392137e-08,
+      "loss": 0.9624,
+      "step": 653
+    },
+    {
+      "epoch": 1.8782608695652174,
+      "grad_norm": 0.7145766615867615,
+      "learning_rate": 6.899626323298714e-08,
+      "loss": 1.0049,
+      "step": 654
+    },
+    {
+      "epoch": 1.8811594202898552,
+      "grad_norm": 0.9684036374092102,
+      "learning_rate": 6.522461555877213e-08,
+      "loss": 0.9562,
+      "step": 655
+    },
+    {
+      "epoch": 1.8840579710144927,
+      "grad_norm": 0.8989734053611755,
+      "learning_rate": 6.15582970243117e-08,
+      "loss": 1.0268,
+      "step": 656
+    },
+    {
+      "epoch": 1.8869565217391304,
+      "grad_norm": 0.9243214726448059,
+      "learning_rate": 5.799738588447068e-08,
+      "loss": 0.9643,
+      "step": 657
+    },
+    {
+      "epoch": 1.8898550724637682,
+      "grad_norm": 0.9879785776138306,
+      "learning_rate": 5.454195814427021e-08,
+      "loss": 0.9417,
+      "step": 658
+    },
+    {
+      "epoch": 1.8927536231884057,
+      "grad_norm": 0.9754204154014587,
+      "learning_rate": 5.119208755726579e-08,
+      "loss": 1.063,
+      "step": 659
+    },
+    {
+      "epoch": 1.8956521739130436,
+      "grad_norm": 0.7662235498428345,
+      "learning_rate": 4.794784562397459e-08,
+      "loss": 0.9799,
+      "step": 660
+    },
+    {
+      "epoch": 1.8985507246376812,
+      "grad_norm": 0.8312128782272339,
+      "learning_rate": 4.4809301590345576e-08,
+      "loss": 0.9671,
+      "step": 661
+    },
+    {
+      "epoch": 1.901449275362319,
+      "grad_norm": 0.8354112505912781,
+      "learning_rate": 4.177652244628627e-08,
+      "loss": 0.9688,
+      "step": 662
+    },
+    {
+      "epoch": 1.9043478260869566,
+      "grad_norm": 0.9401686191558838,
+      "learning_rate": 3.884957292422997e-08,
+      "loss": 0.9989,
+      "step": 663
+    },
+    {
+      "epoch": 1.9072463768115941,
+      "grad_norm": 0.8864877820014954,
+      "learning_rate": 3.602851549775521e-08,
+      "loss": 1.0094,
+      "step": 664
+    },
+    {
+      "epoch": 1.9101449275362319,
+      "grad_norm": 0.9440781474113464,
+      "learning_rate": 3.3313410380250157e-08,
+      "loss": 0.9544,
+      "step": 665
+    },
+    {
+      "epoch": 1.9130434782608696,
+      "grad_norm": 1.0098837614059448,
+      "learning_rate": 3.0704315523631956e-08,
+      "loss": 0.9209,
+      "step": 666
+    },
+    {
+      "epoch": 1.9159420289855071,
+      "grad_norm": 0.9735342860221863,
+      "learning_rate": 2.8201286617103863e-08,
+      "loss": 1.0385,
+      "step": 667
+    },
+    {
+      "epoch": 1.9188405797101449,
+      "grad_norm": 0.9122427105903625,
+      "learning_rate": 2.5804377085972278e-08,
+      "loss": 0.9844,
+      "step": 668
+    },
+    {
+      "epoch": 1.9217391304347826,
+      "grad_norm": 0.8491829633712769,
+      "learning_rate": 2.351363809050211e-08,
+      "loss": 1.0045,
+      "step": 669
+    },
+    {
+      "epoch": 1.9246376811594201,
+      "grad_norm": 0.83339524269104,
+      "learning_rate": 2.1329118524827662e-08,
+      "loss": 0.9844,
+      "step": 670
+    },
+    {
+      "epoch": 1.927536231884058,
+      "grad_norm": 0.9295774102210999,
+      "learning_rate": 1.9250865015906784e-08,
+      "loss": 1.0247,
+      "step": 671
+    },
+    {
+      "epoch": 1.9304347826086956,
+      "grad_norm": 0.8484298586845398,
+      "learning_rate": 1.7278921922527224e-08,
+      "loss": 1.0195,
+      "step": 672
+    },
+    {
+      "epoch": 1.9333333333333333,
+      "grad_norm": 0.8862564563751221,
+      "learning_rate": 1.541333133436018e-08,
+      "loss": 0.9827,
+      "step": 673
+    },
+    {
+      "epoch": 1.936231884057971,
+      "grad_norm": 0.8401779532432556,
+      "learning_rate": 1.3654133071059894e-08,
+      "loss": 1.0295,
+      "step": 674
+    },
+    {
+      "epoch": 1.9391304347826086,
+      "grad_norm": 0.8818807005882263,
+      "learning_rate": 1.200136468141544e-08,
+      "loss": 0.9554,
+      "step": 675
+    },
+    {
+      "epoch": 1.9420289855072463,
+      "grad_norm": 0.8366807699203491,
+      "learning_rate": 1.0455061442548597e-08,
+      "loss": 0.9771,
+      "step": 676
+    },
+    {
+      "epoch": 1.944927536231884,
+      "grad_norm": 0.8115973472595215,
+      "learning_rate": 9.015256359161118e-09,
+      "loss": 1.0364,
+      "step": 677
+    },
+    {
+      "epoch": 1.9478260869565216,
+      "grad_norm": 0.925413191318512,
+      "learning_rate": 7.681980162830283e-09,
+      "loss": 1.0026,
+      "step": 678
+    },
+    {
+      "epoch": 1.9507246376811596,
+      "grad_norm": 0.8799839615821838,
+      "learning_rate": 6.455261311352767e-09,
+      "loss": 1.0164,
+      "step": 679
+    },
+    {
+      "epoch": 1.953623188405797,
+      "grad_norm": 0.8579555153846741,
+      "learning_rate": 5.3351259881379016e-09,
+      "loss": 0.9775,
+      "step": 680
+    },
+    {
+      "epoch": 1.9565217391304348,
+      "grad_norm": 0.8572901487350464,
+      "learning_rate": 4.321598101647007e-09,
+      "loss": 0.9926,
+      "step": 681
+    },
+    {
+      "epoch": 1.9594202898550726,
+      "grad_norm": 0.7731289863586426,
+      "learning_rate": 3.41469928488547e-09,
+      "loss": 1.0126,
+      "step": 682
+    },
+    {
+      "epoch": 1.96231884057971,
+      "grad_norm": 0.937656581401825,
+      "learning_rate": 2.6144488949392253e-09,
+      "loss": 0.9443,
+      "step": 683
+    },
+    {
+      "epoch": 1.9652173913043478,
+      "grad_norm": 0.8993798494338989,
+      "learning_rate": 1.9208640125628618e-09,
+      "loss": 0.946,
+      "step": 684
+    },
+    {
+      "epoch": 1.9681159420289855,
+      "grad_norm": 0.9831903576850891,
+      "learning_rate": 1.3339594418138036e-09,
+      "loss": 0.9799,
+      "step": 685
+    },
+    {
+      "epoch": 1.971014492753623,
+      "grad_norm": 0.9224021434783936,
+      "learning_rate": 8.537477097364522e-10,
+      "loss": 0.9299,
+      "step": 686
+    },
+    {
+      "epoch": 1.973913043478261,
+      "grad_norm": 0.8220890760421753,
+      "learning_rate": 4.802390660968437e-10,
+      "loss": 1.0307,
+      "step": 687
+    },
+    {
+      "epoch": 1.9768115942028985,
+      "grad_norm": 1.0893397331237793,
+      "learning_rate": 2.1344148316060352e-10,
+      "loss": 0.9523,
+      "step": 688
+    },
+    {
+      "epoch": 1.9797101449275363,
+      "grad_norm": 0.8536267280578613,
+      "learning_rate": 5.336065552641323e-11,
+      "loss": 0.9675,
+      "step": 689
+    },
+    {
+      "epoch": 1.982608695652174,
+      "grad_norm": 0.8123190999031067,
+      "learning_rate": 0.0,
+      "loss": 0.9576,
+      "step": 690
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 690,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 173,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.816855525560156e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}