{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.3069888961463096, "eval_steps": 42, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002612671456564337, "eval_loss": 11.931962966918945, "eval_runtime": 3.324, "eval_samples_per_second": 775.568, "eval_steps_per_second": 24.368, "step": 1 }, { "epoch": 0.007838014369693011, "grad_norm": 0.01832103729248047, "learning_rate": 3e-05, "loss": 11.9321, "step": 3 }, { "epoch": 0.015676028739386023, "grad_norm": 0.022594580426812172, "learning_rate": 6e-05, "loss": 11.9319, "step": 6 }, { "epoch": 0.023514043109079032, "grad_norm": 0.01913524605333805, "learning_rate": 9e-05, "loss": 11.9322, "step": 9 }, { "epoch": 0.031352057478772045, "grad_norm": 0.024161087349057198, "learning_rate": 9.999588943391597e-05, "loss": 11.9314, "step": 12 }, { "epoch": 0.039190071848465055, "grad_norm": 0.02622823789715767, "learning_rate": 9.99743108100344e-05, "loss": 11.9308, "step": 15 }, { "epoch": 0.047028086218158065, "grad_norm": 0.032415881752967834, "learning_rate": 9.993424445916923e-05, "loss": 11.9309, "step": 18 }, { "epoch": 0.054866100587851074, "grad_norm": 0.04588017985224724, "learning_rate": 9.987570520365104e-05, "loss": 11.9306, "step": 21 }, { "epoch": 0.06270411495754409, "grad_norm": 0.045079998672008514, "learning_rate": 9.979871469976196e-05, "loss": 11.9295, "step": 24 }, { "epoch": 0.0705421293272371, "grad_norm": 0.06812106817960739, "learning_rate": 9.970330142972401e-05, "loss": 11.9288, "step": 27 }, { "epoch": 0.07838014369693011, "grad_norm": 0.07375520467758179, "learning_rate": 9.95895006911623e-05, "loss": 11.9281, "step": 30 }, { "epoch": 0.08621815806662313, "grad_norm": 0.08190234750509262, "learning_rate": 9.945735458404681e-05, "loss": 11.9272, "step": 33 }, { "epoch": 0.09405617243631613, "grad_norm": 0.08239512145519257, "learning_rate": 9.930691199511775e-05, "loss": 11.9254, "step": 36 }, { "epoch": 0.10189418680600915, "grad_norm": 0.07339413464069366, "learning_rate": 9.91382285798002e-05, "loss": 11.9242, "step": 39 }, { "epoch": 0.10973220117570215, "grad_norm": 0.0514436773955822, "learning_rate": 9.895136674161465e-05, "loss": 11.9244, "step": 42 }, { "epoch": 0.10973220117570215, "eval_loss": 11.923332214355469, "eval_runtime": 3.3299, "eval_samples_per_second": 774.195, "eval_steps_per_second": 24.325, "step": 42 }, { "epoch": 0.11757021554539517, "grad_norm": 0.03980684280395508, "learning_rate": 9.874639560909117e-05, "loss": 11.923, "step": 45 }, { "epoch": 0.12540822991508818, "grad_norm": 0.030828900635242462, "learning_rate": 9.852339101019574e-05, "loss": 11.9232, "step": 48 }, { "epoch": 0.13324624428478118, "grad_norm": 0.033988162875175476, "learning_rate": 9.828243544427796e-05, "loss": 11.922, "step": 51 }, { "epoch": 0.1410842586544742, "grad_norm": 0.03058473765850067, "learning_rate": 9.802361805155097e-05, "loss": 11.9221, "step": 54 }, { "epoch": 0.14892227302416722, "grad_norm": 0.02629922330379486, "learning_rate": 9.774703458011453e-05, "loss": 11.9221, "step": 57 }, { "epoch": 0.15676028739386022, "grad_norm": 0.029390348121523857, "learning_rate": 9.745278735053343e-05, "loss": 11.9205, "step": 60 }, { "epoch": 0.16459830176355322, "grad_norm": 0.02337743528187275, "learning_rate": 9.714098521798465e-05, "loss": 11.9216, "step": 63 }, { "epoch": 0.17243631613324625, "grad_norm": 0.017651265487074852, "learning_rate": 9.681174353198687e-05, "loss": 11.9212, "step": 66 }, { "epoch": 0.18027433050293926, "grad_norm": 0.018284769728779793, "learning_rate": 9.64651840937276e-05, "loss": 11.921, "step": 69 }, { "epoch": 0.18811234487263226, "grad_norm": 0.014632761478424072, "learning_rate": 9.610143511100354e-05, "loss": 11.9212, "step": 72 }, { "epoch": 0.1959503592423253, "grad_norm": 0.020750127732753754, "learning_rate": 9.572063115079063e-05, "loss": 11.9209, "step": 75 }, { "epoch": 0.2037883736120183, "grad_norm": 0.012179220095276833, "learning_rate": 9.53229130894619e-05, "loss": 11.921, "step": 78 }, { "epoch": 0.2116263879817113, "grad_norm": 0.014039278961718082, "learning_rate": 9.490842806067095e-05, "loss": 11.921, "step": 81 }, { "epoch": 0.2194644023514043, "grad_norm": 0.015738265588879585, "learning_rate": 9.44773294009206e-05, "loss": 11.9204, "step": 84 }, { "epoch": 0.2194644023514043, "eval_loss": 11.920842170715332, "eval_runtime": 3.3087, "eval_samples_per_second": 779.15, "eval_steps_per_second": 24.481, "step": 84 }, { "epoch": 0.22730241672109733, "grad_norm": 0.017396893352270126, "learning_rate": 9.40297765928369e-05, "loss": 11.92, "step": 87 }, { "epoch": 0.23514043109079033, "grad_norm": 0.01054318156093359, "learning_rate": 9.356593520616948e-05, "loss": 11.9203, "step": 90 }, { "epoch": 0.24297844546048333, "grad_norm": 0.016479285433888435, "learning_rate": 9.308597683653975e-05, "loss": 11.9206, "step": 93 }, { "epoch": 0.25081645983017636, "grad_norm": 0.01747283898293972, "learning_rate": 9.259007904196023e-05, "loss": 11.9208, "step": 96 }, { "epoch": 0.25865447419986937, "grad_norm": 0.016894422471523285, "learning_rate": 9.207842527714767e-05, "loss": 11.9204, "step": 99 }, { "epoch": 0.26649248856956237, "grad_norm": 0.015183241106569767, "learning_rate": 9.155120482565521e-05, "loss": 11.9209, "step": 102 }, { "epoch": 0.27433050293925537, "grad_norm": 0.014355632476508617, "learning_rate": 9.10086127298478e-05, "loss": 11.9201, "step": 105 }, { "epoch": 0.2821685173089484, "grad_norm": 0.022136807441711426, "learning_rate": 9.045084971874738e-05, "loss": 11.9204, "step": 108 }, { "epoch": 0.29000653167864143, "grad_norm": 0.02453417330980301, "learning_rate": 8.987812213377424e-05, "loss": 11.9206, "step": 111 }, { "epoch": 0.29784454604833444, "grad_norm": 0.022463472560048103, "learning_rate": 8.929064185241213e-05, "loss": 11.9194, "step": 114 }, { "epoch": 0.30568256041802744, "grad_norm": 0.020201250910758972, "learning_rate": 8.868862620982534e-05, "loss": 11.9192, "step": 117 }, { "epoch": 0.31352057478772044, "grad_norm": 0.023930294439196587, "learning_rate": 8.807229791845673e-05, "loss": 11.92, "step": 120 }, { "epoch": 0.32135858915741344, "grad_norm": 0.01951843872666359, "learning_rate": 8.744188498563641e-05, "loss": 11.9196, "step": 123 }, { "epoch": 0.32919660352710645, "grad_norm": 0.013068539090454578, "learning_rate": 8.679762062923175e-05, "loss": 11.9193, "step": 126 }, { "epoch": 0.32919660352710645, "eval_loss": 11.919238090515137, "eval_runtime": 3.3246, "eval_samples_per_second": 775.442, "eval_steps_per_second": 24.364, "step": 126 }, { "epoch": 0.3370346178967995, "grad_norm": 0.015373232774436474, "learning_rate": 8.613974319136958e-05, "loss": 11.9189, "step": 129 }, { "epoch": 0.3448726322664925, "grad_norm": 0.018870707601308823, "learning_rate": 8.54684960502629e-05, "loss": 11.9199, "step": 132 }, { "epoch": 0.3527106466361855, "grad_norm": 0.019645841792225838, "learning_rate": 8.478412753017433e-05, "loss": 11.9192, "step": 135 }, { "epoch": 0.3605486610058785, "grad_norm": 0.012606288306415081, "learning_rate": 8.408689080954998e-05, "loss": 11.9192, "step": 138 }, { "epoch": 0.3683866753755715, "grad_norm": 0.02213534526526928, "learning_rate": 8.33770438273574e-05, "loss": 11.9192, "step": 141 }, { "epoch": 0.3762246897452645, "grad_norm": 0.025056803598999977, "learning_rate": 8.265484918766243e-05, "loss": 11.9194, "step": 144 }, { "epoch": 0.3840627041149575, "grad_norm": 0.017324019223451614, "learning_rate": 8.192057406248028e-05, "loss": 11.919, "step": 147 }, { "epoch": 0.3919007184846506, "grad_norm": 0.016662944108247757, "learning_rate": 8.117449009293668e-05, "loss": 11.9196, "step": 150 }, { "epoch": 0.3997387328543436, "grad_norm": 0.01738261617720127, "learning_rate": 8.041687328877567e-05, "loss": 11.9183, "step": 153 }, { "epoch": 0.4075767472240366, "grad_norm": 0.014731982722878456, "learning_rate": 7.964800392625129e-05, "loss": 11.9195, "step": 156 }, { "epoch": 0.4154147615937296, "grad_norm": 0.019330400973558426, "learning_rate": 7.886816644444098e-05, "loss": 11.9189, "step": 159 }, { "epoch": 0.4232527759634226, "grad_norm": 0.01512803602963686, "learning_rate": 7.807764934001874e-05, "loss": 11.9189, "step": 162 }, { "epoch": 0.4310907903331156, "grad_norm": 0.017275972291827202, "learning_rate": 7.727674506052743e-05, "loss": 11.9183, "step": 165 }, { "epoch": 0.4389288047028086, "grad_norm": 0.016290944069623947, "learning_rate": 7.646574989618938e-05, "loss": 11.919, "step": 168 }, { "epoch": 0.4389288047028086, "eval_loss": 11.918442726135254, "eval_runtime": 3.3195, "eval_samples_per_second": 776.625, "eval_steps_per_second": 24.401, "step": 168 }, { "epoch": 0.44676681907250165, "grad_norm": 0.019369367510080338, "learning_rate": 7.564496387029532e-05, "loss": 11.9192, "step": 171 }, { "epoch": 0.45460483344219466, "grad_norm": 0.012970932759344578, "learning_rate": 7.481469062821252e-05, "loss": 11.9188, "step": 174 }, { "epoch": 0.46244284781188766, "grad_norm": 0.012391473166644573, "learning_rate": 7.39752373250527e-05, "loss": 11.919, "step": 177 }, { "epoch": 0.47028086218158066, "grad_norm": 0.014110087417066097, "learning_rate": 7.312691451204178e-05, "loss": 11.9184, "step": 180 }, { "epoch": 0.47811887655127366, "grad_norm": 0.016782281920313835, "learning_rate": 7.227003602163295e-05, "loss": 11.9177, "step": 183 }, { "epoch": 0.48595689092096667, "grad_norm": 0.013867055997252464, "learning_rate": 7.14049188514063e-05, "loss": 11.9188, "step": 186 }, { "epoch": 0.4937949052906597, "grad_norm": 0.022281071171164513, "learning_rate": 7.05318830467969e-05, "loss": 11.9179, "step": 189 }, { "epoch": 0.5016329196603527, "grad_norm": 0.014964847825467587, "learning_rate": 6.965125158269619e-05, "loss": 11.9187, "step": 192 }, { "epoch": 0.5094709340300457, "grad_norm": 0.012438995763659477, "learning_rate": 6.876335024396872e-05, "loss": 11.9191, "step": 195 }, { "epoch": 0.5173089483997387, "grad_norm": 0.016654090955853462, "learning_rate": 6.786850750493006e-05, "loss": 11.918, "step": 198 }, { "epoch": 0.5251469627694317, "grad_norm": 0.019343817606568336, "learning_rate": 6.696705440782938e-05, "loss": 11.9178, "step": 201 }, { "epoch": 0.5329849771391247, "grad_norm": 0.017074916511774063, "learning_rate": 6.605932444038229e-05, "loss": 11.9185, "step": 204 }, { "epoch": 0.5408229915088177, "grad_norm": 0.016646448522806168, "learning_rate": 6.514565341239861e-05, "loss": 11.918, "step": 207 }, { "epoch": 0.5486610058785107, "grad_norm": 0.013506593182682991, "learning_rate": 6.422637933155162e-05, "loss": 11.9185, "step": 210 }, { "epoch": 0.5486610058785107, "eval_loss": 11.917922019958496, "eval_runtime": 3.3182, "eval_samples_per_second": 776.922, "eval_steps_per_second": 24.411, "step": 210 }, { "epoch": 0.5564990202482037, "grad_norm": 0.01849060133099556, "learning_rate": 6.330184227833376e-05, "loss": 11.918, "step": 213 }, { "epoch": 0.5643370346178967, "grad_norm": 0.020746391266584396, "learning_rate": 6.237238428024572e-05, "loss": 11.9173, "step": 216 }, { "epoch": 0.5721750489875899, "grad_norm": 0.012400954961776733, "learning_rate": 6.143834918526527e-05, "loss": 11.9179, "step": 219 }, { "epoch": 0.5800130633572829, "grad_norm": 0.019317157566547394, "learning_rate": 6.0500082534642464e-05, "loss": 11.9188, "step": 222 }, { "epoch": 0.5878510777269759, "grad_norm": 0.01525891199707985, "learning_rate": 5.955793143506863e-05, "loss": 11.9184, "step": 225 }, { "epoch": 0.5956890920966689, "grad_norm": 0.013121162541210651, "learning_rate": 5.861224443026595e-05, "loss": 11.9178, "step": 228 }, { "epoch": 0.6035271064663619, "grad_norm": 0.018763018772006035, "learning_rate": 5.766337137204579e-05, "loss": 11.9179, "step": 231 }, { "epoch": 0.6113651208360549, "grad_norm": 0.018400780856609344, "learning_rate": 5.6711663290882776e-05, "loss": 11.9185, "step": 234 }, { "epoch": 0.6192031352057479, "grad_norm": 0.01755247637629509, "learning_rate": 5.575747226605298e-05, "loss": 11.918, "step": 237 }, { "epoch": 0.6270411495754409, "grad_norm": 0.013365167193114758, "learning_rate": 5.480115129538409e-05, "loss": 11.9172, "step": 240 }, { "epoch": 0.6348791639451339, "grad_norm": 0.013805567286908627, "learning_rate": 5.384305416466584e-05, "loss": 11.9173, "step": 243 }, { "epoch": 0.6427171783148269, "grad_norm": 0.013149751350283623, "learning_rate": 5.288353531676873e-05, "loss": 11.9184, "step": 246 }, { "epoch": 0.6505551926845199, "grad_norm": 0.015995411202311516, "learning_rate": 5.192294972051992e-05, "loss": 11.9183, "step": 249 }, { "epoch": 0.6583932070542129, "grad_norm": 0.014820579439401627, "learning_rate": 5.0961652739384356e-05, "loss": 11.9175, "step": 252 }, { "epoch": 0.6583932070542129, "eval_loss": 11.917555809020996, "eval_runtime": 3.3162, "eval_samples_per_second": 777.394, "eval_steps_per_second": 24.425, "step": 252 }, { "epoch": 0.6662312214239059, "grad_norm": 0.01651047170162201, "learning_rate": 5e-05, "loss": 11.9174, "step": 255 }, { "epoch": 0.674069235793599, "grad_norm": 0.016767192631959915, "learning_rate": 4.903834726061565e-05, "loss": 11.9184, "step": 258 }, { "epoch": 0.681907250163292, "grad_norm": 0.01721210964024067, "learning_rate": 4.807705027948008e-05, "loss": 11.9177, "step": 261 }, { "epoch": 0.689745264532985, "grad_norm": 0.0172222089022398, "learning_rate": 4.711646468323129e-05, "loss": 11.9178, "step": 264 }, { "epoch": 0.697583278902678, "grad_norm": 0.016364743933081627, "learning_rate": 4.6156945835334184e-05, "loss": 11.9181, "step": 267 }, { "epoch": 0.705421293272371, "grad_norm": 0.016128098592162132, "learning_rate": 4.5198848704615914e-05, "loss": 11.9178, "step": 270 }, { "epoch": 0.713259307642064, "grad_norm": 0.015332392416894436, "learning_rate": 4.424252773394704e-05, "loss": 11.9177, "step": 273 }, { "epoch": 0.721097322011757, "grad_norm": 0.01631457917392254, "learning_rate": 4.328833670911724e-05, "loss": 11.9181, "step": 276 }, { "epoch": 0.72893533638145, "grad_norm": 0.018029581755399704, "learning_rate": 4.23366286279542e-05, "loss": 11.9173, "step": 279 }, { "epoch": 0.736773350751143, "grad_norm": 0.016118017956614494, "learning_rate": 4.138775556973406e-05, "loss": 11.9175, "step": 282 }, { "epoch": 0.744611365120836, "grad_norm": 0.0235885176807642, "learning_rate": 4.04420685649314e-05, "loss": 11.9174, "step": 285 }, { "epoch": 0.752449379490529, "grad_norm": 0.02053360641002655, "learning_rate": 3.9499917465357534e-05, "loss": 11.9178, "step": 288 }, { "epoch": 0.760287393860222, "grad_norm": 0.018186945468187332, "learning_rate": 3.856165081473474e-05, "loss": 11.9181, "step": 291 }, { "epoch": 0.768125408229915, "grad_norm": 0.029430339112877846, "learning_rate": 3.762761571975429e-05, "loss": 11.9174, "step": 294 }, { "epoch": 0.768125408229915, "eval_loss": 11.917236328125, "eval_runtime": 3.3236, "eval_samples_per_second": 775.653, "eval_steps_per_second": 24.371, "step": 294 }, { "epoch": 0.7759634225996082, "grad_norm": 0.03046645224094391, "learning_rate": 3.6698157721666246e-05, "loss": 11.917, "step": 297 }, { "epoch": 0.7838014369693012, "grad_norm": 0.022532852366566658, "learning_rate": 3.5773620668448384e-05, "loss": 11.9169, "step": 300 }, { "epoch": 0.7916394513389942, "grad_norm": 0.019215036183595657, "learning_rate": 3.48543465876014e-05, "loss": 11.9183, "step": 303 }, { "epoch": 0.7994774657086872, "grad_norm": 0.02424040250480175, "learning_rate": 3.3940675559617724e-05, "loss": 11.9171, "step": 306 }, { "epoch": 0.8073154800783802, "grad_norm": 0.013650625012814999, "learning_rate": 3.303294559217063e-05, "loss": 11.9176, "step": 309 }, { "epoch": 0.8151534944480732, "grad_norm": 0.01347661204636097, "learning_rate": 3.213149249506997e-05, "loss": 11.9171, "step": 312 }, { "epoch": 0.8229915088177662, "grad_norm": 0.02356456220149994, "learning_rate": 3.12366497560313e-05, "loss": 11.9175, "step": 315 }, { "epoch": 0.8308295231874592, "grad_norm": 0.01948046311736107, "learning_rate": 3.0348748417303823e-05, "loss": 11.918, "step": 318 }, { "epoch": 0.8386675375571522, "grad_norm": 0.015809211879968643, "learning_rate": 2.9468116953203107e-05, "loss": 11.9178, "step": 321 }, { "epoch": 0.8465055519268452, "grad_norm": 0.01481384877115488, "learning_rate": 2.8595081148593738e-05, "loss": 11.9178, "step": 324 }, { "epoch": 0.8543435662965382, "grad_norm": 0.02252669259905815, "learning_rate": 2.772996397836704e-05, "loss": 11.9174, "step": 327 }, { "epoch": 0.8621815806662312, "grad_norm": 0.017606221139431, "learning_rate": 2.687308548795825e-05, "loss": 11.9176, "step": 330 }, { "epoch": 0.8700195950359242, "grad_norm": 0.024705080315470695, "learning_rate": 2.6024762674947313e-05, "loss": 11.9166, "step": 333 }, { "epoch": 0.8778576094056172, "grad_norm": 0.024166177958250046, "learning_rate": 2.5185309371787513e-05, "loss": 11.9176, "step": 336 }, { "epoch": 0.8778576094056172, "eval_loss": 11.916953086853027, "eval_runtime": 3.3243, "eval_samples_per_second": 775.492, "eval_steps_per_second": 24.366, "step": 336 }, { "epoch": 0.8856956237753103, "grad_norm": 0.01870771311223507, "learning_rate": 2.43550361297047e-05, "loss": 11.9178, "step": 339 }, { "epoch": 0.8935336381450033, "grad_norm": 0.014654111117124557, "learning_rate": 2.353425010381063e-05, "loss": 11.9177, "step": 342 }, { "epoch": 0.9013716525146963, "grad_norm": 0.01838817447423935, "learning_rate": 2.272325493947257e-05, "loss": 11.9171, "step": 345 }, { "epoch": 0.9092096668843893, "grad_norm": 0.021683456376194954, "learning_rate": 2.192235065998126e-05, "loss": 11.9179, "step": 348 }, { "epoch": 0.9170476812540823, "grad_norm": 0.02190260961651802, "learning_rate": 2.1131833555559037e-05, "loss": 11.917, "step": 351 }, { "epoch": 0.9248856956237753, "grad_norm": 0.014892240054905415, "learning_rate": 2.0351996073748713e-05, "loss": 11.917, "step": 354 }, { "epoch": 0.9327237099934683, "grad_norm": 0.020150186493992805, "learning_rate": 1.9583126711224343e-05, "loss": 11.9175, "step": 357 }, { "epoch": 0.9405617243631613, "grad_norm": 0.01918022148311138, "learning_rate": 1.8825509907063327e-05, "loss": 11.9172, "step": 360 }, { "epoch": 0.9483997387328543, "grad_norm": 0.020093288272619247, "learning_rate": 1.807942593751973e-05, "loss": 11.9177, "step": 363 }, { "epoch": 0.9562377531025473, "grad_norm": 0.015267434529960155, "learning_rate": 1.7345150812337564e-05, "loss": 11.9167, "step": 366 }, { "epoch": 0.9640757674722403, "grad_norm": 0.01452693808823824, "learning_rate": 1.66229561726426e-05, "loss": 11.9178, "step": 369 }, { "epoch": 0.9719137818419333, "grad_norm": 0.01645076647400856, "learning_rate": 1.5913109190450032e-05, "loss": 11.9171, "step": 372 }, { "epoch": 0.9797517962116263, "grad_norm": 0.02012869343161583, "learning_rate": 1.5215872469825682e-05, "loss": 11.9169, "step": 375 }, { "epoch": 0.9875898105813194, "grad_norm": 0.015907544642686844, "learning_rate": 1.4531503949737108e-05, "loss": 11.9176, "step": 378 }, { "epoch": 0.9875898105813194, "eval_loss": 11.916740417480469, "eval_runtime": 3.3252, "eval_samples_per_second": 775.289, "eval_steps_per_second": 24.359, "step": 378 }, { "epoch": 0.9954278249510125, "grad_norm": 0.03160930424928665, "learning_rate": 1.3860256808630428e-05, "loss": 11.9168, "step": 381 }, { "epoch": 1.0039190071848465, "grad_norm": 0.017380647361278534, "learning_rate": 1.3202379370768252e-05, "loss": 13.741, "step": 384 }, { "epoch": 1.0117570215545395, "grad_norm": 0.0193234421312809, "learning_rate": 1.2558115014363592e-05, "loss": 12.1761, "step": 387 }, { "epoch": 1.0195950359242325, "grad_norm": 0.02974247932434082, "learning_rate": 1.1927702081543279e-05, "loss": 11.7776, "step": 390 }, { "epoch": 1.0274330502939255, "grad_norm": 0.014591868035495281, "learning_rate": 1.1311373790174657e-05, "loss": 12.203, "step": 393 }, { "epoch": 1.0352710646636185, "grad_norm": 0.018022043630480766, "learning_rate": 1.0709358147587884e-05, "loss": 11.431, "step": 396 }, { "epoch": 1.0431090790333115, "grad_norm": 0.019987676292657852, "learning_rate": 1.0121877866225781e-05, "loss": 12.1733, "step": 399 }, { "epoch": 1.0509470934030045, "grad_norm": 0.022911233827471733, "learning_rate": 9.549150281252633e-06, "loss": 11.9096, "step": 402 }, { "epoch": 1.0587851077726975, "grad_norm": 0.019524535164237022, "learning_rate": 8.991387270152201e-06, "loss": 12.1372, "step": 405 }, { "epoch": 1.0666231221423905, "grad_norm": 0.016502438113093376, "learning_rate": 8.448795174344804e-06, "loss": 11.5679, "step": 408 }, { "epoch": 1.0744611365120835, "grad_norm": 0.015734922140836716, "learning_rate": 7.921574722852343e-06, "loss": 12.0084, "step": 411 }, { "epoch": 1.0822991508817765, "grad_norm": 0.019688883796334267, "learning_rate": 7.409920958039795e-06, "loss": 11.8272, "step": 414 }, { "epoch": 1.0901371652514695, "grad_norm": 0.016949467360973358, "learning_rate": 6.9140231634602485e-06, "loss": 12.0226, "step": 417 }, { "epoch": 1.0979751796211628, "grad_norm": 0.015972912311553955, "learning_rate": 6.43406479383053e-06, "loss": 12.0123, "step": 420 }, { "epoch": 1.0979751796211628, "eval_loss": 11.916641235351562, "eval_runtime": 3.3258, "eval_samples_per_second": 775.161, "eval_steps_per_second": 24.355, "step": 420 }, { "epoch": 1.1058131939908558, "grad_norm": 0.022812234237790108, "learning_rate": 5.9702234071631e-06, "loss": 11.7508, "step": 423 }, { "epoch": 1.1136512083605488, "grad_norm": 0.02215123549103737, "learning_rate": 5.5226705990794155e-06, "loss": 11.9357, "step": 426 }, { "epoch": 1.1214892227302418, "grad_norm": 0.018271734938025475, "learning_rate": 5.091571939329048e-06, "loss": 12.0107, "step": 429 }, { "epoch": 1.1293272370999348, "grad_norm": 0.018984654918313026, "learning_rate": 4.677086910538092e-06, "loss": 12.1886, "step": 432 }, { "epoch": 1.1371652514696278, "grad_norm": 0.018601972609758377, "learning_rate": 4.279368849209381e-06, "loss": 11.6696, "step": 435 }, { "epoch": 1.1450032658393208, "grad_norm": 0.0175609327852726, "learning_rate": 3.898564888996476e-06, "loss": 12.2369, "step": 438 }, { "epoch": 1.1528412802090138, "grad_norm": 0.018267886713147163, "learning_rate": 3.534815906272404e-06, "loss": 11.3125, "step": 441 }, { "epoch": 1.1606792945787068, "grad_norm": 0.018670443445444107, "learning_rate": 3.18825646801314e-06, "loss": 12.0095, "step": 444 }, { "epoch": 1.1685173089483998, "grad_norm": 0.01578577049076557, "learning_rate": 2.8590147820153513e-06, "loss": 11.9352, "step": 447 }, { "epoch": 1.1763553233180928, "grad_norm": 0.018620701506733894, "learning_rate": 2.547212649466568e-06, "loss": 11.819, "step": 450 }, { "epoch": 1.1841933376877858, "grad_norm": 0.020687857642769814, "learning_rate": 2.2529654198854835e-06, "loss": 12.2251, "step": 453 }, { "epoch": 1.1920313520574788, "grad_norm": 0.0191540215164423, "learning_rate": 1.9763819484490355e-06, "loss": 11.8286, "step": 456 }, { "epoch": 1.1998693664271718, "grad_norm": 0.017804287374019623, "learning_rate": 1.7175645557220566e-06, "loss": 11.6918, "step": 459 }, { "epoch": 1.2077073807968648, "grad_norm": 0.019628843292593956, "learning_rate": 1.4766089898042678e-06, "loss": 12.2151, "step": 462 }, { "epoch": 1.2077073807968648, "eval_loss": 11.916607856750488, "eval_runtime": 3.3257, "eval_samples_per_second": 775.165, "eval_steps_per_second": 24.355, "step": 462 }, { "epoch": 1.2155453951665578, "grad_norm": 0.013816201128065586, "learning_rate": 1.2536043909088191e-06, "loss": 11.9293, "step": 465 }, { "epoch": 1.2233834095362508, "grad_norm": 0.016839459538459778, "learning_rate": 1.0486332583853563e-06, "loss": 11.9467, "step": 468 }, { "epoch": 1.2312214239059438, "grad_norm": 0.01736452244222164, "learning_rate": 8.617714201998084e-07, "loss": 11.8521, "step": 471 }, { "epoch": 1.2390594382756368, "grad_norm": 0.025550948455929756, "learning_rate": 6.93088004882253e-07, "loss": 11.6322, "step": 474 }, { "epoch": 1.2468974526453298, "grad_norm": 0.03142261132597923, "learning_rate": 5.426454159531913e-07, "loss": 12.0707, "step": 477 }, { "epoch": 1.2547354670150228, "grad_norm": 0.014136346988379955, "learning_rate": 4.104993088376974e-07, "loss": 12.1411, "step": 480 }, { "epoch": 1.2625734813847158, "grad_norm": 0.023102182894945145, "learning_rate": 2.966985702759828e-07, "loss": 11.8335, "step": 483 }, { "epoch": 1.2704114957544088, "grad_norm": 0.020949246361851692, "learning_rate": 2.012853002380466e-07, "loss": 11.8762, "step": 486 }, { "epoch": 1.2782495101241018, "grad_norm": 0.013338472694158554, "learning_rate": 1.2429479634897267e-07, "loss": 11.7727, "step": 489 }, { "epoch": 1.2860875244937948, "grad_norm": 0.022523999214172363, "learning_rate": 6.575554083078084e-08, "loss": 12.2013, "step": 492 }, { "epoch": 1.2939255388634878, "grad_norm": 0.016933446750044823, "learning_rate": 2.568918996560532e-08, "loss": 11.8428, "step": 495 }, { "epoch": 1.3017635532331808, "grad_norm": 0.017216265201568604, "learning_rate": 4.110566084036816e-09, "loss": 11.9056, "step": 498 } ], "logging_steps": 3, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 42, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9645271941120.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }