{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.3069888961463096,
  "eval_steps": 42,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002612671456564337,
      "eval_loss": 11.931962966918945,
      "eval_runtime": 3.324,
      "eval_samples_per_second": 775.568,
      "eval_steps_per_second": 24.368,
      "step": 1
    },
    {
      "epoch": 0.007838014369693011,
      "grad_norm": 0.01832103729248047,
      "learning_rate": 3e-05,
      "loss": 11.9321,
      "step": 3
    },
    {
      "epoch": 0.015676028739386023,
      "grad_norm": 0.022594580426812172,
      "learning_rate": 6e-05,
      "loss": 11.9319,
      "step": 6
    },
    {
      "epoch": 0.023514043109079032,
      "grad_norm": 0.01913524605333805,
      "learning_rate": 9e-05,
      "loss": 11.9322,
      "step": 9
    },
    {
      "epoch": 0.031352057478772045,
      "grad_norm": 0.024161087349057198,
      "learning_rate": 9.999588943391597e-05,
      "loss": 11.9314,
      "step": 12
    },
    {
      "epoch": 0.039190071848465055,
      "grad_norm": 0.02622823789715767,
      "learning_rate": 9.99743108100344e-05,
      "loss": 11.9308,
      "step": 15
    },
    {
      "epoch": 0.047028086218158065,
      "grad_norm": 0.032415881752967834,
      "learning_rate": 9.993424445916923e-05,
      "loss": 11.9309,
      "step": 18
    },
    {
      "epoch": 0.054866100587851074,
      "grad_norm": 0.04588017985224724,
      "learning_rate": 9.987570520365104e-05,
      "loss": 11.9306,
      "step": 21
    },
    {
      "epoch": 0.06270411495754409,
      "grad_norm": 0.045079998672008514,
      "learning_rate": 9.979871469976196e-05,
      "loss": 11.9295,
      "step": 24
    },
    {
      "epoch": 0.0705421293272371,
      "grad_norm": 0.06812106817960739,
      "learning_rate": 9.970330142972401e-05,
      "loss": 11.9288,
      "step": 27
    },
    {
      "epoch": 0.07838014369693011,
      "grad_norm": 0.07375520467758179,
      "learning_rate": 9.95895006911623e-05,
      "loss": 11.9281,
      "step": 30
    },
    {
      "epoch": 0.08621815806662313,
      "grad_norm": 0.08190234750509262,
      "learning_rate": 9.945735458404681e-05,
      "loss": 11.9272,
      "step": 33
    },
    {
      "epoch": 0.09405617243631613,
      "grad_norm": 0.08239512145519257,
      "learning_rate": 9.930691199511775e-05,
      "loss": 11.9254,
      "step": 36
    },
    {
      "epoch": 0.10189418680600915,
      "grad_norm": 0.07339413464069366,
      "learning_rate": 9.91382285798002e-05,
      "loss": 11.9242,
      "step": 39
    },
    {
      "epoch": 0.10973220117570215,
      "grad_norm": 0.0514436773955822,
      "learning_rate": 9.895136674161465e-05,
      "loss": 11.9244,
      "step": 42
    },
    {
      "epoch": 0.10973220117570215,
      "eval_loss": 11.923332214355469,
      "eval_runtime": 3.3299,
      "eval_samples_per_second": 774.195,
      "eval_steps_per_second": 24.325,
      "step": 42
    },
    {
      "epoch": 0.11757021554539517,
      "grad_norm": 0.03980684280395508,
      "learning_rate": 9.874639560909117e-05,
      "loss": 11.923,
      "step": 45
    },
    {
      "epoch": 0.12540822991508818,
      "grad_norm": 0.030828900635242462,
      "learning_rate": 9.852339101019574e-05,
      "loss": 11.9232,
      "step": 48
    },
    {
      "epoch": 0.13324624428478118,
      "grad_norm": 0.033988162875175476,
      "learning_rate": 9.828243544427796e-05,
      "loss": 11.922,
      "step": 51
    },
    {
      "epoch": 0.1410842586544742,
      "grad_norm": 0.03058473765850067,
      "learning_rate": 9.802361805155097e-05,
      "loss": 11.9221,
      "step": 54
    },
    {
      "epoch": 0.14892227302416722,
      "grad_norm": 0.02629922330379486,
      "learning_rate": 9.774703458011453e-05,
      "loss": 11.9221,
      "step": 57
    },
    {
      "epoch": 0.15676028739386022,
      "grad_norm": 0.029390348121523857,
      "learning_rate": 9.745278735053343e-05,
      "loss": 11.9205,
      "step": 60
    },
    {
      "epoch": 0.16459830176355322,
      "grad_norm": 0.02337743528187275,
      "learning_rate": 9.714098521798465e-05,
      "loss": 11.9216,
      "step": 63
    },
    {
      "epoch": 0.17243631613324625,
      "grad_norm": 0.017651265487074852,
      "learning_rate": 9.681174353198687e-05,
      "loss": 11.9212,
      "step": 66
    },
    {
      "epoch": 0.18027433050293926,
      "grad_norm": 0.018284769728779793,
      "learning_rate": 9.64651840937276e-05,
      "loss": 11.921,
      "step": 69
    },
    {
      "epoch": 0.18811234487263226,
      "grad_norm": 0.014632761478424072,
      "learning_rate": 9.610143511100354e-05,
      "loss": 11.9212,
      "step": 72
    },
    {
      "epoch": 0.1959503592423253,
      "grad_norm": 0.020750127732753754,
      "learning_rate": 9.572063115079063e-05,
      "loss": 11.9209,
      "step": 75
    },
    {
      "epoch": 0.2037883736120183,
      "grad_norm": 0.012179220095276833,
      "learning_rate": 9.53229130894619e-05,
      "loss": 11.921,
      "step": 78
    },
    {
      "epoch": 0.2116263879817113,
      "grad_norm": 0.014039278961718082,
      "learning_rate": 9.490842806067095e-05,
      "loss": 11.921,
      "step": 81
    },
    {
      "epoch": 0.2194644023514043,
      "grad_norm": 0.015738265588879585,
      "learning_rate": 9.44773294009206e-05,
      "loss": 11.9204,
      "step": 84
    },
    {
      "epoch": 0.2194644023514043,
      "eval_loss": 11.920842170715332,
      "eval_runtime": 3.3087,
      "eval_samples_per_second": 779.15,
      "eval_steps_per_second": 24.481,
      "step": 84
    },
    {
      "epoch": 0.22730241672109733,
      "grad_norm": 0.017396893352270126,
      "learning_rate": 9.40297765928369e-05,
      "loss": 11.92,
      "step": 87
    },
    {
      "epoch": 0.23514043109079033,
      "grad_norm": 0.01054318156093359,
      "learning_rate": 9.356593520616948e-05,
      "loss": 11.9203,
      "step": 90
    },
    {
      "epoch": 0.24297844546048333,
      "grad_norm": 0.016479285433888435,
      "learning_rate": 9.308597683653975e-05,
      "loss": 11.9206,
      "step": 93
    },
    {
      "epoch": 0.25081645983017636,
      "grad_norm": 0.01747283898293972,
      "learning_rate": 9.259007904196023e-05,
      "loss": 11.9208,
      "step": 96
    },
    {
      "epoch": 0.25865447419986937,
      "grad_norm": 0.016894422471523285,
      "learning_rate": 9.207842527714767e-05,
      "loss": 11.9204,
      "step": 99
    },
    {
      "epoch": 0.26649248856956237,
      "grad_norm": 0.015183241106569767,
      "learning_rate": 9.155120482565521e-05,
      "loss": 11.9209,
      "step": 102
    },
    {
      "epoch": 0.27433050293925537,
      "grad_norm": 0.014355632476508617,
      "learning_rate": 9.10086127298478e-05,
      "loss": 11.9201,
      "step": 105
    },
    {
      "epoch": 0.2821685173089484,
      "grad_norm": 0.022136807441711426,
      "learning_rate": 9.045084971874738e-05,
      "loss": 11.9204,
      "step": 108
    },
    {
      "epoch": 0.29000653167864143,
      "grad_norm": 0.02453417330980301,
      "learning_rate": 8.987812213377424e-05,
      "loss": 11.9206,
      "step": 111
    },
    {
      "epoch": 0.29784454604833444,
      "grad_norm": 0.022463472560048103,
      "learning_rate": 8.929064185241213e-05,
      "loss": 11.9194,
      "step": 114
    },
    {
      "epoch": 0.30568256041802744,
      "grad_norm": 0.020201250910758972,
      "learning_rate": 8.868862620982534e-05,
      "loss": 11.9192,
      "step": 117
    },
    {
      "epoch": 0.31352057478772044,
      "grad_norm": 0.023930294439196587,
      "learning_rate": 8.807229791845673e-05,
      "loss": 11.92,
      "step": 120
    },
    {
      "epoch": 0.32135858915741344,
      "grad_norm": 0.01951843872666359,
      "learning_rate": 8.744188498563641e-05,
      "loss": 11.9196,
      "step": 123
    },
    {
      "epoch": 0.32919660352710645,
      "grad_norm": 0.013068539090454578,
      "learning_rate": 8.679762062923175e-05,
      "loss": 11.9193,
      "step": 126
    },
    {
      "epoch": 0.32919660352710645,
      "eval_loss": 11.919238090515137,
      "eval_runtime": 3.3246,
      "eval_samples_per_second": 775.442,
      "eval_steps_per_second": 24.364,
      "step": 126
    },
    {
      "epoch": 0.3370346178967995,
      "grad_norm": 0.015373232774436474,
      "learning_rate": 8.613974319136958e-05,
      "loss": 11.9189,
      "step": 129
    },
    {
      "epoch": 0.3448726322664925,
      "grad_norm": 0.018870707601308823,
      "learning_rate": 8.54684960502629e-05,
      "loss": 11.9199,
      "step": 132
    },
    {
      "epoch": 0.3527106466361855,
      "grad_norm": 0.019645841792225838,
      "learning_rate": 8.478412753017433e-05,
      "loss": 11.9192,
      "step": 135
    },
    {
      "epoch": 0.3605486610058785,
      "grad_norm": 0.012606288306415081,
      "learning_rate": 8.408689080954998e-05,
      "loss": 11.9192,
      "step": 138
    },
    {
      "epoch": 0.3683866753755715,
      "grad_norm": 0.02213534526526928,
      "learning_rate": 8.33770438273574e-05,
      "loss": 11.9192,
      "step": 141
    },
    {
      "epoch": 0.3762246897452645,
      "grad_norm": 0.025056803598999977,
      "learning_rate": 8.265484918766243e-05,
      "loss": 11.9194,
      "step": 144
    },
    {
      "epoch": 0.3840627041149575,
      "grad_norm": 0.017324019223451614,
      "learning_rate": 8.192057406248028e-05,
      "loss": 11.919,
      "step": 147
    },
    {
      "epoch": 0.3919007184846506,
      "grad_norm": 0.016662944108247757,
      "learning_rate": 8.117449009293668e-05,
      "loss": 11.9196,
      "step": 150
    },
    {
      "epoch": 0.3997387328543436,
      "grad_norm": 0.01738261617720127,
      "learning_rate": 8.041687328877567e-05,
      "loss": 11.9183,
      "step": 153
    },
    {
      "epoch": 0.4075767472240366,
      "grad_norm": 0.014731982722878456,
      "learning_rate": 7.964800392625129e-05,
      "loss": 11.9195,
      "step": 156
    },
    {
      "epoch": 0.4154147615937296,
      "grad_norm": 0.019330400973558426,
      "learning_rate": 7.886816644444098e-05,
      "loss": 11.9189,
      "step": 159
    },
    {
      "epoch": 0.4232527759634226,
      "grad_norm": 0.01512803602963686,
      "learning_rate": 7.807764934001874e-05,
      "loss": 11.9189,
      "step": 162
    },
    {
      "epoch": 0.4310907903331156,
      "grad_norm": 0.017275972291827202,
      "learning_rate": 7.727674506052743e-05,
      "loss": 11.9183,
      "step": 165
    },
    {
      "epoch": 0.4389288047028086,
      "grad_norm": 0.016290944069623947,
      "learning_rate": 7.646574989618938e-05,
      "loss": 11.919,
      "step": 168
    },
    {
      "epoch": 0.4389288047028086,
      "eval_loss": 11.918442726135254,
      "eval_runtime": 3.3195,
      "eval_samples_per_second": 776.625,
      "eval_steps_per_second": 24.401,
      "step": 168
    },
    {
      "epoch": 0.44676681907250165,
      "grad_norm": 0.019369367510080338,
      "learning_rate": 7.564496387029532e-05,
      "loss": 11.9192,
      "step": 171
    },
    {
      "epoch": 0.45460483344219466,
      "grad_norm": 0.012970932759344578,
      "learning_rate": 7.481469062821252e-05,
      "loss": 11.9188,
      "step": 174
    },
    {
      "epoch": 0.46244284781188766,
      "grad_norm": 0.012391473166644573,
      "learning_rate": 7.39752373250527e-05,
      "loss": 11.919,
      "step": 177
    },
    {
      "epoch": 0.47028086218158066,
      "grad_norm": 0.014110087417066097,
      "learning_rate": 7.312691451204178e-05,
      "loss": 11.9184,
      "step": 180
    },
    {
      "epoch": 0.47811887655127366,
      "grad_norm": 0.016782281920313835,
      "learning_rate": 7.227003602163295e-05,
      "loss": 11.9177,
      "step": 183
    },
    {
      "epoch": 0.48595689092096667,
      "grad_norm": 0.013867055997252464,
      "learning_rate": 7.14049188514063e-05,
      "loss": 11.9188,
      "step": 186
    },
    {
      "epoch": 0.4937949052906597,
      "grad_norm": 0.022281071171164513,
      "learning_rate": 7.05318830467969e-05,
      "loss": 11.9179,
      "step": 189
    },
    {
      "epoch": 0.5016329196603527,
      "grad_norm": 0.014964847825467587,
      "learning_rate": 6.965125158269619e-05,
      "loss": 11.9187,
      "step": 192
    },
    {
      "epoch": 0.5094709340300457,
      "grad_norm": 0.012438995763659477,
      "learning_rate": 6.876335024396872e-05,
      "loss": 11.9191,
      "step": 195
    },
    {
      "epoch": 0.5173089483997387,
      "grad_norm": 0.016654090955853462,
      "learning_rate": 6.786850750493006e-05,
      "loss": 11.918,
      "step": 198
    },
    {
      "epoch": 0.5251469627694317,
      "grad_norm": 0.019343817606568336,
      "learning_rate": 6.696705440782938e-05,
      "loss": 11.9178,
      "step": 201
    },
    {
      "epoch": 0.5329849771391247,
      "grad_norm": 0.017074916511774063,
      "learning_rate": 6.605932444038229e-05,
      "loss": 11.9185,
      "step": 204
    },
    {
      "epoch": 0.5408229915088177,
      "grad_norm": 0.016646448522806168,
      "learning_rate": 6.514565341239861e-05,
      "loss": 11.918,
      "step": 207
    },
    {
      "epoch": 0.5486610058785107,
      "grad_norm": 0.013506593182682991,
      "learning_rate": 6.422637933155162e-05,
      "loss": 11.9185,
      "step": 210
    },
    {
      "epoch": 0.5486610058785107,
      "eval_loss": 11.917922019958496,
      "eval_runtime": 3.3182,
      "eval_samples_per_second": 776.922,
      "eval_steps_per_second": 24.411,
      "step": 210
    },
    {
      "epoch": 0.5564990202482037,
      "grad_norm": 0.01849060133099556,
      "learning_rate": 6.330184227833376e-05,
      "loss": 11.918,
      "step": 213
    },
    {
      "epoch": 0.5643370346178967,
      "grad_norm": 0.020746391266584396,
      "learning_rate": 6.237238428024572e-05,
      "loss": 11.9173,
      "step": 216
    },
    {
      "epoch": 0.5721750489875899,
      "grad_norm": 0.012400954961776733,
      "learning_rate": 6.143834918526527e-05,
      "loss": 11.9179,
      "step": 219
    },
    {
      "epoch": 0.5800130633572829,
      "grad_norm": 0.019317157566547394,
      "learning_rate": 6.0500082534642464e-05,
      "loss": 11.9188,
      "step": 222
    },
    {
      "epoch": 0.5878510777269759,
      "grad_norm": 0.01525891199707985,
      "learning_rate": 5.955793143506863e-05,
      "loss": 11.9184,
      "step": 225
    },
    {
      "epoch": 0.5956890920966689,
      "grad_norm": 0.013121162541210651,
      "learning_rate": 5.861224443026595e-05,
      "loss": 11.9178,
      "step": 228
    },
    {
      "epoch": 0.6035271064663619,
      "grad_norm": 0.018763018772006035,
      "learning_rate": 5.766337137204579e-05,
      "loss": 11.9179,
      "step": 231
    },
    {
      "epoch": 0.6113651208360549,
      "grad_norm": 0.018400780856609344,
      "learning_rate": 5.6711663290882776e-05,
      "loss": 11.9185,
      "step": 234
    },
    {
      "epoch": 0.6192031352057479,
      "grad_norm": 0.01755247637629509,
      "learning_rate": 5.575747226605298e-05,
      "loss": 11.918,
      "step": 237
    },
    {
      "epoch": 0.6270411495754409,
      "grad_norm": 0.013365167193114758,
      "learning_rate": 5.480115129538409e-05,
      "loss": 11.9172,
      "step": 240
    },
    {
      "epoch": 0.6348791639451339,
      "grad_norm": 0.013805567286908627,
      "learning_rate": 5.384305416466584e-05,
      "loss": 11.9173,
      "step": 243
    },
    {
      "epoch": 0.6427171783148269,
      "grad_norm": 0.013149751350283623,
      "learning_rate": 5.288353531676873e-05,
      "loss": 11.9184,
      "step": 246
    },
    {
      "epoch": 0.6505551926845199,
      "grad_norm": 0.015995411202311516,
      "learning_rate": 5.192294972051992e-05,
      "loss": 11.9183,
      "step": 249
    },
    {
      "epoch": 0.6583932070542129,
      "grad_norm": 0.014820579439401627,
      "learning_rate": 5.0961652739384356e-05,
      "loss": 11.9175,
      "step": 252
    },
    {
      "epoch": 0.6583932070542129,
      "eval_loss": 11.917555809020996,
      "eval_runtime": 3.3162,
      "eval_samples_per_second": 777.394,
      "eval_steps_per_second": 24.425,
      "step": 252
    },
    {
      "epoch": 0.6662312214239059,
      "grad_norm": 0.01651047170162201,
      "learning_rate": 5e-05,
      "loss": 11.9174,
      "step": 255
    },
    {
      "epoch": 0.674069235793599,
      "grad_norm": 0.016767192631959915,
      "learning_rate": 4.903834726061565e-05,
      "loss": 11.9184,
      "step": 258
    },
    {
      "epoch": 0.681907250163292,
      "grad_norm": 0.01721210964024067,
      "learning_rate": 4.807705027948008e-05,
      "loss": 11.9177,
      "step": 261
    },
    {
      "epoch": 0.689745264532985,
      "grad_norm": 0.0172222089022398,
      "learning_rate": 4.711646468323129e-05,
      "loss": 11.9178,
      "step": 264
    },
    {
      "epoch": 0.697583278902678,
      "grad_norm": 0.016364743933081627,
      "learning_rate": 4.6156945835334184e-05,
      "loss": 11.9181,
      "step": 267
    },
    {
      "epoch": 0.705421293272371,
      "grad_norm": 0.016128098592162132,
      "learning_rate": 4.5198848704615914e-05,
      "loss": 11.9178,
      "step": 270
    },
    {
      "epoch": 0.713259307642064,
      "grad_norm": 0.015332392416894436,
      "learning_rate": 4.424252773394704e-05,
      "loss": 11.9177,
      "step": 273
    },
    {
      "epoch": 0.721097322011757,
      "grad_norm": 0.01631457917392254,
      "learning_rate": 4.328833670911724e-05,
      "loss": 11.9181,
      "step": 276
    },
    {
      "epoch": 0.72893533638145,
      "grad_norm": 0.018029581755399704,
      "learning_rate": 4.23366286279542e-05,
      "loss": 11.9173,
      "step": 279
    },
    {
      "epoch": 0.736773350751143,
      "grad_norm": 0.016118017956614494,
      "learning_rate": 4.138775556973406e-05,
      "loss": 11.9175,
      "step": 282
    },
    {
      "epoch": 0.744611365120836,
      "grad_norm": 0.0235885176807642,
      "learning_rate": 4.04420685649314e-05,
      "loss": 11.9174,
      "step": 285
    },
    {
      "epoch": 0.752449379490529,
      "grad_norm": 0.02053360641002655,
      "learning_rate": 3.9499917465357534e-05,
      "loss": 11.9178,
      "step": 288
    },
    {
      "epoch": 0.760287393860222,
      "grad_norm": 0.018186945468187332,
      "learning_rate": 3.856165081473474e-05,
      "loss": 11.9181,
      "step": 291
    },
    {
      "epoch": 0.768125408229915,
      "grad_norm": 0.029430339112877846,
      "learning_rate": 3.762761571975429e-05,
      "loss": 11.9174,
      "step": 294
    },
    {
      "epoch": 0.768125408229915,
      "eval_loss": 11.917236328125,
      "eval_runtime": 3.3236,
      "eval_samples_per_second": 775.653,
      "eval_steps_per_second": 24.371,
      "step": 294
    },
    {
      "epoch": 0.7759634225996082,
      "grad_norm": 0.03046645224094391,
      "learning_rate": 3.6698157721666246e-05,
      "loss": 11.917,
      "step": 297
    },
    {
      "epoch": 0.7838014369693012,
      "grad_norm": 0.022532852366566658,
      "learning_rate": 3.5773620668448384e-05,
      "loss": 11.9169,
      "step": 300
    },
    {
      "epoch": 0.7916394513389942,
      "grad_norm": 0.019215036183595657,
      "learning_rate": 3.48543465876014e-05,
      "loss": 11.9183,
      "step": 303
    },
    {
      "epoch": 0.7994774657086872,
      "grad_norm": 0.02424040250480175,
      "learning_rate": 3.3940675559617724e-05,
      "loss": 11.9171,
      "step": 306
    },
    {
      "epoch": 0.8073154800783802,
      "grad_norm": 0.013650625012814999,
      "learning_rate": 3.303294559217063e-05,
      "loss": 11.9176,
      "step": 309
    },
    {
      "epoch": 0.8151534944480732,
      "grad_norm": 0.01347661204636097,
      "learning_rate": 3.213149249506997e-05,
      "loss": 11.9171,
      "step": 312
    },
    {
      "epoch": 0.8229915088177662,
      "grad_norm": 0.02356456220149994,
      "learning_rate": 3.12366497560313e-05,
      "loss": 11.9175,
      "step": 315
    },
    {
      "epoch": 0.8308295231874592,
      "grad_norm": 0.01948046311736107,
      "learning_rate": 3.0348748417303823e-05,
      "loss": 11.918,
      "step": 318
    },
    {
      "epoch": 0.8386675375571522,
      "grad_norm": 0.015809211879968643,
      "learning_rate": 2.9468116953203107e-05,
      "loss": 11.9178,
      "step": 321
    },
    {
      "epoch": 0.8465055519268452,
      "grad_norm": 0.01481384877115488,
      "learning_rate": 2.8595081148593738e-05,
      "loss": 11.9178,
      "step": 324
    },
    {
      "epoch": 0.8543435662965382,
      "grad_norm": 0.02252669259905815,
      "learning_rate": 2.772996397836704e-05,
      "loss": 11.9174,
      "step": 327
    },
    {
      "epoch": 0.8621815806662312,
      "grad_norm": 0.017606221139431,
      "learning_rate": 2.687308548795825e-05,
      "loss": 11.9176,
      "step": 330
    },
    {
      "epoch": 0.8700195950359242,
      "grad_norm": 0.024705080315470695,
      "learning_rate": 2.6024762674947313e-05,
      "loss": 11.9166,
      "step": 333
    },
    {
      "epoch": 0.8778576094056172,
      "grad_norm": 0.024166177958250046,
      "learning_rate": 2.5185309371787513e-05,
      "loss": 11.9176,
      "step": 336
    },
    {
      "epoch": 0.8778576094056172,
      "eval_loss": 11.916953086853027,
      "eval_runtime": 3.3243,
      "eval_samples_per_second": 775.492,
      "eval_steps_per_second": 24.366,
      "step": 336
    },
    {
      "epoch": 0.8856956237753103,
      "grad_norm": 0.01870771311223507,
      "learning_rate": 2.43550361297047e-05,
      "loss": 11.9178,
      "step": 339
    },
    {
      "epoch": 0.8935336381450033,
      "grad_norm": 0.014654111117124557,
      "learning_rate": 2.353425010381063e-05,
      "loss": 11.9177,
      "step": 342
    },
    {
      "epoch": 0.9013716525146963,
      "grad_norm": 0.01838817447423935,
      "learning_rate": 2.272325493947257e-05,
      "loss": 11.9171,
      "step": 345
    },
    {
      "epoch": 0.9092096668843893,
      "grad_norm": 0.021683456376194954,
      "learning_rate": 2.192235065998126e-05,
      "loss": 11.9179,
      "step": 348
    },
    {
      "epoch": 0.9170476812540823,
      "grad_norm": 0.02190260961651802,
      "learning_rate": 2.1131833555559037e-05,
      "loss": 11.917,
      "step": 351
    },
    {
      "epoch": 0.9248856956237753,
      "grad_norm": 0.014892240054905415,
      "learning_rate": 2.0351996073748713e-05,
      "loss": 11.917,
      "step": 354
    },
    {
      "epoch": 0.9327237099934683,
      "grad_norm": 0.020150186493992805,
      "learning_rate": 1.9583126711224343e-05,
      "loss": 11.9175,
      "step": 357
    },
    {
      "epoch": 0.9405617243631613,
      "grad_norm": 0.01918022148311138,
      "learning_rate": 1.8825509907063327e-05,
      "loss": 11.9172,
      "step": 360
    },
    {
      "epoch": 0.9483997387328543,
      "grad_norm": 0.020093288272619247,
      "learning_rate": 1.807942593751973e-05,
      "loss": 11.9177,
      "step": 363
    },
    {
      "epoch": 0.9562377531025473,
      "grad_norm": 0.015267434529960155,
      "learning_rate": 1.7345150812337564e-05,
      "loss": 11.9167,
      "step": 366
    },
    {
      "epoch": 0.9640757674722403,
      "grad_norm": 0.01452693808823824,
      "learning_rate": 1.66229561726426e-05,
      "loss": 11.9178,
      "step": 369
    },
    {
      "epoch": 0.9719137818419333,
      "grad_norm": 0.01645076647400856,
      "learning_rate": 1.5913109190450032e-05,
      "loss": 11.9171,
      "step": 372
    },
    {
      "epoch": 0.9797517962116263,
      "grad_norm": 0.02012869343161583,
      "learning_rate": 1.5215872469825682e-05,
      "loss": 11.9169,
      "step": 375
    },
    {
      "epoch": 0.9875898105813194,
      "grad_norm": 0.015907544642686844,
      "learning_rate": 1.4531503949737108e-05,
      "loss": 11.9176,
      "step": 378
    },
    {
      "epoch": 0.9875898105813194,
      "eval_loss": 11.916740417480469,
      "eval_runtime": 3.3252,
      "eval_samples_per_second": 775.289,
      "eval_steps_per_second": 24.359,
      "step": 378
    },
    {
      "epoch": 0.9954278249510125,
      "grad_norm": 0.03160930424928665,
      "learning_rate": 1.3860256808630428e-05,
      "loss": 11.9168,
      "step": 381
    },
    {
      "epoch": 1.0039190071848465,
      "grad_norm": 0.017380647361278534,
      "learning_rate": 1.3202379370768252e-05,
      "loss": 13.741,
      "step": 384
    },
    {
      "epoch": 1.0117570215545395,
      "grad_norm": 0.0193234421312809,
      "learning_rate": 1.2558115014363592e-05,
      "loss": 12.1761,
      "step": 387
    },
    {
      "epoch": 1.0195950359242325,
      "grad_norm": 0.02974247932434082,
      "learning_rate": 1.1927702081543279e-05,
      "loss": 11.7776,
      "step": 390
    },
    {
      "epoch": 1.0274330502939255,
      "grad_norm": 0.014591868035495281,
      "learning_rate": 1.1311373790174657e-05,
      "loss": 12.203,
      "step": 393
    },
    {
      "epoch": 1.0352710646636185,
      "grad_norm": 0.018022043630480766,
      "learning_rate": 1.0709358147587884e-05,
      "loss": 11.431,
      "step": 396
    },
    {
      "epoch": 1.0431090790333115,
      "grad_norm": 0.019987676292657852,
      "learning_rate": 1.0121877866225781e-05,
      "loss": 12.1733,
      "step": 399
    },
    {
      "epoch": 1.0509470934030045,
      "grad_norm": 0.022911233827471733,
      "learning_rate": 9.549150281252633e-06,
      "loss": 11.9096,
      "step": 402
    },
    {
      "epoch": 1.0587851077726975,
      "grad_norm": 0.019524535164237022,
      "learning_rate": 8.991387270152201e-06,
      "loss": 12.1372,
      "step": 405
    },
    {
      "epoch": 1.0666231221423905,
      "grad_norm": 0.016502438113093376,
      "learning_rate": 8.448795174344804e-06,
      "loss": 11.5679,
      "step": 408
    },
    {
      "epoch": 1.0744611365120835,
      "grad_norm": 0.015734922140836716,
      "learning_rate": 7.921574722852343e-06,
      "loss": 12.0084,
      "step": 411
    },
    {
      "epoch": 1.0822991508817765,
      "grad_norm": 0.019688883796334267,
      "learning_rate": 7.409920958039795e-06,
      "loss": 11.8272,
      "step": 414
    },
    {
      "epoch": 1.0901371652514695,
      "grad_norm": 0.016949467360973358,
      "learning_rate": 6.9140231634602485e-06,
      "loss": 12.0226,
      "step": 417
    },
    {
      "epoch": 1.0979751796211628,
      "grad_norm": 0.015972912311553955,
      "learning_rate": 6.43406479383053e-06,
      "loss": 12.0123,
      "step": 420
    },
    {
      "epoch": 1.0979751796211628,
      "eval_loss": 11.916641235351562,
      "eval_runtime": 3.3258,
      "eval_samples_per_second": 775.161,
      "eval_steps_per_second": 24.355,
      "step": 420
    },
    {
      "epoch": 1.1058131939908558,
      "grad_norm": 0.022812234237790108,
      "learning_rate": 5.9702234071631e-06,
      "loss": 11.7508,
      "step": 423
    },
    {
      "epoch": 1.1136512083605488,
      "grad_norm": 0.02215123549103737,
      "learning_rate": 5.5226705990794155e-06,
      "loss": 11.9357,
      "step": 426
    },
    {
      "epoch": 1.1214892227302418,
      "grad_norm": 0.018271734938025475,
      "learning_rate": 5.091571939329048e-06,
      "loss": 12.0107,
      "step": 429
    },
    {
      "epoch": 1.1293272370999348,
      "grad_norm": 0.018984654918313026,
      "learning_rate": 4.677086910538092e-06,
      "loss": 12.1886,
      "step": 432
    },
    {
      "epoch": 1.1371652514696278,
      "grad_norm": 0.018601972609758377,
      "learning_rate": 4.279368849209381e-06,
      "loss": 11.6696,
      "step": 435
    },
    {
      "epoch": 1.1450032658393208,
      "grad_norm": 0.0175609327852726,
      "learning_rate": 3.898564888996476e-06,
      "loss": 12.2369,
      "step": 438
    },
    {
      "epoch": 1.1528412802090138,
      "grad_norm": 0.018267886713147163,
      "learning_rate": 3.534815906272404e-06,
      "loss": 11.3125,
      "step": 441
    },
    {
      "epoch": 1.1606792945787068,
      "grad_norm": 0.018670443445444107,
      "learning_rate": 3.18825646801314e-06,
      "loss": 12.0095,
      "step": 444
    },
    {
      "epoch": 1.1685173089483998,
      "grad_norm": 0.01578577049076557,
      "learning_rate": 2.8590147820153513e-06,
      "loss": 11.9352,
      "step": 447
    },
    {
      "epoch": 1.1763553233180928,
      "grad_norm": 0.018620701506733894,
      "learning_rate": 2.547212649466568e-06,
      "loss": 11.819,
      "step": 450
    },
    {
      "epoch": 1.1841933376877858,
      "grad_norm": 0.020687857642769814,
      "learning_rate": 2.2529654198854835e-06,
      "loss": 12.2251,
      "step": 453
    },
    {
      "epoch": 1.1920313520574788,
      "grad_norm": 0.0191540215164423,
      "learning_rate": 1.9763819484490355e-06,
      "loss": 11.8286,
      "step": 456
    },
    {
      "epoch": 1.1998693664271718,
      "grad_norm": 0.017804287374019623,
      "learning_rate": 1.7175645557220566e-06,
      "loss": 11.6918,
      "step": 459
    },
    {
      "epoch": 1.2077073807968648,
      "grad_norm": 0.019628843292593956,
      "learning_rate": 1.4766089898042678e-06,
      "loss": 12.2151,
      "step": 462
    },
    {
      "epoch": 1.2077073807968648,
      "eval_loss": 11.916607856750488,
      "eval_runtime": 3.3257,
      "eval_samples_per_second": 775.165,
      "eval_steps_per_second": 24.355,
      "step": 462
    },
    {
      "epoch": 1.2155453951665578,
      "grad_norm": 0.013816201128065586,
      "learning_rate": 1.2536043909088191e-06,
      "loss": 11.9293,
      "step": 465
    },
    {
      "epoch": 1.2233834095362508,
      "grad_norm": 0.016839459538459778,
      "learning_rate": 1.0486332583853563e-06,
      "loss": 11.9467,
      "step": 468
    },
    {
      "epoch": 1.2312214239059438,
      "grad_norm": 0.01736452244222164,
      "learning_rate": 8.617714201998084e-07,
      "loss": 11.8521,
      "step": 471
    },
    {
      "epoch": 1.2390594382756368,
      "grad_norm": 0.025550948455929756,
      "learning_rate": 6.93088004882253e-07,
      "loss": 11.6322,
      "step": 474
    },
    {
      "epoch": 1.2468974526453298,
      "grad_norm": 0.03142261132597923,
      "learning_rate": 5.426454159531913e-07,
      "loss": 12.0707,
      "step": 477
    },
    {
      "epoch": 1.2547354670150228,
      "grad_norm": 0.014136346988379955,
      "learning_rate": 4.104993088376974e-07,
      "loss": 12.1411,
      "step": 480
    },
    {
      "epoch": 1.2625734813847158,
      "grad_norm": 0.023102182894945145,
      "learning_rate": 2.966985702759828e-07,
      "loss": 11.8335,
      "step": 483
    },
    {
      "epoch": 1.2704114957544088,
      "grad_norm": 0.020949246361851692,
      "learning_rate": 2.012853002380466e-07,
      "loss": 11.8762,
      "step": 486
    },
    {
      "epoch": 1.2782495101241018,
      "grad_norm": 0.013338472694158554,
      "learning_rate": 1.2429479634897267e-07,
      "loss": 11.7727,
      "step": 489
    },
    {
      "epoch": 1.2860875244937948,
      "grad_norm": 0.022523999214172363,
      "learning_rate": 6.575554083078084e-08,
      "loss": 12.2013,
      "step": 492
    },
    {
      "epoch": 1.2939255388634878,
      "grad_norm": 0.016933446750044823,
      "learning_rate": 2.568918996560532e-08,
      "loss": 11.8428,
      "step": 495
    },
    {
      "epoch": 1.3017635532331808,
      "grad_norm": 0.017216265201568604,
      "learning_rate": 4.110566084036816e-09,
      "loss": 11.9056,
      "step": 498
    }
  ],
  "logging_steps": 3,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 42,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9645271941120.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}