{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.3821579984474831,
  "eval_steps": 400,
  "global_step": 1600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00023884874902967696,
      "eval_loss": 1.5979785919189453,
      "eval_runtime": 224.9995,
      "eval_samples_per_second": 3.778,
      "eval_steps_per_second": 3.778,
      "step": 1
    },
    {
      "epoch": 0.0014330924941780617,
      "grad_norm": 20.875,
      "learning_rate": 6.000000000000001e-07,
      "loss": 1.8691,
      "step": 6
    },
    {
      "epoch": 0.0028661849883561234,
      "grad_norm": 14.0625,
      "learning_rate": 1.2000000000000002e-06,
      "loss": 1.8156,
      "step": 12
    },
    {
      "epoch": 0.004299277482534185,
      "grad_norm": 11.1875,
      "learning_rate": 1.8000000000000001e-06,
      "loss": 1.6925,
      "step": 18
    },
    {
      "epoch": 0.005732369976712247,
      "grad_norm": 7.15625,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 1.612,
      "step": 24
    },
    {
      "epoch": 0.0071654624708903086,
      "grad_norm": 7.25,
      "learning_rate": 3e-06,
      "loss": 1.8222,
      "step": 30
    },
    {
      "epoch": 0.00859855496506837,
      "grad_norm": 5.71875,
      "learning_rate": 3.6000000000000003e-06,
      "loss": 1.6277,
      "step": 36
    },
    {
      "epoch": 0.010031647459246432,
      "grad_norm": 5.65625,
      "learning_rate": 4.2000000000000004e-06,
      "loss": 1.5655,
      "step": 42
    },
    {
      "epoch": 0.011464739953424494,
      "grad_norm": 6.90625,
      "learning_rate": 4.800000000000001e-06,
      "loss": 1.7691,
      "step": 48
    },
    {
      "epoch": 0.012897832447602555,
      "grad_norm": 6.96875,
      "learning_rate": 5.400000000000001e-06,
      "loss": 1.7085,
      "step": 54
    },
    {
      "epoch": 0.014330924941780617,
      "grad_norm": 5.3125,
      "learning_rate": 6e-06,
      "loss": 1.4649,
      "step": 60
    },
    {
      "epoch": 0.01576401743595868,
      "grad_norm": 15.8125,
      "learning_rate": 6.600000000000001e-06,
      "loss": 1.6534,
      "step": 66
    },
    {
      "epoch": 0.01719710993013674,
      "grad_norm": 42.75,
      "learning_rate": 7.2000000000000005e-06,
      "loss": 1.673,
      "step": 72
    },
    {
      "epoch": 0.018630202424314804,
      "grad_norm": 5.5,
      "learning_rate": 7.800000000000002e-06,
      "loss": 1.429,
      "step": 78
    },
    {
      "epoch": 0.020063294918492864,
      "grad_norm": 3.875,
      "learning_rate": 8.400000000000001e-06,
      "loss": 1.6067,
      "step": 84
    },
    {
      "epoch": 0.021496387412670927,
      "grad_norm": 4.53125,
      "learning_rate": 9e-06,
      "loss": 1.4336,
      "step": 90
    },
    {
      "epoch": 0.022929479906848987,
      "grad_norm": 4.40625,
      "learning_rate": 9.600000000000001e-06,
      "loss": 1.5998,
      "step": 96
    },
    {
      "epoch": 0.02436257240102705,
      "grad_norm": 5.40625,
      "learning_rate": 1.02e-05,
      "loss": 1.5259,
      "step": 102
    },
    {
      "epoch": 0.02579566489520511,
      "grad_norm": 9.0,
      "learning_rate": 1.0800000000000002e-05,
      "loss": 1.5255,
      "step": 108
    },
    {
      "epoch": 0.027228757389383174,
      "grad_norm": 5.34375,
      "learning_rate": 1.14e-05,
      "loss": 1.5375,
      "step": 114
    },
    {
      "epoch": 0.028661849883561234,
      "grad_norm": 4.625,
      "learning_rate": 1.2e-05,
      "loss": 1.4729,
      "step": 120
    },
    {
      "epoch": 0.030094942377739298,
      "grad_norm": 5.78125,
      "learning_rate": 1.2600000000000001e-05,
      "loss": 1.5446,
      "step": 126
    },
    {
      "epoch": 0.03152803487191736,
      "grad_norm": 5.15625,
      "learning_rate": 1.3200000000000002e-05,
      "loss": 1.6895,
      "step": 132
    },
    {
      "epoch": 0.03296112736609542,
      "grad_norm": 4.59375,
      "learning_rate": 1.38e-05,
      "loss": 1.6145,
      "step": 138
    },
    {
      "epoch": 0.03439421986027348,
      "grad_norm": 4.96875,
      "learning_rate": 1.4400000000000001e-05,
      "loss": 1.4316,
      "step": 144
    },
    {
      "epoch": 0.035827312354451545,
      "grad_norm": 4.71875,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 1.5619,
      "step": 150
    },
    {
      "epoch": 0.03726040484862961,
      "grad_norm": 7.9375,
      "learning_rate": 1.5600000000000003e-05,
      "loss": 1.6608,
      "step": 156
    },
    {
      "epoch": 0.038693497342807665,
      "grad_norm": 4.34375,
      "learning_rate": 1.62e-05,
      "loss": 1.6418,
      "step": 162
    },
    {
      "epoch": 0.04012658983698573,
      "grad_norm": 4.8125,
      "learning_rate": 1.6800000000000002e-05,
      "loss": 1.5532,
      "step": 168
    },
    {
      "epoch": 0.04155968233116379,
      "grad_norm": 7.90625,
      "learning_rate": 1.7400000000000003e-05,
      "loss": 1.6124,
      "step": 174
    },
    {
      "epoch": 0.042992774825341855,
      "grad_norm": 5.90625,
      "learning_rate": 1.8e-05,
      "loss": 1.5629,
      "step": 180
    },
    {
      "epoch": 0.04442586731951991,
      "grad_norm": 9.4375,
      "learning_rate": 1.86e-05,
      "loss": 1.5727,
      "step": 186
    },
    {
      "epoch": 0.045858959813697975,
      "grad_norm": 6.34375,
      "learning_rate": 1.9200000000000003e-05,
      "loss": 1.4866,
      "step": 192
    },
    {
      "epoch": 0.04729205230787604,
      "grad_norm": 10.9375,
      "learning_rate": 1.98e-05,
      "loss": 1.6203,
      "step": 198
    },
    {
      "epoch": 0.0487251448020541,
      "grad_norm": 5.46875,
      "learning_rate": 1.9999756307053947e-05,
      "loss": 1.6003,
      "step": 204
    },
    {
      "epoch": 0.05015823729623216,
      "grad_norm": 7.34375,
      "learning_rate": 1.9998476951563914e-05,
      "loss": 1.7795,
      "step": 210
    },
    {
      "epoch": 0.05159132979041022,
      "grad_norm": 5.03125,
      "learning_rate": 1.9996101150403543e-05,
      "loss": 1.6262,
      "step": 216
    },
    {
      "epoch": 0.053024422284588285,
      "grad_norm": 6.03125,
      "learning_rate": 1.999262916410621e-05,
      "loss": 1.5033,
      "step": 222
    },
    {
      "epoch": 0.05445751477876635,
      "grad_norm": 6.375,
      "learning_rate": 1.9988061373414342e-05,
      "loss": 1.528,
      "step": 228
    },
    {
      "epoch": 0.055890607272944405,
      "grad_norm": 5.375,
      "learning_rate": 1.9982398279237657e-05,
      "loss": 1.6706,
      "step": 234
    },
    {
      "epoch": 0.05732369976712247,
      "grad_norm": 5.3125,
      "learning_rate": 1.9975640502598243e-05,
      "loss": 1.8826,
      "step": 240
    },
    {
      "epoch": 0.05875679226130053,
      "grad_norm": 7.21875,
      "learning_rate": 1.9967788784562474e-05,
      "loss": 1.6844,
      "step": 246
    },
    {
      "epoch": 0.060189884755478595,
      "grad_norm": 14.0,
      "learning_rate": 1.9958843986159705e-05,
      "loss": 1.6681,
      "step": 252
    },
    {
      "epoch": 0.06162297724965665,
      "grad_norm": 5.3125,
      "learning_rate": 1.9948807088287884e-05,
      "loss": 1.5271,
      "step": 258
    },
    {
      "epoch": 0.06305606974383472,
      "grad_norm": 5.78125,
      "learning_rate": 1.9937679191605964e-05,
      "loss": 1.5941,
      "step": 264
    },
    {
      "epoch": 0.06448916223801278,
      "grad_norm": 7.75,
      "learning_rate": 1.9925461516413224e-05,
      "loss": 1.6754,
      "step": 270
    },
    {
      "epoch": 0.06592225473219084,
      "grad_norm": 5.03125,
      "learning_rate": 1.991215540251542e-05,
      "loss": 1.6616,
      "step": 276
    },
    {
      "epoch": 0.0673553472263689,
      "grad_norm": 5.46875,
      "learning_rate": 1.989776230907789e-05,
      "loss": 1.7207,
      "step": 282
    },
    {
      "epoch": 0.06878843972054696,
      "grad_norm": 4.84375,
      "learning_rate": 1.988228381446553e-05,
      "loss": 1.6092,
      "step": 288
    },
    {
      "epoch": 0.07022153221472502,
      "grad_norm": 15.625,
      "learning_rate": 1.9865721616069695e-05,
      "loss": 1.6828,
      "step": 294
    },
    {
      "epoch": 0.07165462470890309,
      "grad_norm": 7.125,
      "learning_rate": 1.9848077530122083e-05,
      "loss": 1.7341,
      "step": 300
    },
    {
      "epoch": 0.07308771720308115,
      "grad_norm": 10.625,
      "learning_rate": 1.9829353491495545e-05,
      "loss": 1.6181,
      "step": 306
    },
    {
      "epoch": 0.07452080969725922,
      "grad_norm": 4.75,
      "learning_rate": 1.9809551553491918e-05,
      "loss": 1.548,
      "step": 312
    },
    {
      "epoch": 0.07595390219143727,
      "grad_norm": 6.9375,
      "learning_rate": 1.9788673887616852e-05,
      "loss": 1.5703,
      "step": 318
    },
    {
      "epoch": 0.07738699468561533,
      "grad_norm": 6.71875,
      "learning_rate": 1.9766722783341682e-05,
      "loss": 1.7147,
      "step": 324
    },
    {
      "epoch": 0.0788200871797934,
      "grad_norm": 6.8125,
      "learning_rate": 1.9743700647852356e-05,
      "loss": 1.7598,
      "step": 330
    },
    {
      "epoch": 0.08025317967397146,
      "grad_norm": 5.0625,
      "learning_rate": 1.9719610005785466e-05,
      "loss": 1.7136,
      "step": 336
    },
    {
      "epoch": 0.08168627216814951,
      "grad_norm": 6.03125,
      "learning_rate": 1.9694453498951392e-05,
      "loss": 1.7161,
      "step": 342
    },
    {
      "epoch": 0.08311936466232758,
      "grad_norm": 7.34375,
      "learning_rate": 1.9668233886044597e-05,
      "loss": 1.6319,
      "step": 348
    },
    {
      "epoch": 0.08455245715650564,
      "grad_norm": 5.21875,
      "learning_rate": 1.96409540423411e-05,
      "loss": 1.5857,
      "step": 354
    },
    {
      "epoch": 0.08598554965068371,
      "grad_norm": 10.6875,
      "learning_rate": 1.961261695938319e-05,
      "loss": 1.7632,
      "step": 360
    },
    {
      "epoch": 0.08741864214486177,
      "grad_norm": 6.21875,
      "learning_rate": 1.9583225744651334e-05,
      "loss": 1.4205,
      "step": 366
    },
    {
      "epoch": 0.08885173463903982,
      "grad_norm": 5.875,
      "learning_rate": 1.9552783621223437e-05,
      "loss": 1.7812,
      "step": 372
    },
    {
      "epoch": 0.0902848271332179,
      "grad_norm": 4.46875,
      "learning_rate": 1.9521293927421388e-05,
      "loss": 1.5759,
      "step": 378
    },
    {
      "epoch": 0.09171791962739595,
      "grad_norm": 6.53125,
      "learning_rate": 1.9488760116444966e-05,
      "loss": 1.6537,
      "step": 384
    },
    {
      "epoch": 0.09315101212157402,
      "grad_norm": 10.8125,
      "learning_rate": 1.945518575599317e-05,
      "loss": 1.4973,
      "step": 390
    },
    {
      "epoch": 0.09458410461575208,
      "grad_norm": 4.1875,
      "learning_rate": 1.942057452787297e-05,
      "loss": 1.578,
      "step": 396
    },
    {
      "epoch": 0.09553949961187078,
      "eval_loss": 1.4027706384658813,
      "eval_runtime": 224.2305,
      "eval_samples_per_second": 3.791,
      "eval_steps_per_second": 3.791,
      "step": 400
    },
    {
      "epoch": 0.09601719710993013,
      "grad_norm": 3.875,
      "learning_rate": 1.938493022759556e-05,
      "loss": 1.6032,
      "step": 402
    },
    {
      "epoch": 0.0974502896041082,
      "grad_norm": 6.125,
      "learning_rate": 1.9348256763960146e-05,
      "loss": 1.7055,
      "step": 408
    },
    {
      "epoch": 0.09888338209828626,
      "grad_norm": 5.84375,
      "learning_rate": 1.9310558158625286e-05,
      "loss": 1.7454,
      "step": 414
    },
    {
      "epoch": 0.10031647459246432,
      "grad_norm": 7.0625,
      "learning_rate": 1.9271838545667876e-05,
      "loss": 1.7345,
      "step": 420
    },
    {
      "epoch": 0.10174956708664239,
      "grad_norm": 6.125,
      "learning_rate": 1.923210217112981e-05,
      "loss": 1.6099,
      "step": 426
    },
    {
      "epoch": 0.10318265958082044,
      "grad_norm": 4.59375,
      "learning_rate": 1.9191353392552346e-05,
      "loss": 1.652,
      "step": 432
    },
    {
      "epoch": 0.10461575207499851,
      "grad_norm": 5.96875,
      "learning_rate": 1.914959667849825e-05,
      "loss": 1.7092,
      "step": 438
    },
    {
      "epoch": 0.10604884456917657,
      "grad_norm": 6.4375,
      "learning_rate": 1.910683660806177e-05,
      "loss": 1.7545,
      "step": 444
    },
    {
      "epoch": 0.10748193706335463,
      "grad_norm": 10.4375,
      "learning_rate": 1.9063077870366504e-05,
      "loss": 1.5287,
      "step": 450
    },
    {
      "epoch": 0.1089150295575327,
      "grad_norm": 7.84375,
      "learning_rate": 1.901832526405114e-05,
      "loss": 1.7219,
      "step": 456
    },
    {
      "epoch": 0.11034812205171075,
      "grad_norm": 9.5625,
      "learning_rate": 1.8972583696743284e-05,
      "loss": 1.665,
      "step": 462
    },
    {
      "epoch": 0.11178121454588881,
      "grad_norm": 10.0625,
      "learning_rate": 1.892585818452126e-05,
      "loss": 1.6363,
      "step": 468
    },
    {
      "epoch": 0.11321430704006688,
      "grad_norm": 5.78125,
      "learning_rate": 1.8878153851364013e-05,
      "loss": 1.543,
      "step": 474
    },
    {
      "epoch": 0.11464739953424494,
      "grad_norm": 6.125,
      "learning_rate": 1.8829475928589272e-05,
      "loss": 1.5826,
      "step": 480
    },
    {
      "epoch": 0.11608049202842301,
      "grad_norm": 4.8125,
      "learning_rate": 1.8779829754279806e-05,
      "loss": 1.581,
      "step": 486
    },
    {
      "epoch": 0.11751358452260106,
      "grad_norm": 9.75,
      "learning_rate": 1.8729220772698096e-05,
      "loss": 1.5841,
      "step": 492
    },
    {
      "epoch": 0.11894667701677912,
      "grad_norm": 13.3125,
      "learning_rate": 1.8677654533689287e-05,
      "loss": 1.6944,
      "step": 498
    },
    {
      "epoch": 0.12037976951095719,
      "grad_norm": 4.96875,
      "learning_rate": 1.8625136692072577e-05,
      "loss": 1.6203,
      "step": 504
    },
    {
      "epoch": 0.12181286200513525,
      "grad_norm": 6.3125,
      "learning_rate": 1.8571673007021124e-05,
      "loss": 1.5639,
      "step": 510
    },
    {
      "epoch": 0.1232459544993133,
      "grad_norm": 5.5,
      "learning_rate": 1.851726934143048e-05,
      "loss": 1.6397,
      "step": 516
    },
    {
      "epoch": 0.12467904699349137,
      "grad_norm": 5.125,
      "learning_rate": 1.8461931661275642e-05,
      "loss": 1.7315,
      "step": 522
    },
    {
      "epoch": 0.12611213948766944,
      "grad_norm": 6.25,
      "learning_rate": 1.8405666034956842e-05,
      "loss": 1.7201,
      "step": 528
    },
    {
      "epoch": 0.1275452319818475,
      "grad_norm": 8.9375,
      "learning_rate": 1.8348478632634067e-05,
      "loss": 1.6047,
      "step": 534
    },
    {
      "epoch": 0.12897832447602556,
      "grad_norm": 46.25,
      "learning_rate": 1.8290375725550417e-05,
      "loss": 1.6949,
      "step": 540
    },
    {
      "epoch": 0.13041141697020361,
      "grad_norm": 5.9375,
      "learning_rate": 1.8231363685344422e-05,
      "loss": 1.7245,
      "step": 546
    },
    {
      "epoch": 0.13184450946438167,
      "grad_norm": 5.78125,
      "learning_rate": 1.8171448983351284e-05,
      "loss": 1.641,
      "step": 552
    },
    {
      "epoch": 0.13327760195855975,
      "grad_norm": 24.125,
      "learning_rate": 1.8110638189893267e-05,
      "loss": 1.6125,
      "step": 558
    },
    {
      "epoch": 0.1347106944527378,
      "grad_norm": 6.4375,
      "learning_rate": 1.804893797355914e-05,
      "loss": 1.6647,
      "step": 564
    },
    {
      "epoch": 0.13614378694691587,
      "grad_norm": 6.34375,
      "learning_rate": 1.798635510047293e-05,
      "loss": 1.7073,
      "step": 570
    },
    {
      "epoch": 0.13757687944109392,
      "grad_norm": 6.1875,
      "learning_rate": 1.792289643355191e-05,
      "loss": 1.6271,
      "step": 576
    },
    {
      "epoch": 0.13900997193527198,
      "grad_norm": 5.0625,
      "learning_rate": 1.785856893175402e-05,
      "loss": 1.6317,
      "step": 582
    },
    {
      "epoch": 0.14044306442945004,
      "grad_norm": 4.6875,
      "learning_rate": 1.7793379649314743e-05,
      "loss": 1.6578,
      "step": 588
    },
    {
      "epoch": 0.14187615692362812,
      "grad_norm": 4.84375,
      "learning_rate": 1.7727335734973512e-05,
      "loss": 1.6554,
      "step": 594
    },
    {
      "epoch": 0.14330924941780618,
      "grad_norm": 6.1875,
      "learning_rate": 1.766044443118978e-05,
      "loss": 1.5523,
      "step": 600
    },
    {
      "epoch": 0.14474234191198423,
      "grad_norm": 23.375,
      "learning_rate": 1.759271307334881e-05,
      "loss": 1.616,
      "step": 606
    },
    {
      "epoch": 0.1461754344061623,
      "grad_norm": 6.9375,
      "learning_rate": 1.7524149088957244e-05,
      "loss": 1.7729,
      "step": 612
    },
    {
      "epoch": 0.14760852690034035,
      "grad_norm": 10.25,
      "learning_rate": 1.7454759996828622e-05,
      "loss": 1.5922,
      "step": 618
    },
    {
      "epoch": 0.14904161939451843,
      "grad_norm": 7.21875,
      "learning_rate": 1.7384553406258842e-05,
      "loss": 1.583,
      "step": 624
    },
    {
      "epoch": 0.1504747118886965,
      "grad_norm": 6.9375,
      "learning_rate": 1.7313537016191706e-05,
      "loss": 1.6019,
      "step": 630
    },
    {
      "epoch": 0.15190780438287455,
      "grad_norm": 11.5,
      "learning_rate": 1.7241718614374678e-05,
      "loss": 1.6195,
      "step": 636
    },
    {
      "epoch": 0.1533408968770526,
      "grad_norm": 5.5,
      "learning_rate": 1.716910607650483e-05,
      "loss": 1.5012,
      "step": 642
    },
    {
      "epoch": 0.15477398937123066,
      "grad_norm": 6.71875,
      "learning_rate": 1.709570736536521e-05,
      "loss": 1.7686,
      "step": 648
    },
    {
      "epoch": 0.15620708186540874,
      "grad_norm": 5.71875,
      "learning_rate": 1.7021530529951627e-05,
      "loss": 1.7922,
      "step": 654
    },
    {
      "epoch": 0.1576401743595868,
      "grad_norm": 7.8125,
      "learning_rate": 1.6946583704589973e-05,
      "loss": 1.623,
      "step": 660
    },
    {
      "epoch": 0.15907326685376486,
      "grad_norm": 6.34375,
      "learning_rate": 1.6870875108044233e-05,
      "loss": 1.6039,
      "step": 666
    },
    {
      "epoch": 0.1605063593479429,
      "grad_norm": 6.46875,
      "learning_rate": 1.6794413042615168e-05,
      "loss": 1.6392,
      "step": 672
    },
    {
      "epoch": 0.16193945184212097,
      "grad_norm": 5.4375,
      "learning_rate": 1.6717205893229904e-05,
      "loss": 1.5683,
      "step": 678
    },
    {
      "epoch": 0.16337254433629902,
      "grad_norm": 4.78125,
      "learning_rate": 1.6639262126522417e-05,
      "loss": 1.6165,
      "step": 684
    },
    {
      "epoch": 0.1648056368304771,
      "grad_norm": 5.4375,
      "learning_rate": 1.6560590289905074e-05,
      "loss": 1.5341,
      "step": 690
    },
    {
      "epoch": 0.16623872932465517,
      "grad_norm": 5.25,
      "learning_rate": 1.6481199010631312e-05,
      "loss": 1.6573,
      "step": 696
    },
    {
      "epoch": 0.16767182181883322,
      "grad_norm": 5.21875,
      "learning_rate": 1.6401096994849558e-05,
      "loss": 1.5056,
      "step": 702
    },
    {
      "epoch": 0.16910491431301128,
      "grad_norm": 12.625,
      "learning_rate": 1.632029302664851e-05,
      "loss": 1.5337,
      "step": 708
    },
    {
      "epoch": 0.17053800680718934,
      "grad_norm": 4.28125,
      "learning_rate": 1.6238795967093865e-05,
      "loss": 1.5038,
      "step": 714
    },
    {
      "epoch": 0.17197109930136742,
      "grad_norm": 6.96875,
      "learning_rate": 1.6156614753256583e-05,
      "loss": 1.5587,
      "step": 720
    },
    {
      "epoch": 0.17340419179554548,
      "grad_norm": 4.90625,
      "learning_rate": 1.607375839723287e-05,
      "loss": 1.563,
      "step": 726
    },
    {
      "epoch": 0.17483728428972353,
      "grad_norm": 5.34375,
      "learning_rate": 1.599023598515586e-05,
      "loss": 1.6058,
      "step": 732
    },
    {
      "epoch": 0.1762703767839016,
      "grad_norm": 5.25,
      "learning_rate": 1.5906056676199256e-05,
      "loss": 1.7244,
      "step": 738
    },
    {
      "epoch": 0.17770346927807965,
      "grad_norm": 4.5,
      "learning_rate": 1.5821229701572897e-05,
      "loss": 1.6587,
      "step": 744
    },
    {
      "epoch": 0.17913656177225773,
      "grad_norm": 12.75,
      "learning_rate": 1.573576436351046e-05,
      "loss": 1.6018,
      "step": 750
    },
    {
      "epoch": 0.1805696542664358,
      "grad_norm": 6.0,
      "learning_rate": 1.564967003424938e-05,
      "loss": 1.6205,
      "step": 756
    },
    {
      "epoch": 0.18200274676061384,
      "grad_norm": 5.59375,
      "learning_rate": 1.556295615500305e-05,
      "loss": 1.6345,
      "step": 762
    },
    {
      "epoch": 0.1834358392547919,
      "grad_norm": 4.59375,
      "learning_rate": 1.5475632234925505e-05,
      "loss": 1.5226,
      "step": 768
    },
    {
      "epoch": 0.18486893174896996,
      "grad_norm": 4.78125,
      "learning_rate": 1.5387707850068633e-05,
      "loss": 1.6488,
      "step": 774
    },
    {
      "epoch": 0.18630202424314804,
      "grad_norm": 4.28125,
      "learning_rate": 1.529919264233205e-05,
      "loss": 1.5393,
      "step": 780
    },
    {
      "epoch": 0.1877351167373261,
      "grad_norm": 7.625,
      "learning_rate": 1.5210096318405768e-05,
      "loss": 1.5374,
      "step": 786
    },
    {
      "epoch": 0.18916820923150415,
      "grad_norm": 4.21875,
      "learning_rate": 1.5120428648705716e-05,
      "loss": 1.4963,
      "step": 792
    },
    {
      "epoch": 0.1906013017256822,
      "grad_norm": 4.25,
      "learning_rate": 1.5030199466302354e-05,
      "loss": 1.5828,
      "step": 798
    },
    {
      "epoch": 0.19107899922374155,
      "eval_loss": 1.3809266090393066,
      "eval_runtime": 223.0505,
      "eval_samples_per_second": 3.811,
      "eval_steps_per_second": 3.811,
      "step": 800
    },
    {
      "epoch": 0.19203439421986027,
      "grad_norm": 6.21875,
      "learning_rate": 1.493941866584231e-05,
      "loss": 1.5799,
      "step": 804
    },
    {
      "epoch": 0.19346748671403832,
      "grad_norm": 8.5,
      "learning_rate": 1.4848096202463373e-05,
      "loss": 1.6519,
      "step": 810
    },
    {
      "epoch": 0.1949005792082164,
      "grad_norm": 4.59375,
      "learning_rate": 1.4756242090702756e-05,
      "loss": 1.5897,
      "step": 816
    },
    {
      "epoch": 0.19633367170239446,
      "grad_norm": 5.75,
      "learning_rate": 1.4663866403398915e-05,
      "loss": 1.6454,
      "step": 822
    },
    {
      "epoch": 0.19776676419657252,
      "grad_norm": 4.1875,
      "learning_rate": 1.4570979270586944e-05,
      "loss": 1.5361,
      "step": 828
    },
    {
      "epoch": 0.19919985669075058,
      "grad_norm": 5.375,
      "learning_rate": 1.4477590878387697e-05,
      "loss": 1.5086,
      "step": 834
    },
    {
      "epoch": 0.20063294918492863,
      "grad_norm": 4.375,
      "learning_rate": 1.4383711467890776e-05,
      "loss": 1.6474,
      "step": 840
    },
    {
      "epoch": 0.20206604167910672,
      "grad_norm": 4.6875,
      "learning_rate": 1.4289351334031461e-05,
      "loss": 1.465,
      "step": 846
    },
    {
      "epoch": 0.20349913417328477,
      "grad_norm": 8.6875,
      "learning_rate": 1.4194520824461773e-05,
      "loss": 1.5312,
      "step": 852
    },
    {
      "epoch": 0.20493222666746283,
      "grad_norm": 5.53125,
      "learning_rate": 1.4099230338415728e-05,
      "loss": 1.4775,
      "step": 858
    },
    {
      "epoch": 0.2063653191616409,
      "grad_norm": 9.8125,
      "learning_rate": 1.4003490325568953e-05,
      "loss": 1.8343,
      "step": 864
    },
    {
      "epoch": 0.20779841165581894,
      "grad_norm": 8.0625,
      "learning_rate": 1.3907311284892737e-05,
      "loss": 1.537,
      "step": 870
    },
    {
      "epoch": 0.20923150414999703,
      "grad_norm": 6.3125,
      "learning_rate": 1.3810703763502744e-05,
      "loss": 1.7239,
      "step": 876
    },
    {
      "epoch": 0.21066459664417508,
      "grad_norm": 5.75,
      "learning_rate": 1.371367835550235e-05,
      "loss": 1.5176,
      "step": 882
    },
    {
      "epoch": 0.21209768913835314,
      "grad_norm": 4.65625,
      "learning_rate": 1.3616245700820922e-05,
      "loss": 1.641,
      "step": 888
    },
    {
      "epoch": 0.2135307816325312,
      "grad_norm": 4.0625,
      "learning_rate": 1.3518416484047018e-05,
      "loss": 1.5882,
      "step": 894
    },
    {
      "epoch": 0.21496387412670925,
      "grad_norm": 5.09375,
      "learning_rate": 1.342020143325669e-05,
      "loss": 1.6042,
      "step": 900
    },
    {
      "epoch": 0.2163969666208873,
      "grad_norm": 5.84375,
      "learning_rate": 1.3321611318837033e-05,
      "loss": 1.5516,
      "step": 906
    },
    {
      "epoch": 0.2178300591150654,
      "grad_norm": 6.15625,
      "learning_rate": 1.3222656952305113e-05,
      "loss": 1.5349,
      "step": 912
    },
    {
      "epoch": 0.21926315160924345,
      "grad_norm": 5.21875,
      "learning_rate": 1.3123349185122328e-05,
      "loss": 1.6652,
      "step": 918
    },
    {
      "epoch": 0.2206962441034215,
      "grad_norm": 17.25,
      "learning_rate": 1.3023698907504447e-05,
      "loss": 1.7149,
      "step": 924
    },
    {
      "epoch": 0.22212933659759956,
      "grad_norm": 6.8125,
      "learning_rate": 1.2923717047227368e-05,
      "loss": 1.6285,
      "step": 930
    },
    {
      "epoch": 0.22356242909177762,
      "grad_norm": 4.1875,
      "learning_rate": 1.2823414568428767e-05,
      "loss": 1.5982,
      "step": 936
    },
    {
      "epoch": 0.2249955215859557,
      "grad_norm": 5.8125,
      "learning_rate": 1.2722802470405744e-05,
      "loss": 1.5901,
      "step": 942
    },
    {
      "epoch": 0.22642861408013376,
      "grad_norm": 4.75,
      "learning_rate": 1.2621891786408648e-05,
      "loss": 1.5705,
      "step": 948
    },
    {
      "epoch": 0.22786170657431182,
      "grad_norm": 10.1875,
      "learning_rate": 1.252069358243114e-05,
      "loss": 1.5263,
      "step": 954
    },
    {
      "epoch": 0.22929479906848987,
      "grad_norm": 3.671875,
      "learning_rate": 1.2419218955996677e-05,
      "loss": 1.5622,
      "step": 960
    },
    {
      "epoch": 0.23072789156266793,
      "grad_norm": 4.625,
      "learning_rate": 1.2317479034941572e-05,
      "loss": 1.5984,
      "step": 966
    },
    {
      "epoch": 0.23216098405684601,
      "grad_norm": 7.21875,
      "learning_rate": 1.2215484976194675e-05,
      "loss": 1.6465,
      "step": 972
    },
    {
      "epoch": 0.23359407655102407,
      "grad_norm": 6.59375,
      "learning_rate": 1.211324796455389e-05,
      "loss": 1.705,
      "step": 978
    },
    {
      "epoch": 0.23502716904520213,
      "grad_norm": 5.96875,
      "learning_rate": 1.2010779211459649e-05,
      "loss": 1.5316,
      "step": 984
    },
    {
      "epoch": 0.23646026153938018,
      "grad_norm": 5.3125,
      "learning_rate": 1.190808995376545e-05,
      "loss": 1.4676,
      "step": 990
    },
    {
      "epoch": 0.23789335403355824,
      "grad_norm": 4.9375,
      "learning_rate": 1.1805191452505602e-05,
      "loss": 1.5319,
      "step": 996
    },
    {
      "epoch": 0.2393264465277363,
      "grad_norm": 5.625,
      "learning_rate": 1.1702094991660326e-05,
      "loss": 1.6112,
      "step": 1002
    },
    {
      "epoch": 0.24075953902191438,
      "grad_norm": 4.71875,
      "learning_rate": 1.159881187691835e-05,
      "loss": 1.6341,
      "step": 1008
    },
    {
      "epoch": 0.24219263151609244,
      "grad_norm": 4.3125,
      "learning_rate": 1.1495353434437098e-05,
      "loss": 1.4623,
      "step": 1014
    },
    {
      "epoch": 0.2436257240102705,
      "grad_norm": 19.625,
      "learning_rate": 1.1391731009600655e-05,
      "loss": 1.4166,
      "step": 1020
    },
    {
      "epoch": 0.24505881650444855,
      "grad_norm": 4.0625,
      "learning_rate": 1.128795596577563e-05,
      "loss": 1.5813,
      "step": 1026
    },
    {
      "epoch": 0.2464919089986266,
      "grad_norm": 6.25,
      "learning_rate": 1.1184039683065014e-05,
      "loss": 1.5772,
      "step": 1032
    },
    {
      "epoch": 0.2479250014928047,
      "grad_norm": 5.53125,
      "learning_rate": 1.1079993557060228e-05,
      "loss": 1.401,
      "step": 1038
    },
    {
      "epoch": 0.24935809398698275,
      "grad_norm": 6.65625,
      "learning_rate": 1.0975828997591496e-05,
      "loss": 1.6248,
      "step": 1044
    },
    {
      "epoch": 0.2507911864811608,
      "grad_norm": 856.0,
      "learning_rate": 1.0871557427476585e-05,
      "loss": 1.775,
      "step": 1050
    },
    {
      "epoch": 0.2522242789753389,
      "grad_norm": 4.1875,
      "learning_rate": 1.0767190281268187e-05,
      "loss": 1.586,
      "step": 1056
    },
    {
      "epoch": 0.25365737146951695,
      "grad_norm": 3.53125,
      "learning_rate": 1.0662739004000005e-05,
      "loss": 1.5397,
      "step": 1062
    },
    {
      "epoch": 0.255090463963695,
      "grad_norm": 4.125,
      "learning_rate": 1.055821504993164e-05,
      "loss": 1.8712,
      "step": 1068
    },
    {
      "epoch": 0.25652355645787306,
      "grad_norm": 5.1875,
      "learning_rate": 1.0453629881292537e-05,
      "loss": 1.5357,
      "step": 1074
    },
    {
      "epoch": 0.2579566489520511,
      "grad_norm": 3.921875,
      "learning_rate": 1.0348994967025012e-05,
      "loss": 1.4033,
      "step": 1080
    },
    {
      "epoch": 0.25938974144622917,
      "grad_norm": 5.3125,
      "learning_rate": 1.0244321781526533e-05,
      "loss": 1.5611,
      "step": 1086
    },
    {
      "epoch": 0.26082283394040723,
      "grad_norm": 4.8125,
      "learning_rate": 1.0139621803391454e-05,
      "loss": 1.577,
      "step": 1092
    },
    {
      "epoch": 0.2622559264345853,
      "grad_norm": 5.46875,
      "learning_rate": 1.0034906514152239e-05,
      "loss": 1.5149,
      "step": 1098
    },
    {
      "epoch": 0.26368901892876334,
      "grad_norm": 6.4375,
      "learning_rate": 9.930187397020385e-06,
      "loss": 1.5796,
      "step": 1104
    },
    {
      "epoch": 0.2651221114229414,
      "grad_norm": 4.28125,
      "learning_rate": 9.825475935627165e-06,
      "loss": 1.5702,
      "step": 1110
    },
    {
      "epoch": 0.2665552039171195,
      "grad_norm": 5.34375,
      "learning_rate": 9.720783612764314e-06,
      "loss": 1.5354,
      "step": 1116
    },
    {
      "epoch": 0.26798829641129757,
      "grad_norm": 4.375,
      "learning_rate": 9.616121909124801e-06,
      "loss": 1.4122,
      "step": 1122
    },
    {
      "epoch": 0.2694213889054756,
      "grad_norm": 5.46875,
      "learning_rate": 9.511502302043867e-06,
      "loss": 1.6959,
      "step": 1128
    },
    {
      "epoch": 0.2708544813996537,
      "grad_norm": 8.4375,
      "learning_rate": 9.406936264240386e-06,
      "loss": 1.5493,
      "step": 1134
    },
    {
      "epoch": 0.27228757389383174,
      "grad_norm": 5.46875,
      "learning_rate": 9.302435262558748e-06,
      "loss": 1.4156,
      "step": 1140
    },
    {
      "epoch": 0.2737206663880098,
      "grad_norm": 720.0,
      "learning_rate": 9.198010756711413e-06,
      "loss": 1.567,
      "step": 1146
    },
    {
      "epoch": 0.27515375888218785,
      "grad_norm": 3.875,
      "learning_rate": 9.093674198022201e-06,
      "loss": 1.3814,
      "step": 1152
    },
    {
      "epoch": 0.2765868513763659,
      "grad_norm": 3.671875,
      "learning_rate": 8.989437028170537e-06,
      "loss": 1.4261,
      "step": 1158
    },
    {
      "epoch": 0.27801994387054396,
      "grad_norm": 10.375,
      "learning_rate": 8.885310677936746e-06,
      "loss": 1.506,
      "step": 1164
    },
    {
      "epoch": 0.279453036364722,
      "grad_norm": 3.46875,
      "learning_rate": 8.781306565948528e-06,
      "loss": 1.3967,
      "step": 1170
    },
    {
      "epoch": 0.2808861288589001,
      "grad_norm": 3.984375,
      "learning_rate": 8.677436097428775e-06,
      "loss": 1.5761,
      "step": 1176
    },
    {
      "epoch": 0.2823192213530782,
      "grad_norm": 3.484375,
      "learning_rate": 8.573710662944884e-06,
      "loss": 1.5428,
      "step": 1182
    },
    {
      "epoch": 0.28375231384725624,
      "grad_norm": 6.25,
      "learning_rate": 8.47014163715962e-06,
      "loss": 1.5426,
      "step": 1188
    },
    {
      "epoch": 0.2851854063414343,
      "grad_norm": 6.25,
      "learning_rate": 8.366740377583781e-06,
      "loss": 1.503,
      "step": 1194
    },
    {
      "epoch": 0.28661849883561236,
      "grad_norm": 3.828125,
      "learning_rate": 8.263518223330698e-06,
      "loss": 1.4355,
      "step": 1200
    },
    {
      "epoch": 0.28661849883561236,
      "eval_loss": 1.315157413482666,
      "eval_runtime": 223.8181,
      "eval_samples_per_second": 3.798,
      "eval_steps_per_second": 3.798,
      "step": 1200
    },
    {
      "epoch": 0.2880515913297904,
      "grad_norm": 5.625,
      "learning_rate": 8.1604864938728e-06,
      "loss": 1.4389,
      "step": 1206
    },
    {
      "epoch": 0.28948468382396847,
      "grad_norm": 5.0625,
      "learning_rate": 8.057656487800283e-06,
      "loss": 1.5346,
      "step": 1212
    },
    {
      "epoch": 0.2909177763181465,
      "grad_norm": 4.21875,
      "learning_rate": 7.955039481582098e-06,
      "loss": 1.4492,
      "step": 1218
    },
    {
      "epoch": 0.2923508688123246,
      "grad_norm": 4.9375,
      "learning_rate": 7.852646728329368e-06,
      "loss": 1.4305,
      "step": 1224
    },
    {
      "epoch": 0.29378396130650264,
      "grad_norm": 4.9375,
      "learning_rate": 7.750489456561351e-06,
      "loss": 1.607,
      "step": 1230
    },
    {
      "epoch": 0.2952170538006807,
      "grad_norm": 4.90625,
      "learning_rate": 7.6485788689741e-06,
      "loss": 1.3777,
      "step": 1236
    },
    {
      "epoch": 0.2966501462948588,
      "grad_norm": 5.875,
      "learning_rate": 7.546926141211975e-06,
      "loss": 1.5751,
      "step": 1242
    },
    {
      "epoch": 0.29808323878903686,
      "grad_norm": 4.8125,
      "learning_rate": 7.445542420642097e-06,
      "loss": 1.5106,
      "step": 1248
    },
    {
      "epoch": 0.2995163312832149,
      "grad_norm": 4.875,
      "learning_rate": 7.344438825131912e-06,
      "loss": 1.5982,
      "step": 1254
    },
    {
      "epoch": 0.300949423777393,
      "grad_norm": 5.09375,
      "learning_rate": 7.243626441830009e-06,
      "loss": 1.5328,
      "step": 1260
    },
    {
      "epoch": 0.30238251627157103,
      "grad_norm": 4.09375,
      "learning_rate": 7.143116325950266e-06,
      "loss": 1.6138,
      "step": 1266
    },
    {
      "epoch": 0.3038156087657491,
      "grad_norm": 3.8125,
      "learning_rate": 7.042919499559538e-06,
      "loss": 1.4547,
      "step": 1272
    },
    {
      "epoch": 0.30524870125992715,
      "grad_norm": 4.1875,
      "learning_rate": 6.943046950368944e-06,
      "loss": 1.4393,
      "step": 1278
    },
    {
      "epoch": 0.3066817937541052,
      "grad_norm": 5.34375,
      "learning_rate": 6.843509630528977e-06,
      "loss": 1.4009,
      "step": 1284
    },
    {
      "epoch": 0.30811488624828326,
      "grad_norm": 5.125,
      "learning_rate": 6.744318455428436e-06,
      "loss": 1.5134,
      "step": 1290
    },
    {
      "epoch": 0.3095479787424613,
      "grad_norm": 4.96875,
      "learning_rate": 6.645484302497452e-06,
      "loss": 1.5411,
      "step": 1296
    },
    {
      "epoch": 0.3109810712366394,
      "grad_norm": 4.9375,
      "learning_rate": 6.547018010014654e-06,
      "loss": 1.5058,
      "step": 1302
    },
    {
      "epoch": 0.3124141637308175,
      "grad_norm": 3.59375,
      "learning_rate": 6.448930375918632e-06,
      "loss": 1.4026,
      "step": 1308
    },
    {
      "epoch": 0.31384725622499554,
      "grad_norm": 4.78125,
      "learning_rate": 6.351232156623803e-06,
      "loss": 1.3993,
      "step": 1314
    },
    {
      "epoch": 0.3152803487191736,
      "grad_norm": 4.21875,
      "learning_rate": 6.25393406584088e-06,
      "loss": 1.6574,
      "step": 1320
    },
    {
      "epoch": 0.31671344121335165,
      "grad_norm": 4.40625,
      "learning_rate": 6.157046773401964e-06,
      "loss": 1.5233,
      "step": 1326
    },
    {
      "epoch": 0.3181465337075297,
      "grad_norm": 5.25,
      "learning_rate": 6.06058090409049e-06,
      "loss": 1.5095,
      "step": 1332
    },
    {
      "epoch": 0.31957962620170777,
      "grad_norm": 4.625,
      "learning_rate": 5.9645470364761e-06,
      "loss": 1.3797,
      "step": 1338
    },
    {
      "epoch": 0.3210127186958858,
      "grad_norm": 5.84375,
      "learning_rate": 5.868955701754584e-06,
      "loss": 1.6089,
      "step": 1344
    },
    {
      "epoch": 0.3224458111900639,
      "grad_norm": 3.71875,
      "learning_rate": 5.773817382593008e-06,
      "loss": 1.4297,
      "step": 1350
    },
    {
      "epoch": 0.32387890368424194,
      "grad_norm": 3.578125,
      "learning_rate": 5.679142511980176e-06,
      "loss": 1.327,
      "step": 1356
    },
    {
      "epoch": 0.32531199617842,
      "grad_norm": 4.6875,
      "learning_rate": 5.584941472082549e-06,
      "loss": 1.4878,
      "step": 1362
    },
    {
      "epoch": 0.32674508867259805,
      "grad_norm": 5.125,
      "learning_rate": 5.491224593105695e-06,
      "loss": 1.4593,
      "step": 1368
    },
    {
      "epoch": 0.32817818116677616,
      "grad_norm": 7.1875,
      "learning_rate": 5.398002152161484e-06,
      "loss": 1.5287,
      "step": 1374
    },
    {
      "epoch": 0.3296112736609542,
      "grad_norm": 5.71875,
      "learning_rate": 5.305284372141095e-06,
      "loss": 1.4808,
      "step": 1380
    },
    {
      "epoch": 0.3310443661551323,
      "grad_norm": 4.09375,
      "learning_rate": 5.213081420593933e-06,
      "loss": 1.4244,
      "step": 1386
    },
    {
      "epoch": 0.33247745864931033,
      "grad_norm": 9.5,
      "learning_rate": 5.121403408612672e-06,
      "loss": 1.5213,
      "step": 1392
    },
    {
      "epoch": 0.3339105511434884,
      "grad_norm": 5.09375,
      "learning_rate": 5.030260389724447e-06,
      "loss": 1.4455,
      "step": 1398
    },
    {
      "epoch": 0.33534364363766644,
      "grad_norm": 6.6875,
      "learning_rate": 4.939662358788364e-06,
      "loss": 1.5983,
      "step": 1404
    },
    {
      "epoch": 0.3367767361318445,
      "grad_norm": 4.96875,
      "learning_rate": 4.849619250899458e-06,
      "loss": 1.3544,
      "step": 1410
    },
    {
      "epoch": 0.33820982862602256,
      "grad_norm": 4.65625,
      "learning_rate": 4.76014094029921e-06,
      "loss": 1.4412,
      "step": 1416
    },
    {
      "epoch": 0.3396429211202006,
      "grad_norm": 6.40625,
      "learning_rate": 4.671237239292699e-06,
      "loss": 1.4463,
      "step": 1422
    },
    {
      "epoch": 0.34107601361437867,
      "grad_norm": 5.25,
      "learning_rate": 4.582917897172603e-06,
      "loss": 1.5306,
      "step": 1428
    },
    {
      "epoch": 0.3425091061085568,
      "grad_norm": 4.40625,
      "learning_rate": 4.495192599150045e-06,
      "loss": 1.5532,
      "step": 1434
    },
    {
      "epoch": 0.34394219860273484,
      "grad_norm": 5.15625,
      "learning_rate": 4.408070965292534e-06,
      "loss": 1.4818,
      "step": 1440
    },
    {
      "epoch": 0.3453752910969129,
      "grad_norm": 4.125,
      "learning_rate": 4.321562549468991e-06,
      "loss": 1.4144,
      "step": 1446
    },
    {
      "epoch": 0.34680838359109095,
      "grad_norm": 4.28125,
      "learning_rate": 4.235676838302069e-06,
      "loss": 1.4173,
      "step": 1452
    },
    {
      "epoch": 0.348241476085269,
      "grad_norm": 8.5,
      "learning_rate": 4.150423250127846e-06,
      "loss": 1.4121,
      "step": 1458
    },
    {
      "epoch": 0.34967456857944706,
      "grad_norm": 5.90625,
      "learning_rate": 4.065811133962987e-06,
      "loss": 1.4121,
      "step": 1464
    },
    {
      "epoch": 0.3511076610736251,
      "grad_norm": 4.625,
      "learning_rate": 3.981849768479516e-06,
      "loss": 1.3973,
      "step": 1470
    },
    {
      "epoch": 0.3525407535678032,
      "grad_norm": 5.1875,
      "learning_rate": 3.898548360987325e-06,
      "loss": 1.4554,
      "step": 1476
    },
    {
      "epoch": 0.35397384606198123,
      "grad_norm": 5.40625,
      "learning_rate": 3.81591604642446e-06,
      "loss": 1.4958,
      "step": 1482
    },
    {
      "epoch": 0.3554069385561593,
      "grad_norm": 5.28125,
      "learning_rate": 3.7339618863553983e-06,
      "loss": 1.4843,
      "step": 1488
    },
    {
      "epoch": 0.35684003105033735,
      "grad_norm": 5.96875,
      "learning_rate": 3.6526948679773256e-06,
      "loss": 1.6051,
      "step": 1494
    },
    {
      "epoch": 0.35827312354451546,
      "grad_norm": 3.6875,
      "learning_rate": 3.5721239031346067e-06,
      "loss": 1.4176,
      "step": 1500
    },
    {
      "epoch": 0.3597062160386935,
      "grad_norm": 4.375,
      "learning_rate": 3.492257827341492e-06,
      "loss": 1.4049,
      "step": 1506
    },
    {
      "epoch": 0.3611393085328716,
      "grad_norm": 3.71875,
      "learning_rate": 3.4131053988131947e-06,
      "loss": 1.5823,
      "step": 1512
    },
    {
      "epoch": 0.36257240102704963,
      "grad_norm": 6.0,
      "learning_rate": 3.3346752975054763e-06,
      "loss": 1.4469,
      "step": 1518
    },
    {
      "epoch": 0.3640054935212277,
      "grad_norm": 4.21875,
      "learning_rate": 3.2569761241627694e-06,
      "loss": 1.4373,
      "step": 1524
    },
    {
      "epoch": 0.36543858601540574,
      "grad_norm": 6.03125,
      "learning_rate": 3.1800163993750166e-06,
      "loss": 1.4823,
      "step": 1530
    },
    {
      "epoch": 0.3668716785095838,
      "grad_norm": 4.625,
      "learning_rate": 3.103804562643302e-06,
      "loss": 1.4585,
      "step": 1536
    },
    {
      "epoch": 0.36830477100376185,
      "grad_norm": 4.28125,
      "learning_rate": 3.028348971454356e-06,
      "loss": 1.4233,
      "step": 1542
    },
    {
      "epoch": 0.3697378634979399,
      "grad_norm": 14.625,
      "learning_rate": 2.953657900364053e-06,
      "loss": 1.4869,
      "step": 1548
    },
    {
      "epoch": 0.37117095599211797,
      "grad_norm": 4.1875,
      "learning_rate": 2.8797395400900362e-06,
      "loss": 1.5315,
      "step": 1554
    },
    {
      "epoch": 0.3726040484862961,
      "grad_norm": 4.125,
      "learning_rate": 2.8066019966134907e-06,
      "loss": 1.4887,
      "step": 1560
    },
    {
      "epoch": 0.37403714098047414,
      "grad_norm": 3.796875,
      "learning_rate": 2.7342532902902418e-06,
      "loss": 1.4533,
      "step": 1566
    },
    {
      "epoch": 0.3754702334746522,
      "grad_norm": 4.03125,
      "learning_rate": 2.6627013549712355e-06,
      "loss": 1.4017,
      "step": 1572
    },
    {
      "epoch": 0.37690332596883025,
      "grad_norm": 6.84375,
      "learning_rate": 2.5919540371325005e-06,
      "loss": 1.3971,
      "step": 1578
    },
    {
      "epoch": 0.3783364184630083,
      "grad_norm": 5.5625,
      "learning_rate": 2.522019095014683e-06,
      "loss": 1.5576,
      "step": 1584
    },
    {
      "epoch": 0.37976951095718636,
      "grad_norm": 10.875,
      "learning_rate": 2.45290419777228e-06,
      "loss": 1.4719,
      "step": 1590
    },
    {
      "epoch": 0.3812026034513644,
      "grad_norm": 5.15625,
      "learning_rate": 2.3846169246326345e-06,
      "loss": 1.4618,
      "step": 1596
    },
    {
      "epoch": 0.3821579984474831,
      "eval_loss": 1.2876688241958618,
      "eval_runtime": 226.2654,
      "eval_samples_per_second": 3.757,
      "eval_steps_per_second": 3.757,
      "step": 1600
    }
  ],
  "logging_steps": 6,
  "max_steps": 2000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 400,
  "total_flos": 2.9553261973639004e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}