{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.3821579984474831,
"eval_steps": 400,
"global_step": 1600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00023884874902967696,
"eval_loss": 1.5979785919189453,
"eval_runtime": 224.9995,
"eval_samples_per_second": 3.778,
"eval_steps_per_second": 3.778,
"step": 1
},
{
"epoch": 0.0014330924941780617,
"grad_norm": 20.875,
"learning_rate": 6.000000000000001e-07,
"loss": 1.8691,
"step": 6
},
{
"epoch": 0.0028661849883561234,
"grad_norm": 14.0625,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.8156,
"step": 12
},
{
"epoch": 0.004299277482534185,
"grad_norm": 11.1875,
"learning_rate": 1.8000000000000001e-06,
"loss": 1.6925,
"step": 18
},
{
"epoch": 0.005732369976712247,
"grad_norm": 7.15625,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.612,
"step": 24
},
{
"epoch": 0.0071654624708903086,
"grad_norm": 7.25,
"learning_rate": 3e-06,
"loss": 1.8222,
"step": 30
},
{
"epoch": 0.00859855496506837,
"grad_norm": 5.71875,
"learning_rate": 3.6000000000000003e-06,
"loss": 1.6277,
"step": 36
},
{
"epoch": 0.010031647459246432,
"grad_norm": 5.65625,
"learning_rate": 4.2000000000000004e-06,
"loss": 1.5655,
"step": 42
},
{
"epoch": 0.011464739953424494,
"grad_norm": 6.90625,
"learning_rate": 4.800000000000001e-06,
"loss": 1.7691,
"step": 48
},
{
"epoch": 0.012897832447602555,
"grad_norm": 6.96875,
"learning_rate": 5.400000000000001e-06,
"loss": 1.7085,
"step": 54
},
{
"epoch": 0.014330924941780617,
"grad_norm": 5.3125,
"learning_rate": 6e-06,
"loss": 1.4649,
"step": 60
},
{
"epoch": 0.01576401743595868,
"grad_norm": 15.8125,
"learning_rate": 6.600000000000001e-06,
"loss": 1.6534,
"step": 66
},
{
"epoch": 0.01719710993013674,
"grad_norm": 42.75,
"learning_rate": 7.2000000000000005e-06,
"loss": 1.673,
"step": 72
},
{
"epoch": 0.018630202424314804,
"grad_norm": 5.5,
"learning_rate": 7.800000000000002e-06,
"loss": 1.429,
"step": 78
},
{
"epoch": 0.020063294918492864,
"grad_norm": 3.875,
"learning_rate": 8.400000000000001e-06,
"loss": 1.6067,
"step": 84
},
{
"epoch": 0.021496387412670927,
"grad_norm": 4.53125,
"learning_rate": 9e-06,
"loss": 1.4336,
"step": 90
},
{
"epoch": 0.022929479906848987,
"grad_norm": 4.40625,
"learning_rate": 9.600000000000001e-06,
"loss": 1.5998,
"step": 96
},
{
"epoch": 0.02436257240102705,
"grad_norm": 5.40625,
"learning_rate": 1.02e-05,
"loss": 1.5259,
"step": 102
},
{
"epoch": 0.02579566489520511,
"grad_norm": 9.0,
"learning_rate": 1.0800000000000002e-05,
"loss": 1.5255,
"step": 108
},
{
"epoch": 0.027228757389383174,
"grad_norm": 5.34375,
"learning_rate": 1.14e-05,
"loss": 1.5375,
"step": 114
},
{
"epoch": 0.028661849883561234,
"grad_norm": 4.625,
"learning_rate": 1.2e-05,
"loss": 1.4729,
"step": 120
},
{
"epoch": 0.030094942377739298,
"grad_norm": 5.78125,
"learning_rate": 1.2600000000000001e-05,
"loss": 1.5446,
"step": 126
},
{
"epoch": 0.03152803487191736,
"grad_norm": 5.15625,
"learning_rate": 1.3200000000000002e-05,
"loss": 1.6895,
"step": 132
},
{
"epoch": 0.03296112736609542,
"grad_norm": 4.59375,
"learning_rate": 1.38e-05,
"loss": 1.6145,
"step": 138
},
{
"epoch": 0.03439421986027348,
"grad_norm": 4.96875,
"learning_rate": 1.4400000000000001e-05,
"loss": 1.4316,
"step": 144
},
{
"epoch": 0.035827312354451545,
"grad_norm": 4.71875,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.5619,
"step": 150
},
{
"epoch": 0.03726040484862961,
"grad_norm": 7.9375,
"learning_rate": 1.5600000000000003e-05,
"loss": 1.6608,
"step": 156
},
{
"epoch": 0.038693497342807665,
"grad_norm": 4.34375,
"learning_rate": 1.62e-05,
"loss": 1.6418,
"step": 162
},
{
"epoch": 0.04012658983698573,
"grad_norm": 4.8125,
"learning_rate": 1.6800000000000002e-05,
"loss": 1.5532,
"step": 168
},
{
"epoch": 0.04155968233116379,
"grad_norm": 7.90625,
"learning_rate": 1.7400000000000003e-05,
"loss": 1.6124,
"step": 174
},
{
"epoch": 0.042992774825341855,
"grad_norm": 5.90625,
"learning_rate": 1.8e-05,
"loss": 1.5629,
"step": 180
},
{
"epoch": 0.04442586731951991,
"grad_norm": 9.4375,
"learning_rate": 1.86e-05,
"loss": 1.5727,
"step": 186
},
{
"epoch": 0.045858959813697975,
"grad_norm": 6.34375,
"learning_rate": 1.9200000000000003e-05,
"loss": 1.4866,
"step": 192
},
{
"epoch": 0.04729205230787604,
"grad_norm": 10.9375,
"learning_rate": 1.98e-05,
"loss": 1.6203,
"step": 198
},
{
"epoch": 0.0487251448020541,
"grad_norm": 5.46875,
"learning_rate": 1.9999756307053947e-05,
"loss": 1.6003,
"step": 204
},
{
"epoch": 0.05015823729623216,
"grad_norm": 7.34375,
"learning_rate": 1.9998476951563914e-05,
"loss": 1.7795,
"step": 210
},
{
"epoch": 0.05159132979041022,
"grad_norm": 5.03125,
"learning_rate": 1.9996101150403543e-05,
"loss": 1.6262,
"step": 216
},
{
"epoch": 0.053024422284588285,
"grad_norm": 6.03125,
"learning_rate": 1.999262916410621e-05,
"loss": 1.5033,
"step": 222
},
{
"epoch": 0.05445751477876635,
"grad_norm": 6.375,
"learning_rate": 1.9988061373414342e-05,
"loss": 1.528,
"step": 228
},
{
"epoch": 0.055890607272944405,
"grad_norm": 5.375,
"learning_rate": 1.9982398279237657e-05,
"loss": 1.6706,
"step": 234
},
{
"epoch": 0.05732369976712247,
"grad_norm": 5.3125,
"learning_rate": 1.9975640502598243e-05,
"loss": 1.8826,
"step": 240
},
{
"epoch": 0.05875679226130053,
"grad_norm": 7.21875,
"learning_rate": 1.9967788784562474e-05,
"loss": 1.6844,
"step": 246
},
{
"epoch": 0.060189884755478595,
"grad_norm": 14.0,
"learning_rate": 1.9958843986159705e-05,
"loss": 1.6681,
"step": 252
},
{
"epoch": 0.06162297724965665,
"grad_norm": 5.3125,
"learning_rate": 1.9948807088287884e-05,
"loss": 1.5271,
"step": 258
},
{
"epoch": 0.06305606974383472,
"grad_norm": 5.78125,
"learning_rate": 1.9937679191605964e-05,
"loss": 1.5941,
"step": 264
},
{
"epoch": 0.06448916223801278,
"grad_norm": 7.75,
"learning_rate": 1.9925461516413224e-05,
"loss": 1.6754,
"step": 270
},
{
"epoch": 0.06592225473219084,
"grad_norm": 5.03125,
"learning_rate": 1.991215540251542e-05,
"loss": 1.6616,
"step": 276
},
{
"epoch": 0.0673553472263689,
"grad_norm": 5.46875,
"learning_rate": 1.989776230907789e-05,
"loss": 1.7207,
"step": 282
},
{
"epoch": 0.06878843972054696,
"grad_norm": 4.84375,
"learning_rate": 1.988228381446553e-05,
"loss": 1.6092,
"step": 288
},
{
"epoch": 0.07022153221472502,
"grad_norm": 15.625,
"learning_rate": 1.9865721616069695e-05,
"loss": 1.6828,
"step": 294
},
{
"epoch": 0.07165462470890309,
"grad_norm": 7.125,
"learning_rate": 1.9848077530122083e-05,
"loss": 1.7341,
"step": 300
},
{
"epoch": 0.07308771720308115,
"grad_norm": 10.625,
"learning_rate": 1.9829353491495545e-05,
"loss": 1.6181,
"step": 306
},
{
"epoch": 0.07452080969725922,
"grad_norm": 4.75,
"learning_rate": 1.9809551553491918e-05,
"loss": 1.548,
"step": 312
},
{
"epoch": 0.07595390219143727,
"grad_norm": 6.9375,
"learning_rate": 1.9788673887616852e-05,
"loss": 1.5703,
"step": 318
},
{
"epoch": 0.07738699468561533,
"grad_norm": 6.71875,
"learning_rate": 1.9766722783341682e-05,
"loss": 1.7147,
"step": 324
},
{
"epoch": 0.0788200871797934,
"grad_norm": 6.8125,
"learning_rate": 1.9743700647852356e-05,
"loss": 1.7598,
"step": 330
},
{
"epoch": 0.08025317967397146,
"grad_norm": 5.0625,
"learning_rate": 1.9719610005785466e-05,
"loss": 1.7136,
"step": 336
},
{
"epoch": 0.08168627216814951,
"grad_norm": 6.03125,
"learning_rate": 1.9694453498951392e-05,
"loss": 1.7161,
"step": 342
},
{
"epoch": 0.08311936466232758,
"grad_norm": 7.34375,
"learning_rate": 1.9668233886044597e-05,
"loss": 1.6319,
"step": 348
},
{
"epoch": 0.08455245715650564,
"grad_norm": 5.21875,
"learning_rate": 1.96409540423411e-05,
"loss": 1.5857,
"step": 354
},
{
"epoch": 0.08598554965068371,
"grad_norm": 10.6875,
"learning_rate": 1.961261695938319e-05,
"loss": 1.7632,
"step": 360
},
{
"epoch": 0.08741864214486177,
"grad_norm": 6.21875,
"learning_rate": 1.9583225744651334e-05,
"loss": 1.4205,
"step": 366
},
{
"epoch": 0.08885173463903982,
"grad_norm": 5.875,
"learning_rate": 1.9552783621223437e-05,
"loss": 1.7812,
"step": 372
},
{
"epoch": 0.0902848271332179,
"grad_norm": 4.46875,
"learning_rate": 1.9521293927421388e-05,
"loss": 1.5759,
"step": 378
},
{
"epoch": 0.09171791962739595,
"grad_norm": 6.53125,
"learning_rate": 1.9488760116444966e-05,
"loss": 1.6537,
"step": 384
},
{
"epoch": 0.09315101212157402,
"grad_norm": 10.8125,
"learning_rate": 1.945518575599317e-05,
"loss": 1.4973,
"step": 390
},
{
"epoch": 0.09458410461575208,
"grad_norm": 4.1875,
"learning_rate": 1.942057452787297e-05,
"loss": 1.578,
"step": 396
},
{
"epoch": 0.09553949961187078,
"eval_loss": 1.4027706384658813,
"eval_runtime": 224.2305,
"eval_samples_per_second": 3.791,
"eval_steps_per_second": 3.791,
"step": 400
},
{
"epoch": 0.09601719710993013,
"grad_norm": 3.875,
"learning_rate": 1.938493022759556e-05,
"loss": 1.6032,
"step": 402
},
{
"epoch": 0.0974502896041082,
"grad_norm": 6.125,
"learning_rate": 1.9348256763960146e-05,
"loss": 1.7055,
"step": 408
},
{
"epoch": 0.09888338209828626,
"grad_norm": 5.84375,
"learning_rate": 1.9310558158625286e-05,
"loss": 1.7454,
"step": 414
},
{
"epoch": 0.10031647459246432,
"grad_norm": 7.0625,
"learning_rate": 1.9271838545667876e-05,
"loss": 1.7345,
"step": 420
},
{
"epoch": 0.10174956708664239,
"grad_norm": 6.125,
"learning_rate": 1.923210217112981e-05,
"loss": 1.6099,
"step": 426
},
{
"epoch": 0.10318265958082044,
"grad_norm": 4.59375,
"learning_rate": 1.9191353392552346e-05,
"loss": 1.652,
"step": 432
},
{
"epoch": 0.10461575207499851,
"grad_norm": 5.96875,
"learning_rate": 1.914959667849825e-05,
"loss": 1.7092,
"step": 438
},
{
"epoch": 0.10604884456917657,
"grad_norm": 6.4375,
"learning_rate": 1.910683660806177e-05,
"loss": 1.7545,
"step": 444
},
{
"epoch": 0.10748193706335463,
"grad_norm": 10.4375,
"learning_rate": 1.9063077870366504e-05,
"loss": 1.5287,
"step": 450
},
{
"epoch": 0.1089150295575327,
"grad_norm": 7.84375,
"learning_rate": 1.901832526405114e-05,
"loss": 1.7219,
"step": 456
},
{
"epoch": 0.11034812205171075,
"grad_norm": 9.5625,
"learning_rate": 1.8972583696743284e-05,
"loss": 1.665,
"step": 462
},
{
"epoch": 0.11178121454588881,
"grad_norm": 10.0625,
"learning_rate": 1.892585818452126e-05,
"loss": 1.6363,
"step": 468
},
{
"epoch": 0.11321430704006688,
"grad_norm": 5.78125,
"learning_rate": 1.8878153851364013e-05,
"loss": 1.543,
"step": 474
},
{
"epoch": 0.11464739953424494,
"grad_norm": 6.125,
"learning_rate": 1.8829475928589272e-05,
"loss": 1.5826,
"step": 480
},
{
"epoch": 0.11608049202842301,
"grad_norm": 4.8125,
"learning_rate": 1.8779829754279806e-05,
"loss": 1.581,
"step": 486
},
{
"epoch": 0.11751358452260106,
"grad_norm": 9.75,
"learning_rate": 1.8729220772698096e-05,
"loss": 1.5841,
"step": 492
},
{
"epoch": 0.11894667701677912,
"grad_norm": 13.3125,
"learning_rate": 1.8677654533689287e-05,
"loss": 1.6944,
"step": 498
},
{
"epoch": 0.12037976951095719,
"grad_norm": 4.96875,
"learning_rate": 1.8625136692072577e-05,
"loss": 1.6203,
"step": 504
},
{
"epoch": 0.12181286200513525,
"grad_norm": 6.3125,
"learning_rate": 1.8571673007021124e-05,
"loss": 1.5639,
"step": 510
},
{
"epoch": 0.1232459544993133,
"grad_norm": 5.5,
"learning_rate": 1.851726934143048e-05,
"loss": 1.6397,
"step": 516
},
{
"epoch": 0.12467904699349137,
"grad_norm": 5.125,
"learning_rate": 1.8461931661275642e-05,
"loss": 1.7315,
"step": 522
},
{
"epoch": 0.12611213948766944,
"grad_norm": 6.25,
"learning_rate": 1.8405666034956842e-05,
"loss": 1.7201,
"step": 528
},
{
"epoch": 0.1275452319818475,
"grad_norm": 8.9375,
"learning_rate": 1.8348478632634067e-05,
"loss": 1.6047,
"step": 534
},
{
"epoch": 0.12897832447602556,
"grad_norm": 46.25,
"learning_rate": 1.8290375725550417e-05,
"loss": 1.6949,
"step": 540
},
{
"epoch": 0.13041141697020361,
"grad_norm": 5.9375,
"learning_rate": 1.8231363685344422e-05,
"loss": 1.7245,
"step": 546
},
{
"epoch": 0.13184450946438167,
"grad_norm": 5.78125,
"learning_rate": 1.8171448983351284e-05,
"loss": 1.641,
"step": 552
},
{
"epoch": 0.13327760195855975,
"grad_norm": 24.125,
"learning_rate": 1.8110638189893267e-05,
"loss": 1.6125,
"step": 558
},
{
"epoch": 0.1347106944527378,
"grad_norm": 6.4375,
"learning_rate": 1.804893797355914e-05,
"loss": 1.6647,
"step": 564
},
{
"epoch": 0.13614378694691587,
"grad_norm": 6.34375,
"learning_rate": 1.798635510047293e-05,
"loss": 1.7073,
"step": 570
},
{
"epoch": 0.13757687944109392,
"grad_norm": 6.1875,
"learning_rate": 1.792289643355191e-05,
"loss": 1.6271,
"step": 576
},
{
"epoch": 0.13900997193527198,
"grad_norm": 5.0625,
"learning_rate": 1.785856893175402e-05,
"loss": 1.6317,
"step": 582
},
{
"epoch": 0.14044306442945004,
"grad_norm": 4.6875,
"learning_rate": 1.7793379649314743e-05,
"loss": 1.6578,
"step": 588
},
{
"epoch": 0.14187615692362812,
"grad_norm": 4.84375,
"learning_rate": 1.7727335734973512e-05,
"loss": 1.6554,
"step": 594
},
{
"epoch": 0.14330924941780618,
"grad_norm": 6.1875,
"learning_rate": 1.766044443118978e-05,
"loss": 1.5523,
"step": 600
},
{
"epoch": 0.14474234191198423,
"grad_norm": 23.375,
"learning_rate": 1.759271307334881e-05,
"loss": 1.616,
"step": 606
},
{
"epoch": 0.1461754344061623,
"grad_norm": 6.9375,
"learning_rate": 1.7524149088957244e-05,
"loss": 1.7729,
"step": 612
},
{
"epoch": 0.14760852690034035,
"grad_norm": 10.25,
"learning_rate": 1.7454759996828622e-05,
"loss": 1.5922,
"step": 618
},
{
"epoch": 0.14904161939451843,
"grad_norm": 7.21875,
"learning_rate": 1.7384553406258842e-05,
"loss": 1.583,
"step": 624
},
{
"epoch": 0.1504747118886965,
"grad_norm": 6.9375,
"learning_rate": 1.7313537016191706e-05,
"loss": 1.6019,
"step": 630
},
{
"epoch": 0.15190780438287455,
"grad_norm": 11.5,
"learning_rate": 1.7241718614374678e-05,
"loss": 1.6195,
"step": 636
},
{
"epoch": 0.1533408968770526,
"grad_norm": 5.5,
"learning_rate": 1.716910607650483e-05,
"loss": 1.5012,
"step": 642
},
{
"epoch": 0.15477398937123066,
"grad_norm": 6.71875,
"learning_rate": 1.709570736536521e-05,
"loss": 1.7686,
"step": 648
},
{
"epoch": 0.15620708186540874,
"grad_norm": 5.71875,
"learning_rate": 1.7021530529951627e-05,
"loss": 1.7922,
"step": 654
},
{
"epoch": 0.1576401743595868,
"grad_norm": 7.8125,
"learning_rate": 1.6946583704589973e-05,
"loss": 1.623,
"step": 660
},
{
"epoch": 0.15907326685376486,
"grad_norm": 6.34375,
"learning_rate": 1.6870875108044233e-05,
"loss": 1.6039,
"step": 666
},
{
"epoch": 0.1605063593479429,
"grad_norm": 6.46875,
"learning_rate": 1.6794413042615168e-05,
"loss": 1.6392,
"step": 672
},
{
"epoch": 0.16193945184212097,
"grad_norm": 5.4375,
"learning_rate": 1.6717205893229904e-05,
"loss": 1.5683,
"step": 678
},
{
"epoch": 0.16337254433629902,
"grad_norm": 4.78125,
"learning_rate": 1.6639262126522417e-05,
"loss": 1.6165,
"step": 684
},
{
"epoch": 0.1648056368304771,
"grad_norm": 5.4375,
"learning_rate": 1.6560590289905074e-05,
"loss": 1.5341,
"step": 690
},
{
"epoch": 0.16623872932465517,
"grad_norm": 5.25,
"learning_rate": 1.6481199010631312e-05,
"loss": 1.6573,
"step": 696
},
{
"epoch": 0.16767182181883322,
"grad_norm": 5.21875,
"learning_rate": 1.6401096994849558e-05,
"loss": 1.5056,
"step": 702
},
{
"epoch": 0.16910491431301128,
"grad_norm": 12.625,
"learning_rate": 1.632029302664851e-05,
"loss": 1.5337,
"step": 708
},
{
"epoch": 0.17053800680718934,
"grad_norm": 4.28125,
"learning_rate": 1.6238795967093865e-05,
"loss": 1.5038,
"step": 714
},
{
"epoch": 0.17197109930136742,
"grad_norm": 6.96875,
"learning_rate": 1.6156614753256583e-05,
"loss": 1.5587,
"step": 720
},
{
"epoch": 0.17340419179554548,
"grad_norm": 4.90625,
"learning_rate": 1.607375839723287e-05,
"loss": 1.563,
"step": 726
},
{
"epoch": 0.17483728428972353,
"grad_norm": 5.34375,
"learning_rate": 1.599023598515586e-05,
"loss": 1.6058,
"step": 732
},
{
"epoch": 0.1762703767839016,
"grad_norm": 5.25,
"learning_rate": 1.5906056676199256e-05,
"loss": 1.7244,
"step": 738
},
{
"epoch": 0.17770346927807965,
"grad_norm": 4.5,
"learning_rate": 1.5821229701572897e-05,
"loss": 1.6587,
"step": 744
},
{
"epoch": 0.17913656177225773,
"grad_norm": 12.75,
"learning_rate": 1.573576436351046e-05,
"loss": 1.6018,
"step": 750
},
{
"epoch": 0.1805696542664358,
"grad_norm": 6.0,
"learning_rate": 1.564967003424938e-05,
"loss": 1.6205,
"step": 756
},
{
"epoch": 0.18200274676061384,
"grad_norm": 5.59375,
"learning_rate": 1.556295615500305e-05,
"loss": 1.6345,
"step": 762
},
{
"epoch": 0.1834358392547919,
"grad_norm": 4.59375,
"learning_rate": 1.5475632234925505e-05,
"loss": 1.5226,
"step": 768
},
{
"epoch": 0.18486893174896996,
"grad_norm": 4.78125,
"learning_rate": 1.5387707850068633e-05,
"loss": 1.6488,
"step": 774
},
{
"epoch": 0.18630202424314804,
"grad_norm": 4.28125,
"learning_rate": 1.529919264233205e-05,
"loss": 1.5393,
"step": 780
},
{
"epoch": 0.1877351167373261,
"grad_norm": 7.625,
"learning_rate": 1.5210096318405768e-05,
"loss": 1.5374,
"step": 786
},
{
"epoch": 0.18916820923150415,
"grad_norm": 4.21875,
"learning_rate": 1.5120428648705716e-05,
"loss": 1.4963,
"step": 792
},
{
"epoch": 0.1906013017256822,
"grad_norm": 4.25,
"learning_rate": 1.5030199466302354e-05,
"loss": 1.5828,
"step": 798
},
{
"epoch": 0.19107899922374155,
"eval_loss": 1.3809266090393066,
"eval_runtime": 223.0505,
"eval_samples_per_second": 3.811,
"eval_steps_per_second": 3.811,
"step": 800
},
{
"epoch": 0.19203439421986027,
"grad_norm": 6.21875,
"learning_rate": 1.493941866584231e-05,
"loss": 1.5799,
"step": 804
},
{
"epoch": 0.19346748671403832,
"grad_norm": 8.5,
"learning_rate": 1.4848096202463373e-05,
"loss": 1.6519,
"step": 810
},
{
"epoch": 0.1949005792082164,
"grad_norm": 4.59375,
"learning_rate": 1.4756242090702756e-05,
"loss": 1.5897,
"step": 816
},
{
"epoch": 0.19633367170239446,
"grad_norm": 5.75,
"learning_rate": 1.4663866403398915e-05,
"loss": 1.6454,
"step": 822
},
{
"epoch": 0.19776676419657252,
"grad_norm": 4.1875,
"learning_rate": 1.4570979270586944e-05,
"loss": 1.5361,
"step": 828
},
{
"epoch": 0.19919985669075058,
"grad_norm": 5.375,
"learning_rate": 1.4477590878387697e-05,
"loss": 1.5086,
"step": 834
},
{
"epoch": 0.20063294918492863,
"grad_norm": 4.375,
"learning_rate": 1.4383711467890776e-05,
"loss": 1.6474,
"step": 840
},
{
"epoch": 0.20206604167910672,
"grad_norm": 4.6875,
"learning_rate": 1.4289351334031461e-05,
"loss": 1.465,
"step": 846
},
{
"epoch": 0.20349913417328477,
"grad_norm": 8.6875,
"learning_rate": 1.4194520824461773e-05,
"loss": 1.5312,
"step": 852
},
{
"epoch": 0.20493222666746283,
"grad_norm": 5.53125,
"learning_rate": 1.4099230338415728e-05,
"loss": 1.4775,
"step": 858
},
{
"epoch": 0.2063653191616409,
"grad_norm": 9.8125,
"learning_rate": 1.4003490325568953e-05,
"loss": 1.8343,
"step": 864
},
{
"epoch": 0.20779841165581894,
"grad_norm": 8.0625,
"learning_rate": 1.3907311284892737e-05,
"loss": 1.537,
"step": 870
},
{
"epoch": 0.20923150414999703,
"grad_norm": 6.3125,
"learning_rate": 1.3810703763502744e-05,
"loss": 1.7239,
"step": 876
},
{
"epoch": 0.21066459664417508,
"grad_norm": 5.75,
"learning_rate": 1.371367835550235e-05,
"loss": 1.5176,
"step": 882
},
{
"epoch": 0.21209768913835314,
"grad_norm": 4.65625,
"learning_rate": 1.3616245700820922e-05,
"loss": 1.641,
"step": 888
},
{
"epoch": 0.2135307816325312,
"grad_norm": 4.0625,
"learning_rate": 1.3518416484047018e-05,
"loss": 1.5882,
"step": 894
},
{
"epoch": 0.21496387412670925,
"grad_norm": 5.09375,
"learning_rate": 1.342020143325669e-05,
"loss": 1.6042,
"step": 900
},
{
"epoch": 0.2163969666208873,
"grad_norm": 5.84375,
"learning_rate": 1.3321611318837033e-05,
"loss": 1.5516,
"step": 906
},
{
"epoch": 0.2178300591150654,
"grad_norm": 6.15625,
"learning_rate": 1.3222656952305113e-05,
"loss": 1.5349,
"step": 912
},
{
"epoch": 0.21926315160924345,
"grad_norm": 5.21875,
"learning_rate": 1.3123349185122328e-05,
"loss": 1.6652,
"step": 918
},
{
"epoch": 0.2206962441034215,
"grad_norm": 17.25,
"learning_rate": 1.3023698907504447e-05,
"loss": 1.7149,
"step": 924
},
{
"epoch": 0.22212933659759956,
"grad_norm": 6.8125,
"learning_rate": 1.2923717047227368e-05,
"loss": 1.6285,
"step": 930
},
{
"epoch": 0.22356242909177762,
"grad_norm": 4.1875,
"learning_rate": 1.2823414568428767e-05,
"loss": 1.5982,
"step": 936
},
{
"epoch": 0.2249955215859557,
"grad_norm": 5.8125,
"learning_rate": 1.2722802470405744e-05,
"loss": 1.5901,
"step": 942
},
{
"epoch": 0.22642861408013376,
"grad_norm": 4.75,
"learning_rate": 1.2621891786408648e-05,
"loss": 1.5705,
"step": 948
},
{
"epoch": 0.22786170657431182,
"grad_norm": 10.1875,
"learning_rate": 1.252069358243114e-05,
"loss": 1.5263,
"step": 954
},
{
"epoch": 0.22929479906848987,
"grad_norm": 3.671875,
"learning_rate": 1.2419218955996677e-05,
"loss": 1.5622,
"step": 960
},
{
"epoch": 0.23072789156266793,
"grad_norm": 4.625,
"learning_rate": 1.2317479034941572e-05,
"loss": 1.5984,
"step": 966
},
{
"epoch": 0.23216098405684601,
"grad_norm": 7.21875,
"learning_rate": 1.2215484976194675e-05,
"loss": 1.6465,
"step": 972
},
{
"epoch": 0.23359407655102407,
"grad_norm": 6.59375,
"learning_rate": 1.211324796455389e-05,
"loss": 1.705,
"step": 978
},
{
"epoch": 0.23502716904520213,
"grad_norm": 5.96875,
"learning_rate": 1.2010779211459649e-05,
"loss": 1.5316,
"step": 984
},
{
"epoch": 0.23646026153938018,
"grad_norm": 5.3125,
"learning_rate": 1.190808995376545e-05,
"loss": 1.4676,
"step": 990
},
{
"epoch": 0.23789335403355824,
"grad_norm": 4.9375,
"learning_rate": 1.1805191452505602e-05,
"loss": 1.5319,
"step": 996
},
{
"epoch": 0.2393264465277363,
"grad_norm": 5.625,
"learning_rate": 1.1702094991660326e-05,
"loss": 1.6112,
"step": 1002
},
{
"epoch": 0.24075953902191438,
"grad_norm": 4.71875,
"learning_rate": 1.159881187691835e-05,
"loss": 1.6341,
"step": 1008
},
{
"epoch": 0.24219263151609244,
"grad_norm": 4.3125,
"learning_rate": 1.1495353434437098e-05,
"loss": 1.4623,
"step": 1014
},
{
"epoch": 0.2436257240102705,
"grad_norm": 19.625,
"learning_rate": 1.1391731009600655e-05,
"loss": 1.4166,
"step": 1020
},
{
"epoch": 0.24505881650444855,
"grad_norm": 4.0625,
"learning_rate": 1.128795596577563e-05,
"loss": 1.5813,
"step": 1026
},
{
"epoch": 0.2464919089986266,
"grad_norm": 6.25,
"learning_rate": 1.1184039683065014e-05,
"loss": 1.5772,
"step": 1032
},
{
"epoch": 0.2479250014928047,
"grad_norm": 5.53125,
"learning_rate": 1.1079993557060228e-05,
"loss": 1.401,
"step": 1038
},
{
"epoch": 0.24935809398698275,
"grad_norm": 6.65625,
"learning_rate": 1.0975828997591496e-05,
"loss": 1.6248,
"step": 1044
},
{
"epoch": 0.2507911864811608,
"grad_norm": 856.0,
"learning_rate": 1.0871557427476585e-05,
"loss": 1.775,
"step": 1050
},
{
"epoch": 0.2522242789753389,
"grad_norm": 4.1875,
"learning_rate": 1.0767190281268187e-05,
"loss": 1.586,
"step": 1056
},
{
"epoch": 0.25365737146951695,
"grad_norm": 3.53125,
"learning_rate": 1.0662739004000005e-05,
"loss": 1.5397,
"step": 1062
},
{
"epoch": 0.255090463963695,
"grad_norm": 4.125,
"learning_rate": 1.055821504993164e-05,
"loss": 1.8712,
"step": 1068
},
{
"epoch": 0.25652355645787306,
"grad_norm": 5.1875,
"learning_rate": 1.0453629881292537e-05,
"loss": 1.5357,
"step": 1074
},
{
"epoch": 0.2579566489520511,
"grad_norm": 3.921875,
"learning_rate": 1.0348994967025012e-05,
"loss": 1.4033,
"step": 1080
},
{
"epoch": 0.25938974144622917,
"grad_norm": 5.3125,
"learning_rate": 1.0244321781526533e-05,
"loss": 1.5611,
"step": 1086
},
{
"epoch": 0.26082283394040723,
"grad_norm": 4.8125,
"learning_rate": 1.0139621803391454e-05,
"loss": 1.577,
"step": 1092
},
{
"epoch": 0.2622559264345853,
"grad_norm": 5.46875,
"learning_rate": 1.0034906514152239e-05,
"loss": 1.5149,
"step": 1098
},
{
"epoch": 0.26368901892876334,
"grad_norm": 6.4375,
"learning_rate": 9.930187397020385e-06,
"loss": 1.5796,
"step": 1104
},
{
"epoch": 0.2651221114229414,
"grad_norm": 4.28125,
"learning_rate": 9.825475935627165e-06,
"loss": 1.5702,
"step": 1110
},
{
"epoch": 0.2665552039171195,
"grad_norm": 5.34375,
"learning_rate": 9.720783612764314e-06,
"loss": 1.5354,
"step": 1116
},
{
"epoch": 0.26798829641129757,
"grad_norm": 4.375,
"learning_rate": 9.616121909124801e-06,
"loss": 1.4122,
"step": 1122
},
{
"epoch": 0.2694213889054756,
"grad_norm": 5.46875,
"learning_rate": 9.511502302043867e-06,
"loss": 1.6959,
"step": 1128
},
{
"epoch": 0.2708544813996537,
"grad_norm": 8.4375,
"learning_rate": 9.406936264240386e-06,
"loss": 1.5493,
"step": 1134
},
{
"epoch": 0.27228757389383174,
"grad_norm": 5.46875,
"learning_rate": 9.302435262558748e-06,
"loss": 1.4156,
"step": 1140
},
{
"epoch": 0.2737206663880098,
"grad_norm": 720.0,
"learning_rate": 9.198010756711413e-06,
"loss": 1.567,
"step": 1146
},
{
"epoch": 0.27515375888218785,
"grad_norm": 3.875,
"learning_rate": 9.093674198022201e-06,
"loss": 1.3814,
"step": 1152
},
{
"epoch": 0.2765868513763659,
"grad_norm": 3.671875,
"learning_rate": 8.989437028170537e-06,
"loss": 1.4261,
"step": 1158
},
{
"epoch": 0.27801994387054396,
"grad_norm": 10.375,
"learning_rate": 8.885310677936746e-06,
"loss": 1.506,
"step": 1164
},
{
"epoch": 0.279453036364722,
"grad_norm": 3.46875,
"learning_rate": 8.781306565948528e-06,
"loss": 1.3967,
"step": 1170
},
{
"epoch": 0.2808861288589001,
"grad_norm": 3.984375,
"learning_rate": 8.677436097428775e-06,
"loss": 1.5761,
"step": 1176
},
{
"epoch": 0.2823192213530782,
"grad_norm": 3.484375,
"learning_rate": 8.573710662944884e-06,
"loss": 1.5428,
"step": 1182
},
{
"epoch": 0.28375231384725624,
"grad_norm": 6.25,
"learning_rate": 8.47014163715962e-06,
"loss": 1.5426,
"step": 1188
},
{
"epoch": 0.2851854063414343,
"grad_norm": 6.25,
"learning_rate": 8.366740377583781e-06,
"loss": 1.503,
"step": 1194
},
{
"epoch": 0.28661849883561236,
"grad_norm": 3.828125,
"learning_rate": 8.263518223330698e-06,
"loss": 1.4355,
"step": 1200
},
{
"epoch": 0.28661849883561236,
"eval_loss": 1.315157413482666,
"eval_runtime": 223.8181,
"eval_samples_per_second": 3.798,
"eval_steps_per_second": 3.798,
"step": 1200
},
{
"epoch": 0.2880515913297904,
"grad_norm": 5.625,
"learning_rate": 8.1604864938728e-06,
"loss": 1.4389,
"step": 1206
},
{
"epoch": 0.28948468382396847,
"grad_norm": 5.0625,
"learning_rate": 8.057656487800283e-06,
"loss": 1.5346,
"step": 1212
},
{
"epoch": 0.2909177763181465,
"grad_norm": 4.21875,
"learning_rate": 7.955039481582098e-06,
"loss": 1.4492,
"step": 1218
},
{
"epoch": 0.2923508688123246,
"grad_norm": 4.9375,
"learning_rate": 7.852646728329368e-06,
"loss": 1.4305,
"step": 1224
},
{
"epoch": 0.29378396130650264,
"grad_norm": 4.9375,
"learning_rate": 7.750489456561351e-06,
"loss": 1.607,
"step": 1230
},
{
"epoch": 0.2952170538006807,
"grad_norm": 4.90625,
"learning_rate": 7.6485788689741e-06,
"loss": 1.3777,
"step": 1236
},
{
"epoch": 0.2966501462948588,
"grad_norm": 5.875,
"learning_rate": 7.546926141211975e-06,
"loss": 1.5751,
"step": 1242
},
{
"epoch": 0.29808323878903686,
"grad_norm": 4.8125,
"learning_rate": 7.445542420642097e-06,
"loss": 1.5106,
"step": 1248
},
{
"epoch": 0.2995163312832149,
"grad_norm": 4.875,
"learning_rate": 7.344438825131912e-06,
"loss": 1.5982,
"step": 1254
},
{
"epoch": 0.300949423777393,
"grad_norm": 5.09375,
"learning_rate": 7.243626441830009e-06,
"loss": 1.5328,
"step": 1260
},
{
"epoch": 0.30238251627157103,
"grad_norm": 4.09375,
"learning_rate": 7.143116325950266e-06,
"loss": 1.6138,
"step": 1266
},
{
"epoch": 0.3038156087657491,
"grad_norm": 3.8125,
"learning_rate": 7.042919499559538e-06,
"loss": 1.4547,
"step": 1272
},
{
"epoch": 0.30524870125992715,
"grad_norm": 4.1875,
"learning_rate": 6.943046950368944e-06,
"loss": 1.4393,
"step": 1278
},
{
"epoch": 0.3066817937541052,
"grad_norm": 5.34375,
"learning_rate": 6.843509630528977e-06,
"loss": 1.4009,
"step": 1284
},
{
"epoch": 0.30811488624828326,
"grad_norm": 5.125,
"learning_rate": 6.744318455428436e-06,
"loss": 1.5134,
"step": 1290
},
{
"epoch": 0.3095479787424613,
"grad_norm": 4.96875,
"learning_rate": 6.645484302497452e-06,
"loss": 1.5411,
"step": 1296
},
{
"epoch": 0.3109810712366394,
"grad_norm": 4.9375,
"learning_rate": 6.547018010014654e-06,
"loss": 1.5058,
"step": 1302
},
{
"epoch": 0.3124141637308175,
"grad_norm": 3.59375,
"learning_rate": 6.448930375918632e-06,
"loss": 1.4026,
"step": 1308
},
{
"epoch": 0.31384725622499554,
"grad_norm": 4.78125,
"learning_rate": 6.351232156623803e-06,
"loss": 1.3993,
"step": 1314
},
{
"epoch": 0.3152803487191736,
"grad_norm": 4.21875,
"learning_rate": 6.25393406584088e-06,
"loss": 1.6574,
"step": 1320
},
{
"epoch": 0.31671344121335165,
"grad_norm": 4.40625,
"learning_rate": 6.157046773401964e-06,
"loss": 1.5233,
"step": 1326
},
{
"epoch": 0.3181465337075297,
"grad_norm": 5.25,
"learning_rate": 6.06058090409049e-06,
"loss": 1.5095,
"step": 1332
},
{
"epoch": 0.31957962620170777,
"grad_norm": 4.625,
"learning_rate": 5.9645470364761e-06,
"loss": 1.3797,
"step": 1338
},
{
"epoch": 0.3210127186958858,
"grad_norm": 5.84375,
"learning_rate": 5.868955701754584e-06,
"loss": 1.6089,
"step": 1344
},
{
"epoch": 0.3224458111900639,
"grad_norm": 3.71875,
"learning_rate": 5.773817382593008e-06,
"loss": 1.4297,
"step": 1350
},
{
"epoch": 0.32387890368424194,
"grad_norm": 3.578125,
"learning_rate": 5.679142511980176e-06,
"loss": 1.327,
"step": 1356
},
{
"epoch": 0.32531199617842,
"grad_norm": 4.6875,
"learning_rate": 5.584941472082549e-06,
"loss": 1.4878,
"step": 1362
},
{
"epoch": 0.32674508867259805,
"grad_norm": 5.125,
"learning_rate": 5.491224593105695e-06,
"loss": 1.4593,
"step": 1368
},
{
"epoch": 0.32817818116677616,
"grad_norm": 7.1875,
"learning_rate": 5.398002152161484e-06,
"loss": 1.5287,
"step": 1374
},
{
"epoch": 0.3296112736609542,
"grad_norm": 5.71875,
"learning_rate": 5.305284372141095e-06,
"loss": 1.4808,
"step": 1380
},
{
"epoch": 0.3310443661551323,
"grad_norm": 4.09375,
"learning_rate": 5.213081420593933e-06,
"loss": 1.4244,
"step": 1386
},
{
"epoch": 0.33247745864931033,
"grad_norm": 9.5,
"learning_rate": 5.121403408612672e-06,
"loss": 1.5213,
"step": 1392
},
{
"epoch": 0.3339105511434884,
"grad_norm": 5.09375,
"learning_rate": 5.030260389724447e-06,
"loss": 1.4455,
"step": 1398
},
{
"epoch": 0.33534364363766644,
"grad_norm": 6.6875,
"learning_rate": 4.939662358788364e-06,
"loss": 1.5983,
"step": 1404
},
{
"epoch": 0.3367767361318445,
"grad_norm": 4.96875,
"learning_rate": 4.849619250899458e-06,
"loss": 1.3544,
"step": 1410
},
{
"epoch": 0.33820982862602256,
"grad_norm": 4.65625,
"learning_rate": 4.76014094029921e-06,
"loss": 1.4412,
"step": 1416
},
{
"epoch": 0.3396429211202006,
"grad_norm": 6.40625,
"learning_rate": 4.671237239292699e-06,
"loss": 1.4463,
"step": 1422
},
{
"epoch": 0.34107601361437867,
"grad_norm": 5.25,
"learning_rate": 4.582917897172603e-06,
"loss": 1.5306,
"step": 1428
},
{
"epoch": 0.3425091061085568,
"grad_norm": 4.40625,
"learning_rate": 4.495192599150045e-06,
"loss": 1.5532,
"step": 1434
},
{
"epoch": 0.34394219860273484,
"grad_norm": 5.15625,
"learning_rate": 4.408070965292534e-06,
"loss": 1.4818,
"step": 1440
},
{
"epoch": 0.3453752910969129,
"grad_norm": 4.125,
"learning_rate": 4.321562549468991e-06,
"loss": 1.4144,
"step": 1446
},
{
"epoch": 0.34680838359109095,
"grad_norm": 4.28125,
"learning_rate": 4.235676838302069e-06,
"loss": 1.4173,
"step": 1452
},
{
"epoch": 0.348241476085269,
"grad_norm": 8.5,
"learning_rate": 4.150423250127846e-06,
"loss": 1.4121,
"step": 1458
},
{
"epoch": 0.34967456857944706,
"grad_norm": 5.90625,
"learning_rate": 4.065811133962987e-06,
"loss": 1.4121,
"step": 1464
},
{
"epoch": 0.3511076610736251,
"grad_norm": 4.625,
"learning_rate": 3.981849768479516e-06,
"loss": 1.3973,
"step": 1470
},
{
"epoch": 0.3525407535678032,
"grad_norm": 5.1875,
"learning_rate": 3.898548360987325e-06,
"loss": 1.4554,
"step": 1476
},
{
"epoch": 0.35397384606198123,
"grad_norm": 5.40625,
"learning_rate": 3.81591604642446e-06,
"loss": 1.4958,
"step": 1482
},
{
"epoch": 0.3554069385561593,
"grad_norm": 5.28125,
"learning_rate": 3.7339618863553983e-06,
"loss": 1.4843,
"step": 1488
},
{
"epoch": 0.35684003105033735,
"grad_norm": 5.96875,
"learning_rate": 3.6526948679773256e-06,
"loss": 1.6051,
"step": 1494
},
{
"epoch": 0.35827312354451546,
"grad_norm": 3.6875,
"learning_rate": 3.5721239031346067e-06,
"loss": 1.4176,
"step": 1500
},
{
"epoch": 0.3597062160386935,
"grad_norm": 4.375,
"learning_rate": 3.492257827341492e-06,
"loss": 1.4049,
"step": 1506
},
{
"epoch": 0.3611393085328716,
"grad_norm": 3.71875,
"learning_rate": 3.4131053988131947e-06,
"loss": 1.5823,
"step": 1512
},
{
"epoch": 0.36257240102704963,
"grad_norm": 6.0,
"learning_rate": 3.3346752975054763e-06,
"loss": 1.4469,
"step": 1518
},
{
"epoch": 0.3640054935212277,
"grad_norm": 4.21875,
"learning_rate": 3.2569761241627694e-06,
"loss": 1.4373,
"step": 1524
},
{
"epoch": 0.36543858601540574,
"grad_norm": 6.03125,
"learning_rate": 3.1800163993750166e-06,
"loss": 1.4823,
"step": 1530
},
{
"epoch": 0.3668716785095838,
"grad_norm": 4.625,
"learning_rate": 3.103804562643302e-06,
"loss": 1.4585,
"step": 1536
},
{
"epoch": 0.36830477100376185,
"grad_norm": 4.28125,
"learning_rate": 3.028348971454356e-06,
"loss": 1.4233,
"step": 1542
},
{
"epoch": 0.3697378634979399,
"grad_norm": 14.625,
"learning_rate": 2.953657900364053e-06,
"loss": 1.4869,
"step": 1548
},
{
"epoch": 0.37117095599211797,
"grad_norm": 4.1875,
"learning_rate": 2.8797395400900362e-06,
"loss": 1.5315,
"step": 1554
},
{
"epoch": 0.3726040484862961,
"grad_norm": 4.125,
"learning_rate": 2.8066019966134907e-06,
"loss": 1.4887,
"step": 1560
},
{
"epoch": 0.37403714098047414,
"grad_norm": 3.796875,
"learning_rate": 2.7342532902902418e-06,
"loss": 1.4533,
"step": 1566
},
{
"epoch": 0.3754702334746522,
"grad_norm": 4.03125,
"learning_rate": 2.6627013549712355e-06,
"loss": 1.4017,
"step": 1572
},
{
"epoch": 0.37690332596883025,
"grad_norm": 6.84375,
"learning_rate": 2.5919540371325005e-06,
"loss": 1.3971,
"step": 1578
},
{
"epoch": 0.3783364184630083,
"grad_norm": 5.5625,
"learning_rate": 2.522019095014683e-06,
"loss": 1.5576,
"step": 1584
},
{
"epoch": 0.37976951095718636,
"grad_norm": 10.875,
"learning_rate": 2.45290419777228e-06,
"loss": 1.4719,
"step": 1590
},
{
"epoch": 0.3812026034513644,
"grad_norm": 5.15625,
"learning_rate": 2.3846169246326345e-06,
"loss": 1.4618,
"step": 1596
},
{
"epoch": 0.3821579984474831,
"eval_loss": 1.2876688241958618,
"eval_runtime": 226.2654,
"eval_samples_per_second": 3.757,
"eval_steps_per_second": 3.757,
"step": 1600
}
],
"logging_steps": 6,
"max_steps": 2000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 400,
"total_flos": 2.9553261973639004e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}