{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997505612372163,
"eval_steps": 500,
"global_step": 1002,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009977550511349464,
"grad_norm": 3.95422100343339e+17,
"learning_rate": 1.9801980198019803e-07,
"loss": 1.1312,
"step": 1
},
{
"epoch": 0.004988775255674732,
"grad_norm": 2182.5525466022195,
"learning_rate": 9.900990099009902e-07,
"loss": 1.1202,
"step": 5
},
{
"epoch": 0.009977550511349464,
"grad_norm": 10.696306402986602,
"learning_rate": 1.9801980198019803e-06,
"loss": 1.0843,
"step": 10
},
{
"epoch": 0.014966325767024195,
"grad_norm": 4.8421359026925295,
"learning_rate": 2.9702970297029703e-06,
"loss": 1.024,
"step": 15
},
{
"epoch": 0.01995510102269893,
"grad_norm": 1.4664994361442427,
"learning_rate": 3.960396039603961e-06,
"loss": 0.9479,
"step": 20
},
{
"epoch": 0.024943876278373658,
"grad_norm": 1.1625196109582225,
"learning_rate": 4.950495049504951e-06,
"loss": 0.8935,
"step": 25
},
{
"epoch": 0.02993265153404839,
"grad_norm": 1.2801604519342376,
"learning_rate": 5.940594059405941e-06,
"loss": 0.871,
"step": 30
},
{
"epoch": 0.034921426789723126,
"grad_norm": 2.033559906503939,
"learning_rate": 6.930693069306931e-06,
"loss": 0.8425,
"step": 35
},
{
"epoch": 0.03991020204539786,
"grad_norm": 1.2072354289878107,
"learning_rate": 7.920792079207921e-06,
"loss": 0.8193,
"step": 40
},
{
"epoch": 0.04489897730107259,
"grad_norm": 1.3137258088382902,
"learning_rate": 8.910891089108911e-06,
"loss": 0.8013,
"step": 45
},
{
"epoch": 0.049887752556747315,
"grad_norm": 1.1213863664000594,
"learning_rate": 9.900990099009901e-06,
"loss": 0.7906,
"step": 50
},
{
"epoch": 0.05487652781242205,
"grad_norm": 0.9999895583902438,
"learning_rate": 1.0891089108910893e-05,
"loss": 0.7642,
"step": 55
},
{
"epoch": 0.05986530306809678,
"grad_norm": 1.3377086334649673,
"learning_rate": 1.1881188118811881e-05,
"loss": 0.7495,
"step": 60
},
{
"epoch": 0.06485407832377152,
"grad_norm": 1.0583607289478394,
"learning_rate": 1.2871287128712873e-05,
"loss": 0.7328,
"step": 65
},
{
"epoch": 0.06984285357944625,
"grad_norm": 1.3493827534349543,
"learning_rate": 1.3861386138613861e-05,
"loss": 0.7383,
"step": 70
},
{
"epoch": 0.07483162883512098,
"grad_norm": 1.2261995345556986,
"learning_rate": 1.4851485148514853e-05,
"loss": 0.7281,
"step": 75
},
{
"epoch": 0.07982040409079572,
"grad_norm": 1.3328058553211537,
"learning_rate": 1.5841584158415843e-05,
"loss": 0.7236,
"step": 80
},
{
"epoch": 0.08480917934647045,
"grad_norm": 1.1145579124084846,
"learning_rate": 1.683168316831683e-05,
"loss": 0.7255,
"step": 85
},
{
"epoch": 0.08979795460214518,
"grad_norm": 0.9193198348331784,
"learning_rate": 1.7821782178217823e-05,
"loss": 0.7181,
"step": 90
},
{
"epoch": 0.0947867298578199,
"grad_norm": 0.9946484577080871,
"learning_rate": 1.881188118811881e-05,
"loss": 0.7103,
"step": 95
},
{
"epoch": 0.09977550511349463,
"grad_norm": 1.041681712316516,
"learning_rate": 1.9801980198019803e-05,
"loss": 0.7007,
"step": 100
},
{
"epoch": 0.10476428036916936,
"grad_norm": 2.164939078852322,
"learning_rate": 1.9999027402586235e-05,
"loss": 0.7064,
"step": 105
},
{
"epoch": 0.1097530556248441,
"grad_norm": 2.7326170629817335,
"learning_rate": 1.9995076549835638e-05,
"loss": 0.7129,
"step": 110
},
{
"epoch": 0.11474183088051883,
"grad_norm": 0.958937879949106,
"learning_rate": 1.9988087854284224e-05,
"loss": 0.6984,
"step": 115
},
{
"epoch": 0.11973060613619356,
"grad_norm": 1.1161507946755316,
"learning_rate": 1.997806344003363e-05,
"loss": 0.7025,
"step": 120
},
{
"epoch": 0.12471938139186829,
"grad_norm": 0.9783646600814512,
"learning_rate": 1.996500635384337e-05,
"loss": 0.6918,
"step": 125
},
{
"epoch": 0.12970815664754304,
"grad_norm": 0.8060832238745128,
"learning_rate": 1.994892056420485e-05,
"loss": 0.6842,
"step": 130
},
{
"epoch": 0.13469693190321777,
"grad_norm": 0.8669120420326126,
"learning_rate": 1.992981096013517e-05,
"loss": 0.6806,
"step": 135
},
{
"epoch": 0.1396857071588925,
"grad_norm": 0.8551160872182698,
"learning_rate": 1.990768334969122e-05,
"loss": 0.6908,
"step": 140
},
{
"epoch": 0.14467448241456723,
"grad_norm": 0.6818189908779505,
"learning_rate": 1.9882544458204386e-05,
"loss": 0.6888,
"step": 145
},
{
"epoch": 0.14966325767024197,
"grad_norm": 0.7649549705168317,
"learning_rate": 1.9854401926236518e-05,
"loss": 0.6867,
"step": 150
},
{
"epoch": 0.1546520329259167,
"grad_norm": 0.8499401555713652,
"learning_rate": 1.9823264307257683e-05,
"loss": 0.6707,
"step": 155
},
{
"epoch": 0.15964080818159143,
"grad_norm": 0.7166408203324516,
"learning_rate": 1.9789141065046495e-05,
"loss": 0.676,
"step": 160
},
{
"epoch": 0.16462958343726616,
"grad_norm": 0.6182479498389858,
"learning_rate": 1.9752042570813733e-05,
"loss": 0.6738,
"step": 165
},
{
"epoch": 0.1696183586929409,
"grad_norm": 0.6690357471493384,
"learning_rate": 1.9711980100050196e-05,
"loss": 0.6672,
"step": 170
},
{
"epoch": 0.17460713394861563,
"grad_norm": 0.5822226665191199,
"learning_rate": 1.966896582909968e-05,
"loss": 0.6736,
"step": 175
},
{
"epoch": 0.17959590920429036,
"grad_norm": 0.7406095161368872,
"learning_rate": 1.962301283145819e-05,
"loss": 0.6761,
"step": 180
},
{
"epoch": 0.18458468445996506,
"grad_norm": 0.7296907738291923,
"learning_rate": 1.957413507380046e-05,
"loss": 0.6678,
"step": 185
},
{
"epoch": 0.1895734597156398,
"grad_norm": 0.8518753276369776,
"learning_rate": 1.952234741173499e-05,
"loss": 0.6733,
"step": 190
},
{
"epoch": 0.19456223497131453,
"grad_norm": 0.5554147873181055,
"learning_rate": 1.946766558528895e-05,
"loss": 0.6621,
"step": 195
},
{
"epoch": 0.19955101022698926,
"grad_norm": 0.6525658967181038,
"learning_rate": 1.941010621412422e-05,
"loss": 0.6649,
"step": 200
},
{
"epoch": 0.204539785482664,
"grad_norm": 0.5780920597815026,
"learning_rate": 1.9349686792486143e-05,
"loss": 0.657,
"step": 205
},
{
"epoch": 0.20952856073833873,
"grad_norm": 0.5578703671497164,
"learning_rate": 1.9286425683886403e-05,
"loss": 0.6687,
"step": 210
},
{
"epoch": 0.21451733599401346,
"grad_norm": 0.72305301104437,
"learning_rate": 1.9220342115521746e-05,
"loss": 0.6624,
"step": 215
},
{
"epoch": 0.2195061112496882,
"grad_norm": 0.6165020865206677,
"learning_rate": 1.9151456172430186e-05,
"loss": 0.6532,
"step": 220
},
{
"epoch": 0.22449488650536292,
"grad_norm": 0.5606109731701377,
"learning_rate": 1.9079788791386468e-05,
"loss": 0.6562,
"step": 225
},
{
"epoch": 0.22948366176103766,
"grad_norm": 0.6353820290325317,
"learning_rate": 1.9005361754538677e-05,
"loss": 0.6671,
"step": 230
},
{
"epoch": 0.2344724370167124,
"grad_norm": 0.5815877946035953,
"learning_rate": 1.8928197682787914e-05,
"loss": 0.6593,
"step": 235
},
{
"epoch": 0.23946121227238712,
"grad_norm": 0.6348680697801418,
"learning_rate": 1.8848320028913017e-05,
"loss": 0.6507,
"step": 240
},
{
"epoch": 0.24444998752806185,
"grad_norm": 0.5695451117286802,
"learning_rate": 1.8765753070442486e-05,
"loss": 0.6523,
"step": 245
},
{
"epoch": 0.24943876278373658,
"grad_norm": 0.5168318177147343,
"learning_rate": 1.868052190227571e-05,
"loss": 0.6591,
"step": 250
},
{
"epoch": 0.2544275380394113,
"grad_norm": 0.7348069026040828,
"learning_rate": 1.859265242905577e-05,
"loss": 0.6541,
"step": 255
},
{
"epoch": 0.2594163132950861,
"grad_norm": 0.5329392930102724,
"learning_rate": 1.8502171357296144e-05,
"loss": 0.6589,
"step": 260
},
{
"epoch": 0.2644050885507608,
"grad_norm": 0.6628783134030882,
"learning_rate": 1.84091061872637e-05,
"loss": 0.6443,
"step": 265
},
{
"epoch": 0.26939386380643554,
"grad_norm": 0.6959729166297904,
"learning_rate": 1.8313485204620428e-05,
"loss": 0.6459,
"step": 270
},
{
"epoch": 0.27438263906211025,
"grad_norm": 0.5848246972834032,
"learning_rate": 1.821533747182645e-05,
"loss": 0.6606,
"step": 275
},
{
"epoch": 0.279371414317785,
"grad_norm": 0.6475331258568309,
"learning_rate": 1.811469281930698e-05,
"loss": 0.656,
"step": 280
},
{
"epoch": 0.2843601895734597,
"grad_norm": 0.5357406950163816,
"learning_rate": 1.8011581836385828e-05,
"loss": 0.6474,
"step": 285
},
{
"epoch": 0.28934896482913447,
"grad_norm": 0.5973939212545811,
"learning_rate": 1.790603586198827e-05,
"loss": 0.6376,
"step": 290
},
{
"epoch": 0.2943377400848092,
"grad_norm": 0.5096252662581786,
"learning_rate": 1.7798086975116096e-05,
"loss": 0.6487,
"step": 295
},
{
"epoch": 0.29932651534048393,
"grad_norm": 0.545088543038122,
"learning_rate": 1.7687767985097695e-05,
"loss": 0.6526,
"step": 300
},
{
"epoch": 0.30431529059615864,
"grad_norm": 0.707465419807657,
"learning_rate": 1.7575112421616203e-05,
"loss": 0.6465,
"step": 305
},
{
"epoch": 0.3093040658518334,
"grad_norm": 0.520929461518716,
"learning_rate": 1.7460154524518688e-05,
"loss": 0.6346,
"step": 310
},
{
"epoch": 0.3142928411075081,
"grad_norm": 0.5724374980096262,
"learning_rate": 1.73429292334095e-05,
"loss": 0.6533,
"step": 315
},
{
"epoch": 0.31928161636318286,
"grad_norm": 0.545092535159253,
"learning_rate": 1.722347217703094e-05,
"loss": 0.6437,
"step": 320
},
{
"epoch": 0.32427039161885757,
"grad_norm": 0.49712776699676936,
"learning_rate": 1.710181966243447e-05,
"loss": 0.6373,
"step": 325
},
{
"epoch": 0.3292591668745323,
"grad_norm": 0.5463641929477563,
"learning_rate": 1.6978008663945794e-05,
"loss": 0.6496,
"step": 330
},
{
"epoch": 0.33424794213020703,
"grad_norm": 0.5263242364290428,
"learning_rate": 1.6852076811927066e-05,
"loss": 0.6369,
"step": 335
},
{
"epoch": 0.3392367173858818,
"grad_norm": 0.5461638677630304,
"learning_rate": 1.672406238133978e-05,
"loss": 0.639,
"step": 340
},
{
"epoch": 0.3442254926415565,
"grad_norm": 0.5328798983853884,
"learning_rate": 1.6594004280111697e-05,
"loss": 0.6497,
"step": 345
},
{
"epoch": 0.34921426789723126,
"grad_norm": 0.5449564175014876,
"learning_rate": 1.6461942037311406e-05,
"loss": 0.64,
"step": 350
},
{
"epoch": 0.35420304315290596,
"grad_norm": 0.4924182140259062,
"learning_rate": 1.6327915791134107e-05,
"loss": 0.6396,
"step": 355
},
{
"epoch": 0.3591918184085807,
"grad_norm": 0.6190239635709287,
"learning_rate": 1.6191966276702235e-05,
"loss": 0.6377,
"step": 360
},
{
"epoch": 0.3641805936642554,
"grad_norm": 0.6147520812137072,
"learning_rate": 1.6054134813684697e-05,
"loss": 0.6375,
"step": 365
},
{
"epoch": 0.36916936891993013,
"grad_norm": 0.5215763519986214,
"learning_rate": 1.5914463293738402e-05,
"loss": 0.6368,
"step": 370
},
{
"epoch": 0.3741581441756049,
"grad_norm": 0.6214647949635035,
"learning_rate": 1.5772994167775986e-05,
"loss": 0.6303,
"step": 375
},
{
"epoch": 0.3791469194312796,
"grad_norm": 0.6365612833821749,
"learning_rate": 1.5629770433063523e-05,
"loss": 0.6244,
"step": 380
},
{
"epoch": 0.38413569468695435,
"grad_norm": 0.594264835461608,
"learning_rate": 1.5484835620152198e-05,
"loss": 0.6323,
"step": 385
},
{
"epoch": 0.38912446994262906,
"grad_norm": 0.6107791643380025,
"learning_rate": 1.533823377964791e-05,
"loss": 0.6298,
"step": 390
},
{
"epoch": 0.3941132451983038,
"grad_norm": 0.6001005800935538,
"learning_rate": 1.5190009468822782e-05,
"loss": 0.63,
"step": 395
},
{
"epoch": 0.3991020204539785,
"grad_norm": 0.5176320349977592,
"learning_rate": 1.5040207738072714e-05,
"loss": 0.6296,
"step": 400
},
{
"epoch": 0.4040907957096533,
"grad_norm": 0.5530769815557066,
"learning_rate": 1.4888874117225013e-05,
"loss": 0.6202,
"step": 405
},
{
"epoch": 0.409079570965328,
"grad_norm": 0.4750983813132654,
"learning_rate": 1.4736054601700361e-05,
"loss": 0.6339,
"step": 410
},
{
"epoch": 0.41406834622100275,
"grad_norm": 0.5280260411948464,
"learning_rate": 1.4581795638533227e-05,
"loss": 0.6244,
"step": 415
},
{
"epoch": 0.41905712147667745,
"grad_norm": 0.49444720544930004,
"learning_rate": 1.4426144112255057e-05,
"loss": 0.6226,
"step": 420
},
{
"epoch": 0.4240458967323522,
"grad_norm": 0.5416758041687645,
"learning_rate": 1.426914733064444e-05,
"loss": 0.6281,
"step": 425
},
{
"epoch": 0.4290346719880269,
"grad_norm": 0.4913059879733367,
"learning_rate": 1.4110853010348717e-05,
"loss": 0.6327,
"step": 430
},
{
"epoch": 0.4340234472437017,
"grad_norm": 0.5475336882998988,
"learning_rate": 1.3951309262381231e-05,
"loss": 0.6319,
"step": 435
},
{
"epoch": 0.4390122224993764,
"grad_norm": 0.5887136176843448,
"learning_rate": 1.3790564577498791e-05,
"loss": 0.6323,
"step": 440
},
{
"epoch": 0.44400099775505114,
"grad_norm": 0.53325274747872,
"learning_rate": 1.3628667811463654e-05,
"loss": 0.6165,
"step": 445
},
{
"epoch": 0.44898977301072585,
"grad_norm": 0.5538441977213863,
"learning_rate": 1.3465668170194633e-05,
"loss": 0.6259,
"step": 450
},
{
"epoch": 0.4539785482664006,
"grad_norm": 0.516406002882252,
"learning_rate": 1.330161519481172e-05,
"loss": 0.6251,
"step": 455
},
{
"epoch": 0.4589673235220753,
"grad_norm": 0.5294430423934866,
"learning_rate": 1.3136558746578888e-05,
"loss": 0.6269,
"step": 460
},
{
"epoch": 0.46395609877775007,
"grad_norm": 0.5548195323966518,
"learning_rate": 1.2970548991749538e-05,
"loss": 0.6239,
"step": 465
},
{
"epoch": 0.4689448740334248,
"grad_norm": 0.48735736648704486,
"learning_rate": 1.2803636386319288e-05,
"loss": 0.62,
"step": 470
},
{
"epoch": 0.47393364928909953,
"grad_norm": 0.5093962136183301,
"learning_rate": 1.2635871660690677e-05,
"loss": 0.6259,
"step": 475
},
{
"epoch": 0.47892242454477424,
"grad_norm": 0.5220605413877938,
"learning_rate": 1.2467305804254472e-05,
"loss": 0.6233,
"step": 480
},
{
"epoch": 0.483911199800449,
"grad_norm": 0.46987756163402217,
"learning_rate": 1.2297990049892274e-05,
"loss": 0.6224,
"step": 485
},
{
"epoch": 0.4888999750561237,
"grad_norm": 0.4851271226507692,
"learning_rate": 1.2127975858405096e-05,
"loss": 0.6248,
"step": 490
},
{
"epoch": 0.49388875031179846,
"grad_norm": 0.49812035008570954,
"learning_rate": 1.1957314902872686e-05,
"loss": 0.6162,
"step": 495
},
{
"epoch": 0.49887752556747317,
"grad_norm": 0.4543789239256326,
"learning_rate": 1.178605905294832e-05,
"loss": 0.6191,
"step": 500
},
{
"epoch": 0.5038663008231479,
"grad_norm": 0.49450863626445246,
"learning_rate": 1.1614260359093869e-05,
"loss": 0.6298,
"step": 505
},
{
"epoch": 0.5088550760788226,
"grad_norm": 0.46204592956818197,
"learning_rate": 1.144197103675988e-05,
"loss": 0.6108,
"step": 510
},
{
"epoch": 0.5138438513344974,
"grad_norm": 0.46446807942496315,
"learning_rate": 1.1269243450515537e-05,
"loss": 0.6255,
"step": 515
},
{
"epoch": 0.5188326265901722,
"grad_norm": 0.4523344385336006,
"learning_rate": 1.1096130098133296e-05,
"loss": 0.621,
"step": 520
},
{
"epoch": 0.5238214018458468,
"grad_norm": 0.5281185337565003,
"learning_rate": 1.092268359463302e-05,
"loss": 0.6181,
"step": 525
},
{
"epoch": 0.5288101771015216,
"grad_norm": 0.5592357254092617,
"learning_rate": 1.0748956656290512e-05,
"loss": 0.625,
"step": 530
},
{
"epoch": 0.5337989523571963,
"grad_norm": 0.49985683194766767,
"learning_rate": 1.057500208461522e-05,
"loss": 0.6088,
"step": 535
},
{
"epoch": 0.5387877276128711,
"grad_norm": 0.5348402703139689,
"learning_rate": 1.0400872750302095e-05,
"loss": 0.6215,
"step": 540
},
{
"epoch": 0.5437765028685457,
"grad_norm": 0.5016453093230498,
"learning_rate": 1.0226621577162377e-05,
"loss": 0.6067,
"step": 545
},
{
"epoch": 0.5487652781242205,
"grad_norm": 0.5033968932851582,
"learning_rate": 1.005230152603826e-05,
"loss": 0.6056,
"step": 550
},
{
"epoch": 0.5537540533798953,
"grad_norm": 0.4779633495557308,
"learning_rate": 9.877965578706286e-06,
"loss": 0.6158,
"step": 555
},
{
"epoch": 0.55874282863557,
"grad_norm": 0.507768651916945,
"learning_rate": 9.703666721774403e-06,
"loss": 0.6168,
"step": 560
},
{
"epoch": 0.5637316038912447,
"grad_norm": 0.4975220405187006,
"learning_rate": 9.52945793057753e-06,
"loss": 0.6133,
"step": 565
},
{
"epoch": 0.5687203791469194,
"grad_norm": 0.567581970238524,
"learning_rate": 9.355392153076541e-06,
"loss": 0.6153,
"step": 570
},
{
"epoch": 0.5737091544025942,
"grad_norm": 0.45994820207954157,
"learning_rate": 9.18152229376561e-06,
"loss": 0.6075,
"step": 575
},
{
"epoch": 0.5786979296582689,
"grad_norm": 0.4776147923803636,
"learning_rate": 9.007901197592722e-06,
"loss": 0.6083,
"step": 580
},
{
"epoch": 0.5836867049139436,
"grad_norm": 0.4458298111691603,
"learning_rate": 8.834581633898307e-06,
"loss": 0.6151,
"step": 585
},
{
"epoch": 0.5886754801696183,
"grad_norm": 0.5015975267018272,
"learning_rate": 8.661616280376846e-06,
"loss": 0.6083,
"step": 590
},
{
"epoch": 0.5936642554252931,
"grad_norm": 0.44736317209890847,
"learning_rate": 8.489057707066335e-06,
"loss": 0.6077,
"step": 595
},
{
"epoch": 0.5986530306809679,
"grad_norm": 0.4542312795928054,
"learning_rate": 8.316958360370462e-06,
"loss": 0.6089,
"step": 600
},
{
"epoch": 0.6036418059366425,
"grad_norm": 0.4669024055915673,
"learning_rate": 8.145370547118374e-06,
"loss": 0.614,
"step": 605
},
{
"epoch": 0.6086305811923173,
"grad_norm": 0.460625150490194,
"learning_rate": 7.974346418666854e-06,
"loss": 0.6097,
"step": 610
},
{
"epoch": 0.613619356447992,
"grad_norm": 0.45169077334743357,
"learning_rate": 7.803937955049743e-06,
"loss": 0.6134,
"step": 615
},
{
"epoch": 0.6186081317036668,
"grad_norm": 0.5045442729073779,
"learning_rate": 7.634196949179472e-06,
"loss": 0.6056,
"step": 620
},
{
"epoch": 0.6235969069593414,
"grad_norm": 0.4976507270581154,
"learning_rate": 7.465174991105405e-06,
"loss": 0.6087,
"step": 625
},
{
"epoch": 0.6285856822150162,
"grad_norm": 0.47413589578954074,
"learning_rate": 7.296923452333908e-06,
"loss": 0.6073,
"step": 630
},
{
"epoch": 0.633574457470691,
"grad_norm": 0.4468358705510705,
"learning_rate": 7.129493470214775e-06,
"loss": 0.6065,
"step": 635
},
{
"epoch": 0.6385632327263657,
"grad_norm": 0.44813379558060285,
"learning_rate": 6.962935932398862e-06,
"loss": 0.5989,
"step": 640
},
{
"epoch": 0.6435520079820404,
"grad_norm": 0.4267618712107622,
"learning_rate": 6.797301461371626e-06,
"loss": 0.5981,
"step": 645
},
{
"epoch": 0.6485407832377151,
"grad_norm": 0.44491038507852154,
"learning_rate": 6.632640399067197e-06,
"loss": 0.602,
"step": 650
},
{
"epoch": 0.6535295584933899,
"grad_norm": 0.455548462410925,
"learning_rate": 6.469002791567792e-06,
"loss": 0.6077,
"step": 655
},
{
"epoch": 0.6585183337490647,
"grad_norm": 0.4655649758885719,
"learning_rate": 6.306438373892985e-06,
"loss": 0.6027,
"step": 660
},
{
"epoch": 0.6635071090047393,
"grad_norm": 0.4603705069002276,
"learning_rate": 6.144996554883556e-06,
"loss": 0.6072,
"step": 665
},
{
"epoch": 0.6684958842604141,
"grad_norm": 0.42809636369496656,
"learning_rate": 5.98472640218449e-06,
"loss": 0.5984,
"step": 670
},
{
"epoch": 0.6734846595160888,
"grad_norm": 0.4311787554066561,
"learning_rate": 5.825676627331614e-06,
"loss": 0.5997,
"step": 675
},
{
"epoch": 0.6784734347717636,
"grad_norm": 0.45825795981326356,
"learning_rate": 5.667895570946554e-06,
"loss": 0.6034,
"step": 680
},
{
"epoch": 0.6834622100274382,
"grad_norm": 0.4167893564717864,
"learning_rate": 5.5114311880443374e-06,
"loss": 0.5975,
"step": 685
},
{
"epoch": 0.688450985283113,
"grad_norm": 0.46394354368013807,
"learning_rate": 5.356331033458276e-06,
"loss": 0.6065,
"step": 690
},
{
"epoch": 0.6934397605387878,
"grad_norm": 0.45003494825662715,
"learning_rate": 5.202642247386409e-06,
"loss": 0.6052,
"step": 695
},
{
"epoch": 0.6984285357944625,
"grad_norm": 0.40702863232292363,
"learning_rate": 5.0504115410640105e-06,
"loss": 0.5985,
"step": 700
},
{
"epoch": 0.7034173110501372,
"grad_norm": 0.42165174105900677,
"learning_rate": 4.899685182566472e-06,
"loss": 0.5917,
"step": 705
},
{
"epoch": 0.7084060863058119,
"grad_norm": 0.5050037064515478,
"learning_rate": 4.7505089827468335e-06,
"loss": 0.5959,
"step": 710
},
{
"epoch": 0.7133948615614867,
"grad_norm": 0.4558754163109762,
"learning_rate": 4.602928281312351e-06,
"loss": 0.5933,
"step": 715
},
{
"epoch": 0.7183836368171614,
"grad_norm": 0.4071864016802162,
"learning_rate": 4.456987933044185e-06,
"loss": 0.5992,
"step": 720
},
{
"epoch": 0.7233724120728361,
"grad_norm": 0.4248472014437075,
"learning_rate": 4.3127322941645385e-06,
"loss": 0.5937,
"step": 725
},
{
"epoch": 0.7283611873285109,
"grad_norm": 0.40980252115955373,
"learning_rate": 4.170205208855281e-06,
"loss": 0.5968,
"step": 730
},
{
"epoch": 0.7333499625841856,
"grad_norm": 0.4052569143394039,
"learning_rate": 4.029449995932213e-06,
"loss": 0.5926,
"step": 735
},
{
"epoch": 0.7383387378398603,
"grad_norm": 0.41360853365570666,
"learning_rate": 3.890509435679026e-06,
"loss": 0.6021,
"step": 740
},
{
"epoch": 0.743327513095535,
"grad_norm": 0.41835345554744635,
"learning_rate": 3.7534257568448995e-06,
"loss": 0.5952,
"step": 745
},
{
"epoch": 0.7483162883512098,
"grad_norm": 0.401256370852048,
"learning_rate": 3.6182406238097745e-06,
"loss": 0.5972,
"step": 750
},
{
"epoch": 0.7533050636068845,
"grad_norm": 0.4174244244132666,
"learning_rate": 3.484995123921112e-06,
"loss": 0.5945,
"step": 755
},
{
"epoch": 0.7582938388625592,
"grad_norm": 0.4081805968439954,
"learning_rate": 3.353729755006081e-06,
"loss": 0.5952,
"step": 760
},
{
"epoch": 0.763282614118234,
"grad_norm": 0.40126195224404465,
"learning_rate": 3.2244844130628684e-06,
"loss": 0.5869,
"step": 765
},
{
"epoch": 0.7682713893739087,
"grad_norm": 0.4252743360297237,
"learning_rate": 3.0972983801349464e-06,
"loss": 0.6057,
"step": 770
},
{
"epoch": 0.7732601646295835,
"grad_norm": 0.40849879013758245,
"learning_rate": 2.9722103123719324e-06,
"loss": 0.5987,
"step": 775
},
{
"epoch": 0.7782489398852581,
"grad_norm": 0.4124888296503082,
"learning_rate": 2.849258228280656e-06,
"loss": 0.6048,
"step": 780
},
{
"epoch": 0.7832377151409329,
"grad_norm": 0.408929858016356,
"learning_rate": 2.728479497170066e-06,
"loss": 0.591,
"step": 785
},
{
"epoch": 0.7882264903966076,
"grad_norm": 0.40717905524554926,
"learning_rate": 2.6099108277934105e-06,
"loss": 0.5957,
"step": 790
},
{
"epoch": 0.7932152656522824,
"grad_norm": 0.40906715914325326,
"learning_rate": 2.4935882571912107e-06,
"loss": 0.585,
"step": 795
},
{
"epoch": 0.798204040907957,
"grad_norm": 0.39973439105002123,
"learning_rate": 2.379547139738392e-06,
"loss": 0.5881,
"step": 800
},
{
"epoch": 0.8031928161636318,
"grad_norm": 0.39566194107090014,
"learning_rate": 2.267822136398864e-06,
"loss": 0.5948,
"step": 805
},
{
"epoch": 0.8081815914193066,
"grad_norm": 0.3943529028329431,
"learning_rate": 2.15844720419091e-06,
"loss": 0.5928,
"step": 810
},
{
"epoch": 0.8131703666749813,
"grad_norm": 0.396743086451483,
"learning_rate": 2.0514555858664663e-06,
"loss": 0.5955,
"step": 815
},
{
"epoch": 0.818159141930656,
"grad_norm": 0.4158031464312053,
"learning_rate": 1.9468797998075494e-06,
"loss": 0.5938,
"step": 820
},
{
"epoch": 0.8231479171863307,
"grad_norm": 0.4159354667890639,
"learning_rate": 1.844751630142797e-06,
"loss": 0.5811,
"step": 825
},
{
"epoch": 0.8281366924420055,
"grad_norm": 0.38601272105412726,
"learning_rate": 1.7451021170871974e-06,
"loss": 0.5933,
"step": 830
},
{
"epoch": 0.8331254676976803,
"grad_norm": 0.3986644463931031,
"learning_rate": 1.6479615475079291e-06,
"loss": 0.5892,
"step": 835
},
{
"epoch": 0.8381142429533549,
"grad_norm": 0.40080909281458676,
"learning_rate": 1.5533594457191326e-06,
"loss": 0.5898,
"step": 840
},
{
"epoch": 0.8431030182090297,
"grad_norm": 0.3942697489480343,
"learning_rate": 1.4613245645084894e-06,
"loss": 0.5863,
"step": 845
},
{
"epoch": 0.8480917934647044,
"grad_norm": 0.39162896724772583,
"learning_rate": 1.3718848763982596e-06,
"loss": 0.5963,
"step": 850
},
{
"epoch": 0.8530805687203792,
"grad_norm": 0.38744891578031804,
"learning_rate": 1.2850675651434962e-06,
"loss": 0.5931,
"step": 855
},
{
"epoch": 0.8580693439760538,
"grad_norm": 0.3757895616733075,
"learning_rate": 1.2008990174699685e-06,
"loss": 0.5958,
"step": 860
},
{
"epoch": 0.8630581192317286,
"grad_norm": 0.38105622554372953,
"learning_rate": 1.1194048150543457e-06,
"loss": 0.5928,
"step": 865
},
{
"epoch": 0.8680468944874034,
"grad_norm": 0.38949843229532266,
"learning_rate": 1.0406097267490644e-06,
"loss": 0.5894,
"step": 870
},
{
"epoch": 0.8730356697430781,
"grad_norm": 0.39133896634308507,
"learning_rate": 9.645377010542212e-07,
"loss": 0.5893,
"step": 875
},
{
"epoch": 0.8780244449987528,
"grad_norm": 0.4015625907458179,
"learning_rate": 8.91211858838823e-07,
"loss": 0.5982,
"step": 880
},
{
"epoch": 0.8830132202544275,
"grad_norm": 0.4047912248494648,
"learning_rate": 8.206544863135612e-07,
"loss": 0.5865,
"step": 885
},
{
"epoch": 0.8880019955101023,
"grad_norm": 0.38503511137561774,
"learning_rate": 7.528870282572864e-07,
"loss": 0.5831,
"step": 890
},
{
"epoch": 0.892990770765777,
"grad_norm": 0.387759644652555,
"learning_rate": 6.879300814992007e-07,
"loss": 0.5985,
"step": 895
},
{
"epoch": 0.8979795460214517,
"grad_norm": 0.39050325202063846,
"learning_rate": 6.258033886587911e-07,
"loss": 0.5881,
"step": 900
},
{
"epoch": 0.9029683212771265,
"grad_norm": 0.3870890888051975,
"learning_rate": 5.66525832145377e-07,
"loss": 0.5945,
"step": 905
},
{
"epoch": 0.9079570965328012,
"grad_norm": 0.3856245616892951,
"learning_rate": 5.101154284191035e-07,
"loss": 0.5929,
"step": 910
},
{
"epoch": 0.912945871788476,
"grad_norm": 0.3910109970295147,
"learning_rate": 4.5658932251512856e-07,
"loss": 0.6021,
"step": 915
},
{
"epoch": 0.9179346470441506,
"grad_norm": 0.3676846272782438,
"learning_rate": 4.059637828326657e-07,
"loss": 0.5878,
"step": 920
},
{
"epoch": 0.9229234222998254,
"grad_norm": 0.3819934536854667,
"learning_rate": 3.5825419619046176e-07,
"loss": 0.5936,
"step": 925
},
{
"epoch": 0.9279121975555001,
"grad_norm": 0.3875995828243716,
"learning_rate": 3.1347506315023036e-07,
"loss": 0.6038,
"step": 930
},
{
"epoch": 0.9329009728111749,
"grad_norm": 0.38002441832217876,
"learning_rate": 2.716399936094294e-07,
"loss": 0.5931,
"step": 935
},
{
"epoch": 0.9378897480668495,
"grad_norm": 0.3848197669462124,
"learning_rate": 2.327617026647533e-07,
"loss": 0.595,
"step": 940
},
{
"epoch": 0.9428785233225243,
"grad_norm": 0.38930339170989675,
"learning_rate": 1.968520067475921e-07,
"loss": 0.5938,
"step": 945
},
{
"epoch": 0.9478672985781991,
"grad_norm": 0.3797874082101493,
"learning_rate": 1.6392182003260427e-07,
"loss": 0.5942,
"step": 950
},
{
"epoch": 0.9528560738338738,
"grad_norm": 0.3803276424025686,
"learning_rate": 1.3398115112054243e-07,
"loss": 0.5876,
"step": 955
},
{
"epoch": 0.9578448490895485,
"grad_norm": 0.3765573493263119,
"learning_rate": 1.070390999962867e-07,
"loss": 0.5922,
"step": 960
},
{
"epoch": 0.9628336243452232,
"grad_norm": 0.3849948624629748,
"learning_rate": 8.31038552630603e-08,
"loss": 0.5945,
"step": 965
},
{
"epoch": 0.967822399600898,
"grad_norm": 0.38879243314742995,
"learning_rate": 6.218269165363166e-08,
"loss": 0.5954,
"step": 970
},
{
"epoch": 0.9728111748565728,
"grad_norm": 0.38003968946404176,
"learning_rate": 4.42819678192774e-08,
"loss": 0.5913,
"step": 975
},
{
"epoch": 0.9777999501122474,
"grad_norm": 0.3869545513042188,
"learning_rate": 2.9407124397169418e-08,
"loss": 0.594,
"step": 980
},
{
"epoch": 0.9827887253679222,
"grad_norm": 0.3851047915663657,
"learning_rate": 1.7562682356786488e-08,
"loss": 0.5925,
"step": 985
},
{
"epoch": 0.9877775006235969,
"grad_norm": 0.38621262692876246,
"learning_rate": 8.752241625831215e-09,
"loss": 0.6004,
"step": 990
},
{
"epoch": 0.9927662758792717,
"grad_norm": 0.3798867531020366,
"learning_rate": 2.978479996098571e-09,
"loss": 0.5909,
"step": 995
},
{
"epoch": 0.9977550511349463,
"grad_norm": 0.38274768291794226,
"learning_rate": 2.4315230959359726e-10,
"loss": 0.5939,
"step": 1000
},
{
"epoch": 0.9997505612372163,
"eval_loss": 0.5914663672447205,
"eval_runtime": 139.1149,
"eval_samples_per_second": 48.528,
"eval_steps_per_second": 1.517,
"step": 1002
},
{
"epoch": 0.9997505612372163,
"step": 1002,
"total_flos": 838984280309760.0,
"train_loss": 0.6422406448099666,
"train_runtime": 10627.6191,
"train_samples_per_second": 12.071,
"train_steps_per_second": 0.094
}
],
"logging_steps": 5,
"max_steps": 1002,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 838984280309760.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}