NuminaMath-72B-TIR / trainer_state.json
lewtun (HF staff): Add AI-MO/qwen2-72b-sft-aimo_v03.00 checkpoint (commit 7682df3, verified)
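The body of this file is the raw Hugging Face Trainer state: "log_history" holds one entry per logging step (epoch, grad_norm, learning_rate, loss, step) plus one eval entry per epoch (eval_loss, eval_runtime, eval_samples_per_second, eval_steps_per_second). As a minimal sketch for inspecting it, assuming the JSON has been downloaded locally as trainer_state.json and that matplotlib is available, the loss curves and the learning-rate schedule (the logged values suggest a linear warmup followed by a cosine-style decay) can be plotted like this:

import json
import matplotlib.pyplot as plt

# Load the Trainer state (assumes trainer_state.json sits in the current directory).
with open("trainer_state.json") as f:
    state = json.load(f)

# Split the log history into per-step training entries and per-epoch eval entries.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

# Training loss and end-of-epoch eval loss versus optimizer step.
ax1.plot([e["step"] for e in train_logs], [e["loss"] for e in train_logs], label="train loss")
ax1.plot([e["step"] for e in eval_logs], [e["eval_loss"] for e in eval_logs], "o-", label="eval loss")
ax1.set_xlabel("step")
ax1.set_ylabel("loss")
ax1.legend()

# Learning-rate schedule as logged at each training step.
ax2.plot([e["step"] for e in train_logs], [e["learning_rate"] for e in train_logs])
ax2.set_xlabel("step")
ax2.set_ylabel("learning rate")

fig.tight_layout()
plt.show()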
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 3188,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012547051442910915,
"grad_norm": 1.3249917794841757,
"learning_rate": 6.269592476489028e-08,
"loss": 0.5553,
"step": 1
},
{
"epoch": 0.006273525721455458,
"grad_norm": 1.3024175063683066,
"learning_rate": 3.134796238244514e-07,
"loss": 0.5542,
"step": 5
},
{
"epoch": 0.012547051442910916,
"grad_norm": 1.1325683997600668,
"learning_rate": 6.269592476489028e-07,
"loss": 0.5626,
"step": 10
},
{
"epoch": 0.018820577164366373,
"grad_norm": 1.1174840712777703,
"learning_rate": 9.404388714733543e-07,
"loss": 0.5642,
"step": 15
},
{
"epoch": 0.025094102885821833,
"grad_norm": 0.8002431481734196,
"learning_rate": 1.2539184952978056e-06,
"loss": 0.5294,
"step": 20
},
{
"epoch": 0.03136762860727729,
"grad_norm": 0.6333668642117685,
"learning_rate": 1.5673981191222572e-06,
"loss": 0.5079,
"step": 25
},
{
"epoch": 0.037641154328732745,
"grad_norm": 0.5771411644874956,
"learning_rate": 1.8808777429467086e-06,
"loss": 0.4655,
"step": 30
},
{
"epoch": 0.043914680050188205,
"grad_norm": 0.49532156556877105,
"learning_rate": 2.1943573667711602e-06,
"loss": 0.4592,
"step": 35
},
{
"epoch": 0.050188205771643665,
"grad_norm": 0.4975545815570396,
"learning_rate": 2.507836990595611e-06,
"loss": 0.4329,
"step": 40
},
{
"epoch": 0.056461731493099125,
"grad_norm": 0.4338284541758203,
"learning_rate": 2.8213166144200626e-06,
"loss": 0.4322,
"step": 45
},
{
"epoch": 0.06273525721455459,
"grad_norm": 0.40887016256028313,
"learning_rate": 3.1347962382445144e-06,
"loss": 0.426,
"step": 50
},
{
"epoch": 0.06900878293601004,
"grad_norm": 0.4045559102283436,
"learning_rate": 3.448275862068966e-06,
"loss": 0.4227,
"step": 55
},
{
"epoch": 0.07528230865746549,
"grad_norm": 0.3889912721227527,
"learning_rate": 3.7617554858934172e-06,
"loss": 0.4189,
"step": 60
},
{
"epoch": 0.08155583437892096,
"grad_norm": 0.3618436855347915,
"learning_rate": 4.075235109717869e-06,
"loss": 0.4177,
"step": 65
},
{
"epoch": 0.08782936010037641,
"grad_norm": 0.36459156314439295,
"learning_rate": 4.3887147335423205e-06,
"loss": 0.4049,
"step": 70
},
{
"epoch": 0.09410288582183186,
"grad_norm": 0.3488164206674813,
"learning_rate": 4.7021943573667714e-06,
"loss": 0.3961,
"step": 75
},
{
"epoch": 0.10037641154328733,
"grad_norm": 0.37238296726059605,
"learning_rate": 5.015673981191222e-06,
"loss": 0.3955,
"step": 80
},
{
"epoch": 0.10664993726474278,
"grad_norm": 0.3605666937163523,
"learning_rate": 5.329153605015674e-06,
"loss": 0.377,
"step": 85
},
{
"epoch": 0.11292346298619825,
"grad_norm": 0.35760877488985304,
"learning_rate": 5.642633228840125e-06,
"loss": 0.3915,
"step": 90
},
{
"epoch": 0.1191969887076537,
"grad_norm": 0.356976698911797,
"learning_rate": 5.956112852664577e-06,
"loss": 0.3938,
"step": 95
},
{
"epoch": 0.12547051442910917,
"grad_norm": 0.3620265985263758,
"learning_rate": 6.269592476489029e-06,
"loss": 0.404,
"step": 100
},
{
"epoch": 0.13174404015056462,
"grad_norm": 0.3710088599948379,
"learning_rate": 6.58307210031348e-06,
"loss": 0.3941,
"step": 105
},
{
"epoch": 0.13801756587202008,
"grad_norm": 0.3831687285367315,
"learning_rate": 6.896551724137932e-06,
"loss": 0.3859,
"step": 110
},
{
"epoch": 0.14429109159347553,
"grad_norm": 0.36915661135239697,
"learning_rate": 7.210031347962383e-06,
"loss": 0.3895,
"step": 115
},
{
"epoch": 0.15056461731493098,
"grad_norm": 0.3573608106603279,
"learning_rate": 7.5235109717868345e-06,
"loss": 0.3857,
"step": 120
},
{
"epoch": 0.15683814303638646,
"grad_norm": 0.3508976740749952,
"learning_rate": 7.836990595611285e-06,
"loss": 0.3915,
"step": 125
},
{
"epoch": 0.16311166875784192,
"grad_norm": 0.36876078420022057,
"learning_rate": 8.150470219435737e-06,
"loss": 0.3989,
"step": 130
},
{
"epoch": 0.16938519447929737,
"grad_norm": 0.38691192833572297,
"learning_rate": 8.463949843260189e-06,
"loss": 0.3874,
"step": 135
},
{
"epoch": 0.17565872020075282,
"grad_norm": 0.3735894147297392,
"learning_rate": 8.777429467084641e-06,
"loss": 0.385,
"step": 140
},
{
"epoch": 0.18193224592220827,
"grad_norm": 0.3710457112042887,
"learning_rate": 9.090909090909091e-06,
"loss": 0.3764,
"step": 145
},
{
"epoch": 0.18820577164366373,
"grad_norm": 0.3708916534975576,
"learning_rate": 9.404388714733543e-06,
"loss": 0.3808,
"step": 150
},
{
"epoch": 0.1944792973651192,
"grad_norm": 0.3552539505765215,
"learning_rate": 9.717868338557995e-06,
"loss": 0.4018,
"step": 155
},
{
"epoch": 0.20075282308657466,
"grad_norm": 0.3666385022952003,
"learning_rate": 1.0031347962382445e-05,
"loss": 0.3839,
"step": 160
},
{
"epoch": 0.20702634880803011,
"grad_norm": 0.3631078993279255,
"learning_rate": 1.0344827586206898e-05,
"loss": 0.3858,
"step": 165
},
{
"epoch": 0.21329987452948557,
"grad_norm": 0.39722641298153644,
"learning_rate": 1.0658307210031348e-05,
"loss": 0.3847,
"step": 170
},
{
"epoch": 0.21957340025094102,
"grad_norm": 0.3691443252830364,
"learning_rate": 1.09717868338558e-05,
"loss": 0.3826,
"step": 175
},
{
"epoch": 0.2258469259723965,
"grad_norm": 0.39755512136650856,
"learning_rate": 1.128526645768025e-05,
"loss": 0.3766,
"step": 180
},
{
"epoch": 0.23212045169385195,
"grad_norm": 0.3627344076525724,
"learning_rate": 1.1598746081504704e-05,
"loss": 0.409,
"step": 185
},
{
"epoch": 0.2383939774153074,
"grad_norm": 0.36755006735418844,
"learning_rate": 1.1912225705329154e-05,
"loss": 0.385,
"step": 190
},
{
"epoch": 0.24466750313676286,
"grad_norm": 0.3491529760844153,
"learning_rate": 1.2225705329153606e-05,
"loss": 0.3944,
"step": 195
},
{
"epoch": 0.25094102885821834,
"grad_norm": 0.38344322242421625,
"learning_rate": 1.2539184952978058e-05,
"loss": 0.3814,
"step": 200
},
{
"epoch": 0.2572145545796738,
"grad_norm": 0.3668326981157007,
"learning_rate": 1.285266457680251e-05,
"loss": 0.3769,
"step": 205
},
{
"epoch": 0.26348808030112925,
"grad_norm": 0.3606059535955065,
"learning_rate": 1.316614420062696e-05,
"loss": 0.3882,
"step": 210
},
{
"epoch": 0.2697616060225847,
"grad_norm": 0.3743059326798353,
"learning_rate": 1.3479623824451411e-05,
"loss": 0.3792,
"step": 215
},
{
"epoch": 0.27603513174404015,
"grad_norm": 0.3987264784021991,
"learning_rate": 1.3793103448275863e-05,
"loss": 0.3742,
"step": 220
},
{
"epoch": 0.2823086574654956,
"grad_norm": 0.41749115581528207,
"learning_rate": 1.4106583072100315e-05,
"loss": 0.3879,
"step": 225
},
{
"epoch": 0.28858218318695106,
"grad_norm": 0.36990941061813976,
"learning_rate": 1.4420062695924765e-05,
"loss": 0.3763,
"step": 230
},
{
"epoch": 0.2948557089084065,
"grad_norm": 0.350857226534173,
"learning_rate": 1.4733542319749217e-05,
"loss": 0.3936,
"step": 235
},
{
"epoch": 0.30112923462986196,
"grad_norm": 0.3699010810723354,
"learning_rate": 1.5047021943573669e-05,
"loss": 0.3827,
"step": 240
},
{
"epoch": 0.3074027603513174,
"grad_norm": 0.383417773018869,
"learning_rate": 1.536050156739812e-05,
"loss": 0.3753,
"step": 245
},
{
"epoch": 0.3136762860727729,
"grad_norm": 0.37418028131825143,
"learning_rate": 1.567398119122257e-05,
"loss": 0.393,
"step": 250
},
{
"epoch": 0.3199498117942284,
"grad_norm": 0.375158938790808,
"learning_rate": 1.598746081504702e-05,
"loss": 0.3803,
"step": 255
},
{
"epoch": 0.32622333751568383,
"grad_norm": 0.35545430825067814,
"learning_rate": 1.6300940438871475e-05,
"loss": 0.3858,
"step": 260
},
{
"epoch": 0.3324968632371393,
"grad_norm": 0.39047195961342007,
"learning_rate": 1.6614420062695925e-05,
"loss": 0.3956,
"step": 265
},
{
"epoch": 0.33877038895859474,
"grad_norm": 0.3763861614570858,
"learning_rate": 1.6927899686520378e-05,
"loss": 0.3863,
"step": 270
},
{
"epoch": 0.3450439146800502,
"grad_norm": 0.33810866516589266,
"learning_rate": 1.7241379310344828e-05,
"loss": 0.3861,
"step": 275
},
{
"epoch": 0.35131744040150564,
"grad_norm": 0.3577598772376036,
"learning_rate": 1.7554858934169282e-05,
"loss": 0.3847,
"step": 280
},
{
"epoch": 0.3575909661229611,
"grad_norm": 0.39952196485063435,
"learning_rate": 1.7868338557993732e-05,
"loss": 0.3803,
"step": 285
},
{
"epoch": 0.36386449184441655,
"grad_norm": 0.3560924326294842,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.3823,
"step": 290
},
{
"epoch": 0.370138017565872,
"grad_norm": 0.36592070219456535,
"learning_rate": 1.8495297805642636e-05,
"loss": 0.3877,
"step": 295
},
{
"epoch": 0.37641154328732745,
"grad_norm": 0.36996538529023604,
"learning_rate": 1.8808777429467086e-05,
"loss": 0.3816,
"step": 300
},
{
"epoch": 0.38268506900878296,
"grad_norm": 0.3944353625018826,
"learning_rate": 1.9122257053291536e-05,
"loss": 0.3885,
"step": 305
},
{
"epoch": 0.3889585947302384,
"grad_norm": 0.3605090389245764,
"learning_rate": 1.943573667711599e-05,
"loss": 0.3746,
"step": 310
},
{
"epoch": 0.39523212045169387,
"grad_norm": 0.3489216159243111,
"learning_rate": 1.9749216300940443e-05,
"loss": 0.3807,
"step": 315
},
{
"epoch": 0.4015056461731493,
"grad_norm": 0.3902867659960602,
"learning_rate": 1.9999994004731887e-05,
"loss": 0.394,
"step": 320
},
{
"epoch": 0.4077791718946048,
"grad_norm": 0.3833754079678544,
"learning_rate": 1.999978417110275e-05,
"loss": 0.3657,
"step": 325
},
{
"epoch": 0.41405269761606023,
"grad_norm": 0.37800625764548235,
"learning_rate": 1.9999274581256576e-05,
"loss": 0.3822,
"step": 330
},
{
"epoch": 0.4203262233375157,
"grad_norm": 0.3548880184290815,
"learning_rate": 1.999846525046898e-05,
"loss": 0.3816,
"step": 335
},
{
"epoch": 0.42659974905897113,
"grad_norm": 0.37094441921423654,
"learning_rate": 1.9997356203000667e-05,
"loss": 0.3853,
"step": 340
},
{
"epoch": 0.4328732747804266,
"grad_norm": 0.3594535908155001,
"learning_rate": 1.9995947472096752e-05,
"loss": 0.3703,
"step": 345
},
{
"epoch": 0.43914680050188204,
"grad_norm": 0.3492383976696482,
"learning_rate": 1.9994239099985727e-05,
"loss": 0.3858,
"step": 350
},
{
"epoch": 0.4454203262233375,
"grad_norm": 0.3545945901168298,
"learning_rate": 1.9992231137878213e-05,
"loss": 0.3723,
"step": 355
},
{
"epoch": 0.451693851944793,
"grad_norm": 0.36080708632510694,
"learning_rate": 1.9989923645965418e-05,
"loss": 0.3952,
"step": 360
},
{
"epoch": 0.45796737766624845,
"grad_norm": 0.5493517294545232,
"learning_rate": 1.998731669341735e-05,
"loss": 0.3723,
"step": 365
},
{
"epoch": 0.4642409033877039,
"grad_norm": 0.34501637664109114,
"learning_rate": 1.998441035838071e-05,
"loss": 0.3787,
"step": 370
},
{
"epoch": 0.47051442910915936,
"grad_norm": 0.36834831423716424,
"learning_rate": 1.9981204727976577e-05,
"loss": 0.3871,
"step": 375
},
{
"epoch": 0.4767879548306148,
"grad_norm": 0.359064185739471,
"learning_rate": 1.9977699898297794e-05,
"loss": 0.4078,
"step": 380
},
{
"epoch": 0.48306148055207027,
"grad_norm": 0.37676963093518556,
"learning_rate": 1.997389597440608e-05,
"loss": 0.3997,
"step": 385
},
{
"epoch": 0.4893350062735257,
"grad_norm": 0.35164360932787275,
"learning_rate": 1.9969793070328872e-05,
"loss": 0.3706,
"step": 390
},
{
"epoch": 0.49560853199498117,
"grad_norm": 0.3499090374786232,
"learning_rate": 1.996539130905593e-05,
"loss": 0.3931,
"step": 395
},
{
"epoch": 0.5018820577164367,
"grad_norm": 0.35516374008901963,
"learning_rate": 1.9960690822535632e-05,
"loss": 0.3917,
"step": 400
},
{
"epoch": 0.5081555834378921,
"grad_norm": 0.3859915521260531,
"learning_rate": 1.995569175167102e-05,
"loss": 0.3862,
"step": 405
},
{
"epoch": 0.5144291091593476,
"grad_norm": 0.3700726542619639,
"learning_rate": 1.9950394246315594e-05,
"loss": 0.3977,
"step": 410
},
{
"epoch": 0.520702634880803,
"grad_norm": 0.36807219365873434,
"learning_rate": 1.994479846526879e-05,
"loss": 0.391,
"step": 415
},
{
"epoch": 0.5269761606022585,
"grad_norm": 0.34834015226308745,
"learning_rate": 1.9938904576271247e-05,
"loss": 0.3947,
"step": 420
},
{
"epoch": 0.533249686323714,
"grad_norm": 0.3475144346489988,
"learning_rate": 1.9932712755999768e-05,
"loss": 0.3797,
"step": 425
},
{
"epoch": 0.5395232120451694,
"grad_norm": 0.3471973955561176,
"learning_rate": 1.9926223190062015e-05,
"loss": 0.3777,
"step": 430
},
{
"epoch": 0.5457967377666249,
"grad_norm": 0.3702513132489804,
"learning_rate": 1.9919436072990967e-05,
"loss": 0.4113,
"step": 435
},
{
"epoch": 0.5520702634880803,
"grad_norm": 0.3480545734033953,
"learning_rate": 1.9912351608239064e-05,
"loss": 0.3717,
"step": 440
},
{
"epoch": 0.5583437892095358,
"grad_norm": 0.3705010339965962,
"learning_rate": 1.9904970008172128e-05,
"loss": 0.3903,
"step": 445
},
{
"epoch": 0.5646173149309912,
"grad_norm": 0.3545524157937066,
"learning_rate": 1.989729149406298e-05,
"loss": 0.3971,
"step": 450
},
{
"epoch": 0.5708908406524467,
"grad_norm": 0.34489507859345525,
"learning_rate": 1.988931629608483e-05,
"loss": 0.3715,
"step": 455
},
{
"epoch": 0.5771643663739021,
"grad_norm": 0.3419133499625096,
"learning_rate": 1.9881044653304347e-05,
"loss": 0.3722,
"step": 460
},
{
"epoch": 0.5834378920953576,
"grad_norm": 0.3544487799327636,
"learning_rate": 1.9872476813674527e-05,
"loss": 0.3854,
"step": 465
},
{
"epoch": 0.589711417816813,
"grad_norm": 0.3526030038053533,
"learning_rate": 1.9863613034027224e-05,
"loss": 0.3854,
"step": 470
},
{
"epoch": 0.5959849435382685,
"grad_norm": 0.35801287965649553,
"learning_rate": 1.9854453580065485e-05,
"loss": 0.3773,
"step": 475
},
{
"epoch": 0.6022584692597239,
"grad_norm": 0.33687873507306343,
"learning_rate": 1.984499872635556e-05,
"loss": 0.3839,
"step": 480
},
{
"epoch": 0.6085319949811794,
"grad_norm": 0.34289605388996236,
"learning_rate": 1.983524875631868e-05,
"loss": 0.3899,
"step": 485
},
{
"epoch": 0.6148055207026348,
"grad_norm": 0.35193056323984084,
"learning_rate": 1.9825203962222573e-05,
"loss": 0.3736,
"step": 490
},
{
"epoch": 0.6210790464240903,
"grad_norm": 0.3603036239481184,
"learning_rate": 1.9814864645172684e-05,
"loss": 0.3927,
"step": 495
},
{
"epoch": 0.6273525721455459,
"grad_norm": 0.3610916261894253,
"learning_rate": 1.9804231115103155e-05,
"loss": 0.3729,
"step": 500
},
{
"epoch": 0.6336260978670013,
"grad_norm": 0.33695606360007346,
"learning_rate": 1.9793303690767543e-05,
"loss": 0.3773,
"step": 505
},
{
"epoch": 0.6398996235884568,
"grad_norm": 0.37708359246844253,
"learning_rate": 1.9782082699729255e-05,
"loss": 0.3915,
"step": 510
},
{
"epoch": 0.6461731493099122,
"grad_norm": 0.37063002060189804,
"learning_rate": 1.9770568478351736e-05,
"loss": 0.384,
"step": 515
},
{
"epoch": 0.6524466750313677,
"grad_norm": 0.347781428498976,
"learning_rate": 1.9758761371788376e-05,
"loss": 0.3912,
"step": 520
},
{
"epoch": 0.6587202007528231,
"grad_norm": 0.37037259025304425,
"learning_rate": 1.974666173397218e-05,
"loss": 0.3795,
"step": 525
},
{
"epoch": 0.6649937264742786,
"grad_norm": 0.35088852677024873,
"learning_rate": 1.9734269927605134e-05,
"loss": 0.3803,
"step": 530
},
{
"epoch": 0.671267252195734,
"grad_norm": 0.3569377164202312,
"learning_rate": 1.972158632414736e-05,
"loss": 0.3775,
"step": 535
},
{
"epoch": 0.6775407779171895,
"grad_norm": 0.3394929535259636,
"learning_rate": 1.970861130380596e-05,
"loss": 0.3741,
"step": 540
},
{
"epoch": 0.6838143036386449,
"grad_norm": 0.3420677685134694,
"learning_rate": 1.9695345255523634e-05,
"loss": 0.3878,
"step": 545
},
{
"epoch": 0.6900878293601004,
"grad_norm": 0.3531313507208391,
"learning_rate": 1.9681788576967004e-05,
"loss": 0.3787,
"step": 550
},
{
"epoch": 0.6963613550815558,
"grad_norm": 0.338822289408876,
"learning_rate": 1.9667941674514712e-05,
"loss": 0.3829,
"step": 555
},
{
"epoch": 0.7026348808030113,
"grad_norm": 0.36931664322135854,
"learning_rate": 1.9653804963245226e-05,
"loss": 0.3893,
"step": 560
},
{
"epoch": 0.7089084065244667,
"grad_norm": 0.35793185710348313,
"learning_rate": 1.9639378866924405e-05,
"loss": 0.3816,
"step": 565
},
{
"epoch": 0.7151819322459222,
"grad_norm": 0.36642001426471366,
"learning_rate": 1.9624663817992783e-05,
"loss": 0.3825,
"step": 570
},
{
"epoch": 0.7214554579673776,
"grad_norm": 0.36184277756969885,
"learning_rate": 1.960966025755262e-05,
"loss": 0.3918,
"step": 575
},
{
"epoch": 0.7277289836888331,
"grad_norm": 0.34191036136824954,
"learning_rate": 1.9594368635354676e-05,
"loss": 0.3812,
"step": 580
},
{
"epoch": 0.7340025094102886,
"grad_norm": 0.4053922705018229,
"learning_rate": 1.9578789409784727e-05,
"loss": 0.3796,
"step": 585
},
{
"epoch": 0.740276035131744,
"grad_norm": 0.3254681140813354,
"learning_rate": 1.9562923047849828e-05,
"loss": 0.381,
"step": 590
},
{
"epoch": 0.7465495608531995,
"grad_norm": 0.3655079090417533,
"learning_rate": 1.9546770025164304e-05,
"loss": 0.3808,
"step": 595
},
{
"epoch": 0.7528230865746549,
"grad_norm": 0.3309145310004598,
"learning_rate": 1.95303308259355e-05,
"loss": 0.3924,
"step": 600
},
{
"epoch": 0.7590966122961104,
"grad_norm": 0.34826624551002194,
"learning_rate": 1.9513605942949277e-05,
"loss": 0.3752,
"step": 605
},
{
"epoch": 0.7653701380175659,
"grad_norm": 0.3463792668326179,
"learning_rate": 1.9496595877555212e-05,
"loss": 0.3922,
"step": 610
},
{
"epoch": 0.7716436637390214,
"grad_norm": 0.3409526655488519,
"learning_rate": 1.94793011396516e-05,
"loss": 0.3868,
"step": 615
},
{
"epoch": 0.7779171894604768,
"grad_norm": 0.34614309257239506,
"learning_rate": 1.946172224767015e-05,
"loss": 0.3816,
"step": 620
},
{
"epoch": 0.7841907151819323,
"grad_norm": 0.36738342533490437,
"learning_rate": 1.9443859728560458e-05,
"loss": 0.3809,
"step": 625
},
{
"epoch": 0.7904642409033877,
"grad_norm": 0.5642673777402893,
"learning_rate": 1.9425714117774183e-05,
"loss": 0.3678,
"step": 630
},
{
"epoch": 0.7967377666248432,
"grad_norm": 0.3734297840619315,
"learning_rate": 1.940728595924904e-05,
"loss": 0.3856,
"step": 635
},
{
"epoch": 0.8030112923462986,
"grad_norm": 0.34992964702526325,
"learning_rate": 1.9388575805392453e-05,
"loss": 0.3937,
"step": 640
},
{
"epoch": 0.8092848180677541,
"grad_norm": 0.34532043284186725,
"learning_rate": 1.9369584217065025e-05,
"loss": 0.3718,
"step": 645
},
{
"epoch": 0.8155583437892095,
"grad_norm": 0.3390467838397306,
"learning_rate": 1.935031176356371e-05,
"loss": 0.3829,
"step": 650
},
{
"epoch": 0.821831869510665,
"grad_norm": 0.3522889151490647,
"learning_rate": 1.933075902260475e-05,
"loss": 0.3876,
"step": 655
},
{
"epoch": 0.8281053952321205,
"grad_norm": 0.38123310023732215,
"learning_rate": 1.9310926580306365e-05,
"loss": 0.3707,
"step": 660
},
{
"epoch": 0.8343789209535759,
"grad_norm": 0.37101425687357464,
"learning_rate": 1.929081503117117e-05,
"loss": 0.3777,
"step": 665
},
{
"epoch": 0.8406524466750314,
"grad_norm": 0.35327596368808917,
"learning_rate": 1.9270424978068368e-05,
"loss": 0.372,
"step": 670
},
{
"epoch": 0.8469259723964868,
"grad_norm": 0.48025305553666425,
"learning_rate": 1.9249757032215674e-05,
"loss": 0.3719,
"step": 675
},
{
"epoch": 0.8531994981179423,
"grad_norm": 1.3954653921865106,
"learning_rate": 1.9228811813160972e-05,
"loss": 0.3774,
"step": 680
},
{
"epoch": 0.8594730238393977,
"grad_norm": 0.3729066926513616,
"learning_rate": 1.920758994876379e-05,
"loss": 0.3845,
"step": 685
},
{
"epoch": 0.8657465495608532,
"grad_norm": 0.3368507345057043,
"learning_rate": 1.918609207517643e-05,
"loss": 0.3674,
"step": 690
},
{
"epoch": 0.8720200752823086,
"grad_norm": 0.3381829661628343,
"learning_rate": 1.9164318836824928e-05,
"loss": 0.3895,
"step": 695
},
{
"epoch": 0.8782936010037641,
"grad_norm": 0.35406742056379137,
"learning_rate": 1.9142270886389726e-05,
"loss": 0.3888,
"step": 700
},
{
"epoch": 0.8845671267252195,
"grad_norm": 0.35267605322252205,
"learning_rate": 1.911994888478611e-05,
"loss": 0.416,
"step": 705
},
{
"epoch": 0.890840652446675,
"grad_norm": 0.36994927588198245,
"learning_rate": 1.9097353501144403e-05,
"loss": 0.3881,
"step": 710
},
{
"epoch": 0.8971141781681304,
"grad_norm": 0.344878616221491,
"learning_rate": 1.9074485412789886e-05,
"loss": 0.3916,
"step": 715
},
{
"epoch": 0.903387703889586,
"grad_norm": 0.3618364967642679,
"learning_rate": 1.9051345305222527e-05,
"loss": 0.3877,
"step": 720
},
{
"epoch": 0.9096612296110415,
"grad_norm": 0.3348128268620291,
"learning_rate": 1.9027933872096403e-05,
"loss": 0.3758,
"step": 725
},
{
"epoch": 0.9159347553324969,
"grad_norm": 0.34960639615133043,
"learning_rate": 1.900425181519893e-05,
"loss": 0.3842,
"step": 730
},
{
"epoch": 0.9222082810539524,
"grad_norm": 0.36222991570969465,
"learning_rate": 1.8980299844429804e-05,
"loss": 0.379,
"step": 735
},
{
"epoch": 0.9284818067754078,
"grad_norm": 0.3499002387438987,
"learning_rate": 1.8956078677779738e-05,
"loss": 0.3715,
"step": 740
},
{
"epoch": 0.9347553324968633,
"grad_norm": 0.33211819623666405,
"learning_rate": 1.8931589041308926e-05,
"loss": 0.3736,
"step": 745
},
{
"epoch": 0.9410288582183187,
"grad_norm": 0.3495894811325005,
"learning_rate": 1.8906831669125293e-05,
"loss": 0.3778,
"step": 750
},
{
"epoch": 0.9473023839397742,
"grad_norm": 0.3315684966056081,
"learning_rate": 1.8881807303362484e-05,
"loss": 0.3776,
"step": 755
},
{
"epoch": 0.9535759096612296,
"grad_norm": 0.3409622621170755,
"learning_rate": 1.885651669415761e-05,
"loss": 0.3873,
"step": 760
},
{
"epoch": 0.9598494353826851,
"grad_norm": 0.34785709596472625,
"learning_rate": 1.883096059962876e-05,
"loss": 0.3861,
"step": 765
},
{
"epoch": 0.9661229611041405,
"grad_norm": 0.3477615933807258,
"learning_rate": 1.8805139785852297e-05,
"loss": 0.3783,
"step": 770
},
{
"epoch": 0.972396486825596,
"grad_norm": 0.343324271874741,
"learning_rate": 1.877905502683987e-05,
"loss": 0.3746,
"step": 775
},
{
"epoch": 0.9786700125470514,
"grad_norm": 0.47223667640674677,
"learning_rate": 1.8752707104515223e-05,
"loss": 0.3793,
"step": 780
},
{
"epoch": 0.9849435382685069,
"grad_norm": 0.3234411234996811,
"learning_rate": 1.8726096808690757e-05,
"loss": 0.3676,
"step": 785
},
{
"epoch": 0.9912170639899623,
"grad_norm": 0.3574211042049705,
"learning_rate": 1.8699224937043846e-05,
"loss": 0.3735,
"step": 790
},
{
"epoch": 0.9974905897114178,
"grad_norm": 0.3811194221638418,
"learning_rate": 1.8672092295092935e-05,
"loss": 0.3939,
"step": 795
},
{
"epoch": 1.0,
"eval_loss": 0.3792824149131775,
"eval_runtime": 2.8922,
"eval_samples_per_second": 12.101,
"eval_steps_per_second": 0.692,
"step": 797
},
{
"epoch": 1.0037641154328734,
"grad_norm": 0.3353538294927158,
"learning_rate": 1.8644699696173393e-05,
"loss": 0.3241,
"step": 800
},
{
"epoch": 1.0100376411543288,
"grad_norm": 0.3828496951284579,
"learning_rate": 1.8617047961413122e-05,
"loss": 0.2601,
"step": 805
},
{
"epoch": 1.0163111668757843,
"grad_norm": 0.33996880676819174,
"learning_rate": 1.858913791970795e-05,
"loss": 0.2689,
"step": 810
},
{
"epoch": 1.0225846925972397,
"grad_norm": 0.3787602779680391,
"learning_rate": 1.8560970407696787e-05,
"loss": 0.2686,
"step": 815
},
{
"epoch": 1.0288582183186952,
"grad_norm": 0.5196175862012924,
"learning_rate": 1.8532546269736546e-05,
"loss": 0.2747,
"step": 820
},
{
"epoch": 1.0351317440401506,
"grad_norm": 0.37332309268508856,
"learning_rate": 1.850386635787682e-05,
"loss": 0.2627,
"step": 825
},
{
"epoch": 1.041405269761606,
"grad_norm": 0.3471559800133524,
"learning_rate": 1.847493153183435e-05,
"loss": 0.2787,
"step": 830
},
{
"epoch": 1.0476787954830615,
"grad_norm": 0.3558613761958236,
"learning_rate": 1.844574265896726e-05,
"loss": 0.268,
"step": 835
},
{
"epoch": 1.053952321204517,
"grad_norm": 0.3816351874674652,
"learning_rate": 1.8416300614249044e-05,
"loss": 0.2668,
"step": 840
},
{
"epoch": 1.0602258469259724,
"grad_norm": 0.34373084787867114,
"learning_rate": 1.8386606280242342e-05,
"loss": 0.2648,
"step": 845
},
{
"epoch": 1.066499372647428,
"grad_norm": 0.34772061503647006,
"learning_rate": 1.8356660547072493e-05,
"loss": 0.2664,
"step": 850
},
{
"epoch": 1.0727728983688833,
"grad_norm": 0.3575051030279196,
"learning_rate": 1.8326464312400835e-05,
"loss": 0.2741,
"step": 855
},
{
"epoch": 1.0790464240903388,
"grad_norm": 0.3496509974703566,
"learning_rate": 1.8296018481397818e-05,
"loss": 0.2583,
"step": 860
},
{
"epoch": 1.0853199498117942,
"grad_norm": 0.3724719074552806,
"learning_rate": 1.826532396671585e-05,
"loss": 0.2694,
"step": 865
},
{
"epoch": 1.0915934755332497,
"grad_norm": 0.3727323786558387,
"learning_rate": 1.8234381688461943e-05,
"loss": 0.2772,
"step": 870
},
{
"epoch": 1.0978670012547052,
"grad_norm": 0.3335956714818362,
"learning_rate": 1.8203192574170154e-05,
"loss": 0.2563,
"step": 875
},
{
"epoch": 1.1041405269761606,
"grad_norm": 0.34610791981119976,
"learning_rate": 1.8171757558773747e-05,
"loss": 0.254,
"step": 880
},
{
"epoch": 1.110414052697616,
"grad_norm": 0.3647153604437883,
"learning_rate": 1.8140077584577193e-05,
"loss": 0.269,
"step": 885
},
{
"epoch": 1.1166875784190715,
"grad_norm": 0.3620894127576232,
"learning_rate": 1.81081536012279e-05,
"loss": 0.2642,
"step": 890
},
{
"epoch": 1.122961104140527,
"grad_norm": 0.3368502269678017,
"learning_rate": 1.8075986565687785e-05,
"loss": 0.2621,
"step": 895
},
{
"epoch": 1.1292346298619824,
"grad_norm": 0.3487017808626127,
"learning_rate": 1.804357744220454e-05,
"loss": 0.2741,
"step": 900
},
{
"epoch": 1.1355081555834379,
"grad_norm": 0.346569504128483,
"learning_rate": 1.8010927202282758e-05,
"loss": 0.2522,
"step": 905
},
{
"epoch": 1.1417816813048933,
"grad_norm": 0.3448349298442695,
"learning_rate": 1.7978036824654806e-05,
"loss": 0.2539,
"step": 910
},
{
"epoch": 1.1480552070263488,
"grad_norm": 0.35055993507582917,
"learning_rate": 1.7944907295251478e-05,
"loss": 0.2716,
"step": 915
},
{
"epoch": 1.1543287327478042,
"grad_norm": 0.3483298654120021,
"learning_rate": 1.7911539607172447e-05,
"loss": 0.2585,
"step": 920
},
{
"epoch": 1.1606022584692597,
"grad_norm": 0.3452159026039351,
"learning_rate": 1.78779347606565e-05,
"loss": 0.2598,
"step": 925
},
{
"epoch": 1.1668757841907151,
"grad_norm": 0.3434243523852655,
"learning_rate": 1.7844093763051543e-05,
"loss": 0.2681,
"step": 930
},
{
"epoch": 1.1731493099121706,
"grad_norm": 0.3459951917334934,
"learning_rate": 1.7810017628784416e-05,
"loss": 0.2567,
"step": 935
},
{
"epoch": 1.179422835633626,
"grad_norm": 0.31955125292674175,
"learning_rate": 1.777570737933047e-05,
"loss": 0.2673,
"step": 940
},
{
"epoch": 1.1856963613550815,
"grad_norm": 0.33191052346485606,
"learning_rate": 1.7741164043182967e-05,
"loss": 0.258,
"step": 945
},
{
"epoch": 1.191969887076537,
"grad_norm": 0.3745301206766351,
"learning_rate": 1.7706388655822223e-05,
"loss": 0.2671,
"step": 950
},
{
"epoch": 1.1982434127979924,
"grad_norm": 0.3435343557827377,
"learning_rate": 1.7671382259684603e-05,
"loss": 0.2711,
"step": 955
},
{
"epoch": 1.2045169385194479,
"grad_norm": 0.3522241286327592,
"learning_rate": 1.7636145904131233e-05,
"loss": 0.2715,
"step": 960
},
{
"epoch": 1.2107904642409033,
"grad_norm": 0.34428731270476376,
"learning_rate": 1.7600680645416583e-05,
"loss": 0.2655,
"step": 965
},
{
"epoch": 1.2170639899623588,
"grad_norm": 0.3534701552438621,
"learning_rate": 1.7564987546656778e-05,
"loss": 0.2601,
"step": 970
},
{
"epoch": 1.2233375156838142,
"grad_norm": 0.34480580113424486,
"learning_rate": 1.7529067677797727e-05,
"loss": 0.2581,
"step": 975
},
{
"epoch": 1.2296110414052697,
"grad_norm": 0.3741756431765012,
"learning_rate": 1.7492922115583077e-05,
"loss": 0.2701,
"step": 980
},
{
"epoch": 1.2358845671267251,
"grad_norm": 0.3555549239846533,
"learning_rate": 1.745655194352191e-05,
"loss": 0.2716,
"step": 985
},
{
"epoch": 1.2421580928481806,
"grad_norm": 0.3327711036535926,
"learning_rate": 1.7419958251856276e-05,
"loss": 0.2577,
"step": 990
},
{
"epoch": 1.248431618569636,
"grad_norm": 0.3661107928778811,
"learning_rate": 1.738314213752851e-05,
"loss": 0.2649,
"step": 995
},
{
"epoch": 1.2547051442910915,
"grad_norm": 0.3407939311803759,
"learning_rate": 1.7346104704148343e-05,
"loss": 0.2506,
"step": 1000
},
{
"epoch": 1.260978670012547,
"grad_norm": 0.33612826697533044,
"learning_rate": 1.730884706195983e-05,
"loss": 0.2645,
"step": 1005
},
{
"epoch": 1.2672521957340024,
"grad_norm": 0.34109027432250294,
"learning_rate": 1.727137032780807e-05,
"loss": 0.2687,
"step": 1010
},
{
"epoch": 1.2735257214554578,
"grad_norm": 0.3424345987544216,
"learning_rate": 1.7233675625105703e-05,
"loss": 0.2659,
"step": 1015
},
{
"epoch": 1.2797992471769133,
"grad_norm": 0.3670304260632612,
"learning_rate": 1.7195764083799277e-05,
"loss": 0.2785,
"step": 1020
},
{
"epoch": 1.286072772898369,
"grad_norm": 0.35224545856472056,
"learning_rate": 1.7157636840335334e-05,
"loss": 0.2736,
"step": 1025
},
{
"epoch": 1.2923462986198244,
"grad_norm": 0.8557471051222927,
"learning_rate": 1.7119295037626366e-05,
"loss": 0.2598,
"step": 1030
},
{
"epoch": 1.2986198243412799,
"grad_norm": 0.3367737302829996,
"learning_rate": 1.708073982501656e-05,
"loss": 0.2612,
"step": 1035
},
{
"epoch": 1.3048933500627353,
"grad_norm": 0.34877985553107826,
"learning_rate": 1.704197235824732e-05,
"loss": 0.2726,
"step": 1040
},
{
"epoch": 1.3111668757841908,
"grad_norm": 0.34440291584591926,
"learning_rate": 1.7002993799422652e-05,
"loss": 0.2618,
"step": 1045
},
{
"epoch": 1.3174404015056462,
"grad_norm": 0.33534085525712676,
"learning_rate": 1.6963805316974303e-05,
"loss": 0.2609,
"step": 1050
},
{
"epoch": 1.3237139272271017,
"grad_norm": 0.3662577678235449,
"learning_rate": 1.6924408085626756e-05,
"loss": 0.2571,
"step": 1055
},
{
"epoch": 1.3299874529485571,
"grad_norm": 0.3516887590807691,
"learning_rate": 1.6884803286362e-05,
"loss": 0.2549,
"step": 1060
},
{
"epoch": 1.3362609786700126,
"grad_norm": 0.35333946604034366,
"learning_rate": 1.684499210638414e-05,
"loss": 0.264,
"step": 1065
},
{
"epoch": 1.342534504391468,
"grad_norm": 0.34307627027241056,
"learning_rate": 1.6804975739083803e-05,
"loss": 0.2503,
"step": 1070
},
{
"epoch": 1.3488080301129235,
"grad_norm": 0.35493913250542247,
"learning_rate": 1.6764755384002372e-05,
"loss": 0.2759,
"step": 1075
},
{
"epoch": 1.355081555834379,
"grad_norm": 0.3540415537021871,
"learning_rate": 1.6724332246796008e-05,
"loss": 0.2697,
"step": 1080
},
{
"epoch": 1.3613550815558344,
"grad_norm": 0.3318252959324338,
"learning_rate": 1.6683707539199538e-05,
"loss": 0.2669,
"step": 1085
},
{
"epoch": 1.3676286072772899,
"grad_norm": 0.33137640825273385,
"learning_rate": 1.6642882478990112e-05,
"loss": 0.2485,
"step": 1090
},
{
"epoch": 1.3739021329987453,
"grad_norm": 0.34017928191383223,
"learning_rate": 1.66018582899507e-05,
"loss": 0.2784,
"step": 1095
},
{
"epoch": 1.3801756587202008,
"grad_norm": 0.34200366455572445,
"learning_rate": 1.6560636201833423e-05,
"loss": 0.2673,
"step": 1100
},
{
"epoch": 1.3864491844416562,
"grad_norm": 0.35384418379159516,
"learning_rate": 1.6519217450322657e-05,
"loss": 0.2713,
"step": 1105
},
{
"epoch": 1.3927227101631117,
"grad_norm": 0.33307624664228463,
"learning_rate": 1.6477603276998037e-05,
"loss": 0.2742,
"step": 1110
},
{
"epoch": 1.3989962358845671,
"grad_norm": 0.356765357000532,
"learning_rate": 1.64357949292972e-05,
"loss": 0.2689,
"step": 1115
},
{
"epoch": 1.4052697616060226,
"grad_norm": 0.3252207580977864,
"learning_rate": 1.6393793660478406e-05,
"loss": 0.2506,
"step": 1120
},
{
"epoch": 1.411543287327478,
"grad_norm": 0.36204092721369197,
"learning_rate": 1.6351600729582977e-05,
"loss": 0.2636,
"step": 1125
},
{
"epoch": 1.4178168130489335,
"grad_norm": 0.3335102584738542,
"learning_rate": 1.630921740139755e-05,
"loss": 0.2616,
"step": 1130
},
{
"epoch": 1.424090338770389,
"grad_norm": 0.34898006017841243,
"learning_rate": 1.6266644946416148e-05,
"loss": 0.2781,
"step": 1135
},
{
"epoch": 1.4303638644918444,
"grad_norm": 0.3487474238464629,
"learning_rate": 1.622388464080213e-05,
"loss": 0.2773,
"step": 1140
},
{
"epoch": 1.4366373902132998,
"grad_norm": 0.35273437538491903,
"learning_rate": 1.61809377663499e-05,
"loss": 0.2682,
"step": 1145
},
{
"epoch": 1.4429109159347553,
"grad_norm": 0.36466350484277693,
"learning_rate": 1.6137805610446508e-05,
"loss": 0.2685,
"step": 1150
},
{
"epoch": 1.4491844416562107,
"grad_norm": 0.35090011471222154,
"learning_rate": 1.609448946603304e-05,
"loss": 0.2657,
"step": 1155
},
{
"epoch": 1.4554579673776662,
"grad_norm": 0.36713922458350784,
"learning_rate": 1.6050990631565894e-05,
"loss": 0.276,
"step": 1160
},
{
"epoch": 1.4617314930991216,
"grad_norm": 0.3552048435134842,
"learning_rate": 1.6007310410977807e-05,
"loss": 0.2796,
"step": 1165
},
{
"epoch": 1.468005018820577,
"grad_norm": 0.3469345931045424,
"learning_rate": 1.5963450113638815e-05,
"loss": 0.2592,
"step": 1170
},
{
"epoch": 1.4742785445420326,
"grad_norm": 2.7973818683619016,
"learning_rate": 1.5919411054316966e-05,
"loss": 0.2727,
"step": 1175
},
{
"epoch": 1.480552070263488,
"grad_norm": 0.3477691166092876,
"learning_rate": 1.5875194553138942e-05,
"loss": 0.2708,
"step": 1180
},
{
"epoch": 1.4868255959849435,
"grad_norm": 0.42047181326852134,
"learning_rate": 1.5830801935550462e-05,
"loss": 0.2583,
"step": 1185
},
{
"epoch": 1.4930991217063991,
"grad_norm": 0.38351145606864295,
"learning_rate": 1.5786234532276555e-05,
"loss": 0.2665,
"step": 1190
},
{
"epoch": 1.4993726474278546,
"grad_norm": 0.33888116221517656,
"learning_rate": 1.574149367928168e-05,
"loss": 0.2788,
"step": 1195
},
{
"epoch": 1.50564617314931,
"grad_norm": 0.3533659596857954,
"learning_rate": 1.5696580717729665e-05,
"loss": 0.2709,
"step": 1200
},
{
"epoch": 1.5119196988707655,
"grad_norm": 0.33797242592368726,
"learning_rate": 1.5651496993943507e-05,
"loss": 0.2552,
"step": 1205
},
{
"epoch": 1.518193224592221,
"grad_norm": 0.3380805624100092,
"learning_rate": 1.5606243859365033e-05,
"loss": 0.2696,
"step": 1210
},
{
"epoch": 1.5244667503136764,
"grad_norm": 0.3589011909382504,
"learning_rate": 1.5560822670514356e-05,
"loss": 0.2729,
"step": 1215
},
{
"epoch": 1.5307402760351319,
"grad_norm": 0.3410908318740735,
"learning_rate": 1.5515234788949238e-05,
"loss": 0.2658,
"step": 1220
},
{
"epoch": 1.5370138017565873,
"grad_norm": 0.4071357427518539,
"learning_rate": 1.5469481581224274e-05,
"loss": 0.2618,
"step": 1225
},
{
"epoch": 1.5432873274780428,
"grad_norm": 0.4050327080992723,
"learning_rate": 1.5423564418849895e-05,
"loss": 0.2707,
"step": 1230
},
{
"epoch": 1.5495608531994982,
"grad_norm": 0.3512739607697075,
"learning_rate": 1.537748467825131e-05,
"loss": 0.2762,
"step": 1235
},
{
"epoch": 1.5558343789209537,
"grad_norm": 0.3548946907435895,
"learning_rate": 1.5331243740727203e-05,
"loss": 0.285,
"step": 1240
},
{
"epoch": 1.5621079046424091,
"grad_norm": 0.3241833124220892,
"learning_rate": 1.5284842992408336e-05,
"loss": 0.2675,
"step": 1245
},
{
"epoch": 1.5683814303638646,
"grad_norm": 0.35085102130243395,
"learning_rate": 1.5238283824216015e-05,
"loss": 0.2681,
"step": 1250
},
{
"epoch": 1.57465495608532,
"grad_norm": 0.37148590291911643,
"learning_rate": 1.5191567631820364e-05,
"loss": 0.27,
"step": 1255
},
{
"epoch": 1.5809284818067755,
"grad_norm": 0.3535604634586656,
"learning_rate": 1.5144695815598529e-05,
"loss": 0.2717,
"step": 1260
},
{
"epoch": 1.587202007528231,
"grad_norm": 0.34794195830922975,
"learning_rate": 1.5097669780592658e-05,
"loss": 0.2633,
"step": 1265
},
{
"epoch": 1.5934755332496864,
"grad_norm": 0.35489225458342305,
"learning_rate": 1.5050490936467814e-05,
"loss": 0.2735,
"step": 1270
},
{
"epoch": 1.5997490589711418,
"grad_norm": 0.3350637434620735,
"learning_rate": 1.5003160697469707e-05,
"loss": 0.2544,
"step": 1275
},
{
"epoch": 1.6060225846925973,
"grad_norm": 0.3381089856973841,
"learning_rate": 1.4955680482382296e-05,
"loss": 0.2564,
"step": 1280
},
{
"epoch": 1.6122961104140527,
"grad_norm": 0.32684634964616277,
"learning_rate": 1.4908051714485266e-05,
"loss": 0.2741,
"step": 1285
},
{
"epoch": 1.6185696361355082,
"grad_norm": 0.34645698891886967,
"learning_rate": 1.4860275821511359e-05,
"loss": 0.2748,
"step": 1290
},
{
"epoch": 1.6248431618569636,
"grad_norm": 0.36961906780329834,
"learning_rate": 1.481235423560358e-05,
"loss": 0.2721,
"step": 1295
},
{
"epoch": 1.631116687578419,
"grad_norm": 0.350579115602874,
"learning_rate": 1.4764288393272258e-05,
"loss": 0.2628,
"step": 1300
},
{
"epoch": 1.6373902132998746,
"grad_norm": 0.3306477114399784,
"learning_rate": 1.4716079735352006e-05,
"loss": 0.2729,
"step": 1305
},
{
"epoch": 1.64366373902133,
"grad_norm": 0.35455700175873195,
"learning_rate": 1.46677297069585e-05,
"loss": 0.2667,
"step": 1310
},
{
"epoch": 1.6499372647427855,
"grad_norm": 0.33847253281006606,
"learning_rate": 1.4619239757445187e-05,
"loss": 0.2706,
"step": 1315
},
{
"epoch": 1.656210790464241,
"grad_norm": 0.34327567130216446,
"learning_rate": 1.4570611340359821e-05,
"loss": 0.266,
"step": 1320
},
{
"epoch": 1.6624843161856964,
"grad_norm": 0.3557992340297897,
"learning_rate": 1.4521845913400891e-05,
"loss": 0.2746,
"step": 1325
},
{
"epoch": 1.6687578419071518,
"grad_norm": 0.35121604621554686,
"learning_rate": 1.4472944938373945e-05,
"loss": 0.2704,
"step": 1330
},
{
"epoch": 1.6750313676286073,
"grad_norm": 0.34165359487510566,
"learning_rate": 1.4423909881147747e-05,
"loss": 0.2692,
"step": 1335
},
{
"epoch": 1.6813048933500627,
"grad_norm": 0.36505578592627197,
"learning_rate": 1.4374742211610345e-05,
"loss": 0.2662,
"step": 1340
},
{
"epoch": 1.6875784190715182,
"grad_norm": 0.34422086687232467,
"learning_rate": 1.4325443403625012e-05,
"loss": 0.275,
"step": 1345
},
{
"epoch": 1.6938519447929736,
"grad_norm": 0.3675735392039838,
"learning_rate": 1.4276014934986064e-05,
"loss": 0.272,
"step": 1350
},
{
"epoch": 1.700125470514429,
"grad_norm": 0.35389447351847136,
"learning_rate": 1.4226458287374555e-05,
"loss": 0.2713,
"step": 1355
},
{
"epoch": 1.7063989962358845,
"grad_norm": 0.3379229270723559,
"learning_rate": 1.4176774946313872e-05,
"loss": 0.2625,
"step": 1360
},
{
"epoch": 1.71267252195734,
"grad_norm": 0.34402002879314064,
"learning_rate": 1.4126966401125189e-05,
"loss": 0.268,
"step": 1365
},
{
"epoch": 1.7189460476787954,
"grad_norm": 0.3532344899080162,
"learning_rate": 1.4077034144882843e-05,
"loss": 0.2632,
"step": 1370
},
{
"epoch": 1.725219573400251,
"grad_norm": 0.3401295622140909,
"learning_rate": 1.4026979674369566e-05,
"loss": 0.2613,
"step": 1375
},
{
"epoch": 1.7314930991217063,
"grad_norm": 0.3391840532185442,
"learning_rate": 1.3976804490031608e-05,
"loss": 0.2719,
"step": 1380
},
{
"epoch": 1.7377666248431618,
"grad_norm": 0.3427490028776178,
"learning_rate": 1.3926510095933781e-05,
"loss": 0.2692,
"step": 1385
},
{
"epoch": 1.7440401505646173,
"grad_norm": 0.3433287489294571,
"learning_rate": 1.387609799971435e-05,
"loss": 0.2649,
"step": 1390
},
{
"epoch": 1.7503136762860727,
"grad_norm": 0.33114909580993174,
"learning_rate": 1.3825569712539864e-05,
"loss": 0.2527,
"step": 1395
},
{
"epoch": 1.7565872020075282,
"grad_norm": 0.337515925683474,
"learning_rate": 1.3774926749059826e-05,
"loss": 0.2556,
"step": 1400
},
{
"epoch": 1.7628607277289836,
"grad_norm": 0.3419996547297016,
"learning_rate": 1.3724170627361323e-05,
"loss": 0.2638,
"step": 1405
},
{
"epoch": 1.769134253450439,
"grad_norm": 0.35073376743994084,
"learning_rate": 1.3673302868923491e-05,
"loss": 0.2704,
"step": 1410
},
{
"epoch": 1.7754077791718945,
"grad_norm": 0.32861646593191174,
"learning_rate": 1.3622324998571928e-05,
"loss": 0.2519,
"step": 1415
},
{
"epoch": 1.78168130489335,
"grad_norm": 0.33312714617584277,
"learning_rate": 1.3571238544432968e-05,
"loss": 0.2664,
"step": 1420
},
{
"epoch": 1.7879548306148054,
"grad_norm": 0.3374614354621205,
"learning_rate": 1.352004503788789e-05,
"loss": 0.2585,
"step": 1425
},
{
"epoch": 1.7942283563362609,
"grad_norm": 0.3554543246581463,
"learning_rate": 1.3468746013527e-05,
"loss": 0.2762,
"step": 1430
},
{
"epoch": 1.8005018820577163,
"grad_norm": 0.3367124035830617,
"learning_rate": 1.3417343009103634e-05,
"loss": 0.261,
"step": 1435
},
{
"epoch": 1.8067754077791718,
"grad_norm": 0.3181211071389625,
"learning_rate": 1.3365837565488065e-05,
"loss": 0.2715,
"step": 1440
},
{
"epoch": 1.8130489335006272,
"grad_norm": 0.3480040032313692,
"learning_rate": 1.3314231226621305e-05,
"loss": 0.2624,
"step": 1445
},
{
"epoch": 1.8193224592220827,
"grad_norm": 0.3416171971995866,
"learning_rate": 1.3262525539468839e-05,
"loss": 0.2642,
"step": 1450
},
{
"epoch": 1.8255959849435381,
"grad_norm": 0.3635819899278629,
"learning_rate": 1.3210722053974233e-05,
"loss": 0.2632,
"step": 1455
},
{
"epoch": 1.8318695106649936,
"grad_norm": 0.3265652400172599,
"learning_rate": 1.315882232301269e-05,
"loss": 0.2612,
"step": 1460
},
{
"epoch": 1.838143036386449,
"grad_norm": 0.3342794882432052,
"learning_rate": 1.3106827902344485e-05,
"loss": 0.2623,
"step": 1465
},
{
"epoch": 1.8444165621079045,
"grad_norm": 0.3347520583742969,
"learning_rate": 1.3054740350568346e-05,
"loss": 0.2741,
"step": 1470
},
{
"epoch": 1.85069008782936,
"grad_norm": 0.34328068285452285,
"learning_rate": 1.3002561229074719e-05,
"loss": 0.2561,
"step": 1475
},
{
"epoch": 1.8569636135508154,
"grad_norm": 0.35131941587904497,
"learning_rate": 1.2950292101998967e-05,
"loss": 0.2747,
"step": 1480
},
{
"epoch": 1.8632371392722709,
"grad_norm": 0.34360731854818805,
"learning_rate": 1.289793453617449e-05,
"loss": 0.2627,
"step": 1485
},
{
"epoch": 1.8695106649937263,
"grad_norm": 0.3498923646707763,
"learning_rate": 1.2845490101085744e-05,
"loss": 0.2562,
"step": 1490
},
{
"epoch": 1.875784190715182,
"grad_norm": 0.34701974294822086,
"learning_rate": 1.2792960368821212e-05,
"loss": 0.265,
"step": 1495
},
{
"epoch": 1.8820577164366374,
"grad_norm": 0.33841922800891855,
"learning_rate": 1.2740346914026258e-05,
"loss": 0.2638,
"step": 1500
},
{
"epoch": 1.888331242158093,
"grad_norm": 0.3375511420369947,
"learning_rate": 1.2687651313855937e-05,
"loss": 0.2589,
"step": 1505
},
{
"epoch": 1.8946047678795483,
"grad_norm": 0.34124342881268466,
"learning_rate": 1.2634875147927726e-05,
"loss": 0.2689,
"step": 1510
},
{
"epoch": 1.9008782936010038,
"grad_norm": 0.3492720225961315,
"learning_rate": 1.2582019998274142e-05,
"loss": 0.2619,
"step": 1515
},
{
"epoch": 1.9071518193224593,
"grad_norm": 0.3424592345393382,
"learning_rate": 1.252908744929536e-05,
"loss": 0.2673,
"step": 1520
},
{
"epoch": 1.9134253450439147,
"grad_norm": 0.3541786605023589,
"learning_rate": 1.2476079087711695e-05,
"loss": 0.2741,
"step": 1525
},
{
"epoch": 1.9196988707653702,
"grad_norm": 0.36100779817450435,
"learning_rate": 1.2422996502516023e-05,
"loss": 0.2708,
"step": 1530
},
{
"epoch": 1.9259723964868256,
"grad_norm": 0.3637900870051361,
"learning_rate": 1.236984128492619e-05,
"loss": 0.2679,
"step": 1535
},
{
"epoch": 1.932245922208281,
"grad_norm": 0.34654925833715405,
"learning_rate": 1.231661502833728e-05,
"loss": 0.2705,
"step": 1540
},
{
"epoch": 1.9385194479297365,
"grad_norm": 0.3555711761052598,
"learning_rate": 1.2263319328273853e-05,
"loss": 0.2732,
"step": 1545
},
{
"epoch": 1.944792973651192,
"grad_norm": 0.4361336719233523,
"learning_rate": 1.220995578234214e-05,
"loss": 0.2818,
"step": 1550
},
{
"epoch": 1.9510664993726474,
"grad_norm": 0.35818262065748885,
"learning_rate": 1.2156525990182132e-05,
"loss": 0.2714,
"step": 1555
},
{
"epoch": 1.9573400250941029,
"grad_norm": 0.34020072227504516,
"learning_rate": 1.2103031553419629e-05,
"loss": 0.2561,
"step": 1560
},
{
"epoch": 1.9636135508155583,
"grad_norm": 0.35424378525712236,
"learning_rate": 1.2049474075618244e-05,
"loss": 0.2817,
"step": 1565
},
{
"epoch": 1.9698870765370138,
"grad_norm": 0.3436811391936569,
"learning_rate": 1.1995855162231323e-05,
"loss": 0.2727,
"step": 1570
},
{
"epoch": 1.9761606022584692,
"grad_norm": 0.35547373104319596,
"learning_rate": 1.1942176420553817e-05,
"loss": 0.279,
"step": 1575
},
{
"epoch": 1.9824341279799247,
"grad_norm": 0.33151574599317196,
"learning_rate": 1.1888439459674107e-05,
"loss": 0.2736,
"step": 1580
},
{
"epoch": 1.9887076537013801,
"grad_norm": 0.3489287650284772,
"learning_rate": 1.1834645890425773e-05,
"loss": 0.2674,
"step": 1585
},
{
"epoch": 1.9949811794228356,
"grad_norm": 0.4674786217571983,
"learning_rate": 1.1780797325339301e-05,
"loss": 0.2618,
"step": 1590
},
{
"epoch": 2.0,
"eval_loss": 0.3876406252384186,
"eval_runtime": 2.3724,
"eval_samples_per_second": 14.753,
"eval_steps_per_second": 0.843,
"step": 1594
},
{
"epoch": 2.0012547051442913,
"grad_norm": 0.41826837808172157,
"learning_rate": 1.1726895378593745e-05,
"loss": 0.2453,
"step": 1595
},
{
"epoch": 2.0075282308657467,
"grad_norm": 0.3896113439351613,
"learning_rate": 1.167294166596834e-05,
"loss": 0.1307,
"step": 1600
},
{
"epoch": 2.013801756587202,
"grad_norm": 0.40723627183822325,
"learning_rate": 1.1618937804794077e-05,
"loss": 0.1253,
"step": 1605
},
{
"epoch": 2.0200752823086576,
"grad_norm": 0.3393249627107008,
"learning_rate": 1.1564885413905205e-05,
"loss": 0.1212,
"step": 1610
},
{
"epoch": 2.026348808030113,
"grad_norm": 0.3969867667512431,
"learning_rate": 1.1510786113590715e-05,
"loss": 0.1213,
"step": 1615
},
{
"epoch": 2.0326223337515685,
"grad_norm": 0.3807098036418188,
"learning_rate": 1.1456641525545768e-05,
"loss": 0.115,
"step": 1620
},
{
"epoch": 2.038895859473024,
"grad_norm": 0.3600283222530161,
"learning_rate": 1.1402453272823086e-05,
"loss": 0.1178,
"step": 1625
},
{
"epoch": 2.0451693851944794,
"grad_norm": 0.37082981681871713,
"learning_rate": 1.1348222979784289e-05,
"loss": 0.1186,
"step": 1630
},
{
"epoch": 2.051442910915935,
"grad_norm": 0.3701722986939684,
"learning_rate": 1.1293952272051217e-05,
"loss": 0.1161,
"step": 1635
},
{
"epoch": 2.0577164366373903,
"grad_norm": 0.39437640184582917,
"learning_rate": 1.1239642776457176e-05,
"loss": 0.112,
"step": 1640
},
{
"epoch": 2.063989962358846,
"grad_norm": 0.35512823472089206,
"learning_rate": 1.1185296120998208e-05,
"loss": 0.1227,
"step": 1645
},
{
"epoch": 2.0702634880803013,
"grad_norm": 0.39699134768151145,
"learning_rate": 1.1130913934784255e-05,
"loss": 0.118,
"step": 1650
},
{
"epoch": 2.0765370138017567,
"grad_norm": 0.361766879756225,
"learning_rate": 1.107649784799034e-05,
"loss": 0.1148,
"step": 1655
},
{
"epoch": 2.082810539523212,
"grad_norm": 0.37983521046428353,
"learning_rate": 1.1022049491807703e-05,
"loss": 0.1105,
"step": 1660
},
{
"epoch": 2.0890840652446676,
"grad_norm": 0.36791365726333974,
"learning_rate": 1.0967570498394895e-05,
"loss": 0.1197,
"step": 1665
},
{
"epoch": 2.095357590966123,
"grad_norm": 0.38778258880907535,
"learning_rate": 1.0913062500828865e-05,
"loss": 0.119,
"step": 1670
},
{
"epoch": 2.1016311166875785,
"grad_norm": 0.3686039497467697,
"learning_rate": 1.0858527133055994e-05,
"loss": 0.1197,
"step": 1675
},
{
"epoch": 2.107904642409034,
"grad_norm": 0.39330229406582323,
"learning_rate": 1.0803966029843114e-05,
"loss": 0.1166,
"step": 1680
},
{
"epoch": 2.1141781681304894,
"grad_norm": 0.3636181831711105,
"learning_rate": 1.0749380826728513e-05,
"loss": 0.1133,
"step": 1685
},
{
"epoch": 2.120451693851945,
"grad_norm": 0.4335397800674325,
"learning_rate": 1.0694773159972912e-05,
"loss": 0.1246,
"step": 1690
},
{
"epoch": 2.1267252195734003,
"grad_norm": 0.37508400928061725,
"learning_rate": 1.0640144666510392e-05,
"loss": 0.1196,
"step": 1695
},
{
"epoch": 2.132998745294856,
"grad_norm": 0.3987416537308343,
"learning_rate": 1.0585496983899361e-05,
"loss": 0.1226,
"step": 1700
},
{
"epoch": 2.1392722710163112,
"grad_norm": 0.38791702283560353,
"learning_rate": 1.0530831750273428e-05,
"loss": 0.1117,
"step": 1705
},
{
"epoch": 2.1455457967377667,
"grad_norm": 0.39592521931999036,
"learning_rate": 1.0476150604292329e-05,
"loss": 0.1198,
"step": 1710
},
{
"epoch": 2.151819322459222,
"grad_norm": 0.6134906074452066,
"learning_rate": 1.0421455185092784e-05,
"loss": 0.1168,
"step": 1715
},
{
"epoch": 2.1580928481806776,
"grad_norm": 0.3936394784460519,
"learning_rate": 1.0366747132239374e-05,
"loss": 0.1137,
"step": 1720
},
{
"epoch": 2.164366373902133,
"grad_norm": 0.38023062505112215,
"learning_rate": 1.0312028085675393e-05,
"loss": 0.1216,
"step": 1725
},
{
"epoch": 2.1706398996235885,
"grad_norm": 0.3879080250933175,
"learning_rate": 1.025729968567368e-05,
"loss": 0.1163,
"step": 1730
},
{
"epoch": 2.176913425345044,
"grad_norm": 0.37494689918032786,
"learning_rate": 1.0202563572787457e-05,
"loss": 0.1155,
"step": 1735
},
{
"epoch": 2.1831869510664994,
"grad_norm": 0.4170219240353852,
"learning_rate": 1.0147821387801154e-05,
"loss": 0.1231,
"step": 1740
},
{
"epoch": 2.189460476787955,
"grad_norm": 0.36447068742427746,
"learning_rate": 1.0093074771681214e-05,
"loss": 0.1173,
"step": 1745
},
{
"epoch": 2.1957340025094103,
"grad_norm": 0.37383691981995226,
"learning_rate": 1.003832536552691e-05,
"loss": 0.1181,
"step": 1750
},
{
"epoch": 2.2020075282308658,
"grad_norm": 0.35361007854482546,
"learning_rate": 9.983574810521151e-06,
"loss": 0.1141,
"step": 1755
},
{
"epoch": 2.208281053952321,
"grad_norm": 0.37593622722746173,
"learning_rate": 9.928824747881286e-06,
"loss": 0.117,
"step": 1760
},
{
"epoch": 2.2145545796737767,
"grad_norm": 0.38527811994324745,
"learning_rate": 9.874076818809903e-06,
"loss": 0.1222,
"step": 1765
},
{
"epoch": 2.220828105395232,
"grad_norm": 0.37756703878021675,
"learning_rate": 9.81933266444563e-06,
"loss": 0.117,
"step": 1770
},
{
"epoch": 2.2271016311166876,
"grad_norm": 0.3981056432095895,
"learning_rate": 9.76459392581395e-06,
"loss": 0.1187,
"step": 1775
},
{
"epoch": 2.233375156838143,
"grad_norm": 0.35779965724307555,
"learning_rate": 9.709862243777998e-06,
"loss": 0.1201,
"step": 1780
},
{
"epoch": 2.2396486825595985,
"grad_norm": 0.39287403460106407,
"learning_rate": 9.655139258989379e-06,
"loss": 0.1173,
"step": 1785
},
{
"epoch": 2.245922208281054,
"grad_norm": 0.39081524580807464,
"learning_rate": 9.60042661183899e-06,
"loss": 0.114,
"step": 1790
},
{
"epoch": 2.2521957340025094,
"grad_norm": 0.5348108468458116,
"learning_rate": 9.54572594240784e-06,
"loss": 0.1145,
"step": 1795
},
{
"epoch": 2.258469259723965,
"grad_norm": 0.35615572604956347,
"learning_rate": 9.491038890417894e-06,
"loss": 0.1128,
"step": 1800
},
{
"epoch": 2.2647427854454203,
"grad_norm": 0.37107476709616843,
"learning_rate": 9.436367095182916e-06,
"loss": 0.1228,
"step": 1805
},
{
"epoch": 2.2710163111668757,
"grad_norm": 0.38323250415960275,
"learning_rate": 9.381712195559324e-06,
"loss": 0.118,
"step": 1810
},
{
"epoch": 2.277289836888331,
"grad_norm": 0.359171122780413,
"learning_rate": 9.327075829897082e-06,
"loss": 0.1191,
"step": 1815
},
{
"epoch": 2.2835633626097867,
"grad_norm": 0.4114654516418914,
"learning_rate": 9.272459635990563e-06,
"loss": 0.1235,
"step": 1820
},
{
"epoch": 2.289836888331242,
"grad_norm": 0.39460304180293915,
"learning_rate": 9.217865251029469e-06,
"loss": 0.1187,
"step": 1825
},
{
"epoch": 2.2961104140526976,
"grad_norm": 0.35839055639361983,
"learning_rate": 9.163294311549753e-06,
"loss": 0.1156,
"step": 1830
},
{
"epoch": 2.302383939774153,
"grad_norm": 0.3482054803738314,
"learning_rate": 9.108748453384559e-06,
"loss": 0.1198,
"step": 1835
},
{
"epoch": 2.3086574654956085,
"grad_norm": 0.3820940219983755,
"learning_rate": 9.054229311615178e-06,
"loss": 0.117,
"step": 1840
},
{
"epoch": 2.314930991217064,
"grad_norm": 0.39037798204086893,
"learning_rate": 8.999738520522065e-06,
"loss": 0.1197,
"step": 1845
},
{
"epoch": 2.3212045169385194,
"grad_norm": 0.3822304512858218,
"learning_rate": 8.945277713535809e-06,
"loss": 0.1152,
"step": 1850
},
{
"epoch": 2.327478042659975,
"grad_norm": 0.3965998438998078,
"learning_rate": 8.890848523188192e-06,
"loss": 0.1243,
"step": 1855
},
{
"epoch": 2.3337515683814303,
"grad_norm": 0.3939416904284715,
"learning_rate": 8.836452581063248e-06,
"loss": 0.1195,
"step": 1860
},
{
"epoch": 2.3400250941028857,
"grad_norm": 0.4276348064874773,
"learning_rate": 8.78209151774835e-06,
"loss": 0.1211,
"step": 1865
},
{
"epoch": 2.346298619824341,
"grad_norm": 0.39140182898284753,
"learning_rate": 8.727766962785344e-06,
"loss": 0.1157,
"step": 1870
},
{
"epoch": 2.3525721455457966,
"grad_norm": 0.3634468198734603,
"learning_rate": 8.673480544621681e-06,
"loss": 0.1129,
"step": 1875
},
{
"epoch": 2.358845671267252,
"grad_norm": 0.39837989743789176,
"learning_rate": 8.61923389056162e-06,
"loss": 0.1198,
"step": 1880
},
{
"epoch": 2.3651191969887075,
"grad_norm": 0.3935841490044898,
"learning_rate": 8.565028626717435e-06,
"loss": 0.1203,
"step": 1885
},
{
"epoch": 2.371392722710163,
"grad_norm": 0.35305962598333074,
"learning_rate": 8.51086637796068e-06,
"loss": 0.1128,
"step": 1890
},
{
"epoch": 2.3776662484316184,
"grad_norm": 0.394308865525823,
"learning_rate": 8.456748767873474e-06,
"loss": 0.1124,
"step": 1895
},
{
"epoch": 2.383939774153074,
"grad_norm": 0.3841761354621664,
"learning_rate": 8.402677418699842e-06,
"loss": 0.1145,
"step": 1900
},
{
"epoch": 2.3902132998745294,
"grad_norm": 0.3893473191716482,
"learning_rate": 8.34865395129707e-06,
"loss": 0.1197,
"step": 1905
},
{
"epoch": 2.396486825595985,
"grad_norm": 0.42534674714643167,
"learning_rate": 8.294679985087137e-06,
"loss": 0.1179,
"step": 1910
},
{
"epoch": 2.4027603513174403,
"grad_norm": 0.40229677875453496,
"learning_rate": 8.240757138008149e-06,
"loss": 0.1236,
"step": 1915
},
{
"epoch": 2.4090338770388957,
"grad_norm": 0.37978448174191587,
"learning_rate": 8.186887026465857e-06,
"loss": 0.1125,
"step": 1920
},
{
"epoch": 2.415307402760351,
"grad_norm": 0.38090023973889275,
"learning_rate": 8.133071265285209e-06,
"loss": 0.1175,
"step": 1925
},
{
"epoch": 2.4215809284818066,
"grad_norm": 0.3709724709852035,
"learning_rate": 8.079311467661912e-06,
"loss": 0.1189,
"step": 1930
},
{
"epoch": 2.427854454203262,
"grad_norm": 0.3968859415543936,
"learning_rate": 8.025609245114107e-06,
"loss": 0.1208,
"step": 1935
},
{
"epoch": 2.4341279799247175,
"grad_norm": 0.38140929315858313,
"learning_rate": 7.971966207434045e-06,
"loss": 0.1167,
"step": 1940
},
{
"epoch": 2.440401505646173,
"grad_norm": 0.3813844728323988,
"learning_rate": 7.918383962639835e-06,
"loss": 0.1186,
"step": 1945
},
{
"epoch": 2.4466750313676284,
"grad_norm": 0.36925458542907064,
"learning_rate": 7.864864116927245e-06,
"loss": 0.1167,
"step": 1950
},
{
"epoch": 2.452948557089084,
"grad_norm": 0.3813928431538188,
"learning_rate": 7.811408274621549e-06,
"loss": 0.1217,
"step": 1955
},
{
"epoch": 2.4592220828105393,
"grad_norm": 0.38045397571366496,
"learning_rate": 7.75801803812944e-06,
"loss": 0.1176,
"step": 1960
},
{
"epoch": 2.4654956085319952,
"grad_norm": 0.409917244408148,
"learning_rate": 7.704695007890988e-06,
"loss": 0.1214,
"step": 1965
},
{
"epoch": 2.4717691342534502,
"grad_norm": 0.39881220216006136,
"learning_rate": 7.651440782331679e-06,
"loss": 0.1176,
"step": 1970
},
{
"epoch": 2.478042659974906,
"grad_norm": 0.3551138970811604,
"learning_rate": 7.598256957814479e-06,
"loss": 0.1156,
"step": 1975
},
{
"epoch": 2.484316185696361,
"grad_norm": 0.38209748278035194,
"learning_rate": 7.545145128592009e-06,
"loss": 0.1128,
"step": 1980
},
{
"epoch": 2.490589711417817,
"grad_norm": 0.38676641594007305,
"learning_rate": 7.49210688675873e-06,
"loss": 0.1175,
"step": 1985
},
{
"epoch": 2.496863237139272,
"grad_norm": 0.36950074837730973,
"learning_rate": 7.4391438222032265e-06,
"loss": 0.1139,
"step": 1990
},
{
"epoch": 2.503136762860728,
"grad_norm": 0.3915583584835345,
"learning_rate": 7.3862575225605535e-06,
"loss": 0.1179,
"step": 1995
},
{
"epoch": 2.509410288582183,
"grad_norm": 0.3866601171887957,
"learning_rate": 7.333449573164634e-06,
"loss": 0.1207,
"step": 2000
},
{
"epoch": 2.515683814303639,
"grad_norm": 0.4110664967201194,
"learning_rate": 7.280721557000759e-06,
"loss": 0.1166,
"step": 2005
},
{
"epoch": 2.521957340025094,
"grad_norm": 0.37778473055073203,
"learning_rate": 7.228075054658096e-06,
"loss": 0.1157,
"step": 2010
},
{
"epoch": 2.5282308657465498,
"grad_norm": 0.37225323631681123,
"learning_rate": 7.175511644282349e-06,
"loss": 0.1156,
"step": 2015
},
{
"epoch": 2.5345043914680048,
"grad_norm": 0.3824521057716352,
"learning_rate": 7.123032901528431e-06,
"loss": 0.1182,
"step": 2020
},
{
"epoch": 2.5407779171894607,
"grad_norm": 0.38115325746292966,
"learning_rate": 7.070640399513232e-06,
"loss": 0.1158,
"step": 2025
},
{
"epoch": 2.5470514429109157,
"grad_norm": 0.3822411406686136,
"learning_rate": 7.018335708768467e-06,
"loss": 0.1177,
"step": 2030
},
{
"epoch": 2.5533249686323716,
"grad_norm": 0.407300076312883,
"learning_rate": 6.966120397193605e-06,
"loss": 0.1152,
"step": 2035
},
{
"epoch": 2.5595984943538266,
"grad_norm": 0.3666367383521563,
"learning_rate": 6.913996030008853e-06,
"loss": 0.1153,
"step": 2040
},
{
"epoch": 2.5658720200752825,
"grad_norm": 0.3632368895081262,
"learning_rate": 6.861964169708245e-06,
"loss": 0.1107,
"step": 2045
},
{
"epoch": 2.572145545796738,
"grad_norm": 0.3649524530723953,
"learning_rate": 6.810026376012808e-06,
"loss": 0.1145,
"step": 2050
},
{
"epoch": 2.5784190715181934,
"grad_norm": 0.39293153907868017,
"learning_rate": 6.758184205823791e-06,
"loss": 0.1162,
"step": 2055
},
{
"epoch": 2.584692597239649,
"grad_norm": 0.3931749546166204,
"learning_rate": 6.706439213176028e-06,
"loss": 0.1132,
"step": 2060
},
{
"epoch": 2.5909661229611043,
"grad_norm": 0.3825377326068989,
"learning_rate": 6.654792949191317e-06,
"loss": 0.1222,
"step": 2065
},
{
"epoch": 2.5972396486825597,
"grad_norm": 0.35192545209688325,
"learning_rate": 6.603246962031942e-06,
"loss": 0.112,
"step": 2070
},
{
"epoch": 2.603513174404015,
"grad_norm": 0.3638182336052127,
"learning_rate": 6.551802796854265e-06,
"loss": 0.1144,
"step": 2075
},
{
"epoch": 2.6097867001254706,
"grad_norm": 0.38711833706267534,
"learning_rate": 6.500461995762402e-06,
"loss": 0.1133,
"step": 2080
},
{
"epoch": 2.616060225846926,
"grad_norm": 0.3897188834137444,
"learning_rate": 6.449226097762e-06,
"loss": 0.123,
"step": 2085
},
{
"epoch": 2.6223337515683816,
"grad_norm": 0.37823736524426615,
"learning_rate": 6.398096638714106e-06,
"loss": 0.1184,
"step": 2090
},
{
"epoch": 2.628607277289837,
"grad_norm": 0.3932949556705336,
"learning_rate": 6.34707515128912e-06,
"loss": 0.1143,
"step": 2095
},
{
"epoch": 2.6348808030112925,
"grad_norm": 0.3884831444296721,
"learning_rate": 6.296163164920858e-06,
"loss": 0.1113,
"step": 2100
},
{
"epoch": 2.641154328732748,
"grad_norm": 0.36382200051340396,
"learning_rate": 6.245362205760703e-06,
"loss": 0.1159,
"step": 2105
},
{
"epoch": 2.6474278544542034,
"grad_norm": 0.3924419492733963,
"learning_rate": 6.194673796631852e-06,
"loss": 0.109,
"step": 2110
},
{
"epoch": 2.653701380175659,
"grad_norm": 0.3947013863716826,
"learning_rate": 6.144099456983681e-06,
"loss": 0.1115,
"step": 2115
},
{
"epoch": 2.6599749058971143,
"grad_norm": 0.3943261219862062,
"learning_rate": 6.093640702846182e-06,
"loss": 0.1122,
"step": 2120
},
{
"epoch": 2.6662484316185697,
"grad_norm": 0.39005774295522977,
"learning_rate": 6.043299046784526e-06,
"loss": 0.1187,
"step": 2125
},
{
"epoch": 2.672521957340025,
"grad_norm": 0.37092235530502005,
"learning_rate": 5.993075997853719e-06,
"loss": 0.1149,
"step": 2130
},
{
"epoch": 2.6787954830614806,
"grad_norm": 0.3807422171419007,
"learning_rate": 5.94297306155337e-06,
"loss": 0.1135,
"step": 2135
},
{
"epoch": 2.685069008782936,
"grad_norm": 0.3792169573613968,
"learning_rate": 5.892991739782557e-06,
"loss": 0.1199,
"step": 2140
},
{
"epoch": 2.6913425345043915,
"grad_norm": 0.3831811436833811,
"learning_rate": 5.843133530794817e-06,
"loss": 0.1096,
"step": 2145
},
{
"epoch": 2.697616060225847,
"grad_norm": 0.37933754782226464,
"learning_rate": 5.793399929153216e-06,
"loss": 0.1106,
"step": 2150
},
{
"epoch": 2.7038895859473024,
"grad_norm": 0.38366361518402914,
"learning_rate": 5.743792425685554e-06,
"loss": 0.1154,
"step": 2155
},
{
"epoch": 2.710163111668758,
"grad_norm": 0.3631788769456335,
"learning_rate": 5.694312507439691e-06,
"loss": 0.1141,
"step": 2160
},
{
"epoch": 2.7164366373902133,
"grad_norm": 0.39467438603923816,
"learning_rate": 5.644961657638942e-06,
"loss": 0.1148,
"step": 2165
},
{
"epoch": 2.722710163111669,
"grad_norm": 0.3643220647785576,
"learning_rate": 5.595741355637645e-06,
"loss": 0.1098,
"step": 2170
},
{
"epoch": 2.7289836888331243,
"grad_norm": 0.39291715101415214,
"learning_rate": 5.5466530768768005e-06,
"loss": 0.1159,
"step": 2175
},
{
"epoch": 2.7352572145545797,
"grad_norm": 0.40345238378734466,
"learning_rate": 5.497698292839835e-06,
"loss": 0.1117,
"step": 2180
},
{
"epoch": 2.741530740276035,
"grad_norm": 0.41110337094460886,
"learning_rate": 5.448878471008513e-06,
"loss": 0.1134,
"step": 2185
},
{
"epoch": 2.7478042659974906,
"grad_norm": 0.4006579187318595,
"learning_rate": 5.400195074818924e-06,
"loss": 0.1228,
"step": 2190
},
{
"epoch": 2.754077791718946,
"grad_norm": 0.4006920463713775,
"learning_rate": 5.351649563617638e-06,
"loss": 0.1157,
"step": 2195
},
{
"epoch": 2.7603513174404015,
"grad_norm": 0.40311348122015783,
"learning_rate": 5.3032433926179395e-06,
"loss": 0.1211,
"step": 2200
},
{
"epoch": 2.766624843161857,
"grad_norm": 0.3833185052390898,
"learning_rate": 5.25497801285622e-06,
"loss": 0.1085,
"step": 2205
},
{
"epoch": 2.7728983688833124,
"grad_norm": 0.40698791566770903,
"learning_rate": 5.206854871148466e-06,
"loss": 0.1145,
"step": 2210
},
{
"epoch": 2.779171894604768,
"grad_norm": 0.37018390800544043,
"learning_rate": 5.158875410046906e-06,
"loss": 0.1148,
"step": 2215
},
{
"epoch": 2.7854454203262233,
"grad_norm": 0.4097654897566281,
"learning_rate": 5.111041067796754e-06,
"loss": 0.1112,
"step": 2220
},
{
"epoch": 2.791718946047679,
"grad_norm": 0.38185615087945834,
"learning_rate": 5.063353278293106e-06,
"loss": 0.1129,
"step": 2225
},
{
"epoch": 2.7979924717691342,
"grad_norm": 0.38860623269143496,
"learning_rate": 5.0158134710379595e-06,
"loss": 0.1157,
"step": 2230
},
{
"epoch": 2.8042659974905897,
"grad_norm": 0.41677106161054384,
"learning_rate": 4.9684230710973394e-06,
"loss": 0.1179,
"step": 2235
},
{
"epoch": 2.810539523212045,
"grad_norm": 0.3659456249492505,
"learning_rate": 4.921183499058615e-06,
"loss": 0.1179,
"step": 2240
},
{
"epoch": 2.8168130489335006,
"grad_norm": 0.390728899484992,
"learning_rate": 4.8740961709878834e-06,
"loss": 0.1118,
"step": 2245
},
{
"epoch": 2.823086574654956,
"grad_norm": 0.41326006522454833,
"learning_rate": 4.827162498387544e-06,
"loss": 0.1123,
"step": 2250
},
{
"epoch": 2.8293601003764115,
"grad_norm": 0.37982873419881247,
"learning_rate": 4.780383888153983e-06,
"loss": 0.1099,
"step": 2255
},
{
"epoch": 2.835633626097867,
"grad_norm": 0.3826347732319729,
"learning_rate": 4.733761742535381e-06,
"loss": 0.1119,
"step": 2260
},
{
"epoch": 2.8419071518193224,
"grad_norm": 0.4070056837379538,
"learning_rate": 4.687297459089708e-06,
"loss": 0.1169,
"step": 2265
},
{
"epoch": 2.848180677540778,
"grad_norm": 0.3907501846052804,
"learning_rate": 4.640992430642801e-06,
"loss": 0.1184,
"step": 2270
},
{
"epoch": 2.8544542032622333,
"grad_norm": 0.3980879608103747,
"learning_rate": 4.594848045246638e-06,
"loss": 0.1143,
"step": 2275
},
{
"epoch": 2.8607277289836888,
"grad_norm": 0.3831724940346972,
"learning_rate": 4.548865686137718e-06,
"loss": 0.1165,
"step": 2280
},
{
"epoch": 2.867001254705144,
"grad_norm": 0.3554795717929537,
"learning_rate": 4.503046731695584e-06,
"loss": 0.1103,
"step": 2285
},
{
"epoch": 2.8732747804265997,
"grad_norm": 0.37945026941432614,
"learning_rate": 4.457392555401531e-06,
"loss": 0.1165,
"step": 2290
},
{
"epoch": 2.879548306148055,
"grad_norm": 0.4040384960083639,
"learning_rate": 4.411904525797408e-06,
"loss": 0.112,
"step": 2295
},
{
"epoch": 2.8858218318695106,
"grad_norm": 0.37374435050142246,
"learning_rate": 4.3665840064446165e-06,
"loss": 0.1152,
"step": 2300
},
{
"epoch": 2.892095357590966,
"grad_norm": 0.38395743236739355,
"learning_rate": 4.321432355883219e-06,
"loss": 0.1158,
"step": 2305
},
{
"epoch": 2.8983688833124215,
"grad_norm": 0.38107246526906685,
"learning_rate": 4.276450927591229e-06,
"loss": 0.1099,
"step": 2310
},
{
"epoch": 2.904642409033877,
"grad_norm": 0.40288534078286153,
"learning_rate": 4.231641069944019e-06,
"loss": 0.1135,
"step": 2315
},
{
"epoch": 2.9109159347553324,
"grad_norm": 0.39407625180559624,
"learning_rate": 4.187004126173928e-06,
"loss": 0.1153,
"step": 2320
},
{
"epoch": 2.917189460476788,
"grad_norm": 0.390694853717115,
"learning_rate": 4.1425414343299734e-06,
"loss": 0.1113,
"step": 2325
},
{
"epoch": 2.9234629861982433,
"grad_norm": 0.37505537186817833,
"learning_rate": 4.098254327237742e-06,
"loss": 0.1107,
"step": 2330
},
{
"epoch": 2.9297365119196987,
"grad_norm": 0.3797569810287486,
"learning_rate": 4.054144132459471e-06,
"loss": 0.1125,
"step": 2335
},
{
"epoch": 2.936010037641154,
"grad_norm": 0.39039171406836526,
"learning_rate": 4.010212172254201e-06,
"loss": 0.1139,
"step": 2340
},
{
"epoch": 2.9422835633626097,
"grad_norm": 0.3735230020383592,
"learning_rate": 3.966459763538179e-06,
"loss": 0.1162,
"step": 2345
},
{
"epoch": 2.948557089084065,
"grad_norm": 0.3799822252359316,
"learning_rate": 3.92288821784536e-06,
"loss": 0.1157,
"step": 2350
},
{
"epoch": 2.9548306148055206,
"grad_norm": 0.37435313221883065,
"learning_rate": 3.879498841288105e-06,
"loss": 0.1109,
"step": 2355
},
{
"epoch": 2.961104140526976,
"grad_norm": 0.37834252836846144,
"learning_rate": 3.836292934518029e-06,
"loss": 0.1124,
"step": 2360
},
{
"epoch": 2.9673776662484315,
"grad_norm": 0.3755100276116184,
"learning_rate": 3.793271792686993e-06,
"loss": 0.1122,
"step": 2365
},
{
"epoch": 2.973651191969887,
"grad_norm": 0.38213190237560674,
"learning_rate": 3.750436705408311e-06,
"loss": 0.1131,
"step": 2370
},
{
"epoch": 2.9799247176913424,
"grad_norm": 0.39502326617269706,
"learning_rate": 3.7077889567180625e-06,
"loss": 0.113,
"step": 2375
},
{
"epoch": 2.9861982434127983,
"grad_norm": 0.3907005868892978,
"learning_rate": 3.6653298250366265e-06,
"loss": 0.1121,
"step": 2380
},
{
"epoch": 2.9924717691342533,
"grad_norm": 0.3757187294889032,
"learning_rate": 3.6230605831303354e-06,
"loss": 0.1138,
"step": 2385
},
{
"epoch": 2.998745294855709,
"grad_norm": 0.36519480053180337,
"learning_rate": 3.5809824980733445e-06,
"loss": 0.1141,
"step": 2390
},
{
"epoch": 3.0,
"eval_loss": 0.43099531531333923,
"eval_runtime": 2.3647,
"eval_samples_per_second": 14.801,
"eval_steps_per_second": 0.846,
"step": 2391
},
{
"epoch": 3.005018820577164,
"grad_norm": 0.2442197751144262,
"learning_rate": 3.5390968312096396e-06,
"loss": 0.0533,
"step": 2395
},
{
"epoch": 3.0112923462986196,
"grad_norm": 0.21528572471833773,
"learning_rate": 3.497404838115219e-06,
"loss": 0.0415,
"step": 2400
},
{
"epoch": 3.017565872020075,
"grad_norm": 0.36503732282533347,
"learning_rate": 3.455907768560477e-06,
"loss": 0.0419,
"step": 2405
},
{
"epoch": 3.0238393977415305,
"grad_norm": 0.34377984007445206,
"learning_rate": 3.414606866472707e-06,
"loss": 0.0402,
"step": 2410
},
{
"epoch": 3.030112923462986,
"grad_norm": 0.3334896061335003,
"learning_rate": 3.373503369898862e-06,
"loss": 0.04,
"step": 2415
},
{
"epoch": 3.0363864491844414,
"grad_norm": 0.2723536621745364,
"learning_rate": 3.3325985109683877e-06,
"loss": 0.0396,
"step": 2420
},
{
"epoch": 3.042659974905897,
"grad_norm": 0.2691974596202031,
"learning_rate": 3.291893515856334e-06,
"loss": 0.0389,
"step": 2425
},
{
"epoch": 3.0489335006273524,
"grad_norm": 0.28187769151055436,
"learning_rate": 3.2513896047465654e-06,
"loss": 0.0379,
"step": 2430
},
{
"epoch": 3.055207026348808,
"grad_norm": 0.26156534288635025,
"learning_rate": 3.211087991795201e-06,
"loss": 0.0385,
"step": 2435
},
{
"epoch": 3.0614805520702637,
"grad_norm": 0.2632608582642457,
"learning_rate": 3.1709898850942234e-06,
"loss": 0.0381,
"step": 2440
},
{
"epoch": 3.067754077791719,
"grad_norm": 0.28572880260972616,
"learning_rate": 3.1310964866352524e-06,
"loss": 0.0389,
"step": 2445
},
{
"epoch": 3.0740276035131746,
"grad_norm": 0.28301745878682383,
"learning_rate": 3.0914089922735215e-06,
"loss": 0.0375,
"step": 2450
},
{
"epoch": 3.08030112923463,
"grad_norm": 0.29208432776276283,
"learning_rate": 3.051928591692017e-06,
"loss": 0.0381,
"step": 2455
},
{
"epoch": 3.0865746549560855,
"grad_norm": 0.2847903910965424,
"learning_rate": 3.012656468365842e-06,
"loss": 0.0368,
"step": 2460
},
{
"epoch": 3.092848180677541,
"grad_norm": 0.2966192251884188,
"learning_rate": 2.9735937995267108e-06,
"loss": 0.0371,
"step": 2465
},
{
"epoch": 3.0991217063989964,
"grad_norm": 0.2796882316242176,
"learning_rate": 2.9347417561276812e-06,
"loss": 0.0376,
"step": 2470
},
{
"epoch": 3.105395232120452,
"grad_norm": 0.3038556064545983,
"learning_rate": 2.8961015028080506e-06,
"loss": 0.0385,
"step": 2475
},
{
"epoch": 3.1116687578419073,
"grad_norm": 0.2800625738008763,
"learning_rate": 2.8576741978584265e-06,
"loss": 0.0386,
"step": 2480
},
{
"epoch": 3.117942283563363,
"grad_norm": 0.29844382477038445,
"learning_rate": 2.819460993186032e-06,
"loss": 0.0388,
"step": 2485
},
{
"epoch": 3.1242158092848182,
"grad_norm": 0.3177106222509896,
"learning_rate": 2.781463034280153e-06,
"loss": 0.0379,
"step": 2490
},
{
"epoch": 3.1304893350062737,
"grad_norm": 0.2995753268233054,
"learning_rate": 2.7436814601778174e-06,
"loss": 0.0387,
"step": 2495
},
{
"epoch": 3.136762860727729,
"grad_norm": 0.30624087534086814,
"learning_rate": 2.7061174034296434e-06,
"loss": 0.0398,
"step": 2500
},
{
"epoch": 3.1430363864491846,
"grad_norm": 0.3087285778258557,
"learning_rate": 2.668771990065884e-06,
"loss": 0.0394,
"step": 2505
},
{
"epoch": 3.14930991217064,
"grad_norm": 0.2941075887579034,
"learning_rate": 2.631646339562689e-06,
"loss": 0.0386,
"step": 2510
},
{
"epoch": 3.1555834378920955,
"grad_norm": 0.3118597830883225,
"learning_rate": 2.594741564808527e-06,
"loss": 0.0397,
"step": 2515
},
{
"epoch": 3.161856963613551,
"grad_norm": 0.3208254308061256,
"learning_rate": 2.558058772070846e-06,
"loss": 0.0385,
"step": 2520
},
{
"epoch": 3.1681304893350064,
"grad_norm": 0.26274648640285136,
"learning_rate": 2.521599060962895e-06,
"loss": 0.039,
"step": 2525
},
{
"epoch": 3.174404015056462,
"grad_norm": 0.3244783368883984,
"learning_rate": 2.4853635244107743e-06,
"loss": 0.0373,
"step": 2530
},
{
"epoch": 3.1806775407779173,
"grad_norm": 0.3321834864243195,
"learning_rate": 2.449353248620657e-06,
"loss": 0.0369,
"step": 2535
},
{
"epoch": 3.1869510664993728,
"grad_norm": 0.298380472148583,
"learning_rate": 2.41356931304625e-06,
"loss": 0.0383,
"step": 2540
},
{
"epoch": 3.193224592220828,
"grad_norm": 0.2887212580399006,
"learning_rate": 2.37801279035642e-06,
"loss": 0.0383,
"step": 2545
},
{
"epoch": 3.1994981179422837,
"grad_norm": 0.2685957992866305,
"learning_rate": 2.342684746403037e-06,
"loss": 0.0382,
"step": 2550
},
{
"epoch": 3.205771643663739,
"grad_norm": 0.3137213818837783,
"learning_rate": 2.307586240189049e-06,
"loss": 0.04,
"step": 2555
},
{
"epoch": 3.2120451693851946,
"grad_norm": 0.29781867047646216,
"learning_rate": 2.272718323836701e-06,
"loss": 0.0379,
"step": 2560
},
{
"epoch": 3.21831869510665,
"grad_norm": 0.28001089505480686,
"learning_rate": 2.238082042556029e-06,
"loss": 0.0382,
"step": 2565
},
{
"epoch": 3.2245922208281055,
"grad_norm": 0.31914940913340023,
"learning_rate": 2.2036784346134976e-06,
"loss": 0.0376,
"step": 2570
},
{
"epoch": 3.230865746549561,
"grad_norm": 0.2728867395710102,
"learning_rate": 2.169508531300908e-06,
"loss": 0.0382,
"step": 2575
},
{
"epoch": 3.2371392722710164,
"grad_norm": 0.30042453420426346,
"learning_rate": 2.1355733569044633e-06,
"loss": 0.0389,
"step": 2580
},
{
"epoch": 3.243412797992472,
"grad_norm": 0.2752139872422629,
"learning_rate": 2.101873928674064e-06,
"loss": 0.0374,
"step": 2585
},
{
"epoch": 3.2496863237139273,
"grad_norm": 0.3662305601638148,
"learning_rate": 2.0684112567928314e-06,
"loss": 0.0369,
"step": 2590
},
{
"epoch": 3.2559598494353827,
"grad_norm": 0.2853508636394266,
"learning_rate": 2.035186344346801e-06,
"loss": 0.0379,
"step": 2595
},
{
"epoch": 3.262233375156838,
"grad_norm": 0.2909774250341541,
"learning_rate": 2.0022001872948814e-06,
"loss": 0.0374,
"step": 2600
},
{
"epoch": 3.2685069008782937,
"grad_norm": 0.303083557347797,
"learning_rate": 1.9694537744389754e-06,
"loss": 0.0372,
"step": 2605
},
{
"epoch": 3.274780426599749,
"grad_norm": 0.2837072081387949,
"learning_rate": 1.9369480873943524e-06,
"loss": 0.037,
"step": 2610
},
{
"epoch": 3.2810539523212046,
"grad_norm": 0.293463671873792,
"learning_rate": 1.9046841005602268e-06,
"loss": 0.0368,
"step": 2615
},
{
"epoch": 3.28732747804266,
"grad_norm": 0.29070250069814846,
"learning_rate": 1.8726627810905284e-06,
"loss": 0.037,
"step": 2620
},
{
"epoch": 3.2936010037641155,
"grad_norm": 0.28990000456159315,
"learning_rate": 1.8408850888649398e-06,
"loss": 0.0368,
"step": 2625
},
{
"epoch": 3.299874529485571,
"grad_norm": 0.2768135817440434,
"learning_rate": 1.8093519764600931e-06,
"loss": 0.0381,
"step": 2630
},
{
"epoch": 3.3061480552070264,
"grad_norm": 0.3106851238797327,
"learning_rate": 1.778064389121048e-06,
"loss": 0.0373,
"step": 2635
},
{
"epoch": 3.312421580928482,
"grad_norm": 0.26724960926250796,
"learning_rate": 1.7470232647329222e-06,
"loss": 0.0382,
"step": 2640
},
{
"epoch": 3.3186951066499373,
"grad_norm": 0.3030133716572175,
"learning_rate": 1.7162295337928036e-06,
"loss": 0.0384,
"step": 2645
},
{
"epoch": 3.3249686323713927,
"grad_norm": 0.31346010178385275,
"learning_rate": 1.685684119381844e-06,
"loss": 0.038,
"step": 2650
},
{
"epoch": 3.331242158092848,
"grad_norm": 0.28444292176954256,
"learning_rate": 1.655387937137589e-06,
"loss": 0.0363,
"step": 2655
},
{
"epoch": 3.3375156838143036,
"grad_norm": 0.29308964877533195,
"learning_rate": 1.6253418952265398e-06,
"loss": 0.0372,
"step": 2660
},
{
"epoch": 3.343789209535759,
"grad_norm": 0.29189420850538267,
"learning_rate": 1.5955468943169217e-06,
"loss": 0.0377,
"step": 2665
},
{
"epoch": 3.3500627352572145,
"grad_norm": 0.3102531079304282,
"learning_rate": 1.5660038275516898e-06,
"loss": 0.0355,
"step": 2670
},
{
"epoch": 3.35633626097867,
"grad_norm": 0.27790750204144,
"learning_rate": 1.536713580521746e-06,
"loss": 0.0372,
"step": 2675
},
{
"epoch": 3.3626097867001254,
"grad_norm": 0.3226354962908228,
"learning_rate": 1.5076770312394096e-06,
"loss": 0.0369,
"step": 2680
},
{
"epoch": 3.368883312421581,
"grad_norm": 0.26898149132297955,
"learning_rate": 1.4788950501120781e-06,
"loss": 0.0382,
"step": 2685
},
{
"epoch": 3.3751568381430364,
"grad_norm": 0.3188502485680553,
"learning_rate": 1.450368499916155e-06,
"loss": 0.0381,
"step": 2690
},
{
"epoch": 3.381430363864492,
"grad_norm": 0.3007960797574723,
"learning_rate": 1.4220982357711743e-06,
"loss": 0.0372,
"step": 2695
},
{
"epoch": 3.3877038895859473,
"grad_norm": 0.29650117453341984,
"learning_rate": 1.3940851051141646e-06,
"loss": 0.0377,
"step": 2700
},
{
"epoch": 3.3939774153074027,
"grad_norm": 0.29709762351304697,
"learning_rate": 1.366329947674263e-06,
"loss": 0.0372,
"step": 2705
},
{
"epoch": 3.400250941028858,
"grad_norm": 0.2951278070367851,
"learning_rate": 1.3388335954475207e-06,
"loss": 0.0375,
"step": 2710
},
{
"epoch": 3.4065244667503136,
"grad_norm": 0.27968586803836637,
"learning_rate": 1.3115968726719819e-06,
"loss": 0.0359,
"step": 2715
},
{
"epoch": 3.412797992471769,
"grad_norm": 0.30665692772003755,
"learning_rate": 1.284620595802969e-06,
"loss": 0.0376,
"step": 2720
},
{
"epoch": 3.4190715181932245,
"grad_norm": 0.3166229935564368,
"learning_rate": 1.2579055734886004e-06,
"loss": 0.0361,
"step": 2725
},
{
"epoch": 3.42534504391468,
"grad_norm": 0.28812682524392,
"learning_rate": 1.2314526065455678e-06,
"loss": 0.0371,
"step": 2730
},
{
"epoch": 3.4316185696361354,
"grad_norm": 0.2962918652302945,
"learning_rate": 1.2052624879351105e-06,
"loss": 0.0375,
"step": 2735
},
{
"epoch": 3.437892095357591,
"grad_norm": 0.2959207573782531,
"learning_rate": 1.179336002739263e-06,
"loss": 0.0368,
"step": 2740
},
{
"epoch": 3.4441656210790463,
"grad_norm": 0.3222288463448185,
"learning_rate": 1.1536739281373122e-06,
"loss": 0.0366,
"step": 2745
},
{
"epoch": 3.450439146800502,
"grad_norm": 0.2924594477361343,
"learning_rate": 1.1282770333825022e-06,
"loss": 0.0375,
"step": 2750
},
{
"epoch": 3.4567126725219572,
"grad_norm": 0.32299277919132047,
"learning_rate": 1.1031460797789718e-06,
"loss": 0.0374,
"step": 2755
},
{
"epoch": 3.4629861982434127,
"grad_norm": 0.3109436060749281,
"learning_rate": 1.0782818206589375e-06,
"loss": 0.0367,
"step": 2760
},
{
"epoch": 3.469259723964868,
"grad_norm": 0.27272406454567366,
"learning_rate": 1.053685001360112e-06,
"loss": 0.0356,
"step": 2765
},
{
"epoch": 3.4755332496863236,
"grad_norm": 0.30006540518061847,
"learning_rate": 1.0293563592033595e-06,
"loss": 0.0366,
"step": 2770
},
{
"epoch": 3.481806775407779,
"grad_norm": 0.28002730472344367,
"learning_rate": 1.0052966234705953e-06,
"loss": 0.0383,
"step": 2775
},
{
"epoch": 3.4880803011292345,
"grad_norm": 0.32173748821041304,
"learning_rate": 9.815065153829195e-07,
"loss": 0.0366,
"step": 2780
},
{
"epoch": 3.49435382685069,
"grad_norm": 0.3042093630820431,
"learning_rate": 9.579867480790061e-07,
"loss": 0.0369,
"step": 2785
},
{
"epoch": 3.5006273525721454,
"grad_norm": 0.2975724581779801,
"learning_rate": 9.347380265937167e-07,
"loss": 0.0361,
"step": 2790
},
{
"epoch": 3.506900878293601,
"grad_norm": 0.2893317769414031,
"learning_rate": 9.117610478369743e-07,
"loss": 0.0383,
"step": 2795
},
{
"epoch": 3.5131744040150563,
"grad_norm": 0.30137847743662044,
"learning_rate": 8.890565005728691e-07,
"loss": 0.0384,
"step": 2800
},
{
"epoch": 3.5194479297365118,
"grad_norm": 0.2713586307285477,
"learning_rate": 8.666250653990071e-07,
"loss": 0.0364,
"step": 2805
},
{
"epoch": 3.5257214554579672,
"grad_norm": 0.2749487801394873,
"learning_rate": 8.44467414726119e-07,
"loss": 0.0353,
"step": 2810
},
{
"epoch": 3.5319949811794227,
"grad_norm": 0.3243183464549784,
"learning_rate": 8.225842127578909e-07,
"loss": 0.0369,
"step": 2815
},
{
"epoch": 3.538268506900878,
"grad_norm": 0.2890309082875547,
"learning_rate": 8.009761154710671e-07,
"loss": 0.0369,
"step": 2820
},
{
"epoch": 3.544542032622334,
"grad_norm": 0.2879952226873715,
"learning_rate": 7.796437705957782e-07,
"loss": 0.0368,
"step": 2825
},
{
"epoch": 3.550815558343789,
"grad_norm": 0.2696320382813176,
"learning_rate": 7.585878175961237e-07,
"loss": 0.0371,
"step": 2830
},
{
"epoch": 3.557089084065245,
"grad_norm": 0.2936735805318314,
"learning_rate": 7.378088876510092e-07,
"loss": 0.0376,
"step": 2835
},
{
"epoch": 3.5633626097867,
"grad_norm": 0.25416101821322773,
"learning_rate": 7.1730760363522e-07,
"loss": 0.0352,
"step": 2840
},
{
"epoch": 3.569636135508156,
"grad_norm": 0.30300674321039756,
"learning_rate": 6.970845801007564e-07,
"loss": 0.0353,
"step": 2845
},
{
"epoch": 3.575909661229611,
"grad_norm": 0.2732998588656073,
"learning_rate": 6.771404232584011e-07,
"loss": 0.0362,
"step": 2850
},
{
"epoch": 3.5821831869510667,
"grad_norm": 0.3795076600221709,
"learning_rate": 6.574757309595636e-07,
"loss": 0.0361,
"step": 2855
},
{
"epoch": 3.5884567126725218,
"grad_norm": 0.28624108454045416,
"learning_rate": 6.380910926783402e-07,
"loss": 0.0363,
"step": 2860
},
{
"epoch": 3.5947302383939777,
"grad_norm": 0.2674910858708607,
"learning_rate": 6.189870894938587e-07,
"loss": 0.0353,
"step": 2865
},
{
"epoch": 3.6010037641154327,
"grad_norm": 0.2789232394059912,
"learning_rate": 6.001642940728503e-07,
"loss": 0.0354,
"step": 2870
},
{
"epoch": 3.6072772898368886,
"grad_norm": 0.30882594178964384,
"learning_rate": 5.816232706524838e-07,
"loss": 0.0366,
"step": 2875
},
{
"epoch": 3.6135508155583436,
"grad_norm": 0.2806952241750321,
"learning_rate": 5.63364575023465e-07,
"loss": 0.0361,
"step": 2880
},
{
"epoch": 3.6198243412797995,
"grad_norm": 0.30164828030350377,
"learning_rate": 5.453887545133563e-07,
"loss": 0.0378,
"step": 2885
},
{
"epoch": 3.6260978670012545,
"grad_norm": 0.2679619622153946,
"learning_rate": 5.276963479701857e-07,
"loss": 0.0358,
"step": 2890
},
{
"epoch": 3.6323713927227104,
"grad_norm": 0.3231146432041487,
"learning_rate": 5.102878857462811e-07,
"loss": 0.0389,
"step": 2895
},
{
"epoch": 3.6386449184441654,
"grad_norm": 0.2779119069062509,
"learning_rate": 4.931638896823876e-07,
"loss": 0.0384,
"step": 2900
},
{
"epoch": 3.6449184441656213,
"grad_norm": 0.2722147852694776,
"learning_rate": 4.763248730920089e-07,
"loss": 0.0358,
"step": 2905
},
{
"epoch": 3.6511919698870763,
"grad_norm": 0.293829432405976,
"learning_rate": 4.5977134074603246e-07,
"loss": 0.037,
"step": 2910
},
{
"epoch": 3.657465495608532,
"grad_norm": 0.3065193645356902,
"learning_rate": 4.4350378885759105e-07,
"loss": 0.0371,
"step": 2915
},
{
"epoch": 3.663739021329987,
"grad_norm": 0.28257044296271877,
"learning_rate": 4.275227050671904e-07,
"loss": 0.0364,
"step": 2920
},
{
"epoch": 3.670012547051443,
"grad_norm": 0.2782812011499114,
"learning_rate": 4.1182856842809204e-07,
"loss": 0.0348,
"step": 2925
},
{
"epoch": 3.676286072772898,
"grad_norm": 0.28872652510766195,
"learning_rate": 3.964218493919525e-07,
"loss": 0.0361,
"step": 2930
},
{
"epoch": 3.682559598494354,
"grad_norm": 0.3053021191496778,
"learning_rate": 3.813030097947212e-07,
"loss": 0.0357,
"step": 2935
},
{
"epoch": 3.6888331242158094,
"grad_norm": 0.3648251833459472,
"learning_rate": 3.6647250284279735e-07,
"loss": 0.0404,
"step": 2940
},
{
"epoch": 3.695106649937265,
"grad_norm": 0.3319781395595929,
"learning_rate": 3.5193077309943923e-07,
"loss": 0.0403,
"step": 2945
},
{
"epoch": 3.7013801756587204,
"grad_norm": 0.3085438905662011,
"learning_rate": 3.376782564714476e-07,
"loss": 0.0369,
"step": 2950
},
{
"epoch": 3.707653701380176,
"grad_norm": 0.2849277889599658,
"learning_rate": 3.237153801960868e-07,
"loss": 0.0362,
"step": 2955
},
{
"epoch": 3.7139272271016313,
"grad_norm": 0.31923000028121545,
"learning_rate": 3.100425628282899e-07,
"loss": 0.0369,
"step": 2960
},
{
"epoch": 3.7202007528230867,
"grad_norm": 0.2929761574906958,
"learning_rate": 2.9666021422810274e-07,
"loss": 0.0369,
"step": 2965
},
{
"epoch": 3.726474278544542,
"grad_norm": 0.2992636928106286,
"learning_rate": 2.8356873554840514e-07,
"loss": 0.0364,
"step": 2970
},
{
"epoch": 3.7327478042659976,
"grad_norm": 0.301134189273727,
"learning_rate": 2.7076851922287704e-07,
"loss": 0.0354,
"step": 2975
},
{
"epoch": 3.739021329987453,
"grad_norm": 0.2943853551390973,
"learning_rate": 2.5825994895424255e-07,
"loss": 0.0346,
"step": 2980
},
{
"epoch": 3.7452948557089085,
"grad_norm": 0.2920091794037564,
"learning_rate": 2.460433997027634e-07,
"loss": 0.0377,
"step": 2985
},
{
"epoch": 3.751568381430364,
"grad_norm": 0.27495401738635517,
"learning_rate": 2.3411923767500455e-07,
"loss": 0.0378,
"step": 2990
},
{
"epoch": 3.7578419071518194,
"grad_norm": 0.39264097272429527,
"learning_rate": 2.224878203128511e-07,
"loss": 0.037,
"step": 2995
},
{
"epoch": 3.764115432873275,
"grad_norm": 0.30250413845451674,
"learning_rate": 2.1114949628279201e-07,
"loss": 0.0366,
"step": 3000
},
{
"epoch": 3.7703889585947303,
"grad_norm": 0.35611796959076564,
"learning_rate": 2.001046054654776e-07,
"loss": 0.0371,
"step": 3005
},
{
"epoch": 3.776662484316186,
"grad_norm": 0.2561175523575295,
"learning_rate": 1.893534789455209e-07,
"loss": 0.0351,
"step": 3010
},
{
"epoch": 3.7829360100376412,
"grad_norm": 0.28295745535344086,
"learning_rate": 1.7889643900158016e-07,
"loss": 0.0361,
"step": 3015
},
{
"epoch": 3.7892095357590967,
"grad_norm": 0.28512419406398154,
"learning_rate": 1.6873379909669307e-07,
"loss": 0.0376,
"step": 3020
},
{
"epoch": 3.795483061480552,
"grad_norm": 0.30485457369447977,
"learning_rate": 1.5886586386888449e-07,
"loss": 0.0377,
"step": 3025
},
{
"epoch": 3.8017565872020076,
"grad_norm": 0.2901818797318041,
"learning_rate": 1.4929292912203354e-07,
"loss": 0.0357,
"step": 3030
},
{
"epoch": 3.808030112923463,
"grad_norm": 0.2667781630207582,
"learning_rate": 1.4001528181700196e-07,
"loss": 0.0382,
"step": 3035
},
{
"epoch": 3.8143036386449185,
"grad_norm": 0.3034166947926825,
"learning_rate": 1.3103320006303766e-07,
"loss": 0.0382,
"step": 3040
},
{
"epoch": 3.820577164366374,
"grad_norm": 0.3119770203364585,
"learning_rate": 1.2234695310944012e-07,
"loss": 0.0367,
"step": 3045
},
{
"epoch": 3.8268506900878294,
"grad_norm": 0.31429783494460234,
"learning_rate": 1.1395680133747811e-07,
"loss": 0.036,
"step": 3050
},
{
"epoch": 3.833124215809285,
"grad_norm": 0.2655885847696681,
"learning_rate": 1.0586299625259699e-07,
"loss": 0.0372,
"step": 3055
},
{
"epoch": 3.8393977415307403,
"grad_norm": 0.37692736629023765,
"learning_rate": 9.806578047687254e-08,
"loss": 0.0401,
"step": 3060
},
{
"epoch": 3.8456712672521958,
"grad_norm": 0.28037362911434216,
"learning_rate": 9.056538774174117e-08,
"loss": 0.0383,
"step": 3065
},
{
"epoch": 3.851944792973651,
"grad_norm": 0.2813649801748803,
"learning_rate": 8.336204288098671e-08,
"loss": 0.0409,
"step": 3070
},
{
"epoch": 3.8582183186951067,
"grad_norm": 0.28158812132522376,
"learning_rate": 7.64559618240146e-08,
"loss": 0.0371,
"step": 3075
},
{
"epoch": 3.864491844416562,
"grad_norm": 0.29334208788094884,
"learning_rate": 6.984735158936384e-08,
"loss": 0.0377,
"step": 3080
},
{
"epoch": 3.8707653701380176,
"grad_norm": 0.2776501474412646,
"learning_rate": 6.353641027850965e-08,
"loss": 0.0368,
"step": 3085
},
{
"epoch": 3.877038895859473,
"grad_norm": 0.27164861879214747,
"learning_rate": 5.7523327069926024e-08,
"loss": 0.0371,
"step": 3090
},
{
"epoch": 3.8833124215809285,
"grad_norm": 0.26782362958737654,
"learning_rate": 5.1808282213410276e-08,
"loss": 0.0366,
"step": 3095
},
{
"epoch": 3.889585947302384,
"grad_norm": 0.31086719768707505,
"learning_rate": 4.63914470246829e-08,
"loss": 0.0361,
"step": 3100
},
{
"epoch": 3.8958594730238394,
"grad_norm": 0.30662847914316993,
"learning_rate": 4.1272983880249476e-08,
"loss": 0.0374,
"step": 3105
},
{
"epoch": 3.902132998745295,
"grad_norm": 0.2915681277032509,
"learning_rate": 3.645304621253787e-08,
"loss": 0.0365,
"step": 3110
},
{
"epoch": 3.9084065244667503,
"grad_norm": 0.31776081537688317,
"learning_rate": 3.193177850529416e-08,
"loss": 0.0352,
"step": 3115
},
{
"epoch": 3.9146800501882058,
"grad_norm": 0.26314408385807814,
"learning_rate": 2.7709316289253885e-08,
"loss": 0.0362,
"step": 3120
},
{
"epoch": 3.920953575909661,
"grad_norm": 0.30353550353186937,
"learning_rate": 2.378578613807969e-08,
"loss": 0.0367,
"step": 3125
},
{
"epoch": 3.9272271016311167,
"grad_norm": 0.3008469541365023,
"learning_rate": 2.0161305664563312e-08,
"loss": 0.0377,
"step": 3130
},
{
"epoch": 3.933500627352572,
"grad_norm": 0.29989474797608723,
"learning_rate": 1.6835983517108357e-08,
"loss": 0.0364,
"step": 3135
},
{
"epoch": 3.9397741530740276,
"grad_norm": 0.3007906674460069,
"learning_rate": 1.3809919376461811e-08,
"loss": 0.0367,
"step": 3140
},
{
"epoch": 3.946047678795483,
"grad_norm": 0.2822359703350314,
"learning_rate": 1.1083203952737543e-08,
"loss": 0.0371,
"step": 3145
},
{
"epoch": 3.9523212045169385,
"grad_norm": 0.32240083725231283,
"learning_rate": 8.655918982689582e-09,
"loss": 0.0367,
"step": 3150
},
{
"epoch": 3.958594730238394,
"grad_norm": 0.27144908135189433,
"learning_rate": 6.528137227262976e-09,
"loss": 0.0368,
"step": 3155
},
{
"epoch": 3.9648682559598494,
"grad_norm": 0.29276670822521234,
"learning_rate": 4.6999224694166405e-09,
"loss": 0.0363,
"step": 3160
},
{
"epoch": 3.971141781681305,
"grad_norm": 0.2920658294802816,
"learning_rate": 3.1713295122071107e-09,
"loss": 0.0352,
"step": 3165
},
{
"epoch": 3.9774153074027603,
"grad_norm": 0.29999904692777374,
"learning_rate": 1.9424041771465286e-09,
"loss": 0.0354,
"step": 3170
},
{
"epoch": 3.9836888331242157,
"grad_norm": 0.28940456234051576,
"learning_rate": 1.013183302832621e-09,
"loss": 0.0379,
"step": 3175
},
{
"epoch": 3.989962358845671,
"grad_norm": 0.28743205163263064,
"learning_rate": 3.8369474383848083e-10,
"loss": 0.0349,
"step": 3180
},
{
"epoch": 3.9962358845671266,
"grad_norm": 0.3004497086648703,
"learning_rate": 5.395736988322853e-11,
"loss": 0.0363,
"step": 3185
},
{
"epoch": 4.0,
"eval_loss": 0.47916728258132935,
"eval_runtime": 2.4435,
"eval_samples_per_second": 14.324,
"eval_steps_per_second": 0.818,
"step": 3188
},
{
"epoch": 4.0,
"step": 3188,
"total_flos": 2680278636036096.0,
"train_loss": 0.20304497943459596,
"train_runtime": 21214.9362,
"train_samples_per_second": 4.806,
"train_steps_per_second": 0.15
}
],
"logging_steps": 5,
"max_steps": 3188,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 319,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2680278636036096.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}