spag_spag1_ckpt / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1405,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007117437722419929,
"grad_norm": 7.60900351618248,
"learning_rate": 0.0,
"loss": 0.3226,
"step": 1
},
{
"epoch": 0.0014234875444839859,
"grad_norm": 5.560136785951512,
"learning_rate": 3.685776662974123e-07,
"loss": 0.2375,
"step": 2
},
{
"epoch": 0.002135231316725979,
"grad_norm": 11.276059972500056,
"learning_rate": 5.841817796847145e-07,
"loss": 0.3882,
"step": 3
},
{
"epoch": 0.0028469750889679717,
"grad_norm": 13.05602282686669,
"learning_rate": 7.371553325948246e-07,
"loss": 0.4181,
"step": 4
},
{
"epoch": 0.0035587188612099642,
"grad_norm": 16.506693089412153,
"learning_rate": 8.558108385239805e-07,
"loss": 0.643,
"step": 5
},
{
"epoch": 0.004270462633451958,
"grad_norm": 9.264657032728305,
"learning_rate": 9.527594459821267e-07,
"loss": 0.3291,
"step": 6
},
{
"epoch": 0.00498220640569395,
"grad_norm": 7.619801484573377,
"learning_rate": 1.0347283256405455e-06,
"loss": 0.3482,
"step": 7
},
{
"epoch": 0.0056939501779359435,
"grad_norm": 13.58337314966525,
"learning_rate": 1.1057329988922369e-06,
"loss": 0.5296,
"step": 8
},
{
"epoch": 0.006405693950177936,
"grad_norm": 12.954991407490938,
"learning_rate": 1.168363559369429e-06,
"loss": 0.4107,
"step": 9
},
{
"epoch": 0.0071174377224199285,
"grad_norm": 7.044442105752508,
"learning_rate": 1.2243885048213931e-06,
"loss": 0.251,
"step": 10
},
{
"epoch": 0.007829181494661922,
"grad_norm": 4.909048072229877,
"learning_rate": 1.2750692327128147e-06,
"loss": 0.1637,
"step": 11
},
{
"epoch": 0.008540925266903915,
"grad_norm": 10.090920444391536,
"learning_rate": 1.3213371122795392e-06,
"loss": 0.2977,
"step": 12
},
{
"epoch": 0.009252669039145907,
"grad_norm": 14.066506590693214,
"learning_rate": 1.363899435586698e-06,
"loss": 0.4403,
"step": 13
},
{
"epoch": 0.0099644128113879,
"grad_norm": 8.253082162037396,
"learning_rate": 1.4033059919379577e-06,
"loss": 0.3865,
"step": 14
},
{
"epoch": 0.010676156583629894,
"grad_norm": 6.319219959263539,
"learning_rate": 1.439992618208695e-06,
"loss": 0.1669,
"step": 15
},
{
"epoch": 0.011387900355871887,
"grad_norm": 14.359855834747316,
"learning_rate": 1.4743106651896492e-06,
"loss": 0.4247,
"step": 16
},
{
"epoch": 0.012099644128113879,
"grad_norm": 18.480766402036487,
"learning_rate": 1.5065475151054406e-06,
"loss": 0.3372,
"step": 17
},
{
"epoch": 0.012811387900355872,
"grad_norm": 9.522867087101119,
"learning_rate": 1.536941225666841e-06,
"loss": 0.3334,
"step": 18
},
{
"epoch": 0.013523131672597865,
"grad_norm": 12.335019039391405,
"learning_rate": 1.5656912095056063e-06,
"loss": 0.2071,
"step": 19
},
{
"epoch": 0.014234875444839857,
"grad_norm": 8.524794375269412,
"learning_rate": 1.592966171118805e-06,
"loss": -0.0933,
"step": 20
},
{
"epoch": 0.01494661921708185,
"grad_norm": 10.132122030975838,
"learning_rate": 1.61891010532526e-06,
"loss": 0.2318,
"step": 21
},
{
"epoch": 0.015658362989323844,
"grad_norm": 12.212529386799341,
"learning_rate": 1.6436468990102273e-06,
"loss": 0.3701,
"step": 22
},
{
"epoch": 0.016370106761565837,
"grad_norm": 5.9529833839373,
"learning_rate": 1.6672839091152516e-06,
"loss": 0.0561,
"step": 23
},
{
"epoch": 0.01708185053380783,
"grad_norm": 11.755479549489955,
"learning_rate": 1.6899147785769513e-06,
"loss": 0.274,
"step": 24
},
{
"epoch": 0.017793594306049824,
"grad_norm": 7.7205467449537295,
"learning_rate": 1.711621677047961e-06,
"loss": 0.231,
"step": 25
},
{
"epoch": 0.018505338078291814,
"grad_norm": 21.633394409156168,
"learning_rate": 1.7324771018841105e-06,
"loss": 0.5554,
"step": 26
},
{
"epoch": 0.019217081850533807,
"grad_norm": 6.9139962920558835,
"learning_rate": 1.7525453390541434e-06,
"loss": 0.1606,
"step": 27
},
{
"epoch": 0.0199288256227758,
"grad_norm": 10.241558041059887,
"learning_rate": 1.7718836582353703e-06,
"loss": 0.1679,
"step": 28
},
{
"epoch": 0.020640569395017794,
"grad_norm": 15.538854813374531,
"learning_rate": 1.7905432981013013e-06,
"loss": 0.2847,
"step": 29
},
{
"epoch": 0.021352313167259787,
"grad_norm": 6.217944529141481,
"learning_rate": 1.8085702845061074e-06,
"loss": 0.1358,
"step": 30
},
{
"epoch": 0.02206405693950178,
"grad_norm": 5.611328125,
"learning_rate": 1.826006114461645e-06,
"loss": 0.0723,
"step": 31
},
{
"epoch": 0.022775800711743774,
"grad_norm": 7.467791507779399,
"learning_rate": 1.8428883314870616e-06,
"loss": 0.0246,
"step": 32
},
{
"epoch": 0.023487544483985764,
"grad_norm": 9.448473077362943,
"learning_rate": 1.8592510123975292e-06,
"loss": 0.1425,
"step": 33
},
{
"epoch": 0.024199288256227757,
"grad_norm": 5.296569815426998,
"learning_rate": 1.8751251814028527e-06,
"loss": 0.2125,
"step": 34
},
{
"epoch": 0.02491103202846975,
"grad_norm": 16.286462809634553,
"learning_rate": 1.8905391641645261e-06,
"loss": 0.134,
"step": 35
},
{
"epoch": 0.025622775800711744,
"grad_norm": 6.165472103955093,
"learning_rate": 1.9055188919642534e-06,
"loss": 0.1923,
"step": 36
},
{
"epoch": 0.026334519572953737,
"grad_norm": 11.80539517960664,
"learning_rate": 1.9200881641887184e-06,
"loss": 0.3054,
"step": 37
},
{
"epoch": 0.02704626334519573,
"grad_norm": 8.523822835154286,
"learning_rate": 1.9342688758030187e-06,
"loss": 0.1712,
"step": 38
},
{
"epoch": 0.027758007117437724,
"grad_norm": 8.074927399961465,
"learning_rate": 1.9480812152714124e-06,
"loss": 0.1644,
"step": 39
},
{
"epoch": 0.028469750889679714,
"grad_norm": 8.360886709835688,
"learning_rate": 1.9615438374162175e-06,
"loss": 0.1798,
"step": 40
},
{
"epoch": 0.029181494661921707,
"grad_norm": 15.089155109836698,
"learning_rate": 1.9746740149291565e-06,
"loss": 0.2107,
"step": 41
},
{
"epoch": 0.0298932384341637,
"grad_norm": 6.37160626004211,
"learning_rate": 1.9874877716226724e-06,
"loss": 0.0778,
"step": 42
},
{
"epoch": 0.030604982206405694,
"grad_norm": 8.791224560461469,
"learning_rate": 1.9999999999999995e-06,
"loss": 0.0476,
"step": 43
},
{
"epoch": 0.03131672597864769,
"grad_norm": 5.854989569807006,
"learning_rate": 2e-06,
"loss": 0.2265,
"step": 44
},
{
"epoch": 0.03202846975088968,
"grad_norm": 10.330861154797649,
"learning_rate": 1.9985315712187957e-06,
"loss": 0.0709,
"step": 45
},
{
"epoch": 0.032740213523131674,
"grad_norm": 3.875388833503013,
"learning_rate": 1.997063142437592e-06,
"loss": 0.1164,
"step": 46
},
{
"epoch": 0.03345195729537367,
"grad_norm": 4.411229559661042,
"learning_rate": 1.9955947136563876e-06,
"loss": 0.1332,
"step": 47
},
{
"epoch": 0.03416370106761566,
"grad_norm": 6.6818738426669215,
"learning_rate": 1.9941262848751834e-06,
"loss": 0.0902,
"step": 48
},
{
"epoch": 0.034875444839857654,
"grad_norm": 6.317530978345379,
"learning_rate": 1.992657856093979e-06,
"loss": 0.1217,
"step": 49
},
{
"epoch": 0.03558718861209965,
"grad_norm": 5.011833112308298,
"learning_rate": 1.9911894273127754e-06,
"loss": 0.0623,
"step": 50
},
{
"epoch": 0.036298932384341634,
"grad_norm": 4.000249378060063,
"learning_rate": 1.989720998531571e-06,
"loss": 0.0499,
"step": 51
},
{
"epoch": 0.03701067615658363,
"grad_norm": 3.6701163767727008,
"learning_rate": 1.988252569750367e-06,
"loss": 0.0168,
"step": 52
},
{
"epoch": 0.03772241992882562,
"grad_norm": 6.081936227280345,
"learning_rate": 1.9867841409691626e-06,
"loss": 0.0552,
"step": 53
},
{
"epoch": 0.038434163701067614,
"grad_norm": 7.2095431450001835,
"learning_rate": 1.985315712187959e-06,
"loss": 0.1475,
"step": 54
},
{
"epoch": 0.03914590747330961,
"grad_norm": 3.911782215320487,
"learning_rate": 1.9838472834067546e-06,
"loss": 0.0324,
"step": 55
},
{
"epoch": 0.0398576512455516,
"grad_norm": 4.43042136812578,
"learning_rate": 1.982378854625551e-06,
"loss": 0.1017,
"step": 56
},
{
"epoch": 0.040569395017793594,
"grad_norm": 5.044943803185019,
"learning_rate": 1.9809104258443466e-06,
"loss": 0.1222,
"step": 57
},
{
"epoch": 0.04128113879003559,
"grad_norm": 3.9711826108138335,
"learning_rate": 1.9794419970631423e-06,
"loss": -0.0114,
"step": 58
},
{
"epoch": 0.04199288256227758,
"grad_norm": 5.835195552945358,
"learning_rate": 1.977973568281938e-06,
"loss": 0.0505,
"step": 59
},
{
"epoch": 0.042704626334519574,
"grad_norm": 6.459322781806197,
"learning_rate": 1.9765051395007343e-06,
"loss": 0.0874,
"step": 60
},
{
"epoch": 0.04341637010676157,
"grad_norm": 5.233507135667908,
"learning_rate": 1.97503671071953e-06,
"loss": 0.076,
"step": 61
},
{
"epoch": 0.04412811387900356,
"grad_norm": 5.151067730484437,
"learning_rate": 1.973568281938326e-06,
"loss": 0.0179,
"step": 62
},
{
"epoch": 0.044839857651245554,
"grad_norm": 3.957796133966113,
"learning_rate": 1.972099853157122e-06,
"loss": -0.0188,
"step": 63
},
{
"epoch": 0.04555160142348755,
"grad_norm": 5.406649536187206,
"learning_rate": 1.9706314243759178e-06,
"loss": 0.0788,
"step": 64
},
{
"epoch": 0.046263345195729534,
"grad_norm": 7.595276479001964,
"learning_rate": 1.9691629955947135e-06,
"loss": 0.1444,
"step": 65
},
{
"epoch": 0.04697508896797153,
"grad_norm": 3.711667023276038,
"learning_rate": 1.9676945668135093e-06,
"loss": 0.0428,
"step": 66
},
{
"epoch": 0.04768683274021352,
"grad_norm": 5.0012249399790605,
"learning_rate": 1.9662261380323055e-06,
"loss": 0.171,
"step": 67
},
{
"epoch": 0.048398576512455514,
"grad_norm": 4.020935110013036,
"learning_rate": 1.9647577092511012e-06,
"loss": 0.0764,
"step": 68
},
{
"epoch": 0.04911032028469751,
"grad_norm": 6.544660170834616,
"learning_rate": 1.963289280469897e-06,
"loss": 0.0735,
"step": 69
},
{
"epoch": 0.0498220640569395,
"grad_norm": 5.52421147860282,
"learning_rate": 1.9618208516886928e-06,
"loss": 0.0966,
"step": 70
},
{
"epoch": 0.050533807829181494,
"grad_norm": 5.826681249732675,
"learning_rate": 1.960352422907489e-06,
"loss": 0.0558,
"step": 71
},
{
"epoch": 0.05124555160142349,
"grad_norm": 3.651382466662001,
"learning_rate": 1.9588839941262847e-06,
"loss": 0.0579,
"step": 72
},
{
"epoch": 0.05195729537366548,
"grad_norm": 6.462907446997599,
"learning_rate": 1.9574155653450805e-06,
"loss": 0.2494,
"step": 73
},
{
"epoch": 0.052669039145907474,
"grad_norm": 6.418817168035706,
"learning_rate": 1.9559471365638767e-06,
"loss": 0.2365,
"step": 74
},
{
"epoch": 0.05338078291814947,
"grad_norm": 8.436266999752513,
"learning_rate": 1.9544787077826725e-06,
"loss": 0.186,
"step": 75
},
{
"epoch": 0.05409252669039146,
"grad_norm": 5.451772265172017,
"learning_rate": 1.9530102790014682e-06,
"loss": 0.0795,
"step": 76
},
{
"epoch": 0.054804270462633455,
"grad_norm": 6.3842834495967455,
"learning_rate": 1.9515418502202644e-06,
"loss": 0.0591,
"step": 77
},
{
"epoch": 0.05551601423487545,
"grad_norm": 7.865708940472592,
"learning_rate": 1.95007342143906e-06,
"loss": 0.117,
"step": 78
},
{
"epoch": 0.056227758007117434,
"grad_norm": 4.359122757809347,
"learning_rate": 1.948604992657856e-06,
"loss": 0.0383,
"step": 79
},
{
"epoch": 0.05693950177935943,
"grad_norm": 6.408372778537557,
"learning_rate": 1.947136563876652e-06,
"loss": 0.1241,
"step": 80
},
{
"epoch": 0.05765124555160142,
"grad_norm": 5.159773853580481,
"learning_rate": 1.945668135095448e-06,
"loss": 0.0865,
"step": 81
},
{
"epoch": 0.058362989323843414,
"grad_norm": 5.270421048450883,
"learning_rate": 1.9441997063142437e-06,
"loss": 0.1287,
"step": 82
},
{
"epoch": 0.05907473309608541,
"grad_norm": 7.587688522887674,
"learning_rate": 1.9427312775330394e-06,
"loss": 0.1413,
"step": 83
},
{
"epoch": 0.0597864768683274,
"grad_norm": 5.945961474414765,
"learning_rate": 1.9412628487518356e-06,
"loss": 0.1981,
"step": 84
},
{
"epoch": 0.060498220640569395,
"grad_norm": 5.193881969404905,
"learning_rate": 1.9397944199706314e-06,
"loss": 0.1441,
"step": 85
},
{
"epoch": 0.06120996441281139,
"grad_norm": 7.28816359886349,
"learning_rate": 1.938325991189427e-06,
"loss": 0.1343,
"step": 86
},
{
"epoch": 0.06192170818505338,
"grad_norm": 7.258030357417186,
"learning_rate": 1.936857562408223e-06,
"loss": -0.0681,
"step": 87
},
{
"epoch": 0.06263345195729537,
"grad_norm": 4.389772379889555,
"learning_rate": 1.935389133627019e-06,
"loss": 0.1312,
"step": 88
},
{
"epoch": 0.06334519572953737,
"grad_norm": 5.281648158125961,
"learning_rate": 1.933920704845815e-06,
"loss": -0.0927,
"step": 89
},
{
"epoch": 0.06405693950177936,
"grad_norm": 5.403255978657539,
"learning_rate": 1.9324522760646106e-06,
"loss": 0.0721,
"step": 90
},
{
"epoch": 0.06476868327402135,
"grad_norm": 5.316790654441754,
"learning_rate": 1.9309838472834064e-06,
"loss": 0.0725,
"step": 91
},
{
"epoch": 0.06548042704626335,
"grad_norm": 3.7253524767120534,
"learning_rate": 1.9295154185022026e-06,
"loss": 0.0371,
"step": 92
},
{
"epoch": 0.06619217081850534,
"grad_norm": 4.489100926703136,
"learning_rate": 1.9280469897209984e-06,
"loss": 0.073,
"step": 93
},
{
"epoch": 0.06690391459074733,
"grad_norm": 6.741548121116984,
"learning_rate": 1.9265785609397945e-06,
"loss": 0.0089,
"step": 94
},
{
"epoch": 0.06761565836298933,
"grad_norm": 8.5491531934284,
"learning_rate": 1.9251101321585903e-06,
"loss": 0.1794,
"step": 95
},
{
"epoch": 0.06832740213523132,
"grad_norm": 6.63521867473652,
"learning_rate": 1.923641703377386e-06,
"loss": 0.2012,
"step": 96
},
{
"epoch": 0.06903914590747331,
"grad_norm": 5.711657853102565,
"learning_rate": 1.9221732745961823e-06,
"loss": 0.018,
"step": 97
},
{
"epoch": 0.06975088967971531,
"grad_norm": 6.295723556615106,
"learning_rate": 1.920704845814978e-06,
"loss": 0.0415,
"step": 98
},
{
"epoch": 0.0704626334519573,
"grad_norm": 5.1015357678400886,
"learning_rate": 1.919236417033774e-06,
"loss": 0.1646,
"step": 99
},
{
"epoch": 0.0711743772241993,
"grad_norm": 8.01743609991442,
"learning_rate": 1.9177679882525696e-06,
"loss": 0.1194,
"step": 100
},
{
"epoch": 0.07188612099644127,
"grad_norm": 5.814410152154771,
"learning_rate": 1.9162995594713658e-06,
"loss": 0.1503,
"step": 101
},
{
"epoch": 0.07259786476868327,
"grad_norm": 3.963587970772288,
"learning_rate": 1.9148311306901615e-06,
"loss": 0.0283,
"step": 102
},
{
"epoch": 0.07330960854092526,
"grad_norm": 9.086552554015991,
"learning_rate": 1.9133627019089573e-06,
"loss": 0.0953,
"step": 103
},
{
"epoch": 0.07402135231316725,
"grad_norm": 6.49258146226826,
"learning_rate": 1.911894273127753e-06,
"loss": 0.1819,
"step": 104
},
{
"epoch": 0.07473309608540925,
"grad_norm": 4.830091946123574,
"learning_rate": 1.9104258443465492e-06,
"loss": 0.0733,
"step": 105
},
{
"epoch": 0.07544483985765124,
"grad_norm": 5.181409270596075,
"learning_rate": 1.908957415565345e-06,
"loss": 0.1348,
"step": 106
},
{
"epoch": 0.07615658362989323,
"grad_norm": 7.155432242100337,
"learning_rate": 1.9074889867841408e-06,
"loss": 0.1745,
"step": 107
},
{
"epoch": 0.07686832740213523,
"grad_norm": 5.385060321324789,
"learning_rate": 1.9060205580029367e-06,
"loss": 0.133,
"step": 108
},
{
"epoch": 0.07758007117437722,
"grad_norm": 8.457953634023596,
"learning_rate": 1.9045521292217325e-06,
"loss": 0.0449,
"step": 109
},
{
"epoch": 0.07829181494661921,
"grad_norm": 5.9295118705462775,
"learning_rate": 1.9030837004405285e-06,
"loss": 0.0189,
"step": 110
},
{
"epoch": 0.07900355871886121,
"grad_norm": 4.69031430002641,
"learning_rate": 1.9016152716593243e-06,
"loss": 0.1328,
"step": 111
},
{
"epoch": 0.0797153024911032,
"grad_norm": 7.986583187874881,
"learning_rate": 1.9001468428781202e-06,
"loss": 0.1729,
"step": 112
},
{
"epoch": 0.0804270462633452,
"grad_norm": 5.246412232049036,
"learning_rate": 1.8986784140969162e-06,
"loss": 0.1714,
"step": 113
},
{
"epoch": 0.08113879003558719,
"grad_norm": 7.792759973315551,
"learning_rate": 1.8972099853157122e-06,
"loss": 0.1794,
"step": 114
},
{
"epoch": 0.08185053380782918,
"grad_norm": 4.207402771649719,
"learning_rate": 1.8957415565345082e-06,
"loss": 0.0331,
"step": 115
},
{
"epoch": 0.08256227758007118,
"grad_norm": 5.116878121611436,
"learning_rate": 1.894273127753304e-06,
"loss": -0.0015,
"step": 116
},
{
"epoch": 0.08327402135231317,
"grad_norm": 3.6983379858284824,
"learning_rate": 1.8928046989721e-06,
"loss": -0.035,
"step": 117
},
{
"epoch": 0.08398576512455516,
"grad_norm": 5.951576331841862,
"learning_rate": 1.8913362701908957e-06,
"loss": 0.1008,
"step": 118
},
{
"epoch": 0.08469750889679716,
"grad_norm": 5.009799890233696,
"learning_rate": 1.8898678414096916e-06,
"loss": 0.1037,
"step": 119
},
{
"epoch": 0.08540925266903915,
"grad_norm": 4.965469332944383,
"learning_rate": 1.8883994126284874e-06,
"loss": 0.0348,
"step": 120
},
{
"epoch": 0.08612099644128114,
"grad_norm": 7.9142446995932,
"learning_rate": 1.8869309838472834e-06,
"loss": 0.1599,
"step": 121
},
{
"epoch": 0.08683274021352314,
"grad_norm": 3.3174167347213226,
"learning_rate": 1.8854625550660792e-06,
"loss": -0.1183,
"step": 122
},
{
"epoch": 0.08754448398576513,
"grad_norm": 5.90505921000642,
"learning_rate": 1.8839941262848751e-06,
"loss": 0.1535,
"step": 123
},
{
"epoch": 0.08825622775800712,
"grad_norm": 7.857140528690935,
"learning_rate": 1.882525697503671e-06,
"loss": 0.0556,
"step": 124
},
{
"epoch": 0.08896797153024912,
"grad_norm": 4.818522066694991,
"learning_rate": 1.8810572687224669e-06,
"loss": 0.1041,
"step": 125
},
{
"epoch": 0.08967971530249111,
"grad_norm": 9.055482028072397,
"learning_rate": 1.8795888399412626e-06,
"loss": 0.056,
"step": 126
},
{
"epoch": 0.0903914590747331,
"grad_norm": 6.0870744184366545,
"learning_rate": 1.8781204111600586e-06,
"loss": 0.1566,
"step": 127
},
{
"epoch": 0.0911032028469751,
"grad_norm": 6.217795754068048,
"learning_rate": 1.8766519823788544e-06,
"loss": 0.0989,
"step": 128
},
{
"epoch": 0.09181494661921709,
"grad_norm": 10.18800736550528,
"learning_rate": 1.8751835535976504e-06,
"loss": 0.071,
"step": 129
},
{
"epoch": 0.09252669039145907,
"grad_norm": 4.216140731359187,
"learning_rate": 1.8737151248164461e-06,
"loss": -0.0453,
"step": 130
},
{
"epoch": 0.09323843416370106,
"grad_norm": 7.185461136601148,
"learning_rate": 1.8722466960352421e-06,
"loss": 0.1436,
"step": 131
},
{
"epoch": 0.09395017793594305,
"grad_norm": 12.892273213757303,
"learning_rate": 1.8707782672540383e-06,
"loss": 0.0127,
"step": 132
},
{
"epoch": 0.09466192170818505,
"grad_norm": 5.811455089210124,
"learning_rate": 1.869309838472834e-06,
"loss": 0.0611,
"step": 133
},
{
"epoch": 0.09537366548042704,
"grad_norm": 8.637525668292405,
"learning_rate": 1.86784140969163e-06,
"loss": 0.1458,
"step": 134
},
{
"epoch": 0.09608540925266904,
"grad_norm": 4.6089392504571265,
"learning_rate": 1.8663729809104258e-06,
"loss": 0.0784,
"step": 135
},
{
"epoch": 0.09679715302491103,
"grad_norm": 3.9349535019101336,
"learning_rate": 1.8649045521292218e-06,
"loss": 0.0716,
"step": 136
},
{
"epoch": 0.09750889679715302,
"grad_norm": 7.3242760414425945,
"learning_rate": 1.8634361233480175e-06,
"loss": 0.0706,
"step": 137
},
{
"epoch": 0.09822064056939502,
"grad_norm": 8.010497834811732,
"learning_rate": 1.8619676945668135e-06,
"loss": 0.1117,
"step": 138
},
{
"epoch": 0.09893238434163701,
"grad_norm": 3.481371350459092,
"learning_rate": 1.8604992657856093e-06,
"loss": 0.1401,
"step": 139
},
{
"epoch": 0.099644128113879,
"grad_norm": 4.368546439955386,
"learning_rate": 1.8590308370044053e-06,
"loss": 0.1753,
"step": 140
},
{
"epoch": 0.100355871886121,
"grad_norm": 4.542224931382252,
"learning_rate": 1.857562408223201e-06,
"loss": 0.129,
"step": 141
},
{
"epoch": 0.10106761565836299,
"grad_norm": 4.088790801733064,
"learning_rate": 1.856093979441997e-06,
"loss": 0.0443,
"step": 142
},
{
"epoch": 0.10177935943060498,
"grad_norm": 3.4192080621509175,
"learning_rate": 1.8546255506607928e-06,
"loss": 0.0547,
"step": 143
},
{
"epoch": 0.10249110320284698,
"grad_norm": 7.379458873728003,
"learning_rate": 1.8531571218795888e-06,
"loss": 0.2053,
"step": 144
},
{
"epoch": 0.10320284697508897,
"grad_norm": 5.112633648829968,
"learning_rate": 1.8516886930983845e-06,
"loss": 0.0284,
"step": 145
},
{
"epoch": 0.10391459074733096,
"grad_norm": 9.952083611017523,
"learning_rate": 1.8502202643171805e-06,
"loss": 0.1943,
"step": 146
},
{
"epoch": 0.10462633451957296,
"grad_norm": 4.472282454017271,
"learning_rate": 1.8487518355359763e-06,
"loss": 0.0597,
"step": 147
},
{
"epoch": 0.10533807829181495,
"grad_norm": 4.735680126419607,
"learning_rate": 1.8472834067547722e-06,
"loss": 0.1466,
"step": 148
},
{
"epoch": 0.10604982206405694,
"grad_norm": 3.779905348102356,
"learning_rate": 1.845814977973568e-06,
"loss": 0.1324,
"step": 149
},
{
"epoch": 0.10676156583629894,
"grad_norm": 9.724957510651878,
"learning_rate": 1.844346549192364e-06,
"loss": 0.3093,
"step": 150
},
{
"epoch": 0.10747330960854093,
"grad_norm": 4.251906920827278,
"learning_rate": 1.84287812041116e-06,
"loss": 0.0364,
"step": 151
},
{
"epoch": 0.10818505338078292,
"grad_norm": 4.461130542219279,
"learning_rate": 1.841409691629956e-06,
"loss": 0.1279,
"step": 152
},
{
"epoch": 0.10889679715302492,
"grad_norm": 4.268606185166076,
"learning_rate": 1.839941262848752e-06,
"loss": 0.0549,
"step": 153
},
{
"epoch": 0.10960854092526691,
"grad_norm": 6.057791857897027,
"learning_rate": 1.8384728340675477e-06,
"loss": -0.0231,
"step": 154
},
{
"epoch": 0.1103202846975089,
"grad_norm": 5.236227043795447,
"learning_rate": 1.8370044052863437e-06,
"loss": 0.0098,
"step": 155
},
{
"epoch": 0.1110320284697509,
"grad_norm": 4.598051587320243,
"learning_rate": 1.8355359765051394e-06,
"loss": 0.0601,
"step": 156
},
{
"epoch": 0.11174377224199289,
"grad_norm": 9.861476861561522,
"learning_rate": 1.8340675477239354e-06,
"loss": 0.07,
"step": 157
},
{
"epoch": 0.11245551601423487,
"grad_norm": 6.225559819185409,
"learning_rate": 1.8325991189427312e-06,
"loss": 0.1472,
"step": 158
},
{
"epoch": 0.11316725978647686,
"grad_norm": 4.631070045753654,
"learning_rate": 1.8311306901615271e-06,
"loss": 0.0717,
"step": 159
},
{
"epoch": 0.11387900355871886,
"grad_norm": 5.736291549508864,
"learning_rate": 1.829662261380323e-06,
"loss": 0.1787,
"step": 160
},
{
"epoch": 0.11459074733096085,
"grad_norm": 4.815478307431745,
"learning_rate": 1.8281938325991189e-06,
"loss": 0.0704,
"step": 161
},
{
"epoch": 0.11530249110320284,
"grad_norm": 5.413176052520917,
"learning_rate": 1.8267254038179147e-06,
"loss": 0.1206,
"step": 162
},
{
"epoch": 0.11601423487544484,
"grad_norm": 4.795932739358852,
"learning_rate": 1.8252569750367106e-06,
"loss": 0.1241,
"step": 163
},
{
"epoch": 0.11672597864768683,
"grad_norm": 6.287620540244574,
"learning_rate": 1.8237885462555064e-06,
"loss": -0.0185,
"step": 164
},
{
"epoch": 0.11743772241992882,
"grad_norm": 5.088224444723123,
"learning_rate": 1.8223201174743024e-06,
"loss": 0.0659,
"step": 165
},
{
"epoch": 0.11814946619217082,
"grad_norm": 5.277869773642978,
"learning_rate": 1.8208516886930981e-06,
"loss": 0.1567,
"step": 166
},
{
"epoch": 0.11886120996441281,
"grad_norm": 4.098325312915435,
"learning_rate": 1.8193832599118941e-06,
"loss": 0.0174,
"step": 167
},
{
"epoch": 0.1195729537366548,
"grad_norm": 3.747692160944269,
"learning_rate": 1.81791483113069e-06,
"loss": 0.0087,
"step": 168
},
{
"epoch": 0.1202846975088968,
"grad_norm": 4.252810054108135,
"learning_rate": 1.8164464023494859e-06,
"loss": 0.0087,
"step": 169
},
{
"epoch": 0.12099644128113879,
"grad_norm": 4.828844134158512,
"learning_rate": 1.8149779735682818e-06,
"loss": 0.0951,
"step": 170
},
{
"epoch": 0.12170818505338078,
"grad_norm": 4.874472173824431,
"learning_rate": 1.8135095447870778e-06,
"loss": 0.1129,
"step": 171
},
{
"epoch": 0.12241992882562278,
"grad_norm": 7.430423222806325,
"learning_rate": 1.8120411160058738e-06,
"loss": 0.1398,
"step": 172
},
{
"epoch": 0.12313167259786477,
"grad_norm": 3.1268219786286453,
"learning_rate": 1.8105726872246696e-06,
"loss": -0.0363,
"step": 173
},
{
"epoch": 0.12384341637010676,
"grad_norm": 5.396860601762695,
"learning_rate": 1.8091042584434655e-06,
"loss": 0.1385,
"step": 174
},
{
"epoch": 0.12455516014234876,
"grad_norm": 4.718918348934545,
"learning_rate": 1.8076358296622613e-06,
"loss": 0.0925,
"step": 175
},
{
"epoch": 0.12526690391459075,
"grad_norm": 7.307204717516227,
"learning_rate": 1.8061674008810573e-06,
"loss": 0.2154,
"step": 176
},
{
"epoch": 0.12597864768683273,
"grad_norm": 5.0495562443417255,
"learning_rate": 1.804698972099853e-06,
"loss": 0.1373,
"step": 177
},
{
"epoch": 0.12669039145907474,
"grad_norm": 4.222659186010196,
"learning_rate": 1.803230543318649e-06,
"loss": 0.1066,
"step": 178
},
{
"epoch": 0.12740213523131672,
"grad_norm": 5.402167393915139,
"learning_rate": 1.8017621145374448e-06,
"loss": 0.0681,
"step": 179
},
{
"epoch": 0.12811387900355872,
"grad_norm": 4.224096682439072,
"learning_rate": 1.8002936857562408e-06,
"loss": 0.1081,
"step": 180
},
{
"epoch": 0.1288256227758007,
"grad_norm": 5.742877031304056,
"learning_rate": 1.7988252569750365e-06,
"loss": 0.0093,
"step": 181
},
{
"epoch": 0.1295373665480427,
"grad_norm": 5.540002826455837,
"learning_rate": 1.7973568281938325e-06,
"loss": 0.0098,
"step": 182
},
{
"epoch": 0.1302491103202847,
"grad_norm": 4.891124760744802,
"learning_rate": 1.7958883994126283e-06,
"loss": 0.1506,
"step": 183
},
{
"epoch": 0.1309608540925267,
"grad_norm": 2.99421547474528,
"learning_rate": 1.7944199706314243e-06,
"loss": -0.012,
"step": 184
},
{
"epoch": 0.13167259786476868,
"grad_norm": 8.379749374829832,
"learning_rate": 1.7929515418502202e-06,
"loss": 0.0833,
"step": 185
},
{
"epoch": 0.13238434163701068,
"grad_norm": 6.515892846619184,
"learning_rate": 1.791483113069016e-06,
"loss": 0.0955,
"step": 186
},
{
"epoch": 0.13309608540925266,
"grad_norm": 5.017827486261835,
"learning_rate": 1.790014684287812e-06,
"loss": 0.1188,
"step": 187
},
{
"epoch": 0.13380782918149467,
"grad_norm": 5.695263939586359,
"learning_rate": 1.7885462555066077e-06,
"loss": 0.124,
"step": 188
},
{
"epoch": 0.13451957295373665,
"grad_norm": 6.125125494469321,
"learning_rate": 1.7870778267254037e-06,
"loss": 0.0924,
"step": 189
},
{
"epoch": 0.13523131672597866,
"grad_norm": 4.521668926781665,
"learning_rate": 1.7856093979441997e-06,
"loss": -0.1061,
"step": 190
},
{
"epoch": 0.13594306049822064,
"grad_norm": 4.267143230844152,
"learning_rate": 1.7841409691629957e-06,
"loss": 0.0566,
"step": 191
},
{
"epoch": 0.13665480427046264,
"grad_norm": 6.08031214414718,
"learning_rate": 1.7826725403817914e-06,
"loss": 0.115,
"step": 192
},
{
"epoch": 0.13736654804270462,
"grad_norm": 5.940400710028165,
"learning_rate": 1.7812041116005874e-06,
"loss": 0.1274,
"step": 193
},
{
"epoch": 0.13807829181494663,
"grad_norm": 4.585486004779037,
"learning_rate": 1.7797356828193832e-06,
"loss": 0.1713,
"step": 194
},
{
"epoch": 0.1387900355871886,
"grad_norm": 4.136195163173087,
"learning_rate": 1.7782672540381792e-06,
"loss": 0.0155,
"step": 195
},
{
"epoch": 0.13950177935943062,
"grad_norm": 5.6989356134689615,
"learning_rate": 1.776798825256975e-06,
"loss": 0.0103,
"step": 196
},
{
"epoch": 0.1402135231316726,
"grad_norm": 6.1401351461541225,
"learning_rate": 1.775330396475771e-06,
"loss": 0.1273,
"step": 197
},
{
"epoch": 0.1409252669039146,
"grad_norm": 4.885723546305384,
"learning_rate": 1.7738619676945667e-06,
"loss": 0.027,
"step": 198
},
{
"epoch": 0.14163701067615658,
"grad_norm": 3.887840530837794,
"learning_rate": 1.7723935389133626e-06,
"loss": -0.0177,
"step": 199
},
{
"epoch": 0.1423487544483986,
"grad_norm": 6.78388018953942,
"learning_rate": 1.7709251101321584e-06,
"loss": 0.2093,
"step": 200
},
{
"epoch": 0.14306049822064057,
"grad_norm": 3.94780351917843,
"learning_rate": 1.7694566813509544e-06,
"loss": 0.1258,
"step": 201
},
{
"epoch": 0.14377224199288255,
"grad_norm": 3.8629567441046673,
"learning_rate": 1.7679882525697504e-06,
"loss": 0.0233,
"step": 202
},
{
"epoch": 0.14448398576512456,
"grad_norm": 4.979996147920275,
"learning_rate": 1.7665198237885461e-06,
"loss": 0.0793,
"step": 203
},
{
"epoch": 0.14519572953736654,
"grad_norm": 4.0538696169783215,
"learning_rate": 1.765051395007342e-06,
"loss": 0.0413,
"step": 204
},
{
"epoch": 0.14590747330960854,
"grad_norm": 6.589998597302671,
"learning_rate": 1.7635829662261379e-06,
"loss": 0.1296,
"step": 205
},
{
"epoch": 0.14661921708185052,
"grad_norm": 4.877422415596789,
"learning_rate": 1.7621145374449338e-06,
"loss": 0.0984,
"step": 206
},
{
"epoch": 0.14733096085409253,
"grad_norm": 4.852718393716642,
"learning_rate": 1.7606461086637296e-06,
"loss": 0.0614,
"step": 207
},
{
"epoch": 0.1480427046263345,
"grad_norm": 5.871070034362448,
"learning_rate": 1.7591776798825256e-06,
"loss": 0.0569,
"step": 208
},
{
"epoch": 0.14875444839857652,
"grad_norm": 6.958293374074386,
"learning_rate": 1.7577092511013214e-06,
"loss": 0.0621,
"step": 209
},
{
"epoch": 0.1494661921708185,
"grad_norm": 7.285285846283627,
"learning_rate": 1.7562408223201175e-06,
"loss": 0.0888,
"step": 210
},
{
"epoch": 0.1501779359430605,
"grad_norm": 5.471947133249475,
"learning_rate": 1.7547723935389133e-06,
"loss": 0.0383,
"step": 211
},
{
"epoch": 0.15088967971530248,
"grad_norm": 4.096639065092099,
"learning_rate": 1.7533039647577093e-06,
"loss": -0.0454,
"step": 212
},
{
"epoch": 0.1516014234875445,
"grad_norm": 3.473240875019082,
"learning_rate": 1.751835535976505e-06,
"loss": 0.0136,
"step": 213
},
{
"epoch": 0.15231316725978647,
"grad_norm": 4.26736649396595,
"learning_rate": 1.750367107195301e-06,
"loss": 0.0447,
"step": 214
},
{
"epoch": 0.15302491103202848,
"grad_norm": 3.2994470190796923,
"learning_rate": 1.7488986784140968e-06,
"loss": 0.0757,
"step": 215
},
{
"epoch": 0.15373665480427046,
"grad_norm": 8.31822386123182,
"learning_rate": 1.7474302496328928e-06,
"loss": 0.1598,
"step": 216
},
{
"epoch": 0.15444839857651246,
"grad_norm": 4.753324048290485,
"learning_rate": 1.7459618208516885e-06,
"loss": 0.0325,
"step": 217
},
{
"epoch": 0.15516014234875444,
"grad_norm": 5.741786563915158,
"learning_rate": 1.7444933920704845e-06,
"loss": 0.2178,
"step": 218
},
{
"epoch": 0.15587188612099645,
"grad_norm": 3.1404840997622285,
"learning_rate": 1.7430249632892805e-06,
"loss": 0.1023,
"step": 219
},
{
"epoch": 0.15658362989323843,
"grad_norm": 5.481284115553573,
"learning_rate": 1.7415565345080763e-06,
"loss": 0.0977,
"step": 220
},
{
"epoch": 0.15729537366548044,
"grad_norm": 5.366544771093536,
"learning_rate": 1.7400881057268722e-06,
"loss": 0.1473,
"step": 221
},
{
"epoch": 0.15800711743772242,
"grad_norm": 14.973850090188478,
"learning_rate": 1.738619676945668e-06,
"loss": 0.1034,
"step": 222
},
{
"epoch": 0.15871886120996442,
"grad_norm": 7.50503244046107,
"learning_rate": 1.737151248164464e-06,
"loss": 0.1243,
"step": 223
},
{
"epoch": 0.1594306049822064,
"grad_norm": 4.518201891315668,
"learning_rate": 1.7356828193832597e-06,
"loss": 0.092,
"step": 224
},
{
"epoch": 0.1601423487544484,
"grad_norm": 3.4314498370679942,
"learning_rate": 1.7342143906020557e-06,
"loss": 0.0721,
"step": 225
},
{
"epoch": 0.1608540925266904,
"grad_norm": 5.197626180947426,
"learning_rate": 1.7327459618208515e-06,
"loss": 0.0609,
"step": 226
},
{
"epoch": 0.1615658362989324,
"grad_norm": 5.547918860845338,
"learning_rate": 1.7312775330396475e-06,
"loss": 0.0444,
"step": 227
},
{
"epoch": 0.16227758007117438,
"grad_norm": 6.431239963890675,
"learning_rate": 1.7298091042584432e-06,
"loss": -0.0016,
"step": 228
},
{
"epoch": 0.16298932384341638,
"grad_norm": 4.553712298781429,
"learning_rate": 1.7283406754772394e-06,
"loss": 0.1477,
"step": 229
},
{
"epoch": 0.16370106761565836,
"grad_norm": 5.22540005699231,
"learning_rate": 1.7268722466960352e-06,
"loss": 0.0993,
"step": 230
},
{
"epoch": 0.16441281138790034,
"grad_norm": 6.437986781409808,
"learning_rate": 1.7254038179148312e-06,
"loss": 0.0359,
"step": 231
},
{
"epoch": 0.16512455516014235,
"grad_norm": 4.695253800264834,
"learning_rate": 1.723935389133627e-06,
"loss": 0.0645,
"step": 232
},
{
"epoch": 0.16583629893238433,
"grad_norm": 6.533685695462389,
"learning_rate": 1.722466960352423e-06,
"loss": 0.0512,
"step": 233
},
{
"epoch": 0.16654804270462634,
"grad_norm": 9.138116893479024,
"learning_rate": 1.7209985315712187e-06,
"loss": 0.0731,
"step": 234
},
{
"epoch": 0.16725978647686832,
"grad_norm": 5.624524583858941,
"learning_rate": 1.7195301027900147e-06,
"loss": 0.17,
"step": 235
},
{
"epoch": 0.16797153024911032,
"grad_norm": 3.2300353491770943,
"learning_rate": 1.7180616740088106e-06,
"loss": 0.0418,
"step": 236
},
{
"epoch": 0.1686832740213523,
"grad_norm": 3.1549379246525033,
"learning_rate": 1.7165932452276064e-06,
"loss": 0.0072,
"step": 237
},
{
"epoch": 0.1693950177935943,
"grad_norm": 6.45374448449067,
"learning_rate": 1.7151248164464024e-06,
"loss": 0.0965,
"step": 238
},
{
"epoch": 0.1701067615658363,
"grad_norm": 3.2046807209540225,
"learning_rate": 1.7136563876651981e-06,
"loss": 0.1052,
"step": 239
},
{
"epoch": 0.1708185053380783,
"grad_norm": 17.53155549951064,
"learning_rate": 1.7121879588839941e-06,
"loss": 0.0309,
"step": 240
},
{
"epoch": 0.17153024911032028,
"grad_norm": 7.999750610279058,
"learning_rate": 1.7107195301027899e-06,
"loss": 0.0794,
"step": 241
},
{
"epoch": 0.17224199288256228,
"grad_norm": 4.18793587764703,
"learning_rate": 1.7092511013215859e-06,
"loss": 0.1157,
"step": 242
},
{
"epoch": 0.17295373665480426,
"grad_norm": 5.043597689648659,
"learning_rate": 1.7077826725403816e-06,
"loss": 0.0871,
"step": 243
},
{
"epoch": 0.17366548042704627,
"grad_norm": 4.195142234600667,
"learning_rate": 1.7063142437591776e-06,
"loss": 0.0907,
"step": 244
},
{
"epoch": 0.17437722419928825,
"grad_norm": 7.052542629714854,
"learning_rate": 1.7048458149779734e-06,
"loss": 0.1538,
"step": 245
},
{
"epoch": 0.17508896797153026,
"grad_norm": 4.6646820116985115,
"learning_rate": 1.7033773861967693e-06,
"loss": 0.0998,
"step": 246
},
{
"epoch": 0.17580071174377224,
"grad_norm": 5.141792327153177,
"learning_rate": 1.7019089574155651e-06,
"loss": 0.0019,
"step": 247
},
{
"epoch": 0.17651245551601424,
"grad_norm": 5.1975182920191845,
"learning_rate": 1.700440528634361e-06,
"loss": 0.1805,
"step": 248
},
{
"epoch": 0.17722419928825622,
"grad_norm": 6.812105219890212,
"learning_rate": 1.698972099853157e-06,
"loss": 0.1567,
"step": 249
},
{
"epoch": 0.17793594306049823,
"grad_norm": 8.036900295351552,
"learning_rate": 1.697503671071953e-06,
"loss": 0.0388,
"step": 250
},
{
"epoch": 0.1786476868327402,
"grad_norm": 5.270335640108646,
"learning_rate": 1.6960352422907488e-06,
"loss": 0.0209,
"step": 251
},
{
"epoch": 0.17935943060498222,
"grad_norm": 4.691025692308499,
"learning_rate": 1.6945668135095448e-06,
"loss": 0.1134,
"step": 252
},
{
"epoch": 0.1800711743772242,
"grad_norm": 4.582434595019,
"learning_rate": 1.6930983847283406e-06,
"loss": 0.0534,
"step": 253
},
{
"epoch": 0.1807829181494662,
"grad_norm": 5.2674928516109505,
"learning_rate": 1.6916299559471365e-06,
"loss": 0.1076,
"step": 254
},
{
"epoch": 0.18149466192170818,
"grad_norm": 4.571391599369404,
"learning_rate": 1.6901615271659325e-06,
"loss": 0.0748,
"step": 255
},
{
"epoch": 0.1822064056939502,
"grad_norm": 6.935073015616605,
"learning_rate": 1.6886930983847283e-06,
"loss": 0.0991,
"step": 256
},
{
"epoch": 0.18291814946619217,
"grad_norm": 8.20458599340894,
"learning_rate": 1.6872246696035242e-06,
"loss": 0.0553,
"step": 257
},
{
"epoch": 0.18362989323843418,
"grad_norm": 3.884973795641647,
"learning_rate": 1.68575624082232e-06,
"loss": 0.1122,
"step": 258
},
{
"epoch": 0.18434163701067616,
"grad_norm": 7.161399887308854,
"learning_rate": 1.684287812041116e-06,
"loss": 0.0906,
"step": 259
},
{
"epoch": 0.18505338078291814,
"grad_norm": 5.799383314825413,
"learning_rate": 1.6828193832599118e-06,
"loss": 0.091,
"step": 260
},
{
"epoch": 0.18576512455516014,
"grad_norm": 8.574044346176157,
"learning_rate": 1.6813509544787077e-06,
"loss": 0.1818,
"step": 261
},
{
"epoch": 0.18647686832740212,
"grad_norm": 5.636631955055512,
"learning_rate": 1.6798825256975035e-06,
"loss": -0.0482,
"step": 262
},
{
"epoch": 0.18718861209964413,
"grad_norm": 4.843371961054096,
"learning_rate": 1.6784140969162995e-06,
"loss": 0.0977,
"step": 263
},
{
"epoch": 0.1879003558718861,
"grad_norm": 4.0177832121329375,
"learning_rate": 1.6769456681350952e-06,
"loss": 0.0209,
"step": 264
},
{
"epoch": 0.18861209964412812,
"grad_norm": 4.87016389073631,
"learning_rate": 1.6754772393538912e-06,
"loss": 0.1963,
"step": 265
},
{
"epoch": 0.1893238434163701,
"grad_norm": 5.152554013905234,
"learning_rate": 1.674008810572687e-06,
"loss": 0.0175,
"step": 266
},
{
"epoch": 0.1900355871886121,
"grad_norm": 5.382963624114512,
"learning_rate": 1.672540381791483e-06,
"loss": 0.0613,
"step": 267
},
{
"epoch": 0.19074733096085408,
"grad_norm": 5.0952696610386745,
"learning_rate": 1.671071953010279e-06,
"loss": 0.0462,
"step": 268
},
{
"epoch": 0.1914590747330961,
"grad_norm": 4.557392135239278,
"learning_rate": 1.669603524229075e-06,
"loss": 0.2422,
"step": 269
},
{
"epoch": 0.19217081850533807,
"grad_norm": 3.243416500403976,
"learning_rate": 1.6681350954478707e-06,
"loss": 0.0716,
"step": 270
},
{
"epoch": 0.19288256227758008,
"grad_norm": 4.752322181453295,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0106,
"step": 271
},
{
"epoch": 0.19359430604982206,
"grad_norm": 5.174592102487308,
"learning_rate": 1.6651982378854626e-06,
"loss": 0.0096,
"step": 272
},
{
"epoch": 0.19430604982206406,
"grad_norm": 6.572477476154677,
"learning_rate": 1.6637298091042584e-06,
"loss": 0.2208,
"step": 273
},
{
"epoch": 0.19501779359430604,
"grad_norm": 5.709859305283521,
"learning_rate": 1.6622613803230544e-06,
"loss": 0.1049,
"step": 274
},
{
"epoch": 0.19572953736654805,
"grad_norm": 5.854991524394585,
"learning_rate": 1.6607929515418501e-06,
"loss": 0.0919,
"step": 275
},
{
"epoch": 0.19644128113879003,
"grad_norm": 6.049764253421174,
"learning_rate": 1.6593245227606461e-06,
"loss": 0.0477,
"step": 276
},
{
"epoch": 0.19715302491103204,
"grad_norm": 7.285867561202631,
"learning_rate": 1.6578560939794419e-06,
"loss": 0.0226,
"step": 277
},
{
"epoch": 0.19786476868327402,
"grad_norm": 4.852360707346899,
"learning_rate": 1.6563876651982379e-06,
"loss": 0.0613,
"step": 278
},
{
"epoch": 0.19857651245551602,
"grad_norm": 7.958493322658308,
"learning_rate": 1.6549192364170336e-06,
"loss": 0.0974,
"step": 279
},
{
"epoch": 0.199288256227758,
"grad_norm": 12.052061950787715,
"learning_rate": 1.6534508076358296e-06,
"loss": 0.125,
"step": 280
},
{
"epoch": 0.2,
"grad_norm": 4.141730855422617,
"learning_rate": 1.6519823788546254e-06,
"loss": 0.0366,
"step": 281
},
{
"epoch": 0.200711743772242,
"grad_norm": 4.70039039776032,
"learning_rate": 1.6505139500734214e-06,
"loss": 0.099,
"step": 282
},
{
"epoch": 0.201423487544484,
"grad_norm": 3.6386434376824224,
"learning_rate": 1.6490455212922171e-06,
"loss": 0.0729,
"step": 283
},
{
"epoch": 0.20213523131672598,
"grad_norm": 5.342530763972189,
"learning_rate": 1.647577092511013e-06,
"loss": 0.115,
"step": 284
},
{
"epoch": 0.20284697508896798,
"grad_norm": 4.117094151055721,
"learning_rate": 1.6461086637298089e-06,
"loss": -0.0421,
"step": 285
},
{
"epoch": 0.20355871886120996,
"grad_norm": 7.676379588638591,
"learning_rate": 1.6446402349486048e-06,
"loss": 0.1763,
"step": 286
},
{
"epoch": 0.20427046263345194,
"grad_norm": 6.232882136914352,
"learning_rate": 1.6431718061674006e-06,
"loss": 0.0297,
"step": 287
},
{
"epoch": 0.20498220640569395,
"grad_norm": 4.461775879651974,
"learning_rate": 1.6417033773861968e-06,
"loss": 0.1226,
"step": 288
},
{
"epoch": 0.20569395017793593,
"grad_norm": 6.945988041879261,
"learning_rate": 1.6402349486049928e-06,
"loss": 0.2332,
"step": 289
},
{
"epoch": 0.20640569395017794,
"grad_norm": 5.492669508987496,
"learning_rate": 1.6387665198237885e-06,
"loss": 0.0424,
"step": 290
},
{
"epoch": 0.20711743772241992,
"grad_norm": 3.6574823633839104,
"learning_rate": 1.6372980910425845e-06,
"loss": -0.1162,
"step": 291
},
{
"epoch": 0.20782918149466192,
"grad_norm": 4.45017248955224,
"learning_rate": 1.6358296622613803e-06,
"loss": 0.0168,
"step": 292
},
{
"epoch": 0.2085409252669039,
"grad_norm": 5.901019005144134,
"learning_rate": 1.6343612334801763e-06,
"loss": 0.1194,
"step": 293
},
{
"epoch": 0.2092526690391459,
"grad_norm": 5.770360970932692,
"learning_rate": 1.632892804698972e-06,
"loss": 0.0818,
"step": 294
},
{
"epoch": 0.2099644128113879,
"grad_norm": 4.54764594919165,
"learning_rate": 1.631424375917768e-06,
"loss": 0.0749,
"step": 295
},
{
"epoch": 0.2106761565836299,
"grad_norm": 4.338576714724069,
"learning_rate": 1.6299559471365638e-06,
"loss": 0.0211,
"step": 296
},
{
"epoch": 0.21138790035587188,
"grad_norm": 5.744409248117972,
"learning_rate": 1.6284875183553597e-06,
"loss": 0.0809,
"step": 297
},
{
"epoch": 0.21209964412811388,
"grad_norm": 5.62291661993109,
"learning_rate": 1.6270190895741555e-06,
"loss": 0.0373,
"step": 298
},
{
"epoch": 0.21281138790035586,
"grad_norm": 6.451351633319061,
"learning_rate": 1.6255506607929515e-06,
"loss": 0.1273,
"step": 299
},
{
"epoch": 0.21352313167259787,
"grad_norm": 5.775770735398924,
"learning_rate": 1.6240822320117473e-06,
"loss": 0.1488,
"step": 300
},
{
"epoch": 0.21423487544483985,
"grad_norm": 5.65199853440014,
"learning_rate": 1.6226138032305432e-06,
"loss": 0.0648,
"step": 301
},
{
"epoch": 0.21494661921708186,
"grad_norm": 5.095604494601722,
"learning_rate": 1.621145374449339e-06,
"loss": 0.0744,
"step": 302
},
{
"epoch": 0.21565836298932384,
"grad_norm": 5.413118442504562,
"learning_rate": 1.619676945668135e-06,
"loss": 0.061,
"step": 303
},
{
"epoch": 0.21637010676156584,
"grad_norm": 3.7007801986026054,
"learning_rate": 1.6182085168869307e-06,
"loss": 0.0895,
"step": 304
},
{
"epoch": 0.21708185053380782,
"grad_norm": 5.775824232856091,
"learning_rate": 1.6167400881057267e-06,
"loss": 0.0592,
"step": 305
},
{
"epoch": 0.21779359430604983,
"grad_norm": 4.671481623083791,
"learning_rate": 1.6152716593245225e-06,
"loss": 0.0348,
"step": 306
},
{
"epoch": 0.2185053380782918,
"grad_norm": 5.548421466861163,
"learning_rate": 1.6138032305433187e-06,
"loss": 0.0918,
"step": 307
},
{
"epoch": 0.21921708185053382,
"grad_norm": 4.07343007369804,
"learning_rate": 1.6123348017621146e-06,
"loss": 0.0971,
"step": 308
},
{
"epoch": 0.2199288256227758,
"grad_norm": 3.7358505334627647,
"learning_rate": 1.6108663729809104e-06,
"loss": 0.0479,
"step": 309
},
{
"epoch": 0.2206405693950178,
"grad_norm": 6.594222933183962,
"learning_rate": 1.6093979441997064e-06,
"loss": 0.0583,
"step": 310
},
{
"epoch": 0.22135231316725978,
"grad_norm": 4.517549414900124,
"learning_rate": 1.6079295154185022e-06,
"loss": 0.026,
"step": 311
},
{
"epoch": 0.2220640569395018,
"grad_norm": 3.893732997312208,
"learning_rate": 1.6064610866372981e-06,
"loss": -0.041,
"step": 312
},
{
"epoch": 0.22277580071174377,
"grad_norm": 6.014426372062505,
"learning_rate": 1.604992657856094e-06,
"loss": 0.0646,
"step": 313
},
{
"epoch": 0.22348754448398578,
"grad_norm": 7.267265807250511,
"learning_rate": 1.6035242290748899e-06,
"loss": 0.1517,
"step": 314
},
{
"epoch": 0.22419928825622776,
"grad_norm": 3.7282693021207227,
"learning_rate": 1.6020558002936856e-06,
"loss": 0.0652,
"step": 315
},
{
"epoch": 0.22491103202846974,
"grad_norm": 4.892638160809301,
"learning_rate": 1.6005873715124816e-06,
"loss": 0.212,
"step": 316
},
{
"epoch": 0.22562277580071174,
"grad_norm": 12.588780182154638,
"learning_rate": 1.5991189427312774e-06,
"loss": 0.2024,
"step": 317
},
{
"epoch": 0.22633451957295372,
"grad_norm": 4.509674058119238,
"learning_rate": 1.5976505139500734e-06,
"loss": 0.0002,
"step": 318
},
{
"epoch": 0.22704626334519573,
"grad_norm": 5.275333318077758,
"learning_rate": 1.5961820851688691e-06,
"loss": 0.0628,
"step": 319
},
{
"epoch": 0.2277580071174377,
"grad_norm": 4.985806920162633,
"learning_rate": 1.5947136563876651e-06,
"loss": 0.12,
"step": 320
},
{
"epoch": 0.22846975088967972,
"grad_norm": 3.368050519976615,
"learning_rate": 1.5932452276064609e-06,
"loss": 0.0697,
"step": 321
},
{
"epoch": 0.2291814946619217,
"grad_norm": 3.7965809900554333,
"learning_rate": 1.5917767988252569e-06,
"loss": 0.0518,
"step": 322
},
{
"epoch": 0.2298932384341637,
"grad_norm": 6.750969993690751,
"learning_rate": 1.5903083700440526e-06,
"loss": 0.0444,
"step": 323
},
{
"epoch": 0.23060498220640568,
"grad_norm": 3.7359230312448544,
"learning_rate": 1.5888399412628486e-06,
"loss": 0.1547,
"step": 324
},
{
"epoch": 0.2313167259786477,
"grad_norm": 5.383174446682282,
"learning_rate": 1.5873715124816446e-06,
"loss": 0.1029,
"step": 325
},
{
"epoch": 0.23202846975088967,
"grad_norm": 4.133656521949432,
"learning_rate": 1.5859030837004403e-06,
"loss": 0.1273,
"step": 326
},
{
"epoch": 0.23274021352313168,
"grad_norm": 5.0061001758801495,
"learning_rate": 1.5844346549192365e-06,
"loss": 0.0476,
"step": 327
},
{
"epoch": 0.23345195729537366,
"grad_norm": 5.067461949390494,
"learning_rate": 1.5829662261380323e-06,
"loss": 0.0065,
"step": 328
},
{
"epoch": 0.23416370106761566,
"grad_norm": 4.070687362776556,
"learning_rate": 1.5814977973568283e-06,
"loss": 0.0182,
"step": 329
},
{
"epoch": 0.23487544483985764,
"grad_norm": 4.104503688755299,
"learning_rate": 1.580029368575624e-06,
"loss": 0.0249,
"step": 330
},
{
"epoch": 0.23558718861209965,
"grad_norm": 6.173363151286035,
"learning_rate": 1.57856093979442e-06,
"loss": 0.1274,
"step": 331
},
{
"epoch": 0.23629893238434163,
"grad_norm": 5.123209943077641,
"learning_rate": 1.5770925110132158e-06,
"loss": 0.1077,
"step": 332
},
{
"epoch": 0.23701067615658364,
"grad_norm": 9.147892910509416,
"learning_rate": 1.5756240822320118e-06,
"loss": 0.0639,
"step": 333
},
{
"epoch": 0.23772241992882562,
"grad_norm": 9.078930082093752,
"learning_rate": 1.5741556534508075e-06,
"loss": 0.1176,
"step": 334
},
{
"epoch": 0.23843416370106763,
"grad_norm": 7.396135723397457,
"learning_rate": 1.5726872246696035e-06,
"loss": 0.1046,
"step": 335
},
{
"epoch": 0.2391459074733096,
"grad_norm": 17.515198129834094,
"learning_rate": 1.5712187958883993e-06,
"loss": 0.1535,
"step": 336
},
{
"epoch": 0.2398576512455516,
"grad_norm": 5.654849558750833,
"learning_rate": 1.5697503671071952e-06,
"loss": 0.1156,
"step": 337
},
{
"epoch": 0.2405693950177936,
"grad_norm": 4.6881638120471925,
"learning_rate": 1.568281938325991e-06,
"loss": 0.1043,
"step": 338
},
{
"epoch": 0.2412811387900356,
"grad_norm": 3.9981055541920023,
"learning_rate": 1.566813509544787e-06,
"loss": -0.074,
"step": 339
},
{
"epoch": 0.24199288256227758,
"grad_norm": 6.089583473932049,
"learning_rate": 1.5653450807635827e-06,
"loss": 0.1465,
"step": 340
},
{
"epoch": 0.24270462633451959,
"grad_norm": 4.313100800611475,
"learning_rate": 1.5638766519823787e-06,
"loss": 0.053,
"step": 341
},
{
"epoch": 0.24341637010676156,
"grad_norm": 4.889467144975206,
"learning_rate": 1.5624082232011747e-06,
"loss": 0.0717,
"step": 342
},
{
"epoch": 0.24412811387900357,
"grad_norm": 3.7378928242805607,
"learning_rate": 1.5609397944199705e-06,
"loss": 0.0617,
"step": 343
},
{
"epoch": 0.24483985765124555,
"grad_norm": 5.483036932234192,
"learning_rate": 1.5594713656387664e-06,
"loss": -0.0442,
"step": 344
},
{
"epoch": 0.24555160142348753,
"grad_norm": 4.64961948273496,
"learning_rate": 1.5580029368575622e-06,
"loss": 0.0449,
"step": 345
},
{
"epoch": 0.24626334519572954,
"grad_norm": 9.0044620896761,
"learning_rate": 1.5565345080763584e-06,
"loss": 0.0847,
"step": 346
},
{
"epoch": 0.24697508896797152,
"grad_norm": 4.30818314035339,
"learning_rate": 1.5550660792951542e-06,
"loss": 0.0968,
"step": 347
},
{
"epoch": 0.24768683274021353,
"grad_norm": 7.698780498292484,
"learning_rate": 1.5535976505139501e-06,
"loss": 0.0552,
"step": 348
},
{
"epoch": 0.2483985765124555,
"grad_norm": 4.735780211542353,
"learning_rate": 1.552129221732746e-06,
"loss": -0.021,
"step": 349
},
{
"epoch": 0.2491103202846975,
"grad_norm": 7.141409465679317,
"learning_rate": 1.5506607929515419e-06,
"loss": 0.0741,
"step": 350
},
{
"epoch": 0.2498220640569395,
"grad_norm": 3.889744793088473,
"learning_rate": 1.5491923641703377e-06,
"loss": 0.0057,
"step": 351
},
{
"epoch": 0.2505338078291815,
"grad_norm": 6.426034438875903,
"learning_rate": 1.5477239353891336e-06,
"loss": 0.0225,
"step": 352
},
{
"epoch": 0.2512455516014235,
"grad_norm": 5.831394781705116,
"learning_rate": 1.5462555066079294e-06,
"loss": 0.1369,
"step": 353
},
{
"epoch": 0.25195729537366546,
"grad_norm": 4.004979610319777,
"learning_rate": 1.5447870778267254e-06,
"loss": 0.131,
"step": 354
},
{
"epoch": 0.2526690391459075,
"grad_norm": 6.7718243465456744,
"learning_rate": 1.5433186490455211e-06,
"loss": 0.0178,
"step": 355
},
{
"epoch": 0.25338078291814947,
"grad_norm": 6.211271610510303,
"learning_rate": 1.5418502202643171e-06,
"loss": -0.0385,
"step": 356
},
{
"epoch": 0.25409252669039145,
"grad_norm": 7.959437774662272,
"learning_rate": 1.5403817914831129e-06,
"loss": 0.1653,
"step": 357
},
{
"epoch": 0.25480427046263343,
"grad_norm": 10.470130943200923,
"learning_rate": 1.5389133627019089e-06,
"loss": 0.4195,
"step": 358
},
{
"epoch": 0.25551601423487547,
"grad_norm": 6.235314472897717,
"learning_rate": 1.5374449339207048e-06,
"loss": 0.0351,
"step": 359
},
{
"epoch": 0.25622775800711745,
"grad_norm": 18.27060737890617,
"learning_rate": 1.5359765051395006e-06,
"loss": 0.1399,
"step": 360
},
{
"epoch": 0.2569395017793594,
"grad_norm": 6.088106486287744,
"learning_rate": 1.5345080763582966e-06,
"loss": 0.1531,
"step": 361
},
{
"epoch": 0.2576512455516014,
"grad_norm": 5.902550563054421,
"learning_rate": 1.5330396475770923e-06,
"loss": 0.0878,
"step": 362
},
{
"epoch": 0.25836298932384344,
"grad_norm": 3.8566639138632017,
"learning_rate": 1.5315712187958883e-06,
"loss": 0.1211,
"step": 363
},
{
"epoch": 0.2590747330960854,
"grad_norm": 3.6802199316647317,
"learning_rate": 1.530102790014684e-06,
"loss": 0.0758,
"step": 364
},
{
"epoch": 0.2597864768683274,
"grad_norm": 7.416574999067755,
"learning_rate": 1.52863436123348e-06,
"loss": 0.0767,
"step": 365
},
{
"epoch": 0.2604982206405694,
"grad_norm": 17.2285305706842,
"learning_rate": 1.527165932452276e-06,
"loss": 0.1486,
"step": 366
},
{
"epoch": 0.2612099644128114,
"grad_norm": 4.5616956027842255,
"learning_rate": 1.525697503671072e-06,
"loss": 0.0447,
"step": 367
},
{
"epoch": 0.2619217081850534,
"grad_norm": 3.569935985386615,
"learning_rate": 1.5242290748898678e-06,
"loss": 0.0119,
"step": 368
},
{
"epoch": 0.26263345195729537,
"grad_norm": 5.312914843350158,
"learning_rate": 1.5227606461086638e-06,
"loss": 0.0733,
"step": 369
},
{
"epoch": 0.26334519572953735,
"grad_norm": 4.058544639606025,
"learning_rate": 1.5212922173274595e-06,
"loss": 0.1414,
"step": 370
},
{
"epoch": 0.2640569395017794,
"grad_norm": 5.341206620154259,
"learning_rate": 1.5198237885462555e-06,
"loss": 0.1118,
"step": 371
},
{
"epoch": 0.26476868327402137,
"grad_norm": 5.4139085447762385,
"learning_rate": 1.5183553597650513e-06,
"loss": 0.0318,
"step": 372
},
{
"epoch": 0.26548042704626335,
"grad_norm": 5.787271858480889,
"learning_rate": 1.5168869309838473e-06,
"loss": 0.1029,
"step": 373
},
{
"epoch": 0.2661921708185053,
"grad_norm": 5.893029326858708,
"learning_rate": 1.515418502202643e-06,
"loss": 0.051,
"step": 374
},
{
"epoch": 0.2669039145907473,
"grad_norm": 5.2863059136227015,
"learning_rate": 1.513950073421439e-06,
"loss": 0.0402,
"step": 375
},
{
"epoch": 0.26761565836298934,
"grad_norm": 4.423267805643734,
"learning_rate": 1.512481644640235e-06,
"loss": 0.114,
"step": 376
},
{
"epoch": 0.2683274021352313,
"grad_norm": 3.5517169990953104,
"learning_rate": 1.5110132158590307e-06,
"loss": -0.0008,
"step": 377
},
{
"epoch": 0.2690391459074733,
"grad_norm": 3.0540230655261307,
"learning_rate": 1.5095447870778267e-06,
"loss": -0.0426,
"step": 378
},
{
"epoch": 0.2697508896797153,
"grad_norm": 4.542990162110976,
"learning_rate": 1.5080763582966225e-06,
"loss": 0.1026,
"step": 379
},
{
"epoch": 0.2704626334519573,
"grad_norm": 4.400804550133596,
"learning_rate": 1.5066079295154185e-06,
"loss": 0.132,
"step": 380
},
{
"epoch": 0.2711743772241993,
"grad_norm": 4.707977326473843,
"learning_rate": 1.5051395007342142e-06,
"loss": 0.0669,
"step": 381
},
{
"epoch": 0.27188612099644127,
"grad_norm": 5.231473477237746,
"learning_rate": 1.5036710719530102e-06,
"loss": 0.0588,
"step": 382
},
{
"epoch": 0.27259786476868325,
"grad_norm": 4.141758025995536,
"learning_rate": 1.502202643171806e-06,
"loss": 0.0349,
"step": 383
},
{
"epoch": 0.2733096085409253,
"grad_norm": 3.417607123726839,
"learning_rate": 1.500734214390602e-06,
"loss": -0.0088,
"step": 384
},
{
"epoch": 0.27402135231316727,
"grad_norm": 6.084464642962031,
"learning_rate": 1.499265785609398e-06,
"loss": 0.114,
"step": 385
},
{
"epoch": 0.27473309608540925,
"grad_norm": 9.337398551992413,
"learning_rate": 1.497797356828194e-06,
"loss": 0.041,
"step": 386
},
{
"epoch": 0.2754448398576512,
"grad_norm": 6.293725828334604,
"learning_rate": 1.4963289280469897e-06,
"loss": 0.1886,
"step": 387
},
{
"epoch": 0.27615658362989326,
"grad_norm": 2.8274617128621595,
"learning_rate": 1.4948604992657856e-06,
"loss": -0.0402,
"step": 388
},
{
"epoch": 0.27686832740213524,
"grad_norm": 6.682761251392663,
"learning_rate": 1.4933920704845814e-06,
"loss": 0.0407,
"step": 389
},
{
"epoch": 0.2775800711743772,
"grad_norm": 5.532712064282298,
"learning_rate": 1.4919236417033774e-06,
"loss": 0.0822,
"step": 390
},
{
"epoch": 0.2782918149466192,
"grad_norm": 6.354605270840461,
"learning_rate": 1.4904552129221731e-06,
"loss": 0.2162,
"step": 391
},
{
"epoch": 0.27900355871886123,
"grad_norm": 4.605656562580033,
"learning_rate": 1.4889867841409691e-06,
"loss": 0.0018,
"step": 392
},
{
"epoch": 0.2797153024911032,
"grad_norm": 5.623376400067919,
"learning_rate": 1.4875183553597649e-06,
"loss": 0.1569,
"step": 393
},
{
"epoch": 0.2804270462633452,
"grad_norm": 3.9692180687934897,
"learning_rate": 1.4860499265785609e-06,
"loss": -0.0416,
"step": 394
},
{
"epoch": 0.28113879003558717,
"grad_norm": 7.423001302286039,
"learning_rate": 1.4845814977973568e-06,
"loss": 0.026,
"step": 395
},
{
"epoch": 0.2818505338078292,
"grad_norm": 8.085510065695358,
"learning_rate": 1.4831130690161526e-06,
"loss": 0.0845,
"step": 396
},
{
"epoch": 0.2825622775800712,
"grad_norm": 4.9124011701975245,
"learning_rate": 1.4816446402349486e-06,
"loss": 0.0907,
"step": 397
},
{
"epoch": 0.28327402135231317,
"grad_norm": 4.4743904849478655,
"learning_rate": 1.4801762114537444e-06,
"loss": 0.0154,
"step": 398
},
{
"epoch": 0.28398576512455515,
"grad_norm": 4.688328173910629,
"learning_rate": 1.4787077826725403e-06,
"loss": 0.0273,
"step": 399
},
{
"epoch": 0.2846975088967972,
"grad_norm": 6.729777242479757,
"learning_rate": 1.477239353891336e-06,
"loss": 0.1049,
"step": 400
},
{
"epoch": 0.28540925266903916,
"grad_norm": 3.998078838569102,
"learning_rate": 1.475770925110132e-06,
"loss": 0.0952,
"step": 401
},
{
"epoch": 0.28612099644128114,
"grad_norm": 3.632400325239873,
"learning_rate": 1.4743024963289278e-06,
"loss": 0.0122,
"step": 402
},
{
"epoch": 0.2868327402135231,
"grad_norm": 5.621302597481837,
"learning_rate": 1.4728340675477238e-06,
"loss": 0.1393,
"step": 403
},
{
"epoch": 0.2875444839857651,
"grad_norm": 3.4926245499112936,
"learning_rate": 1.4713656387665198e-06,
"loss": -0.0072,
"step": 404
},
{
"epoch": 0.28825622775800713,
"grad_norm": 6.871401868269233,
"learning_rate": 1.4698972099853158e-06,
"loss": 0.0845,
"step": 405
},
{
"epoch": 0.2889679715302491,
"grad_norm": 7.371299203052198,
"learning_rate": 1.4684287812041115e-06,
"loss": 0.1009,
"step": 406
},
{
"epoch": 0.2896797153024911,
"grad_norm": 5.3878488662242034,
"learning_rate": 1.4669603524229075e-06,
"loss": 0.0183,
"step": 407
},
{
"epoch": 0.29039145907473307,
"grad_norm": 6.202672985144754,
"learning_rate": 1.4654919236417033e-06,
"loss": -0.0093,
"step": 408
},
{
"epoch": 0.2911032028469751,
"grad_norm": 4.22439807463946,
"learning_rate": 1.4640234948604993e-06,
"loss": -0.0559,
"step": 409
},
{
"epoch": 0.2918149466192171,
"grad_norm": 5.732779881282407,
"learning_rate": 1.462555066079295e-06,
"loss": 0.1191,
"step": 410
},
{
"epoch": 0.29252669039145907,
"grad_norm": 4.403590705236258,
"learning_rate": 1.461086637298091e-06,
"loss": 0.0391,
"step": 411
},
{
"epoch": 0.29323843416370104,
"grad_norm": 7.371522503842622,
"learning_rate": 1.459618208516887e-06,
"loss": 0.2122,
"step": 412
},
{
"epoch": 0.2939501779359431,
"grad_norm": 4.063849826456104,
"learning_rate": 1.4581497797356827e-06,
"loss": -0.0325,
"step": 413
},
{
"epoch": 0.29466192170818506,
"grad_norm": 6.537322657205629,
"learning_rate": 1.4566813509544787e-06,
"loss": 0.1573,
"step": 414
},
{
"epoch": 0.29537366548042704,
"grad_norm": 3.9070658327741112,
"learning_rate": 1.4552129221732745e-06,
"loss": 0.0689,
"step": 415
},
{
"epoch": 0.296085409252669,
"grad_norm": 6.406279177715566,
"learning_rate": 1.4537444933920705e-06,
"loss": 0.0146,
"step": 416
},
{
"epoch": 0.29679715302491105,
"grad_norm": 5.901355794529173,
"learning_rate": 1.4522760646108662e-06,
"loss": 0.046,
"step": 417
},
{
"epoch": 0.29750889679715303,
"grad_norm": 3.850309156415311,
"learning_rate": 1.4508076358296622e-06,
"loss": 0.0394,
"step": 418
},
{
"epoch": 0.298220640569395,
"grad_norm": 3.258587568922452,
"learning_rate": 1.449339207048458e-06,
"loss": 0.1155,
"step": 419
},
{
"epoch": 0.298932384341637,
"grad_norm": 3.745109038860105,
"learning_rate": 1.447870778267254e-06,
"loss": 0.0726,
"step": 420
},
{
"epoch": 0.299644128113879,
"grad_norm": 4.505449386351336,
"learning_rate": 1.4464023494860497e-06,
"loss": 0.0293,
"step": 421
},
{
"epoch": 0.300355871886121,
"grad_norm": 5.907561625814238,
"learning_rate": 1.4449339207048457e-06,
"loss": 0.1216,
"step": 422
},
{
"epoch": 0.301067615658363,
"grad_norm": 4.612488189930256,
"learning_rate": 1.4434654919236415e-06,
"loss": 0.12,
"step": 423
},
{
"epoch": 0.30177935943060497,
"grad_norm": 6.018073200585667,
"learning_rate": 1.4419970631424377e-06,
"loss": -0.0622,
"step": 424
},
{
"epoch": 0.302491103202847,
"grad_norm": 3.359965143931201,
"learning_rate": 1.4405286343612334e-06,
"loss": -0.0557,
"step": 425
},
{
"epoch": 0.303202846975089,
"grad_norm": 4.530239650290313,
"learning_rate": 1.4390602055800294e-06,
"loss": 0.0155,
"step": 426
},
{
"epoch": 0.30391459074733096,
"grad_norm": 3.923181080766024,
"learning_rate": 1.4375917767988252e-06,
"loss": 0.0844,
"step": 427
},
{
"epoch": 0.30462633451957294,
"grad_norm": 3.5202703684229815,
"learning_rate": 1.4361233480176211e-06,
"loss": -0.0512,
"step": 428
},
{
"epoch": 0.305338078291815,
"grad_norm": 3.7198987196394206,
"learning_rate": 1.4346549192364171e-06,
"loss": 0.0487,
"step": 429
},
{
"epoch": 0.30604982206405695,
"grad_norm": 4.361700060061493,
"learning_rate": 1.4331864904552129e-06,
"loss": 0.0465,
"step": 430
},
{
"epoch": 0.30676156583629893,
"grad_norm": 4.3036197508138105,
"learning_rate": 1.4317180616740089e-06,
"loss": 0.0604,
"step": 431
},
{
"epoch": 0.3074733096085409,
"grad_norm": 5.652510613520501,
"learning_rate": 1.4302496328928046e-06,
"loss": 0.1603,
"step": 432
},
{
"epoch": 0.3081850533807829,
"grad_norm": 4.460907997795395,
"learning_rate": 1.4287812041116006e-06,
"loss": 0.0773,
"step": 433
},
{
"epoch": 0.3088967971530249,
"grad_norm": 6.627971054944022,
"learning_rate": 1.4273127753303964e-06,
"loss": 0.1324,
"step": 434
},
{
"epoch": 0.3096085409252669,
"grad_norm": 6.483300692256294,
"learning_rate": 1.4258443465491923e-06,
"loss": 0.1187,
"step": 435
},
{
"epoch": 0.3103202846975089,
"grad_norm": 3.818791810351555,
"learning_rate": 1.4243759177679881e-06,
"loss": 0.1222,
"step": 436
},
{
"epoch": 0.31103202846975087,
"grad_norm": 4.205532589817094,
"learning_rate": 1.422907488986784e-06,
"loss": 0.04,
"step": 437
},
{
"epoch": 0.3117437722419929,
"grad_norm": 4.138905524462921,
"learning_rate": 1.4214390602055799e-06,
"loss": 0.1356,
"step": 438
},
{
"epoch": 0.3124555160142349,
"grad_norm": 4.560324163124626,
"learning_rate": 1.4199706314243758e-06,
"loss": -0.001,
"step": 439
},
{
"epoch": 0.31316725978647686,
"grad_norm": 5.088704049660316,
"learning_rate": 1.4185022026431716e-06,
"loss": 0.0828,
"step": 440
},
{
"epoch": 0.31387900355871884,
"grad_norm": 3.643900825951513,
"learning_rate": 1.4170337738619676e-06,
"loss": -0.0633,
"step": 441
},
{
"epoch": 0.3145907473309609,
"grad_norm": 12.367839552654106,
"learning_rate": 1.4155653450807633e-06,
"loss": 0.2273,
"step": 442
},
{
"epoch": 0.31530249110320285,
"grad_norm": 3.035800153655871,
"learning_rate": 1.4140969162995595e-06,
"loss": -0.0196,
"step": 443
},
{
"epoch": 0.31601423487544483,
"grad_norm": 18.763915734499722,
"learning_rate": 1.4126284875183553e-06,
"loss": 0.0782,
"step": 444
},
{
"epoch": 0.3167259786476868,
"grad_norm": 2.571197728361492,
"learning_rate": 1.4111600587371513e-06,
"loss": -0.0196,
"step": 445
},
{
"epoch": 0.31743772241992885,
"grad_norm": 5.409603600214178,
"learning_rate": 1.4096916299559472e-06,
"loss": 0.1013,
"step": 446
},
{
"epoch": 0.3181494661921708,
"grad_norm": 5.796783528801567,
"learning_rate": 1.408223201174743e-06,
"loss": 0.085,
"step": 447
},
{
"epoch": 0.3188612099644128,
"grad_norm": 5.821264734394937,
"learning_rate": 1.406754772393539e-06,
"loss": 0.0568,
"step": 448
},
{
"epoch": 0.3195729537366548,
"grad_norm": 3.001625097715217,
"learning_rate": 1.4052863436123348e-06,
"loss": 0.0876,
"step": 449
},
{
"epoch": 0.3202846975088968,
"grad_norm": 4.7979999428447355,
"learning_rate": 1.4038179148311307e-06,
"loss": 0.1001,
"step": 450
},
{
"epoch": 0.3209964412811388,
"grad_norm": 6.605368263206687,
"learning_rate": 1.4023494860499265e-06,
"loss": 0.1632,
"step": 451
},
{
"epoch": 0.3217081850533808,
"grad_norm": 3.502035638929594,
"learning_rate": 1.4008810572687225e-06,
"loss": -0.0367,
"step": 452
},
{
"epoch": 0.32241992882562276,
"grad_norm": 9.002808450564668,
"learning_rate": 1.3994126284875182e-06,
"loss": 0.0653,
"step": 453
},
{
"epoch": 0.3231316725978648,
"grad_norm": 5.094547700731088,
"learning_rate": 1.3979441997063142e-06,
"loss": 0.0839,
"step": 454
},
{
"epoch": 0.3238434163701068,
"grad_norm": 5.66990673708365,
"learning_rate": 1.39647577092511e-06,
"loss": 0.0375,
"step": 455
},
{
"epoch": 0.32455516014234875,
"grad_norm": 7.343498323064397,
"learning_rate": 1.395007342143906e-06,
"loss": 0.0998,
"step": 456
},
{
"epoch": 0.32526690391459073,
"grad_norm": 8.127266905066636,
"learning_rate": 1.3935389133627017e-06,
"loss": 0.1422,
"step": 457
},
{
"epoch": 0.32597864768683277,
"grad_norm": 3.800517277896503,
"learning_rate": 1.3920704845814977e-06,
"loss": -0.0181,
"step": 458
},
{
"epoch": 0.32669039145907475,
"grad_norm": 5.43134160815067,
"learning_rate": 1.3906020558002935e-06,
"loss": 0.0497,
"step": 459
},
{
"epoch": 0.3274021352313167,
"grad_norm": 3.8471828111327633,
"learning_rate": 1.3891336270190894e-06,
"loss": 0.0897,
"step": 460
},
{
"epoch": 0.3281138790035587,
"grad_norm": 6.528410517911757,
"learning_rate": 1.3876651982378852e-06,
"loss": -0.0511,
"step": 461
},
{
"epoch": 0.3288256227758007,
"grad_norm": 4.252585353992694,
"learning_rate": 1.3861967694566812e-06,
"loss": 0.0951,
"step": 462
},
{
"epoch": 0.3295373665480427,
"grad_norm": 8.229788789692552,
"learning_rate": 1.3847283406754774e-06,
"loss": 0.0933,
"step": 463
},
{
"epoch": 0.3302491103202847,
"grad_norm": 4.1539330011510485,
"learning_rate": 1.3832599118942731e-06,
"loss": 0.023,
"step": 464
},
{
"epoch": 0.3309608540925267,
"grad_norm": 4.6663483556543826,
"learning_rate": 1.3817914831130691e-06,
"loss": 0.007,
"step": 465
},
{
"epoch": 0.33167259786476866,
"grad_norm": 4.669672111746775,
"learning_rate": 1.3803230543318649e-06,
"loss": 0.0893,
"step": 466
},
{
"epoch": 0.3323843416370107,
"grad_norm": 5.133795657125543,
"learning_rate": 1.3788546255506609e-06,
"loss": 0.1242,
"step": 467
},
{
"epoch": 0.3330960854092527,
"grad_norm": 5.475945496459967,
"learning_rate": 1.3773861967694566e-06,
"loss": 0.1022,
"step": 468
},
{
"epoch": 0.33380782918149465,
"grad_norm": 4.605885985857843,
"learning_rate": 1.3759177679882526e-06,
"loss": 0.1267,
"step": 469
},
{
"epoch": 0.33451957295373663,
"grad_norm": 3.0454480081847795,
"learning_rate": 1.3744493392070484e-06,
"loss": 0.0601,
"step": 470
},
{
"epoch": 0.33523131672597867,
"grad_norm": 4.401587000661199,
"learning_rate": 1.3729809104258444e-06,
"loss": 0.0515,
"step": 471
},
{
"epoch": 0.33594306049822065,
"grad_norm": 13.88100598913882,
"learning_rate": 1.3715124816446401e-06,
"loss": 0.0645,
"step": 472
},
{
"epoch": 0.3366548042704626,
"grad_norm": 5.4928368826174845,
"learning_rate": 1.370044052863436e-06,
"loss": 0.079,
"step": 473
},
{
"epoch": 0.3373665480427046,
"grad_norm": 4.3823646683805215,
"learning_rate": 1.3685756240822319e-06,
"loss": 0.0301,
"step": 474
},
{
"epoch": 0.33807829181494664,
"grad_norm": 5.159143918902839,
"learning_rate": 1.3671071953010278e-06,
"loss": 0.0445,
"step": 475
},
{
"epoch": 0.3387900355871886,
"grad_norm": 5.394407322017402,
"learning_rate": 1.3656387665198236e-06,
"loss": -0.0628,
"step": 476
},
{
"epoch": 0.3395017793594306,
"grad_norm": 5.8944791534932754,
"learning_rate": 1.3641703377386196e-06,
"loss": 0.1409,
"step": 477
},
{
"epoch": 0.3402135231316726,
"grad_norm": 3.5826852049675066,
"learning_rate": 1.3627019089574153e-06,
"loss": 0.0186,
"step": 478
},
{
"epoch": 0.3409252669039146,
"grad_norm": 8.927906783297527,
"learning_rate": 1.3612334801762113e-06,
"loss": 0.1988,
"step": 479
},
{
"epoch": 0.3416370106761566,
"grad_norm": 3.6064993892760655,
"learning_rate": 1.359765051395007e-06,
"loss": -0.0085,
"step": 480
},
{
"epoch": 0.3423487544483986,
"grad_norm": 5.465970450161264,
"learning_rate": 1.358296622613803e-06,
"loss": 0.018,
"step": 481
},
{
"epoch": 0.34306049822064055,
"grad_norm": 5.639957969725356,
"learning_rate": 1.3568281938325993e-06,
"loss": 0.0933,
"step": 482
},
{
"epoch": 0.3437722419928826,
"grad_norm": 5.134464735383474,
"learning_rate": 1.355359765051395e-06,
"loss": 0.0109,
"step": 483
},
{
"epoch": 0.34448398576512457,
"grad_norm": 6.339479408912485,
"learning_rate": 1.353891336270191e-06,
"loss": 0.1694,
"step": 484
},
{
"epoch": 0.34519572953736655,
"grad_norm": 5.629471252167426,
"learning_rate": 1.3524229074889868e-06,
"loss": 0.1749,
"step": 485
},
{
"epoch": 0.3459074733096085,
"grad_norm": 4.1844924830495565,
"learning_rate": 1.3509544787077827e-06,
"loss": 0.0548,
"step": 486
},
{
"epoch": 0.34661921708185056,
"grad_norm": 3.81599741125119,
"learning_rate": 1.3494860499265785e-06,
"loss": 0.0581,
"step": 487
},
{
"epoch": 0.34733096085409254,
"grad_norm": 3.82408146519255,
"learning_rate": 1.3480176211453745e-06,
"loss": -0.0245,
"step": 488
},
{
"epoch": 0.3480427046263345,
"grad_norm": 4.118872049980593,
"learning_rate": 1.3465491923641703e-06,
"loss": -0.0963,
"step": 489
},
{
"epoch": 0.3487544483985765,
"grad_norm": 6.1088792058255,
"learning_rate": 1.3450807635829662e-06,
"loss": 0.0747,
"step": 490
},
{
"epoch": 0.3494661921708185,
"grad_norm": 6.388375406316636,
"learning_rate": 1.343612334801762e-06,
"loss": -0.0349,
"step": 491
},
{
"epoch": 0.3501779359430605,
"grad_norm": 5.8039027796462905,
"learning_rate": 1.342143906020558e-06,
"loss": 0.0393,
"step": 492
},
{
"epoch": 0.3508896797153025,
"grad_norm": 4.94103529672343,
"learning_rate": 1.3406754772393537e-06,
"loss": 0.0246,
"step": 493
},
{
"epoch": 0.3516014234875445,
"grad_norm": 4.054831441628558,
"learning_rate": 1.3392070484581497e-06,
"loss": -0.0507,
"step": 494
},
{
"epoch": 0.35231316725978645,
"grad_norm": 3.061671242195688,
"learning_rate": 1.3377386196769455e-06,
"loss": 0.0015,
"step": 495
},
{
"epoch": 0.3530249110320285,
"grad_norm": 6.231153836747014,
"learning_rate": 1.3362701908957415e-06,
"loss": 0.1093,
"step": 496
},
{
"epoch": 0.35373665480427047,
"grad_norm": 4.803948868211813,
"learning_rate": 1.3348017621145372e-06,
"loss": -0.0375,
"step": 497
},
{
"epoch": 0.35444839857651245,
"grad_norm": 7.2124323426879755,
"learning_rate": 1.3333333333333332e-06,
"loss": 0.0418,
"step": 498
},
{
"epoch": 0.3551601423487544,
"grad_norm": 12.051137049042497,
"learning_rate": 1.3318649045521292e-06,
"loss": 0.0088,
"step": 499
},
{
"epoch": 0.35587188612099646,
"grad_norm": 5.94286256858033,
"learning_rate": 1.330396475770925e-06,
"loss": 0.1674,
"step": 500
},
{
"epoch": 0.35658362989323844,
"grad_norm": 4.965540395103566,
"learning_rate": 1.328928046989721e-06,
"loss": 0.0542,
"step": 501
},
{
"epoch": 0.3572953736654804,
"grad_norm": 4.590895615840333,
"learning_rate": 1.327459618208517e-06,
"loss": 0.0241,
"step": 502
},
{
"epoch": 0.3580071174377224,
"grad_norm": 5.8626146642862595,
"learning_rate": 1.3259911894273129e-06,
"loss": -0.0145,
"step": 503
},
{
"epoch": 0.35871886120996443,
"grad_norm": 4.977928656796741,
"learning_rate": 1.3245227606461086e-06,
"loss": 0.0542,
"step": 504
},
{
"epoch": 0.3594306049822064,
"grad_norm": 3.974031673502917,
"learning_rate": 1.3230543318649046e-06,
"loss": 0.0275,
"step": 505
},
{
"epoch": 0.3601423487544484,
"grad_norm": 5.703999157106998,
"learning_rate": 1.3215859030837004e-06,
"loss": 0.0349,
"step": 506
},
{
"epoch": 0.3608540925266904,
"grad_norm": 4.052662367583191,
"learning_rate": 1.3201174743024964e-06,
"loss": 0.0043,
"step": 507
},
{
"epoch": 0.3615658362989324,
"grad_norm": 3.643033258471114,
"learning_rate": 1.3186490455212921e-06,
"loss": 0.1718,
"step": 508
},
{
"epoch": 0.3622775800711744,
"grad_norm": 5.142755038140959,
"learning_rate": 1.3171806167400881e-06,
"loss": 0.1742,
"step": 509
},
{
"epoch": 0.36298932384341637,
"grad_norm": 5.241172817397939,
"learning_rate": 1.3157121879588839e-06,
"loss": 0.0968,
"step": 510
},
{
"epoch": 0.36370106761565835,
"grad_norm": 5.1078352635046445,
"learning_rate": 1.3142437591776798e-06,
"loss": 0.1197,
"step": 511
},
{
"epoch": 0.3644128113879004,
"grad_norm": 6.225810427937807,
"learning_rate": 1.3127753303964756e-06,
"loss": 0.1229,
"step": 512
},
{
"epoch": 0.36512455516014236,
"grad_norm": 6.7014735750723196,
"learning_rate": 1.3113069016152716e-06,
"loss": 0.12,
"step": 513
},
{
"epoch": 0.36583629893238434,
"grad_norm": 4.627706612086469,
"learning_rate": 1.3098384728340674e-06,
"loss": 0.0387,
"step": 514
},
{
"epoch": 0.3665480427046263,
"grad_norm": 4.634265382361618,
"learning_rate": 1.3083700440528633e-06,
"loss": 0.068,
"step": 515
},
{
"epoch": 0.36725978647686836,
"grad_norm": 7.336944153509603,
"learning_rate": 1.3069016152716593e-06,
"loss": 0.044,
"step": 516
},
{
"epoch": 0.36797153024911033,
"grad_norm": 5.110609552864615,
"learning_rate": 1.305433186490455e-06,
"loss": 0.0432,
"step": 517
},
{
"epoch": 0.3686832740213523,
"grad_norm": 5.7487944292813875,
"learning_rate": 1.303964757709251e-06,
"loss": 0.0086,
"step": 518
},
{
"epoch": 0.3693950177935943,
"grad_norm": 3.9281649664133167,
"learning_rate": 1.3024963289280468e-06,
"loss": -0.0906,
"step": 519
},
{
"epoch": 0.3701067615658363,
"grad_norm": 5.635784576684936,
"learning_rate": 1.3010279001468428e-06,
"loss": 0.0388,
"step": 520
},
{
"epoch": 0.3708185053380783,
"grad_norm": 4.504403185332134,
"learning_rate": 1.2995594713656388e-06,
"loss": 0.0674,
"step": 521
},
{
"epoch": 0.3715302491103203,
"grad_norm": 5.796118838611627,
"learning_rate": 1.2980910425844348e-06,
"loss": 0.0867,
"step": 522
},
{
"epoch": 0.37224199288256227,
"grad_norm": 3.7423957018184564,
"learning_rate": 1.2966226138032305e-06,
"loss": 0.1033,
"step": 523
},
{
"epoch": 0.37295373665480425,
"grad_norm": 4.653161989863049,
"learning_rate": 1.2951541850220265e-06,
"loss": 0.1115,
"step": 524
},
{
"epoch": 0.3736654804270463,
"grad_norm": 5.487569286282674,
"learning_rate": 1.2936857562408223e-06,
"loss": 0.0015,
"step": 525
},
{
"epoch": 0.37437722419928826,
"grad_norm": 3.660972349527697,
"learning_rate": 1.2922173274596182e-06,
"loss": -0.0158,
"step": 526
},
{
"epoch": 0.37508896797153024,
"grad_norm": 14.386677916212754,
"learning_rate": 1.290748898678414e-06,
"loss": 0.1662,
"step": 527
},
{
"epoch": 0.3758007117437722,
"grad_norm": 4.804991772949289,
"learning_rate": 1.28928046989721e-06,
"loss": 0.1037,
"step": 528
},
{
"epoch": 0.37651245551601425,
"grad_norm": 4.285523891761131,
"learning_rate": 1.2878120411160057e-06,
"loss": 0.1347,
"step": 529
},
{
"epoch": 0.37722419928825623,
"grad_norm": 8.680000416276632,
"learning_rate": 1.2863436123348017e-06,
"loss": 0.1935,
"step": 530
},
{
"epoch": 0.3779359430604982,
"grad_norm": 6.3227238717346435,
"learning_rate": 1.2848751835535975e-06,
"loss": 0.1116,
"step": 531
},
{
"epoch": 0.3786476868327402,
"grad_norm": 4.369406257640907,
"learning_rate": 1.2834067547723935e-06,
"loss": 0.0217,
"step": 532
},
{
"epoch": 0.37935943060498223,
"grad_norm": 3.511049947393694,
"learning_rate": 1.2819383259911892e-06,
"loss": 0.0571,
"step": 533
},
{
"epoch": 0.3800711743772242,
"grad_norm": 3.4769751828887587,
"learning_rate": 1.2804698972099852e-06,
"loss": 0.0599,
"step": 534
},
{
"epoch": 0.3807829181494662,
"grad_norm": 4.425270529198389,
"learning_rate": 1.2790014684287812e-06,
"loss": -0.0344,
"step": 535
},
{
"epoch": 0.38149466192170817,
"grad_norm": 4.27135554896897,
"learning_rate": 1.277533039647577e-06,
"loss": 0.0781,
"step": 536
},
{
"epoch": 0.3822064056939502,
"grad_norm": 4.215674289333216,
"learning_rate": 1.276064610866373e-06,
"loss": 0.0871,
"step": 537
},
{
"epoch": 0.3829181494661922,
"grad_norm": 6.4279361532063515,
"learning_rate": 1.2745961820851687e-06,
"loss": 0.1274,
"step": 538
},
{
"epoch": 0.38362989323843416,
"grad_norm": 2.9956366278559794,
"learning_rate": 1.2731277533039647e-06,
"loss": 0.0482,
"step": 539
},
{
"epoch": 0.38434163701067614,
"grad_norm": 2.833506148808736,
"learning_rate": 1.2716593245227604e-06,
"loss": 0.0121,
"step": 540
},
{
"epoch": 0.3850533807829182,
"grad_norm": 4.398064985474303,
"learning_rate": 1.2701908957415566e-06,
"loss": 0.0557,
"step": 541
},
{
"epoch": 0.38576512455516015,
"grad_norm": 4.301774106985314,
"learning_rate": 1.2687224669603524e-06,
"loss": 0.1388,
"step": 542
},
{
"epoch": 0.38647686832740213,
"grad_norm": 4.016092829782654,
"learning_rate": 1.2672540381791484e-06,
"loss": -0.0178,
"step": 543
},
{
"epoch": 0.3871886120996441,
"grad_norm": 4.230056193093481,
"learning_rate": 1.2657856093979441e-06,
"loss": 0.0911,
"step": 544
},
{
"epoch": 0.3879003558718861,
"grad_norm": 4.648048910359669,
"learning_rate": 1.2643171806167401e-06,
"loss": 0.0079,
"step": 545
},
{
"epoch": 0.38861209964412813,
"grad_norm": 6.046269980431698,
"learning_rate": 1.2628487518355359e-06,
"loss": 0.01,
"step": 546
},
{
"epoch": 0.3893238434163701,
"grad_norm": 5.598650442721456,
"learning_rate": 1.2613803230543319e-06,
"loss": 0.1025,
"step": 547
},
{
"epoch": 0.3900355871886121,
"grad_norm": 4.154987344183164,
"learning_rate": 1.2599118942731276e-06,
"loss": 0.073,
"step": 548
},
{
"epoch": 0.39074733096085407,
"grad_norm": 5.151314702783577,
"learning_rate": 1.2584434654919236e-06,
"loss": 0.1204,
"step": 549
},
{
"epoch": 0.3914590747330961,
"grad_norm": 4.410394761804895,
"learning_rate": 1.2569750367107194e-06,
"loss": 0.0851,
"step": 550
},
{
"epoch": 0.3921708185053381,
"grad_norm": 4.99583108672324,
"learning_rate": 1.2555066079295153e-06,
"loss": 0.0927,
"step": 551
},
{
"epoch": 0.39288256227758006,
"grad_norm": 5.8725775736382095,
"learning_rate": 1.2540381791483113e-06,
"loss": 0.2121,
"step": 552
},
{
"epoch": 0.39359430604982204,
"grad_norm": 4.747630682859618,
"learning_rate": 1.252569750367107e-06,
"loss": -0.0295,
"step": 553
},
{
"epoch": 0.3943060498220641,
"grad_norm": 9.183776100778932,
"learning_rate": 1.251101321585903e-06,
"loss": 0.3017,
"step": 554
},
{
"epoch": 0.39501779359430605,
"grad_norm": 6.072579411018435,
"learning_rate": 1.2496328928046988e-06,
"loss": 0.1708,
"step": 555
},
{
"epoch": 0.39572953736654803,
"grad_norm": 4.213015126956102,
"learning_rate": 1.2481644640234948e-06,
"loss": 0.09,
"step": 556
},
{
"epoch": 0.39644128113879,
"grad_norm": 7.692972607852313,
"learning_rate": 1.2466960352422906e-06,
"loss": 0.0703,
"step": 557
},
{
"epoch": 0.39715302491103205,
"grad_norm": 3.336700137869538,
"learning_rate": 1.2452276064610866e-06,
"loss": 0.0718,
"step": 558
},
{
"epoch": 0.39786476868327403,
"grad_norm": 3.437064819532712,
"learning_rate": 1.2437591776798823e-06,
"loss": 0.1232,
"step": 559
},
{
"epoch": 0.398576512455516,
"grad_norm": 4.752532484654912,
"learning_rate": 1.2422907488986785e-06,
"loss": -0.0222,
"step": 560
},
{
"epoch": 0.399288256227758,
"grad_norm": 5.671494316167505,
"learning_rate": 1.2408223201174743e-06,
"loss": 0.1919,
"step": 561
},
{
"epoch": 0.4,
"grad_norm": 4.530732644899779,
"learning_rate": 1.2393538913362703e-06,
"loss": 0.0283,
"step": 562
},
{
"epoch": 0.400711743772242,
"grad_norm": 8.233660614468986,
"learning_rate": 1.237885462555066e-06,
"loss": 0.0892,
"step": 563
},
{
"epoch": 0.401423487544484,
"grad_norm": 9.502726514967256,
"learning_rate": 1.236417033773862e-06,
"loss": 0.0579,
"step": 564
},
{
"epoch": 0.40213523131672596,
"grad_norm": 3.5236594390328295,
"learning_rate": 1.2349486049926578e-06,
"loss": 0.0199,
"step": 565
},
{
"epoch": 0.402846975088968,
"grad_norm": 5.95870557924922,
"learning_rate": 1.2334801762114537e-06,
"loss": 0.0288,
"step": 566
},
{
"epoch": 0.40355871886121,
"grad_norm": 7.876154981414457,
"learning_rate": 1.2320117474302495e-06,
"loss": 0.0504,
"step": 567
},
{
"epoch": 0.40427046263345195,
"grad_norm": 5.71977049680604,
"learning_rate": 1.2305433186490455e-06,
"loss": 0.0537,
"step": 568
},
{
"epoch": 0.40498220640569393,
"grad_norm": 5.252680866045221,
"learning_rate": 1.2290748898678415e-06,
"loss": -0.0052,
"step": 569
},
{
"epoch": 0.40569395017793597,
"grad_norm": 4.443807498007278,
"learning_rate": 1.2276064610866372e-06,
"loss": 0.0806,
"step": 570
},
{
"epoch": 0.40640569395017795,
"grad_norm": 4.114830428160394,
"learning_rate": 1.2261380323054332e-06,
"loss": 0.0369,
"step": 571
},
{
"epoch": 0.40711743772241993,
"grad_norm": 5.210110211222593,
"learning_rate": 1.224669603524229e-06,
"loss": 0.1161,
"step": 572
},
{
"epoch": 0.4078291814946619,
"grad_norm": 4.043897557109054,
"learning_rate": 1.223201174743025e-06,
"loss": -0.0277,
"step": 573
},
{
"epoch": 0.4085409252669039,
"grad_norm": 6.136939420563002,
"learning_rate": 1.2217327459618207e-06,
"loss": 0.1014,
"step": 574
},
{
"epoch": 0.4092526690391459,
"grad_norm": 5.27770281063622,
"learning_rate": 1.2202643171806167e-06,
"loss": 0.2431,
"step": 575
},
{
"epoch": 0.4099644128113879,
"grad_norm": 3.3599898374494637,
"learning_rate": 1.2187958883994125e-06,
"loss": 0.0897,
"step": 576
},
{
"epoch": 0.4106761565836299,
"grad_norm": 4.979236213911304,
"learning_rate": 1.2173274596182084e-06,
"loss": 0.0831,
"step": 577
},
{
"epoch": 0.41138790035587186,
"grad_norm": 4.575722750914581,
"learning_rate": 1.2158590308370042e-06,
"loss": 0.1251,
"step": 578
},
{
"epoch": 0.4120996441281139,
"grad_norm": 3.7621845658093855,
"learning_rate": 1.2143906020558002e-06,
"loss": -0.0291,
"step": 579
},
{
"epoch": 0.4128113879003559,
"grad_norm": 4.218297411342295,
"learning_rate": 1.2129221732745961e-06,
"loss": 0.1321,
"step": 580
},
{
"epoch": 0.41352313167259785,
"grad_norm": 3.879794323572462,
"learning_rate": 1.2114537444933921e-06,
"loss": 0.0918,
"step": 581
},
{
"epoch": 0.41423487544483983,
"grad_norm": 4.052563296464422,
"learning_rate": 1.2099853157121879e-06,
"loss": 0.0771,
"step": 582
},
{
"epoch": 0.41494661921708187,
"grad_norm": 4.85871887683837,
"learning_rate": 1.2085168869309839e-06,
"loss": 0.0863,
"step": 583
},
{
"epoch": 0.41565836298932385,
"grad_norm": 4.707446979825805,
"learning_rate": 1.2070484581497796e-06,
"loss": 0.0187,
"step": 584
},
{
"epoch": 0.41637010676156583,
"grad_norm": 3.8785600613390097,
"learning_rate": 1.2055800293685756e-06,
"loss": -0.0032,
"step": 585
},
{
"epoch": 0.4170818505338078,
"grad_norm": 4.765160109560956,
"learning_rate": 1.2041116005873716e-06,
"loss": 0.0677,
"step": 586
},
{
"epoch": 0.41779359430604984,
"grad_norm": 4.455527851930796,
"learning_rate": 1.2026431718061674e-06,
"loss": 0.111,
"step": 587
},
{
"epoch": 0.4185053380782918,
"grad_norm": 3.1973105155389394,
"learning_rate": 1.2011747430249633e-06,
"loss": 0.1831,
"step": 588
},
{
"epoch": 0.4192170818505338,
"grad_norm": 7.28943877562424,
"learning_rate": 1.199706314243759e-06,
"loss": 0.0971,
"step": 589
},
{
"epoch": 0.4199288256227758,
"grad_norm": 6.135188134626812,
"learning_rate": 1.198237885462555e-06,
"loss": 0.0729,
"step": 590
},
{
"epoch": 0.4206405693950178,
"grad_norm": 3.7048101655046723,
"learning_rate": 1.1967694566813508e-06,
"loss": -0.1257,
"step": 591
},
{
"epoch": 0.4213523131672598,
"grad_norm": 4.813423563533292,
"learning_rate": 1.1953010279001468e-06,
"loss": 0.0854,
"step": 592
},
{
"epoch": 0.4220640569395018,
"grad_norm": 3.208585076962423,
"learning_rate": 1.1938325991189426e-06,
"loss": 0.0278,
"step": 593
},
{
"epoch": 0.42277580071174375,
"grad_norm": 5.694992998368377,
"learning_rate": 1.1923641703377386e-06,
"loss": 0.122,
"step": 594
},
{
"epoch": 0.4234875444839858,
"grad_norm": 6.8821335283366505,
"learning_rate": 1.1908957415565343e-06,
"loss": 0.0321,
"step": 595
},
{
"epoch": 0.42419928825622777,
"grad_norm": 4.215796899366015,
"learning_rate": 1.1894273127753303e-06,
"loss": 0.0321,
"step": 596
},
{
"epoch": 0.42491103202846975,
"grad_norm": 5.316432439892324,
"learning_rate": 1.187958883994126e-06,
"loss": 0.068,
"step": 597
},
{
"epoch": 0.42562277580071173,
"grad_norm": 5.511432729500699,
"learning_rate": 1.186490455212922e-06,
"loss": 0.0614,
"step": 598
},
{
"epoch": 0.42633451957295376,
"grad_norm": 5.582771310939926,
"learning_rate": 1.185022026431718e-06,
"loss": 0.2272,
"step": 599
},
{
"epoch": 0.42704626334519574,
"grad_norm": 5.323291364746015,
"learning_rate": 1.183553597650514e-06,
"loss": 0.1055,
"step": 600
},
{
"epoch": 0.4277580071174377,
"grad_norm": 2.7981336095292777,
"learning_rate": 1.1820851688693098e-06,
"loss": 0.0135,
"step": 601
},
{
"epoch": 0.4284697508896797,
"grad_norm": 5.2514595546013,
"learning_rate": 1.1806167400881057e-06,
"loss": 0.1438,
"step": 602
},
{
"epoch": 0.4291814946619217,
"grad_norm": 4.8052648678120855,
"learning_rate": 1.1791483113069017e-06,
"loss": 0.0248,
"step": 603
},
{
"epoch": 0.4298932384341637,
"grad_norm": 4.272397432563753,
"learning_rate": 1.1776798825256975e-06,
"loss": 0.1677,
"step": 604
},
{
"epoch": 0.4306049822064057,
"grad_norm": 6.343124283623192,
"learning_rate": 1.1762114537444935e-06,
"loss": 0.1471,
"step": 605
},
{
"epoch": 0.4313167259786477,
"grad_norm": 13.126869867733665,
"learning_rate": 1.1747430249632892e-06,
"loss": 0.1401,
"step": 606
},
{
"epoch": 0.43202846975088965,
"grad_norm": 4.4220643559003765,
"learning_rate": 1.1732745961820852e-06,
"loss": 0.0717,
"step": 607
},
{
"epoch": 0.4327402135231317,
"grad_norm": 5.323938419745406,
"learning_rate": 1.171806167400881e-06,
"loss": 0.0522,
"step": 608
},
{
"epoch": 0.43345195729537367,
"grad_norm": 4.280517515464142,
"learning_rate": 1.170337738619677e-06,
"loss": -0.0424,
"step": 609
},
{
"epoch": 0.43416370106761565,
"grad_norm": 4.523195250354239,
"learning_rate": 1.1688693098384727e-06,
"loss": 0.0748,
"step": 610
},
{
"epoch": 0.43487544483985763,
"grad_norm": 3.867967315038678,
"learning_rate": 1.1674008810572687e-06,
"loss": -0.0254,
"step": 611
},
{
"epoch": 0.43558718861209966,
"grad_norm": 5.799611229856745,
"learning_rate": 1.1659324522760645e-06,
"loss": 0.0168,
"step": 612
},
{
"epoch": 0.43629893238434164,
"grad_norm": 5.017825585689327,
"learning_rate": 1.1644640234948604e-06,
"loss": -0.0632,
"step": 613
},
{
"epoch": 0.4370106761565836,
"grad_norm": 4.292392681648742,
"learning_rate": 1.1629955947136562e-06,
"loss": 0.1748,
"step": 614
},
{
"epoch": 0.4377224199288256,
"grad_norm": 5.436171095243215,
"learning_rate": 1.1615271659324522e-06,
"loss": 0.1117,
"step": 615
},
{
"epoch": 0.43843416370106764,
"grad_norm": 4.987884529606562,
"learning_rate": 1.160058737151248e-06,
"loss": -0.0053,
"step": 616
},
{
"epoch": 0.4391459074733096,
"grad_norm": 4.723339551994701,
"learning_rate": 1.158590308370044e-06,
"loss": 0.0186,
"step": 617
},
{
"epoch": 0.4398576512455516,
"grad_norm": 4.6346798200417,
"learning_rate": 1.15712187958884e-06,
"loss": 0.0858,
"step": 618
},
{
"epoch": 0.4405693950177936,
"grad_norm": 5.337842922227848,
"learning_rate": 1.1556534508076359e-06,
"loss": 0.052,
"step": 619
},
{
"epoch": 0.4412811387900356,
"grad_norm": 3.4759681450774633,
"learning_rate": 1.1541850220264319e-06,
"loss": 0.0147,
"step": 620
},
{
"epoch": 0.4419928825622776,
"grad_norm": 5.502678479043237,
"learning_rate": 1.1527165932452276e-06,
"loss": 0.0753,
"step": 621
},
{
"epoch": 0.44270462633451957,
"grad_norm": 3.188013970319323,
"learning_rate": 1.1512481644640236e-06,
"loss": 0.0928,
"step": 622
},
{
"epoch": 0.44341637010676155,
"grad_norm": 5.628840512931552,
"learning_rate": 1.1497797356828194e-06,
"loss": 0.1731,
"step": 623
},
{
"epoch": 0.4441281138790036,
"grad_norm": 5.2497544004123124,
"learning_rate": 1.1483113069016153e-06,
"loss": 0.1112,
"step": 624
},
{
"epoch": 0.44483985765124556,
"grad_norm": 5.598846841337847,
"learning_rate": 1.1468428781204111e-06,
"loss": 0.0839,
"step": 625
},
{
"epoch": 0.44555160142348754,
"grad_norm": 3.8121356477329487,
"learning_rate": 1.145374449339207e-06,
"loss": 0.0858,
"step": 626
},
{
"epoch": 0.4462633451957295,
"grad_norm": 6.402138781801384,
"learning_rate": 1.1439060205580029e-06,
"loss": 0.0405,
"step": 627
},
{
"epoch": 0.44697508896797156,
"grad_norm": 3.5680096501361027,
"learning_rate": 1.1424375917767988e-06,
"loss": 0.0179,
"step": 628
},
{
"epoch": 0.44768683274021354,
"grad_norm": 12.93989459742168,
"learning_rate": 1.1409691629955946e-06,
"loss": 0.139,
"step": 629
},
{
"epoch": 0.4483985765124555,
"grad_norm": 4.992624756195558,
"learning_rate": 1.1395007342143906e-06,
"loss": 0.1577,
"step": 630
},
{
"epoch": 0.4491103202846975,
"grad_norm": 5.4319610446325255,
"learning_rate": 1.1380323054331863e-06,
"loss": 0.1754,
"step": 631
},
{
"epoch": 0.4498220640569395,
"grad_norm": 5.069115166733579,
"learning_rate": 1.1365638766519823e-06,
"loss": -0.0696,
"step": 632
},
{
"epoch": 0.4505338078291815,
"grad_norm": 5.741666310849707,
"learning_rate": 1.135095447870778e-06,
"loss": 0.1294,
"step": 633
},
{
"epoch": 0.4512455516014235,
"grad_norm": 5.463031329189953,
"learning_rate": 1.133627019089574e-06,
"loss": 0.0757,
"step": 634
},
{
"epoch": 0.45195729537366547,
"grad_norm": 6.8511844474163395,
"learning_rate": 1.1321585903083698e-06,
"loss": 0.0903,
"step": 635
},
{
"epoch": 0.45266903914590745,
"grad_norm": 4.259850474495963,
"learning_rate": 1.1306901615271658e-06,
"loss": -0.0736,
"step": 636
},
{
"epoch": 0.4533807829181495,
"grad_norm": 7.009460323527026,
"learning_rate": 1.1292217327459616e-06,
"loss": -0.0667,
"step": 637
},
{
"epoch": 0.45409252669039146,
"grad_norm": 2.704359268907955,
"learning_rate": 1.1277533039647578e-06,
"loss": -0.1515,
"step": 638
},
{
"epoch": 0.45480427046263344,
"grad_norm": 4.892012036861605,
"learning_rate": 1.1262848751835537e-06,
"loss": 0.1136,
"step": 639
},
{
"epoch": 0.4555160142348754,
"grad_norm": 4.420897035322563,
"learning_rate": 1.1248164464023495e-06,
"loss": 0.0102,
"step": 640
},
{
"epoch": 0.45622775800711746,
"grad_norm": 6.963193037787631,
"learning_rate": 1.1233480176211455e-06,
"loss": 0.0197,
"step": 641
},
{
"epoch": 0.45693950177935944,
"grad_norm": 4.10057023897775,
"learning_rate": 1.1218795888399412e-06,
"loss": -0.0319,
"step": 642
},
{
"epoch": 0.4576512455516014,
"grad_norm": 5.54032283159087,
"learning_rate": 1.1204111600587372e-06,
"loss": 0.1388,
"step": 643
},
{
"epoch": 0.4583629893238434,
"grad_norm": 5.050577134494581,
"learning_rate": 1.118942731277533e-06,
"loss": 0.0912,
"step": 644
},
{
"epoch": 0.45907473309608543,
"grad_norm": 4.9180573689518665,
"learning_rate": 1.117474302496329e-06,
"loss": 0.0964,
"step": 645
},
{
"epoch": 0.4597864768683274,
"grad_norm": 5.7503008556439825,
"learning_rate": 1.1160058737151247e-06,
"loss": 0.2445,
"step": 646
},
{
"epoch": 0.4604982206405694,
"grad_norm": 4.905400682546049,
"learning_rate": 1.1145374449339207e-06,
"loss": 0.206,
"step": 647
},
{
"epoch": 0.46120996441281137,
"grad_norm": 5.087415254904014,
"learning_rate": 1.1130690161527165e-06,
"loss": 0.0828,
"step": 648
},
{
"epoch": 0.4619217081850534,
"grad_norm": 5.023255815082948,
"learning_rate": 1.1116005873715124e-06,
"loss": 0.0484,
"step": 649
},
{
"epoch": 0.4626334519572954,
"grad_norm": 4.355133071075149,
"learning_rate": 1.1101321585903082e-06,
"loss": 0.0645,
"step": 650
},
{
"epoch": 0.46334519572953736,
"grad_norm": 9.497353436091503,
"learning_rate": 1.1086637298091042e-06,
"loss": 0.0136,
"step": 651
},
{
"epoch": 0.46405693950177934,
"grad_norm": 4.8079658054906735,
"learning_rate": 1.1071953010279e-06,
"loss": 0.0363,
"step": 652
},
{
"epoch": 0.4647686832740214,
"grad_norm": 3.4912562824852427,
"learning_rate": 1.105726872246696e-06,
"loss": -0.0103,
"step": 653
},
{
"epoch": 0.46548042704626336,
"grad_norm": 3.907574848573548,
"learning_rate": 1.1042584434654917e-06,
"loss": 0.0148,
"step": 654
},
{
"epoch": 0.46619217081850534,
"grad_norm": 4.75599773605203,
"learning_rate": 1.1027900146842877e-06,
"loss": 0.0955,
"step": 655
},
{
"epoch": 0.4669039145907473,
"grad_norm": 6.334151198466081,
"learning_rate": 1.1013215859030837e-06,
"loss": 0.2039,
"step": 656
},
{
"epoch": 0.46761565836298935,
"grad_norm": 4.128379362299159,
"learning_rate": 1.0998531571218796e-06,
"loss": -0.07,
"step": 657
},
{
"epoch": 0.46832740213523133,
"grad_norm": 3.9676377547638273,
"learning_rate": 1.0983847283406756e-06,
"loss": -0.0183,
"step": 658
},
{
"epoch": 0.4690391459074733,
"grad_norm": 4.269930835672946,
"learning_rate": 1.0969162995594714e-06,
"loss": -0.0271,
"step": 659
},
{
"epoch": 0.4697508896797153,
"grad_norm": 5.596920542742016,
"learning_rate": 1.0954478707782674e-06,
"loss": 0.1345,
"step": 660
},
{
"epoch": 0.47046263345195727,
"grad_norm": 3.7310194957083724,
"learning_rate": 1.0939794419970631e-06,
"loss": 0.0609,
"step": 661
},
{
"epoch": 0.4711743772241993,
"grad_norm": 4.239345659471919,
"learning_rate": 1.092511013215859e-06,
"loss": 0.0363,
"step": 662
},
{
"epoch": 0.4718861209964413,
"grad_norm": 4.978031151336516,
"learning_rate": 1.0910425844346549e-06,
"loss": -0.0304,
"step": 663
},
{
"epoch": 0.47259786476868326,
"grad_norm": 6.292115791239946,
"learning_rate": 1.0895741556534508e-06,
"loss": 0.0228,
"step": 664
},
{
"epoch": 0.47330960854092524,
"grad_norm": 3.388984670136675,
"learning_rate": 1.0881057268722466e-06,
"loss": -0.0092,
"step": 665
},
{
"epoch": 0.4740213523131673,
"grad_norm": 4.9208120984624095,
"learning_rate": 1.0866372980910426e-06,
"loss": 0.1315,
"step": 666
},
{
"epoch": 0.47473309608540926,
"grad_norm": 8.255729766326498,
"learning_rate": 1.0851688693098383e-06,
"loss": 0.1453,
"step": 667
},
{
"epoch": 0.47544483985765124,
"grad_norm": 4.201986733068396,
"learning_rate": 1.0837004405286343e-06,
"loss": 0.016,
"step": 668
},
{
"epoch": 0.4761565836298932,
"grad_norm": 5.843632824380879,
"learning_rate": 1.08223201174743e-06,
"loss": 0.0414,
"step": 669
},
{
"epoch": 0.47686832740213525,
"grad_norm": 6.714297784487186,
"learning_rate": 1.080763582966226e-06,
"loss": -0.0167,
"step": 670
},
{
"epoch": 0.47758007117437723,
"grad_norm": 4.545591402594477,
"learning_rate": 1.0792951541850218e-06,
"loss": 0.0521,
"step": 671
},
{
"epoch": 0.4782918149466192,
"grad_norm": 5.895020805238048,
"learning_rate": 1.0778267254038178e-06,
"loss": 0.1075,
"step": 672
},
{
"epoch": 0.4790035587188612,
"grad_norm": 3.4508361134951264,
"learning_rate": 1.0763582966226136e-06,
"loss": 0.0254,
"step": 673
},
{
"epoch": 0.4797153024911032,
"grad_norm": 4.771987420077502,
"learning_rate": 1.0748898678414096e-06,
"loss": 0.0573,
"step": 674
},
{
"epoch": 0.4804270462633452,
"grad_norm": 11.226430999686528,
"learning_rate": 1.0734214390602055e-06,
"loss": 0.1012,
"step": 675
},
{
"epoch": 0.4811387900355872,
"grad_norm": 10.846049392039056,
"learning_rate": 1.0719530102790013e-06,
"loss": 0.0998,
"step": 676
},
{
"epoch": 0.48185053380782916,
"grad_norm": 6.275120719569148,
"learning_rate": 1.0704845814977975e-06,
"loss": 0.0751,
"step": 677
},
{
"epoch": 0.4825622775800712,
"grad_norm": 4.128723544695201,
"learning_rate": 1.0690161527165933e-06,
"loss": 0.0578,
"step": 678
},
{
"epoch": 0.4832740213523132,
"grad_norm": 5.355283727030443,
"learning_rate": 1.0675477239353892e-06,
"loss": -0.0071,
"step": 679
},
{
"epoch": 0.48398576512455516,
"grad_norm": 5.164628498494913,
"learning_rate": 1.066079295154185e-06,
"loss": 0.056,
"step": 680
},
{
"epoch": 0.48469750889679714,
"grad_norm": 3.9710771841378723,
"learning_rate": 1.064610866372981e-06,
"loss": 0.0391,
"step": 681
},
{
"epoch": 0.48540925266903917,
"grad_norm": 5.315392458503421,
"learning_rate": 1.0631424375917767e-06,
"loss": 0.0435,
"step": 682
},
{
"epoch": 0.48612099644128115,
"grad_norm": 3.2116434105620204,
"learning_rate": 1.0616740088105727e-06,
"loss": 0.0427,
"step": 683
},
{
"epoch": 0.48683274021352313,
"grad_norm": 8.68683517437045,
"learning_rate": 1.0602055800293685e-06,
"loss": 0.0512,
"step": 684
},
{
"epoch": 0.4875444839857651,
"grad_norm": 3.513970150885388,
"learning_rate": 1.0587371512481645e-06,
"loss": -0.032,
"step": 685
},
{
"epoch": 0.48825622775800714,
"grad_norm": 4.866089723085183,
"learning_rate": 1.0572687224669602e-06,
"loss": 0.1657,
"step": 686
},
{
"epoch": 0.4889679715302491,
"grad_norm": 6.017385408509331,
"learning_rate": 1.0558002936857562e-06,
"loss": 0.0817,
"step": 687
},
{
"epoch": 0.4896797153024911,
"grad_norm": 5.83203779093739,
"learning_rate": 1.054331864904552e-06,
"loss": 0.1678,
"step": 688
},
{
"epoch": 0.4903914590747331,
"grad_norm": 4.722061712990318,
"learning_rate": 1.052863436123348e-06,
"loss": 0.1022,
"step": 689
},
{
"epoch": 0.49110320284697506,
"grad_norm": 10.584918006143333,
"learning_rate": 1.0513950073421437e-06,
"loss": 0.0488,
"step": 690
},
{
"epoch": 0.4918149466192171,
"grad_norm": 3.746498762239735,
"learning_rate": 1.0499265785609397e-06,
"loss": 0.0109,
"step": 691
},
{
"epoch": 0.4925266903914591,
"grad_norm": 5.269616671379547,
"learning_rate": 1.0484581497797357e-06,
"loss": 0.198,
"step": 692
},
{
"epoch": 0.49323843416370106,
"grad_norm": 4.065242839095878,
"learning_rate": 1.0469897209985314e-06,
"loss": 0.0446,
"step": 693
},
{
"epoch": 0.49395017793594304,
"grad_norm": 5.2828106803180095,
"learning_rate": 1.0455212922173274e-06,
"loss": 0.1978,
"step": 694
},
{
"epoch": 0.49466192170818507,
"grad_norm": 13.032941895937794,
"learning_rate": 1.0440528634361232e-06,
"loss": 0.206,
"step": 695
},
{
"epoch": 0.49537366548042705,
"grad_norm": 4.385566159174779,
"learning_rate": 1.0425844346549194e-06,
"loss": 0.0638,
"step": 696
},
{
"epoch": 0.49608540925266903,
"grad_norm": 4.175354381756935,
"learning_rate": 1.0411160058737151e-06,
"loss": -0.0326,
"step": 697
},
{
"epoch": 0.496797153024911,
"grad_norm": 4.094552463766955,
"learning_rate": 1.0396475770925111e-06,
"loss": 0.0364,
"step": 698
},
{
"epoch": 0.49750889679715304,
"grad_norm": 8.284794808093139,
"learning_rate": 1.0381791483113069e-06,
"loss": 0.1064,
"step": 699
},
{
"epoch": 0.498220640569395,
"grad_norm": 4.51657295176749,
"learning_rate": 1.0367107195301028e-06,
"loss": 0.0779,
"step": 700
},
{
"epoch": 0.498932384341637,
"grad_norm": 6.743114491694153,
"learning_rate": 1.0352422907488986e-06,
"loss": 0.0201,
"step": 701
},
{
"epoch": 0.499644128113879,
"grad_norm": 4.109578522838562,
"learning_rate": 1.0337738619676946e-06,
"loss": 0.1309,
"step": 702
},
{
"epoch": 0.500355871886121,
"grad_norm": 4.307896243759484,
"learning_rate": 1.0323054331864904e-06,
"loss": 0.0356,
"step": 703
},
{
"epoch": 0.501067615658363,
"grad_norm": 3.970264416207935,
"learning_rate": 1.0308370044052863e-06,
"loss": 0.0292,
"step": 704
},
{
"epoch": 0.501779359430605,
"grad_norm": 6.6073855027819475,
"learning_rate": 1.029368575624082e-06,
"loss": 0.0101,
"step": 705
},
{
"epoch": 0.502491103202847,
"grad_norm": 6.492944849966541,
"learning_rate": 1.027900146842878e-06,
"loss": 0.1188,
"step": 706
},
{
"epoch": 0.503202846975089,
"grad_norm": 6.040494327148118,
"learning_rate": 1.0264317180616738e-06,
"loss": 0.0784,
"step": 707
},
{
"epoch": 0.5039145907473309,
"grad_norm": 4.63609509021837,
"learning_rate": 1.0249632892804698e-06,
"loss": 0.0678,
"step": 708
},
{
"epoch": 0.504626334519573,
"grad_norm": 6.6077951116687155,
"learning_rate": 1.0234948604992658e-06,
"loss": 0.1119,
"step": 709
},
{
"epoch": 0.505338078291815,
"grad_norm": 4.34544526095902,
"learning_rate": 1.0220264317180616e-06,
"loss": 0.2166,
"step": 710
},
{
"epoch": 0.5060498220640569,
"grad_norm": 4.0070871511015875,
"learning_rate": 1.0205580029368575e-06,
"loss": 0.0856,
"step": 711
},
{
"epoch": 0.5067615658362989,
"grad_norm": 4.446734852837578,
"learning_rate": 1.0190895741556533e-06,
"loss": -0.079,
"step": 712
},
{
"epoch": 0.507473309608541,
"grad_norm": 5.756477025097774,
"learning_rate": 1.0176211453744493e-06,
"loss": 0.0168,
"step": 713
},
{
"epoch": 0.5081850533807829,
"grad_norm": 5.620792934912572,
"learning_rate": 1.016152716593245e-06,
"loss": 0.0884,
"step": 714
},
{
"epoch": 0.5088967971530249,
"grad_norm": 3.263840475241881,
"learning_rate": 1.014684287812041e-06,
"loss": -0.0387,
"step": 715
},
{
"epoch": 0.5096085409252669,
"grad_norm": 4.9965648294818354,
"learning_rate": 1.013215859030837e-06,
"loss": 0.182,
"step": 716
},
{
"epoch": 0.5103202846975089,
"grad_norm": 4.602115202839881,
"learning_rate": 1.011747430249633e-06,
"loss": 0.0814,
"step": 717
},
{
"epoch": 0.5110320284697509,
"grad_norm": 3.365418715251831,
"learning_rate": 1.0102790014684287e-06,
"loss": -0.0525,
"step": 718
},
{
"epoch": 0.5117437722419929,
"grad_norm": 3.1922592834677044,
"learning_rate": 1.0088105726872247e-06,
"loss": 0.09,
"step": 719
},
{
"epoch": 0.5124555160142349,
"grad_norm": 3.549032487795268,
"learning_rate": 1.0073421439060205e-06,
"loss": -0.0008,
"step": 720
},
{
"epoch": 0.5131672597864768,
"grad_norm": 4.877635536832963,
"learning_rate": 1.0058737151248165e-06,
"loss": 0.1009,
"step": 721
},
{
"epoch": 0.5138790035587188,
"grad_norm": 3.822705726451478,
"learning_rate": 1.0044052863436122e-06,
"loss": 0.0335,
"step": 722
},
{
"epoch": 0.5145907473309609,
"grad_norm": 5.089722146655097,
"learning_rate": 1.0029368575624082e-06,
"loss": 0.0302,
"step": 723
},
{
"epoch": 0.5153024911032028,
"grad_norm": 5.256543214582418,
"learning_rate": 1.001468428781204e-06,
"loss": 0.0756,
"step": 724
},
{
"epoch": 0.5160142348754448,
"grad_norm": 6.205469721275733,
"learning_rate": 1e-06,
"loss": 0.0743,
"step": 725
},
{
"epoch": 0.5167259786476869,
"grad_norm": 4.824095195469929,
"learning_rate": 9.98531571218796e-07,
"loss": 0.0462,
"step": 726
},
{
"epoch": 0.5174377224199288,
"grad_norm": 6.044532184026359,
"learning_rate": 9.970631424375917e-07,
"loss": 0.1479,
"step": 727
},
{
"epoch": 0.5181494661921708,
"grad_norm": 6.564234622404848,
"learning_rate": 9.955947136563877e-07,
"loss": 0.0437,
"step": 728
},
{
"epoch": 0.5188612099644128,
"grad_norm": 4.015261622511888,
"learning_rate": 9.941262848751834e-07,
"loss": 0.0073,
"step": 729
},
{
"epoch": 0.5195729537366548,
"grad_norm": 5.187190035096562,
"learning_rate": 9.926578560939794e-07,
"loss": 0.0647,
"step": 730
},
{
"epoch": 0.5202846975088968,
"grad_norm": 7.051144002527871,
"learning_rate": 9.911894273127754e-07,
"loss": 0.0051,
"step": 731
},
{
"epoch": 0.5209964412811388,
"grad_norm": 6.711230803398463,
"learning_rate": 9.897209985315712e-07,
"loss": 0.1652,
"step": 732
},
{
"epoch": 0.5217081850533808,
"grad_norm": 7.284132489226529,
"learning_rate": 9.882525697503671e-07,
"loss": 0.0454,
"step": 733
},
{
"epoch": 0.5224199288256228,
"grad_norm": 6.953294799102971,
"learning_rate": 9.86784140969163e-07,
"loss": 0.0806,
"step": 734
},
{
"epoch": 0.5231316725978647,
"grad_norm": 4.530157128444712,
"learning_rate": 9.853157121879589e-07,
"loss": -0.0047,
"step": 735
},
{
"epoch": 0.5238434163701068,
"grad_norm": 5.280898042948249,
"learning_rate": 9.838472834067546e-07,
"loss": 0.0573,
"step": 736
},
{
"epoch": 0.5245551601423487,
"grad_norm": 8.207179057748009,
"learning_rate": 9.823788546255506e-07,
"loss": 0.0707,
"step": 737
},
{
"epoch": 0.5252669039145907,
"grad_norm": 5.339631573829629,
"learning_rate": 9.809104258443464e-07,
"loss": 0.1164,
"step": 738
},
{
"epoch": 0.5259786476868328,
"grad_norm": 3.3373897348541077,
"learning_rate": 9.794419970631424e-07,
"loss": -0.0819,
"step": 739
},
{
"epoch": 0.5266903914590747,
"grad_norm": 4.900868373919202,
"learning_rate": 9.779735682819383e-07,
"loss": 0.1355,
"step": 740
},
{
"epoch": 0.5274021352313167,
"grad_norm": 7.232896295285823,
"learning_rate": 9.765051395007341e-07,
"loss": 0.2699,
"step": 741
},
{
"epoch": 0.5281138790035588,
"grad_norm": 3.8975046146309227,
"learning_rate": 9.7503671071953e-07,
"loss": 0.0535,
"step": 742
},
{
"epoch": 0.5288256227758007,
"grad_norm": 6.188097992331352,
"learning_rate": 9.73568281938326e-07,
"loss": -0.0402,
"step": 743
},
{
"epoch": 0.5295373665480427,
"grad_norm": 4.807307228512303,
"learning_rate": 9.720998531571218e-07,
"loss": 0.1625,
"step": 744
},
{
"epoch": 0.5302491103202847,
"grad_norm": 6.3360410557197016,
"learning_rate": 9.706314243759178e-07,
"loss": 0.0756,
"step": 745
},
{
"epoch": 0.5309608540925267,
"grad_norm": 5.461588852093009,
"learning_rate": 9.691629955947136e-07,
"loss": 0.1382,
"step": 746
},
{
"epoch": 0.5316725978647687,
"grad_norm": 4.912735849402141,
"learning_rate": 9.676945668135096e-07,
"loss": 0.1125,
"step": 747
},
{
"epoch": 0.5323843416370106,
"grad_norm": 5.822463161799121,
"learning_rate": 9.662261380323053e-07,
"loss": 0.1063,
"step": 748
},
{
"epoch": 0.5330960854092527,
"grad_norm": 6.120739447607458,
"learning_rate": 9.647577092511013e-07,
"loss": 0.0771,
"step": 749
},
{
"epoch": 0.5338078291814946,
"grad_norm": 3.4986944488443847,
"learning_rate": 9.632892804698973e-07,
"loss": 0.1074,
"step": 750
},
{
"epoch": 0.5345195729537366,
"grad_norm": 3.188142300386645,
"learning_rate": 9.61820851688693e-07,
"loss": 0.0394,
"step": 751
},
{
"epoch": 0.5352313167259787,
"grad_norm": 4.301724447408916,
"learning_rate": 9.60352422907489e-07,
"loss": 0.1679,
"step": 752
},
{
"epoch": 0.5359430604982206,
"grad_norm": 3.6983788573076413,
"learning_rate": 9.588839941262848e-07,
"loss": -0.0002,
"step": 753
},
{
"epoch": 0.5366548042704626,
"grad_norm": 6.6257306541739345,
"learning_rate": 9.574155653450808e-07,
"loss": 0.0013,
"step": 754
},
{
"epoch": 0.5373665480427047,
"grad_norm": 3.138478014976094,
"learning_rate": 9.559471365638765e-07,
"loss": 0.11,
"step": 755
},
{
"epoch": 0.5380782918149466,
"grad_norm": 8.610525175552773,
"learning_rate": 9.544787077826725e-07,
"loss": 0.1,
"step": 756
},
{
"epoch": 0.5387900355871886,
"grad_norm": 4.293998726605181,
"learning_rate": 9.530102790014684e-07,
"loss": 0.0206,
"step": 757
},
{
"epoch": 0.5395017793594306,
"grad_norm": 7.107177663089392,
"learning_rate": 9.515418502202642e-07,
"loss": 0.1868,
"step": 758
},
{
"epoch": 0.5402135231316726,
"grad_norm": 4.437300771619768,
"learning_rate": 9.500734214390601e-07,
"loss": -0.0052,
"step": 759
},
{
"epoch": 0.5409252669039146,
"grad_norm": 4.933126903201541,
"learning_rate": 9.486049926578561e-07,
"loss": 0.0428,
"step": 760
},
{
"epoch": 0.5416370106761565,
"grad_norm": 4.007325616898821,
"learning_rate": 9.47136563876652e-07,
"loss": 0.1194,
"step": 761
},
{
"epoch": 0.5423487544483986,
"grad_norm": 3.3928461490539035,
"learning_rate": 9.456681350954478e-07,
"loss": 0.0942,
"step": 762
},
{
"epoch": 0.5430604982206406,
"grad_norm": 3.6371352861702553,
"learning_rate": 9.441997063142437e-07,
"loss": -0.0607,
"step": 763
},
{
"epoch": 0.5437722419928825,
"grad_norm": 4.584237142517063,
"learning_rate": 9.427312775330396e-07,
"loss": 0.0438,
"step": 764
},
{
"epoch": 0.5444839857651246,
"grad_norm": 5.492813096427028,
"learning_rate": 9.412628487518355e-07,
"loss": 0.0376,
"step": 765
},
{
"epoch": 0.5451957295373665,
"grad_norm": 6.072503086181962,
"learning_rate": 9.397944199706313e-07,
"loss": 0.0891,
"step": 766
},
{
"epoch": 0.5459074733096085,
"grad_norm": 4.134151825824703,
"learning_rate": 9.383259911894272e-07,
"loss": 0.0644,
"step": 767
},
{
"epoch": 0.5466192170818506,
"grad_norm": 4.4160799800430155,
"learning_rate": 9.368575624082231e-07,
"loss": 0.0636,
"step": 768
},
{
"epoch": 0.5473309608540925,
"grad_norm": 4.714994439195608,
"learning_rate": 9.353891336270191e-07,
"loss": -0.0305,
"step": 769
},
{
"epoch": 0.5480427046263345,
"grad_norm": 4.771488172519467,
"learning_rate": 9.33920704845815e-07,
"loss": 0.039,
"step": 770
},
{
"epoch": 0.5487544483985766,
"grad_norm": 4.60214711541248,
"learning_rate": 9.324522760646109e-07,
"loss": 0.0347,
"step": 771
},
{
"epoch": 0.5494661921708185,
"grad_norm": 3.7325284215071153,
"learning_rate": 9.309838472834068e-07,
"loss": 0.0216,
"step": 772
},
{
"epoch": 0.5501779359430605,
"grad_norm": 4.241197558370274,
"learning_rate": 9.295154185022026e-07,
"loss": 0.1269,
"step": 773
},
{
"epoch": 0.5508896797153024,
"grad_norm": 5.679316407559781,
"learning_rate": 9.280469897209985e-07,
"loss": 0.0889,
"step": 774
},
{
"epoch": 0.5516014234875445,
"grad_norm": 4.815106367320797,
"learning_rate": 9.265785609397944e-07,
"loss": 0.1188,
"step": 775
},
{
"epoch": 0.5523131672597865,
"grad_norm": 3.5294291435075626,
"learning_rate": 9.251101321585902e-07,
"loss": 0.0955,
"step": 776
},
{
"epoch": 0.5530249110320284,
"grad_norm": 5.426602372172119,
"learning_rate": 9.236417033773861e-07,
"loss": 0.0026,
"step": 777
},
{
"epoch": 0.5537366548042705,
"grad_norm": 3.412274413984105,
"learning_rate": 9.22173274596182e-07,
"loss": -0.0744,
"step": 778
},
{
"epoch": 0.5544483985765124,
"grad_norm": 5.901877747777146,
"learning_rate": 9.20704845814978e-07,
"loss": 0.1465,
"step": 779
},
{
"epoch": 0.5551601423487544,
"grad_norm": 4.37856953642428,
"learning_rate": 9.192364170337738e-07,
"loss": 0.1163,
"step": 780
},
{
"epoch": 0.5558718861209965,
"grad_norm": 5.049603459869766,
"learning_rate": 9.177679882525697e-07,
"loss": 0.028,
"step": 781
},
{
"epoch": 0.5565836298932384,
"grad_norm": 5.211098732805948,
"learning_rate": 9.162995594713656e-07,
"loss": -0.0797,
"step": 782
},
{
"epoch": 0.5572953736654804,
"grad_norm": 3.208772620971025,
"learning_rate": 9.148311306901615e-07,
"loss": 0.0965,
"step": 783
},
{
"epoch": 0.5580071174377225,
"grad_norm": 5.775537586374198,
"learning_rate": 9.133627019089573e-07,
"loss": 0.079,
"step": 784
},
{
"epoch": 0.5587188612099644,
"grad_norm": 3.7980849610478558,
"learning_rate": 9.118942731277532e-07,
"loss": 0.0555,
"step": 785
},
{
"epoch": 0.5594306049822064,
"grad_norm": 3.7820751456345447,
"learning_rate": 9.104258443465491e-07,
"loss": 0.102,
"step": 786
},
{
"epoch": 0.5601423487544483,
"grad_norm": 4.53042552945143,
"learning_rate": 9.08957415565345e-07,
"loss": 0.0037,
"step": 787
},
{
"epoch": 0.5608540925266904,
"grad_norm": 5.130468846659909,
"learning_rate": 9.074889867841409e-07,
"loss": 0.1375,
"step": 788
},
{
"epoch": 0.5615658362989324,
"grad_norm": 5.0011692587301795,
"learning_rate": 9.060205580029369e-07,
"loss": 0.12,
"step": 789
},
{
"epoch": 0.5622775800711743,
"grad_norm": 4.990387450257397,
"learning_rate": 9.045521292217328e-07,
"loss": 0.058,
"step": 790
},
{
"epoch": 0.5629893238434164,
"grad_norm": 4.713852317506844,
"learning_rate": 9.030837004405286e-07,
"loss": 0.1319,
"step": 791
},
{
"epoch": 0.5637010676156584,
"grad_norm": 7.664848388446512,
"learning_rate": 9.016152716593245e-07,
"loss": 0.0903,
"step": 792
},
{
"epoch": 0.5644128113879003,
"grad_norm": 4.3792586172064505,
"learning_rate": 9.001468428781204e-07,
"loss": -0.0151,
"step": 793
},
{
"epoch": 0.5651245551601424,
"grad_norm": 3.2950223083204007,
"learning_rate": 8.986784140969163e-07,
"loss": 0.0394,
"step": 794
},
{
"epoch": 0.5658362989323843,
"grad_norm": 7.631142065486189,
"learning_rate": 8.972099853157121e-07,
"loss": 0.1386,
"step": 795
},
{
"epoch": 0.5665480427046263,
"grad_norm": 4.223643311945042,
"learning_rate": 8.95741556534508e-07,
"loss": -0.0308,
"step": 796
},
{
"epoch": 0.5672597864768684,
"grad_norm": 3.4974575344995174,
"learning_rate": 8.942731277533039e-07,
"loss": -0.0312,
"step": 797
},
{
"epoch": 0.5679715302491103,
"grad_norm": 3.028662137320869,
"learning_rate": 8.928046989720998e-07,
"loss": -0.027,
"step": 798
},
{
"epoch": 0.5686832740213523,
"grad_norm": 3.39632600982153,
"learning_rate": 8.913362701908957e-07,
"loss": -0.0008,
"step": 799
},
{
"epoch": 0.5693950177935944,
"grad_norm": 6.1897883951874535,
"learning_rate": 8.898678414096916e-07,
"loss": 0.0858,
"step": 800
},
{
"epoch": 0.5701067615658363,
"grad_norm": 4.702874566694659,
"learning_rate": 8.883994126284875e-07,
"loss": 0.0146,
"step": 801
},
{
"epoch": 0.5708185053380783,
"grad_norm": 4.7294983376418305,
"learning_rate": 8.869309838472833e-07,
"loss": 0.1296,
"step": 802
},
{
"epoch": 0.5715302491103202,
"grad_norm": 4.857229660765176,
"learning_rate": 8.854625550660792e-07,
"loss": 0.0874,
"step": 803
},
{
"epoch": 0.5722419928825623,
"grad_norm": 10.192813914978096,
"learning_rate": 8.839941262848752e-07,
"loss": 0.1704,
"step": 804
},
{
"epoch": 0.5729537366548043,
"grad_norm": 3.644774204999828,
"learning_rate": 8.82525697503671e-07,
"loss": 0.0573,
"step": 805
},
{
"epoch": 0.5736654804270462,
"grad_norm": 6.851142966150024,
"learning_rate": 8.810572687224669e-07,
"loss": 0.1111,
"step": 806
},
{
"epoch": 0.5743772241992883,
"grad_norm": 5.16915314842292,
"learning_rate": 8.795888399412628e-07,
"loss": 0.0982,
"step": 807
},
{
"epoch": 0.5750889679715302,
"grad_norm": 3.7247312512833877,
"learning_rate": 8.781204111600588e-07,
"loss": 0.0495,
"step": 808
},
{
"epoch": 0.5758007117437722,
"grad_norm": 6.8775044821222275,
"learning_rate": 8.766519823788546e-07,
"loss": 0.0514,
"step": 809
},
{
"epoch": 0.5765124555160143,
"grad_norm": 10.730556558500393,
"learning_rate": 8.751835535976505e-07,
"loss": 0.0751,
"step": 810
},
{
"epoch": 0.5772241992882562,
"grad_norm": 9.122296494576403,
"learning_rate": 8.737151248164464e-07,
"loss": 0.0662,
"step": 811
},
{
"epoch": 0.5779359430604982,
"grad_norm": 3.808482445288645,
"learning_rate": 8.722466960352423e-07,
"loss": 0.0866,
"step": 812
},
{
"epoch": 0.5786476868327403,
"grad_norm": 4.963997925652648,
"learning_rate": 8.707782672540381e-07,
"loss": 0.0274,
"step": 813
},
{
"epoch": 0.5793594306049822,
"grad_norm": 4.972806414354329,
"learning_rate": 8.69309838472834e-07,
"loss": 0.0145,
"step": 814
},
{
"epoch": 0.5800711743772242,
"grad_norm": 5.200450862998976,
"learning_rate": 8.678414096916299e-07,
"loss": 0.0583,
"step": 815
},
{
"epoch": 0.5807829181494661,
"grad_norm": 5.21945185710134,
"learning_rate": 8.663729809104257e-07,
"loss": 0.0849,
"step": 816
},
{
"epoch": 0.5814946619217082,
"grad_norm": 5.9386659782373865,
"learning_rate": 8.649045521292216e-07,
"loss": 0.2583,
"step": 817
},
{
"epoch": 0.5822064056939502,
"grad_norm": 6.670258158616344,
"learning_rate": 8.634361233480176e-07,
"loss": 0.0157,
"step": 818
},
{
"epoch": 0.5829181494661921,
"grad_norm": 4.226534746399752,
"learning_rate": 8.619676945668135e-07,
"loss": 0.1208,
"step": 819
},
{
"epoch": 0.5836298932384342,
"grad_norm": 4.928546368395039,
"learning_rate": 8.604992657856093e-07,
"loss": 0.0773,
"step": 820
},
{
"epoch": 0.5843416370106762,
"grad_norm": 2.968915914617463,
"learning_rate": 8.590308370044053e-07,
"loss": 0.1178,
"step": 821
},
{
"epoch": 0.5850533807829181,
"grad_norm": 8.789100694361455,
"learning_rate": 8.575624082232012e-07,
"loss": -0.0098,
"step": 822
},
{
"epoch": 0.5857651245551602,
"grad_norm": 3.883119628548879,
"learning_rate": 8.560939794419971e-07,
"loss": 0.1088,
"step": 823
},
{
"epoch": 0.5864768683274021,
"grad_norm": 5.206495321843375,
"learning_rate": 8.546255506607929e-07,
"loss": 0.0098,
"step": 824
},
{
"epoch": 0.5871886120996441,
"grad_norm": 3.9007823061659894,
"learning_rate": 8.531571218795888e-07,
"loss": 0.0431,
"step": 825
},
{
"epoch": 0.5879003558718862,
"grad_norm": 3.8238623728705603,
"learning_rate": 8.516886930983847e-07,
"loss": 0.0652,
"step": 826
},
{
"epoch": 0.5886120996441281,
"grad_norm": 5.147270197655345,
"learning_rate": 8.502202643171805e-07,
"loss": 0.0553,
"step": 827
},
{
"epoch": 0.5893238434163701,
"grad_norm": 3.1488070211968027,
"learning_rate": 8.487518355359765e-07,
"loss": 0.0409,
"step": 828
},
{
"epoch": 0.5900355871886122,
"grad_norm": 6.545808327390556,
"learning_rate": 8.472834067547724e-07,
"loss": 0.0596,
"step": 829
},
{
"epoch": 0.5907473309608541,
"grad_norm": 4.262885139494661,
"learning_rate": 8.458149779735683e-07,
"loss": 0.0387,
"step": 830
},
{
"epoch": 0.5914590747330961,
"grad_norm": 5.216472562808622,
"learning_rate": 8.443465491923641e-07,
"loss": 0.1205,
"step": 831
},
{
"epoch": 0.592170818505338,
"grad_norm": 3.274785094453674,
"learning_rate": 8.4287812041116e-07,
"loss": 0.052,
"step": 832
},
{
"epoch": 0.5928825622775801,
"grad_norm": 5.076651115735975,
"learning_rate": 8.414096916299559e-07,
"loss": 0.165,
"step": 833
},
{
"epoch": 0.5935943060498221,
"grad_norm": 4.67123133075057,
"learning_rate": 8.399412628487518e-07,
"loss": 0.0592,
"step": 834
},
{
"epoch": 0.594306049822064,
"grad_norm": 11.997396186622707,
"learning_rate": 8.384728340675476e-07,
"loss": 0.0962,
"step": 835
},
{
"epoch": 0.5950177935943061,
"grad_norm": 5.373216554936181,
"learning_rate": 8.370044052863435e-07,
"loss": 0.1828,
"step": 836
},
{
"epoch": 0.595729537366548,
"grad_norm": 3.297467942273627,
"learning_rate": 8.355359765051395e-07,
"loss": -0.0903,
"step": 837
},
{
"epoch": 0.59644128113879,
"grad_norm": 6.0804009184913,
"learning_rate": 8.340675477239353e-07,
"loss": 0.1039,
"step": 838
},
{
"epoch": 0.5971530249110321,
"grad_norm": 8.681008527677944,
"learning_rate": 8.325991189427313e-07,
"loss": -0.0249,
"step": 839
},
{
"epoch": 0.597864768683274,
"grad_norm": 6.081078133109813,
"learning_rate": 8.311306901615272e-07,
"loss": 0.0362,
"step": 840
},
{
"epoch": 0.598576512455516,
"grad_norm": 4.786545358004393,
"learning_rate": 8.296622613803231e-07,
"loss": 0.0963,
"step": 841
},
{
"epoch": 0.599288256227758,
"grad_norm": 4.020818179930494,
"learning_rate": 8.281938325991189e-07,
"loss": 0.1366,
"step": 842
},
{
"epoch": 0.6,
"grad_norm": 4.908818635550565,
"learning_rate": 8.267254038179148e-07,
"loss": 0.0357,
"step": 843
},
{
"epoch": 0.600711743772242,
"grad_norm": 6.085069626321275,
"learning_rate": 8.252569750367107e-07,
"loss": 0.0482,
"step": 844
},
{
"epoch": 0.6014234875444839,
"grad_norm": 4.059258444201513,
"learning_rate": 8.237885462555065e-07,
"loss": 0.042,
"step": 845
},
{
"epoch": 0.602135231316726,
"grad_norm": 5.8277267033597875,
"learning_rate": 8.223201174743024e-07,
"loss": 0.079,
"step": 846
},
{
"epoch": 0.602846975088968,
"grad_norm": 3.5099927757601828,
"learning_rate": 8.208516886930984e-07,
"loss": -0.0134,
"step": 847
},
{
"epoch": 0.6035587188612099,
"grad_norm": 4.192415227971293,
"learning_rate": 8.193832599118943e-07,
"loss": 0.0844,
"step": 848
},
{
"epoch": 0.604270462633452,
"grad_norm": 4.586599376946333,
"learning_rate": 8.179148311306901e-07,
"loss": 0.1048,
"step": 849
},
{
"epoch": 0.604982206405694,
"grad_norm": 3.7801229515464962,
"learning_rate": 8.16446402349486e-07,
"loss": -0.0248,
"step": 850
},
{
"epoch": 0.6056939501779359,
"grad_norm": 5.737160320657065,
"learning_rate": 8.149779735682819e-07,
"loss": 0.063,
"step": 851
},
{
"epoch": 0.606405693950178,
"grad_norm": 4.308955186916713,
"learning_rate": 8.135095447870778e-07,
"loss": 0.0559,
"step": 852
},
{
"epoch": 0.6071174377224199,
"grad_norm": 6.269661761200003,
"learning_rate": 8.120411160058736e-07,
"loss": 0.0686,
"step": 853
},
{
"epoch": 0.6078291814946619,
"grad_norm": 3.4091553502783563,
"learning_rate": 8.105726872246695e-07,
"loss": 0.0133,
"step": 854
},
{
"epoch": 0.608540925266904,
"grad_norm": 4.54197927443203,
"learning_rate": 8.091042584434654e-07,
"loss": 0.0049,
"step": 855
},
{
"epoch": 0.6092526690391459,
"grad_norm": 5.638036244806688,
"learning_rate": 8.076358296622612e-07,
"loss": -0.0274,
"step": 856
},
{
"epoch": 0.6099644128113879,
"grad_norm": 4.050464584642166,
"learning_rate": 8.061674008810573e-07,
"loss": -0.0498,
"step": 857
},
{
"epoch": 0.61067615658363,
"grad_norm": 6.185879581851871,
"learning_rate": 8.046989720998532e-07,
"loss": 0.095,
"step": 858
},
{
"epoch": 0.6113879003558719,
"grad_norm": 5.597479402845821,
"learning_rate": 8.032305433186491e-07,
"loss": 0.1485,
"step": 859
},
{
"epoch": 0.6120996441281139,
"grad_norm": 6.770712263540968,
"learning_rate": 8.017621145374449e-07,
"loss": 0.0033,
"step": 860
},
{
"epoch": 0.6128113879003558,
"grad_norm": 5.023930503182311,
"learning_rate": 8.002936857562408e-07,
"loss": 0.0971,
"step": 861
},
{
"epoch": 0.6135231316725979,
"grad_norm": 3.8390228232172072,
"learning_rate": 7.988252569750367e-07,
"loss": 0.0792,
"step": 862
},
{
"epoch": 0.6142348754448399,
"grad_norm": 4.63076731937446,
"learning_rate": 7.973568281938326e-07,
"loss": 0.082,
"step": 863
},
{
"epoch": 0.6149466192170818,
"grad_norm": 3.772994305664748,
"learning_rate": 7.958883994126284e-07,
"loss": -0.0717,
"step": 864
},
{
"epoch": 0.6156583629893239,
"grad_norm": 2.651596750679966,
"learning_rate": 7.944199706314243e-07,
"loss": -0.0746,
"step": 865
},
{
"epoch": 0.6163701067615658,
"grad_norm": 3.339891863916958,
"learning_rate": 7.929515418502202e-07,
"loss": 0.0358,
"step": 866
},
{
"epoch": 0.6170818505338078,
"grad_norm": 9.563176149885061,
"learning_rate": 7.914831130690161e-07,
"loss": 0.138,
"step": 867
},
{
"epoch": 0.6177935943060499,
"grad_norm": 4.018473168029476,
"learning_rate": 7.90014684287812e-07,
"loss": 0.115,
"step": 868
},
{
"epoch": 0.6185053380782918,
"grad_norm": 5.509934902464323,
"learning_rate": 7.885462555066079e-07,
"loss": 0.0952,
"step": 869
},
{
"epoch": 0.6192170818505338,
"grad_norm": 5.151079579500843,
"learning_rate": 7.870778267254038e-07,
"loss": 0.1107,
"step": 870
},
{
"epoch": 0.6199288256227758,
"grad_norm": 4.313174208860737,
"learning_rate": 7.856093979441996e-07,
"loss": 0.0696,
"step": 871
},
{
"epoch": 0.6206405693950178,
"grad_norm": 8.448847621010032,
"learning_rate": 7.841409691629955e-07,
"loss": 0.1063,
"step": 872
},
{
"epoch": 0.6213523131672598,
"grad_norm": 4.076951132014819,
"learning_rate": 7.826725403817914e-07,
"loss": 0.1106,
"step": 873
},
{
"epoch": 0.6220640569395017,
"grad_norm": 5.086643246204726,
"learning_rate": 7.812041116005874e-07,
"loss": -0.0188,
"step": 874
},
{
"epoch": 0.6227758007117438,
"grad_norm": 4.08189971565891,
"learning_rate": 7.797356828193832e-07,
"loss": 0.0276,
"step": 875
},
{
"epoch": 0.6234875444839858,
"grad_norm": 3.766159589500804,
"learning_rate": 7.782672540381792e-07,
"loss": 0.0417,
"step": 876
},
{
"epoch": 0.6241992882562277,
"grad_norm": 2.713107337287823,
"learning_rate": 7.767988252569751e-07,
"loss": -0.021,
"step": 877
},
{
"epoch": 0.6249110320284698,
"grad_norm": 7.909322631614341,
"learning_rate": 7.753303964757709e-07,
"loss": 0.1673,
"step": 878
},
{
"epoch": 0.6256227758007118,
"grad_norm": 4.289154107343657,
"learning_rate": 7.738619676945668e-07,
"loss": 0.214,
"step": 879
},
{
"epoch": 0.6263345195729537,
"grad_norm": 4.613836682997994,
"learning_rate": 7.723935389133627e-07,
"loss": 0.1429,
"step": 880
},
{
"epoch": 0.6270462633451958,
"grad_norm": 9.965118224978214,
"learning_rate": 7.709251101321586e-07,
"loss": 0.1514,
"step": 881
},
{
"epoch": 0.6277580071174377,
"grad_norm": 3.7370059908481235,
"learning_rate": 7.694566813509544e-07,
"loss": 0.1028,
"step": 882
},
{
"epoch": 0.6284697508896797,
"grad_norm": 4.3755263965548465,
"learning_rate": 7.679882525697503e-07,
"loss": -0.0689,
"step": 883
},
{
"epoch": 0.6291814946619217,
"grad_norm": 5.050525207402243,
"learning_rate": 7.665198237885462e-07,
"loss": 0.063,
"step": 884
},
{
"epoch": 0.6298932384341637,
"grad_norm": 4.915976050198276,
"learning_rate": 7.65051395007342e-07,
"loss": 0.1072,
"step": 885
},
{
"epoch": 0.6306049822064057,
"grad_norm": 4.7320990300783805,
"learning_rate": 7.63582966226138e-07,
"loss": 0.0415,
"step": 886
},
{
"epoch": 0.6313167259786477,
"grad_norm": 4.438720750124185,
"learning_rate": 7.621145374449339e-07,
"loss": 0.0805,
"step": 887
},
{
"epoch": 0.6320284697508897,
"grad_norm": 5.093607333004949,
"learning_rate": 7.606461086637298e-07,
"loss": -0.0102,
"step": 888
},
{
"epoch": 0.6327402135231317,
"grad_norm": 6.2013902520603885,
"learning_rate": 7.591776798825256e-07,
"loss": 0.0973,
"step": 889
},
{
"epoch": 0.6334519572953736,
"grad_norm": 4.721127952995933,
"learning_rate": 7.577092511013215e-07,
"loss": 0.1244,
"step": 890
},
{
"epoch": 0.6341637010676157,
"grad_norm": 5.7984725519352835,
"learning_rate": 7.562408223201175e-07,
"loss": 0.1645,
"step": 891
},
{
"epoch": 0.6348754448398577,
"grad_norm": 4.2837367945151135,
"learning_rate": 7.547723935389134e-07,
"loss": 0.0197,
"step": 892
},
{
"epoch": 0.6355871886120996,
"grad_norm": 3.7337004878385387,
"learning_rate": 7.533039647577092e-07,
"loss": 0.1216,
"step": 893
},
{
"epoch": 0.6362989323843417,
"grad_norm": 4.525227088920381,
"learning_rate": 7.518355359765051e-07,
"loss": 0.1481,
"step": 894
},
{
"epoch": 0.6370106761565836,
"grad_norm": 3.7562219501279537,
"learning_rate": 7.50367107195301e-07,
"loss": 0.0086,
"step": 895
},
{
"epoch": 0.6377224199288256,
"grad_norm": 4.288567074927515,
"learning_rate": 7.48898678414097e-07,
"loss": 0.128,
"step": 896
},
{
"epoch": 0.6384341637010676,
"grad_norm": 3.9364468816203844,
"learning_rate": 7.474302496328928e-07,
"loss": -0.0388,
"step": 897
},
{
"epoch": 0.6391459074733096,
"grad_norm": 5.361060586448568,
"learning_rate": 7.459618208516887e-07,
"loss": -0.0335,
"step": 898
},
{
"epoch": 0.6398576512455516,
"grad_norm": 3.238064490634461,
"learning_rate": 7.444933920704846e-07,
"loss": -0.0411,
"step": 899
},
{
"epoch": 0.6405693950177936,
"grad_norm": 3.8410604107512456,
"learning_rate": 7.430249632892804e-07,
"loss": 0.092,
"step": 900
},
{
"epoch": 0.6412811387900356,
"grad_norm": 8.013488841819695,
"learning_rate": 7.415565345080763e-07,
"loss": 0.0765,
"step": 901
},
{
"epoch": 0.6419928825622776,
"grad_norm": 3.4830077358338465,
"learning_rate": 7.400881057268722e-07,
"loss": 0.0321,
"step": 902
},
{
"epoch": 0.6427046263345195,
"grad_norm": 4.148883228099494,
"learning_rate": 7.38619676945668e-07,
"loss": 0.0473,
"step": 903
},
{
"epoch": 0.6434163701067616,
"grad_norm": 8.750289912189382,
"learning_rate": 7.371512481644639e-07,
"loss": 0.1622,
"step": 904
},
{
"epoch": 0.6441281138790036,
"grad_norm": 6.100827845697011,
"learning_rate": 7.356828193832599e-07,
"loss": 0.0693,
"step": 905
},
{
"epoch": 0.6448398576512455,
"grad_norm": 3.436611684981597,
"learning_rate": 7.342143906020558e-07,
"loss": 0.0338,
"step": 906
},
{
"epoch": 0.6455516014234876,
"grad_norm": 3.2463466945109105,
"learning_rate": 7.327459618208516e-07,
"loss": -0.0213,
"step": 907
},
{
"epoch": 0.6462633451957296,
"grad_norm": 3.4580185681843463,
"learning_rate": 7.312775330396475e-07,
"loss": -0.0004,
"step": 908
},
{
"epoch": 0.6469750889679715,
"grad_norm": 7.1902770938455,
"learning_rate": 7.298091042584435e-07,
"loss": 0.1418,
"step": 909
},
{
"epoch": 0.6476868327402135,
"grad_norm": 5.850461644925145,
"learning_rate": 7.283406754772394e-07,
"loss": 0.1297,
"step": 910
},
{
"epoch": 0.6483985765124555,
"grad_norm": 4.174847975835514,
"learning_rate": 7.268722466960352e-07,
"loss": 0.048,
"step": 911
},
{
"epoch": 0.6491103202846975,
"grad_norm": 5.945931320969125,
"learning_rate": 7.254038179148311e-07,
"loss": 0.0002,
"step": 912
},
{
"epoch": 0.6498220640569395,
"grad_norm": 5.4566388982409215,
"learning_rate": 7.23935389133627e-07,
"loss": 0.0795,
"step": 913
},
{
"epoch": 0.6505338078291815,
"grad_norm": 7.77405555606714,
"learning_rate": 7.224669603524228e-07,
"loss": -0.014,
"step": 914
},
{
"epoch": 0.6512455516014235,
"grad_norm": 7.794588598487911,
"learning_rate": 7.209985315712188e-07,
"loss": 0.0568,
"step": 915
},
{
"epoch": 0.6519572953736655,
"grad_norm": 3.9613239167558505,
"learning_rate": 7.195301027900147e-07,
"loss": 0.0538,
"step": 916
},
{
"epoch": 0.6526690391459075,
"grad_norm": 5.250170205399911,
"learning_rate": 7.180616740088106e-07,
"loss": -0.0189,
"step": 917
},
{
"epoch": 0.6533807829181495,
"grad_norm": 3.7272174507575935,
"learning_rate": 7.165932452276064e-07,
"loss": 0.1008,
"step": 918
},
{
"epoch": 0.6540925266903914,
"grad_norm": 5.5762382329676825,
"learning_rate": 7.151248164464023e-07,
"loss": 0.096,
"step": 919
},
{
"epoch": 0.6548042704626335,
"grad_norm": 4.35001234732443,
"learning_rate": 7.136563876651982e-07,
"loss": 0.0592,
"step": 920
},
{
"epoch": 0.6555160142348755,
"grad_norm": 3.0296201748619684,
"learning_rate": 7.121879588839941e-07,
"loss": 0.0638,
"step": 921
},
{
"epoch": 0.6562277580071174,
"grad_norm": 8.409568840523043,
"learning_rate": 7.107195301027899e-07,
"loss": 0.2424,
"step": 922
},
{
"epoch": 0.6569395017793594,
"grad_norm": 7.918188678263488,
"learning_rate": 7.092511013215858e-07,
"loss": -0.0219,
"step": 923
},
{
"epoch": 0.6576512455516014,
"grad_norm": 10.7701330393842,
"learning_rate": 7.077826725403817e-07,
"loss": 0.032,
"step": 924
},
{
"epoch": 0.6583629893238434,
"grad_norm": 4.218135308589257,
"learning_rate": 7.063142437591776e-07,
"loss": -0.0272,
"step": 925
},
{
"epoch": 0.6590747330960854,
"grad_norm": 5.331667977838016,
"learning_rate": 7.048458149779736e-07,
"loss": 0.0864,
"step": 926
},
{
"epoch": 0.6597864768683274,
"grad_norm": 5.3750991812138444,
"learning_rate": 7.033773861967695e-07,
"loss": 0.0726,
"step": 927
},
{
"epoch": 0.6604982206405694,
"grad_norm": 5.207399920428084,
"learning_rate": 7.019089574155654e-07,
"loss": 0.1336,
"step": 928
},
{
"epoch": 0.6612099644128114,
"grad_norm": 6.606289334933452,
"learning_rate": 7.004405286343612e-07,
"loss": 0.123,
"step": 929
},
{
"epoch": 0.6619217081850534,
"grad_norm": 5.007336574559658,
"learning_rate": 6.989720998531571e-07,
"loss": 0.1693,
"step": 930
},
{
"epoch": 0.6626334519572954,
"grad_norm": 4.513131585131538,
"learning_rate": 6.97503671071953e-07,
"loss": 0.0533,
"step": 931
},
{
"epoch": 0.6633451957295373,
"grad_norm": 6.118064534929691,
"learning_rate": 6.960352422907489e-07,
"loss": -0.0167,
"step": 932
},
{
"epoch": 0.6640569395017794,
"grad_norm": 4.967171664241232,
"learning_rate": 6.945668135095447e-07,
"loss": 0.1309,
"step": 933
},
{
"epoch": 0.6647686832740214,
"grad_norm": 6.141986259243848,
"learning_rate": 6.930983847283406e-07,
"loss": 0.0683,
"step": 934
},
{
"epoch": 0.6654804270462633,
"grad_norm": 2.843838533134232,
"learning_rate": 6.916299559471366e-07,
"loss": -0.0248,
"step": 935
},
{
"epoch": 0.6661921708185053,
"grad_norm": 6.254489354452603,
"learning_rate": 6.901615271659324e-07,
"loss": 0.2101,
"step": 936
},
{
"epoch": 0.6669039145907474,
"grad_norm": 5.187993037681643,
"learning_rate": 6.886930983847283e-07,
"loss": 0.0127,
"step": 937
},
{
"epoch": 0.6676156583629893,
"grad_norm": 4.579762865615799,
"learning_rate": 6.872246696035242e-07,
"loss": 0.0603,
"step": 938
},
{
"epoch": 0.6683274021352313,
"grad_norm": 5.244168812482439,
"learning_rate": 6.857562408223201e-07,
"loss": 0.0201,
"step": 939
},
{
"epoch": 0.6690391459074733,
"grad_norm": 3.816794681206833,
"learning_rate": 6.842878120411159e-07,
"loss": 0.0043,
"step": 940
},
{
"epoch": 0.6697508896797153,
"grad_norm": 3.5284472119876193,
"learning_rate": 6.828193832599118e-07,
"loss": 0.0502,
"step": 941
},
{
"epoch": 0.6704626334519573,
"grad_norm": 3.933868913771442,
"learning_rate": 6.813509544787077e-07,
"loss": -0.0462,
"step": 942
},
{
"epoch": 0.6711743772241993,
"grad_norm": 4.004455469191327,
"learning_rate": 6.798825256975035e-07,
"loss": 0.0586,
"step": 943
},
{
"epoch": 0.6718861209964413,
"grad_norm": 6.617395318921544,
"learning_rate": 6.784140969162996e-07,
"loss": 0.1136,
"step": 944
},
{
"epoch": 0.6725978647686833,
"grad_norm": 5.621511437204682,
"learning_rate": 6.769456681350955e-07,
"loss": 0.1687,
"step": 945
},
{
"epoch": 0.6733096085409253,
"grad_norm": 3.562357447934307,
"learning_rate": 6.754772393538914e-07,
"loss": 0.0862,
"step": 946
},
{
"epoch": 0.6740213523131673,
"grad_norm": 6.125694391242706,
"learning_rate": 6.740088105726872e-07,
"loss": 0.1892,
"step": 947
},
{
"epoch": 0.6747330960854092,
"grad_norm": 3.9863843451296472,
"learning_rate": 6.725403817914831e-07,
"loss": -0.0151,
"step": 948
},
{
"epoch": 0.6754448398576512,
"grad_norm": 4.2001625029598,
"learning_rate": 6.71071953010279e-07,
"loss": 0.0487,
"step": 949
},
{
"epoch": 0.6761565836298933,
"grad_norm": 4.935771398113328,
"learning_rate": 6.696035242290749e-07,
"loss": 0.0966,
"step": 950
},
{
"epoch": 0.6768683274021352,
"grad_norm": 3.858605435514549,
"learning_rate": 6.681350954478707e-07,
"loss": -0.0329,
"step": 951
},
{
"epoch": 0.6775800711743772,
"grad_norm": 3.871614515457192,
"learning_rate": 6.666666666666666e-07,
"loss": 0.0103,
"step": 952
},
{
"epoch": 0.6782918149466192,
"grad_norm": 4.820372050951537,
"learning_rate": 6.651982378854625e-07,
"loss": -0.0042,
"step": 953
},
{
"epoch": 0.6790035587188612,
"grad_norm": 4.562317308922618,
"learning_rate": 6.637298091042585e-07,
"loss": 0.1401,
"step": 954
},
{
"epoch": 0.6797153024911032,
"grad_norm": 8.34284882554388,
"learning_rate": 6.622613803230543e-07,
"loss": 0.1722,
"step": 955
},
{
"epoch": 0.6804270462633452,
"grad_norm": 3.5720720692334655,
"learning_rate": 6.607929515418502e-07,
"loss": 0.0558,
"step": 956
},
{
"epoch": 0.6811387900355872,
"grad_norm": 5.725857679596323,
"learning_rate": 6.593245227606461e-07,
"loss": 0.126,
"step": 957
},
{
"epoch": 0.6818505338078292,
"grad_norm": 10.288907153463756,
"learning_rate": 6.578560939794419e-07,
"loss": 0.2195,
"step": 958
},
{
"epoch": 0.6825622775800712,
"grad_norm": 5.933298009417249,
"learning_rate": 6.563876651982378e-07,
"loss": 0.119,
"step": 959
},
{
"epoch": 0.6832740213523132,
"grad_norm": 5.129065041215746,
"learning_rate": 6.549192364170337e-07,
"loss": -0.0546,
"step": 960
},
{
"epoch": 0.6839857651245551,
"grad_norm": 4.000375253241631,
"learning_rate": 6.534508076358297e-07,
"loss": 0.174,
"step": 961
},
{
"epoch": 0.6846975088967971,
"grad_norm": 7.894596482752509,
"learning_rate": 6.519823788546255e-07,
"loss": 0.1145,
"step": 962
},
{
"epoch": 0.6854092526690392,
"grad_norm": 4.82797863192124,
"learning_rate": 6.505139500734214e-07,
"loss": 0.0619,
"step": 963
},
{
"epoch": 0.6861209964412811,
"grad_norm": 6.221309269428655,
"learning_rate": 6.490455212922174e-07,
"loss": 0.0239,
"step": 964
},
{
"epoch": 0.6868327402135231,
"grad_norm": 4.86341008394864,
"learning_rate": 6.475770925110132e-07,
"loss": 0.086,
"step": 965
},
{
"epoch": 0.6875444839857652,
"grad_norm": 5.337886337127806,
"learning_rate": 6.461086637298091e-07,
"loss": 0.1969,
"step": 966
},
{
"epoch": 0.6882562277580071,
"grad_norm": 3.769358829646403,
"learning_rate": 6.44640234948605e-07,
"loss": 0.0967,
"step": 967
},
{
"epoch": 0.6889679715302491,
"grad_norm": 3.669919926032279,
"learning_rate": 6.431718061674009e-07,
"loss": 0.0377,
"step": 968
},
{
"epoch": 0.6896797153024911,
"grad_norm": 4.883374186443654,
"learning_rate": 6.417033773861967e-07,
"loss": 0.1519,
"step": 969
},
{
"epoch": 0.6903914590747331,
"grad_norm": 4.748723511086028,
"learning_rate": 6.402349486049926e-07,
"loss": 0.0907,
"step": 970
},
{
"epoch": 0.6911032028469751,
"grad_norm": 4.976284049252291,
"learning_rate": 6.387665198237885e-07,
"loss": -0.0896,
"step": 971
},
{
"epoch": 0.691814946619217,
"grad_norm": 5.283750585371445,
"learning_rate": 6.372980910425843e-07,
"loss": 0.0419,
"step": 972
},
{
"epoch": 0.6925266903914591,
"grad_norm": 5.255946107260031,
"learning_rate": 6.358296622613802e-07,
"loss": 0.0141,
"step": 973
},
{
"epoch": 0.6932384341637011,
"grad_norm": 5.006047787434039,
"learning_rate": 6.343612334801762e-07,
"loss": -0.0255,
"step": 974
},
{
"epoch": 0.693950177935943,
"grad_norm": 9.757667461016755,
"learning_rate": 6.328928046989721e-07,
"loss": 0.1332,
"step": 975
},
{
"epoch": 0.6946619217081851,
"grad_norm": 3.1321392144435065,
"learning_rate": 6.314243759177679e-07,
"loss": 0.081,
"step": 976
},
{
"epoch": 0.695373665480427,
"grad_norm": 2.8216884141577,
"learning_rate": 6.299559471365638e-07,
"loss": -0.0474,
"step": 977
},
{
"epoch": 0.696085409252669,
"grad_norm": 5.573286078737938,
"learning_rate": 6.284875183553597e-07,
"loss": 0.1186,
"step": 978
},
{
"epoch": 0.6967971530249111,
"grad_norm": 3.1582386059675924,
"learning_rate": 6.270190895741557e-07,
"loss": 0.0906,
"step": 979
},
{
"epoch": 0.697508896797153,
"grad_norm": 4.523244376004749,
"learning_rate": 6.255506607929515e-07,
"loss": 0.0386,
"step": 980
},
{
"epoch": 0.698220640569395,
"grad_norm": 3.4688155709332436,
"learning_rate": 6.240822320117474e-07,
"loss": 0.0308,
"step": 981
},
{
"epoch": 0.698932384341637,
"grad_norm": 5.411845227957125,
"learning_rate": 6.226138032305433e-07,
"loss": 0.0835,
"step": 982
},
{
"epoch": 0.699644128113879,
"grad_norm": 3.427770822010646,
"learning_rate": 6.211453744493393e-07,
"loss": 0.208,
"step": 983
},
{
"epoch": 0.700355871886121,
"grad_norm": 4.651382052221419,
"learning_rate": 6.196769456681351e-07,
"loss": 0.0119,
"step": 984
},
{
"epoch": 0.701067615658363,
"grad_norm": 4.988862029987204,
"learning_rate": 6.18208516886931e-07,
"loss": -0.0324,
"step": 985
},
{
"epoch": 0.701779359430605,
"grad_norm": 3.8284184557483334,
"learning_rate": 6.167400881057269e-07,
"loss": -0.0045,
"step": 986
},
{
"epoch": 0.702491103202847,
"grad_norm": 12.378991302169158,
"learning_rate": 6.152716593245227e-07,
"loss": 0.1294,
"step": 987
},
{
"epoch": 0.703202846975089,
"grad_norm": 6.56777945189323,
"learning_rate": 6.138032305433186e-07,
"loss": 0.0573,
"step": 988
},
{
"epoch": 0.703914590747331,
"grad_norm": 4.537732978493385,
"learning_rate": 6.123348017621145e-07,
"loss": 0.0434,
"step": 989
},
{
"epoch": 0.7046263345195729,
"grad_norm": 4.800209501780763,
"learning_rate": 6.108663729809104e-07,
"loss": 0.1148,
"step": 990
},
{
"epoch": 0.7053380782918149,
"grad_norm": 4.436723238161359,
"learning_rate": 6.093979441997062e-07,
"loss": 0.0949,
"step": 991
},
{
"epoch": 0.706049822064057,
"grad_norm": 4.622691635194972,
"learning_rate": 6.079295154185021e-07,
"loss": 0.1296,
"step": 992
},
{
"epoch": 0.7067615658362989,
"grad_norm": 3.1184266192212657,
"learning_rate": 6.064610866372981e-07,
"loss": -0.0653,
"step": 993
},
{
"epoch": 0.7074733096085409,
"grad_norm": 7.086422803719531,
"learning_rate": 6.049926578560939e-07,
"loss": 0.1266,
"step": 994
},
{
"epoch": 0.708185053380783,
"grad_norm": 4.147286982938081,
"learning_rate": 6.035242290748898e-07,
"loss": 0.0978,
"step": 995
},
{
"epoch": 0.7088967971530249,
"grad_norm": 3.494585071868179,
"learning_rate": 6.020558002936858e-07,
"loss": 0.0997,
"step": 996
},
{
"epoch": 0.7096085409252669,
"grad_norm": 5.382206897967533,
"learning_rate": 6.005873715124817e-07,
"loss": 0.0151,
"step": 997
},
{
"epoch": 0.7103202846975089,
"grad_norm": 6.120207176441689,
"learning_rate": 5.991189427312775e-07,
"loss": 0.0475,
"step": 998
},
{
"epoch": 0.7110320284697509,
"grad_norm": 4.805974921203533,
"learning_rate": 5.976505139500734e-07,
"loss": -0.0197,
"step": 999
},
{
"epoch": 0.7117437722419929,
"grad_norm": 11.14135465867464,
"learning_rate": 5.961820851688693e-07,
"loss": 0.1477,
"step": 1000
},
{
"epoch": 0.7124555160142348,
"grad_norm": 6.639000548093658,
"learning_rate": 5.947136563876652e-07,
"loss": 0.1196,
"step": 1001
},
{
"epoch": 0.7131672597864769,
"grad_norm": 3.8551733953168674,
"learning_rate": 5.93245227606461e-07,
"loss": 0.0621,
"step": 1002
},
{
"epoch": 0.7138790035587189,
"grad_norm": 5.225696988138599,
"learning_rate": 5.91776798825257e-07,
"loss": 0.1545,
"step": 1003
},
{
"epoch": 0.7145907473309608,
"grad_norm": 4.353477946786258,
"learning_rate": 5.903083700440529e-07,
"loss": 0.0293,
"step": 1004
},
{
"epoch": 0.7153024911032029,
"grad_norm": 5.739092224858361,
"learning_rate": 5.888399412628487e-07,
"loss": 0.0309,
"step": 1005
},
{
"epoch": 0.7160142348754448,
"grad_norm": 4.288231941227374,
"learning_rate": 5.873715124816446e-07,
"loss": 0.1673,
"step": 1006
},
{
"epoch": 0.7167259786476868,
"grad_norm": 6.874966985449849,
"learning_rate": 5.859030837004405e-07,
"loss": 0.179,
"step": 1007
},
{
"epoch": 0.7174377224199289,
"grad_norm": 3.872807774783665,
"learning_rate": 5.844346549192364e-07,
"loss": -0.0374,
"step": 1008
},
{
"epoch": 0.7181494661921708,
"grad_norm": 4.472528527447646,
"learning_rate": 5.829662261380322e-07,
"loss": 0.1206,
"step": 1009
},
{
"epoch": 0.7188612099644128,
"grad_norm": 4.577847937835349,
"learning_rate": 5.814977973568281e-07,
"loss": 0.0201,
"step": 1010
},
{
"epoch": 0.7195729537366548,
"grad_norm": 4.076940605668476,
"learning_rate": 5.80029368575624e-07,
"loss": -0.0147,
"step": 1011
},
{
"epoch": 0.7202846975088968,
"grad_norm": 4.729777807779337,
"learning_rate": 5.7856093979442e-07,
"loss": 0.0282,
"step": 1012
},
{
"epoch": 0.7209964412811388,
"grad_norm": 4.258425271272895,
"learning_rate": 5.770925110132159e-07,
"loss": 0.0342,
"step": 1013
},
{
"epoch": 0.7217081850533807,
"grad_norm": 5.128062751945477,
"learning_rate": 5.756240822320118e-07,
"loss": 0.072,
"step": 1014
},
{
"epoch": 0.7224199288256228,
"grad_norm": 3.325966291631845,
"learning_rate": 5.741556534508077e-07,
"loss": 0.0912,
"step": 1015
},
{
"epoch": 0.7231316725978648,
"grad_norm": 4.127479963710312,
"learning_rate": 5.726872246696035e-07,
"loss": 0.0248,
"step": 1016
},
{
"epoch": 0.7238434163701067,
"grad_norm": 5.122925315168087,
"learning_rate": 5.712187958883994e-07,
"loss": 0.0543,
"step": 1017
},
{
"epoch": 0.7245551601423488,
"grad_norm": 5.82638564758251,
"learning_rate": 5.697503671071953e-07,
"loss": 0.0469,
"step": 1018
},
{
"epoch": 0.7252669039145907,
"grad_norm": 4.348666694898038,
"learning_rate": 5.682819383259912e-07,
"loss": 0.0599,
"step": 1019
},
{
"epoch": 0.7259786476868327,
"grad_norm": 6.187173237505027,
"learning_rate": 5.66813509544787e-07,
"loss": 0.0595,
"step": 1020
},
{
"epoch": 0.7266903914590748,
"grad_norm": 6.884889183288436,
"learning_rate": 5.653450807635829e-07,
"loss": 0.1994,
"step": 1021
},
{
"epoch": 0.7274021352313167,
"grad_norm": 4.530112288138082,
"learning_rate": 5.638766519823789e-07,
"loss": 0.0517,
"step": 1022
},
{
"epoch": 0.7281138790035587,
"grad_norm": 4.690687596197606,
"learning_rate": 5.624082232011747e-07,
"loss": 0.0067,
"step": 1023
},
{
"epoch": 0.7288256227758008,
"grad_norm": 4.500960353525001,
"learning_rate": 5.609397944199706e-07,
"loss": 0.0875,
"step": 1024
},
{
"epoch": 0.7295373665480427,
"grad_norm": 5.880851591036031,
"learning_rate": 5.594713656387665e-07,
"loss": 0.0519,
"step": 1025
},
{
"epoch": 0.7302491103202847,
"grad_norm": 3.5596931509583087,
"learning_rate": 5.580029368575624e-07,
"loss": 0.0276,
"step": 1026
},
{
"epoch": 0.7309608540925266,
"grad_norm": 8.68598760160492,
"learning_rate": 5.565345080763582e-07,
"loss": 0.047,
"step": 1027
},
{
"epoch": 0.7316725978647687,
"grad_norm": 3.402191684272178,
"learning_rate": 5.550660792951541e-07,
"loss": 0.0965,
"step": 1028
},
{
"epoch": 0.7323843416370107,
"grad_norm": 4.095134238999353,
"learning_rate": 5.5359765051395e-07,
"loss": 0.0581,
"step": 1029
},
{
"epoch": 0.7330960854092526,
"grad_norm": 4.794722580506972,
"learning_rate": 5.521292217327459e-07,
"loss": 0.0635,
"step": 1030
},
{
"epoch": 0.7338078291814947,
"grad_norm": 6.070597795338894,
"learning_rate": 5.506607929515418e-07,
"loss": 0.0225,
"step": 1031
},
{
"epoch": 0.7345195729537367,
"grad_norm": 4.129191494330407,
"learning_rate": 5.491923641703378e-07,
"loss": -0.0081,
"step": 1032
},
{
"epoch": 0.7352313167259786,
"grad_norm": 3.571695740107198,
"learning_rate": 5.477239353891337e-07,
"loss": 0.0014,
"step": 1033
},
{
"epoch": 0.7359430604982207,
"grad_norm": 5.510492373585215,
"learning_rate": 5.462555066079295e-07,
"loss": 0.1682,
"step": 1034
},
{
"epoch": 0.7366548042704626,
"grad_norm": 5.0145359461275065,
"learning_rate": 5.447870778267254e-07,
"loss": 0.1367,
"step": 1035
},
{
"epoch": 0.7373665480427046,
"grad_norm": 6.322219768330761,
"learning_rate": 5.433186490455213e-07,
"loss": 0.0956,
"step": 1036
},
{
"epoch": 0.7380782918149467,
"grad_norm": 5.761999463500739,
"learning_rate": 5.418502202643172e-07,
"loss": 0.1635,
"step": 1037
},
{
"epoch": 0.7387900355871886,
"grad_norm": 4.837240656719542,
"learning_rate": 5.40381791483113e-07,
"loss": 0.0648,
"step": 1038
},
{
"epoch": 0.7395017793594306,
"grad_norm": 4.030925175791179,
"learning_rate": 5.389133627019089e-07,
"loss": 0.1624,
"step": 1039
},
{
"epoch": 0.7402135231316725,
"grad_norm": 4.088053226654463,
"learning_rate": 5.374449339207048e-07,
"loss": 0.0263,
"step": 1040
},
{
"epoch": 0.7409252669039146,
"grad_norm": 3.348281926516349,
"learning_rate": 5.359765051395006e-07,
"loss": 0.029,
"step": 1041
},
{
"epoch": 0.7416370106761566,
"grad_norm": 4.1162599390988985,
"learning_rate": 5.345080763582966e-07,
"loss": -0.0179,
"step": 1042
},
{
"epoch": 0.7423487544483985,
"grad_norm": 5.81149250445998,
"learning_rate": 5.330396475770925e-07,
"loss": 0.1455,
"step": 1043
},
{
"epoch": 0.7430604982206406,
"grad_norm": 3.1102540580282114,
"learning_rate": 5.315712187958884e-07,
"loss": 0.0614,
"step": 1044
},
{
"epoch": 0.7437722419928826,
"grad_norm": 2.6760635115723153,
"learning_rate": 5.301027900146842e-07,
"loss": 0.0029,
"step": 1045
},
{
"epoch": 0.7444839857651245,
"grad_norm": 5.107803896444269,
"learning_rate": 5.286343612334801e-07,
"loss": -0.0163,
"step": 1046
},
{
"epoch": 0.7451957295373666,
"grad_norm": 3.5529373052269086,
"learning_rate": 5.27165932452276e-07,
"loss": 0.1096,
"step": 1047
},
{
"epoch": 0.7459074733096085,
"grad_norm": 6.545845624530535,
"learning_rate": 5.256975036710719e-07,
"loss": 0.1949,
"step": 1048
},
{
"epoch": 0.7466192170818505,
"grad_norm": 4.83801737539602,
"learning_rate": 5.242290748898678e-07,
"loss": 0.0639,
"step": 1049
},
{
"epoch": 0.7473309608540926,
"grad_norm": 3.6609832904083195,
"learning_rate": 5.227606461086637e-07,
"loss": 0.0275,
"step": 1050
},
{
"epoch": 0.7480427046263345,
"grad_norm": 3.259977284685812,
"learning_rate": 5.212922173274597e-07,
"loss": 0.1671,
"step": 1051
},
{
"epoch": 0.7487544483985765,
"grad_norm": 4.963751239762047,
"learning_rate": 5.198237885462556e-07,
"loss": 0.0218,
"step": 1052
},
{
"epoch": 0.7494661921708186,
"grad_norm": 4.204189690387292,
"learning_rate": 5.183553597650514e-07,
"loss": -0.0142,
"step": 1053
},
{
"epoch": 0.7501779359430605,
"grad_norm": 4.215976736044352,
"learning_rate": 5.168869309838473e-07,
"loss": 0.0902,
"step": 1054
},
{
"epoch": 0.7508896797153025,
"grad_norm": 4.1399549409929755,
"learning_rate": 5.154185022026432e-07,
"loss": 0.0241,
"step": 1055
},
{
"epoch": 0.7516014234875444,
"grad_norm": 3.879652737090344,
"learning_rate": 5.13950073421439e-07,
"loss": -0.009,
"step": 1056
},
{
"epoch": 0.7523131672597865,
"grad_norm": 4.668346897274869,
"learning_rate": 5.124816446402349e-07,
"loss": 0.097,
"step": 1057
},
{
"epoch": 0.7530249110320285,
"grad_norm": 4.588309667165746,
"learning_rate": 5.110132158590308e-07,
"loss": 0.0533,
"step": 1058
},
{
"epoch": 0.7537366548042704,
"grad_norm": 6.046628330926524,
"learning_rate": 5.095447870778267e-07,
"loss": 0.0351,
"step": 1059
},
{
"epoch": 0.7544483985765125,
"grad_norm": 3.739341465912661,
"learning_rate": 5.080763582966225e-07,
"loss": -0.0943,
"step": 1060
},
{
"epoch": 0.7551601423487544,
"grad_norm": 4.556068381207451,
"learning_rate": 5.066079295154185e-07,
"loss": 0.06,
"step": 1061
},
{
"epoch": 0.7558718861209964,
"grad_norm": 4.202334872149713,
"learning_rate": 5.051395007342144e-07,
"loss": -0.0082,
"step": 1062
},
{
"epoch": 0.7565836298932385,
"grad_norm": 5.125659993171668,
"learning_rate": 5.036710719530102e-07,
"loss": 0.0845,
"step": 1063
},
{
"epoch": 0.7572953736654804,
"grad_norm": 5.27421670313849,
"learning_rate": 5.022026431718061e-07,
"loss": -0.001,
"step": 1064
},
{
"epoch": 0.7580071174377224,
"grad_norm": 4.368187532248877,
"learning_rate": 5.00734214390602e-07,
"loss": 0.0348,
"step": 1065
},
{
"epoch": 0.7587188612099645,
"grad_norm": 3.855942902787289,
"learning_rate": 4.99265785609398e-07,
"loss": -0.0982,
"step": 1066
},
{
"epoch": 0.7594306049822064,
"grad_norm": 4.977053248284057,
"learning_rate": 4.977973568281938e-07,
"loss": 0.0883,
"step": 1067
},
{
"epoch": 0.7601423487544484,
"grad_norm": 4.289193239981424,
"learning_rate": 4.963289280469897e-07,
"loss": 0.0051,
"step": 1068
},
{
"epoch": 0.7608540925266903,
"grad_norm": 4.006762034148149,
"learning_rate": 4.948604992657856e-07,
"loss": -0.002,
"step": 1069
},
{
"epoch": 0.7615658362989324,
"grad_norm": 2.808128859625345,
"learning_rate": 4.933920704845815e-07,
"loss": -0.0264,
"step": 1070
},
{
"epoch": 0.7622775800711744,
"grad_norm": 5.882154612844199,
"learning_rate": 4.919236417033773e-07,
"loss": 0.069,
"step": 1071
},
{
"epoch": 0.7629893238434163,
"grad_norm": 4.794409301460062,
"learning_rate": 4.904552129221732e-07,
"loss": -0.0557,
"step": 1072
},
{
"epoch": 0.7637010676156584,
"grad_norm": 4.826240443348902,
"learning_rate": 4.889867841409692e-07,
"loss": 0.0804,
"step": 1073
},
{
"epoch": 0.7644128113879004,
"grad_norm": 3.0164349510884447,
"learning_rate": 4.87518355359765e-07,
"loss": -0.0245,
"step": 1074
},
{
"epoch": 0.7651245551601423,
"grad_norm": 4.518231863704466,
"learning_rate": 4.860499265785609e-07,
"loss": 0.0782,
"step": 1075
},
{
"epoch": 0.7658362989323844,
"grad_norm": 4.864847615787066,
"learning_rate": 4.845814977973568e-07,
"loss": 0.1056,
"step": 1076
},
{
"epoch": 0.7665480427046263,
"grad_norm": 4.773824273405901,
"learning_rate": 4.831130690161527e-07,
"loss": 0.0828,
"step": 1077
},
{
"epoch": 0.7672597864768683,
"grad_norm": 4.336462040695792,
"learning_rate": 4.816446402349486e-07,
"loss": 0.1055,
"step": 1078
},
{
"epoch": 0.7679715302491104,
"grad_norm": 3.805670076736972,
"learning_rate": 4.801762114537445e-07,
"loss": 0.0526,
"step": 1079
},
{
"epoch": 0.7686832740213523,
"grad_norm": 6.332442706210725,
"learning_rate": 4.787077826725404e-07,
"loss": 0.0388,
"step": 1080
},
{
"epoch": 0.7693950177935943,
"grad_norm": 6.059518926680791,
"learning_rate": 4.772393538913363e-07,
"loss": 0.005,
"step": 1081
},
{
"epoch": 0.7701067615658364,
"grad_norm": 5.162646216291588,
"learning_rate": 4.757709251101321e-07,
"loss": 0.0971,
"step": 1082
},
{
"epoch": 0.7708185053380783,
"grad_norm": 3.3748493867399048,
"learning_rate": 4.7430249632892805e-07,
"loss": 0.0727,
"step": 1083
},
{
"epoch": 0.7715302491103203,
"grad_norm": 3.933000567201699,
"learning_rate": 4.728340675477239e-07,
"loss": -0.0607,
"step": 1084
},
{
"epoch": 0.7722419928825622,
"grad_norm": 5.233340215172262,
"learning_rate": 4.713656387665198e-07,
"loss": 0.0636,
"step": 1085
},
{
"epoch": 0.7729537366548043,
"grad_norm": 4.78712650764478,
"learning_rate": 4.6989720998531566e-07,
"loss": 0.0701,
"step": 1086
},
{
"epoch": 0.7736654804270463,
"grad_norm": 3.20609767023997,
"learning_rate": 4.6842878120411153e-07,
"loss": 0.0033,
"step": 1087
},
{
"epoch": 0.7743772241992882,
"grad_norm": 4.914803987244276,
"learning_rate": 4.669603524229075e-07,
"loss": 0.0541,
"step": 1088
},
{
"epoch": 0.7750889679715303,
"grad_norm": 5.8612510017626835,
"learning_rate": 4.654919236417034e-07,
"loss": 0.1272,
"step": 1089
},
{
"epoch": 0.7758007117437722,
"grad_norm": 4.283141226610423,
"learning_rate": 4.6402349486049925e-07,
"loss": 0.1489,
"step": 1090
},
{
"epoch": 0.7765124555160142,
"grad_norm": 4.109306305865953,
"learning_rate": 4.625550660792951e-07,
"loss": -0.0282,
"step": 1091
},
{
"epoch": 0.7772241992882563,
"grad_norm": 12.656682211948667,
"learning_rate": 4.61086637298091e-07,
"loss": 0.0796,
"step": 1092
},
{
"epoch": 0.7779359430604982,
"grad_norm": 4.479936163992436,
"learning_rate": 4.596182085168869e-07,
"loss": -0.009,
"step": 1093
},
{
"epoch": 0.7786476868327402,
"grad_norm": 8.41345994752303,
"learning_rate": 4.581497797356828e-07,
"loss": 0.0569,
"step": 1094
},
{
"epoch": 0.7793594306049823,
"grad_norm": 3.2918309681766584,
"learning_rate": 4.5668135095447866e-07,
"loss": 0.0434,
"step": 1095
},
{
"epoch": 0.7800711743772242,
"grad_norm": 7.22334810268064,
"learning_rate": 4.5521292217327454e-07,
"loss": 0.0356,
"step": 1096
},
{
"epoch": 0.7807829181494662,
"grad_norm": 4.178584158182841,
"learning_rate": 4.5374449339207046e-07,
"loss": 0.0572,
"step": 1097
},
{
"epoch": 0.7814946619217081,
"grad_norm": 7.574573557719195,
"learning_rate": 4.522760646108664e-07,
"loss": 0.023,
"step": 1098
},
{
"epoch": 0.7822064056939502,
"grad_norm": 4.004238029317759,
"learning_rate": 4.5080763582966226e-07,
"loss": 0.0766,
"step": 1099
},
{
"epoch": 0.7829181494661922,
"grad_norm": 2.7957439160059487,
"learning_rate": 4.4933920704845813e-07,
"loss": 0.004,
"step": 1100
},
{
"epoch": 0.7836298932384341,
"grad_norm": 4.801530895880167,
"learning_rate": 4.47870778267254e-07,
"loss": -0.0173,
"step": 1101
},
{
"epoch": 0.7843416370106762,
"grad_norm": 5.397057098691664,
"learning_rate": 4.464023494860499e-07,
"loss": 0.214,
"step": 1102
},
{
"epoch": 0.7850533807829182,
"grad_norm": 4.48820921837597,
"learning_rate": 4.449339207048458e-07,
"loss": 0.0879,
"step": 1103
},
{
"epoch": 0.7857651245551601,
"grad_norm": 5.375386246398824,
"learning_rate": 4.4346549192364167e-07,
"loss": 0.0434,
"step": 1104
},
{
"epoch": 0.7864768683274022,
"grad_norm": 3.3797885690956395,
"learning_rate": 4.419970631424376e-07,
"loss": -0.1084,
"step": 1105
},
{
"epoch": 0.7871886120996441,
"grad_norm": 4.5641016826887055,
"learning_rate": 4.4052863436123346e-07,
"loss": 0.1856,
"step": 1106
},
{
"epoch": 0.7879003558718861,
"grad_norm": 5.042023680265534,
"learning_rate": 4.390602055800294e-07,
"loss": 0.0168,
"step": 1107
},
{
"epoch": 0.7886120996441282,
"grad_norm": 6.028319917002458,
"learning_rate": 4.3759177679882526e-07,
"loss": 0.1204,
"step": 1108
},
{
"epoch": 0.7893238434163701,
"grad_norm": 4.695297266575226,
"learning_rate": 4.3612334801762113e-07,
"loss": 0.0593,
"step": 1109
},
{
"epoch": 0.7900355871886121,
"grad_norm": 4.941555048093157,
"learning_rate": 4.34654919236417e-07,
"loss": -0.0071,
"step": 1110
},
{
"epoch": 0.7907473309608541,
"grad_norm": 3.807724259723377,
"learning_rate": 4.3318649045521287e-07,
"loss": -0.1208,
"step": 1111
},
{
"epoch": 0.7914590747330961,
"grad_norm": 5.959791240748352,
"learning_rate": 4.317180616740088e-07,
"loss": 0.0559,
"step": 1112
},
{
"epoch": 0.7921708185053381,
"grad_norm": 4.177561338603296,
"learning_rate": 4.3024963289280467e-07,
"loss": 0.0331,
"step": 1113
},
{
"epoch": 0.79288256227758,
"grad_norm": 4.257159317730894,
"learning_rate": 4.287812041116006e-07,
"loss": 0.0398,
"step": 1114
},
{
"epoch": 0.7935943060498221,
"grad_norm": 5.799847028622386,
"learning_rate": 4.2731277533039646e-07,
"loss": 0.0275,
"step": 1115
},
{
"epoch": 0.7943060498220641,
"grad_norm": 4.0517315683275745,
"learning_rate": 4.2584434654919234e-07,
"loss": -0.0107,
"step": 1116
},
{
"epoch": 0.795017793594306,
"grad_norm": 5.283166842239091,
"learning_rate": 4.2437591776798826e-07,
"loss": 0.1154,
"step": 1117
},
{
"epoch": 0.7957295373665481,
"grad_norm": 4.4525632269695095,
"learning_rate": 4.2290748898678413e-07,
"loss": 0.0285,
"step": 1118
},
{
"epoch": 0.79644128113879,
"grad_norm": 3.550895932467998,
"learning_rate": 4.2143906020558e-07,
"loss": -0.0089,
"step": 1119
},
{
"epoch": 0.797153024911032,
"grad_norm": 5.442417070057488,
"learning_rate": 4.199706314243759e-07,
"loss": 0.1652,
"step": 1120
},
{
"epoch": 0.797864768683274,
"grad_norm": 4.130251460513657,
"learning_rate": 4.1850220264317175e-07,
"loss": 0.0325,
"step": 1121
},
{
"epoch": 0.798576512455516,
"grad_norm": 6.459392468964495,
"learning_rate": 4.1703377386196767e-07,
"loss": 0.1389,
"step": 1122
},
{
"epoch": 0.799288256227758,
"grad_norm": 3.8430552281131902,
"learning_rate": 4.155653450807636e-07,
"loss": 0.0573,
"step": 1123
},
{
"epoch": 0.8,
"grad_norm": 4.7470455016002004,
"learning_rate": 4.1409691629955947e-07,
"loss": 0.0415,
"step": 1124
},
{
"epoch": 0.800711743772242,
"grad_norm": 4.652746123906856,
"learning_rate": 4.1262848751835534e-07,
"loss": 0.1245,
"step": 1125
},
{
"epoch": 0.801423487544484,
"grad_norm": 8.278977337768074,
"learning_rate": 4.111600587371512e-07,
"loss": 0.1232,
"step": 1126
},
{
"epoch": 0.8021352313167259,
"grad_norm": 7.110397902459385,
"learning_rate": 4.0969162995594713e-07,
"loss": 0.0488,
"step": 1127
},
{
"epoch": 0.802846975088968,
"grad_norm": 7.176907903098698,
"learning_rate": 4.08223201174743e-07,
"loss": 0.0988,
"step": 1128
},
{
"epoch": 0.80355871886121,
"grad_norm": 5.053592710724586,
"learning_rate": 4.067547723935389e-07,
"loss": 0.2123,
"step": 1129
},
{
"epoch": 0.8042704626334519,
"grad_norm": 4.508350465827772,
"learning_rate": 4.0528634361233475e-07,
"loss": -0.0135,
"step": 1130
},
{
"epoch": 0.804982206405694,
"grad_norm": 3.865882853833389,
"learning_rate": 4.038179148311306e-07,
"loss": -0.0057,
"step": 1131
},
{
"epoch": 0.805693950177936,
"grad_norm": 6.146207257118892,
"learning_rate": 4.023494860499266e-07,
"loss": 0.0603,
"step": 1132
},
{
"epoch": 0.8064056939501779,
"grad_norm": 6.3975962177776315,
"learning_rate": 4.0088105726872247e-07,
"loss": 0.0544,
"step": 1133
},
{
"epoch": 0.80711743772242,
"grad_norm": 4.223977474195932,
"learning_rate": 3.9941262848751834e-07,
"loss": -0.0547,
"step": 1134
},
{
"epoch": 0.8078291814946619,
"grad_norm": 3.5733018376494896,
"learning_rate": 3.979441997063142e-07,
"loss": 0.042,
"step": 1135
},
{
"epoch": 0.8085409252669039,
"grad_norm": 4.780337776475858,
"learning_rate": 3.964757709251101e-07,
"loss": 0.0426,
"step": 1136
},
{
"epoch": 0.8092526690391459,
"grad_norm": 3.389181647310211,
"learning_rate": 3.95007342143906e-07,
"loss": 0.1313,
"step": 1137
},
{
"epoch": 0.8099644128113879,
"grad_norm": 5.303930753908911,
"learning_rate": 3.935389133627019e-07,
"loss": 0.0883,
"step": 1138
},
{
"epoch": 0.8106761565836299,
"grad_norm": 5.272766160971037,
"learning_rate": 3.9207048458149775e-07,
"loss": 0.0267,
"step": 1139
},
{
"epoch": 0.8113879003558719,
"grad_norm": 5.986879944998904,
"learning_rate": 3.906020558002937e-07,
"loss": 0.0682,
"step": 1140
},
{
"epoch": 0.8120996441281139,
"grad_norm": 5.61959460469509,
"learning_rate": 3.891336270190896e-07,
"loss": 0.0423,
"step": 1141
},
{
"epoch": 0.8128113879003559,
"grad_norm": 6.0012730201640405,
"learning_rate": 3.8766519823788547e-07,
"loss": 0.1035,
"step": 1142
},
{
"epoch": 0.8135231316725978,
"grad_norm": 3.5381491453003573,
"learning_rate": 3.8619676945668134e-07,
"loss": 0.0021,
"step": 1143
},
{
"epoch": 0.8142348754448399,
"grad_norm": 5.831482766300704,
"learning_rate": 3.847283406754772e-07,
"loss": 0.0062,
"step": 1144
},
{
"epoch": 0.8149466192170819,
"grad_norm": 3.4472659016465896,
"learning_rate": 3.832599118942731e-07,
"loss": 0.1066,
"step": 1145
},
{
"epoch": 0.8156583629893238,
"grad_norm": 3.5852129833847344,
"learning_rate": 3.81791483113069e-07,
"loss": 0.055,
"step": 1146
},
{
"epoch": 0.8163701067615659,
"grad_norm": 3.9696159319224646,
"learning_rate": 3.803230543318649e-07,
"loss": 0.0708,
"step": 1147
},
{
"epoch": 0.8170818505338078,
"grad_norm": 6.360338292751588,
"learning_rate": 3.7885462555066075e-07,
"loss": -0.0076,
"step": 1148
},
{
"epoch": 0.8177935943060498,
"grad_norm": 5.288302435982548,
"learning_rate": 3.773861967694567e-07,
"loss": 0.026,
"step": 1149
},
{
"epoch": 0.8185053380782918,
"grad_norm": 7.675548660598207,
"learning_rate": 3.7591776798825255e-07,
"loss": 0.1351,
"step": 1150
},
{
"epoch": 0.8192170818505338,
"grad_norm": 7.627203951141249,
"learning_rate": 3.744493392070485e-07,
"loss": 0.0791,
"step": 1151
},
{
"epoch": 0.8199288256227758,
"grad_norm": 5.0203299157902865,
"learning_rate": 3.7298091042584435e-07,
"loss": 0.0489,
"step": 1152
},
{
"epoch": 0.8206405693950178,
"grad_norm": 9.704403017560951,
"learning_rate": 3.715124816446402e-07,
"loss": 0.2749,
"step": 1153
},
{
"epoch": 0.8213523131672598,
"grad_norm": 5.885301040818936,
"learning_rate": 3.700440528634361e-07,
"loss": 0.121,
"step": 1154
},
{
"epoch": 0.8220640569395018,
"grad_norm": 3.541970782219233,
"learning_rate": 3.6857562408223196e-07,
"loss": -0.0055,
"step": 1155
},
{
"epoch": 0.8227758007117437,
"grad_norm": 4.392345376503781,
"learning_rate": 3.671071953010279e-07,
"loss": 0.0054,
"step": 1156
},
{
"epoch": 0.8234875444839858,
"grad_norm": 3.801961623381928,
"learning_rate": 3.6563876651982376e-07,
"loss": 0.1504,
"step": 1157
},
{
"epoch": 0.8241992882562278,
"grad_norm": 3.9349614997734994,
"learning_rate": 3.641703377386197e-07,
"loss": -0.0417,
"step": 1158
},
{
"epoch": 0.8249110320284697,
"grad_norm": 5.0205635642024555,
"learning_rate": 3.6270190895741555e-07,
"loss": 0.0424,
"step": 1159
},
{
"epoch": 0.8256227758007118,
"grad_norm": 4.786843811164654,
"learning_rate": 3.612334801762114e-07,
"loss": 0.2124,
"step": 1160
},
{
"epoch": 0.8263345195729538,
"grad_norm": 5.1614117304304825,
"learning_rate": 3.5976505139500735e-07,
"loss": 0.0754,
"step": 1161
},
{
"epoch": 0.8270462633451957,
"grad_norm": 4.045083138987178,
"learning_rate": 3.582966226138032e-07,
"loss": 0.0696,
"step": 1162
},
{
"epoch": 0.8277580071174377,
"grad_norm": 4.064066423519107,
"learning_rate": 3.568281938325991e-07,
"loss": 0.0532,
"step": 1163
},
{
"epoch": 0.8284697508896797,
"grad_norm": 4.849379092712681,
"learning_rate": 3.5535976505139496e-07,
"loss": 0.0967,
"step": 1164
},
{
"epoch": 0.8291814946619217,
"grad_norm": 5.927734053233105,
"learning_rate": 3.5389133627019083e-07,
"loss": 0.0504,
"step": 1165
},
{
"epoch": 0.8298932384341637,
"grad_norm": 3.2137744557515866,
"learning_rate": 3.524229074889868e-07,
"loss": -0.0482,
"step": 1166
},
{
"epoch": 0.8306049822064057,
"grad_norm": 4.127693683925132,
"learning_rate": 3.509544787077827e-07,
"loss": -0.0798,
"step": 1167
},
{
"epoch": 0.8313167259786477,
"grad_norm": 6.74093443010439,
"learning_rate": 3.4948604992657856e-07,
"loss": 0.0956,
"step": 1168
},
{
"epoch": 0.8320284697508897,
"grad_norm": 6.502392255278353,
"learning_rate": 3.4801762114537443e-07,
"loss": 0.0345,
"step": 1169
},
{
"epoch": 0.8327402135231317,
"grad_norm": 9.415956678603605,
"learning_rate": 3.465491923641703e-07,
"loss": 0.026,
"step": 1170
},
{
"epoch": 0.8334519572953737,
"grad_norm": 4.913570312692537,
"learning_rate": 3.450807635829662e-07,
"loss": -0.0465,
"step": 1171
},
{
"epoch": 0.8341637010676156,
"grad_norm": 4.883220100174979,
"learning_rate": 3.436123348017621e-07,
"loss": 0.1425,
"step": 1172
},
{
"epoch": 0.8348754448398576,
"grad_norm": 4.353421209791031,
"learning_rate": 3.4214390602055797e-07,
"loss": 0.0989,
"step": 1173
},
{
"epoch": 0.8355871886120997,
"grad_norm": 5.625728136195165,
"learning_rate": 3.4067547723935384e-07,
"loss": 0.0988,
"step": 1174
},
{
"epoch": 0.8362989323843416,
"grad_norm": 5.402349045889487,
"learning_rate": 3.392070484581498e-07,
"loss": 0.0214,
"step": 1175
},
{
"epoch": 0.8370106761565836,
"grad_norm": 6.621572507580396,
"learning_rate": 3.377386196769457e-07,
"loss": 0.1132,
"step": 1176
},
{
"epoch": 0.8377224199288256,
"grad_norm": 6.275645626335489,
"learning_rate": 3.3627019089574156e-07,
"loss": 0.1082,
"step": 1177
},
{
"epoch": 0.8384341637010676,
"grad_norm": 3.2447733299427126,
"learning_rate": 3.3480176211453743e-07,
"loss": 0.0059,
"step": 1178
},
{
"epoch": 0.8391459074733096,
"grad_norm": 4.915025189200319,
"learning_rate": 3.333333333333333e-07,
"loss": 0.0367,
"step": 1179
},
{
"epoch": 0.8398576512455516,
"grad_norm": 5.089077918990016,
"learning_rate": 3.318649045521292e-07,
"loss": 0.1114,
"step": 1180
},
{
"epoch": 0.8405693950177936,
"grad_norm": 3.7565824593214563,
"learning_rate": 3.303964757709251e-07,
"loss": 0.0784,
"step": 1181
},
{
"epoch": 0.8412811387900356,
"grad_norm": 6.680757635403923,
"learning_rate": 3.2892804698972097e-07,
"loss": 0.0629,
"step": 1182
},
{
"epoch": 0.8419928825622776,
"grad_norm": 10.207636678370621,
"learning_rate": 3.2745961820851684e-07,
"loss": 0.1372,
"step": 1183
},
{
"epoch": 0.8427046263345196,
"grad_norm": 4.724435374922215,
"learning_rate": 3.2599118942731276e-07,
"loss": -0.0382,
"step": 1184
},
{
"epoch": 0.8434163701067615,
"grad_norm": 6.11825595065175,
"learning_rate": 3.245227606461087e-07,
"loss": 0.0118,
"step": 1185
},
{
"epoch": 0.8441281138790035,
"grad_norm": 4.777281267158339,
"learning_rate": 3.2305433186490456e-07,
"loss": 0.1171,
"step": 1186
},
{
"epoch": 0.8448398576512456,
"grad_norm": 3.8501119919378795,
"learning_rate": 3.2158590308370043e-07,
"loss": -0.0024,
"step": 1187
},
{
"epoch": 0.8455516014234875,
"grad_norm": 3.3414392776052138,
"learning_rate": 3.201174743024963e-07,
"loss": 0.1114,
"step": 1188
},
{
"epoch": 0.8462633451957295,
"grad_norm": 7.324431767735627,
"learning_rate": 3.186490455212922e-07,
"loss": 0.083,
"step": 1189
},
{
"epoch": 0.8469750889679716,
"grad_norm": 4.751681381589935,
"learning_rate": 3.171806167400881e-07,
"loss": 0.1198,
"step": 1190
},
{
"epoch": 0.8476868327402135,
"grad_norm": 4.526551224178008,
"learning_rate": 3.1571218795888397e-07,
"loss": 0.0667,
"step": 1191
},
{
"epoch": 0.8483985765124555,
"grad_norm": 5.568011350269368,
"learning_rate": 3.1424375917767984e-07,
"loss": 0.1299,
"step": 1192
},
{
"epoch": 0.8491103202846975,
"grad_norm": 4.716195128541494,
"learning_rate": 3.1277533039647577e-07,
"loss": 0.0358,
"step": 1193
},
{
"epoch": 0.8498220640569395,
"grad_norm": 4.930553523437846,
"learning_rate": 3.1130690161527164e-07,
"loss": 0.0242,
"step": 1194
},
{
"epoch": 0.8505338078291815,
"grad_norm": 4.167079498184484,
"learning_rate": 3.0983847283406756e-07,
"loss": 0.1109,
"step": 1195
},
{
"epoch": 0.8512455516014235,
"grad_norm": 8.865833115177155,
"learning_rate": 3.0837004405286343e-07,
"loss": 0.1126,
"step": 1196
},
{
"epoch": 0.8519572953736655,
"grad_norm": 6.24647575197954,
"learning_rate": 3.069016152716593e-07,
"loss": 0.1096,
"step": 1197
},
{
"epoch": 0.8526690391459075,
"grad_norm": 3.8633019858518334,
"learning_rate": 3.054331864904552e-07,
"loss": 0.0957,
"step": 1198
},
{
"epoch": 0.8533807829181494,
"grad_norm": 4.648285884147683,
"learning_rate": 3.0396475770925105e-07,
"loss": 0.0063,
"step": 1199
},
{
"epoch": 0.8540925266903915,
"grad_norm": 3.5454437460212467,
"learning_rate": 3.0249632892804697e-07,
"loss": 0.1057,
"step": 1200
},
{
"epoch": 0.8548042704626334,
"grad_norm": 3.751280248177442,
"learning_rate": 3.010279001468429e-07,
"loss": 0.0517,
"step": 1201
},
{
"epoch": 0.8555160142348754,
"grad_norm": 4.767039805037044,
"learning_rate": 2.9955947136563877e-07,
"loss": 0.1117,
"step": 1202
},
{
"epoch": 0.8562277580071175,
"grad_norm": 5.2435561097664944,
"learning_rate": 2.9809104258443464e-07,
"loss": -0.0266,
"step": 1203
},
{
"epoch": 0.8569395017793594,
"grad_norm": 4.347412541251751,
"learning_rate": 2.966226138032305e-07,
"loss": 0.157,
"step": 1204
},
{
"epoch": 0.8576512455516014,
"grad_norm": 3.863655095386486,
"learning_rate": 2.9515418502202644e-07,
"loss": 0.0768,
"step": 1205
},
{
"epoch": 0.8583629893238434,
"grad_norm": 5.329615051797061,
"learning_rate": 2.936857562408223e-07,
"loss": 0.0112,
"step": 1206
},
{
"epoch": 0.8590747330960854,
"grad_norm": 4.979521202755483,
"learning_rate": 2.922173274596182e-07,
"loss": 0.0948,
"step": 1207
},
{
"epoch": 0.8597864768683274,
"grad_norm": 3.8038100967899813,
"learning_rate": 2.9074889867841405e-07,
"loss": 0.1154,
"step": 1208
},
{
"epoch": 0.8604982206405694,
"grad_norm": 4.38046881966738,
"learning_rate": 2.8928046989721e-07,
"loss": -0.1282,
"step": 1209
},
{
"epoch": 0.8612099644128114,
"grad_norm": 3.1962156628258764,
"learning_rate": 2.878120411160059e-07,
"loss": -0.0098,
"step": 1210
},
{
"epoch": 0.8619217081850534,
"grad_norm": 3.725578258101522,
"learning_rate": 2.8634361233480177e-07,
"loss": 0.0316,
"step": 1211
},
{
"epoch": 0.8626334519572953,
"grad_norm": 3.162711450797923,
"learning_rate": 2.8487518355359764e-07,
"loss": 0.0854,
"step": 1212
},
{
"epoch": 0.8633451957295374,
"grad_norm": 6.603746980474714,
"learning_rate": 2.834067547723935e-07,
"loss": 0.1426,
"step": 1213
},
{
"epoch": 0.8640569395017793,
"grad_norm": 5.756610884728429,
"learning_rate": 2.8193832599118944e-07,
"loss": -0.0035,
"step": 1214
},
{
"epoch": 0.8647686832740213,
"grad_norm": 7.9170728345042845,
"learning_rate": 2.804698972099853e-07,
"loss": 0.001,
"step": 1215
},
{
"epoch": 0.8654804270462634,
"grad_norm": 3.5693745455760797,
"learning_rate": 2.790014684287812e-07,
"loss": -0.0611,
"step": 1216
},
{
"epoch": 0.8661921708185053,
"grad_norm": 5.3110198707848095,
"learning_rate": 2.7753303964757705e-07,
"loss": 0.0459,
"step": 1217
},
{
"epoch": 0.8669039145907473,
"grad_norm": 6.727355839358839,
"learning_rate": 2.760646108663729e-07,
"loss": 0.1669,
"step": 1218
},
{
"epoch": 0.8676156583629894,
"grad_norm": 6.640828064450381,
"learning_rate": 2.745961820851689e-07,
"loss": -0.019,
"step": 1219
},
{
"epoch": 0.8683274021352313,
"grad_norm": 3.3816289439070664,
"learning_rate": 2.731277533039648e-07,
"loss": -0.037,
"step": 1220
},
{
"epoch": 0.8690391459074733,
"grad_norm": 3.6714876011147255,
"learning_rate": 2.7165932452276065e-07,
"loss": 0.0014,
"step": 1221
},
{
"epoch": 0.8697508896797153,
"grad_norm": 4.3569622404376664,
"learning_rate": 2.701908957415565e-07,
"loss": 0.0987,
"step": 1222
},
{
"epoch": 0.8704626334519573,
"grad_norm": 6.344404816304247,
"learning_rate": 2.687224669603524e-07,
"loss": 0.0991,
"step": 1223
},
{
"epoch": 0.8711743772241993,
"grad_norm": 5.361162693827478,
"learning_rate": 2.672540381791483e-07,
"loss": 0.0084,
"step": 1224
},
{
"epoch": 0.8718861209964412,
"grad_norm": 7.6540302406353575,
"learning_rate": 2.657856093979442e-07,
"loss": 0.0212,
"step": 1225
},
{
"epoch": 0.8725978647686833,
"grad_norm": 5.700938351675065,
"learning_rate": 2.6431718061674006e-07,
"loss": 0.0438,
"step": 1226
},
{
"epoch": 0.8733096085409253,
"grad_norm": 8.474665866442852,
"learning_rate": 2.6284875183553593e-07,
"loss": 0.092,
"step": 1227
},
{
"epoch": 0.8740213523131672,
"grad_norm": 4.7571553996414595,
"learning_rate": 2.6138032305433185e-07,
"loss": 0.0078,
"step": 1228
},
{
"epoch": 0.8747330960854093,
"grad_norm": 3.8293542250786787,
"learning_rate": 2.599118942731278e-07,
"loss": 0.0439,
"step": 1229
},
{
"epoch": 0.8754448398576512,
"grad_norm": 2.71698186095146,
"learning_rate": 2.5844346549192365e-07,
"loss": -0.0106,
"step": 1230
},
{
"epoch": 0.8761565836298932,
"grad_norm": 4.79252959511404,
"learning_rate": 2.569750367107195e-07,
"loss": 0.0729,
"step": 1231
},
{
"epoch": 0.8768683274021353,
"grad_norm": 3.3103671134823385,
"learning_rate": 2.555066079295154e-07,
"loss": 0.0302,
"step": 1232
},
{
"epoch": 0.8775800711743772,
"grad_norm": 8.561876260356502,
"learning_rate": 2.5403817914831126e-07,
"loss": -0.0087,
"step": 1233
},
{
"epoch": 0.8782918149466192,
"grad_norm": 3.919649138416107,
"learning_rate": 2.525697503671072e-07,
"loss": 0.0727,
"step": 1234
},
{
"epoch": 0.8790035587188612,
"grad_norm": 3.8392293754828977,
"learning_rate": 2.5110132158590306e-07,
"loss": 0.0023,
"step": 1235
},
{
"epoch": 0.8797153024911032,
"grad_norm": 4.715863690361431,
"learning_rate": 2.49632892804699e-07,
"loss": 0.0673,
"step": 1236
},
{
"epoch": 0.8804270462633452,
"grad_norm": 4.228383008890401,
"learning_rate": 2.4816446402349485e-07,
"loss": 0.0424,
"step": 1237
},
{
"epoch": 0.8811387900355871,
"grad_norm": 5.061146307910219,
"learning_rate": 2.466960352422907e-07,
"loss": 0.1331,
"step": 1238
},
{
"epoch": 0.8818505338078292,
"grad_norm": 3.7345466095506494,
"learning_rate": 2.452276064610866e-07,
"loss": 0.2358,
"step": 1239
},
{
"epoch": 0.8825622775800712,
"grad_norm": 5.272468624826541,
"learning_rate": 2.437591776798825e-07,
"loss": 0.1534,
"step": 1240
},
{
"epoch": 0.8832740213523131,
"grad_norm": 3.1883609581358936,
"learning_rate": 2.422907488986784e-07,
"loss": 0.0114,
"step": 1241
},
{
"epoch": 0.8839857651245552,
"grad_norm": 5.573841149011958,
"learning_rate": 2.408223201174743e-07,
"loss": 0.0248,
"step": 1242
},
{
"epoch": 0.8846975088967971,
"grad_norm": 4.224567838156875,
"learning_rate": 2.393538913362702e-07,
"loss": 0.0828,
"step": 1243
},
{
"epoch": 0.8854092526690391,
"grad_norm": 3.9928614575669577,
"learning_rate": 2.3788546255506606e-07,
"loss": 0.0194,
"step": 1244
},
{
"epoch": 0.8861209964412812,
"grad_norm": 6.428441164422629,
"learning_rate": 2.3641703377386196e-07,
"loss": 0.2083,
"step": 1245
},
{
"epoch": 0.8868327402135231,
"grad_norm": 6.2312799481712,
"learning_rate": 2.3494860499265783e-07,
"loss": 0.0811,
"step": 1246
},
{
"epoch": 0.8875444839857651,
"grad_norm": 4.068910675860895,
"learning_rate": 2.3348017621145376e-07,
"loss": 0.0592,
"step": 1247
},
{
"epoch": 0.8882562277580072,
"grad_norm": 5.52784115335928,
"learning_rate": 2.3201174743024963e-07,
"loss": 0.089,
"step": 1248
},
{
"epoch": 0.8889679715302491,
"grad_norm": 4.580499547509674,
"learning_rate": 2.305433186490455e-07,
"loss": 0.1023,
"step": 1249
},
{
"epoch": 0.8896797153024911,
"grad_norm": 7.07961420893908,
"learning_rate": 2.290748898678414e-07,
"loss": 0.1685,
"step": 1250
},
{
"epoch": 0.890391459074733,
"grad_norm": 4.178254582429226,
"learning_rate": 2.2760646108663727e-07,
"loss": 0.0558,
"step": 1251
},
{
"epoch": 0.8911032028469751,
"grad_norm": 8.001441825637583,
"learning_rate": 2.261380323054332e-07,
"loss": 0.1452,
"step": 1252
},
{
"epoch": 0.8918149466192171,
"grad_norm": 3.946074459411681,
"learning_rate": 2.2466960352422906e-07,
"loss": 0.0363,
"step": 1253
},
{
"epoch": 0.892526690391459,
"grad_norm": 8.4028417775753,
"learning_rate": 2.2320117474302496e-07,
"loss": 0.0724,
"step": 1254
},
{
"epoch": 0.8932384341637011,
"grad_norm": 3.9318556520698187,
"learning_rate": 2.2173274596182083e-07,
"loss": -0.0137,
"step": 1255
},
{
"epoch": 0.8939501779359431,
"grad_norm": 3.458687974754859,
"learning_rate": 2.2026431718061673e-07,
"loss": -0.0946,
"step": 1256
},
{
"epoch": 0.894661921708185,
"grad_norm": 6.622453308056072,
"learning_rate": 2.1879588839941263e-07,
"loss": 0.172,
"step": 1257
},
{
"epoch": 0.8953736654804271,
"grad_norm": 3.9673919757254,
"learning_rate": 2.173274596182085e-07,
"loss": 0.0234,
"step": 1258
},
{
"epoch": 0.896085409252669,
"grad_norm": 5.118865458731527,
"learning_rate": 2.158590308370044e-07,
"loss": 0.0551,
"step": 1259
},
{
"epoch": 0.896797153024911,
"grad_norm": 6.76356471453516,
"learning_rate": 2.143906020558003e-07,
"loss": 0.1191,
"step": 1260
},
{
"epoch": 0.8975088967971531,
"grad_norm": 4.301543318112993,
"learning_rate": 2.1292217327459617e-07,
"loss": 0.1043,
"step": 1261
},
{
"epoch": 0.898220640569395,
"grad_norm": 4.590072093721571,
"learning_rate": 2.1145374449339207e-07,
"loss": 0.0605,
"step": 1262
},
{
"epoch": 0.898932384341637,
"grad_norm": 3.055191505785509,
"learning_rate": 2.0998531571218794e-07,
"loss": 0.031,
"step": 1263
},
{
"epoch": 0.899644128113879,
"grad_norm": 4.323248915562587,
"learning_rate": 2.0851688693098384e-07,
"loss": 0.0008,
"step": 1264
},
{
"epoch": 0.900355871886121,
"grad_norm": 4.7286854413693,
"learning_rate": 2.0704845814977973e-07,
"loss": -0.0549,
"step": 1265
},
{
"epoch": 0.901067615658363,
"grad_norm": 3.99503709950128,
"learning_rate": 2.055800293685756e-07,
"loss": 0.0984,
"step": 1266
},
{
"epoch": 0.9017793594306049,
"grad_norm": 6.636398298712216,
"learning_rate": 2.041116005873715e-07,
"loss": 0.082,
"step": 1267
},
{
"epoch": 0.902491103202847,
"grad_norm": 4.989754670558792,
"learning_rate": 2.0264317180616737e-07,
"loss": 0.0104,
"step": 1268
},
{
"epoch": 0.903202846975089,
"grad_norm": 6.410286278483913,
"learning_rate": 2.011747430249633e-07,
"loss": 0.0933,
"step": 1269
},
{
"epoch": 0.9039145907473309,
"grad_norm": 4.920601816354115,
"learning_rate": 1.9970631424375917e-07,
"loss": -0.0104,
"step": 1270
},
{
"epoch": 0.904626334519573,
"grad_norm": 4.892842822804742,
"learning_rate": 1.9823788546255504e-07,
"loss": 0.0739,
"step": 1271
},
{
"epoch": 0.9053380782918149,
"grad_norm": 6.840931769367717,
"learning_rate": 1.9676945668135094e-07,
"loss": 0.2031,
"step": 1272
},
{
"epoch": 0.9060498220640569,
"grad_norm": 4.812134766033649,
"learning_rate": 1.9530102790014684e-07,
"loss": 0.2425,
"step": 1273
},
{
"epoch": 0.906761565836299,
"grad_norm": 5.42050215080619,
"learning_rate": 1.9383259911894274e-07,
"loss": 0.0727,
"step": 1274
},
{
"epoch": 0.9074733096085409,
"grad_norm": 3.8417355600883645,
"learning_rate": 1.923641703377386e-07,
"loss": -0.0279,
"step": 1275
},
{
"epoch": 0.9081850533807829,
"grad_norm": 13.373758133606684,
"learning_rate": 1.908957415565345e-07,
"loss": -0.0461,
"step": 1276
},
{
"epoch": 0.908896797153025,
"grad_norm": 5.112087637912734,
"learning_rate": 1.8942731277533038e-07,
"loss": 0.0965,
"step": 1277
},
{
"epoch": 0.9096085409252669,
"grad_norm": 4.630412671872866,
"learning_rate": 1.8795888399412628e-07,
"loss": 0.1044,
"step": 1278
},
{
"epoch": 0.9103202846975089,
"grad_norm": 3.9406530683275625,
"learning_rate": 1.8649045521292217e-07,
"loss": 0.1613,
"step": 1279
},
{
"epoch": 0.9110320284697508,
"grad_norm": 6.648033551183105,
"learning_rate": 1.8502202643171804e-07,
"loss": -0.025,
"step": 1280
},
{
"epoch": 0.9117437722419929,
"grad_norm": 4.950505764486973,
"learning_rate": 1.8355359765051394e-07,
"loss": 0.1267,
"step": 1281
},
{
"epoch": 0.9124555160142349,
"grad_norm": 5.032867929697572,
"learning_rate": 1.8208516886930984e-07,
"loss": 0.0465,
"step": 1282
},
{
"epoch": 0.9131672597864768,
"grad_norm": 6.276816250394059,
"learning_rate": 1.806167400881057e-07,
"loss": 0.0473,
"step": 1283
},
{
"epoch": 0.9138790035587189,
"grad_norm": 2.9365357784700774,
"learning_rate": 1.791483113069016e-07,
"loss": -0.0447,
"step": 1284
},
{
"epoch": 0.9145907473309609,
"grad_norm": 3.8896844791827543,
"learning_rate": 1.7767988252569748e-07,
"loss": -0.0443,
"step": 1285
},
{
"epoch": 0.9153024911032028,
"grad_norm": 10.739547615966208,
"learning_rate": 1.762114537444934e-07,
"loss": 0.1637,
"step": 1286
},
{
"epoch": 0.9160142348754449,
"grad_norm": 5.993350158656473,
"learning_rate": 1.7474302496328928e-07,
"loss": 0.0432,
"step": 1287
},
{
"epoch": 0.9167259786476868,
"grad_norm": 6.71234933126412,
"learning_rate": 1.7327459618208515e-07,
"loss": 0.1618,
"step": 1288
},
{
"epoch": 0.9174377224199288,
"grad_norm": 6.316009150415693,
"learning_rate": 1.7180616740088105e-07,
"loss": 0.124,
"step": 1289
},
{
"epoch": 0.9181494661921709,
"grad_norm": 5.748994822339394,
"learning_rate": 1.7033773861967692e-07,
"loss": -0.0388,
"step": 1290
},
{
"epoch": 0.9188612099644128,
"grad_norm": 4.14975638536304,
"learning_rate": 1.6886930983847284e-07,
"loss": 0.0756,
"step": 1291
},
{
"epoch": 0.9195729537366548,
"grad_norm": 5.056815641283716,
"learning_rate": 1.6740088105726871e-07,
"loss": 0.0242,
"step": 1292
},
{
"epoch": 0.9202846975088967,
"grad_norm": 2.4390976389492653,
"learning_rate": 1.659324522760646e-07,
"loss": 0.043,
"step": 1293
},
{
"epoch": 0.9209964412811388,
"grad_norm": 3.907872953866281,
"learning_rate": 1.6446402349486048e-07,
"loss": 0.0113,
"step": 1294
},
{
"epoch": 0.9217081850533808,
"grad_norm": 5.101758967730574,
"learning_rate": 1.6299559471365638e-07,
"loss": 0.0432,
"step": 1295
},
{
"epoch": 0.9224199288256227,
"grad_norm": 5.3237433441686575,
"learning_rate": 1.6152716593245228e-07,
"loss": 0.1411,
"step": 1296
},
{
"epoch": 0.9231316725978648,
"grad_norm": 14.346407110236912,
"learning_rate": 1.6005873715124815e-07,
"loss": 0.0642,
"step": 1297
},
{
"epoch": 0.9238434163701068,
"grad_norm": 4.540641365625617,
"learning_rate": 1.5859030837004405e-07,
"loss": -0.0814,
"step": 1298
},
{
"epoch": 0.9245551601423487,
"grad_norm": 4.059461660539323,
"learning_rate": 1.5712187958883992e-07,
"loss": 0.1611,
"step": 1299
},
{
"epoch": 0.9252669039145908,
"grad_norm": 4.607153405634738,
"learning_rate": 1.5565345080763582e-07,
"loss": 0.1309,
"step": 1300
},
{
"epoch": 0.9259786476868327,
"grad_norm": 4.3752650589350015,
"learning_rate": 1.5418502202643172e-07,
"loss": 0.1112,
"step": 1301
},
{
"epoch": 0.9266903914590747,
"grad_norm": 4.68187610223401,
"learning_rate": 1.527165932452276e-07,
"loss": 0.0666,
"step": 1302
},
{
"epoch": 0.9274021352313168,
"grad_norm": 5.2790054878774315,
"learning_rate": 1.5124816446402349e-07,
"loss": 0.0326,
"step": 1303
},
{
"epoch": 0.9281138790035587,
"grad_norm": 4.7191667183286565,
"learning_rate": 1.4977973568281938e-07,
"loss": 0.0791,
"step": 1304
},
{
"epoch": 0.9288256227758007,
"grad_norm": 7.96740902856242,
"learning_rate": 1.4831130690161526e-07,
"loss": 0.2091,
"step": 1305
},
{
"epoch": 0.9295373665480428,
"grad_norm": 5.549196772831938,
"learning_rate": 1.4684287812041115e-07,
"loss": 0.093,
"step": 1306
},
{
"epoch": 0.9302491103202847,
"grad_norm": 3.030689305626086,
"learning_rate": 1.4537444933920703e-07,
"loss": -0.1049,
"step": 1307
},
{
"epoch": 0.9309608540925267,
"grad_norm": 4.364531936282188,
"learning_rate": 1.4390602055800295e-07,
"loss": 0.0521,
"step": 1308
},
{
"epoch": 0.9316725978647686,
"grad_norm": 3.8547858625772826,
"learning_rate": 1.4243759177679882e-07,
"loss": 0.1077,
"step": 1309
},
{
"epoch": 0.9323843416370107,
"grad_norm": 5.420010028843909,
"learning_rate": 1.4096916299559472e-07,
"loss": 0.0816,
"step": 1310
},
{
"epoch": 0.9330960854092527,
"grad_norm": 4.916350834072121,
"learning_rate": 1.395007342143906e-07,
"loss": 0.1923,
"step": 1311
},
{
"epoch": 0.9338078291814946,
"grad_norm": 5.0089287190817355,
"learning_rate": 1.3803230543318646e-07,
"loss": 0.0315,
"step": 1312
},
{
"epoch": 0.9345195729537367,
"grad_norm": 3.428274363182289,
"learning_rate": 1.365638766519824e-07,
"loss": 0.0982,
"step": 1313
},
{
"epoch": 0.9352313167259787,
"grad_norm": 4.359991652104497,
"learning_rate": 1.3509544787077826e-07,
"loss": 0.1368,
"step": 1314
},
{
"epoch": 0.9359430604982206,
"grad_norm": 4.532641387939283,
"learning_rate": 1.3362701908957416e-07,
"loss": 0.0313,
"step": 1315
},
{
"epoch": 0.9366548042704627,
"grad_norm": 4.1063156053014,
"learning_rate": 1.3215859030837003e-07,
"loss": 0.0529,
"step": 1316
},
{
"epoch": 0.9373665480427046,
"grad_norm": 4.411527678910333,
"learning_rate": 1.3069016152716593e-07,
"loss": 0.0312,
"step": 1317
},
{
"epoch": 0.9380782918149466,
"grad_norm": 5.7370203553214845,
"learning_rate": 1.2922173274596182e-07,
"loss": 0.02,
"step": 1318
},
{
"epoch": 0.9387900355871887,
"grad_norm": 3.874441168082076,
"learning_rate": 1.277533039647577e-07,
"loss": 0.1642,
"step": 1319
},
{
"epoch": 0.9395017793594306,
"grad_norm": 5.200612054163917,
"learning_rate": 1.262848751835536e-07,
"loss": -0.0318,
"step": 1320
},
{
"epoch": 0.9402135231316726,
"grad_norm": 6.360973410339866,
"learning_rate": 1.248164464023495e-07,
"loss": 0.1692,
"step": 1321
},
{
"epoch": 0.9409252669039145,
"grad_norm": 3.8992922654250495,
"learning_rate": 1.2334801762114536e-07,
"loss": -0.0379,
"step": 1322
},
{
"epoch": 0.9416370106761566,
"grad_norm": 4.993974679182252,
"learning_rate": 1.2187958883994126e-07,
"loss": 0.0495,
"step": 1323
},
{
"epoch": 0.9423487544483986,
"grad_norm": 4.408299395617472,
"learning_rate": 1.2041116005873716e-07,
"loss": 0.022,
"step": 1324
},
{
"epoch": 0.9430604982206405,
"grad_norm": 4.1691217945852745,
"learning_rate": 1.1894273127753303e-07,
"loss": -0.0083,
"step": 1325
},
{
"epoch": 0.9437722419928826,
"grad_norm": 3.5294179299244606,
"learning_rate": 1.1747430249632892e-07,
"loss": 0.0035,
"step": 1326
},
{
"epoch": 0.9444839857651246,
"grad_norm": 3.236758031475521,
"learning_rate": 1.1600587371512481e-07,
"loss": 0.0153,
"step": 1327
},
{
"epoch": 0.9451957295373665,
"grad_norm": 5.664962886409278,
"learning_rate": 1.145374449339207e-07,
"loss": 0.0604,
"step": 1328
},
{
"epoch": 0.9459074733096086,
"grad_norm": 5.32944058372216,
"learning_rate": 1.130690161527166e-07,
"loss": 0.0902,
"step": 1329
},
{
"epoch": 0.9466192170818505,
"grad_norm": 4.743520835196198,
"learning_rate": 1.1160058737151248e-07,
"loss": 0.0022,
"step": 1330
},
{
"epoch": 0.9473309608540925,
"grad_norm": 5.603546599883891,
"learning_rate": 1.1013215859030837e-07,
"loss": 0.0725,
"step": 1331
},
{
"epoch": 0.9480427046263346,
"grad_norm": 3.9105017144546097,
"learning_rate": 1.0866372980910425e-07,
"loss": -0.0624,
"step": 1332
},
{
"epoch": 0.9487544483985765,
"grad_norm": 3.991176887891522,
"learning_rate": 1.0719530102790015e-07,
"loss": 0.1552,
"step": 1333
},
{
"epoch": 0.9494661921708185,
"grad_norm": 4.159061943442253,
"learning_rate": 1.0572687224669603e-07,
"loss": -0.0307,
"step": 1334
},
{
"epoch": 0.9501779359430605,
"grad_norm": 4.900008197699716,
"learning_rate": 1.0425844346549192e-07,
"loss": 0.1238,
"step": 1335
},
{
"epoch": 0.9508896797153025,
"grad_norm": 5.8027748047438745,
"learning_rate": 1.027900146842878e-07,
"loss": -0.0756,
"step": 1336
},
{
"epoch": 0.9516014234875445,
"grad_norm": 5.125476908010209,
"learning_rate": 1.0132158590308369e-07,
"loss": 0.0641,
"step": 1337
},
{
"epoch": 0.9523131672597864,
"grad_norm": 4.25671529369216,
"learning_rate": 9.985315712187959e-08,
"loss": -0.006,
"step": 1338
},
{
"epoch": 0.9530249110320285,
"grad_norm": 4.7119212475657655,
"learning_rate": 9.838472834067547e-08,
"loss": -0.0246,
"step": 1339
},
{
"epoch": 0.9537366548042705,
"grad_norm": 8.596870438447308,
"learning_rate": 9.691629955947137e-08,
"loss": 0.1129,
"step": 1340
},
{
"epoch": 0.9544483985765124,
"grad_norm": 5.158601721630786,
"learning_rate": 9.544787077826725e-08,
"loss": 0.0471,
"step": 1341
},
{
"epoch": 0.9551601423487545,
"grad_norm": 4.435676952421707,
"learning_rate": 9.397944199706314e-08,
"loss": 0.0255,
"step": 1342
},
{
"epoch": 0.9558718861209965,
"grad_norm": 4.957502386310905,
"learning_rate": 9.251101321585902e-08,
"loss": 0.0724,
"step": 1343
},
{
"epoch": 0.9565836298932384,
"grad_norm": 3.944932976021612,
"learning_rate": 9.104258443465492e-08,
"loss": 0.0747,
"step": 1344
},
{
"epoch": 0.9572953736654805,
"grad_norm": 4.434897935933638,
"learning_rate": 8.95741556534508e-08,
"loss": 0.0596,
"step": 1345
},
{
"epoch": 0.9580071174377224,
"grad_norm": 5.776077843688367,
"learning_rate": 8.81057268722467e-08,
"loss": -0.0457,
"step": 1346
},
{
"epoch": 0.9587188612099644,
"grad_norm": 3.628808224506914,
"learning_rate": 8.663729809104257e-08,
"loss": 0.0567,
"step": 1347
},
{
"epoch": 0.9594306049822064,
"grad_norm": 3.6873894367776607,
"learning_rate": 8.516886930983846e-08,
"loss": 0.1175,
"step": 1348
},
{
"epoch": 0.9601423487544484,
"grad_norm": 5.519140231029694,
"learning_rate": 8.370044052863436e-08,
"loss": 0.0404,
"step": 1349
},
{
"epoch": 0.9608540925266904,
"grad_norm": 4.461927421017156,
"learning_rate": 8.223201174743024e-08,
"loss": 0.0566,
"step": 1350
},
{
"epoch": 0.9615658362989323,
"grad_norm": 5.333615692929449,
"learning_rate": 8.076358296622614e-08,
"loss": 0.129,
"step": 1351
},
{
"epoch": 0.9622775800711744,
"grad_norm": 3.538932343167442,
"learning_rate": 7.929515418502202e-08,
"loss": 0.0477,
"step": 1352
},
{
"epoch": 0.9629893238434164,
"grad_norm": 5.114559611196206,
"learning_rate": 7.782672540381791e-08,
"loss": 0.1573,
"step": 1353
},
{
"epoch": 0.9637010676156583,
"grad_norm": 4.230187404079494,
"learning_rate": 7.63582966226138e-08,
"loss": 0.0854,
"step": 1354
},
{
"epoch": 0.9644128113879004,
"grad_norm": 6.091438676748531,
"learning_rate": 7.488986784140969e-08,
"loss": -0.1022,
"step": 1355
},
{
"epoch": 0.9651245551601424,
"grad_norm": 3.569956288003313,
"learning_rate": 7.342143906020558e-08,
"loss": -0.0432,
"step": 1356
},
{
"epoch": 0.9658362989323843,
"grad_norm": 4.509091835357482,
"learning_rate": 7.195301027900148e-08,
"loss": 0.0026,
"step": 1357
},
{
"epoch": 0.9665480427046264,
"grad_norm": 3.621722877605747,
"learning_rate": 7.048458149779736e-08,
"loss": -0.0038,
"step": 1358
},
{
"epoch": 0.9672597864768683,
"grad_norm": 5.385886235182663,
"learning_rate": 6.901615271659323e-08,
"loss": 0.0568,
"step": 1359
},
{
"epoch": 0.9679715302491103,
"grad_norm": 5.242695404325819,
"learning_rate": 6.754772393538913e-08,
"loss": 0.1181,
"step": 1360
},
{
"epoch": 0.9686832740213523,
"grad_norm": 3.772667595014263,
"learning_rate": 6.607929515418501e-08,
"loss": 0.0671,
"step": 1361
},
{
"epoch": 0.9693950177935943,
"grad_norm": 4.239257250093547,
"learning_rate": 6.461086637298091e-08,
"loss": 0.0319,
"step": 1362
},
{
"epoch": 0.9701067615658363,
"grad_norm": 5.284811049505885,
"learning_rate": 6.31424375917768e-08,
"loss": 0.0672,
"step": 1363
},
{
"epoch": 0.9708185053380783,
"grad_norm": 4.001900698168065,
"learning_rate": 6.167400881057268e-08,
"loss": -0.0016,
"step": 1364
},
{
"epoch": 0.9715302491103203,
"grad_norm": 4.671011037902923,
"learning_rate": 6.020558002936858e-08,
"loss": 0.0601,
"step": 1365
},
{
"epoch": 0.9722419928825623,
"grad_norm": 3.676485713703232,
"learning_rate": 5.873715124816446e-08,
"loss": 0.053,
"step": 1366
},
{
"epoch": 0.9729537366548042,
"grad_norm": 3.878098264443752,
"learning_rate": 5.726872246696035e-08,
"loss": -0.0868,
"step": 1367
},
{
"epoch": 0.9736654804270463,
"grad_norm": 3.2959723660690217,
"learning_rate": 5.580029368575624e-08,
"loss": -0.0472,
"step": 1368
},
{
"epoch": 0.9743772241992883,
"grad_norm": 4.965343915355099,
"learning_rate": 5.4331864904552125e-08,
"loss": 0.1451,
"step": 1369
},
{
"epoch": 0.9750889679715302,
"grad_norm": 5.90877547589276,
"learning_rate": 5.2863436123348017e-08,
"loss": 0.1055,
"step": 1370
},
{
"epoch": 0.9758007117437723,
"grad_norm": 3.7198052712203014,
"learning_rate": 5.13950073421439e-08,
"loss": -0.0286,
"step": 1371
},
{
"epoch": 0.9765124555160143,
"grad_norm": 3.277868072070685,
"learning_rate": 4.992657856093979e-08,
"loss": -0.0904,
"step": 1372
},
{
"epoch": 0.9772241992882562,
"grad_norm": 4.233062202802242,
"learning_rate": 4.8458149779735684e-08,
"loss": 0.0484,
"step": 1373
},
{
"epoch": 0.9779359430604982,
"grad_norm": 3.2626225298676372,
"learning_rate": 4.698972099853157e-08,
"loss": 0.1087,
"step": 1374
},
{
"epoch": 0.9786476868327402,
"grad_norm": 5.509936287125614,
"learning_rate": 4.552129221732746e-08,
"loss": 0.0872,
"step": 1375
},
{
"epoch": 0.9793594306049822,
"grad_norm": 5.2997441356005135,
"learning_rate": 4.405286343612335e-08,
"loss": 0.1112,
"step": 1376
},
{
"epoch": 0.9800711743772242,
"grad_norm": 5.624439296327349,
"learning_rate": 4.258443465491923e-08,
"loss": 0.1325,
"step": 1377
},
{
"epoch": 0.9807829181494662,
"grad_norm": 4.796297616411571,
"learning_rate": 4.111600587371512e-08,
"loss": 0.058,
"step": 1378
},
{
"epoch": 0.9814946619217082,
"grad_norm": 5.256236051162191,
"learning_rate": 3.964757709251101e-08,
"loss": 0.0558,
"step": 1379
},
{
"epoch": 0.9822064056939501,
"grad_norm": 5.459748800458802,
"learning_rate": 3.81791483113069e-08,
"loss": 0.061,
"step": 1380
},
{
"epoch": 0.9829181494661922,
"grad_norm": 4.678092977838486,
"learning_rate": 3.671071953010279e-08,
"loss": 0.1689,
"step": 1381
},
{
"epoch": 0.9836298932384342,
"grad_norm": 6.37815001210416,
"learning_rate": 3.524229074889868e-08,
"loss": 0.1142,
"step": 1382
},
{
"epoch": 0.9843416370106761,
"grad_norm": 4.324133840276909,
"learning_rate": 3.3773861967694565e-08,
"loss": 0.0369,
"step": 1383
},
{
"epoch": 0.9850533807829182,
"grad_norm": 4.446104492002926,
"learning_rate": 3.2305433186490456e-08,
"loss": 0.1194,
"step": 1384
},
{
"epoch": 0.9857651245551602,
"grad_norm": 5.748282466289897,
"learning_rate": 3.083700440528634e-08,
"loss": 0.0976,
"step": 1385
},
{
"epoch": 0.9864768683274021,
"grad_norm": 3.3000657970920395,
"learning_rate": 2.936857562408223e-08,
"loss": 0.0993,
"step": 1386
},
{
"epoch": 0.9871886120996441,
"grad_norm": 6.565672480008891,
"learning_rate": 2.790014684287812e-08,
"loss": 0.0394,
"step": 1387
},
{
"epoch": 0.9879003558718861,
"grad_norm": 5.079375784962967,
"learning_rate": 2.6431718061674008e-08,
"loss": 0.042,
"step": 1388
},
{
"epoch": 0.9886120996441281,
"grad_norm": 3.9565078675418315,
"learning_rate": 2.4963289280469896e-08,
"loss": 0.109,
"step": 1389
},
{
"epoch": 0.9893238434163701,
"grad_norm": 3.839865233321451,
"learning_rate": 2.3494860499265784e-08,
"loss": 0.0618,
"step": 1390
},
{
"epoch": 0.9900355871886121,
"grad_norm": 5.350089077787284,
"learning_rate": 2.2026431718061676e-08,
"loss": 0.0817,
"step": 1391
},
{
"epoch": 0.9907473309608541,
"grad_norm": 6.152412264003426,
"learning_rate": 2.055800293685756e-08,
"loss": -0.0063,
"step": 1392
},
{
"epoch": 0.9914590747330961,
"grad_norm": 4.5760011921994455,
"learning_rate": 1.908957415565345e-08,
"loss": 0.0224,
"step": 1393
},
{
"epoch": 0.9921708185053381,
"grad_norm": 4.185879749727028,
"learning_rate": 1.762114537444934e-08,
"loss": 0.0795,
"step": 1394
},
{
"epoch": 0.9928825622775801,
"grad_norm": 3.633543010809335,
"learning_rate": 1.6152716593245228e-08,
"loss": 0.1394,
"step": 1395
},
{
"epoch": 0.993594306049822,
"grad_norm": 4.975818141202149,
"learning_rate": 1.4684287812041114e-08,
"loss": 0.0229,
"step": 1396
},
{
"epoch": 0.994306049822064,
"grad_norm": 5.877233527084385,
"learning_rate": 1.3215859030837004e-08,
"loss": 0.0481,
"step": 1397
},
{
"epoch": 0.9950177935943061,
"grad_norm": 5.1264733778517995,
"learning_rate": 1.1747430249632892e-08,
"loss": 0.0426,
"step": 1398
},
{
"epoch": 0.995729537366548,
"grad_norm": 3.3972353859001165,
"learning_rate": 1.027900146842878e-08,
"loss": -0.0471,
"step": 1399
},
{
"epoch": 0.99644128113879,
"grad_norm": 6.26148115386722,
"learning_rate": 8.81057268722467e-09,
"loss": 0.0972,
"step": 1400
},
{
"epoch": 0.9971530249110321,
"grad_norm": 5.05392313488687,
"learning_rate": 7.342143906020557e-09,
"loss": 0.0775,
"step": 1401
},
{
"epoch": 0.997864768683274,
"grad_norm": 6.8793645702833,
"learning_rate": 5.873715124816446e-09,
"loss": -0.0151,
"step": 1402
},
{
"epoch": 0.998576512455516,
"grad_norm": 5.4786247691333925,
"learning_rate": 4.405286343612335e-09,
"loss": 0.1371,
"step": 1403
},
{
"epoch": 0.999288256227758,
"grad_norm": 5.699875666282905,
"learning_rate": 2.936857562408223e-09,
"loss": -0.0478,
"step": 1404
},
{
"epoch": 1.0,
"grad_norm": 2.8003625703125516,
"learning_rate": 1.4684287812041115e-09,
"loss": -0.082,
"step": 1405
},
{
"epoch": 1.0,
"step": 1405,
"total_flos": 252991429017600.0,
"train_loss": 0.07296714245530635,
"train_runtime": 16463.8109,
"train_samples_per_second": 10.922,
"train_steps_per_second": 0.085
}
],
"logging_steps": 1.0,
"max_steps": 1405,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"total_flos": 252991429017600.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}