Lyra_Mini_3B / long_speech_lora / trainer_state.json
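This is the trainer state for the long-speech LoRA run, in the Hugging Face `transformers` `TrainerState` layout: the `log_history` array below records the training loss, gradient norm, and learning rate at each logged step. A minimal sketch of reading the loss curve out of it, assuming the file has been saved locally as `trainer_state.json`:

```python
import json

# Load the serialized TrainerState and keep only entries that carry a training loss
# (eval or final-summary entries may omit it).
with open("trainer_state.json") as f:
    state = json.load(f)

logged = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in logged]
losses = [e["loss"] for e in logged]

print(f"{len(steps)} logged steps, final loss {losses[-1]:.4f}")
```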
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9956458635703918,
"eval_steps": 500,
"global_step": 1032,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002902757619738752,
"grad_norm": 2.0846433767652153,
"learning_rate": 3.225806451612903e-06,
"loss": 2.2957,
"step": 1
},
{
"epoch": 0.005805515239477504,
"grad_norm": 1.9978416462534117,
"learning_rate": 6.451612903225806e-06,
"loss": 2.3178,
"step": 2
},
{
"epoch": 0.008708272859216255,
"grad_norm": 1.9430579171955504,
"learning_rate": 9.67741935483871e-06,
"loss": 2.2802,
"step": 3
},
{
"epoch": 0.011611030478955007,
"grad_norm": 1.9543596506034469,
"learning_rate": 1.2903225806451613e-05,
"loss": 2.318,
"step": 4
},
{
"epoch": 0.01451378809869376,
"grad_norm": 1.7061741200861744,
"learning_rate": 1.6129032258064517e-05,
"loss": 2.2252,
"step": 5
},
{
"epoch": 0.01741654571843251,
"grad_norm": 1.253144062080066,
"learning_rate": 1.935483870967742e-05,
"loss": 2.1205,
"step": 6
},
{
"epoch": 0.020319303338171262,
"grad_norm": 1.0003116901021947,
"learning_rate": 2.258064516129032e-05,
"loss": 2.1504,
"step": 7
},
{
"epoch": 0.023222060957910014,
"grad_norm": 0.9564264471241767,
"learning_rate": 2.5806451612903226e-05,
"loss": 2.1369,
"step": 8
},
{
"epoch": 0.026124818577648767,
"grad_norm": 1.0124626024446515,
"learning_rate": 2.9032258064516133e-05,
"loss": 1.9994,
"step": 9
},
{
"epoch": 0.02902757619738752,
"grad_norm": 1.0323826804432363,
"learning_rate": 3.2258064516129034e-05,
"loss": 1.9673,
"step": 10
},
{
"epoch": 0.03193033381712627,
"grad_norm": 1.1010706324174482,
"learning_rate": 3.548387096774194e-05,
"loss": 2.0209,
"step": 11
},
{
"epoch": 0.03483309143686502,
"grad_norm": 0.9587342794682953,
"learning_rate": 3.870967741935484e-05,
"loss": 1.9565,
"step": 12
},
{
"epoch": 0.03773584905660377,
"grad_norm": 0.8087276245182698,
"learning_rate": 4.1935483870967746e-05,
"loss": 1.9274,
"step": 13
},
{
"epoch": 0.040638606676342524,
"grad_norm": 0.7208949030491162,
"learning_rate": 4.516129032258064e-05,
"loss": 1.8956,
"step": 14
},
{
"epoch": 0.04354136429608128,
"grad_norm": 0.6632271677011528,
"learning_rate": 4.8387096774193554e-05,
"loss": 1.8824,
"step": 15
},
{
"epoch": 0.04644412191582003,
"grad_norm": 0.7435836539848341,
"learning_rate": 5.161290322580645e-05,
"loss": 1.9824,
"step": 16
},
{
"epoch": 0.04934687953555878,
"grad_norm": 0.7053044807794453,
"learning_rate": 5.4838709677419355e-05,
"loss": 1.8889,
"step": 17
},
{
"epoch": 0.05224963715529753,
"grad_norm": 0.6993555442321882,
"learning_rate": 5.8064516129032266e-05,
"loss": 1.9324,
"step": 18
},
{
"epoch": 0.055152394775036286,
"grad_norm": 0.7518400173275165,
"learning_rate": 6.129032258064517e-05,
"loss": 1.8978,
"step": 19
},
{
"epoch": 0.05805515239477504,
"grad_norm": 0.7136536191735433,
"learning_rate": 6.451612903225807e-05,
"loss": 1.8947,
"step": 20
},
{
"epoch": 0.06095791001451379,
"grad_norm": 0.700068972477509,
"learning_rate": 6.774193548387096e-05,
"loss": 1.8334,
"step": 21
},
{
"epoch": 0.06386066763425254,
"grad_norm": 0.6097922300754197,
"learning_rate": 7.096774193548388e-05,
"loss": 1.8708,
"step": 22
},
{
"epoch": 0.06676342525399129,
"grad_norm": 0.5892241733394149,
"learning_rate": 7.419354838709677e-05,
"loss": 1.8644,
"step": 23
},
{
"epoch": 0.06966618287373004,
"grad_norm": 0.5998517963533421,
"learning_rate": 7.741935483870968e-05,
"loss": 1.915,
"step": 24
},
{
"epoch": 0.07256894049346879,
"grad_norm": 0.578937324434216,
"learning_rate": 8.064516129032258e-05,
"loss": 1.8765,
"step": 25
},
{
"epoch": 0.07547169811320754,
"grad_norm": 0.5520167576257055,
"learning_rate": 8.387096774193549e-05,
"loss": 1.8907,
"step": 26
},
{
"epoch": 0.0783744557329463,
"grad_norm": 0.5324056505066851,
"learning_rate": 8.709677419354839e-05,
"loss": 1.8089,
"step": 27
},
{
"epoch": 0.08127721335268505,
"grad_norm": 0.5598070603112837,
"learning_rate": 9.032258064516129e-05,
"loss": 1.8114,
"step": 28
},
{
"epoch": 0.0841799709724238,
"grad_norm": 0.5722867654242503,
"learning_rate": 9.35483870967742e-05,
"loss": 1.894,
"step": 29
},
{
"epoch": 0.08708272859216255,
"grad_norm": 0.5501713866741947,
"learning_rate": 9.677419354838711e-05,
"loss": 1.8482,
"step": 30
},
{
"epoch": 0.0899854862119013,
"grad_norm": 0.5098039674637573,
"learning_rate": 0.0001,
"loss": 1.748,
"step": 31
},
{
"epoch": 0.09288824383164006,
"grad_norm": 0.5216650269643087,
"learning_rate": 9.999975375283309e-05,
"loss": 1.8241,
"step": 32
},
{
"epoch": 0.09579100145137881,
"grad_norm": 0.5245607583192764,
"learning_rate": 9.999901501375784e-05,
"loss": 1.7743,
"step": 33
},
{
"epoch": 0.09869375907111756,
"grad_norm": 0.5209846767687549,
"learning_rate": 9.999778379005078e-05,
"loss": 1.8433,
"step": 34
},
{
"epoch": 0.10159651669085631,
"grad_norm": 0.5193753847341327,
"learning_rate": 9.99960600938393e-05,
"loss": 1.7428,
"step": 35
},
{
"epoch": 0.10449927431059507,
"grad_norm": 0.5388545039567529,
"learning_rate": 9.99938439421016e-05,
"loss": 1.8043,
"step": 36
},
{
"epoch": 0.10740203193033382,
"grad_norm": 0.4642517613108308,
"learning_rate": 9.999113535666655e-05,
"loss": 1.8523,
"step": 37
},
{
"epoch": 0.11030478955007257,
"grad_norm": 0.5090771867527996,
"learning_rate": 9.99879343642134e-05,
"loss": 1.8782,
"step": 38
},
{
"epoch": 0.11320754716981132,
"grad_norm": 0.5180365979808446,
"learning_rate": 9.998424099627157e-05,
"loss": 1.7001,
"step": 39
},
{
"epoch": 0.11611030478955008,
"grad_norm": 0.5351792273020963,
"learning_rate": 9.99800552892203e-05,
"loss": 1.8074,
"step": 40
},
{
"epoch": 0.11901306240928883,
"grad_norm": 0.4909229371763726,
"learning_rate": 9.997537728428837e-05,
"loss": 1.8347,
"step": 41
},
{
"epoch": 0.12191582002902758,
"grad_norm": 0.5254118497248897,
"learning_rate": 9.997020702755353e-05,
"loss": 1.818,
"step": 42
},
{
"epoch": 0.12481857764876633,
"grad_norm": 0.531249671313219,
"learning_rate": 9.996454456994226e-05,
"loss": 1.7551,
"step": 43
},
{
"epoch": 0.12772133526850507,
"grad_norm": 0.4986989365776004,
"learning_rate": 9.995838996722914e-05,
"loss": 1.7201,
"step": 44
},
{
"epoch": 0.13062409288824384,
"grad_norm": 0.43995282498016386,
"learning_rate": 9.995174328003631e-05,
"loss": 1.7236,
"step": 45
},
{
"epoch": 0.13352685050798258,
"grad_norm": 0.47024950950165534,
"learning_rate": 9.994460457383284e-05,
"loss": 1.7314,
"step": 46
},
{
"epoch": 0.13642960812772134,
"grad_norm": 0.48793896577915136,
"learning_rate": 9.993697391893423e-05,
"loss": 1.8088,
"step": 47
},
{
"epoch": 0.13933236574746008,
"grad_norm": 0.4635304980180925,
"learning_rate": 9.992885139050154e-05,
"loss": 1.7849,
"step": 48
},
{
"epoch": 0.14223512336719885,
"grad_norm": 0.4553165993013694,
"learning_rate": 9.992023706854076e-05,
"loss": 1.7897,
"step": 49
},
{
"epoch": 0.14513788098693758,
"grad_norm": 0.4539654044984415,
"learning_rate": 9.991113103790198e-05,
"loss": 1.8023,
"step": 50
},
{
"epoch": 0.14804063860667635,
"grad_norm": 0.4740048482547077,
"learning_rate": 9.990153338827856e-05,
"loss": 1.6575,
"step": 51
},
{
"epoch": 0.1509433962264151,
"grad_norm": 0.4698406577608514,
"learning_rate": 9.98914442142063e-05,
"loss": 1.7658,
"step": 52
},
{
"epoch": 0.15384615384615385,
"grad_norm": 0.4761488120834836,
"learning_rate": 9.98808636150624e-05,
"loss": 1.7466,
"step": 53
},
{
"epoch": 0.1567489114658926,
"grad_norm": 0.49049936616721757,
"learning_rate": 9.986979169506453e-05,
"loss": 1.8218,
"step": 54
},
{
"epoch": 0.15965166908563136,
"grad_norm": 0.4704423046135022,
"learning_rate": 9.985822856326989e-05,
"loss": 1.7268,
"step": 55
},
{
"epoch": 0.1625544267053701,
"grad_norm": 0.46284810316184194,
"learning_rate": 9.9846174333574e-05,
"loss": 1.7364,
"step": 56
},
{
"epoch": 0.16545718432510886,
"grad_norm": 0.49596626860549764,
"learning_rate": 9.983362912470966e-05,
"loss": 1.747,
"step": 57
},
{
"epoch": 0.1683599419448476,
"grad_norm": 0.4848925097238957,
"learning_rate": 9.982059306024577e-05,
"loss": 1.7047,
"step": 58
},
{
"epoch": 0.17126269956458637,
"grad_norm": 0.46720475615200996,
"learning_rate": 9.980706626858607e-05,
"loss": 1.7173,
"step": 59
},
{
"epoch": 0.1741654571843251,
"grad_norm": 0.44529320530178296,
"learning_rate": 9.979304888296792e-05,
"loss": 1.7491,
"step": 60
},
{
"epoch": 0.17706821480406387,
"grad_norm": 0.44440577127650166,
"learning_rate": 9.977854104146099e-05,
"loss": 1.7776,
"step": 61
},
{
"epoch": 0.1799709724238026,
"grad_norm": 0.4421464284603703,
"learning_rate": 9.976354288696588e-05,
"loss": 1.7716,
"step": 62
},
{
"epoch": 0.18287373004354138,
"grad_norm": 0.46192862653440586,
"learning_rate": 9.97480545672127e-05,
"loss": 1.7546,
"step": 63
},
{
"epoch": 0.18577648766328012,
"grad_norm": 0.45493024275166133,
"learning_rate": 9.973207623475965e-05,
"loss": 1.6439,
"step": 64
},
{
"epoch": 0.18867924528301888,
"grad_norm": 0.4595793136325774,
"learning_rate": 9.971560804699148e-05,
"loss": 1.85,
"step": 65
},
{
"epoch": 0.19158200290275762,
"grad_norm": 0.4791479558997977,
"learning_rate": 9.9698650166118e-05,
"loss": 1.8014,
"step": 66
},
{
"epoch": 0.19448476052249636,
"grad_norm": 0.45867544729939197,
"learning_rate": 9.96812027591724e-05,
"loss": 1.7838,
"step": 67
},
{
"epoch": 0.19738751814223512,
"grad_norm": 0.5135716700008814,
"learning_rate": 9.966326599800967e-05,
"loss": 1.8346,
"step": 68
},
{
"epoch": 0.20029027576197386,
"grad_norm": 0.47740664831223145,
"learning_rate": 9.964484005930486e-05,
"loss": 1.7588,
"step": 69
},
{
"epoch": 0.20319303338171263,
"grad_norm": 0.4928720632558317,
"learning_rate": 9.962592512455138e-05,
"loss": 1.752,
"step": 70
},
{
"epoch": 0.20609579100145137,
"grad_norm": 0.47944983366239624,
"learning_rate": 9.960652138005921e-05,
"loss": 1.7496,
"step": 71
},
{
"epoch": 0.20899854862119013,
"grad_norm": 0.4817585857975088,
"learning_rate": 9.958662901695303e-05,
"loss": 1.7815,
"step": 72
},
{
"epoch": 0.21190130624092887,
"grad_norm": 0.48531787875139165,
"learning_rate": 9.956624823117036e-05,
"loss": 1.7124,
"step": 73
},
{
"epoch": 0.21480406386066764,
"grad_norm": 0.4934990779596015,
"learning_rate": 9.954537922345961e-05,
"loss": 1.7579,
"step": 74
},
{
"epoch": 0.21770682148040638,
"grad_norm": 0.4592934884013574,
"learning_rate": 9.952402219937816e-05,
"loss": 1.7661,
"step": 75
},
{
"epoch": 0.22060957910014514,
"grad_norm": 0.4461278786196646,
"learning_rate": 9.950217736929029e-05,
"loss": 1.827,
"step": 76
},
{
"epoch": 0.22351233671988388,
"grad_norm": 0.4607058394430432,
"learning_rate": 9.947984494836506e-05,
"loss": 1.7188,
"step": 77
},
{
"epoch": 0.22641509433962265,
"grad_norm": 0.5113841362017536,
"learning_rate": 9.945702515657434e-05,
"loss": 1.7003,
"step": 78
},
{
"epoch": 0.22931785195936139,
"grad_norm": 0.4556004543544776,
"learning_rate": 9.943371821869043e-05,
"loss": 1.7567,
"step": 79
},
{
"epoch": 0.23222060957910015,
"grad_norm": 0.45511479260974946,
"learning_rate": 9.940992436428409e-05,
"loss": 1.7175,
"step": 80
},
{
"epoch": 0.2351233671988389,
"grad_norm": 0.45337354415403,
"learning_rate": 9.938564382772205e-05,
"loss": 1.6922,
"step": 81
},
{
"epoch": 0.23802612481857766,
"grad_norm": 0.46578477908830707,
"learning_rate": 9.936087684816486e-05,
"loss": 1.7403,
"step": 82
},
{
"epoch": 0.2409288824383164,
"grad_norm": 0.4736398049281869,
"learning_rate": 9.933562366956445e-05,
"loss": 1.7911,
"step": 83
},
{
"epoch": 0.24383164005805516,
"grad_norm": 0.46580351089308164,
"learning_rate": 9.930988454066177e-05,
"loss": 1.7129,
"step": 84
},
{
"epoch": 0.2467343976777939,
"grad_norm": 0.4783921280895399,
"learning_rate": 9.928365971498435e-05,
"loss": 1.6999,
"step": 85
},
{
"epoch": 0.24963715529753266,
"grad_norm": 0.48639096186151054,
"learning_rate": 9.92569494508437e-05,
"loss": 1.7831,
"step": 86
},
{
"epoch": 0.2525399129172714,
"grad_norm": 0.47373508585291985,
"learning_rate": 9.922975401133293e-05,
"loss": 1.7481,
"step": 87
},
{
"epoch": 0.25544267053701014,
"grad_norm": 0.45963136146726274,
"learning_rate": 9.920207366432402e-05,
"loss": 1.7577,
"step": 88
},
{
"epoch": 0.25834542815674894,
"grad_norm": 0.44141708926107565,
"learning_rate": 9.917390868246527e-05,
"loss": 1.7426,
"step": 89
},
{
"epoch": 0.2612481857764877,
"grad_norm": 0.4723061395007264,
"learning_rate": 9.914525934317855e-05,
"loss": 1.7229,
"step": 90
},
{
"epoch": 0.2641509433962264,
"grad_norm": 0.4600724654623569,
"learning_rate": 9.911612592865663e-05,
"loss": 1.6709,
"step": 91
},
{
"epoch": 0.26705370101596515,
"grad_norm": 0.461423618524674,
"learning_rate": 9.908650872586029e-05,
"loss": 1.6044,
"step": 92
},
{
"epoch": 0.26995645863570394,
"grad_norm": 0.4484003156347003,
"learning_rate": 9.905640802651565e-05,
"loss": 1.7269,
"step": 93
},
{
"epoch": 0.2728592162554427,
"grad_norm": 0.45546155316732173,
"learning_rate": 9.90258241271112e-05,
"loss": 1.6815,
"step": 94
},
{
"epoch": 0.2757619738751814,
"grad_norm": 0.4793191665849694,
"learning_rate": 9.899475732889485e-05,
"loss": 1.7851,
"step": 95
},
{
"epoch": 0.27866473149492016,
"grad_norm": 0.44887393395771497,
"learning_rate": 9.896320793787106e-05,
"loss": 1.7097,
"step": 96
},
{
"epoch": 0.28156748911465895,
"grad_norm": 0.43842089686748076,
"learning_rate": 9.893117626479777e-05,
"loss": 1.6785,
"step": 97
},
{
"epoch": 0.2844702467343977,
"grad_norm": 0.4408495390489595,
"learning_rate": 9.889866262518331e-05,
"loss": 1.7806,
"step": 98
},
{
"epoch": 0.28737300435413643,
"grad_norm": 0.46860781799678797,
"learning_rate": 9.886566733928336e-05,
"loss": 1.6707,
"step": 99
},
{
"epoch": 0.29027576197387517,
"grad_norm": 0.4722122346714633,
"learning_rate": 9.883219073209772e-05,
"loss": 1.6392,
"step": 100
},
{
"epoch": 0.2931785195936139,
"grad_norm": 0.4766826387534979,
"learning_rate": 9.879823313336722e-05,
"loss": 1.7407,
"step": 101
},
{
"epoch": 0.2960812772133527,
"grad_norm": 0.49952288540860723,
"learning_rate": 9.876379487757034e-05,
"loss": 1.7379,
"step": 102
},
{
"epoch": 0.29898403483309144,
"grad_norm": 0.433128398635048,
"learning_rate": 9.872887630391998e-05,
"loss": 1.6883,
"step": 103
},
{
"epoch": 0.3018867924528302,
"grad_norm": 0.44194407401541336,
"learning_rate": 9.869347775636015e-05,
"loss": 1.6745,
"step": 104
},
{
"epoch": 0.3047895500725689,
"grad_norm": 0.45676481127165763,
"learning_rate": 9.865759958356251e-05,
"loss": 1.6969,
"step": 105
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.4255773819402685,
"learning_rate": 9.862124213892304e-05,
"loss": 1.6498,
"step": 106
},
{
"epoch": 0.31059506531204645,
"grad_norm": 0.47201009369127117,
"learning_rate": 9.858440578055842e-05,
"loss": 1.7208,
"step": 107
},
{
"epoch": 0.3134978229317852,
"grad_norm": 0.43906192490840906,
"learning_rate": 9.85470908713026e-05,
"loss": 1.6519,
"step": 108
},
{
"epoch": 0.3164005805515239,
"grad_norm": 0.46955483311816915,
"learning_rate": 9.850929777870324e-05,
"loss": 1.7158,
"step": 109
},
{
"epoch": 0.3193033381712627,
"grad_norm": 0.45985568659200904,
"learning_rate": 9.847102687501797e-05,
"loss": 1.7123,
"step": 110
},
{
"epoch": 0.32220609579100146,
"grad_norm": 0.4426308077595128,
"learning_rate": 9.843227853721088e-05,
"loss": 1.6953,
"step": 111
},
{
"epoch": 0.3251088534107402,
"grad_norm": 0.4392841669492503,
"learning_rate": 9.839305314694873e-05,
"loss": 1.6883,
"step": 112
},
{
"epoch": 0.32801161103047893,
"grad_norm": 0.45477227897009836,
"learning_rate": 9.835335109059713e-05,
"loss": 1.6705,
"step": 113
},
{
"epoch": 0.3309143686502177,
"grad_norm": 0.4510965751867643,
"learning_rate": 9.831317275921685e-05,
"loss": 1.7234,
"step": 114
},
{
"epoch": 0.33381712626995647,
"grad_norm": 0.4542106881532701,
"learning_rate": 9.827251854855991e-05,
"loss": 1.7472,
"step": 115
},
{
"epoch": 0.3367198838896952,
"grad_norm": 0.4163055427389403,
"learning_rate": 9.823138885906566e-05,
"loss": 1.6842,
"step": 116
},
{
"epoch": 0.33962264150943394,
"grad_norm": 0.449739499223869,
"learning_rate": 9.81897840958569e-05,
"loss": 1.7605,
"step": 117
},
{
"epoch": 0.34252539912917274,
"grad_norm": 0.4772390046331174,
"learning_rate": 9.814770466873585e-05,
"loss": 1.7861,
"step": 118
},
{
"epoch": 0.3454281567489115,
"grad_norm": 0.4413132468805602,
"learning_rate": 9.810515099218003e-05,
"loss": 1.7759,
"step": 119
},
{
"epoch": 0.3483309143686502,
"grad_norm": 0.46134791732871394,
"learning_rate": 9.806212348533838e-05,
"loss": 1.7047,
"step": 120
},
{
"epoch": 0.35123367198838895,
"grad_norm": 0.4566431321895948,
"learning_rate": 9.801862257202698e-05,
"loss": 1.6666,
"step": 121
},
{
"epoch": 0.35413642960812775,
"grad_norm": 0.46167645349264863,
"learning_rate": 9.797464868072488e-05,
"loss": 1.7031,
"step": 122
},
{
"epoch": 0.3570391872278665,
"grad_norm": 0.43146469610897786,
"learning_rate": 9.79302022445699e-05,
"loss": 1.7556,
"step": 123
},
{
"epoch": 0.3599419448476052,
"grad_norm": 0.45400915786981333,
"learning_rate": 9.788528370135443e-05,
"loss": 1.6255,
"step": 124
},
{
"epoch": 0.36284470246734396,
"grad_norm": 0.48140495646959364,
"learning_rate": 9.783989349352104e-05,
"loss": 1.693,
"step": 125
},
{
"epoch": 0.36574746008708275,
"grad_norm": 0.43964649788083776,
"learning_rate": 9.77940320681581e-05,
"loss": 1.7552,
"step": 126
},
{
"epoch": 0.3686502177068215,
"grad_norm": 0.4456518541367986,
"learning_rate": 9.774769987699548e-05,
"loss": 1.7405,
"step": 127
},
{
"epoch": 0.37155297532656023,
"grad_norm": 0.4354523516212369,
"learning_rate": 9.77008973764e-05,
"loss": 1.6851,
"step": 128
},
{
"epoch": 0.37445573294629897,
"grad_norm": 0.47469832181842386,
"learning_rate": 9.765362502737097e-05,
"loss": 1.7969,
"step": 129
},
{
"epoch": 0.37735849056603776,
"grad_norm": 0.456359078319975,
"learning_rate": 9.760588329553571e-05,
"loss": 1.6963,
"step": 130
},
{
"epoch": 0.3802612481857765,
"grad_norm": 0.4297887488755796,
"learning_rate": 9.755767265114484e-05,
"loss": 1.7259,
"step": 131
},
{
"epoch": 0.38316400580551524,
"grad_norm": 0.45977484840579963,
"learning_rate": 9.750899356906775e-05,
"loss": 1.6951,
"step": 132
},
{
"epoch": 0.386066763425254,
"grad_norm": 0.4525297404775678,
"learning_rate": 9.74598465287879e-05,
"loss": 1.6625,
"step": 133
},
{
"epoch": 0.3889695210449927,
"grad_norm": 0.46972762507628174,
"learning_rate": 9.741023201439803e-05,
"loss": 1.6819,
"step": 134
},
{
"epoch": 0.3918722786647315,
"grad_norm": 0.45696399024898127,
"learning_rate": 9.73601505145955e-05,
"loss": 1.7603,
"step": 135
},
{
"epoch": 0.39477503628447025,
"grad_norm": 0.4458342831069972,
"learning_rate": 9.730960252267743e-05,
"loss": 1.7223,
"step": 136
},
{
"epoch": 0.397677793904209,
"grad_norm": 0.43981548094464756,
"learning_rate": 9.72585885365358e-05,
"loss": 1.6766,
"step": 137
},
{
"epoch": 0.4005805515239477,
"grad_norm": 0.4455939651528885,
"learning_rate": 9.720710905865256e-05,
"loss": 1.7184,
"step": 138
},
{
"epoch": 0.4034833091436865,
"grad_norm": 0.47651464370433416,
"learning_rate": 9.715516459609478e-05,
"loss": 1.7439,
"step": 139
},
{
"epoch": 0.40638606676342526,
"grad_norm": 0.4227201150423829,
"learning_rate": 9.710275566050951e-05,
"loss": 1.7061,
"step": 140
},
{
"epoch": 0.409288824383164,
"grad_norm": 0.45752026350838704,
"learning_rate": 9.704988276811883e-05,
"loss": 1.7271,
"step": 141
},
{
"epoch": 0.41219158200290273,
"grad_norm": 0.42509841745667165,
"learning_rate": 9.699654643971472e-05,
"loss": 1.6074,
"step": 142
},
{
"epoch": 0.41509433962264153,
"grad_norm": 0.43248011489070165,
"learning_rate": 9.694274720065399e-05,
"loss": 1.7511,
"step": 143
},
{
"epoch": 0.41799709724238027,
"grad_norm": 0.4725281940940307,
"learning_rate": 9.688848558085306e-05,
"loss": 1.6572,
"step": 144
},
{
"epoch": 0.420899854862119,
"grad_norm": 0.4421316687078927,
"learning_rate": 9.68337621147827e-05,
"loss": 1.6201,
"step": 145
},
{
"epoch": 0.42380261248185774,
"grad_norm": 0.4535950708116043,
"learning_rate": 9.677857734146289e-05,
"loss": 1.6692,
"step": 146
},
{
"epoch": 0.42670537010159654,
"grad_norm": 0.43209953198807594,
"learning_rate": 9.672293180445735e-05,
"loss": 1.6533,
"step": 147
},
{
"epoch": 0.4296081277213353,
"grad_norm": 0.44782737678972356,
"learning_rate": 9.666682605186835e-05,
"loss": 1.7556,
"step": 148
},
{
"epoch": 0.432510885341074,
"grad_norm": 0.44463054319124506,
"learning_rate": 9.661026063633116e-05,
"loss": 1.6907,
"step": 149
},
{
"epoch": 0.43541364296081275,
"grad_norm": 0.4742061834759493,
"learning_rate": 9.655323611500875e-05,
"loss": 1.7614,
"step": 150
},
{
"epoch": 0.43831640058055155,
"grad_norm": 0.4511176764387876,
"learning_rate": 9.649575304958618e-05,
"loss": 1.669,
"step": 151
},
{
"epoch": 0.4412191582002903,
"grad_norm": 0.452395544979668,
"learning_rate": 9.643781200626511e-05,
"loss": 1.7131,
"step": 152
},
{
"epoch": 0.444121915820029,
"grad_norm": 0.4413036784139029,
"learning_rate": 9.637941355575829e-05,
"loss": 1.7243,
"step": 153
},
{
"epoch": 0.44702467343976776,
"grad_norm": 0.45409328535938615,
"learning_rate": 9.632055827328382e-05,
"loss": 1.7142,
"step": 154
},
{
"epoch": 0.44992743105950656,
"grad_norm": 0.4416791338736614,
"learning_rate": 9.626124673855953e-05,
"loss": 1.6888,
"step": 155
},
{
"epoch": 0.4528301886792453,
"grad_norm": 0.45065854670848216,
"learning_rate": 9.620147953579737e-05,
"loss": 1.6547,
"step": 156
},
{
"epoch": 0.45573294629898403,
"grad_norm": 0.4440691797775893,
"learning_rate": 9.614125725369747e-05,
"loss": 1.7076,
"step": 157
},
{
"epoch": 0.45863570391872277,
"grad_norm": 0.44912553064278754,
"learning_rate": 9.608058048544251e-05,
"loss": 1.7288,
"step": 158
},
{
"epoch": 0.46153846153846156,
"grad_norm": 0.44573130790298193,
"learning_rate": 9.601944982869178e-05,
"loss": 1.7162,
"step": 159
},
{
"epoch": 0.4644412191582003,
"grad_norm": 0.4416451246604798,
"learning_rate": 9.595786588557532e-05,
"loss": 1.688,
"step": 160
},
{
"epoch": 0.46734397677793904,
"grad_norm": 0.45331192505099466,
"learning_rate": 9.589582926268798e-05,
"loss": 1.6659,
"step": 161
},
{
"epoch": 0.4702467343976778,
"grad_norm": 0.43429591419151625,
"learning_rate": 9.583334057108346e-05,
"loss": 1.7391,
"step": 162
},
{
"epoch": 0.4731494920174166,
"grad_norm": 0.4515667244387313,
"learning_rate": 9.577040042626833e-05,
"loss": 1.699,
"step": 163
},
{
"epoch": 0.4760522496371553,
"grad_norm": 0.4388097039320571,
"learning_rate": 9.570700944819584e-05,
"loss": 1.6912,
"step": 164
},
{
"epoch": 0.47895500725689405,
"grad_norm": 0.45889865468721397,
"learning_rate": 9.564316826125997e-05,
"loss": 1.6626,
"step": 165
},
{
"epoch": 0.4818577648766328,
"grad_norm": 0.4299355182130924,
"learning_rate": 9.557887749428913e-05,
"loss": 1.6042,
"step": 166
},
{
"epoch": 0.4847605224963715,
"grad_norm": 0.4329249161629588,
"learning_rate": 9.551413778054014e-05,
"loss": 1.6579,
"step": 167
},
{
"epoch": 0.4876632801161103,
"grad_norm": 0.44400677003837674,
"learning_rate": 9.544894975769186e-05,
"loss": 1.74,
"step": 168
},
{
"epoch": 0.49056603773584906,
"grad_norm": 0.4313987173184158,
"learning_rate": 9.538331406783885e-05,
"loss": 1.7035,
"step": 169
},
{
"epoch": 0.4934687953555878,
"grad_norm": 0.44360491893979864,
"learning_rate": 9.531723135748529e-05,
"loss": 1.7241,
"step": 170
},
{
"epoch": 0.49637155297532654,
"grad_norm": 0.43560748469876714,
"learning_rate": 9.525070227753834e-05,
"loss": 1.6842,
"step": 171
},
{
"epoch": 0.49927431059506533,
"grad_norm": 0.4522807238143896,
"learning_rate": 9.518372748330194e-05,
"loss": 1.7524,
"step": 172
},
{
"epoch": 0.502177068214804,
"grad_norm": 0.4516001953202753,
"learning_rate": 9.511630763447019e-05,
"loss": 1.7675,
"step": 173
},
{
"epoch": 0.5050798258345428,
"grad_norm": 0.44105966868492147,
"learning_rate": 9.504844339512095e-05,
"loss": 1.746,
"step": 174
},
{
"epoch": 0.5079825834542816,
"grad_norm": 0.4849606402881667,
"learning_rate": 9.498013543370933e-05,
"loss": 1.6996,
"step": 175
},
{
"epoch": 0.5108853410740203,
"grad_norm": 0.44877678169069063,
"learning_rate": 9.4911384423061e-05,
"loss": 1.6681,
"step": 176
},
{
"epoch": 0.5137880986937591,
"grad_norm": 0.4105957887246027,
"learning_rate": 9.48421910403656e-05,
"loss": 1.7055,
"step": 177
},
{
"epoch": 0.5166908563134979,
"grad_norm": 0.4370878239882375,
"learning_rate": 9.477255596717012e-05,
"loss": 1.721,
"step": 178
},
{
"epoch": 0.5195936139332366,
"grad_norm": 0.4475796642548257,
"learning_rate": 9.470247988937214e-05,
"loss": 1.7089,
"step": 179
},
{
"epoch": 0.5224963715529753,
"grad_norm": 0.4493895378186208,
"learning_rate": 9.463196349721308e-05,
"loss": 1.673,
"step": 180
},
{
"epoch": 0.525399129172714,
"grad_norm": 0.4281856940042948,
"learning_rate": 9.456100748527143e-05,
"loss": 1.6323,
"step": 181
},
{
"epoch": 0.5283018867924528,
"grad_norm": 0.448730455712877,
"learning_rate": 9.448961255245584e-05,
"loss": 1.6809,
"step": 182
},
{
"epoch": 0.5312046444121916,
"grad_norm": 0.4507101270127422,
"learning_rate": 9.441777940199832e-05,
"loss": 1.7177,
"step": 183
},
{
"epoch": 0.5341074020319303,
"grad_norm": 0.49266458121627354,
"learning_rate": 9.434550874144728e-05,
"loss": 1.7051,
"step": 184
},
{
"epoch": 0.5370101596516691,
"grad_norm": 0.44355957269363283,
"learning_rate": 9.42728012826605e-05,
"loss": 1.688,
"step": 185
},
{
"epoch": 0.5399129172714079,
"grad_norm": 0.44170112328599515,
"learning_rate": 9.419965774179824e-05,
"loss": 1.651,
"step": 186
},
{
"epoch": 0.5428156748911466,
"grad_norm": 0.4435040896365132,
"learning_rate": 9.412607883931607e-05,
"loss": 1.6613,
"step": 187
},
{
"epoch": 0.5457184325108854,
"grad_norm": 0.4475448873591262,
"learning_rate": 9.405206529995785e-05,
"loss": 1.7489,
"step": 188
},
{
"epoch": 0.548621190130624,
"grad_norm": 0.44032542363240856,
"learning_rate": 9.397761785274855e-05,
"loss": 1.6927,
"step": 189
},
{
"epoch": 0.5515239477503628,
"grad_norm": 0.42320618772967467,
"learning_rate": 9.39027372309871e-05,
"loss": 1.6916,
"step": 190
},
{
"epoch": 0.5544267053701016,
"grad_norm": 0.44519721803586065,
"learning_rate": 9.382742417223913e-05,
"loss": 1.6295,
"step": 191
},
{
"epoch": 0.5573294629898403,
"grad_norm": 0.4647273184922122,
"learning_rate": 9.375167941832973e-05,
"loss": 1.6854,
"step": 192
},
{
"epoch": 0.5602322206095791,
"grad_norm": 0.4392017166893709,
"learning_rate": 9.367550371533614e-05,
"loss": 1.6715,
"step": 193
},
{
"epoch": 0.5631349782293179,
"grad_norm": 0.44996437022572094,
"learning_rate": 9.359889781358042e-05,
"loss": 1.6952,
"step": 194
},
{
"epoch": 0.5660377358490566,
"grad_norm": 0.461769596883055,
"learning_rate": 9.352186246762201e-05,
"loss": 1.7224,
"step": 195
},
{
"epoch": 0.5689404934687954,
"grad_norm": 0.46096270122354016,
"learning_rate": 9.344439843625034e-05,
"loss": 1.7067,
"step": 196
},
{
"epoch": 0.5718432510885341,
"grad_norm": 0.43532832620404305,
"learning_rate": 9.336650648247732e-05,
"loss": 1.6665,
"step": 197
},
{
"epoch": 0.5747460087082729,
"grad_norm": 0.45325621584387427,
"learning_rate": 9.32881873735299e-05,
"loss": 1.7239,
"step": 198
},
{
"epoch": 0.5776487663280117,
"grad_norm": 0.4433673974331805,
"learning_rate": 9.320944188084242e-05,
"loss": 1.7089,
"step": 199
},
{
"epoch": 0.5805515239477503,
"grad_norm": 0.4313072181776721,
"learning_rate": 9.313027078004903e-05,
"loss": 1.686,
"step": 200
},
{
"epoch": 0.5834542815674891,
"grad_norm": 0.41587032731666324,
"learning_rate": 9.305067485097615e-05,
"loss": 1.6424,
"step": 201
},
{
"epoch": 0.5863570391872278,
"grad_norm": 0.4412815721445294,
"learning_rate": 9.297065487763462e-05,
"loss": 1.661,
"step": 202
},
{
"epoch": 0.5892597968069666,
"grad_norm": 0.46726947815511294,
"learning_rate": 9.289021164821214e-05,
"loss": 1.7301,
"step": 203
},
{
"epoch": 0.5921625544267054,
"grad_norm": 0.45962461407461297,
"learning_rate": 9.280934595506536e-05,
"loss": 1.7223,
"step": 204
},
{
"epoch": 0.5950653120464441,
"grad_norm": 0.41982594349547003,
"learning_rate": 9.272805859471225e-05,
"loss": 1.6335,
"step": 205
},
{
"epoch": 0.5979680696661829,
"grad_norm": 0.4690787688896373,
"learning_rate": 9.264635036782405e-05,
"loss": 1.6864,
"step": 206
},
{
"epoch": 0.6008708272859217,
"grad_norm": 0.4241521450619991,
"learning_rate": 9.256422207921757e-05,
"loss": 1.6836,
"step": 207
},
{
"epoch": 0.6037735849056604,
"grad_norm": 0.41405976202828276,
"learning_rate": 9.248167453784711e-05,
"loss": 1.6965,
"step": 208
},
{
"epoch": 0.6066763425253991,
"grad_norm": 0.45850648087284657,
"learning_rate": 9.239870855679664e-05,
"loss": 1.7154,
"step": 209
},
{
"epoch": 0.6095791001451378,
"grad_norm": 0.44195519786790405,
"learning_rate": 9.231532495327165e-05,
"loss": 1.6385,
"step": 210
},
{
"epoch": 0.6124818577648766,
"grad_norm": 0.4450199528491929,
"learning_rate": 9.223152454859116e-05,
"loss": 1.7519,
"step": 211
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.4359625629528257,
"learning_rate": 9.21473081681797e-05,
"loss": 1.7282,
"step": 212
},
{
"epoch": 0.6182873730043541,
"grad_norm": 0.4238801711778006,
"learning_rate": 9.206267664155907e-05,
"loss": 1.6719,
"step": 213
},
{
"epoch": 0.6211901306240929,
"grad_norm": 0.44179537065960767,
"learning_rate": 9.197763080234019e-05,
"loss": 1.6467,
"step": 214
},
{
"epoch": 0.6240928882438317,
"grad_norm": 0.4422804377999602,
"learning_rate": 9.189217148821495e-05,
"loss": 1.7194,
"step": 215
},
{
"epoch": 0.6269956458635704,
"grad_norm": 0.44164585839328874,
"learning_rate": 9.180629954094792e-05,
"loss": 1.7347,
"step": 216
},
{
"epoch": 0.6298984034833092,
"grad_norm": 0.4278863683963636,
"learning_rate": 9.172001580636804e-05,
"loss": 1.7269,
"step": 217
},
{
"epoch": 0.6328011611030478,
"grad_norm": 0.43980909887162917,
"learning_rate": 9.163332113436032e-05,
"loss": 1.6935,
"step": 218
},
{
"epoch": 0.6357039187227866,
"grad_norm": 0.44425865440375695,
"learning_rate": 9.154621637885745e-05,
"loss": 1.669,
"step": 219
},
{
"epoch": 0.6386066763425254,
"grad_norm": 0.45665984376825003,
"learning_rate": 9.145870239783142e-05,
"loss": 1.6619,
"step": 220
},
{
"epoch": 0.6415094339622641,
"grad_norm": 0.4317790466894785,
"learning_rate": 9.1370780053285e-05,
"loss": 1.6749,
"step": 221
},
{
"epoch": 0.6444121915820029,
"grad_norm": 0.4374390480052056,
"learning_rate": 9.128245021124334e-05,
"loss": 1.632,
"step": 222
},
{
"epoch": 0.6473149492017417,
"grad_norm": 0.4281924955127084,
"learning_rate": 9.119371374174535e-05,
"loss": 1.6861,
"step": 223
},
{
"epoch": 0.6502177068214804,
"grad_norm": 0.4161162445315734,
"learning_rate": 9.110457151883523e-05,
"loss": 1.6968,
"step": 224
},
{
"epoch": 0.6531204644412192,
"grad_norm": 0.43870224472437325,
"learning_rate": 9.101502442055374e-05,
"loss": 1.599,
"step": 225
},
{
"epoch": 0.6560232220609579,
"grad_norm": 0.4359888202372141,
"learning_rate": 9.092507332892968e-05,
"loss": 1.6536,
"step": 226
},
{
"epoch": 0.6589259796806967,
"grad_norm": 0.43002636945264805,
"learning_rate": 9.083471912997108e-05,
"loss": 1.6152,
"step": 227
},
{
"epoch": 0.6618287373004355,
"grad_norm": 0.459041720599752,
"learning_rate": 9.07439627136566e-05,
"loss": 1.667,
"step": 228
},
{
"epoch": 0.6647314949201741,
"grad_norm": 0.42190326762874336,
"learning_rate": 9.065280497392663e-05,
"loss": 1.6458,
"step": 229
},
{
"epoch": 0.6676342525399129,
"grad_norm": 0.4519146931950673,
"learning_rate": 9.056124680867457e-05,
"loss": 1.7246,
"step": 230
},
{
"epoch": 0.6705370101596516,
"grad_norm": 0.42286975888040507,
"learning_rate": 9.046928911973799e-05,
"loss": 1.671,
"step": 231
},
{
"epoch": 0.6734397677793904,
"grad_norm": 0.42054642818212734,
"learning_rate": 9.037693281288969e-05,
"loss": 1.7556,
"step": 232
},
{
"epoch": 0.6763425253991292,
"grad_norm": 0.4267347059621492,
"learning_rate": 9.028417879782884e-05,
"loss": 1.6918,
"step": 233
},
{
"epoch": 0.6792452830188679,
"grad_norm": 0.4299200349186463,
"learning_rate": 9.019102798817197e-05,
"loss": 1.7213,
"step": 234
},
{
"epoch": 0.6821480406386067,
"grad_norm": 0.4679979939316583,
"learning_rate": 9.009748130144397e-05,
"loss": 1.7113,
"step": 235
},
{
"epoch": 0.6850507982583455,
"grad_norm": 0.4165197164286497,
"learning_rate": 9.000353965906917e-05,
"loss": 1.755,
"step": 236
},
{
"epoch": 0.6879535558780842,
"grad_norm": 0.4376865087058385,
"learning_rate": 8.990920398636205e-05,
"loss": 1.7524,
"step": 237
},
{
"epoch": 0.690856313497823,
"grad_norm": 0.43415428613852924,
"learning_rate": 8.981447521251831e-05,
"loss": 1.6645,
"step": 238
},
{
"epoch": 0.6937590711175616,
"grad_norm": 0.4308009233363977,
"learning_rate": 8.971935427060562e-05,
"loss": 1.6458,
"step": 239
},
{
"epoch": 0.6966618287373004,
"grad_norm": 0.4376225728875994,
"learning_rate": 8.962384209755452e-05,
"loss": 1.6881,
"step": 240
},
{
"epoch": 0.6995645863570392,
"grad_norm": 0.4351603262250282,
"learning_rate": 8.952793963414907e-05,
"loss": 1.6356,
"step": 241
},
{
"epoch": 0.7024673439767779,
"grad_norm": 0.41191976288984083,
"learning_rate": 8.943164782501765e-05,
"loss": 1.642,
"step": 242
},
{
"epoch": 0.7053701015965167,
"grad_norm": 0.4470078005612137,
"learning_rate": 8.933496761862368e-05,
"loss": 1.684,
"step": 243
},
{
"epoch": 0.7082728592162555,
"grad_norm": 0.4464448207075725,
"learning_rate": 8.923789996725624e-05,
"loss": 1.6926,
"step": 244
},
{
"epoch": 0.7111756168359942,
"grad_norm": 0.4439554476739618,
"learning_rate": 8.914044582702068e-05,
"loss": 1.6759,
"step": 245
},
{
"epoch": 0.714078374455733,
"grad_norm": 0.4266396669859411,
"learning_rate": 8.904260615782927e-05,
"loss": 1.6153,
"step": 246
},
{
"epoch": 0.7169811320754716,
"grad_norm": 0.4931892956511733,
"learning_rate": 8.894438192339163e-05,
"loss": 1.6291,
"step": 247
},
{
"epoch": 0.7198838896952104,
"grad_norm": 0.42881550025697013,
"learning_rate": 8.884577409120535e-05,
"loss": 1.7295,
"step": 248
},
{
"epoch": 0.7227866473149492,
"grad_norm": 0.44272726722256484,
"learning_rate": 8.874678363254642e-05,
"loss": 1.6403,
"step": 249
},
{
"epoch": 0.7256894049346879,
"grad_norm": 0.4576763000793492,
"learning_rate": 8.864741152245963e-05,
"loss": 1.6544,
"step": 250
},
{
"epoch": 0.7285921625544267,
"grad_norm": 0.4440310763777389,
"learning_rate": 8.854765873974898e-05,
"loss": 1.6978,
"step": 251
},
{
"epoch": 0.7314949201741655,
"grad_norm": 0.42684621033853837,
"learning_rate": 8.84475262669681e-05,
"loss": 1.6887,
"step": 252
},
{
"epoch": 0.7343976777939042,
"grad_norm": 0.4489760167291314,
"learning_rate": 8.83470150904105e-05,
"loss": 1.6672,
"step": 253
},
{
"epoch": 0.737300435413643,
"grad_norm": 0.43172408351691677,
"learning_rate": 8.824612620009987e-05,
"loss": 1.5953,
"step": 254
},
{
"epoch": 0.7402031930333817,
"grad_norm": 0.4110953233017019,
"learning_rate": 8.814486058978035e-05,
"loss": 1.6034,
"step": 255
},
{
"epoch": 0.7431059506531205,
"grad_norm": 0.40904113349658683,
"learning_rate": 8.804321925690672e-05,
"loss": 1.666,
"step": 256
},
{
"epoch": 0.7460087082728593,
"grad_norm": 0.4382496801354837,
"learning_rate": 8.794120320263459e-05,
"loss": 1.7069,
"step": 257
},
{
"epoch": 0.7489114658925979,
"grad_norm": 0.42893323771056535,
"learning_rate": 8.783881343181055e-05,
"loss": 1.7267,
"step": 258
},
{
"epoch": 0.7518142235123367,
"grad_norm": 0.4336644359165075,
"learning_rate": 8.773605095296223e-05,
"loss": 1.6652,
"step": 259
},
{
"epoch": 0.7547169811320755,
"grad_norm": 0.4273502890060714,
"learning_rate": 8.763291677828838e-05,
"loss": 1.7163,
"step": 260
},
{
"epoch": 0.7576197387518142,
"grad_norm": 0.4391253131606736,
"learning_rate": 8.752941192364897e-05,
"loss": 1.5948,
"step": 261
},
{
"epoch": 0.760522496371553,
"grad_norm": 0.44107969761858984,
"learning_rate": 8.742553740855506e-05,
"loss": 1.669,
"step": 262
},
{
"epoch": 0.7634252539912917,
"grad_norm": 0.42570452087852995,
"learning_rate": 8.732129425615887e-05,
"loss": 1.6528,
"step": 263
},
{
"epoch": 0.7663280116110305,
"grad_norm": 0.42162896720262233,
"learning_rate": 8.721668349324364e-05,
"loss": 1.6558,
"step": 264
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.4377952288804985,
"learning_rate": 8.71117061502135e-05,
"loss": 1.6984,
"step": 265
},
{
"epoch": 0.772133526850508,
"grad_norm": 0.4380368831777441,
"learning_rate": 8.700636326108342e-05,
"loss": 1.7376,
"step": 266
},
{
"epoch": 0.7750362844702468,
"grad_norm": 0.4400229537091114,
"learning_rate": 8.690065586346889e-05,
"loss": 1.6064,
"step": 267
},
{
"epoch": 0.7779390420899854,
"grad_norm": 0.42899135268116534,
"learning_rate": 8.679458499857582e-05,
"loss": 1.6499,
"step": 268
},
{
"epoch": 0.7808417997097242,
"grad_norm": 0.44231074532110715,
"learning_rate": 8.668815171119021e-05,
"loss": 1.6692,
"step": 269
},
{
"epoch": 0.783744557329463,
"grad_norm": 0.44652261911895086,
"learning_rate": 8.658135704966786e-05,
"loss": 1.6803,
"step": 270
},
{
"epoch": 0.7866473149492017,
"grad_norm": 0.43670835811843034,
"learning_rate": 8.647420206592409e-05,
"loss": 1.6069,
"step": 271
},
{
"epoch": 0.7895500725689405,
"grad_norm": 0.4757851454164046,
"learning_rate": 8.636668781542336e-05,
"loss": 1.709,
"step": 272
},
{
"epoch": 0.7924528301886793,
"grad_norm": 0.4706291238431953,
"learning_rate": 8.625881535716883e-05,
"loss": 1.7071,
"step": 273
},
{
"epoch": 0.795355587808418,
"grad_norm": 0.4209656304227592,
"learning_rate": 8.615058575369202e-05,
"loss": 1.629,
"step": 274
},
{
"epoch": 0.7982583454281568,
"grad_norm": 0.40246249569518394,
"learning_rate": 8.604200007104221e-05,
"loss": 1.6557,
"step": 275
},
{
"epoch": 0.8011611030478955,
"grad_norm": 0.4011849556528559,
"learning_rate": 8.593305937877614e-05,
"loss": 1.6376,
"step": 276
},
{
"epoch": 0.8040638606676342,
"grad_norm": 0.41650092329931404,
"learning_rate": 8.582376474994723e-05,
"loss": 1.6825,
"step": 277
},
{
"epoch": 0.806966618287373,
"grad_norm": 0.43881972878342623,
"learning_rate": 8.571411726109519e-05,
"loss": 1.6679,
"step": 278
},
{
"epoch": 0.8098693759071117,
"grad_norm": 0.4186770919262705,
"learning_rate": 8.560411799223538e-05,
"loss": 1.6319,
"step": 279
},
{
"epoch": 0.8127721335268505,
"grad_norm": 0.43295766196299695,
"learning_rate": 8.549376802684812e-05,
"loss": 1.6856,
"step": 280
},
{
"epoch": 0.8156748911465893,
"grad_norm": 0.44692408647350845,
"learning_rate": 8.538306845186808e-05,
"loss": 1.713,
"step": 281
},
{
"epoch": 0.818577648766328,
"grad_norm": 0.4205537150849456,
"learning_rate": 8.527202035767349e-05,
"loss": 1.6273,
"step": 282
},
{
"epoch": 0.8214804063860668,
"grad_norm": 0.4577742643165331,
"learning_rate": 8.516062483807556e-05,
"loss": 1.7133,
"step": 283
},
{
"epoch": 0.8243831640058055,
"grad_norm": 0.4187281454799487,
"learning_rate": 8.504888299030747e-05,
"loss": 1.6586,
"step": 284
},
{
"epoch": 0.8272859216255443,
"grad_norm": 0.43233950001160404,
"learning_rate": 8.49367959150138e-05,
"loss": 1.6911,
"step": 285
},
{
"epoch": 0.8301886792452831,
"grad_norm": 0.44054069970306636,
"learning_rate": 8.482436471623951e-05,
"loss": 1.6014,
"step": 286
},
{
"epoch": 0.8330914368650217,
"grad_norm": 0.419504563014629,
"learning_rate": 8.471159050141917e-05,
"loss": 1.6201,
"step": 287
},
{
"epoch": 0.8359941944847605,
"grad_norm": 0.4267273875690618,
"learning_rate": 8.459847438136605e-05,
"loss": 1.6561,
"step": 288
},
{
"epoch": 0.8388969521044993,
"grad_norm": 0.4095530104906412,
"learning_rate": 8.448501747026108e-05,
"loss": 1.6043,
"step": 289
},
{
"epoch": 0.841799709724238,
"grad_norm": 0.4256839492081417,
"learning_rate": 8.437122088564198e-05,
"loss": 1.5867,
"step": 290
},
{
"epoch": 0.8447024673439768,
"grad_norm": 0.42603345937629117,
"learning_rate": 8.425708574839222e-05,
"loss": 1.7064,
"step": 291
},
{
"epoch": 0.8476052249637155,
"grad_norm": 0.44000309127818,
"learning_rate": 8.414261318272996e-05,
"loss": 1.7288,
"step": 292
},
{
"epoch": 0.8505079825834543,
"grad_norm": 0.4705600537986732,
"learning_rate": 8.4027804316197e-05,
"loss": 1.6549,
"step": 293
},
{
"epoch": 0.8534107402031931,
"grad_norm": 0.4371349211092677,
"learning_rate": 8.391266027964771e-05,
"loss": 1.7102,
"step": 294
},
{
"epoch": 0.8563134978229318,
"grad_norm": 0.43342326008475557,
"learning_rate": 8.379718220723773e-05,
"loss": 1.6049,
"step": 295
},
{
"epoch": 0.8592162554426706,
"grad_norm": 0.4144010575039066,
"learning_rate": 8.368137123641302e-05,
"loss": 1.6604,
"step": 296
},
{
"epoch": 0.8621190130624092,
"grad_norm": 0.42770184288127466,
"learning_rate": 8.356522850789852e-05,
"loss": 1.797,
"step": 297
},
{
"epoch": 0.865021770682148,
"grad_norm": 0.43598223112240264,
"learning_rate": 8.344875516568695e-05,
"loss": 1.6497,
"step": 298
},
{
"epoch": 0.8679245283018868,
"grad_norm": 0.44313850001917815,
"learning_rate": 8.333195235702751e-05,
"loss": 1.6592,
"step": 299
},
{
"epoch": 0.8708272859216255,
"grad_norm": 0.4455207397038388,
"learning_rate": 8.321482123241464e-05,
"loss": 1.6683,
"step": 300
},
{
"epoch": 0.8737300435413643,
"grad_norm": 0.4271507500306176,
"learning_rate": 8.309736294557666e-05,
"loss": 1.677,
"step": 301
},
{
"epoch": 0.8766328011611031,
"grad_norm": 0.4327819977550384,
"learning_rate": 8.297957865346437e-05,
"loss": 1.7103,
"step": 302
},
{
"epoch": 0.8795355587808418,
"grad_norm": 0.4322356173842947,
"learning_rate": 8.28614695162397e-05,
"loss": 1.7342,
"step": 303
},
{
"epoch": 0.8824383164005806,
"grad_norm": 0.4458667249236009,
"learning_rate": 8.274303669726426e-05,
"loss": 1.6622,
"step": 304
},
{
"epoch": 0.8853410740203193,
"grad_norm": 0.4246602730793701,
"learning_rate": 8.262428136308791e-05,
"loss": 1.6357,
"step": 305
},
{
"epoch": 0.888243831640058,
"grad_norm": 0.42063333438955813,
"learning_rate": 8.250520468343722e-05,
"loss": 1.6771,
"step": 306
},
{
"epoch": 0.8911465892597968,
"grad_norm": 0.42285886799986405,
"learning_rate": 8.2385807831204e-05,
"loss": 1.66,
"step": 307
},
{
"epoch": 0.8940493468795355,
"grad_norm": 0.4230350942489117,
"learning_rate": 8.226609198243372e-05,
"loss": 1.6,
"step": 308
},
{
"epoch": 0.8969521044992743,
"grad_norm": 0.42632544312035536,
"learning_rate": 8.21460583163139e-05,
"loss": 1.5493,
"step": 309
},
{
"epoch": 0.8998548621190131,
"grad_norm": 0.4217084548017376,
"learning_rate": 8.202570801516256e-05,
"loss": 1.6828,
"step": 310
},
{
"epoch": 0.9027576197387518,
"grad_norm": 0.41889086453856245,
"learning_rate": 8.190504226441654e-05,
"loss": 1.6705,
"step": 311
},
{
"epoch": 0.9056603773584906,
"grad_norm": 0.4524646504864964,
"learning_rate": 8.178406225261981e-05,
"loss": 1.6602,
"step": 312
},
{
"epoch": 0.9085631349782293,
"grad_norm": 0.4159183959864635,
"learning_rate": 8.166276917141176e-05,
"loss": 1.526,
"step": 313
},
{
"epoch": 0.9114658925979681,
"grad_norm": 0.43623491249348967,
"learning_rate": 8.15411642155155e-05,
"loss": 1.6022,
"step": 314
},
{
"epoch": 0.9143686502177069,
"grad_norm": 0.43630207804428456,
"learning_rate": 8.141924858272609e-05,
"loss": 1.6419,
"step": 315
},
{
"epoch": 0.9172714078374455,
"grad_norm": 0.44003284623300876,
"learning_rate": 8.129702347389865e-05,
"loss": 1.7058,
"step": 316
},
{
"epoch": 0.9201741654571843,
"grad_norm": 0.4529852019735458,
"learning_rate": 8.117449009293668e-05,
"loss": 1.6486,
"step": 317
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.4238637935417013,
"learning_rate": 8.105164964678009e-05,
"loss": 1.6734,
"step": 318
},
{
"epoch": 0.9259796806966618,
"grad_norm": 0.42861201454328823,
"learning_rate": 8.092850334539337e-05,
"loss": 1.5744,
"step": 319
},
{
"epoch": 0.9288824383164006,
"grad_norm": 0.4349547932295138,
"learning_rate": 8.080505240175363e-05,
"loss": 1.6993,
"step": 320
},
{
"epoch": 0.9317851959361393,
"grad_norm": 0.43287719452832757,
"learning_rate": 8.068129803183864e-05,
"loss": 1.6767,
"step": 321
},
{
"epoch": 0.9346879535558781,
"grad_norm": 0.42959983515862316,
"learning_rate": 8.055724145461495e-05,
"loss": 1.6641,
"step": 322
},
{
"epoch": 0.9375907111756169,
"grad_norm": 0.45766846862813504,
"learning_rate": 8.043288389202578e-05,
"loss": 1.6885,
"step": 323
},
{
"epoch": 0.9404934687953556,
"grad_norm": 0.42789701721313445,
"learning_rate": 8.030822656897902e-05,
"loss": 1.6368,
"step": 324
},
{
"epoch": 0.9433962264150944,
"grad_norm": 0.43189980552562807,
"learning_rate": 8.018327071333521e-05,
"loss": 1.6091,
"step": 325
},
{
"epoch": 0.9462989840348331,
"grad_norm": 0.422299733205841,
"learning_rate": 8.005801755589532e-05,
"loss": 1.6582,
"step": 326
},
{
"epoch": 0.9492017416545718,
"grad_norm": 0.41856064274041027,
"learning_rate": 7.993246833038881e-05,
"loss": 1.636,
"step": 327
},
{
"epoch": 0.9521044992743106,
"grad_norm": 0.4099269881063958,
"learning_rate": 7.980662427346127e-05,
"loss": 1.6235,
"step": 328
},
{
"epoch": 0.9550072568940493,
"grad_norm": 0.4327704269392379,
"learning_rate": 7.968048662466244e-05,
"loss": 1.6926,
"step": 329
},
{
"epoch": 0.9579100145137881,
"grad_norm": 0.45778095424608994,
"learning_rate": 7.955405662643384e-05,
"loss": 1.7089,
"step": 330
},
{
"epoch": 0.9608127721335269,
"grad_norm": 0.43031857434840076,
"learning_rate": 7.942733552409662e-05,
"loss": 1.6863,
"step": 331
},
{
"epoch": 0.9637155297532656,
"grad_norm": 0.4302959280366904,
"learning_rate": 7.930032456583931e-05,
"loss": 1.6907,
"step": 332
},
{
"epoch": 0.9666182873730044,
"grad_norm": 0.43303508356068177,
"learning_rate": 7.917302500270544e-05,
"loss": 1.6244,
"step": 333
},
{
"epoch": 0.969521044992743,
"grad_norm": 0.4231111192621401,
"learning_rate": 7.904543808858127e-05,
"loss": 1.6138,
"step": 334
},
{
"epoch": 0.9724238026124818,
"grad_norm": 0.43441845159441095,
"learning_rate": 7.891756508018347e-05,
"loss": 1.6666,
"step": 335
},
{
"epoch": 0.9753265602322206,
"grad_norm": 0.46476422124026373,
"learning_rate": 7.878940723704664e-05,
"loss": 1.6873,
"step": 336
},
{
"epoch": 0.9782293178519593,
"grad_norm": 0.4173416711617669,
"learning_rate": 7.866096582151106e-05,
"loss": 1.6327,
"step": 337
},
{
"epoch": 0.9811320754716981,
"grad_norm": 0.43165259714088045,
"learning_rate": 7.853224209871007e-05,
"loss": 1.6907,
"step": 338
},
{
"epoch": 0.9840348330914369,
"grad_norm": 0.43798802078246135,
"learning_rate": 7.840323733655778e-05,
"loss": 1.7048,
"step": 339
},
{
"epoch": 0.9869375907111756,
"grad_norm": 0.4193212868688927,
"learning_rate": 7.82739528057365e-05,
"loss": 1.6611,
"step": 340
},
{
"epoch": 0.9898403483309144,
"grad_norm": 0.43758330970313436,
"learning_rate": 7.814438977968417e-05,
"loss": 1.6364,
"step": 341
},
{
"epoch": 0.9927431059506531,
"grad_norm": 0.4307454839904789,
"learning_rate": 7.801454953458193e-05,
"loss": 1.6779,
"step": 342
},
{
"epoch": 0.9956458635703919,
"grad_norm": 0.4414850225491405,
"learning_rate": 7.788443334934148e-05,
"loss": 1.6861,
"step": 343
},
{
"epoch": 0.9985486211901307,
"grad_norm": 0.4216564113946405,
"learning_rate": 7.775404250559249e-05,
"loss": 1.6926,
"step": 344
},
{
"epoch": 1.0014513788098693,
"grad_norm": 0.3927553987869672,
"learning_rate": 7.762337828767e-05,
"loss": 1.5918,
"step": 345
},
{
"epoch": 1.004354136429608,
"grad_norm": 0.408742748023074,
"learning_rate": 7.749244198260175e-05,
"loss": 1.5443,
"step": 346
},
{
"epoch": 1.007256894049347,
"grad_norm": 0.4239498040269477,
"learning_rate": 7.736123488009551e-05,
"loss": 1.465,
"step": 347
},
{
"epoch": 1.0101596516690856,
"grad_norm": 0.4090814175320919,
"learning_rate": 7.722975827252638e-05,
"loss": 1.515,
"step": 348
},
{
"epoch": 1.0130624092888243,
"grad_norm": 0.4046990263672099,
"learning_rate": 7.709801345492402e-05,
"loss": 1.5449,
"step": 349
},
{
"epoch": 1.0159651669085632,
"grad_norm": 0.44882958150904695,
"learning_rate": 7.696600172495997e-05,
"loss": 1.5235,
"step": 350
},
{
"epoch": 1.0188679245283019,
"grad_norm": 0.4479881510852371,
"learning_rate": 7.68337243829348e-05,
"loss": 1.4881,
"step": 351
},
{
"epoch": 1.0217706821480406,
"grad_norm": 0.46463406214710495,
"learning_rate": 7.670118273176534e-05,
"loss": 1.5212,
"step": 352
},
{
"epoch": 1.0246734397677795,
"grad_norm": 0.46598180830307795,
"learning_rate": 7.656837807697187e-05,
"loss": 1.4887,
"step": 353
},
{
"epoch": 1.0275761973875182,
"grad_norm": 0.43790982311840687,
"learning_rate": 7.643531172666513e-05,
"loss": 1.5496,
"step": 354
},
{
"epoch": 1.0304789550072568,
"grad_norm": 0.45187688140730564,
"learning_rate": 7.630198499153365e-05,
"loss": 1.5717,
"step": 355
},
{
"epoch": 1.0333817126269957,
"grad_norm": 0.45238616928753783,
"learning_rate": 7.616839918483061e-05,
"loss": 1.5723,
"step": 356
},
{
"epoch": 1.0362844702467344,
"grad_norm": 0.4343044191882898,
"learning_rate": 7.603455562236108e-05,
"loss": 1.5184,
"step": 357
},
{
"epoch": 1.039187227866473,
"grad_norm": 0.4269296516915997,
"learning_rate": 7.590045562246902e-05,
"loss": 1.4609,
"step": 358
},
{
"epoch": 1.042089985486212,
"grad_norm": 0.4268847911287365,
"learning_rate": 7.576610050602419e-05,
"loss": 1.4784,
"step": 359
},
{
"epoch": 1.0449927431059507,
"grad_norm": 0.4259921959504707,
"learning_rate": 7.563149159640929e-05,
"loss": 1.525,
"step": 360
},
{
"epoch": 1.0478955007256894,
"grad_norm": 0.4310613323251629,
"learning_rate": 7.54966302195068e-05,
"loss": 1.508,
"step": 361
},
{
"epoch": 1.050798258345428,
"grad_norm": 0.45388361418401785,
"learning_rate": 7.5361517703686e-05,
"loss": 1.5654,
"step": 362
},
{
"epoch": 1.053701015965167,
"grad_norm": 0.45712714344979927,
"learning_rate": 7.52261553797899e-05,
"loss": 1.612,
"step": 363
},
{
"epoch": 1.0566037735849056,
"grad_norm": 0.431367940553335,
"learning_rate": 7.509054458112202e-05,
"loss": 1.5303,
"step": 364
},
{
"epoch": 1.0595065312046443,
"grad_norm": 0.4300937918749757,
"learning_rate": 7.495468664343333e-05,
"loss": 1.6036,
"step": 365
},
{
"epoch": 1.0624092888243832,
"grad_norm": 0.4388398837222987,
"learning_rate": 7.481858290490917e-05,
"loss": 1.5068,
"step": 366
},
{
"epoch": 1.065312046444122,
"grad_norm": 0.42653096024854126,
"learning_rate": 7.468223470615593e-05,
"loss": 1.4428,
"step": 367
},
{
"epoch": 1.0682148040638606,
"grad_norm": 0.4428787842472243,
"learning_rate": 7.45456433901879e-05,
"loss": 1.4958,
"step": 368
},
{
"epoch": 1.0711175616835995,
"grad_norm": 0.4269992101825629,
"learning_rate": 7.440881030241407e-05,
"loss": 1.4692,
"step": 369
},
{
"epoch": 1.0740203193033382,
"grad_norm": 0.4420004900877966,
"learning_rate": 7.427173679062484e-05,
"loss": 1.4974,
"step": 370
},
{
"epoch": 1.0769230769230769,
"grad_norm": 0.42070168574249084,
"learning_rate": 7.413442420497881e-05,
"loss": 1.3889,
"step": 371
},
{
"epoch": 1.0798258345428158,
"grad_norm": 0.44924175657331056,
"learning_rate": 7.399687389798933e-05,
"loss": 1.4836,
"step": 372
},
{
"epoch": 1.0827285921625545,
"grad_norm": 0.44923515412003434,
"learning_rate": 7.385908722451136e-05,
"loss": 1.5055,
"step": 373
},
{
"epoch": 1.0856313497822931,
"grad_norm": 0.4306963280975785,
"learning_rate": 7.372106554172802e-05,
"loss": 1.5723,
"step": 374
},
{
"epoch": 1.0885341074020318,
"grad_norm": 0.4634883789912338,
"learning_rate": 7.358281020913725e-05,
"loss": 1.5051,
"step": 375
},
{
"epoch": 1.0914368650217707,
"grad_norm": 0.4407467253970956,
"learning_rate": 7.344432258853841e-05,
"loss": 1.5304,
"step": 376
},
{
"epoch": 1.0943396226415094,
"grad_norm": 0.4684616468886323,
"learning_rate": 7.330560404401885e-05,
"loss": 1.4777,
"step": 377
},
{
"epoch": 1.097242380261248,
"grad_norm": 0.4408488981936893,
"learning_rate": 7.316665594194053e-05,
"loss": 1.5379,
"step": 378
},
{
"epoch": 1.100145137880987,
"grad_norm": 0.44650617351083766,
"learning_rate": 7.302747965092651e-05,
"loss": 1.4846,
"step": 379
},
{
"epoch": 1.1030478955007257,
"grad_norm": 0.41274338104074,
"learning_rate": 7.288807654184747e-05,
"loss": 1.5091,
"step": 380
},
{
"epoch": 1.1059506531204644,
"grad_norm": 0.436817974565293,
"learning_rate": 7.274844798780826e-05,
"loss": 1.4659,
"step": 381
},
{
"epoch": 1.1088534107402033,
"grad_norm": 0.43134088076402816,
"learning_rate": 7.260859536413429e-05,
"loss": 1.4799,
"step": 382
},
{
"epoch": 1.111756168359942,
"grad_norm": 0.4286080820128778,
"learning_rate": 7.246852004835807e-05,
"loss": 1.5189,
"step": 383
},
{
"epoch": 1.1146589259796806,
"grad_norm": 0.4263779559224032,
"learning_rate": 7.232822342020557e-05,
"loss": 1.5645,
"step": 384
},
{
"epoch": 1.1175616835994195,
"grad_norm": 0.476132559707573,
"learning_rate": 7.218770686158271e-05,
"loss": 1.5727,
"step": 385
},
{
"epoch": 1.1204644412191582,
"grad_norm": 0.45274640109259867,
"learning_rate": 7.204697175656165e-05,
"loss": 1.5782,
"step": 386
},
{
"epoch": 1.123367198838897,
"grad_norm": 0.45242502456818096,
"learning_rate": 7.19060194913672e-05,
"loss": 1.5172,
"step": 387
},
{
"epoch": 1.1262699564586356,
"grad_norm": 0.4532270846512389,
"learning_rate": 7.176485145436325e-05,
"loss": 1.5026,
"step": 388
},
{
"epoch": 1.1291727140783745,
"grad_norm": 0.4263355677478294,
"learning_rate": 7.162346903603893e-05,
"loss": 1.5122,
"step": 389
},
{
"epoch": 1.1320754716981132,
"grad_norm": 0.4537042592874118,
"learning_rate": 7.148187362899505e-05,
"loss": 1.4688,
"step": 390
},
{
"epoch": 1.134978229317852,
"grad_norm": 0.45561433651230426,
"learning_rate": 7.134006662793031e-05,
"loss": 1.5007,
"step": 391
},
{
"epoch": 1.1378809869375908,
"grad_norm": 0.4387022455968704,
"learning_rate": 7.119804942962762e-05,
"loss": 1.5252,
"step": 392
},
{
"epoch": 1.1407837445573294,
"grad_norm": 0.44235834863811213,
"learning_rate": 7.10558234329403e-05,
"loss": 1.5252,
"step": 393
},
{
"epoch": 1.1436865021770681,
"grad_norm": 0.4368303556457009,
"learning_rate": 7.091339003877826e-05,
"loss": 1.4665,
"step": 394
},
{
"epoch": 1.146589259796807,
"grad_norm": 0.45803823280366857,
"learning_rate": 7.077075065009433e-05,
"loss": 1.478,
"step": 395
},
{
"epoch": 1.1494920174165457,
"grad_norm": 0.45290179376039674,
"learning_rate": 7.062790667187029e-05,
"loss": 1.4727,
"step": 396
},
{
"epoch": 1.1523947750362844,
"grad_norm": 0.4383395184823553,
"learning_rate": 7.048485951110317e-05,
"loss": 1.59,
"step": 397
},
{
"epoch": 1.1552975326560233,
"grad_norm": 0.45712023279133995,
"learning_rate": 7.034161057679127e-05,
"loss": 1.5599,
"step": 398
},
{
"epoch": 1.158200290275762,
"grad_norm": 0.4752149964978104,
"learning_rate": 7.019816127992039e-05,
"loss": 1.5493,
"step": 399
},
{
"epoch": 1.1611030478955007,
"grad_norm": 0.4432142480657464,
"learning_rate": 7.005451303344979e-05,
"loss": 1.5279,
"step": 400
},
{
"epoch": 1.1640058055152394,
"grad_norm": 0.45084593594628664,
"learning_rate": 6.991066725229848e-05,
"loss": 1.5041,
"step": 401
},
{
"epoch": 1.1669085631349783,
"grad_norm": 0.4419227545059286,
"learning_rate": 6.976662535333107e-05,
"loss": 1.4327,
"step": 402
},
{
"epoch": 1.169811320754717,
"grad_norm": 0.47097879348010746,
"learning_rate": 6.962238875534396e-05,
"loss": 1.5143,
"step": 403
},
{
"epoch": 1.1727140783744558,
"grad_norm": 0.45487726246647237,
"learning_rate": 6.947795887905127e-05,
"loss": 1.583,
"step": 404
},
{
"epoch": 1.1756168359941945,
"grad_norm": 0.44380204515587235,
"learning_rate": 6.933333714707094e-05,
"loss": 1.5113,
"step": 405
},
{
"epoch": 1.1785195936139332,
"grad_norm": 0.4703801549155496,
"learning_rate": 6.918852498391063e-05,
"loss": 1.5,
"step": 406
},
{
"epoch": 1.181422351233672,
"grad_norm": 0.43088174923594214,
"learning_rate": 6.904352381595374e-05,
"loss": 1.5153,
"step": 407
},
{
"epoch": 1.1843251088534108,
"grad_norm": 0.44575726222365986,
"learning_rate": 6.889833507144532e-05,
"loss": 1.4146,
"step": 408
},
{
"epoch": 1.1872278664731495,
"grad_norm": 0.44621234533749987,
"learning_rate": 6.87529601804781e-05,
"loss": 1.5305,
"step": 409
},
{
"epoch": 1.1901306240928882,
"grad_norm": 0.47632466917137917,
"learning_rate": 6.860740057497823e-05,
"loss": 1.5573,
"step": 410
},
{
"epoch": 1.193033381712627,
"grad_norm": 0.47486945237681016,
"learning_rate": 6.846165768869134e-05,
"loss": 1.4325,
"step": 411
},
{
"epoch": 1.1959361393323658,
"grad_norm": 0.4538463184737199,
"learning_rate": 6.831573295716837e-05,
"loss": 1.5314,
"step": 412
},
{
"epoch": 1.1988388969521044,
"grad_norm": 0.45892687942681115,
"learning_rate": 6.816962781775138e-05,
"loss": 1.5436,
"step": 413
},
{
"epoch": 1.2017416545718433,
"grad_norm": 0.46450537410648635,
"learning_rate": 6.802334370955941e-05,
"loss": 1.539,
"step": 414
},
{
"epoch": 1.204644412191582,
"grad_norm": 0.46592650439273525,
"learning_rate": 6.787688207347437e-05,
"loss": 1.5519,
"step": 415
},
{
"epoch": 1.2075471698113207,
"grad_norm": 0.468171344542888,
"learning_rate": 6.773024435212678e-05,
"loss": 1.5329,
"step": 416
},
{
"epoch": 1.2104499274310596,
"grad_norm": 0.4723173164935261,
"learning_rate": 6.758343198988159e-05,
"loss": 1.596,
"step": 417
},
{
"epoch": 1.2133526850507983,
"grad_norm": 0.45489546020663313,
"learning_rate": 6.743644643282388e-05,
"loss": 1.5041,
"step": 418
},
{
"epoch": 1.216255442670537,
"grad_norm": 0.4363555310712483,
"learning_rate": 6.728928912874479e-05,
"loss": 1.5472,
"step": 419
},
{
"epoch": 1.2191582002902757,
"grad_norm": 0.4468235429367978,
"learning_rate": 6.714196152712704e-05,
"loss": 1.4307,
"step": 420
},
{
"epoch": 1.2220609579100146,
"grad_norm": 0.4437220099157776,
"learning_rate": 6.699446507913084e-05,
"loss": 1.4494,
"step": 421
},
{
"epoch": 1.2249637155297532,
"grad_norm": 0.4210560575223417,
"learning_rate": 6.684680123757949e-05,
"loss": 1.5243,
"step": 422
},
{
"epoch": 1.227866473149492,
"grad_norm": 0.4668204293682215,
"learning_rate": 6.669897145694507e-05,
"loss": 1.6056,
"step": 423
},
{
"epoch": 1.2307692307692308,
"grad_norm": 0.4677982698995379,
"learning_rate": 6.65509771933342e-05,
"loss": 1.5191,
"step": 424
},
{
"epoch": 1.2336719883889695,
"grad_norm": 0.47861565507237985,
"learning_rate": 6.640281990447358e-05,
"loss": 1.5382,
"step": 425
},
{
"epoch": 1.2365747460087082,
"grad_norm": 0.4644120584159886,
"learning_rate": 6.625450104969572e-05,
"loss": 1.5999,
"step": 426
},
{
"epoch": 1.239477503628447,
"grad_norm": 0.45716291968073774,
"learning_rate": 6.610602208992454e-05,
"loss": 1.5676,
"step": 427
},
{
"epoch": 1.2423802612481858,
"grad_norm": 0.44841244015397297,
"learning_rate": 6.595738448766095e-05,
"loss": 1.5214,
"step": 428
},
{
"epoch": 1.2452830188679245,
"grad_norm": 0.4415623475511779,
"learning_rate": 6.580858970696851e-05,
"loss": 1.517,
"step": 429
},
{
"epoch": 1.2481857764876634,
"grad_norm": 0.4549608212636235,
"learning_rate": 6.565963921345895e-05,
"loss": 1.4918,
"step": 430
},
{
"epoch": 1.251088534107402,
"grad_norm": 0.44907033487555814,
"learning_rate": 6.551053447427773e-05,
"loss": 1.5803,
"step": 431
},
{
"epoch": 1.2539912917271407,
"grad_norm": 0.4468164076700206,
"learning_rate": 6.536127695808964e-05,
"loss": 1.5672,
"step": 432
},
{
"epoch": 1.2568940493468794,
"grad_norm": 0.4538931342945944,
"learning_rate": 6.521186813506432e-05,
"loss": 1.5487,
"step": 433
},
{
"epoch": 1.2597968069666183,
"grad_norm": 0.47264082419705356,
"learning_rate": 6.506230947686172e-05,
"loss": 1.5416,
"step": 434
},
{
"epoch": 1.262699564586357,
"grad_norm": 0.46196059052013255,
"learning_rate": 6.491260245661769e-05,
"loss": 1.469,
"step": 435
},
{
"epoch": 1.265602322206096,
"grad_norm": 0.44666383483972233,
"learning_rate": 6.47627485489294e-05,
"loss": 1.5755,
"step": 436
},
{
"epoch": 1.2685050798258346,
"grad_norm": 0.439937833142886,
"learning_rate": 6.461274922984086e-05,
"loss": 1.4735,
"step": 437
},
{
"epoch": 1.2714078374455733,
"grad_norm": 0.497583740863018,
"learning_rate": 6.446260597682839e-05,
"loss": 1.5019,
"step": 438
},
{
"epoch": 1.274310595065312,
"grad_norm": 0.4524167492136727,
"learning_rate": 6.431232026878597e-05,
"loss": 1.4957,
"step": 439
},
{
"epoch": 1.2772133526850509,
"grad_norm": 0.45156681267287047,
"learning_rate": 6.416189358601088e-05,
"loss": 1.5287,
"step": 440
},
{
"epoch": 1.2801161103047896,
"grad_norm": 0.4654951901556938,
"learning_rate": 6.40113274101888e-05,
"loss": 1.5336,
"step": 441
},
{
"epoch": 1.2830188679245282,
"grad_norm": 0.4454274095944744,
"learning_rate": 6.386062322437954e-05,
"loss": 1.4789,
"step": 442
},
{
"epoch": 1.2859216255442671,
"grad_norm": 0.44408122633885455,
"learning_rate": 6.370978251300225e-05,
"loss": 1.5258,
"step": 443
},
{
"epoch": 1.2888243831640058,
"grad_norm": 0.4549446161639688,
"learning_rate": 6.355880676182086e-05,
"loss": 1.4822,
"step": 444
},
{
"epoch": 1.2917271407837445,
"grad_norm": 0.44254753809735725,
"learning_rate": 6.340769745792938e-05,
"loss": 1.4909,
"step": 445
},
{
"epoch": 1.2946298984034832,
"grad_norm": 0.42790914763712823,
"learning_rate": 6.325645608973735e-05,
"loss": 1.5442,
"step": 446
},
{
"epoch": 1.297532656023222,
"grad_norm": 0.4411884226125474,
"learning_rate": 6.310508414695511e-05,
"loss": 1.5516,
"step": 447
},
{
"epoch": 1.3004354136429608,
"grad_norm": 0.43873177967484495,
"learning_rate": 6.295358312057914e-05,
"loss": 1.4638,
"step": 448
},
{
"epoch": 1.3033381712626997,
"grad_norm": 0.48026195443258446,
"learning_rate": 6.280195450287736e-05,
"loss": 1.6206,
"step": 449
},
{
"epoch": 1.3062409288824384,
"grad_norm": 0.43505896706866654,
"learning_rate": 6.26501997873745e-05,
"loss": 1.5997,
"step": 450
},
{
"epoch": 1.309143686502177,
"grad_norm": 0.4566060715651678,
"learning_rate": 6.24983204688373e-05,
"loss": 1.522,
"step": 451
},
{
"epoch": 1.3120464441219157,
"grad_norm": 0.45346291393985555,
"learning_rate": 6.234631804325981e-05,
"loss": 1.5017,
"step": 452
},
{
"epoch": 1.3149492017416546,
"grad_norm": 0.4632175640845113,
"learning_rate": 6.219419400784873e-05,
"loss": 1.581,
"step": 453
},
{
"epoch": 1.3178519593613933,
"grad_norm": 0.4794618393474947,
"learning_rate": 6.204194986100857e-05,
"loss": 1.528,
"step": 454
},
{
"epoch": 1.320754716981132,
"grad_norm": 0.43913428254888687,
"learning_rate": 6.188958710232691e-05,
"loss": 1.4955,
"step": 455
},
{
"epoch": 1.323657474600871,
"grad_norm": 0.4540278180950614,
"learning_rate": 6.173710723255966e-05,
"loss": 1.5065,
"step": 456
},
{
"epoch": 1.3265602322206096,
"grad_norm": 0.4533940019777895,
"learning_rate": 6.158451175361626e-05,
"loss": 1.5243,
"step": 457
},
{
"epoch": 1.3294629898403483,
"grad_norm": 0.44865367392985056,
"learning_rate": 6.143180216854487e-05,
"loss": 1.4677,
"step": 458
},
{
"epoch": 1.332365747460087,
"grad_norm": 0.46053367908901566,
"learning_rate": 6.127897998151763e-05,
"loss": 1.5493,
"step": 459
},
{
"epoch": 1.3352685050798259,
"grad_norm": 0.4398439392621813,
"learning_rate": 6.112604669781572e-05,
"loss": 1.6041,
"step": 460
},
{
"epoch": 1.3381712626995645,
"grad_norm": 0.4609129752157806,
"learning_rate": 6.09730038238147e-05,
"loss": 1.5159,
"step": 461
},
{
"epoch": 1.3410740203193035,
"grad_norm": 0.4597616497248396,
"learning_rate": 6.081985286696949e-05,
"loss": 1.5278,
"step": 462
},
{
"epoch": 1.3439767779390421,
"grad_norm": 0.4558124111887603,
"learning_rate": 6.0666595335799706e-05,
"loss": 1.4877,
"step": 463
},
{
"epoch": 1.3468795355587808,
"grad_norm": 0.46576544508432477,
"learning_rate": 6.051323273987463e-05,
"loss": 1.5074,
"step": 464
},
{
"epoch": 1.3497822931785195,
"grad_norm": 0.4812132538727371,
"learning_rate": 6.035976658979846e-05,
"loss": 1.5554,
"step": 465
},
{
"epoch": 1.3526850507982584,
"grad_norm": 0.46259607268668834,
"learning_rate": 6.020619839719538e-05,
"loss": 1.5327,
"step": 466
},
{
"epoch": 1.355587808417997,
"grad_norm": 0.4612250725580752,
"learning_rate": 6.005252967469469e-05,
"loss": 1.5261,
"step": 467
},
{
"epoch": 1.3584905660377358,
"grad_norm": 0.45060286698409624,
"learning_rate": 5.989876193591589e-05,
"loss": 1.5678,
"step": 468
},
{
"epoch": 1.3613933236574747,
"grad_norm": 0.4593887272117822,
"learning_rate": 5.9744896695453786e-05,
"loss": 1.4541,
"step": 469
},
{
"epoch": 1.3642960812772134,
"grad_norm": 0.463838086898208,
"learning_rate": 5.959093546886356e-05,
"loss": 1.5572,
"step": 470
},
{
"epoch": 1.367198838896952,
"grad_norm": 0.4790836819796382,
"learning_rate": 5.943687977264584e-05,
"loss": 1.5246,
"step": 471
},
{
"epoch": 1.3701015965166907,
"grad_norm": 0.44973349092931064,
"learning_rate": 5.928273112423177e-05,
"loss": 1.5642,
"step": 472
},
{
"epoch": 1.3730043541364296,
"grad_norm": 0.4654567214066549,
"learning_rate": 5.9128491041968094e-05,
"loss": 1.5488,
"step": 473
},
{
"epoch": 1.3759071117561683,
"grad_norm": 0.47770703506396384,
"learning_rate": 5.897416104510211e-05,
"loss": 1.5067,
"step": 474
},
{
"epoch": 1.3788098693759072,
"grad_norm": 0.4666108705962151,
"learning_rate": 5.881974265376682e-05,
"loss": 1.5864,
"step": 475
},
{
"epoch": 1.381712626995646,
"grad_norm": 0.45905196763434913,
"learning_rate": 5.866523738896587e-05,
"loss": 1.5306,
"step": 476
},
{
"epoch": 1.3846153846153846,
"grad_norm": 0.47728417457487704,
"learning_rate": 5.851064677255862e-05,
"loss": 1.6074,
"step": 477
},
{
"epoch": 1.3875181422351233,
"grad_norm": 0.4816526495790506,
"learning_rate": 5.835597232724511e-05,
"loss": 1.4794,
"step": 478
},
{
"epoch": 1.3904208998548622,
"grad_norm": 0.4432834674880185,
"learning_rate": 5.820121557655109e-05,
"loss": 1.5367,
"step": 479
},
{
"epoch": 1.3933236574746009,
"grad_norm": 0.4577064957189784,
"learning_rate": 5.804637804481306e-05,
"loss": 1.441,
"step": 480
},
{
"epoch": 1.3962264150943398,
"grad_norm": 0.44814321977117283,
"learning_rate": 5.7891461257163105e-05,
"loss": 1.5312,
"step": 481
},
{
"epoch": 1.3991291727140784,
"grad_norm": 0.4460529710794631,
"learning_rate": 5.773646673951406e-05,
"loss": 1.4984,
"step": 482
},
{
"epoch": 1.4020319303338171,
"grad_norm": 0.46635603854444996,
"learning_rate": 5.758139601854438e-05,
"loss": 1.4672,
"step": 483
},
{
"epoch": 1.4049346879535558,
"grad_norm": 0.4612099111362715,
"learning_rate": 5.742625062168303e-05,
"loss": 1.536,
"step": 484
},
{
"epoch": 1.4078374455732947,
"grad_norm": 0.4650033507770725,
"learning_rate": 5.727103207709463e-05,
"loss": 1.5304,
"step": 485
},
{
"epoch": 1.4107402031930334,
"grad_norm": 0.47554186899763407,
"learning_rate": 5.7115741913664264e-05,
"loss": 1.5835,
"step": 486
},
{
"epoch": 1.413642960812772,
"grad_norm": 0.4555029369185871,
"learning_rate": 5.696038166098241e-05,
"loss": 1.4847,
"step": 487
},
{
"epoch": 1.416545718432511,
"grad_norm": 0.47572763728954914,
"learning_rate": 5.680495284933e-05,
"loss": 1.5905,
"step": 488
},
{
"epoch": 1.4194484760522497,
"grad_norm": 0.435628238472591,
"learning_rate": 5.664945700966315e-05,
"loss": 1.4918,
"step": 489
},
{
"epoch": 1.4223512336719883,
"grad_norm": 0.4561494986233994,
"learning_rate": 5.649389567359831e-05,
"loss": 1.5253,
"step": 490
},
{
"epoch": 1.425253991291727,
"grad_norm": 0.44445343149653593,
"learning_rate": 5.633827037339699e-05,
"loss": 1.4427,
"step": 491
},
{
"epoch": 1.428156748911466,
"grad_norm": 0.4522116164135251,
"learning_rate": 5.6182582641950764e-05,
"loss": 1.5272,
"step": 492
},
{
"epoch": 1.4310595065312046,
"grad_norm": 0.4674310636745245,
"learning_rate": 5.602683401276615e-05,
"loss": 1.5245,
"step": 493
},
{
"epoch": 1.4339622641509435,
"grad_norm": 0.4399025243392155,
"learning_rate": 5.58710260199495e-05,
"loss": 1.4641,
"step": 494
},
{
"epoch": 1.4368650217706822,
"grad_norm": 0.4565675743813997,
"learning_rate": 5.571516019819186e-05,
"loss": 1.5198,
"step": 495
},
{
"epoch": 1.4397677793904209,
"grad_norm": 0.4775767456450886,
"learning_rate": 5.555923808275395e-05,
"loss": 1.5719,
"step": 496
},
{
"epoch": 1.4426705370101596,
"grad_norm": 0.4411506410615202,
"learning_rate": 5.54032612094509e-05,
"loss": 1.5642,
"step": 497
},
{
"epoch": 1.4455732946298985,
"grad_norm": 0.4596333065267962,
"learning_rate": 5.5247231114637256e-05,
"loss": 1.5033,
"step": 498
},
{
"epoch": 1.4484760522496372,
"grad_norm": 0.4809992156127711,
"learning_rate": 5.509114933519178e-05,
"loss": 1.5314,
"step": 499
},
{
"epoch": 1.4513788098693758,
"grad_norm": 0.46900967839257307,
"learning_rate": 5.4935017408502274e-05,
"loss": 1.4789,
"step": 500
},
{
"epoch": 1.4542815674891147,
"grad_norm": 0.4701040167078409,
"learning_rate": 5.477883687245058e-05,
"loss": 1.6075,
"step": 501
},
{
"epoch": 1.4571843251088534,
"grad_norm": 0.4579636786561259,
"learning_rate": 5.462260926539722e-05,
"loss": 1.5111,
"step": 502
},
{
"epoch": 1.4600870827285921,
"grad_norm": 0.4618378488727018,
"learning_rate": 5.446633612616644e-05,
"loss": 1.5685,
"step": 503
},
{
"epoch": 1.4629898403483308,
"grad_norm": 0.45744977910891693,
"learning_rate": 5.431001899403098e-05,
"loss": 1.5445,
"step": 504
},
{
"epoch": 1.4658925979680697,
"grad_norm": 0.4692824578251886,
"learning_rate": 5.4153659408696844e-05,
"loss": 1.574,
"step": 505
},
{
"epoch": 1.4687953555878084,
"grad_norm": 0.45392164865855067,
"learning_rate": 5.399725891028824e-05,
"loss": 1.5263,
"step": 506
},
{
"epoch": 1.4716981132075473,
"grad_norm": 0.4538892582351531,
"learning_rate": 5.384081903933235e-05,
"loss": 1.514,
"step": 507
},
{
"epoch": 1.474600870827286,
"grad_norm": 0.44339578135358343,
"learning_rate": 5.368434133674417e-05,
"loss": 1.4962,
"step": 508
},
{
"epoch": 1.4775036284470247,
"grad_norm": 0.4571620479951012,
"learning_rate": 5.3527827343811364e-05,
"loss": 1.486,
"step": 509
},
{
"epoch": 1.4804063860667633,
"grad_norm": 0.4527386902359813,
"learning_rate": 5.3371278602179e-05,
"loss": 1.5373,
"step": 510
},
{
"epoch": 1.4833091436865022,
"grad_norm": 0.4663342450854473,
"learning_rate": 5.321469665383443e-05,
"loss": 1.4923,
"step": 511
},
{
"epoch": 1.486211901306241,
"grad_norm": 0.45921074887586844,
"learning_rate": 5.305808304109214e-05,
"loss": 1.5258,
"step": 512
},
{
"epoch": 1.4891146589259796,
"grad_norm": 0.46855020704878303,
"learning_rate": 5.290143930657845e-05,
"loss": 1.5062,
"step": 513
},
{
"epoch": 1.4920174165457185,
"grad_norm": 0.47809752409131606,
"learning_rate": 5.274476699321638e-05,
"loss": 1.4989,
"step": 514
},
{
"epoch": 1.4949201741654572,
"grad_norm": 0.46699999992446817,
"learning_rate": 5.258806764421048e-05,
"loss": 1.4744,
"step": 515
},
{
"epoch": 1.4978229317851959,
"grad_norm": 0.4757548017153558,
"learning_rate": 5.243134280303156e-05,
"loss": 1.5495,
"step": 516
},
{
"epoch": 1.5007256894049346,
"grad_norm": 0.46528643942728176,
"learning_rate": 5.227459401340158e-05,
"loss": 1.5676,
"step": 517
},
{
"epoch": 1.5036284470246735,
"grad_norm": 0.4525780872974534,
"learning_rate": 5.211782281927832e-05,
"loss": 1.4699,
"step": 518
},
{
"epoch": 1.5065312046444121,
"grad_norm": 0.4836892335587226,
"learning_rate": 5.1961030764840294e-05,
"loss": 1.4013,
"step": 519
},
{
"epoch": 1.509433962264151,
"grad_norm": 0.48303473919463413,
"learning_rate": 5.1804219394471456e-05,
"loss": 1.5111,
"step": 520
},
{
"epoch": 1.5123367198838897,
"grad_norm": 0.45795346569968054,
"learning_rate": 5.164739025274604e-05,
"loss": 1.5913,
"step": 521
},
{
"epoch": 1.5152394775036284,
"grad_norm": 0.4689215853083974,
"learning_rate": 5.149054488441333e-05,
"loss": 1.5235,
"step": 522
},
{
"epoch": 1.518142235123367,
"grad_norm": 0.4797757576476214,
"learning_rate": 5.1333684834382425e-05,
"loss": 1.5735,
"step": 523
},
{
"epoch": 1.521044992743106,
"grad_norm": 0.4625094740074534,
"learning_rate": 5.117681164770704e-05,
"loss": 1.4845,
"step": 524
},
{
"epoch": 1.5239477503628447,
"grad_norm": 0.47707407588859363,
"learning_rate": 5.101992686957028e-05,
"loss": 1.4987,
"step": 525
},
{
"epoch": 1.5268505079825836,
"grad_norm": 0.47714957037227934,
"learning_rate": 5.086303204526943e-05,
"loss": 1.5133,
"step": 526
},
{
"epoch": 1.5297532656023223,
"grad_norm": 0.4695224397057286,
"learning_rate": 5.070612872020074e-05,
"loss": 1.4813,
"step": 527
},
{
"epoch": 1.532656023222061,
"grad_norm": 0.44929891130564936,
"learning_rate": 5.054921843984418e-05,
"loss": 1.5538,
"step": 528
},
{
"epoch": 1.5355587808417996,
"grad_norm": 0.44398059882392127,
"learning_rate": 5.039230274974823e-05,
"loss": 1.5292,
"step": 529
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.4693247757532975,
"learning_rate": 5.023538319551465e-05,
"loss": 1.5315,
"step": 530
},
{
"epoch": 1.5413642960812772,
"grad_norm": 0.4733961264638568,
"learning_rate": 5.007846132278327e-05,
"loss": 1.4856,
"step": 531
},
{
"epoch": 1.544267053701016,
"grad_norm": 0.47385759271452443,
"learning_rate": 4.9921538677216736e-05,
"loss": 1.5409,
"step": 532
},
{
"epoch": 1.5471698113207548,
"grad_norm": 0.4629528439677091,
"learning_rate": 4.976461680448536e-05,
"loss": 1.4852,
"step": 533
},
{
"epoch": 1.5500725689404935,
"grad_norm": 0.4578952706715695,
"learning_rate": 4.9607697250251786e-05,
"loss": 1.4684,
"step": 534
},
{
"epoch": 1.5529753265602322,
"grad_norm": 0.48246719282994843,
"learning_rate": 4.9450781560155816e-05,
"loss": 1.495,
"step": 535
},
{
"epoch": 1.5558780841799709,
"grad_norm": 0.4752127672900365,
"learning_rate": 4.929387127979927e-05,
"loss": 1.4497,
"step": 536
},
{
"epoch": 1.5587808417997098,
"grad_norm": 0.47309486603928297,
"learning_rate": 4.913696795473058e-05,
"loss": 1.6224,
"step": 537
},
{
"epoch": 1.5616835994194485,
"grad_norm": 0.4512600152955307,
"learning_rate": 4.898007313042975e-05,
"loss": 1.4754,
"step": 538
},
{
"epoch": 1.5645863570391874,
"grad_norm": 0.4501726532789813,
"learning_rate": 4.8823188352292974e-05,
"loss": 1.4962,
"step": 539
},
{
"epoch": 1.567489114658926,
"grad_norm": 0.4694506691591076,
"learning_rate": 4.866631516561759e-05,
"loss": 1.5947,
"step": 540
},
{
"epoch": 1.5703918722786647,
"grad_norm": 0.46515986398793024,
"learning_rate": 4.850945511558669e-05,
"loss": 1.5429,
"step": 541
},
{
"epoch": 1.5732946298984034,
"grad_norm": 0.46261684396896097,
"learning_rate": 4.835260974725397e-05,
"loss": 1.469,
"step": 542
},
{
"epoch": 1.576197387518142,
"grad_norm": 0.4536219827489985,
"learning_rate": 4.819578060552856e-05,
"loss": 1.4923,
"step": 543
},
{
"epoch": 1.579100145137881,
"grad_norm": 0.46759050071674485,
"learning_rate": 4.803896923515974e-05,
"loss": 1.5678,
"step": 544
},
{
"epoch": 1.58200290275762,
"grad_norm": 0.4516525480389553,
"learning_rate": 4.7882177180721693e-05,
"loss": 1.5492,
"step": 545
},
{
"epoch": 1.5849056603773586,
"grad_norm": 0.4675999192776536,
"learning_rate": 4.772540598659844e-05,
"loss": 1.5197,
"step": 546
},
{
"epoch": 1.5878084179970973,
"grad_norm": 0.47622503314430864,
"learning_rate": 4.756865719696845e-05,
"loss": 1.5602,
"step": 547
},
{
"epoch": 1.590711175616836,
"grad_norm": 0.45609339440113955,
"learning_rate": 4.741193235578952e-05,
"loss": 1.4665,
"step": 548
},
{
"epoch": 1.5936139332365746,
"grad_norm": 0.46634681130887157,
"learning_rate": 4.725523300678363e-05,
"loss": 1.5943,
"step": 549
},
{
"epoch": 1.5965166908563135,
"grad_norm": 0.49133146170858844,
"learning_rate": 4.7098560693421565e-05,
"loss": 1.4487,
"step": 550
},
{
"epoch": 1.5994194484760522,
"grad_norm": 0.473172790439045,
"learning_rate": 4.6941916958907876e-05,
"loss": 1.5324,
"step": 551
},
{
"epoch": 1.6023222060957911,
"grad_norm": 0.49491698691619707,
"learning_rate": 4.678530334616557e-05,
"loss": 1.5399,
"step": 552
},
{
"epoch": 1.6052249637155298,
"grad_norm": 0.4650691616495402,
"learning_rate": 4.662872139782102e-05,
"loss": 1.495,
"step": 553
},
{
"epoch": 1.6081277213352685,
"grad_norm": 0.48139950464873565,
"learning_rate": 4.647217265618866e-05,
"loss": 1.4942,
"step": 554
},
{
"epoch": 1.6110304789550072,
"grad_norm": 0.45446734863336363,
"learning_rate": 4.6315658663255834e-05,
"loss": 1.5517,
"step": 555
},
{
"epoch": 1.6139332365747459,
"grad_norm": 0.4669690386799071,
"learning_rate": 4.615918096066766e-05,
"loss": 1.457,
"step": 556
},
{
"epoch": 1.6168359941944848,
"grad_norm": 0.45350351214526136,
"learning_rate": 4.6002741089711785e-05,
"loss": 1.4692,
"step": 557
},
{
"epoch": 1.6197387518142237,
"grad_norm": 0.4727267083209049,
"learning_rate": 4.584634059130317e-05,
"loss": 1.579,
"step": 558
},
{
"epoch": 1.6226415094339623,
"grad_norm": 0.4494821108265869,
"learning_rate": 4.568998100596903e-05,
"loss": 1.4963,
"step": 559
},
{
"epoch": 1.625544267053701,
"grad_norm": 0.47991852803927754,
"learning_rate": 4.553366387383357e-05,
"loss": 1.447,
"step": 560
},
{
"epoch": 1.6284470246734397,
"grad_norm": 0.4739639493364909,
"learning_rate": 4.5377390734602804e-05,
"loss": 1.5047,
"step": 561
},
{
"epoch": 1.6313497822931784,
"grad_norm": 0.46129696380148877,
"learning_rate": 4.522116312754944e-05,
"loss": 1.4853,
"step": 562
},
{
"epoch": 1.6342525399129173,
"grad_norm": 0.4595556059685239,
"learning_rate": 4.506498259149774e-05,
"loss": 1.4723,
"step": 563
},
{
"epoch": 1.637155297532656,
"grad_norm": 0.46376746855844186,
"learning_rate": 4.4908850664808245e-05,
"loss": 1.514,
"step": 564
},
{
"epoch": 1.640058055152395,
"grad_norm": 0.449659708306414,
"learning_rate": 4.475276888536274e-05,
"loss": 1.4796,
"step": 565
},
{
"epoch": 1.6429608127721336,
"grad_norm": 0.4722793153883725,
"learning_rate": 4.4596738790549114e-05,
"loss": 1.4345,
"step": 566
},
{
"epoch": 1.6458635703918723,
"grad_norm": 0.44189671359270183,
"learning_rate": 4.4440761917246066e-05,
"loss": 1.4216,
"step": 567
},
{
"epoch": 1.648766328011611,
"grad_norm": 0.45205616427961226,
"learning_rate": 4.428483980180814e-05,
"loss": 1.58,
"step": 568
},
{
"epoch": 1.6516690856313496,
"grad_norm": 0.4623866078728102,
"learning_rate": 4.41289739800505e-05,
"loss": 1.5132,
"step": 569
},
{
"epoch": 1.6545718432510885,
"grad_norm": 0.46977859749326323,
"learning_rate": 4.397316598723385e-05,
"loss": 1.5424,
"step": 570
},
{
"epoch": 1.6574746008708274,
"grad_norm": 0.4673299997752393,
"learning_rate": 4.3817417358049234e-05,
"loss": 1.4785,
"step": 571
},
{
"epoch": 1.6603773584905661,
"grad_norm": 0.4603587387686837,
"learning_rate": 4.366172962660301e-05,
"loss": 1.5337,
"step": 572
},
{
"epoch": 1.6632801161103048,
"grad_norm": 0.5008208967633541,
"learning_rate": 4.350610432640171e-05,
"loss": 1.5028,
"step": 573
},
{
"epoch": 1.6661828737300435,
"grad_norm": 0.4666657367556008,
"learning_rate": 4.335054299033686e-05,
"loss": 1.48,
"step": 574
},
{
"epoch": 1.6690856313497822,
"grad_norm": 0.4629959785250477,
"learning_rate": 4.3195047150670015e-05,
"loss": 1.535,
"step": 575
},
{
"epoch": 1.671988388969521,
"grad_norm": 0.46819279339683645,
"learning_rate": 4.3039618339017595e-05,
"loss": 1.4148,
"step": 576
},
{
"epoch": 1.6748911465892597,
"grad_norm": 0.4708523269737979,
"learning_rate": 4.288425808633575e-05,
"loss": 1.4727,
"step": 577
},
{
"epoch": 1.6777939042089987,
"grad_norm": 0.46671566022092964,
"learning_rate": 4.272896792290537e-05,
"loss": 1.4642,
"step": 578
},
{
"epoch": 1.6806966618287373,
"grad_norm": 0.457759204954378,
"learning_rate": 4.257374937831698e-05,
"loss": 1.4828,
"step": 579
},
{
"epoch": 1.683599419448476,
"grad_norm": 0.4548047019040293,
"learning_rate": 4.241860398145565e-05,
"loss": 1.4916,
"step": 580
},
{
"epoch": 1.6865021770682147,
"grad_norm": 0.4756265738576344,
"learning_rate": 4.226353326048593e-05,
"loss": 1.5637,
"step": 581
},
{
"epoch": 1.6894049346879536,
"grad_norm": 0.4348629634323058,
"learning_rate": 4.2108538742836906e-05,
"loss": 1.547,
"step": 582
},
{
"epoch": 1.6923076923076923,
"grad_norm": 0.46688909682698415,
"learning_rate": 4.195362195518696e-05,
"loss": 1.5251,
"step": 583
},
{
"epoch": 1.6952104499274312,
"grad_norm": 0.46164941517808056,
"learning_rate": 4.179878442344892e-05,
"loss": 1.5753,
"step": 584
},
{
"epoch": 1.6981132075471699,
"grad_norm": 0.46203904974084825,
"learning_rate": 4.16440276727549e-05,
"loss": 1.5028,
"step": 585
},
{
"epoch": 1.7010159651669086,
"grad_norm": 0.45656971172049204,
"learning_rate": 4.14893532274414e-05,
"loss": 1.4794,
"step": 586
},
{
"epoch": 1.7039187227866472,
"grad_norm": 0.4486468546011067,
"learning_rate": 4.133476261103414e-05,
"loss": 1.4713,
"step": 587
},
{
"epoch": 1.706821480406386,
"grad_norm": 0.4830503358807284,
"learning_rate": 4.1180257346233186e-05,
"loss": 1.5095,
"step": 588
},
{
"epoch": 1.7097242380261248,
"grad_norm": 0.44950507829928493,
"learning_rate": 4.10258389548979e-05,
"loss": 1.4932,
"step": 589
},
{
"epoch": 1.7126269956458637,
"grad_norm": 0.4510556255280028,
"learning_rate": 4.0871508958031924e-05,
"loss": 1.4787,
"step": 590
},
{
"epoch": 1.7155297532656024,
"grad_norm": 0.4794827349455255,
"learning_rate": 4.0717268875768225e-05,
"loss": 1.5391,
"step": 591
},
{
"epoch": 1.718432510885341,
"grad_norm": 0.4498680951961932,
"learning_rate": 4.056312022735417e-05,
"loss": 1.5649,
"step": 592
},
{
"epoch": 1.7213352685050798,
"grad_norm": 0.4857270092777572,
"learning_rate": 4.0409064531136455e-05,
"loss": 1.5783,
"step": 593
},
{
"epoch": 1.7242380261248185,
"grad_norm": 0.47038702537497445,
"learning_rate": 4.025510330454621e-05,
"loss": 1.5527,
"step": 594
},
{
"epoch": 1.7271407837445574,
"grad_norm": 0.46431116226283564,
"learning_rate": 4.010123806408411e-05,
"loss": 1.5117,
"step": 595
},
{
"epoch": 1.730043541364296,
"grad_norm": 0.45761514606277265,
"learning_rate": 3.994747032530532e-05,
"loss": 1.574,
"step": 596
},
{
"epoch": 1.732946298984035,
"grad_norm": 0.4600487565052973,
"learning_rate": 3.9793801602804645e-05,
"loss": 1.4708,
"step": 597
},
{
"epoch": 1.7358490566037736,
"grad_norm": 0.46019779933798344,
"learning_rate": 3.9640233410201553e-05,
"loss": 1.4659,
"step": 598
},
{
"epoch": 1.7387518142235123,
"grad_norm": 0.4851334244646456,
"learning_rate": 3.948676726012538e-05,
"loss": 1.4958,
"step": 599
},
{
"epoch": 1.741654571843251,
"grad_norm": 0.4595344373834157,
"learning_rate": 3.933340466420032e-05,
"loss": 1.5883,
"step": 600
},
{
"epoch": 1.7445573294629897,
"grad_norm": 0.45325046808644665,
"learning_rate": 3.91801471330305e-05,
"loss": 1.4735,
"step": 601
},
{
"epoch": 1.7474600870827286,
"grad_norm": 0.47169776350023407,
"learning_rate": 3.902699617618531e-05,
"loss": 1.4546,
"step": 602
},
{
"epoch": 1.7503628447024675,
"grad_norm": 0.46959584781408814,
"learning_rate": 3.887395330218429e-05,
"loss": 1.4884,
"step": 603
},
{
"epoch": 1.7532656023222062,
"grad_norm": 0.4690319139879915,
"learning_rate": 3.872102001848238e-05,
"loss": 1.5223,
"step": 604
},
{
"epoch": 1.7561683599419449,
"grad_norm": 0.4547518691339486,
"learning_rate": 3.856819783145514e-05,
"loss": 1.5698,
"step": 605
},
{
"epoch": 1.7590711175616836,
"grad_norm": 0.48070035770466346,
"learning_rate": 3.841548824638376e-05,
"loss": 1.5422,
"step": 606
},
{
"epoch": 1.7619738751814222,
"grad_norm": 0.4758848424255886,
"learning_rate": 3.826289276744034e-05,
"loss": 1.5465,
"step": 607
},
{
"epoch": 1.7648766328011611,
"grad_norm": 0.45273294813525283,
"learning_rate": 3.8110412897673096e-05,
"loss": 1.5506,
"step": 608
},
{
"epoch": 1.7677793904208998,
"grad_norm": 0.46160319878871425,
"learning_rate": 3.7958050138991434e-05,
"loss": 1.5369,
"step": 609
},
{
"epoch": 1.7706821480406387,
"grad_norm": 0.46023428134326466,
"learning_rate": 3.7805805992151284e-05,
"loss": 1.4829,
"step": 610
},
{
"epoch": 1.7735849056603774,
"grad_norm": 0.4688321625003543,
"learning_rate": 3.76536819567402e-05,
"loss": 1.467,
"step": 611
},
{
"epoch": 1.776487663280116,
"grad_norm": 0.4863577906566358,
"learning_rate": 3.750167953116272e-05,
"loss": 1.4952,
"step": 612
},
{
"epoch": 1.7793904208998548,
"grad_norm": 0.47524313495087417,
"learning_rate": 3.7349800212625523e-05,
"loss": 1.4936,
"step": 613
},
{
"epoch": 1.7822931785195935,
"grad_norm": 0.4669525842981905,
"learning_rate": 3.719804549712265e-05,
"loss": 1.5434,
"step": 614
},
{
"epoch": 1.7851959361393324,
"grad_norm": 0.44553444353386806,
"learning_rate": 3.7046416879420874e-05,
"loss": 1.4759,
"step": 615
},
{
"epoch": 1.7880986937590713,
"grad_norm": 0.48197227984113444,
"learning_rate": 3.689491585304491e-05,
"loss": 1.4859,
"step": 616
},
{
"epoch": 1.79100145137881,
"grad_norm": 0.4533601208560744,
"learning_rate": 3.674354391026264e-05,
"loss": 1.4327,
"step": 617
},
{
"epoch": 1.7939042089985486,
"grad_norm": 0.466341690940055,
"learning_rate": 3.6592302542070624e-05,
"loss": 1.5411,
"step": 618
},
{
"epoch": 1.7968069666182873,
"grad_norm": 0.47239446299615545,
"learning_rate": 3.644119323817915e-05,
"loss": 1.5852,
"step": 619
},
{
"epoch": 1.799709724238026,
"grad_norm": 0.47716767196027376,
"learning_rate": 3.629021748699777e-05,
"loss": 1.4941,
"step": 620
},
{
"epoch": 1.802612481857765,
"grad_norm": 0.476305984803863,
"learning_rate": 3.613937677562047e-05,
"loss": 1.5047,
"step": 621
},
{
"epoch": 1.8055152394775036,
"grad_norm": 0.47338781135924396,
"learning_rate": 3.598867258981122e-05,
"loss": 1.5064,
"step": 622
},
{
"epoch": 1.8084179970972425,
"grad_norm": 0.47656915944176187,
"learning_rate": 3.583810641398916e-05,
"loss": 1.5204,
"step": 623
},
{
"epoch": 1.8113207547169812,
"grad_norm": 0.4649020835150998,
"learning_rate": 3.5687679731214016e-05,
"loss": 1.4855,
"step": 624
},
{
"epoch": 1.8142235123367199,
"grad_norm": 0.4580446679286633,
"learning_rate": 3.553739402317162e-05,
"loss": 1.5914,
"step": 625
},
{
"epoch": 1.8171262699564585,
"grad_norm": 0.4848352552831686,
"learning_rate": 3.538725077015915e-05,
"loss": 1.4898,
"step": 626
},
{
"epoch": 1.8200290275761972,
"grad_norm": 0.4605507034189255,
"learning_rate": 3.523725145107061e-05,
"loss": 1.5372,
"step": 627
},
{
"epoch": 1.8229317851959361,
"grad_norm": 0.4586247172246883,
"learning_rate": 3.5087397543382326e-05,
"loss": 1.4213,
"step": 628
},
{
"epoch": 1.825834542815675,
"grad_norm": 0.46928853500566464,
"learning_rate": 3.49376905231383e-05,
"loss": 1.4923,
"step": 629
},
{
"epoch": 1.8287373004354137,
"grad_norm": 0.45602812595479986,
"learning_rate": 3.478813186493569e-05,
"loss": 1.4821,
"step": 630
},
{
"epoch": 1.8316400580551524,
"grad_norm": 0.48505387940358696,
"learning_rate": 3.463872304191036e-05,
"loss": 1.5636,
"step": 631
},
{
"epoch": 1.834542815674891,
"grad_norm": 0.45620690698230715,
"learning_rate": 3.448946552572229e-05,
"loss": 1.4905,
"step": 632
},
{
"epoch": 1.8374455732946298,
"grad_norm": 0.46807539528967906,
"learning_rate": 3.4340360786541064e-05,
"loss": 1.4674,
"step": 633
},
{
"epoch": 1.8403483309143687,
"grad_norm": 0.47128154504804953,
"learning_rate": 3.419141029303149e-05,
"loss": 1.5567,
"step": 634
},
{
"epoch": 1.8432510885341074,
"grad_norm": 0.4548348308857796,
"learning_rate": 3.404261551233906e-05,
"loss": 1.5434,
"step": 635
},
{
"epoch": 1.8461538461538463,
"grad_norm": 0.47271373234671293,
"learning_rate": 3.389397791007548e-05,
"loss": 1.5872,
"step": 636
},
{
"epoch": 1.849056603773585,
"grad_norm": 0.46997525194247747,
"learning_rate": 3.374549895030429e-05,
"loss": 1.4608,
"step": 637
},
{
"epoch": 1.8519593613933236,
"grad_norm": 0.450820895278838,
"learning_rate": 3.3597180095526434e-05,
"loss": 1.5338,
"step": 638
},
{
"epoch": 1.8548621190130623,
"grad_norm": 0.4619339988047396,
"learning_rate": 3.344902280666582e-05,
"loss": 1.4078,
"step": 639
},
{
"epoch": 1.8577648766328012,
"grad_norm": 0.4581809687899449,
"learning_rate": 3.3301028543054935e-05,
"loss": 1.5301,
"step": 640
},
{
"epoch": 1.86066763425254,
"grad_norm": 0.45350784991468535,
"learning_rate": 3.315319876242052e-05,
"loss": 1.5055,
"step": 641
},
{
"epoch": 1.8635703918722788,
"grad_norm": 0.4525962863712468,
"learning_rate": 3.3005534920869176e-05,
"loss": 1.5276,
"step": 642
},
{
"epoch": 1.8664731494920175,
"grad_norm": 0.4626471481490199,
"learning_rate": 3.2858038472872975e-05,
"loss": 1.5238,
"step": 643
},
{
"epoch": 1.8693759071117562,
"grad_norm": 0.430379962270658,
"learning_rate": 3.271071087125522e-05,
"loss": 1.4994,
"step": 644
},
{
"epoch": 1.8722786647314948,
"grad_norm": 0.47987070787437025,
"learning_rate": 3.2563553567176134e-05,
"loss": 1.4499,
"step": 645
},
{
"epoch": 1.8751814223512335,
"grad_norm": 0.4562910479126226,
"learning_rate": 3.2416568010118435e-05,
"loss": 1.3916,
"step": 646
},
{
"epoch": 1.8780841799709724,
"grad_norm": 0.4675915599330702,
"learning_rate": 3.226975564787322e-05,
"loss": 1.4932,
"step": 647
},
{
"epoch": 1.8809869375907113,
"grad_norm": 0.46953833110028453,
"learning_rate": 3.212311792652564e-05,
"loss": 1.5346,
"step": 648
},
{
"epoch": 1.88388969521045,
"grad_norm": 0.4674764133043709,
"learning_rate": 3.19766562904406e-05,
"loss": 1.5799,
"step": 649
},
{
"epoch": 1.8867924528301887,
"grad_norm": 0.46108605181728374,
"learning_rate": 3.183037218224862e-05,
"loss": 1.4899,
"step": 650
},
{
"epoch": 1.8896952104499274,
"grad_norm": 0.4686891208918389,
"learning_rate": 3.168426704283164e-05,
"loss": 1.5242,
"step": 651
},
{
"epoch": 1.892597968069666,
"grad_norm": 0.4729148066836575,
"learning_rate": 3.153834231130866e-05,
"loss": 1.5382,
"step": 652
},
{
"epoch": 1.895500725689405,
"grad_norm": 0.46709660064986425,
"learning_rate": 3.139259942502177e-05,
"loss": 1.4668,
"step": 653
},
{
"epoch": 1.8984034833091437,
"grad_norm": 0.458043558776178,
"learning_rate": 3.124703981952191e-05,
"loss": 1.4917,
"step": 654
},
{
"epoch": 1.9013062409288826,
"grad_norm": 0.44617431196256735,
"learning_rate": 3.110166492855468e-05,
"loss": 1.491,
"step": 655
},
{
"epoch": 1.9042089985486212,
"grad_norm": 0.46560717144952807,
"learning_rate": 3.0956476184046275e-05,
"loss": 1.4855,
"step": 656
},
{
"epoch": 1.90711175616836,
"grad_norm": 0.4581395748562857,
"learning_rate": 3.081147501608936e-05,
"loss": 1.5075,
"step": 657
},
{
"epoch": 1.9100145137880986,
"grad_norm": 0.47234234248624307,
"learning_rate": 3.066666285292906e-05,
"loss": 1.5131,
"step": 658
},
{
"epoch": 1.9129172714078373,
"grad_norm": 0.4767670715915075,
"learning_rate": 3.052204112094873e-05,
"loss": 1.5215,
"step": 659
},
{
"epoch": 1.9158200290275762,
"grad_norm": 0.48119235206828453,
"learning_rate": 3.037761124465604e-05,
"loss": 1.4413,
"step": 660
},
{
"epoch": 1.918722786647315,
"grad_norm": 0.4699275613251767,
"learning_rate": 3.0233374646668933e-05,
"loss": 1.5172,
"step": 661
},
{
"epoch": 1.9216255442670538,
"grad_norm": 0.4653500581993405,
"learning_rate": 3.0089332747701527e-05,
"loss": 1.4612,
"step": 662
},
{
"epoch": 1.9245283018867925,
"grad_norm": 0.4538906862045723,
"learning_rate": 2.9945486966550202e-05,
"loss": 1.5152,
"step": 663
},
{
"epoch": 1.9274310595065312,
"grad_norm": 0.4846747531270697,
"learning_rate": 2.9801838720079633e-05,
"loss": 1.5071,
"step": 664
},
{
"epoch": 1.9303338171262698,
"grad_norm": 0.46054848449981595,
"learning_rate": 2.9658389423208733e-05,
"loss": 1.4473,
"step": 665
},
{
"epoch": 1.9332365747460087,
"grad_norm": 0.47139713221101254,
"learning_rate": 2.9515140488896847e-05,
"loss": 1.5357,
"step": 666
},
{
"epoch": 1.9361393323657474,
"grad_norm": 0.4609597306806886,
"learning_rate": 2.9372093328129712e-05,
"loss": 1.5898,
"step": 667
},
{
"epoch": 1.9390420899854863,
"grad_norm": 0.4572818066020498,
"learning_rate": 2.9229249349905684e-05,
"loss": 1.5062,
"step": 668
},
{
"epoch": 1.941944847605225,
"grad_norm": 0.4624114643272419,
"learning_rate": 2.9086609961221755e-05,
"loss": 1.5153,
"step": 669
},
{
"epoch": 1.9448476052249637,
"grad_norm": 0.46195798267626065,
"learning_rate": 2.8944176567059706e-05,
"loss": 1.5185,
"step": 670
},
{
"epoch": 1.9477503628447024,
"grad_norm": 0.45590830334110005,
"learning_rate": 2.8801950570372372e-05,
"loss": 1.5037,
"step": 671
},
{
"epoch": 1.950653120464441,
"grad_norm": 0.469654105481744,
"learning_rate": 2.86599333720697e-05,
"loss": 1.5239,
"step": 672
},
{
"epoch": 1.95355587808418,
"grad_norm": 0.4550705392965376,
"learning_rate": 2.851812637100496e-05,
"loss": 1.4685,
"step": 673
},
{
"epoch": 1.9564586357039189,
"grad_norm": 0.47125879921091807,
"learning_rate": 2.837653096396108e-05,
"loss": 1.5329,
"step": 674
},
{
"epoch": 1.9593613933236576,
"grad_norm": 0.4648620713257166,
"learning_rate": 2.8235148545636776e-05,
"loss": 1.5887,
"step": 675
},
{
"epoch": 1.9622641509433962,
"grad_norm": 0.455012959714779,
"learning_rate": 2.809398050863279e-05,
"loss": 1.4758,
"step": 676
},
{
"epoch": 1.965166908563135,
"grad_norm": 0.46446945970185327,
"learning_rate": 2.795302824343836e-05,
"loss": 1.5433,
"step": 677
},
{
"epoch": 1.9680696661828736,
"grad_norm": 0.480678095925712,
"learning_rate": 2.7812293138417312e-05,
"loss": 1.4767,
"step": 678
},
{
"epoch": 1.9709724238026125,
"grad_norm": 0.4785528901298419,
"learning_rate": 2.7671776579794438e-05,
"loss": 1.4933,
"step": 679
},
{
"epoch": 1.9738751814223512,
"grad_norm": 0.4786708493893446,
"learning_rate": 2.7531479951641924e-05,
"loss": 1.474,
"step": 680
},
{
"epoch": 1.97677793904209,
"grad_norm": 0.4814413948137428,
"learning_rate": 2.7391404635865724e-05,
"loss": 1.4621,
"step": 681
},
{
"epoch": 1.9796806966618288,
"grad_norm": 0.46625124006991403,
"learning_rate": 2.7251552012191762e-05,
"loss": 1.4667,
"step": 682
},
{
"epoch": 1.9825834542815675,
"grad_norm": 0.47224453354359264,
"learning_rate": 2.711192345815252e-05,
"loss": 1.5663,
"step": 683
},
{
"epoch": 1.9854862119013061,
"grad_norm": 0.47307960881659233,
"learning_rate": 2.697252034907351e-05,
"loss": 1.5666,
"step": 684
},
{
"epoch": 1.988388969521045,
"grad_norm": 0.46040861733715366,
"learning_rate": 2.6833344058059483e-05,
"loss": 1.4819,
"step": 685
},
{
"epoch": 1.9912917271407837,
"grad_norm": 0.4485186239012349,
"learning_rate": 2.6694395955981143e-05,
"loss": 1.5649,
"step": 686
},
{
"epoch": 1.9941944847605226,
"grad_norm": 0.47308758850055393,
"learning_rate": 2.6555677411461593e-05,
"loss": 1.5378,
"step": 687
},
{
"epoch": 1.9970972423802613,
"grad_norm": 0.4577196746977208,
"learning_rate": 2.641718979086277e-05,
"loss": 1.5085,
"step": 688
},
{
"epoch": 2.0,
"grad_norm": 0.4526992960849296,
"learning_rate": 2.6278934458271997e-05,
"loss": 1.4274,
"step": 689
},
{
"epoch": 2.0029027576197387,
"grad_norm": 0.4557321330496575,
"learning_rate": 2.614091277548864e-05,
"loss": 1.3635,
"step": 690
},
{
"epoch": 2.0058055152394774,
"grad_norm": 0.45622176350759885,
"learning_rate": 2.6003126102010695e-05,
"loss": 1.456,
"step": 691
},
{
"epoch": 2.008708272859216,
"grad_norm": 0.4725131985762358,
"learning_rate": 2.5865575795021218e-05,
"loss": 1.324,
"step": 692
},
{
"epoch": 2.011611030478955,
"grad_norm": 0.44436745357994795,
"learning_rate": 2.5728263209375148e-05,
"loss": 1.3873,
"step": 693
},
{
"epoch": 2.014513788098694,
"grad_norm": 0.4497919153312281,
"learning_rate": 2.559118969758595e-05,
"loss": 1.3915,
"step": 694
},
{
"epoch": 2.0174165457184325,
"grad_norm": 0.4317159655970588,
"learning_rate": 2.545435660981212e-05,
"loss": 1.3971,
"step": 695
},
{
"epoch": 2.0203193033381712,
"grad_norm": 0.4454945613563474,
"learning_rate": 2.531776529384407e-05,
"loss": 1.3837,
"step": 696
},
{
"epoch": 2.02322206095791,
"grad_norm": 0.4739025232744788,
"learning_rate": 2.518141709509084e-05,
"loss": 1.3939,
"step": 697
},
{
"epoch": 2.0261248185776486,
"grad_norm": 0.4699422042816283,
"learning_rate": 2.504531335656668e-05,
"loss": 1.4119,
"step": 698
},
{
"epoch": 2.0290275761973877,
"grad_norm": 0.477325925593779,
"learning_rate": 2.4909455418877985e-05,
"loss": 1.3265,
"step": 699
},
{
"epoch": 2.0319303338171264,
"grad_norm": 0.4835969472659004,
"learning_rate": 2.4773844620210118e-05,
"loss": 1.3681,
"step": 700
},
{
"epoch": 2.034833091436865,
"grad_norm": 0.4986977198057429,
"learning_rate": 2.4638482296314004e-05,
"loss": 1.4513,
"step": 701
},
{
"epoch": 2.0377358490566038,
"grad_norm": 0.49380267557508806,
"learning_rate": 2.450336978049322e-05,
"loss": 1.3194,
"step": 702
},
{
"epoch": 2.0406386066763424,
"grad_norm": 0.4885326251641957,
"learning_rate": 2.436850840359073e-05,
"loss": 1.3682,
"step": 703
},
{
"epoch": 2.043541364296081,
"grad_norm": 0.5008933995641366,
"learning_rate": 2.423389949397582e-05,
"loss": 1.375,
"step": 704
},
{
"epoch": 2.04644412191582,
"grad_norm": 0.49081491092972973,
"learning_rate": 2.4099544377530996e-05,
"loss": 1.4345,
"step": 705
},
{
"epoch": 2.049346879535559,
"grad_norm": 0.4904500558785078,
"learning_rate": 2.3965444377638906e-05,
"loss": 1.3821,
"step": 706
},
{
"epoch": 2.0522496371552976,
"grad_norm": 0.4983854848360754,
"learning_rate": 2.3831600815169408e-05,
"loss": 1.4282,
"step": 707
},
{
"epoch": 2.0551523947750363,
"grad_norm": 0.5046530456809294,
"learning_rate": 2.3698015008466372e-05,
"loss": 1.4525,
"step": 708
},
{
"epoch": 2.058055152394775,
"grad_norm": 0.48755264440645235,
"learning_rate": 2.3564688273334858e-05,
"loss": 1.3589,
"step": 709
},
{
"epoch": 2.0609579100145137,
"grad_norm": 0.49815722623164066,
"learning_rate": 2.3431621923028145e-05,
"loss": 1.4545,
"step": 710
},
{
"epoch": 2.0638606676342524,
"grad_norm": 0.46944624619176667,
"learning_rate": 2.329881726823466e-05,
"loss": 1.3532,
"step": 711
},
{
"epoch": 2.0667634252539915,
"grad_norm": 0.4923226798240205,
"learning_rate": 2.3166275617065213e-05,
"loss": 1.314,
"step": 712
},
{
"epoch": 2.06966618287373,
"grad_norm": 0.47975771302322007,
"learning_rate": 2.3033998275040046e-05,
"loss": 1.404,
"step": 713
},
{
"epoch": 2.072568940493469,
"grad_norm": 0.4856107722307854,
"learning_rate": 2.2901986545076e-05,
"loss": 1.3862,
"step": 714
},
{
"epoch": 2.0754716981132075,
"grad_norm": 0.4765429329735173,
"learning_rate": 2.277024172747364e-05,
"loss": 1.4092,
"step": 715
},
{
"epoch": 2.078374455732946,
"grad_norm": 0.47372634491840077,
"learning_rate": 2.26387651199045e-05,
"loss": 1.3414,
"step": 716
},
{
"epoch": 2.081277213352685,
"grad_norm": 0.46133665210994745,
"learning_rate": 2.2507558017398263e-05,
"loss": 1.4205,
"step": 717
},
{
"epoch": 2.084179970972424,
"grad_norm": 0.4858485492689923,
"learning_rate": 2.2376621712330015e-05,
"loss": 1.3465,
"step": 718
},
{
"epoch": 2.0870827285921627,
"grad_norm": 0.5010202222476934,
"learning_rate": 2.2245957494407526e-05,
"loss": 1.3321,
"step": 719
},
{
"epoch": 2.0899854862119014,
"grad_norm": 0.4968076077097432,
"learning_rate": 2.2115566650658536e-05,
"loss": 1.3432,
"step": 720
},
{
"epoch": 2.09288824383164,
"grad_norm": 0.5114363549837371,
"learning_rate": 2.1985450465418084e-05,
"loss": 1.3482,
"step": 721
},
{
"epoch": 2.0957910014513788,
"grad_norm": 0.49706126795541933,
"learning_rate": 2.185561022031582e-05,
"loss": 1.4613,
"step": 722
},
{
"epoch": 2.0986937590711174,
"grad_norm": 0.5057797994562251,
"learning_rate": 2.172604719426351e-05,
"loss": 1.3237,
"step": 723
},
{
"epoch": 2.101596516690856,
"grad_norm": 0.4927672351192254,
"learning_rate": 2.1596762663442218e-05,
"loss": 1.4089,
"step": 724
},
{
"epoch": 2.1044992743105952,
"grad_norm": 0.49942601156872896,
"learning_rate": 2.146775790128994e-05,
"loss": 1.41,
"step": 725
},
{
"epoch": 2.107402031930334,
"grad_norm": 0.4876340617340302,
"learning_rate": 2.1339034178488964e-05,
"loss": 1.3749,
"step": 726
},
{
"epoch": 2.1103047895500726,
"grad_norm": 0.47065769486277115,
"learning_rate": 2.1210592762953375e-05,
"loss": 1.4062,
"step": 727
},
{
"epoch": 2.1132075471698113,
"grad_norm": 0.4865422244332468,
"learning_rate": 2.1082434919816557e-05,
"loss": 1.4275,
"step": 728
},
{
"epoch": 2.11611030478955,
"grad_norm": 0.5067026412498637,
"learning_rate": 2.095456191141874e-05,
"loss": 1.377,
"step": 729
},
{
"epoch": 2.1190130624092887,
"grad_norm": 0.4890102367628194,
"learning_rate": 2.0826974997294574e-05,
"loss": 1.4117,
"step": 730
},
{
"epoch": 2.121915820029028,
"grad_norm": 0.48825969895857046,
"learning_rate": 2.06996754341607e-05,
"loss": 1.4088,
"step": 731
},
{
"epoch": 2.1248185776487665,
"grad_norm": 0.4935458048338819,
"learning_rate": 2.0572664475903386e-05,
"loss": 1.3391,
"step": 732
},
{
"epoch": 2.127721335268505,
"grad_norm": 0.47779751963728906,
"learning_rate": 2.044594337356618e-05,
"loss": 1.3334,
"step": 733
},
{
"epoch": 2.130624092888244,
"grad_norm": 0.5066341938207957,
"learning_rate": 2.0319513375337578e-05,
"loss": 1.4544,
"step": 734
},
{
"epoch": 2.1335268505079825,
"grad_norm": 0.5187919220893455,
"learning_rate": 2.0193375726538737e-05,
"loss": 1.4682,
"step": 735
},
{
"epoch": 2.136429608127721,
"grad_norm": 0.4878857156211185,
"learning_rate": 2.0067531669611206e-05,
"loss": 1.2992,
"step": 736
},
{
"epoch": 2.13933236574746,
"grad_norm": 0.4761358167524258,
"learning_rate": 1.9941982444104677e-05,
"loss": 1.3936,
"step": 737
},
{
"epoch": 2.142235123367199,
"grad_norm": 0.4978567543562124,
"learning_rate": 1.9816729286664798e-05,
"loss": 1.3913,
"step": 738
},
{
"epoch": 2.1451378809869377,
"grad_norm": 0.48409013445619,
"learning_rate": 1.9691773431020975e-05,
"loss": 1.3526,
"step": 739
},
{
"epoch": 2.1480406386066764,
"grad_norm": 0.4962610048271091,
"learning_rate": 1.9567116107974232e-05,
"loss": 1.371,
"step": 740
},
{
"epoch": 2.150943396226415,
"grad_norm": 0.4829458456214998,
"learning_rate": 1.9442758545385065e-05,
"loss": 1.3375,
"step": 741
},
{
"epoch": 2.1538461538461537,
"grad_norm": 0.4919306407776298,
"learning_rate": 1.931870196816138e-05,
"loss": 1.2971,
"step": 742
},
{
"epoch": 2.1567489114658924,
"grad_norm": 0.48170237227141577,
"learning_rate": 1.9194947598246394e-05,
"loss": 1.4046,
"step": 743
},
{
"epoch": 2.1596516690856316,
"grad_norm": 0.4887247950130197,
"learning_rate": 1.907149665460664e-05,
"loss": 1.3468,
"step": 744
},
{
"epoch": 2.1625544267053702,
"grad_norm": 0.4685233072820645,
"learning_rate": 1.894835035321991e-05,
"loss": 1.3602,
"step": 745
},
{
"epoch": 2.165457184325109,
"grad_norm": 0.48306280479073915,
"learning_rate": 1.8825509907063327e-05,
"loss": 1.3737,
"step": 746
},
{
"epoch": 2.1683599419448476,
"grad_norm": 0.47900549560949574,
"learning_rate": 1.8702976526101364e-05,
"loss": 1.4668,
"step": 747
},
{
"epoch": 2.1712626995645863,
"grad_norm": 0.4980927579928276,
"learning_rate": 1.8580751417273928e-05,
"loss": 1.3561,
"step": 748
},
{
"epoch": 2.174165457184325,
"grad_norm": 0.48557681598505736,
"learning_rate": 1.8458835784484503e-05,
"loss": 1.3856,
"step": 749
},
{
"epoch": 2.1770682148040637,
"grad_norm": 0.5057902018093277,
"learning_rate": 1.833723082858825e-05,
"loss": 1.3914,
"step": 750
},
{
"epoch": 2.1799709724238028,
"grad_norm": 0.482474372987207,
"learning_rate": 1.8215937747380203e-05,
"loss": 1.4226,
"step": 751
},
{
"epoch": 2.1828737300435415,
"grad_norm": 0.49414740028540627,
"learning_rate": 1.8094957735583463e-05,
"loss": 1.4138,
"step": 752
},
{
"epoch": 2.18577648766328,
"grad_norm": 0.4893020734599679,
"learning_rate": 1.7974291984837443e-05,
"loss": 1.3541,
"step": 753
},
{
"epoch": 2.188679245283019,
"grad_norm": 0.4997930723090434,
"learning_rate": 1.7853941683686114e-05,
"loss": 1.356,
"step": 754
},
{
"epoch": 2.1915820029027575,
"grad_norm": 0.4835852314226089,
"learning_rate": 1.7733908017566296e-05,
"loss": 1.2962,
"step": 755
},
{
"epoch": 2.194484760522496,
"grad_norm": 0.4963307361878724,
"learning_rate": 1.761419216879601e-05,
"loss": 1.3227,
"step": 756
},
{
"epoch": 2.1973875181422353,
"grad_norm": 0.507253503445589,
"learning_rate": 1.749479531656279e-05,
"loss": 1.3947,
"step": 757
},
{
"epoch": 2.200290275761974,
"grad_norm": 0.49452460788894004,
"learning_rate": 1.7375718636912103e-05,
"loss": 1.301,
"step": 758
},
{
"epoch": 2.2031930333817127,
"grad_norm": 0.5064223871657909,
"learning_rate": 1.725696330273575e-05,
"loss": 1.3948,
"step": 759
},
{
"epoch": 2.2060957910014514,
"grad_norm": 0.4906317272693405,
"learning_rate": 1.7138530483760314e-05,
"loss": 1.3527,
"step": 760
},
{
"epoch": 2.20899854862119,
"grad_norm": 0.4953498006018203,
"learning_rate": 1.7020421346535637e-05,
"loss": 1.429,
"step": 761
},
{
"epoch": 2.2119013062409287,
"grad_norm": 0.4978223511220571,
"learning_rate": 1.690263705442334e-05,
"loss": 1.4315,
"step": 762
},
{
"epoch": 2.214804063860668,
"grad_norm": 0.4970907680795578,
"learning_rate": 1.678517876758536e-05,
"loss": 1.338,
"step": 763
},
{
"epoch": 2.2177068214804065,
"grad_norm": 0.5184449533236907,
"learning_rate": 1.6668047642972494e-05,
"loss": 1.3566,
"step": 764
},
{
"epoch": 2.2206095791001452,
"grad_norm": 0.48855554130584444,
"learning_rate": 1.6551244834313062e-05,
"loss": 1.3442,
"step": 765
},
{
"epoch": 2.223512336719884,
"grad_norm": 0.5270551908185177,
"learning_rate": 1.6434771492101485e-05,
"loss": 1.2931,
"step": 766
},
{
"epoch": 2.2264150943396226,
"grad_norm": 0.4893431924149266,
"learning_rate": 1.631862876358699e-05,
"loss": 1.3592,
"step": 767
},
{
"epoch": 2.2293178519593613,
"grad_norm": 0.4973798951818951,
"learning_rate": 1.6202817792762282e-05,
"loss": 1.4016,
"step": 768
},
{
"epoch": 2.2322206095791,
"grad_norm": 0.4943300515226499,
"learning_rate": 1.6087339720352307e-05,
"loss": 1.3039,
"step": 769
},
{
"epoch": 2.235123367198839,
"grad_norm": 0.5036358219004383,
"learning_rate": 1.5972195683802992e-05,
"loss": 1.3854,
"step": 770
},
{
"epoch": 2.2380261248185778,
"grad_norm": 0.4907052752814604,
"learning_rate": 1.585738681727006e-05,
"loss": 1.3978,
"step": 771
},
{
"epoch": 2.2409288824383164,
"grad_norm": 0.4988082777324976,
"learning_rate": 1.5742914251607793e-05,
"loss": 1.3216,
"step": 772
},
{
"epoch": 2.243831640058055,
"grad_norm": 0.4988643623846494,
"learning_rate": 1.5628779114358034e-05,
"loss": 1.4209,
"step": 773
},
{
"epoch": 2.246734397677794,
"grad_norm": 0.5071115791821638,
"learning_rate": 1.5514982529738946e-05,
"loss": 1.3524,
"step": 774
},
{
"epoch": 2.2496371552975325,
"grad_norm": 0.5181860773942285,
"learning_rate": 1.5401525618633962e-05,
"loss": 1.4223,
"step": 775
},
{
"epoch": 2.252539912917271,
"grad_norm": 0.5117166725724893,
"learning_rate": 1.5288409498580824e-05,
"loss": 1.3106,
"step": 776
},
{
"epoch": 2.2554426705370103,
"grad_norm": 0.5085644146108604,
"learning_rate": 1.5175635283760498e-05,
"loss": 1.4452,
"step": 777
},
{
"epoch": 2.258345428156749,
"grad_norm": 0.49242216361740593,
"learning_rate": 1.506320408498621e-05,
"loss": 1.3657,
"step": 778
},
{
"epoch": 2.2612481857764877,
"grad_norm": 0.5033958192043735,
"learning_rate": 1.4951117009692528e-05,
"loss": 1.3486,
"step": 779
},
{
"epoch": 2.2641509433962264,
"grad_norm": 0.501556306751673,
"learning_rate": 1.4839375161924446e-05,
"loss": 1.399,
"step": 780
},
{
"epoch": 2.267053701015965,
"grad_norm": 0.5043401261121595,
"learning_rate": 1.47279796423265e-05,
"loss": 1.3427,
"step": 781
},
{
"epoch": 2.269956458635704,
"grad_norm": 0.5016851954141492,
"learning_rate": 1.4616931548131929e-05,
"loss": 1.408,
"step": 782
},
{
"epoch": 2.272859216255443,
"grad_norm": 0.503007502023911,
"learning_rate": 1.4506231973151884e-05,
"loss": 1.4321,
"step": 783
},
{
"epoch": 2.2757619738751815,
"grad_norm": 0.4998191511584849,
"learning_rate": 1.4395882007764644e-05,
"loss": 1.3539,
"step": 784
},
{
"epoch": 2.27866473149492,
"grad_norm": 0.49790149970362657,
"learning_rate": 1.4285882738904822e-05,
"loss": 1.2995,
"step": 785
},
{
"epoch": 2.281567489114659,
"grad_norm": 0.5250520154836357,
"learning_rate": 1.4176235250052788e-05,
"loss": 1.3714,
"step": 786
},
{
"epoch": 2.2844702467343976,
"grad_norm": 0.5127466155566004,
"learning_rate": 1.406694062122389e-05,
"loss": 1.4222,
"step": 787
},
{
"epoch": 2.2873730043541363,
"grad_norm": 0.5065590163522045,
"learning_rate": 1.3957999928957787e-05,
"loss": 1.3487,
"step": 788
},
{
"epoch": 2.2902757619738754,
"grad_norm": 0.49043631461350506,
"learning_rate": 1.3849414246307996e-05,
"loss": 1.3962,
"step": 789
},
{
"epoch": 2.293178519593614,
"grad_norm": 0.5108835180194123,
"learning_rate": 1.3741184642831189e-05,
"loss": 1.2766,
"step": 790
},
{
"epoch": 2.2960812772133528,
"grad_norm": 0.5138877705379701,
"learning_rate": 1.3633312184576651e-05,
"loss": 1.3612,
"step": 791
},
{
"epoch": 2.2989840348330914,
"grad_norm": 0.5298526548747806,
"learning_rate": 1.3525797934075912e-05,
"loss": 1.4138,
"step": 792
},
{
"epoch": 2.30188679245283,
"grad_norm": 0.5122921685877133,
"learning_rate": 1.341864295033215e-05,
"loss": 1.4061,
"step": 793
},
{
"epoch": 2.304789550072569,
"grad_norm": 0.49002586742191384,
"learning_rate": 1.3311848288809813e-05,
"loss": 1.4082,
"step": 794
},
{
"epoch": 2.3076923076923075,
"grad_norm": 0.5041135948947505,
"learning_rate": 1.3205415001424176e-05,
"loss": 1.3444,
"step": 795
},
{
"epoch": 2.3105950653120466,
"grad_norm": 0.5071150981383399,
"learning_rate": 1.3099344136531105e-05,
"loss": 1.3448,
"step": 796
},
{
"epoch": 2.3134978229317853,
"grad_norm": 0.5007312719246397,
"learning_rate": 1.2993636738916604e-05,
"loss": 1.3141,
"step": 797
},
{
"epoch": 2.316400580551524,
"grad_norm": 0.5016285196263098,
"learning_rate": 1.2888293849786503e-05,
"loss": 1.3987,
"step": 798
},
{
"epoch": 2.3193033381712627,
"grad_norm": 0.5143146682326721,
"learning_rate": 1.2783316506756377e-05,
"loss": 1.3866,
"step": 799
},
{
"epoch": 2.3222060957910013,
"grad_norm": 0.4988793387220004,
"learning_rate": 1.2678705743841152e-05,
"loss": 1.374,
"step": 800
},
{
"epoch": 2.32510885341074,
"grad_norm": 0.497268328746734,
"learning_rate": 1.257446259144494e-05,
"loss": 1.3899,
"step": 801
},
{
"epoch": 2.3280116110304787,
"grad_norm": 0.5004669903148027,
"learning_rate": 1.2470588076351036e-05,
"loss": 1.3527,
"step": 802
},
{
"epoch": 2.330914368650218,
"grad_norm": 0.511701233655665,
"learning_rate": 1.2367083221711639e-05,
"loss": 1.3961,
"step": 803
},
{
"epoch": 2.3338171262699565,
"grad_norm": 0.5129756304318318,
"learning_rate": 1.2263949047037786e-05,
"loss": 1.4234,
"step": 804
},
{
"epoch": 2.336719883889695,
"grad_norm": 0.5138138080878748,
"learning_rate": 1.2161186568189459e-05,
"loss": 1.3594,
"step": 805
},
{
"epoch": 2.339622641509434,
"grad_norm": 0.5059358852316208,
"learning_rate": 1.2058796797365423e-05,
"loss": 1.4111,
"step": 806
},
{
"epoch": 2.3425253991291726,
"grad_norm": 0.507752171505059,
"learning_rate": 1.19567807430933e-05,
"loss": 1.4224,
"step": 807
},
{
"epoch": 2.3454281567489117,
"grad_norm": 0.49887740788588064,
"learning_rate": 1.1855139410219657e-05,
"loss": 1.4154,
"step": 808
},
{
"epoch": 2.3483309143686504,
"grad_norm": 0.5192444115110244,
"learning_rate": 1.1753873799900133e-05,
"loss": 1.4032,
"step": 809
},
{
"epoch": 2.351233671988389,
"grad_norm": 0.5305883683714778,
"learning_rate": 1.1652984909589515e-05,
"loss": 1.3826,
"step": 810
},
{
"epoch": 2.3541364296081277,
"grad_norm": 0.5304755788595946,
"learning_rate": 1.1552473733031894e-05,
"loss": 1.3872,
"step": 811
},
{
"epoch": 2.3570391872278664,
"grad_norm": 0.5241059985415158,
"learning_rate": 1.145234126025102e-05,
"loss": 1.3818,
"step": 812
},
{
"epoch": 2.359941944847605,
"grad_norm": 0.5158747189270558,
"learning_rate": 1.1352588477540388e-05,
"loss": 1.3471,
"step": 813
},
{
"epoch": 2.362844702467344,
"grad_norm": 0.5235648331116857,
"learning_rate": 1.1253216367453578e-05,
"loss": 1.3765,
"step": 814
},
{
"epoch": 2.365747460087083,
"grad_norm": 0.501396168167529,
"learning_rate": 1.1154225908794642e-05,
"loss": 1.4147,
"step": 815
},
{
"epoch": 2.3686502177068216,
"grad_norm": 0.48002662639608085,
"learning_rate": 1.1055618076608381e-05,
"loss": 1.2803,
"step": 816
},
{
"epoch": 2.3715529753265603,
"grad_norm": 0.5102312548878859,
"learning_rate": 1.095739384217075e-05,
"loss": 1.291,
"step": 817
},
{
"epoch": 2.374455732946299,
"grad_norm": 0.5054912657183752,
"learning_rate": 1.085955417297932e-05,
"loss": 1.3159,
"step": 818
},
{
"epoch": 2.3773584905660377,
"grad_norm": 0.5009360091281737,
"learning_rate": 1.0762100032743783e-05,
"loss": 1.3619,
"step": 819
},
{
"epoch": 2.3802612481857763,
"grad_norm": 0.49586961064570917,
"learning_rate": 1.0665032381376338e-05,
"loss": 1.3883,
"step": 820
},
{
"epoch": 2.383164005805515,
"grad_norm": 0.505661088602094,
"learning_rate": 1.056835217498236e-05,
"loss": 1.397,
"step": 821
},
{
"epoch": 2.386066763425254,
"grad_norm": 0.5138105778409985,
"learning_rate": 1.047206036585095e-05,
"loss": 1.3653,
"step": 822
},
{
"epoch": 2.388969521044993,
"grad_norm": 0.4822618105950922,
"learning_rate": 1.0376157902445488e-05,
"loss": 1.3404,
"step": 823
},
{
"epoch": 2.3918722786647315,
"grad_norm": 0.49132290633305986,
"learning_rate": 1.0280645729394366e-05,
"loss": 1.2979,
"step": 824
},
{
"epoch": 2.39477503628447,
"grad_norm": 0.5035651996987953,
"learning_rate": 1.0185524787481693e-05,
"loss": 1.3394,
"step": 825
},
{
"epoch": 2.397677793904209,
"grad_norm": 0.5118690550363177,
"learning_rate": 1.0090796013637965e-05,
"loss": 1.3333,
"step": 826
},
{
"epoch": 2.4005805515239476,
"grad_norm": 0.5044802728060694,
"learning_rate": 9.99646034093083e-06,
"loss": 1.3782,
"step": 827
},
{
"epoch": 2.4034833091436867,
"grad_norm": 0.5046761953713803,
"learning_rate": 9.902518698556018e-06,
"loss": 1.3676,
"step": 828
},
{
"epoch": 2.4063860667634254,
"grad_norm": 0.48682898652325846,
"learning_rate": 9.808972011828055e-06,
"loss": 1.3647,
"step": 829
},
{
"epoch": 2.409288824383164,
"grad_norm": 0.5137970317883824,
"learning_rate": 9.715821202171178e-06,
"loss": 1.3816,
"step": 830
},
{
"epoch": 2.4121915820029027,
"grad_norm": 0.5181780356155109,
"learning_rate": 9.623067187110307e-06,
"loss": 1.3806,
"step": 831
},
{
"epoch": 2.4150943396226414,
"grad_norm": 0.5095349311416658,
"learning_rate": 9.530710880262023e-06,
"loss": 1.3403,
"step": 832
},
{
"epoch": 2.41799709724238,
"grad_norm": 0.5271513809867036,
"learning_rate": 9.438753191325439e-06,
"loss": 1.3531,
"step": 833
},
{
"epoch": 2.4208998548621192,
"grad_norm": 0.4968943434293811,
"learning_rate": 9.347195026073369e-06,
"loss": 1.4333,
"step": 834
},
{
"epoch": 2.423802612481858,
"grad_norm": 0.49003943814380985,
"learning_rate": 9.256037286343412e-06,
"loss": 1.3893,
"step": 835
},
{
"epoch": 2.4267053701015966,
"grad_norm": 0.5173684460105249,
"learning_rate": 9.16528087002892e-06,
"loss": 1.3254,
"step": 836
},
{
"epoch": 2.4296081277213353,
"grad_norm": 0.5063931362680633,
"learning_rate": 9.074926671070322e-06,
"loss": 1.3845,
"step": 837
},
{
"epoch": 2.432510885341074,
"grad_norm": 0.5027336574880931,
"learning_rate": 8.98497557944627e-06,
"loss": 1.4124,
"step": 838
},
{
"epoch": 2.4354136429608126,
"grad_norm": 0.4881635920339692,
"learning_rate": 8.895428481164792e-06,
"loss": 1.3367,
"step": 839
},
{
"epoch": 2.4383164005805513,
"grad_norm": 0.5040237832255919,
"learning_rate": 8.806286258254664e-06,
"loss": 1.3986,
"step": 840
},
{
"epoch": 2.4412191582002905,
"grad_norm": 0.5185080756819134,
"learning_rate": 8.717549788756679e-06,
"loss": 1.4287,
"step": 841
},
{
"epoch": 2.444121915820029,
"grad_norm": 0.5351373346531869,
"learning_rate": 8.62921994671501e-06,
"loss": 1.4077,
"step": 842
},
{
"epoch": 2.447024673439768,
"grad_norm": 0.49769828016586376,
"learning_rate": 8.541297602168591e-06,
"loss": 1.3505,
"step": 843
},
{
"epoch": 2.4499274310595065,
"grad_norm": 0.5208074426406215,
"learning_rate": 8.453783621142542e-06,
"loss": 1.3413,
"step": 844
},
{
"epoch": 2.452830188679245,
"grad_norm": 0.5124329105235204,
"learning_rate": 8.366678865639688e-06,
"loss": 1.3556,
"step": 845
},
{
"epoch": 2.455732946298984,
"grad_norm": 0.51757961569392,
"learning_rate": 8.279984193631967e-06,
"loss": 1.3583,
"step": 846
},
{
"epoch": 2.4586357039187225,
"grad_norm": 0.5141666526518351,
"learning_rate": 8.193700459052078e-06,
"loss": 1.408,
"step": 847
},
{
"epoch": 2.4615384615384617,
"grad_norm": 0.49883727302570513,
"learning_rate": 8.10782851178506e-06,
"loss": 1.3709,
"step": 848
},
{
"epoch": 2.4644412191582004,
"grad_norm": 0.503942767328435,
"learning_rate": 8.022369197659824e-06,
"loss": 1.3724,
"step": 849
},
{
"epoch": 2.467343976777939,
"grad_norm": 0.5101251999393575,
"learning_rate": 7.937323358440935e-06,
"loss": 1.4084,
"step": 850
},
{
"epoch": 2.4702467343976777,
"grad_norm": 0.529612244088611,
"learning_rate": 7.852691831820307e-06,
"loss": 1.3022,
"step": 851
},
{
"epoch": 2.4731494920174164,
"grad_norm": 0.5087857566695692,
"learning_rate": 7.768475451408847e-06,
"loss": 1.3652,
"step": 852
},
{
"epoch": 2.4760522496371555,
"grad_norm": 0.5026378194874479,
"learning_rate": 7.684675046728373e-06,
"loss": 1.4516,
"step": 853
},
{
"epoch": 2.478955007256894,
"grad_norm": 0.49950954407638787,
"learning_rate": 7.601291443203373e-06,
"loss": 1.3906,
"step": 854
},
{
"epoch": 2.481857764876633,
"grad_norm": 0.5170406218834634,
"learning_rate": 7.518325462152892e-06,
"loss": 1.307,
"step": 855
},
{
"epoch": 2.4847605224963716,
"grad_norm": 0.5220404801058548,
"learning_rate": 7.435777920782444e-06,
"loss": 1.3827,
"step": 856
},
{
"epoch": 2.4876632801161103,
"grad_norm": 0.5304365236422506,
"learning_rate": 7.353649632175957e-06,
"loss": 1.4055,
"step": 857
},
{
"epoch": 2.490566037735849,
"grad_norm": 0.5056290605708762,
"learning_rate": 7.271941405287763e-06,
"loss": 1.4271,
"step": 858
},
{
"epoch": 2.4934687953555876,
"grad_norm": 0.4971594174251956,
"learning_rate": 7.190654044934642e-06,
"loss": 1.4138,
"step": 859
},
{
"epoch": 2.4963715529753268,
"grad_norm": 0.510540754439777,
"learning_rate": 7.109788351787866e-06,
"loss": 1.3451,
"step": 860
},
{
"epoch": 2.4992743105950654,
"grad_norm": 0.5015971371017499,
"learning_rate": 7.029345122365389e-06,
"loss": 1.3916,
"step": 861
},
{
"epoch": 2.502177068214804,
"grad_norm": 0.5015272864493691,
"learning_rate": 6.949325149023861e-06,
"loss": 1.3676,
"step": 862
},
{
"epoch": 2.505079825834543,
"grad_norm": 0.5046822050793328,
"learning_rate": 6.86972921995096e-06,
"loss": 1.3456,
"step": 863
},
{
"epoch": 2.5079825834542815,
"grad_norm": 0.5213334414439604,
"learning_rate": 6.790558119157597e-06,
"loss": 1.4038,
"step": 864
},
{
"epoch": 2.51088534107402,
"grad_norm": 0.5038730547605234,
"learning_rate": 6.711812626470104e-06,
"loss": 1.3921,
"step": 865
},
{
"epoch": 2.513788098693759,
"grad_norm": 0.501461228178946,
"learning_rate": 6.633493517522687e-06,
"loss": 1.3485,
"step": 866
},
{
"epoch": 2.516690856313498,
"grad_norm": 0.5199104862070939,
"learning_rate": 6.555601563749675e-06,
"loss": 1.3784,
"step": 867
},
{
"epoch": 2.5195936139332367,
"grad_norm": 0.5169781637578375,
"learning_rate": 6.478137532378004e-06,
"loss": 1.362,
"step": 868
},
{
"epoch": 2.5224963715529753,
"grad_norm": 0.5013508498423759,
"learning_rate": 6.4011021864195885e-06,
"loss": 1.296,
"step": 869
},
{
"epoch": 2.525399129172714,
"grad_norm": 0.5030515933490873,
"learning_rate": 6.324496284663867e-06,
"loss": 1.4247,
"step": 870
},
{
"epoch": 2.5283018867924527,
"grad_norm": 0.5177816363991151,
"learning_rate": 6.248320581670281e-06,
"loss": 1.3704,
"step": 871
},
{
"epoch": 2.531204644412192,
"grad_norm": 0.5092274322111774,
"learning_rate": 6.172575827760885e-06,
"loss": 1.3203,
"step": 872
},
{
"epoch": 2.53410740203193,
"grad_norm": 0.5140046912292023,
"learning_rate": 6.097262769012912e-06,
"loss": 1.3403,
"step": 873
},
{
"epoch": 2.537010159651669,
"grad_norm": 0.49602927402162544,
"learning_rate": 6.022382147251454e-06,
"loss": 1.2882,
"step": 874
},
{
"epoch": 2.539912917271408,
"grad_norm": 0.5269273308467513,
"learning_rate": 5.947934700042162e-06,
"loss": 1.3556,
"step": 875
},
{
"epoch": 2.5428156748911466,
"grad_norm": 0.5182565149631638,
"learning_rate": 5.873921160683943e-06,
"loss": 1.413,
"step": 876
},
{
"epoch": 2.5457184325108853,
"grad_norm": 0.5151482293363229,
"learning_rate": 5.800342258201774e-06,
"loss": 1.3546,
"step": 877
},
{
"epoch": 2.548621190130624,
"grad_norm": 0.508277679506511,
"learning_rate": 5.727198717339511e-06,
"loss": 1.3641,
"step": 878
},
{
"epoch": 2.551523947750363,
"grad_norm": 0.5435889399161812,
"learning_rate": 5.654491258552736e-06,
"loss": 1.3751,
"step": 879
},
{
"epoch": 2.5544267053701017,
"grad_norm": 0.48717675362297463,
"learning_rate": 5.582220598001681e-06,
"loss": 1.3444,
"step": 880
},
{
"epoch": 2.5573294629898404,
"grad_norm": 0.5104381695238399,
"learning_rate": 5.510387447544168e-06,
"loss": 1.3389,
"step": 881
},
{
"epoch": 2.560232220609579,
"grad_norm": 0.5033562135872932,
"learning_rate": 5.438992514728586e-06,
"loss": 1.4053,
"step": 882
},
{
"epoch": 2.563134978229318,
"grad_norm": 0.5184770727956874,
"learning_rate": 5.368036502786927e-06,
"loss": 1.3709,
"step": 883
},
{
"epoch": 2.5660377358490565,
"grad_norm": 0.5058658804730517,
"learning_rate": 5.297520110627868e-06,
"loss": 1.3803,
"step": 884
},
{
"epoch": 2.568940493468795,
"grad_norm": 0.5159172488807618,
"learning_rate": 5.227444032829887e-06,
"loss": 1.4181,
"step": 885
},
{
"epoch": 2.5718432510885343,
"grad_norm": 0.5081523708347989,
"learning_rate": 5.157808959634408e-06,
"loss": 1.3352,
"step": 886
},
{
"epoch": 2.574746008708273,
"grad_norm": 0.5116219142410966,
"learning_rate": 5.08861557693901e-06,
"loss": 1.3811,
"step": 887
},
{
"epoch": 2.5776487663280117,
"grad_norm": 0.4821542546715502,
"learning_rate": 5.0198645662906666e-06,
"loss": 1.3791,
"step": 888
},
{
"epoch": 2.5805515239477503,
"grad_norm": 0.5047932657720178,
"learning_rate": 4.951556604879048e-06,
"loss": 1.4468,
"step": 889
},
{
"epoch": 2.583454281567489,
"grad_norm": 0.5082701448991395,
"learning_rate": 4.88369236552983e-06,
"loss": 1.3857,
"step": 890
},
{
"epoch": 2.5863570391872277,
"grad_norm": 0.5068038259373514,
"learning_rate": 4.816272516698073e-06,
"loss": 1.3518,
"step": 891
},
{
"epoch": 2.5892597968069664,
"grad_norm": 0.49604277625126053,
"learning_rate": 4.74929772246166e-06,
"loss": 1.4263,
"step": 892
},
{
"epoch": 2.5921625544267055,
"grad_norm": 0.5166732210194949,
"learning_rate": 4.682768642514723e-06,
"loss": 1.2918,
"step": 893
},
{
"epoch": 2.595065312046444,
"grad_norm": 0.4956720644914752,
"learning_rate": 4.616685932161152e-06,
"loss": 1.3491,
"step": 894
},
{
"epoch": 2.597968069666183,
"grad_norm": 0.5250136252293581,
"learning_rate": 4.551050242308158e-06,
"loss": 1.3351,
"step": 895
},
{
"epoch": 2.6008708272859216,
"grad_norm": 0.5072190139510312,
"learning_rate": 4.4858622194598525e-06,
"loss": 1.3247,
"step": 896
},
{
"epoch": 2.6037735849056602,
"grad_norm": 0.5371609824213853,
"learning_rate": 4.421122505710867e-06,
"loss": 1.326,
"step": 897
},
{
"epoch": 2.6066763425253994,
"grad_norm": 0.5216292074955919,
"learning_rate": 4.356831738740053e-06,
"loss": 1.342,
"step": 898
},
{
"epoch": 2.6095791001451376,
"grad_norm": 0.5224569341550335,
"learning_rate": 4.292990551804171e-06,
"loss": 1.4003,
"step": 899
},
{
"epoch": 2.6124818577648767,
"grad_norm": 0.49066645285583466,
"learning_rate": 4.229599573731685e-06,
"loss": 1.3677,
"step": 900
},
{
"epoch": 2.6153846153846154,
"grad_norm": 0.48822015231136007,
"learning_rate": 4.166659428916541e-06,
"loss": 1.3293,
"step": 901
},
{
"epoch": 2.618287373004354,
"grad_norm": 0.5169410366741958,
"learning_rate": 4.1041707373120356e-06,
"loss": 1.3537,
"step": 902
},
{
"epoch": 2.621190130624093,
"grad_norm": 0.5033953243716508,
"learning_rate": 4.042134114424695e-06,
"loss": 1.3518,
"step": 903
},
{
"epoch": 2.6240928882438315,
"grad_norm": 0.497328022255338,
"learning_rate": 3.980550171308228e-06,
"loss": 1.3499,
"step": 904
},
{
"epoch": 2.6269956458635706,
"grad_norm": 0.4993512760500125,
"learning_rate": 3.919419514557493e-06,
"loss": 1.389,
"step": 905
},
{
"epoch": 2.6298984034833093,
"grad_norm": 0.49577696692113127,
"learning_rate": 3.858742746302535e-06,
"loss": 1.4708,
"step": 906
},
{
"epoch": 2.632801161103048,
"grad_norm": 0.5095348959235028,
"learning_rate": 3.7985204642026482e-06,
"loss": 1.3972,
"step": 907
},
{
"epoch": 2.6357039187227866,
"grad_norm": 0.504132473704439,
"learning_rate": 3.738753261440475e-06,
"loss": 1.3319,
"step": 908
},
{
"epoch": 2.6386066763425253,
"grad_norm": 0.5106574941743576,
"learning_rate": 3.6794417267161984e-06,
"loss": 1.3224,
"step": 909
},
{
"epoch": 2.641509433962264,
"grad_norm": 0.4933885247457992,
"learning_rate": 3.6205864442417136e-06,
"loss": 1.3549,
"step": 910
},
{
"epoch": 2.6444121915820027,
"grad_norm": 0.5045818756594637,
"learning_rate": 3.5621879937348836e-06,
"loss": 1.3729,
"step": 911
},
{
"epoch": 2.647314949201742,
"grad_norm": 0.49301833752984214,
"learning_rate": 3.5042469504138363e-06,
"loss": 1.3441,
"step": 912
},
{
"epoch": 2.6502177068214805,
"grad_norm": 0.5316526304810579,
"learning_rate": 3.4467638849912497e-06,
"loss": 1.3937,
"step": 913
},
{
"epoch": 2.653120464441219,
"grad_norm": 0.5171439429808068,
"learning_rate": 3.3897393636688368e-06,
"loss": 1.3788,
"step": 914
},
{
"epoch": 2.656023222060958,
"grad_norm": 0.5294437263003443,
"learning_rate": 3.3331739481316624e-06,
"loss": 1.4573,
"step": 915
},
{
"epoch": 2.6589259796806965,
"grad_norm": 0.5107913904094072,
"learning_rate": 3.277068195542654e-06,
"loss": 1.3852,
"step": 916
},
{
"epoch": 2.6618287373004357,
"grad_norm": 0.5023154167004453,
"learning_rate": 3.2214226585371265e-06,
"loss": 1.3356,
"step": 917
},
{
"epoch": 2.664731494920174,
"grad_norm": 0.5233019063386156,
"learning_rate": 3.166237885217299e-06,
"loss": 1.4577,
"step": 918
},
{
"epoch": 2.667634252539913,
"grad_norm": 0.5252621679593605,
"learning_rate": 3.1115144191469493e-06,
"loss": 1.4475,
"step": 919
},
{
"epoch": 2.6705370101596517,
"grad_norm": 0.4966466055560178,
"learning_rate": 3.0572527993460053e-06,
"loss": 1.3461,
"step": 920
},
{
"epoch": 2.6734397677793904,
"grad_norm": 0.5044558616146575,
"learning_rate": 3.0034535602852797e-06,
"loss": 1.3766,
"step": 921
},
{
"epoch": 2.676342525399129,
"grad_norm": 0.5205402435681714,
"learning_rate": 2.950117231881183e-06,
"loss": 1.4068,
"step": 922
},
{
"epoch": 2.6792452830188678,
"grad_norm": 0.53158129182675,
"learning_rate": 2.89724433949049e-06,
"loss": 1.4079,
"step": 923
},
{
"epoch": 2.682148040638607,
"grad_norm": 0.5071608676745457,
"learning_rate": 2.8448354039052216e-06,
"loss": 1.3647,
"step": 924
},
{
"epoch": 2.6850507982583456,
"grad_norm": 0.5242803057589017,
"learning_rate": 2.7928909413474424e-06,
"loss": 1.3526,
"step": 925
},
{
"epoch": 2.6879535558780843,
"grad_norm": 0.5143261027297373,
"learning_rate": 2.741411463464211e-06,
"loss": 1.3732,
"step": 926
},
{
"epoch": 2.690856313497823,
"grad_norm": 0.5091328447989407,
"learning_rate": 2.6903974773225702e-06,
"loss": 1.3514,
"step": 927
},
{
"epoch": 2.6937590711175616,
"grad_norm": 0.5278079810032933,
"learning_rate": 2.639849485404505e-06,
"loss": 1.3671,
"step": 928
},
{
"epoch": 2.6966618287373003,
"grad_norm": 0.5162469457029244,
"learning_rate": 2.589767985601976e-06,
"loss": 1.3656,
"step": 929
},
{
"epoch": 2.699564586357039,
"grad_norm": 0.4924105120909186,
"learning_rate": 2.5401534712121146e-06,
"loss": 1.3818,
"step": 930
},
{
"epoch": 2.702467343976778,
"grad_norm": 0.5154022582971206,
"learning_rate": 2.4910064309322523e-06,
"loss": 1.4229,
"step": 931
},
{
"epoch": 2.705370101596517,
"grad_norm": 0.5014601223020894,
"learning_rate": 2.442327348855161e-06,
"loss": 1.3835,
"step": 932
},
{
"epoch": 2.7082728592162555,
"grad_norm": 0.5011745047358306,
"learning_rate": 2.3941167044642944e-06,
"loss": 1.3152,
"step": 933
},
{
"epoch": 2.711175616835994,
"grad_norm": 0.5080499109568192,
"learning_rate": 2.3463749726290286e-06,
"loss": 1.4209,
"step": 934
},
{
"epoch": 2.714078374455733,
"grad_norm": 0.5032137873479694,
"learning_rate": 2.299102623600019e-06,
"loss": 1.4381,
"step": 935
},
{
"epoch": 2.7169811320754715,
"grad_norm": 0.5131169546437747,
"learning_rate": 2.252300123004525e-06,
"loss": 1.3737,
"step": 936
},
{
"epoch": 2.71988388969521,
"grad_norm": 0.5100726087300796,
"learning_rate": 2.205967931841901e-06,
"loss": 1.354,
"step": 937
},
{
"epoch": 2.7227866473149493,
"grad_norm": 0.5151073468339058,
"learning_rate": 2.1601065064789704e-06,
"loss": 1.361,
"step": 938
},
{
"epoch": 2.725689404934688,
"grad_norm": 0.526955077204686,
"learning_rate": 2.114716298645564e-06,
"loss": 1.4174,
"step": 939
},
{
"epoch": 2.7285921625544267,
"grad_norm": 0.5300214734732652,
"learning_rate": 2.069797755430097e-06,
"loss": 1.3908,
"step": 940
},
{
"epoch": 2.7314949201741654,
"grad_norm": 0.4931016210698146,
"learning_rate": 2.0253513192751373e-06,
"loss": 1.347,
"step": 941
},
{
"epoch": 2.734397677793904,
"grad_norm": 0.5040485652358908,
"learning_rate": 1.981377427973019e-06,
"loss": 1.3539,
"step": 942
},
{
"epoch": 2.737300435413643,
"grad_norm": 0.5028635448800768,
"learning_rate": 1.937876514661613e-06,
"loss": 1.3462,
"step": 943
},
{
"epoch": 2.7402031930333814,
"grad_norm": 0.49312792694294416,
"learning_rate": 1.8948490078199764e-06,
"loss": 1.3591,
"step": 944
},
{
"epoch": 2.7431059506531206,
"grad_norm": 0.5040358290026028,
"learning_rate": 1.8522953312641755e-06,
"loss": 1.3315,
"step": 945
},
{
"epoch": 2.7460087082728593,
"grad_norm": 0.5166855707268636,
"learning_rate": 1.8102159041430922e-06,
"loss": 1.3668,
"step": 946
},
{
"epoch": 2.748911465892598,
"grad_norm": 0.5178874199013686,
"learning_rate": 1.7686111409343342e-06,
"loss": 1.3558,
"step": 947
},
{
"epoch": 2.7518142235123366,
"grad_norm": 0.4904227671998145,
"learning_rate": 1.7274814514400994e-06,
"loss": 1.4023,
"step": 948
},
{
"epoch": 2.7547169811320753,
"grad_norm": 0.507627671545522,
"learning_rate": 1.686827240783151e-06,
"loss": 1.4048,
"step": 949
},
{
"epoch": 2.7576197387518144,
"grad_norm": 0.4900145791976217,
"learning_rate": 1.6466489094028759e-06,
"loss": 1.3752,
"step": 950
},
{
"epoch": 2.760522496371553,
"grad_norm": 0.5022450146235269,
"learning_rate": 1.6069468530512832e-06,
"loss": 1.3576,
"step": 951
},
{
"epoch": 2.763425253991292,
"grad_norm": 0.4897033470200573,
"learning_rate": 1.5677214627891179e-06,
"loss": 1.368,
"step": 952
},
{
"epoch": 2.7663280116110305,
"grad_norm": 0.49639970909661785,
"learning_rate": 1.528973124982036e-06,
"loss": 1.4423,
"step": 953
},
{
"epoch": 2.769230769230769,
"grad_norm": 0.500578087029502,
"learning_rate": 1.4907022212967803e-06,
"loss": 1.3669,
"step": 954
},
{
"epoch": 2.772133526850508,
"grad_norm": 0.5018433548924393,
"learning_rate": 1.4529091286973995e-06,
"loss": 1.4394,
"step": 955
},
{
"epoch": 2.7750362844702465,
"grad_norm": 0.526996294444137,
"learning_rate": 1.415594219441585e-06,
"loss": 1.4012,
"step": 956
},
{
"epoch": 2.7779390420899857,
"grad_norm": 0.5118271403347019,
"learning_rate": 1.3787578610769624e-06,
"loss": 1.3903,
"step": 957
},
{
"epoch": 2.7808417997097243,
"grad_norm": 0.5171995028189363,
"learning_rate": 1.3424004164374837e-06,
"loss": 1.4309,
"step": 958
},
{
"epoch": 2.783744557329463,
"grad_norm": 0.5214493345083933,
"learning_rate": 1.3065222436398572e-06,
"loss": 1.3346,
"step": 959
},
{
"epoch": 2.7866473149492017,
"grad_norm": 0.5092311351300531,
"learning_rate": 1.2711236960800287e-06,
"loss": 1.3195,
"step": 960
},
{
"epoch": 2.7895500725689404,
"grad_norm": 0.5396674308985979,
"learning_rate": 1.2362051224296734e-06,
"loss": 1.3292,
"step": 961
},
{
"epoch": 2.7924528301886795,
"grad_norm": 0.5143627693435764,
"learning_rate": 1.2017668666327753e-06,
"loss": 1.3348,
"step": 962
},
{
"epoch": 2.7953555878084178,
"grad_norm": 0.4875818587153554,
"learning_rate": 1.16780926790227e-06,
"loss": 1.3623,
"step": 963
},
{
"epoch": 2.798258345428157,
"grad_norm": 0.5039943847581683,
"learning_rate": 1.1343326607166527e-06,
"loss": 1.4143,
"step": 964
},
{
"epoch": 2.8011611030478956,
"grad_norm": 0.5176888960875069,
"learning_rate": 1.1013373748166912e-06,
"loss": 1.3408,
"step": 965
},
{
"epoch": 2.8040638606676342,
"grad_norm": 0.5224254539981171,
"learning_rate": 1.0688237352022345e-06,
"loss": 1.3811,
"step": 966
},
{
"epoch": 2.806966618287373,
"grad_norm": 0.5178401265472633,
"learning_rate": 1.0367920621289495e-06,
"loss": 1.4287,
"step": 967
},
{
"epoch": 2.8098693759071116,
"grad_norm": 0.5435668945236312,
"learning_rate": 1.0052426711051666e-06,
"loss": 1.4013,
"step": 968
},
{
"epoch": 2.8127721335268507,
"grad_norm": 0.5100311805279316,
"learning_rate": 9.741758728888218e-07,
"loss": 1.4024,
"step": 969
},
{
"epoch": 2.8156748911465894,
"grad_norm": 0.5043622576172422,
"learning_rate": 9.435919734843645e-07,
"loss": 1.4582,
"step": 970
},
{
"epoch": 2.818577648766328,
"grad_norm": 0.5052190490107448,
"learning_rate": 9.134912741397272e-07,
"loss": 1.4224,
"step": 971
},
{
"epoch": 2.821480406386067,
"grad_norm": 0.500367078563273,
"learning_rate": 8.83874071343388e-07,
"loss": 1.3278,
"step": 972
},
{
"epoch": 2.8243831640058055,
"grad_norm": 0.5016104693203676,
"learning_rate": 8.547406568214456e-07,
"loss": 1.374,
"step": 973
},
{
"epoch": 2.827285921625544,
"grad_norm": 0.5099793577669696,
"learning_rate": 8.260913175347273e-07,
"loss": 1.3584,
"step": 974
},
{
"epoch": 2.830188679245283,
"grad_norm": 0.5015530124896996,
"learning_rate": 7.979263356759748e-07,
"loss": 1.4006,
"step": 975
},
{
"epoch": 2.833091436865022,
"grad_norm": 0.5105087376603901,
"learning_rate": 7.702459886670788e-07,
"loss": 1.3583,
"step": 976
},
{
"epoch": 2.8359941944847606,
"grad_norm": 0.5083958150579762,
"learning_rate": 7.4305054915631e-07,
"loss": 1.3632,
"step": 977
},
{
"epoch": 2.8388969521044993,
"grad_norm": 0.5245591090055167,
"learning_rate": 7.163402850156653e-07,
"loss": 1.4253,
"step": 978
},
{
"epoch": 2.841799709724238,
"grad_norm": 0.4980601261676422,
"learning_rate": 6.901154593382309e-07,
"loss": 1.3251,
"step": 979
},
{
"epoch": 2.8447024673439767,
"grad_norm": 0.5097166168182306,
"learning_rate": 6.643763304355566e-07,
"loss": 1.4185,
"step": 980
},
{
"epoch": 2.8476052249637154,
"grad_norm": 0.5150892752909462,
"learning_rate": 6.39123151835147e-07,
"loss": 1.3235,
"step": 981
},
{
"epoch": 2.850507982583454,
"grad_norm": 0.5192076749407817,
"learning_rate": 6.143561722779523e-07,
"loss": 1.3545,
"step": 982
},
{
"epoch": 2.853410740203193,
"grad_norm": 0.49627606737819685,
"learning_rate": 5.900756357159143e-07,
"loss": 1.3342,
"step": 983
},
{
"epoch": 2.856313497822932,
"grad_norm": 0.5088423319290685,
"learning_rate": 5.662817813095633e-07,
"loss": 1.3772,
"step": 984
},
{
"epoch": 2.8592162554426706,
"grad_norm": 0.5017003419203407,
"learning_rate": 5.429748434256699e-07,
"loss": 1.3668,
"step": 985
},
{
"epoch": 2.8621190130624092,
"grad_norm": 0.5210417902039252,
"learning_rate": 5.201550516349407e-07,
"loss": 1.3844,
"step": 986
},
{
"epoch": 2.865021770682148,
"grad_norm": 0.5067748700864746,
"learning_rate": 4.978226307097266e-07,
"loss": 1.4295,
"step": 987
},
{
"epoch": 2.867924528301887,
"grad_norm": 0.49233248684232,
"learning_rate": 4.7597780062184073e-07,
"loss": 1.4001,
"step": 988
},
{
"epoch": 2.8708272859216253,
"grad_norm": 0.5433349878065107,
"learning_rate": 4.546207765403987e-07,
"loss": 1.322,
"step": 989
},
{
"epoch": 2.8737300435413644,
"grad_norm": 0.5311833853344032,
"learning_rate": 4.337517688296544e-07,
"loss": 1.33,
"step": 990
},
{
"epoch": 2.876632801161103,
"grad_norm": 0.5163955351769733,
"learning_rate": 4.1337098304696784e-07,
"loss": 1.2479,
"step": 991
},
{
"epoch": 2.8795355587808418,
"grad_norm": 0.5153591591160972,
"learning_rate": 3.9347861994078474e-07,
"loss": 1.3705,
"step": 992
},
{
"epoch": 2.8824383164005805,
"grad_norm": 0.511148791041711,
"learning_rate": 3.740748754486156e-07,
"loss": 1.4441,
"step": 993
},
{
"epoch": 2.885341074020319,
"grad_norm": 0.49564300067462175,
"learning_rate": 3.551599406951434e-07,
"loss": 1.4013,
"step": 994
},
{
"epoch": 2.8882438316400583,
"grad_norm": 0.507022305631989,
"learning_rate": 3.3673400199033534e-07,
"loss": 1.3939,
"step": 995
},
{
"epoch": 2.891146589259797,
"grad_norm": 0.5263778954167362,
"learning_rate": 3.1879724082760076e-07,
"loss": 1.4592,
"step": 996
},
{
"epoch": 2.8940493468795356,
"grad_norm": 0.5059056772044945,
"learning_rate": 3.013498338820031e-07,
"loss": 1.3643,
"step": 997
},
{
"epoch": 2.8969521044992743,
"grad_norm": 0.530021262072049,
"learning_rate": 2.843919530085226e-07,
"loss": 1.369,
"step": 998
},
{
"epoch": 2.899854862119013,
"grad_norm": 0.5169811310498346,
"learning_rate": 2.6792376524036877e-07,
"loss": 1.3075,
"step": 999
},
{
"epoch": 2.9027576197387517,
"grad_norm": 0.5081387484868812,
"learning_rate": 2.51945432787315e-07,
"loss": 1.3431,
"step": 1000
},
{
"epoch": 2.9056603773584904,
"grad_norm": 0.5185228126203555,
"learning_rate": 2.364571130341331e-07,
"loss": 1.3231,
"step": 1001
},
{
"epoch": 2.9085631349782295,
"grad_norm": 0.4992860610054546,
"learning_rate": 2.21458958539017e-07,
"loss": 1.416,
"step": 1002
},
{
"epoch": 2.911465892597968,
"grad_norm": 0.5031174334271055,
"learning_rate": 2.0695111703208925e-07,
"loss": 1.3493,
"step": 1003
},
{
"epoch": 2.914368650217707,
"grad_norm": 0.5083212239167739,
"learning_rate": 1.9293373141394122e-07,
"loss": 1.4214,
"step": 1004
},
{
"epoch": 2.9172714078374455,
"grad_norm": 0.5126277819683225,
"learning_rate": 1.7940693975423972e-07,
"loss": 1.3824,
"step": 1005
},
{
"epoch": 2.9201741654571842,
"grad_norm": 0.5256873343721312,
"learning_rate": 1.6637087529033923e-07,
"loss": 1.3647,
"step": 1006
},
{
"epoch": 2.9230769230769234,
"grad_norm": 0.5108208624651032,
"learning_rate": 1.5382566642600516e-07,
"loss": 1.3596,
"step": 1007
},
{
"epoch": 2.9259796806966616,
"grad_norm": 0.5055869922560303,
"learning_rate": 1.4177143673011484e-07,
"loss": 1.3299,
"step": 1008
},
{
"epoch": 2.9288824383164007,
"grad_norm": 0.5090352331699387,
"learning_rate": 1.302083049354752e-07,
"loss": 1.3798,
"step": 1009
},
{
"epoch": 2.9317851959361394,
"grad_norm": 0.5116510716770774,
"learning_rate": 1.191363849376237e-07,
"loss": 1.3138,
"step": 1010
},
{
"epoch": 2.934687953555878,
"grad_norm": 0.5013518456672409,
"learning_rate": 1.0855578579370695e-07,
"loss": 1.4109,
"step": 1011
},
{
"epoch": 2.9375907111756168,
"grad_norm": 0.5161202040778329,
"learning_rate": 9.846661172143723e-08,
"loss": 1.445,
"step": 1012
},
{
"epoch": 2.9404934687953554,
"grad_norm": 0.5214505934486886,
"learning_rate": 8.886896209803208e-08,
"loss": 1.3423,
"step": 1013
},
{
"epoch": 2.9433962264150946,
"grad_norm": 0.5044668356218843,
"learning_rate": 7.976293145924852e-08,
"loss": 1.3901,
"step": 1014
},
{
"epoch": 2.9462989840348333,
"grad_norm": 0.5365354989635048,
"learning_rate": 7.114860949846703e-08,
"loss": 1.4406,
"step": 1015
},
{
"epoch": 2.949201741654572,
"grad_norm": 0.5040282345086635,
"learning_rate": 6.302608106577568e-08,
"loss": 1.3383,
"step": 1016
},
{
"epoch": 2.9521044992743106,
"grad_norm": 0.4922869444184136,
"learning_rate": 5.539542616715965e-08,
"loss": 1.3398,
"step": 1017
},
{
"epoch": 2.9550072568940493,
"grad_norm": 0.5254172844858651,
"learning_rate": 4.825671996370185e-08,
"loss": 1.3326,
"step": 1018
},
{
"epoch": 2.957910014513788,
"grad_norm": 0.5055834915626891,
"learning_rate": 4.161003277085573e-08,
"loss": 1.4167,
"step": 1019
},
{
"epoch": 2.9608127721335267,
"grad_norm": 0.5058511618232302,
"learning_rate": 3.545543005773477e-08,
"loss": 1.4244,
"step": 1020
},
{
"epoch": 2.963715529753266,
"grad_norm": 0.506944487317272,
"learning_rate": 2.9792972446479605e-08,
"loss": 1.272,
"step": 1021
},
{
"epoch": 2.9666182873730045,
"grad_norm": 0.5065487926302941,
"learning_rate": 2.4622715711647426e-08,
"loss": 1.4038,
"step": 1022
},
{
"epoch": 2.969521044992743,
"grad_norm": 0.5061289548587645,
"learning_rate": 1.994471077969573e-08,
"loss": 1.3135,
"step": 1023
},
{
"epoch": 2.972423802612482,
"grad_norm": 0.5227174630028666,
"learning_rate": 1.5759003728427202e-08,
"loss": 1.4317,
"step": 1024
},
{
"epoch": 2.9753265602322205,
"grad_norm": 0.5063524748843257,
"learning_rate": 1.2065635786595586e-08,
"loss": 1.3438,
"step": 1025
},
{
"epoch": 2.978229317851959,
"grad_norm": 0.5228174437595524,
"learning_rate": 8.864643333450495e-09,
"loss": 1.3984,
"step": 1026
},
{
"epoch": 2.981132075471698,
"grad_norm": 0.5154756775417685,
"learning_rate": 6.156057898398793e-09,
"loss": 1.2921,
"step": 1027
},
{
"epoch": 2.984034833091437,
"grad_norm": 0.4926348240472133,
"learning_rate": 3.9399061607103825e-09,
"loss": 1.3304,
"step": 1028
},
{
"epoch": 2.9869375907111757,
"grad_norm": 0.5096302536210523,
"learning_rate": 2.216209949229553e-09,
"loss": 1.4275,
"step": 1029
},
{
"epoch": 2.9898403483309144,
"grad_norm": 0.5062257490124061,
"learning_rate": 9.84986242158481e-10,
"loss": 1.4537,
"step": 1030
},
{
"epoch": 2.992743105950653,
"grad_norm": 0.506160269859511,
"learning_rate": 2.462471669184563e-10,
"loss": 1.3882,
"step": 1031
},
{
"epoch": 2.9956458635703918,
"grad_norm": 0.5147092377641741,
"learning_rate": 0.0,
"loss": 1.3799,
"step": 1032
},
{
"epoch": 2.9956458635703918,
"step": 1032,
"total_flos": 1335874505801728.0,
"train_loss": 1.5377236583898233,
"train_runtime": 12027.3057,
"train_samples_per_second": 2.747,
"train_steps_per_second": 0.086
}
],
"logging_steps": 1.0,
"max_steps": 1032,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1335874505801728.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}