{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05842054222052586, "eval_steps": 500, "global_step": 30000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.9210271110262932e-05, "grad_norm": 5.397985935211182, "learning_rate": 2.9999999936841446e-05, "loss": 2.1156, "step": 15 }, { "epoch": 5.8420542220525864e-05, "grad_norm": 3.999190092086792, "learning_rate": 2.999999974736578e-05, "loss": 2.5268, "step": 30 }, { "epoch": 8.76308133307888e-05, "grad_norm": 5.580333709716797, "learning_rate": 2.999999943157301e-05, "loss": 2.3955, "step": 45 }, { "epoch": 0.00011684108444105173, "grad_norm": 3.6752560138702393, "learning_rate": 2.9999998989463132e-05, "loss": 2.171, "step": 60 }, { "epoch": 0.00014605135555131467, "grad_norm": 2.655461072921753, "learning_rate": 2.9999998421036153e-05, "loss": 2.188, "step": 75 }, { "epoch": 0.0001752616266615776, "grad_norm": 2.572371006011963, "learning_rate": 2.9999997726292083e-05, "loss": 2.1977, "step": 90 }, { "epoch": 0.00020447189777184053, "grad_norm": 8.544811248779297, "learning_rate": 2.999999690523092e-05, "loss": 2.1871, "step": 105 }, { "epoch": 0.00023368216888210345, "grad_norm": 3.530529737472534, "learning_rate": 2.999999595785267e-05, "loss": 2.2238, "step": 120 }, { "epoch": 0.00026289243999236636, "grad_norm": 3.671182155609131, "learning_rate": 2.9999994884157345e-05, "loss": 2.1338, "step": 135 }, { "epoch": 0.00029210271110262934, "grad_norm": 3.9643137454986572, "learning_rate": 2.9999993684144956e-05, "loss": 2.0788, "step": 150 }, { "epoch": 0.00032131298221289227, "grad_norm": 2.62164044380188, "learning_rate": 2.9999992357815508e-05, "loss": 2.117, "step": 165 }, { "epoch": 0.0003505232533231552, "grad_norm": 4.189517021179199, "learning_rate": 2.999999090516902e-05, "loss": 2.262, "step": 180 }, { "epoch": 0.0003797335244334181, "grad_norm": 2.880622625350952, "learning_rate": 2.9999989326205494e-05, "loss": 2.1102, "step": 195 }, { "epoch": 0.00040894379554368105, "grad_norm": 5.852839469909668, "learning_rate": 2.9999987620924948e-05, "loss": 2.0684, "step": 210 }, { "epoch": 0.000438154066653944, "grad_norm": 3.6339924335479736, "learning_rate": 2.9999985789327394e-05, "loss": 2.2518, "step": 225 }, { "epoch": 0.0004673643377642069, "grad_norm": 2.743267297744751, "learning_rate": 2.9999983831412858e-05, "loss": 2.1236, "step": 240 }, { "epoch": 0.0004965746088744698, "grad_norm": 3.9490363597869873, "learning_rate": 2.9999981747181345e-05, "loss": 2.0878, "step": 255 }, { "epoch": 0.0005257848799847327, "grad_norm": 3.0342047214508057, "learning_rate": 2.9999979536632872e-05, "loss": 2.1415, "step": 270 }, { "epoch": 0.0005549951510949957, "grad_norm": 4.045855522155762, "learning_rate": 2.9999977199767467e-05, "loss": 2.0558, "step": 285 }, { "epoch": 0.0005842054222052587, "grad_norm": 4.325068950653076, "learning_rate": 2.999997473658514e-05, "loss": 2.0617, "step": 300 }, { "epoch": 0.0006134156933155216, "grad_norm": 4.770871162414551, "learning_rate": 2.999997214708592e-05, "loss": 2.0922, "step": 315 }, { "epoch": 0.0006426259644257845, "grad_norm": 4.1802754402160645, "learning_rate": 2.999996943126982e-05, "loss": 2.217, "step": 330 }, { "epoch": 0.0006718362355360475, "grad_norm": 3.044260263442993, "learning_rate": 2.999996658913687e-05, "loss": 2.0508, "step": 345 }, { "epoch": 0.0007010465066463104, "grad_norm": 4.1525187492370605, "learning_rate": 2.9999963620687095e-05, "loss": 2.1678, "step": 360 }, { "epoch": 0.0007302567777565733, "grad_norm": 2.5676231384277344, "learning_rate": 2.9999960525920515e-05, "loss": 2.1693, "step": 375 }, { "epoch": 0.0007594670488668362, "grad_norm": 3.880307674407959, "learning_rate": 2.9999957304837156e-05, "loss": 1.9008, "step": 390 }, { "epoch": 0.0007886773199770992, "grad_norm": 3.3433754444122314, "learning_rate": 2.999995395743705e-05, "loss": 2.0734, "step": 405 }, { "epoch": 0.0008178875910873621, "grad_norm": 3.451378583908081, "learning_rate": 2.999995048372022e-05, "loss": 1.9901, "step": 420 }, { "epoch": 0.000847097862197625, "grad_norm": 3.4411580562591553, "learning_rate": 2.9999946883686695e-05, "loss": 1.9734, "step": 435 }, { "epoch": 0.000876308133307888, "grad_norm": 3.514651298522949, "learning_rate": 2.999994315733651e-05, "loss": 2.1632, "step": 450 }, { "epoch": 0.0009055184044181509, "grad_norm": 5.380824565887451, "learning_rate": 2.999993930466969e-05, "loss": 2.1534, "step": 465 }, { "epoch": 0.0009347286755284138, "grad_norm": 3.768618106842041, "learning_rate": 2.999993532568628e-05, "loss": 2.2182, "step": 480 }, { "epoch": 0.0009639389466386767, "grad_norm": 2.9490509033203125, "learning_rate": 2.99999312203863e-05, "loss": 1.9677, "step": 495 }, { "epoch": 0.0009931492177489396, "grad_norm": 3.4881184101104736, "learning_rate": 2.9999926988769788e-05, "loss": 2.0786, "step": 510 }, { "epoch": 0.0010223594888592025, "grad_norm": 3.788675546646118, "learning_rate": 2.9999922630836784e-05, "loss": 1.9696, "step": 525 }, { "epoch": 0.0010515697599694654, "grad_norm": 3.070878744125366, "learning_rate": 2.999991814658732e-05, "loss": 1.9053, "step": 540 }, { "epoch": 0.0010807800310797286, "grad_norm": 2.456150531768799, "learning_rate": 2.9999913536021436e-05, "loss": 1.9813, "step": 555 }, { "epoch": 0.0011099903021899915, "grad_norm": 4.81670618057251, "learning_rate": 2.999990879913917e-05, "loss": 1.9317, "step": 570 }, { "epoch": 0.0011392005733002544, "grad_norm": 4.014492988586426, "learning_rate": 2.9999903935940567e-05, "loss": 1.9709, "step": 585 }, { "epoch": 0.0011684108444105174, "grad_norm": 4.765859603881836, "learning_rate": 2.999989894642566e-05, "loss": 1.9252, "step": 600 }, { "epoch": 0.0011976211155207803, "grad_norm": 3.5599727630615234, "learning_rate": 2.9999893830594492e-05, "loss": 2.0175, "step": 615 }, { "epoch": 0.0012268313866310432, "grad_norm": 4.441170692443848, "learning_rate": 2.999988858844711e-05, "loss": 2.1166, "step": 630 }, { "epoch": 0.0012560416577413061, "grad_norm": 2.691765308380127, "learning_rate": 2.9999883219983558e-05, "loss": 1.991, "step": 645 }, { "epoch": 0.001285251928851569, "grad_norm": 4.039743423461914, "learning_rate": 2.9999877725203878e-05, "loss": 1.9917, "step": 660 }, { "epoch": 0.001314462199961832, "grad_norm": 3.0612881183624268, "learning_rate": 2.9999872104108115e-05, "loss": 2.0436, "step": 675 }, { "epoch": 0.001343672471072095, "grad_norm": 3.3579518795013428, "learning_rate": 2.9999866356696326e-05, "loss": 1.9659, "step": 690 }, { "epoch": 0.0013728827421823579, "grad_norm": 4.145946502685547, "learning_rate": 2.9999860482968552e-05, "loss": 2.0267, "step": 705 }, { "epoch": 0.0014020930132926208, "grad_norm": 5.213028430938721, "learning_rate": 2.999985448292484e-05, "loss": 2.0953, "step": 720 }, { "epoch": 0.0014313032844028837, "grad_norm": 2.440793752670288, "learning_rate": 2.9999848356565246e-05, "loss": 2.0218, "step": 735 }, { "epoch": 0.0014605135555131466, "grad_norm": 2.6167280673980713, "learning_rate": 2.9999842103889813e-05, "loss": 2.0638, "step": 750 }, { "epoch": 0.0014897238266234096, "grad_norm": 2.2680811882019043, "learning_rate": 2.999983572489861e-05, "loss": 1.973, "step": 765 }, { "epoch": 0.0015189340977336725, "grad_norm": 2.905947208404541, "learning_rate": 2.999982921959167e-05, "loss": 1.9346, "step": 780 }, { "epoch": 0.0015481443688439354, "grad_norm": 4.673079967498779, "learning_rate": 2.999982258796907e-05, "loss": 2.1666, "step": 795 }, { "epoch": 0.0015773546399541984, "grad_norm": 2.5963408946990967, "learning_rate": 2.9999815830030846e-05, "loss": 2.1063, "step": 810 }, { "epoch": 0.0016065649110644613, "grad_norm": 3.656632661819458, "learning_rate": 2.9999808945777066e-05, "loss": 2.1328, "step": 825 }, { "epoch": 0.0016357751821747242, "grad_norm": 4.4109368324279785, "learning_rate": 2.9999801935207786e-05, "loss": 1.9602, "step": 840 }, { "epoch": 0.0016649854532849871, "grad_norm": 2.10768985748291, "learning_rate": 2.9999794798323065e-05, "loss": 1.9766, "step": 855 }, { "epoch": 0.00169419572439525, "grad_norm": 4.301672458648682, "learning_rate": 2.999978753512296e-05, "loss": 1.9386, "step": 870 }, { "epoch": 0.001723405995505513, "grad_norm": 8.171302795410156, "learning_rate": 2.9999780145607538e-05, "loss": 2.1288, "step": 885 }, { "epoch": 0.001752616266615776, "grad_norm": 3.4784069061279297, "learning_rate": 2.999977262977685e-05, "loss": 2.1346, "step": 900 }, { "epoch": 0.0017818265377260389, "grad_norm": 2.9945054054260254, "learning_rate": 2.9999764987630976e-05, "loss": 2.0432, "step": 915 }, { "epoch": 0.0018110368088363018, "grad_norm": 3.632039785385132, "learning_rate": 2.9999757219169964e-05, "loss": 2.0642, "step": 930 }, { "epoch": 0.0018402470799465647, "grad_norm": 3.9847047328948975, "learning_rate": 2.999974932439389e-05, "loss": 2.016, "step": 945 }, { "epoch": 0.0018694573510568276, "grad_norm": 1.89451003074646, "learning_rate": 2.9999741303302816e-05, "loss": 2.0355, "step": 960 }, { "epoch": 0.0018986676221670906, "grad_norm": 6.945189952850342, "learning_rate": 2.9999733155896814e-05, "loss": 1.97, "step": 975 }, { "epoch": 0.0019278778932773535, "grad_norm": 3.347123384475708, "learning_rate": 2.9999724882175947e-05, "loss": 2.0421, "step": 990 }, { "epoch": 0.0019570881643876164, "grad_norm": 4.331577777862549, "learning_rate": 2.9999716482140292e-05, "loss": 2.133, "step": 1005 }, { "epoch": 0.001986298435497879, "grad_norm": 4.221049785614014, "learning_rate": 2.999970795578991e-05, "loss": 1.9272, "step": 1020 }, { "epoch": 0.0020155087066081423, "grad_norm": 2.820549488067627, "learning_rate": 2.9999699303124876e-05, "loss": 2.0116, "step": 1035 }, { "epoch": 0.002044718977718405, "grad_norm": 2.2391343116760254, "learning_rate": 2.9999690524145268e-05, "loss": 1.9337, "step": 1050 }, { "epoch": 0.002073929248828668, "grad_norm": 2.285027503967285, "learning_rate": 2.9999681618851156e-05, "loss": 1.9773, "step": 1065 }, { "epoch": 0.002103139519938931, "grad_norm": 3.266909599304199, "learning_rate": 2.9999672587242616e-05, "loss": 1.9471, "step": 1080 }, { "epoch": 0.002132349791049194, "grad_norm": 5.03222131729126, "learning_rate": 2.9999663429319722e-05, "loss": 2.1337, "step": 1095 }, { "epoch": 0.002161560062159457, "grad_norm": 2.044246196746826, "learning_rate": 2.9999654145082552e-05, "loss": 2.06, "step": 1110 }, { "epoch": 0.00219077033326972, "grad_norm": 3.1399905681610107, "learning_rate": 2.9999644734531183e-05, "loss": 1.9174, "step": 1125 }, { "epoch": 0.002219980604379983, "grad_norm": 3.2445454597473145, "learning_rate": 2.99996351976657e-05, "loss": 1.9728, "step": 1140 }, { "epoch": 0.0022491908754902457, "grad_norm": 2.3682138919830322, "learning_rate": 2.999962553448618e-05, "loss": 1.9779, "step": 1155 }, { "epoch": 0.002278401146600509, "grad_norm": 2.283808708190918, "learning_rate": 2.99996157449927e-05, "loss": 1.964, "step": 1170 }, { "epoch": 0.0023076114177107716, "grad_norm": 3.048382043838501, "learning_rate": 2.9999605829185353e-05, "loss": 2.0908, "step": 1185 }, { "epoch": 0.0023368216888210347, "grad_norm": 6.139120101928711, "learning_rate": 2.999959578706421e-05, "loss": 1.9535, "step": 1200 }, { "epoch": 0.0023660319599312974, "grad_norm": 2.104069232940674, "learning_rate": 2.9999585618629363e-05, "loss": 1.8775, "step": 1215 }, { "epoch": 0.0023952422310415606, "grad_norm": 2.531449317932129, "learning_rate": 2.9999575323880894e-05, "loss": 2.0309, "step": 1230 }, { "epoch": 0.0024244525021518233, "grad_norm": 2.841078996658325, "learning_rate": 2.9999564902818894e-05, "loss": 2.0546, "step": 1245 }, { "epoch": 0.0024536627732620864, "grad_norm": 2.0627243518829346, "learning_rate": 2.9999554355443445e-05, "loss": 1.8833, "step": 1260 }, { "epoch": 0.002482873044372349, "grad_norm": 3.4014151096343994, "learning_rate": 2.9999543681754645e-05, "loss": 1.8935, "step": 1275 }, { "epoch": 0.0025120833154826123, "grad_norm": 3.507380962371826, "learning_rate": 2.999953288175257e-05, "loss": 1.9512, "step": 1290 }, { "epoch": 0.002541293586592875, "grad_norm": 5.4273362159729, "learning_rate": 2.9999521955437325e-05, "loss": 1.9794, "step": 1305 }, { "epoch": 0.002570503857703138, "grad_norm": 4.466185092926025, "learning_rate": 2.9999510902808993e-05, "loss": 1.8266, "step": 1320 }, { "epoch": 0.002599714128813401, "grad_norm": 2.4276440143585205, "learning_rate": 2.9999499723867672e-05, "loss": 2.0149, "step": 1335 }, { "epoch": 0.002628924399923664, "grad_norm": 3.589876413345337, "learning_rate": 2.9999488418613454e-05, "loss": 1.8865, "step": 1350 }, { "epoch": 0.0026581346710339267, "grad_norm": 3.750687599182129, "learning_rate": 2.9999476987046434e-05, "loss": 2.13, "step": 1365 }, { "epoch": 0.00268734494214419, "grad_norm": 3.638850212097168, "learning_rate": 2.999946542916671e-05, "loss": 1.981, "step": 1380 }, { "epoch": 0.0027165552132544526, "grad_norm": 4.610542297363281, "learning_rate": 2.9999453744974375e-05, "loss": 1.8887, "step": 1395 }, { "epoch": 0.0027457654843647157, "grad_norm": 2.3758933544158936, "learning_rate": 2.9999441934469534e-05, "loss": 1.7867, "step": 1410 }, { "epoch": 0.0027749757554749784, "grad_norm": 1.9892805814743042, "learning_rate": 2.999942999765228e-05, "loss": 1.8632, "step": 1425 }, { "epoch": 0.0028041860265852416, "grad_norm": 2.5921459197998047, "learning_rate": 2.999941793452272e-05, "loss": 1.9155, "step": 1440 }, { "epoch": 0.0028333962976955043, "grad_norm": 6.098895072937012, "learning_rate": 2.9999405745080948e-05, "loss": 2.0112, "step": 1455 }, { "epoch": 0.0028626065688057674, "grad_norm": 2.7144930362701416, "learning_rate": 2.9999393429327073e-05, "loss": 2.0019, "step": 1470 }, { "epoch": 0.00289181683991603, "grad_norm": 2.3964602947235107, "learning_rate": 2.9999380987261195e-05, "loss": 1.9524, "step": 1485 }, { "epoch": 0.0029210271110262933, "grad_norm": 3.2644741535186768, "learning_rate": 2.9999368418883422e-05, "loss": 1.881, "step": 1500 }, { "epoch": 0.002950237382136556, "grad_norm": 3.0930943489074707, "learning_rate": 2.9999355724193854e-05, "loss": 2.0001, "step": 1515 }, { "epoch": 0.002979447653246819, "grad_norm": 2.6339657306671143, "learning_rate": 2.9999342903192608e-05, "loss": 1.9344, "step": 1530 }, { "epoch": 0.003008657924357082, "grad_norm": 3.7254459857940674, "learning_rate": 2.999932995587978e-05, "loss": 2.0673, "step": 1545 }, { "epoch": 0.003037868195467345, "grad_norm": 2.7078473567962646, "learning_rate": 2.999931688225549e-05, "loss": 2.0223, "step": 1560 }, { "epoch": 0.0030670784665776077, "grad_norm": 2.7792718410491943, "learning_rate": 2.999930368231984e-05, "loss": 1.902, "step": 1575 }, { "epoch": 0.003096288737687871, "grad_norm": 4.607854843139648, "learning_rate": 2.999929035607294e-05, "loss": 1.9536, "step": 1590 }, { "epoch": 0.0031254990087981336, "grad_norm": 2.4656715393066406, "learning_rate": 2.999927690351491e-05, "loss": 2.1792, "step": 1605 }, { "epoch": 0.0031547092799083967, "grad_norm": 2.8231770992279053, "learning_rate": 2.9999263324645863e-05, "loss": 1.8908, "step": 1620 }, { "epoch": 0.0031839195510186594, "grad_norm": 3.5741395950317383, "learning_rate": 2.9999249619465904e-05, "loss": 1.9505, "step": 1635 }, { "epoch": 0.0032131298221289226, "grad_norm": 2.0066606998443604, "learning_rate": 2.999923578797516e-05, "loss": 2.0346, "step": 1650 }, { "epoch": 0.0032423400932391853, "grad_norm": 2.5830793380737305, "learning_rate": 2.999922183017374e-05, "loss": 1.9546, "step": 1665 }, { "epoch": 0.0032715503643494484, "grad_norm": 2.8811633586883545, "learning_rate": 2.999920774606176e-05, "loss": 1.8517, "step": 1680 }, { "epoch": 0.003300760635459711, "grad_norm": 3.0571601390838623, "learning_rate": 2.999919353563935e-05, "loss": 1.8746, "step": 1695 }, { "epoch": 0.0033299709065699743, "grad_norm": 2.927371025085449, "learning_rate": 2.9999179198906614e-05, "loss": 2.0249, "step": 1710 }, { "epoch": 0.003359181177680237, "grad_norm": 2.425579071044922, "learning_rate": 2.9999164735863685e-05, "loss": 1.9149, "step": 1725 }, { "epoch": 0.0033883914487905, "grad_norm": 3.0711071491241455, "learning_rate": 2.9999150146510678e-05, "loss": 1.8335, "step": 1740 }, { "epoch": 0.003417601719900763, "grad_norm": 4.509474277496338, "learning_rate": 2.9999135430847718e-05, "loss": 2.0002, "step": 1755 }, { "epoch": 0.003446811991011026, "grad_norm": 2.784403085708618, "learning_rate": 2.9999120588874927e-05, "loss": 1.985, "step": 1770 }, { "epoch": 0.0034760222621212887, "grad_norm": 4.556781768798828, "learning_rate": 2.9999105620592434e-05, "loss": 1.8832, "step": 1785 }, { "epoch": 0.003505232533231552, "grad_norm": 3.1926143169403076, "learning_rate": 2.9999090526000364e-05, "loss": 2.0136, "step": 1800 }, { "epoch": 0.0035344428043418146, "grad_norm": 3.627634286880493, "learning_rate": 2.9999075305098846e-05, "loss": 2.0413, "step": 1815 }, { "epoch": 0.0035636530754520777, "grad_norm": 4.250141620635986, "learning_rate": 2.9999059957887998e-05, "loss": 1.9076, "step": 1830 }, { "epoch": 0.0035928633465623404, "grad_norm": 4.263555526733398, "learning_rate": 2.999904448436796e-05, "loss": 1.9782, "step": 1845 }, { "epoch": 0.0036220736176726036, "grad_norm": 2.5871047973632812, "learning_rate": 2.999902888453886e-05, "loss": 1.9703, "step": 1860 }, { "epoch": 0.0036512838887828663, "grad_norm": 2.2850048542022705, "learning_rate": 2.9999013158400827e-05, "loss": 1.9571, "step": 1875 }, { "epoch": 0.0036804941598931294, "grad_norm": 2.419032096862793, "learning_rate": 2.9998997305953993e-05, "loss": 2.1583, "step": 1890 }, { "epoch": 0.003709704431003392, "grad_norm": 3.190084934234619, "learning_rate": 2.9998981327198497e-05, "loss": 1.9732, "step": 1905 }, { "epoch": 0.0037389147021136553, "grad_norm": 3.31540846824646, "learning_rate": 2.9998965222134468e-05, "loss": 1.8646, "step": 1920 }, { "epoch": 0.003768124973223918, "grad_norm": 2.423227310180664, "learning_rate": 2.9998948990762044e-05, "loss": 1.9282, "step": 1935 }, { "epoch": 0.003797335244334181, "grad_norm": 3.4635472297668457, "learning_rate": 2.999893263308136e-05, "loss": 1.9859, "step": 1950 }, { "epoch": 0.003826545515444444, "grad_norm": 2.474353790283203, "learning_rate": 2.9998916149092556e-05, "loss": 2.0055, "step": 1965 }, { "epoch": 0.003855755786554707, "grad_norm": 3.264435052871704, "learning_rate": 2.9998899538795766e-05, "loss": 1.9755, "step": 1980 }, { "epoch": 0.0038849660576649697, "grad_norm": 2.028916358947754, "learning_rate": 2.9998882802191137e-05, "loss": 1.9914, "step": 1995 }, { "epoch": 0.003914176328775233, "grad_norm": 1.9189203977584839, "learning_rate": 2.9998865939278805e-05, "loss": 1.8109, "step": 2010 }, { "epoch": 0.0039433865998854956, "grad_norm": 3.143857955932617, "learning_rate": 2.9998848950058913e-05, "loss": 1.8233, "step": 2025 }, { "epoch": 0.003972596870995758, "grad_norm": 2.662853479385376, "learning_rate": 2.9998831834531608e-05, "loss": 2.0015, "step": 2040 }, { "epoch": 0.004001807142106022, "grad_norm": 2.0305709838867188, "learning_rate": 2.9998814592697027e-05, "loss": 1.8817, "step": 2055 }, { "epoch": 0.0040310174132162846, "grad_norm": 4.386982440948486, "learning_rate": 2.999879722455532e-05, "loss": 1.8511, "step": 2070 }, { "epoch": 0.004060227684326547, "grad_norm": 3.3638317584991455, "learning_rate": 2.9998779730106633e-05, "loss": 1.9738, "step": 2085 }, { "epoch": 0.00408943795543681, "grad_norm": 3.6872494220733643, "learning_rate": 2.9998762109351107e-05, "loss": 1.8496, "step": 2100 }, { "epoch": 0.0041186482265470736, "grad_norm": 3.2654178142547607, "learning_rate": 2.9998744362288902e-05, "loss": 1.9003, "step": 2115 }, { "epoch": 0.004147858497657336, "grad_norm": 3.4839179515838623, "learning_rate": 2.9998726488920162e-05, "loss": 2.016, "step": 2130 }, { "epoch": 0.004177068768767599, "grad_norm": 2.537320613861084, "learning_rate": 2.9998708489245034e-05, "loss": 2.0073, "step": 2145 }, { "epoch": 0.004206279039877862, "grad_norm": 3.2071378231048584, "learning_rate": 2.999869036326367e-05, "loss": 1.9835, "step": 2160 }, { "epoch": 0.004235489310988125, "grad_norm": 2.7338645458221436, "learning_rate": 2.999867211097623e-05, "loss": 1.8995, "step": 2175 }, { "epoch": 0.004264699582098388, "grad_norm": 4.866488933563232, "learning_rate": 2.999865373238286e-05, "loss": 1.9656, "step": 2190 }, { "epoch": 0.004293909853208651, "grad_norm": 2.67726731300354, "learning_rate": 2.9998635227483715e-05, "loss": 2.0401, "step": 2205 }, { "epoch": 0.004323120124318914, "grad_norm": 3.1271047592163086, "learning_rate": 2.9998616596278955e-05, "loss": 2.0406, "step": 2220 }, { "epoch": 0.004352330395429177, "grad_norm": 5.673630237579346, "learning_rate": 2.9998597838768738e-05, "loss": 1.9733, "step": 2235 }, { "epoch": 0.00438154066653944, "grad_norm": 2.696476697921753, "learning_rate": 2.9998578954953216e-05, "loss": 2.16, "step": 2250 }, { "epoch": 0.004410750937649702, "grad_norm": 5.409951686859131, "learning_rate": 2.9998559944832553e-05, "loss": 1.9401, "step": 2265 }, { "epoch": 0.004439961208759966, "grad_norm": 2.183530330657959, "learning_rate": 2.9998540808406903e-05, "loss": 1.9405, "step": 2280 }, { "epoch": 0.004469171479870229, "grad_norm": 3.3076109886169434, "learning_rate": 2.9998521545676438e-05, "loss": 1.9617, "step": 2295 }, { "epoch": 0.004498381750980491, "grad_norm": 2.793837547302246, "learning_rate": 2.999850215664131e-05, "loss": 1.9463, "step": 2310 }, { "epoch": 0.004527592022090754, "grad_norm": 2.6732029914855957, "learning_rate": 2.9998482641301687e-05, "loss": 1.9114, "step": 2325 }, { "epoch": 0.004556802293201018, "grad_norm": 2.1354994773864746, "learning_rate": 2.999846299965773e-05, "loss": 2.0321, "step": 2340 }, { "epoch": 0.00458601256431128, "grad_norm": 3.889298439025879, "learning_rate": 2.9998443231709608e-05, "loss": 2.0722, "step": 2355 }, { "epoch": 0.004615222835421543, "grad_norm": 3.6255850791931152, "learning_rate": 2.9998423337457486e-05, "loss": 2.0355, "step": 2370 }, { "epoch": 0.004644433106531806, "grad_norm": 2.4627747535705566, "learning_rate": 2.9998403316901533e-05, "loss": 2.0076, "step": 2385 }, { "epoch": 0.004673643377642069, "grad_norm": 3.3934834003448486, "learning_rate": 2.9998383170041916e-05, "loss": 1.9877, "step": 2400 }, { "epoch": 0.004702853648752332, "grad_norm": 2.2475860118865967, "learning_rate": 2.99983628968788e-05, "loss": 1.8463, "step": 2415 }, { "epoch": 0.004732063919862595, "grad_norm": 2.8804898262023926, "learning_rate": 2.9998342497412365e-05, "loss": 2.0943, "step": 2430 }, { "epoch": 0.0047612741909728576, "grad_norm": 2.2566545009613037, "learning_rate": 2.999832197164278e-05, "loss": 1.8834, "step": 2445 }, { "epoch": 0.004790484462083121, "grad_norm": 2.948420524597168, "learning_rate": 2.9998301319570216e-05, "loss": 2.002, "step": 2460 }, { "epoch": 0.004819694733193384, "grad_norm": 3.6838879585266113, "learning_rate": 2.999828054119484e-05, "loss": 1.9071, "step": 2475 }, { "epoch": 0.0048489050043036466, "grad_norm": 2.2608137130737305, "learning_rate": 2.9998259636516845e-05, "loss": 1.9241, "step": 2490 }, { "epoch": 0.004878115275413909, "grad_norm": 2.5766665935516357, "learning_rate": 2.999823860553639e-05, "loss": 1.9269, "step": 2505 }, { "epoch": 0.004907325546524173, "grad_norm": 3.382236957550049, "learning_rate": 2.9998217448253658e-05, "loss": 2.0165, "step": 2520 }, { "epoch": 0.0049365358176344356, "grad_norm": 2.600278854370117, "learning_rate": 2.999819616466883e-05, "loss": 1.8852, "step": 2535 }, { "epoch": 0.004965746088744698, "grad_norm": 2.8558318614959717, "learning_rate": 2.999817475478208e-05, "loss": 1.9202, "step": 2550 }, { "epoch": 0.004994956359854961, "grad_norm": 2.5992493629455566, "learning_rate": 2.9998153218593594e-05, "loss": 2.0815, "step": 2565 }, { "epoch": 0.0050241666309652246, "grad_norm": 3.0813794136047363, "learning_rate": 2.9998131556103545e-05, "loss": 2.0847, "step": 2580 }, { "epoch": 0.005053376902075487, "grad_norm": 3.12187123298645, "learning_rate": 2.999810976731213e-05, "loss": 1.8592, "step": 2595 }, { "epoch": 0.00508258717318575, "grad_norm": 3.4060111045837402, "learning_rate": 2.9998087852219514e-05, "loss": 1.8904, "step": 2610 }, { "epoch": 0.005111797444296013, "grad_norm": 1.8444907665252686, "learning_rate": 2.9998065810825895e-05, "loss": 2.0493, "step": 2625 }, { "epoch": 0.005141007715406276, "grad_norm": 2.9746735095977783, "learning_rate": 2.999804364313145e-05, "loss": 2.0021, "step": 2640 }, { "epoch": 0.005170217986516539, "grad_norm": 2.150517463684082, "learning_rate": 2.9998021349136373e-05, "loss": 1.8938, "step": 2655 }, { "epoch": 0.005199428257626802, "grad_norm": 2.4695417881011963, "learning_rate": 2.9997998928840854e-05, "loss": 1.956, "step": 2670 }, { "epoch": 0.005228638528737064, "grad_norm": 3.365466356277466, "learning_rate": 2.999797638224507e-05, "loss": 1.8762, "step": 2685 }, { "epoch": 0.005257848799847328, "grad_norm": 2.206486940383911, "learning_rate": 2.999795370934922e-05, "loss": 2.0777, "step": 2700 }, { "epoch": 0.005287059070957591, "grad_norm": 2.5338962078094482, "learning_rate": 2.9997930910153492e-05, "loss": 1.9524, "step": 2715 }, { "epoch": 0.005316269342067853, "grad_norm": 2.7835092544555664, "learning_rate": 2.999790798465808e-05, "loss": 1.9849, "step": 2730 }, { "epoch": 0.005345479613178116, "grad_norm": 3.5604777336120605, "learning_rate": 2.999788493286317e-05, "loss": 1.9226, "step": 2745 }, { "epoch": 0.00537468988428838, "grad_norm": 2.7719836235046387, "learning_rate": 2.9997861754768965e-05, "loss": 2.0174, "step": 2760 }, { "epoch": 0.005403900155398642, "grad_norm": 2.0660643577575684, "learning_rate": 2.999783845037566e-05, "loss": 2.0959, "step": 2775 }, { "epoch": 0.005433110426508905, "grad_norm": 2.501246690750122, "learning_rate": 2.9997815019683443e-05, "loss": 1.9407, "step": 2790 }, { "epoch": 0.005462320697619168, "grad_norm": 3.6545281410217285, "learning_rate": 2.9997791462692518e-05, "loss": 2.0155, "step": 2805 }, { "epoch": 0.005491530968729431, "grad_norm": 2.2655766010284424, "learning_rate": 2.9997767779403085e-05, "loss": 1.7096, "step": 2820 }, { "epoch": 0.005520741239839694, "grad_norm": 1.9551692008972168, "learning_rate": 2.9997743969815337e-05, "loss": 2.0597, "step": 2835 }, { "epoch": 0.005549951510949957, "grad_norm": 4.887934684753418, "learning_rate": 2.999772003392948e-05, "loss": 2.0615, "step": 2850 }, { "epoch": 0.0055791617820602196, "grad_norm": 3.462581157684326, "learning_rate": 2.999769597174571e-05, "loss": 1.9574, "step": 2865 }, { "epoch": 0.005608372053170483, "grad_norm": 4.445065498352051, "learning_rate": 2.9997671783264234e-05, "loss": 1.9514, "step": 2880 }, { "epoch": 0.005637582324280746, "grad_norm": 2.81374454498291, "learning_rate": 2.9997647468485254e-05, "loss": 1.9163, "step": 2895 }, { "epoch": 0.0056667925953910086, "grad_norm": 3.4871456623077393, "learning_rate": 2.999762302740898e-05, "loss": 1.966, "step": 2910 }, { "epoch": 0.005696002866501271, "grad_norm": 2.006155490875244, "learning_rate": 2.9997598460035608e-05, "loss": 2.009, "step": 2925 }, { "epoch": 0.005725213137611535, "grad_norm": 2.104846477508545, "learning_rate": 2.9997573766365353e-05, "loss": 1.9507, "step": 2940 }, { "epoch": 0.0057544234087217976, "grad_norm": 3.3199880123138428, "learning_rate": 2.999754894639842e-05, "loss": 1.9102, "step": 2955 }, { "epoch": 0.00578363367983206, "grad_norm": 3.4421074390411377, "learning_rate": 2.9997524000135015e-05, "loss": 1.9148, "step": 2970 }, { "epoch": 0.005812843950942323, "grad_norm": 3.9583024978637695, "learning_rate": 2.9997498927575352e-05, "loss": 1.9872, "step": 2985 }, { "epoch": 0.0058420542220525866, "grad_norm": 2.90704607963562, "learning_rate": 2.999747372871964e-05, "loss": 1.8581, "step": 3000 }, { "epoch": 0.005871264493162849, "grad_norm": 2.327897548675537, "learning_rate": 2.99974484035681e-05, "loss": 2.155, "step": 3015 }, { "epoch": 0.005900474764273112, "grad_norm": 2.7539587020874023, "learning_rate": 2.999742295212093e-05, "loss": 2.0412, "step": 3030 }, { "epoch": 0.005929685035383375, "grad_norm": 3.2530441284179688, "learning_rate": 2.999739737437835e-05, "loss": 1.8964, "step": 3045 }, { "epoch": 0.005958895306493638, "grad_norm": 4.038791179656982, "learning_rate": 2.9997371670340583e-05, "loss": 1.8992, "step": 3060 }, { "epoch": 0.005988105577603901, "grad_norm": 3.810509443283081, "learning_rate": 2.999734584000784e-05, "loss": 2.0157, "step": 3075 }, { "epoch": 0.006017315848714164, "grad_norm": 2.6701083183288574, "learning_rate": 2.9997319883380334e-05, "loss": 1.9206, "step": 3090 }, { "epoch": 0.006046526119824426, "grad_norm": 2.0933597087860107, "learning_rate": 2.999729380045829e-05, "loss": 1.8986, "step": 3105 }, { "epoch": 0.00607573639093469, "grad_norm": 7.184755802154541, "learning_rate": 2.9997267591241924e-05, "loss": 2.0928, "step": 3120 }, { "epoch": 0.006104946662044953, "grad_norm": 4.04668664932251, "learning_rate": 2.9997241255731465e-05, "loss": 2.1005, "step": 3135 }, { "epoch": 0.006134156933155215, "grad_norm": 2.4879837036132812, "learning_rate": 2.9997214793927122e-05, "loss": 2.0177, "step": 3150 }, { "epoch": 0.006163367204265478, "grad_norm": 2.1891894340515137, "learning_rate": 2.9997188205829127e-05, "loss": 1.827, "step": 3165 }, { "epoch": 0.006192577475375742, "grad_norm": 2.6386845111846924, "learning_rate": 2.9997161491437696e-05, "loss": 2.0118, "step": 3180 }, { "epoch": 0.006221787746486004, "grad_norm": 2.792214870452881, "learning_rate": 2.9997134650753066e-05, "loss": 1.7433, "step": 3195 }, { "epoch": 0.006250998017596267, "grad_norm": 3.470515727996826, "learning_rate": 2.999710768377545e-05, "loss": 2.0895, "step": 3210 }, { "epoch": 0.00628020828870653, "grad_norm": 3.3043479919433594, "learning_rate": 2.9997080590505085e-05, "loss": 2.0023, "step": 3225 }, { "epoch": 0.006309418559816793, "grad_norm": 2.1124866008758545, "learning_rate": 2.9997053370942195e-05, "loss": 2.0457, "step": 3240 }, { "epoch": 0.006338628830927056, "grad_norm": 3.1867144107818604, "learning_rate": 2.9997026025087004e-05, "loss": 1.9503, "step": 3255 }, { "epoch": 0.006367839102037319, "grad_norm": 2.3057665824890137, "learning_rate": 2.999699855293975e-05, "loss": 1.8968, "step": 3270 }, { "epoch": 0.0063970493731475815, "grad_norm": 2.416084051132202, "learning_rate": 2.999697095450066e-05, "loss": 1.8304, "step": 3285 }, { "epoch": 0.006426259644257845, "grad_norm": 2.481602191925049, "learning_rate": 2.9996943229769977e-05, "loss": 2.1386, "step": 3300 }, { "epoch": 0.006455469915368108, "grad_norm": 2.4862520694732666, "learning_rate": 2.9996915378747918e-05, "loss": 1.9672, "step": 3315 }, { "epoch": 0.0064846801864783706, "grad_norm": 3.704164743423462, "learning_rate": 2.999688740143473e-05, "loss": 1.9671, "step": 3330 }, { "epoch": 0.006513890457588634, "grad_norm": 4.340814113616943, "learning_rate": 2.999685929783064e-05, "loss": 1.889, "step": 3345 }, { "epoch": 0.006543100728698897, "grad_norm": 2.8202598094940186, "learning_rate": 2.999683106793589e-05, "loss": 1.883, "step": 3360 }, { "epoch": 0.0065723109998091596, "grad_norm": 2.3717639446258545, "learning_rate": 2.9996802711750716e-05, "loss": 1.8932, "step": 3375 }, { "epoch": 0.006601521270919422, "grad_norm": 2.1720776557922363, "learning_rate": 2.999677422927536e-05, "loss": 1.8262, "step": 3390 }, { "epoch": 0.006630731542029686, "grad_norm": 6.15664005279541, "learning_rate": 2.9996745620510055e-05, "loss": 1.9139, "step": 3405 }, { "epoch": 0.0066599418131399486, "grad_norm": 3.080832004547119, "learning_rate": 2.9996716885455047e-05, "loss": 1.9838, "step": 3420 }, { "epoch": 0.006689152084250211, "grad_norm": 2.1853530406951904, "learning_rate": 2.9996688024110577e-05, "loss": 1.9353, "step": 3435 }, { "epoch": 0.006718362355360474, "grad_norm": 2.609475612640381, "learning_rate": 2.9996659036476886e-05, "loss": 1.9734, "step": 3450 }, { "epoch": 0.0067475726264707376, "grad_norm": 3.808016061782837, "learning_rate": 2.9996629922554225e-05, "loss": 1.9872, "step": 3465 }, { "epoch": 0.006776782897581, "grad_norm": 4.373661994934082, "learning_rate": 2.999660068234283e-05, "loss": 1.8084, "step": 3480 }, { "epoch": 0.006805993168691263, "grad_norm": 4.299337387084961, "learning_rate": 2.9996571315842954e-05, "loss": 2.0801, "step": 3495 }, { "epoch": 0.006835203439801526, "grad_norm": 4.49350118637085, "learning_rate": 2.999654182305484e-05, "loss": 1.8882, "step": 3510 }, { "epoch": 0.006864413710911789, "grad_norm": 2.319342613220215, "learning_rate": 2.999651220397874e-05, "loss": 1.9627, "step": 3525 }, { "epoch": 0.006893623982022052, "grad_norm": 3.2572271823883057, "learning_rate": 2.99964824586149e-05, "loss": 1.9482, "step": 3540 }, { "epoch": 0.006922834253132315, "grad_norm": 1.7764968872070312, "learning_rate": 2.9996452586963575e-05, "loss": 1.8938, "step": 3555 }, { "epoch": 0.006952044524242577, "grad_norm": 2.2628538608551025, "learning_rate": 2.9996422589025007e-05, "loss": 1.9644, "step": 3570 }, { "epoch": 0.006981254795352841, "grad_norm": 3.8122570514678955, "learning_rate": 2.999639246479946e-05, "loss": 1.7568, "step": 3585 }, { "epoch": 0.007010465066463104, "grad_norm": 3.868978977203369, "learning_rate": 2.999636221428718e-05, "loss": 2.077, "step": 3600 }, { "epoch": 0.007039675337573366, "grad_norm": 4.426783561706543, "learning_rate": 2.999633183748843e-05, "loss": 1.8161, "step": 3615 }, { "epoch": 0.007068885608683629, "grad_norm": 2.3112306594848633, "learning_rate": 2.9996301334403456e-05, "loss": 1.8738, "step": 3630 }, { "epoch": 0.007098095879793893, "grad_norm": 1.7659751176834106, "learning_rate": 2.9996270705032523e-05, "loss": 1.9714, "step": 3645 }, { "epoch": 0.007127306150904155, "grad_norm": 5.0493364334106445, "learning_rate": 2.9996239949375882e-05, "loss": 1.9451, "step": 3660 }, { "epoch": 0.007156516422014418, "grad_norm": 1.6928982734680176, "learning_rate": 2.9996209067433794e-05, "loss": 1.7872, "step": 3675 }, { "epoch": 0.007185726693124681, "grad_norm": 2.2954142093658447, "learning_rate": 2.9996178059206525e-05, "loss": 2.0124, "step": 3690 }, { "epoch": 0.007214936964234944, "grad_norm": 4.649162292480469, "learning_rate": 2.9996146924694327e-05, "loss": 1.823, "step": 3705 }, { "epoch": 0.007244147235345207, "grad_norm": 2.248623847961426, "learning_rate": 2.9996115663897468e-05, "loss": 1.9382, "step": 3720 }, { "epoch": 0.00727335750645547, "grad_norm": 3.033177375793457, "learning_rate": 2.999608427681621e-05, "loss": 2.0324, "step": 3735 }, { "epoch": 0.0073025677775657325, "grad_norm": 3.4453887939453125, "learning_rate": 2.9996052763450817e-05, "loss": 1.9293, "step": 3750 }, { "epoch": 0.007331778048675996, "grad_norm": 3.873504877090454, "learning_rate": 2.9996021123801556e-05, "loss": 2.0132, "step": 3765 }, { "epoch": 0.007360988319786259, "grad_norm": 3.7370665073394775, "learning_rate": 2.999598935786869e-05, "loss": 1.8681, "step": 3780 }, { "epoch": 0.0073901985908965215, "grad_norm": 5.5719523429870605, "learning_rate": 2.999595746565249e-05, "loss": 1.8799, "step": 3795 }, { "epoch": 0.007419408862006784, "grad_norm": 3.7253758907318115, "learning_rate": 2.9995925447153226e-05, "loss": 1.8633, "step": 3810 }, { "epoch": 0.007448619133117048, "grad_norm": 2.792862892150879, "learning_rate": 2.9995893302371158e-05, "loss": 1.9642, "step": 3825 }, { "epoch": 0.0074778294042273106, "grad_norm": 2.693080425262451, "learning_rate": 2.999586103130657e-05, "loss": 1.8963, "step": 3840 }, { "epoch": 0.007507039675337573, "grad_norm": 2.7681262493133545, "learning_rate": 2.9995828633959724e-05, "loss": 2.0581, "step": 3855 }, { "epoch": 0.007536249946447836, "grad_norm": 3.635828733444214, "learning_rate": 2.9995796110330894e-05, "loss": 1.7659, "step": 3870 }, { "epoch": 0.0075654602175580996, "grad_norm": 2.46408748626709, "learning_rate": 2.9995763460420358e-05, "loss": 1.7966, "step": 3885 }, { "epoch": 0.007594670488668362, "grad_norm": 2.2896721363067627, "learning_rate": 2.999573068422839e-05, "loss": 1.9918, "step": 3900 }, { "epoch": 0.007623880759778625, "grad_norm": 2.3533968925476074, "learning_rate": 2.9995697781755262e-05, "loss": 1.8725, "step": 3915 }, { "epoch": 0.007653091030888888, "grad_norm": 2.5543251037597656, "learning_rate": 2.999566475300125e-05, "loss": 1.8896, "step": 3930 }, { "epoch": 0.007682301301999151, "grad_norm": 3.2201671600341797, "learning_rate": 2.999563159796665e-05, "loss": 1.8444, "step": 3945 }, { "epoch": 0.007711511573109414, "grad_norm": 2.2435178756713867, "learning_rate": 2.9995598316651713e-05, "loss": 1.976, "step": 3960 }, { "epoch": 0.007740721844219677, "grad_norm": 3.219825506210327, "learning_rate": 2.999556490905674e-05, "loss": 1.8561, "step": 3975 }, { "epoch": 0.007769932115329939, "grad_norm": 2.4510769844055176, "learning_rate": 2.9995531375182008e-05, "loss": 2.068, "step": 3990 }, { "epoch": 0.007799142386440203, "grad_norm": 4.242166519165039, "learning_rate": 2.999549771502779e-05, "loss": 1.8497, "step": 4005 }, { "epoch": 0.007828352657550466, "grad_norm": 2.9145870208740234, "learning_rate": 2.9995463928594383e-05, "loss": 1.8896, "step": 4020 }, { "epoch": 0.007857562928660728, "grad_norm": 2.33396315574646, "learning_rate": 2.9995430015882064e-05, "loss": 1.9502, "step": 4035 }, { "epoch": 0.007886773199770991, "grad_norm": 3.1073617935180664, "learning_rate": 2.9995395976891118e-05, "loss": 1.9178, "step": 4050 }, { "epoch": 0.007915983470881254, "grad_norm": 3.444310188293457, "learning_rate": 2.9995361811621838e-05, "loss": 1.7477, "step": 4065 }, { "epoch": 0.007945193741991517, "grad_norm": 3.131169080734253, "learning_rate": 2.9995327520074504e-05, "loss": 1.8799, "step": 4080 }, { "epoch": 0.007974404013101781, "grad_norm": 2.9107017517089844, "learning_rate": 2.9995293102249408e-05, "loss": 1.8785, "step": 4095 }, { "epoch": 0.008003614284212044, "grad_norm": 2.795280694961548, "learning_rate": 2.9995258558146834e-05, "loss": 2.0386, "step": 4110 }, { "epoch": 0.008032824555322306, "grad_norm": 5.588748455047607, "learning_rate": 2.9995223887767087e-05, "loss": 1.9852, "step": 4125 }, { "epoch": 0.008062034826432569, "grad_norm": 2.659045934677124, "learning_rate": 2.999518909111045e-05, "loss": 1.8707, "step": 4140 }, { "epoch": 0.008091245097542832, "grad_norm": 2.9934346675872803, "learning_rate": 2.9995154168177214e-05, "loss": 1.9004, "step": 4155 }, { "epoch": 0.008120455368653095, "grad_norm": 2.6598739624023438, "learning_rate": 2.9995119118967674e-05, "loss": 1.8897, "step": 4170 }, { "epoch": 0.008149665639763357, "grad_norm": 3.6846165657043457, "learning_rate": 2.9995083943482126e-05, "loss": 1.9932, "step": 4185 }, { "epoch": 0.00817887591087362, "grad_norm": 3.6916184425354004, "learning_rate": 2.9995048641720873e-05, "loss": 1.7672, "step": 4200 }, { "epoch": 0.008208086181983884, "grad_norm": 3.650599956512451, "learning_rate": 2.9995013213684202e-05, "loss": 1.9271, "step": 4215 }, { "epoch": 0.008237296453094147, "grad_norm": 4.196152687072754, "learning_rate": 2.999497765937242e-05, "loss": 1.8244, "step": 4230 }, { "epoch": 0.00826650672420441, "grad_norm": 3.1078038215637207, "learning_rate": 2.9994941978785817e-05, "loss": 1.893, "step": 4245 }, { "epoch": 0.008295716995314673, "grad_norm": 4.401791095733643, "learning_rate": 2.9994906171924703e-05, "loss": 1.8844, "step": 4260 }, { "epoch": 0.008324927266424935, "grad_norm": 2.6136245727539062, "learning_rate": 2.999487023878937e-05, "loss": 1.8981, "step": 4275 }, { "epoch": 0.008354137537535198, "grad_norm": 3.334519624710083, "learning_rate": 2.9994834179380134e-05, "loss": 1.9983, "step": 4290 }, { "epoch": 0.00838334780864546, "grad_norm": 1.8010269403457642, "learning_rate": 2.9994797993697283e-05, "loss": 1.8192, "step": 4305 }, { "epoch": 0.008412558079755723, "grad_norm": 3.2746548652648926, "learning_rate": 2.9994761681741135e-05, "loss": 1.9696, "step": 4320 }, { "epoch": 0.008441768350865988, "grad_norm": 2.172431468963623, "learning_rate": 2.9994725243511982e-05, "loss": 1.9068, "step": 4335 }, { "epoch": 0.00847097862197625, "grad_norm": 3.5535871982574463, "learning_rate": 2.999468867901015e-05, "loss": 1.985, "step": 4350 }, { "epoch": 0.008500188893086513, "grad_norm": 3.866422176361084, "learning_rate": 2.9994651988235923e-05, "loss": 1.8057, "step": 4365 }, { "epoch": 0.008529399164196776, "grad_norm": 4.770716190338135, "learning_rate": 2.999461517118963e-05, "loss": 1.9949, "step": 4380 }, { "epoch": 0.008558609435307039, "grad_norm": 2.9273180961608887, "learning_rate": 2.999457822787157e-05, "loss": 1.8686, "step": 4395 }, { "epoch": 0.008587819706417301, "grad_norm": 3.192166328430176, "learning_rate": 2.9994541158282063e-05, "loss": 1.9539, "step": 4410 }, { "epoch": 0.008617029977527564, "grad_norm": 3.507930040359497, "learning_rate": 2.9994503962421417e-05, "loss": 1.7732, "step": 4425 }, { "epoch": 0.008646240248637829, "grad_norm": 2.666705369949341, "learning_rate": 2.9994466640289938e-05, "loss": 1.9615, "step": 4440 }, { "epoch": 0.008675450519748091, "grad_norm": 2.640362501144409, "learning_rate": 2.999442919188795e-05, "loss": 1.89, "step": 4455 }, { "epoch": 0.008704660790858354, "grad_norm": 2.524216890335083, "learning_rate": 2.9994391617215765e-05, "loss": 2.0473, "step": 4470 }, { "epoch": 0.008733871061968617, "grad_norm": 2.6444146633148193, "learning_rate": 2.9994353916273696e-05, "loss": 1.8889, "step": 4485 }, { "epoch": 0.00876308133307888, "grad_norm": 2.4632787704467773, "learning_rate": 2.9994316089062068e-05, "loss": 1.9616, "step": 4500 }, { "epoch": 0.008792291604189142, "grad_norm": 5.182243347167969, "learning_rate": 2.999427813558119e-05, "loss": 1.8216, "step": 4515 }, { "epoch": 0.008821501875299405, "grad_norm": 3.6622776985168457, "learning_rate": 2.9994240055831395e-05, "loss": 1.8692, "step": 4530 }, { "epoch": 0.008850712146409668, "grad_norm": 2.899912118911743, "learning_rate": 2.9994201849812988e-05, "loss": 1.9363, "step": 4545 }, { "epoch": 0.008879922417519932, "grad_norm": 1.8940974473953247, "learning_rate": 2.99941635175263e-05, "loss": 1.8404, "step": 4560 }, { "epoch": 0.008909132688630195, "grad_norm": 3.581655263900757, "learning_rate": 2.9994125058971657e-05, "loss": 1.9623, "step": 4575 }, { "epoch": 0.008938342959740457, "grad_norm": 2.6804749965667725, "learning_rate": 2.9994086474149375e-05, "loss": 2.0692, "step": 4590 }, { "epoch": 0.00896755323085072, "grad_norm": 4.302793502807617, "learning_rate": 2.999404776305978e-05, "loss": 1.9728, "step": 4605 }, { "epoch": 0.008996763501960983, "grad_norm": 2.143483877182007, "learning_rate": 2.9994008925703202e-05, "loss": 1.941, "step": 4620 }, { "epoch": 0.009025973773071246, "grad_norm": 2.2785732746124268, "learning_rate": 2.9993969962079964e-05, "loss": 1.9828, "step": 4635 }, { "epoch": 0.009055184044181508, "grad_norm": 3.918194055557251, "learning_rate": 2.9993930872190398e-05, "loss": 1.9211, "step": 4650 }, { "epoch": 0.009084394315291771, "grad_norm": 2.7863261699676514, "learning_rate": 2.999389165603483e-05, "loss": 1.8564, "step": 4665 }, { "epoch": 0.009113604586402035, "grad_norm": 2.655966281890869, "learning_rate": 2.9993852313613596e-05, "loss": 1.9327, "step": 4680 }, { "epoch": 0.009142814857512298, "grad_norm": 2.987030506134033, "learning_rate": 2.999381284492702e-05, "loss": 1.8325, "step": 4695 }, { "epoch": 0.00917202512862256, "grad_norm": 3.127544641494751, "learning_rate": 2.9993773249975435e-05, "loss": 1.7246, "step": 4710 }, { "epoch": 0.009201235399732824, "grad_norm": 2.395202398300171, "learning_rate": 2.999373352875918e-05, "loss": 1.9827, "step": 4725 }, { "epoch": 0.009230445670843086, "grad_norm": 4.163525104522705, "learning_rate": 2.9993693681278582e-05, "loss": 1.9441, "step": 4740 }, { "epoch": 0.009259655941953349, "grad_norm": 2.84067964553833, "learning_rate": 2.9993653707533985e-05, "loss": 2.0252, "step": 4755 }, { "epoch": 0.009288866213063612, "grad_norm": 1.949599266052246, "learning_rate": 2.9993613607525717e-05, "loss": 1.9497, "step": 4770 }, { "epoch": 0.009318076484173874, "grad_norm": 4.164729595184326, "learning_rate": 2.9993573381254124e-05, "loss": 2.1013, "step": 4785 }, { "epoch": 0.009347286755284139, "grad_norm": 1.7438952922821045, "learning_rate": 2.9993533028719537e-05, "loss": 1.9461, "step": 4800 }, { "epoch": 0.009376497026394402, "grad_norm": 1.9006491899490356, "learning_rate": 2.9993492549922302e-05, "loss": 1.9273, "step": 4815 }, { "epoch": 0.009405707297504664, "grad_norm": 3.693070411682129, "learning_rate": 2.9993451944862762e-05, "loss": 1.9791, "step": 4830 }, { "epoch": 0.009434917568614927, "grad_norm": 2.9572765827178955, "learning_rate": 2.9993411213541248e-05, "loss": 1.7725, "step": 4845 }, { "epoch": 0.00946412783972519, "grad_norm": 2.882349967956543, "learning_rate": 2.999337035595811e-05, "loss": 1.8094, "step": 4860 }, { "epoch": 0.009493338110835452, "grad_norm": 3.9489054679870605, "learning_rate": 2.9993329372113695e-05, "loss": 1.8678, "step": 4875 }, { "epoch": 0.009522548381945715, "grad_norm": 2.7020881175994873, "learning_rate": 2.999328826200834e-05, "loss": 2.0528, "step": 4890 }, { "epoch": 0.009551758653055978, "grad_norm": 3.121814489364624, "learning_rate": 2.99932470256424e-05, "loss": 1.9087, "step": 4905 }, { "epoch": 0.009580968924166242, "grad_norm": 2.713003158569336, "learning_rate": 2.9993205663016218e-05, "loss": 1.8721, "step": 4920 }, { "epoch": 0.009610179195276505, "grad_norm": 3.5016112327575684, "learning_rate": 2.9993164174130137e-05, "loss": 1.8573, "step": 4935 }, { "epoch": 0.009639389466386768, "grad_norm": 4.257192611694336, "learning_rate": 2.9993122558984516e-05, "loss": 1.9638, "step": 4950 }, { "epoch": 0.00966859973749703, "grad_norm": 3.4689440727233887, "learning_rate": 2.9993080817579702e-05, "loss": 1.801, "step": 4965 }, { "epoch": 0.009697810008607293, "grad_norm": 2.3255503177642822, "learning_rate": 2.999303894991605e-05, "loss": 1.6792, "step": 4980 }, { "epoch": 0.009727020279717556, "grad_norm": 2.5599734783172607, "learning_rate": 2.9992996955993898e-05, "loss": 2.0037, "step": 4995 }, { "epoch": 0.009756230550827819, "grad_norm": 2.528571605682373, "learning_rate": 2.9992954835813616e-05, "loss": 1.9778, "step": 5010 }, { "epoch": 0.009785440821938081, "grad_norm": 3.466859817504883, "learning_rate": 2.999291258937555e-05, "loss": 1.9507, "step": 5025 }, { "epoch": 0.009814651093048346, "grad_norm": 2.9515936374664307, "learning_rate": 2.999287021668006e-05, "loss": 1.9506, "step": 5040 }, { "epoch": 0.009843861364158608, "grad_norm": 2.116895914077759, "learning_rate": 2.99928277177275e-05, "loss": 2.0403, "step": 5055 }, { "epoch": 0.009873071635268871, "grad_norm": 2.0982749462127686, "learning_rate": 2.999278509251823e-05, "loss": 1.981, "step": 5070 }, { "epoch": 0.009902281906379134, "grad_norm": 2.1464314460754395, "learning_rate": 2.9992742341052612e-05, "loss": 2.0126, "step": 5085 }, { "epoch": 0.009931492177489397, "grad_norm": 5.098091125488281, "learning_rate": 2.9992699463330995e-05, "loss": 1.8511, "step": 5100 }, { "epoch": 0.00996070244859966, "grad_norm": 3.8748703002929688, "learning_rate": 2.999265645935375e-05, "loss": 1.9058, "step": 5115 }, { "epoch": 0.009989912719709922, "grad_norm": 3.9060122966766357, "learning_rate": 2.999261332912124e-05, "loss": 1.9298, "step": 5130 }, { "epoch": 0.010019122990820185, "grad_norm": 3.6169393062591553, "learning_rate": 2.999257007263382e-05, "loss": 1.6655, "step": 5145 }, { "epoch": 0.010048333261930449, "grad_norm": 2.0531461238861084, "learning_rate": 2.999252668989186e-05, "loss": 2.0188, "step": 5160 }, { "epoch": 0.010077543533040712, "grad_norm": 2.554202079772949, "learning_rate": 2.9992483180895725e-05, "loss": 1.8039, "step": 5175 }, { "epoch": 0.010106753804150975, "grad_norm": 3.2061095237731934, "learning_rate": 2.9992439545645778e-05, "loss": 1.8889, "step": 5190 }, { "epoch": 0.010135964075261237, "grad_norm": 2.8294739723205566, "learning_rate": 2.9992395784142395e-05, "loss": 1.9291, "step": 5205 }, { "epoch": 0.0101651743463715, "grad_norm": 2.8096394538879395, "learning_rate": 2.9992351896385932e-05, "loss": 1.924, "step": 5220 }, { "epoch": 0.010194384617481763, "grad_norm": 3.7793962955474854, "learning_rate": 2.999230788237677e-05, "loss": 1.933, "step": 5235 }, { "epoch": 0.010223594888592025, "grad_norm": 5.307252407073975, "learning_rate": 2.999226374211527e-05, "loss": 1.9296, "step": 5250 }, { "epoch": 0.010252805159702288, "grad_norm": 3.207782506942749, "learning_rate": 2.9992219475601806e-05, "loss": 2.0676, "step": 5265 }, { "epoch": 0.010282015430812553, "grad_norm": 2.7217934131622314, "learning_rate": 2.9992175082836765e-05, "loss": 1.8675, "step": 5280 }, { "epoch": 0.010311225701922815, "grad_norm": 3.455260753631592, "learning_rate": 2.9992130563820497e-05, "loss": 1.924, "step": 5295 }, { "epoch": 0.010340435973033078, "grad_norm": 2.638262987136841, "learning_rate": 2.9992085918553393e-05, "loss": 1.9263, "step": 5310 }, { "epoch": 0.01036964624414334, "grad_norm": 3.1709418296813965, "learning_rate": 2.9992041147035828e-05, "loss": 1.8848, "step": 5325 }, { "epoch": 0.010398856515253603, "grad_norm": 2.1629397869110107, "learning_rate": 2.9991996249268175e-05, "loss": 1.9271, "step": 5340 }, { "epoch": 0.010428066786363866, "grad_norm": 4.2046284675598145, "learning_rate": 2.999195122525081e-05, "loss": 1.9208, "step": 5355 }, { "epoch": 0.010457277057474129, "grad_norm": 4.966402053833008, "learning_rate": 2.9991906074984116e-05, "loss": 1.8059, "step": 5370 }, { "epoch": 0.010486487328584392, "grad_norm": 3.0322844982147217, "learning_rate": 2.9991860798468473e-05, "loss": 1.9608, "step": 5385 }, { "epoch": 0.010515697599694656, "grad_norm": 2.3690319061279297, "learning_rate": 2.9991815395704266e-05, "loss": 1.9424, "step": 5400 }, { "epoch": 0.010544907870804919, "grad_norm": 2.81915545463562, "learning_rate": 2.9991769866691865e-05, "loss": 1.8246, "step": 5415 }, { "epoch": 0.010574118141915181, "grad_norm": 3.064317464828491, "learning_rate": 2.9991724211431667e-05, "loss": 1.8012, "step": 5430 }, { "epoch": 0.010603328413025444, "grad_norm": 4.302711009979248, "learning_rate": 2.999167842992405e-05, "loss": 1.9098, "step": 5445 }, { "epoch": 0.010632538684135707, "grad_norm": 3.3137192726135254, "learning_rate": 2.9991632522169398e-05, "loss": 1.8922, "step": 5460 }, { "epoch": 0.01066174895524597, "grad_norm": 3.0742363929748535, "learning_rate": 2.9991586488168104e-05, "loss": 2.0403, "step": 5475 }, { "epoch": 0.010690959226356232, "grad_norm": 3.034343957901001, "learning_rate": 2.9991540327920547e-05, "loss": 1.8944, "step": 5490 }, { "epoch": 0.010720169497466495, "grad_norm": 4.131673812866211, "learning_rate": 2.9991494041427124e-05, "loss": 1.8897, "step": 5505 }, { "epoch": 0.01074937976857676, "grad_norm": 3.8295650482177734, "learning_rate": 2.999144762868822e-05, "loss": 1.8522, "step": 5520 }, { "epoch": 0.010778590039687022, "grad_norm": 4.337125301361084, "learning_rate": 2.999140108970423e-05, "loss": 1.8039, "step": 5535 }, { "epoch": 0.010807800310797285, "grad_norm": 4.047338008880615, "learning_rate": 2.999135442447554e-05, "loss": 1.964, "step": 5550 }, { "epoch": 0.010837010581907548, "grad_norm": 3.5203754901885986, "learning_rate": 2.9991307633002546e-05, "loss": 1.9198, "step": 5565 }, { "epoch": 0.01086622085301781, "grad_norm": 3.9507977962493896, "learning_rate": 2.9991260715285642e-05, "loss": 1.9079, "step": 5580 }, { "epoch": 0.010895431124128073, "grad_norm": 4.072928428649902, "learning_rate": 2.9991213671325223e-05, "loss": 2.032, "step": 5595 }, { "epoch": 0.010924641395238336, "grad_norm": 2.8438544273376465, "learning_rate": 2.9991166501121685e-05, "loss": 1.9995, "step": 5610 }, { "epoch": 0.0109538516663486, "grad_norm": 3.2885472774505615, "learning_rate": 2.9991119204675425e-05, "loss": 1.8653, "step": 5625 }, { "epoch": 0.010983061937458863, "grad_norm": 3.6044580936431885, "learning_rate": 2.9991071781986843e-05, "loss": 1.9103, "step": 5640 }, { "epoch": 0.011012272208569126, "grad_norm": 4.155179977416992, "learning_rate": 2.9991024233056335e-05, "loss": 1.8887, "step": 5655 }, { "epoch": 0.011041482479679388, "grad_norm": 4.087657451629639, "learning_rate": 2.9990976557884308e-05, "loss": 1.967, "step": 5670 }, { "epoch": 0.011070692750789651, "grad_norm": 4.724533557891846, "learning_rate": 2.999092875647116e-05, "loss": 1.8383, "step": 5685 }, { "epoch": 0.011099903021899914, "grad_norm": 7.630526065826416, "learning_rate": 2.9990880828817287e-05, "loss": 1.9142, "step": 5700 }, { "epoch": 0.011129113293010176, "grad_norm": 4.646370887756348, "learning_rate": 2.99908327749231e-05, "loss": 1.8862, "step": 5715 }, { "epoch": 0.011158323564120439, "grad_norm": 1.9703487157821655, "learning_rate": 2.9990784594789e-05, "loss": 1.8747, "step": 5730 }, { "epoch": 0.011187533835230704, "grad_norm": 2.7092244625091553, "learning_rate": 2.99907362884154e-05, "loss": 1.9593, "step": 5745 }, { "epoch": 0.011216744106340966, "grad_norm": 2.3020477294921875, "learning_rate": 2.9990687855802695e-05, "loss": 1.9351, "step": 5760 }, { "epoch": 0.011245954377451229, "grad_norm": 3.7924842834472656, "learning_rate": 2.9990639296951303e-05, "loss": 1.9888, "step": 5775 }, { "epoch": 0.011275164648561492, "grad_norm": 2.8485357761383057, "learning_rate": 2.9990590611861625e-05, "loss": 1.7921, "step": 5790 }, { "epoch": 0.011304374919671754, "grad_norm": 2.7510149478912354, "learning_rate": 2.999054180053408e-05, "loss": 1.9655, "step": 5805 }, { "epoch": 0.011333585190782017, "grad_norm": 4.522819519042969, "learning_rate": 2.999049286296907e-05, "loss": 1.9384, "step": 5820 }, { "epoch": 0.01136279546189228, "grad_norm": 2.325582265853882, "learning_rate": 2.9990443799167018e-05, "loss": 1.8342, "step": 5835 }, { "epoch": 0.011392005733002543, "grad_norm": 3.583799123764038, "learning_rate": 2.999039460912832e-05, "loss": 1.9671, "step": 5850 }, { "epoch": 0.011421216004112807, "grad_norm": 2.342571973800659, "learning_rate": 2.999034529285341e-05, "loss": 1.8169, "step": 5865 }, { "epoch": 0.01145042627522307, "grad_norm": 3.772407054901123, "learning_rate": 2.9990295850342694e-05, "loss": 1.8066, "step": 5880 }, { "epoch": 0.011479636546333332, "grad_norm": 2.550743341445923, "learning_rate": 2.9990246281596583e-05, "loss": 1.9611, "step": 5895 }, { "epoch": 0.011508846817443595, "grad_norm": 1.9916815757751465, "learning_rate": 2.9990196586615502e-05, "loss": 1.8308, "step": 5910 }, { "epoch": 0.011538057088553858, "grad_norm": 3.8781256675720215, "learning_rate": 2.9990146765399868e-05, "loss": 1.804, "step": 5925 }, { "epoch": 0.01156726735966412, "grad_norm": 1.9027587175369263, "learning_rate": 2.99900968179501e-05, "loss": 1.8449, "step": 5940 }, { "epoch": 0.011596477630774383, "grad_norm": 2.9147605895996094, "learning_rate": 2.9990046744266612e-05, "loss": 1.9283, "step": 5955 }, { "epoch": 0.011625687901884646, "grad_norm": 2.0023555755615234, "learning_rate": 2.9989996544349842e-05, "loss": 1.9464, "step": 5970 }, { "epoch": 0.01165489817299491, "grad_norm": 2.819077253341675, "learning_rate": 2.9989946218200195e-05, "loss": 1.7682, "step": 5985 }, { "epoch": 0.011684108444105173, "grad_norm": 4.221833229064941, "learning_rate": 2.998989576581811e-05, "loss": 1.6921, "step": 6000 }, { "epoch": 0.011713318715215436, "grad_norm": 2.9984993934631348, "learning_rate": 2.9989845187204e-05, "loss": 1.8921, "step": 6015 }, { "epoch": 0.011742528986325699, "grad_norm": 2.866452932357788, "learning_rate": 2.9989794482358293e-05, "loss": 1.9759, "step": 6030 }, { "epoch": 0.011771739257435961, "grad_norm": 2.398813486099243, "learning_rate": 2.9989743651281424e-05, "loss": 1.8948, "step": 6045 }, { "epoch": 0.011800949528546224, "grad_norm": 2.6533145904541016, "learning_rate": 2.998969269397381e-05, "loss": 1.8532, "step": 6060 }, { "epoch": 0.011830159799656487, "grad_norm": 3.8135955333709717, "learning_rate": 2.998964161043589e-05, "loss": 1.9564, "step": 6075 }, { "epoch": 0.01185937007076675, "grad_norm": 2.5285484790802, "learning_rate": 2.9989590400668086e-05, "loss": 1.8229, "step": 6090 }, { "epoch": 0.011888580341877014, "grad_norm": 2.0157272815704346, "learning_rate": 2.9989539064670838e-05, "loss": 1.9952, "step": 6105 }, { "epoch": 0.011917790612987277, "grad_norm": 1.8600102663040161, "learning_rate": 2.998948760244457e-05, "loss": 2.0176, "step": 6120 }, { "epoch": 0.01194700088409754, "grad_norm": 2.4452428817749023, "learning_rate": 2.9989436013989718e-05, "loss": 1.8965, "step": 6135 }, { "epoch": 0.011976211155207802, "grad_norm": 3.593918561935425, "learning_rate": 2.998938429930672e-05, "loss": 1.9629, "step": 6150 }, { "epoch": 0.012005421426318065, "grad_norm": 2.161616086959839, "learning_rate": 2.9989332458396005e-05, "loss": 2.0643, "step": 6165 }, { "epoch": 0.012034631697428327, "grad_norm": 4.1473493576049805, "learning_rate": 2.9989280491258015e-05, "loss": 1.8227, "step": 6180 }, { "epoch": 0.01206384196853859, "grad_norm": 3.4432153701782227, "learning_rate": 2.9989228397893186e-05, "loss": 1.9787, "step": 6195 }, { "epoch": 0.012093052239648853, "grad_norm": 3.591747283935547, "learning_rate": 2.9989176178301955e-05, "loss": 1.8248, "step": 6210 }, { "epoch": 0.012122262510759117, "grad_norm": 2.385715961456299, "learning_rate": 2.9989123832484767e-05, "loss": 2.0869, "step": 6225 }, { "epoch": 0.01215147278186938, "grad_norm": 3.128617763519287, "learning_rate": 2.9989071360442058e-05, "loss": 1.8622, "step": 6240 }, { "epoch": 0.012180683052979643, "grad_norm": 5.122420787811279, "learning_rate": 2.998901876217427e-05, "loss": 1.828, "step": 6255 }, { "epoch": 0.012209893324089905, "grad_norm": 2.451355218887329, "learning_rate": 2.9988966037681844e-05, "loss": 1.8766, "step": 6270 }, { "epoch": 0.012239103595200168, "grad_norm": 2.8492448329925537, "learning_rate": 2.9988913186965232e-05, "loss": 1.9675, "step": 6285 }, { "epoch": 0.01226831386631043, "grad_norm": 4.332848072052002, "learning_rate": 2.998886021002487e-05, "loss": 2.0243, "step": 6300 }, { "epoch": 0.012297524137420694, "grad_norm": 2.7769408226013184, "learning_rate": 2.9988807106861208e-05, "loss": 1.9015, "step": 6315 }, { "epoch": 0.012326734408530956, "grad_norm": 2.2205896377563477, "learning_rate": 2.9988753877474696e-05, "loss": 1.8927, "step": 6330 }, { "epoch": 0.01235594467964122, "grad_norm": 4.445504665374756, "learning_rate": 2.9988700521865777e-05, "loss": 1.9646, "step": 6345 }, { "epoch": 0.012385154950751483, "grad_norm": 2.951155662536621, "learning_rate": 2.9988647040034905e-05, "loss": 1.8487, "step": 6360 }, { "epoch": 0.012414365221861746, "grad_norm": 4.908761501312256, "learning_rate": 2.998859343198253e-05, "loss": 1.8704, "step": 6375 }, { "epoch": 0.012443575492972009, "grad_norm": 3.0668880939483643, "learning_rate": 2.9988539697709098e-05, "loss": 1.8632, "step": 6390 }, { "epoch": 0.012472785764082272, "grad_norm": 1.8523426055908203, "learning_rate": 2.9988485837215068e-05, "loss": 1.8434, "step": 6405 }, { "epoch": 0.012501996035192534, "grad_norm": 4.285421848297119, "learning_rate": 2.9988431850500887e-05, "loss": 1.8438, "step": 6420 }, { "epoch": 0.012531206306302797, "grad_norm": 1.9119669198989868, "learning_rate": 2.9988377737567013e-05, "loss": 1.8469, "step": 6435 }, { "epoch": 0.01256041657741306, "grad_norm": 2.430860757827759, "learning_rate": 2.9988323498413907e-05, "loss": 2.1108, "step": 6450 }, { "epoch": 0.012589626848523324, "grad_norm": 2.2205920219421387, "learning_rate": 2.9988269133042016e-05, "loss": 1.8481, "step": 6465 }, { "epoch": 0.012618837119633587, "grad_norm": 2.0588815212249756, "learning_rate": 2.9988214641451804e-05, "loss": 1.8007, "step": 6480 }, { "epoch": 0.01264804739074385, "grad_norm": 2.527836322784424, "learning_rate": 2.998816002364373e-05, "loss": 1.9974, "step": 6495 }, { "epoch": 0.012677257661854112, "grad_norm": 3.0486202239990234, "learning_rate": 2.9988105279618253e-05, "loss": 1.8459, "step": 6510 }, { "epoch": 0.012706467932964375, "grad_norm": 2.6629745960235596, "learning_rate": 2.998805040937583e-05, "loss": 1.9373, "step": 6525 }, { "epoch": 0.012735678204074638, "grad_norm": 3.2922544479370117, "learning_rate": 2.9987995412916928e-05, "loss": 1.8839, "step": 6540 }, { "epoch": 0.0127648884751849, "grad_norm": 4.026648044586182, "learning_rate": 2.998794029024201e-05, "loss": 2.1104, "step": 6555 }, { "epoch": 0.012794098746295163, "grad_norm": 2.473214864730835, "learning_rate": 2.998788504135154e-05, "loss": 1.664, "step": 6570 }, { "epoch": 0.012823309017405428, "grad_norm": 2.8834683895111084, "learning_rate": 2.998782966624598e-05, "loss": 1.9737, "step": 6585 }, { "epoch": 0.01285251928851569, "grad_norm": 3.123595952987671, "learning_rate": 2.99877741649258e-05, "loss": 2.1071, "step": 6600 }, { "epoch": 0.012881729559625953, "grad_norm": 2.3172380924224854, "learning_rate": 2.998771853739146e-05, "loss": 1.8585, "step": 6615 }, { "epoch": 0.012910939830736216, "grad_norm": 3.4954440593719482, "learning_rate": 2.998766278364344e-05, "loss": 1.8458, "step": 6630 }, { "epoch": 0.012940150101846478, "grad_norm": 2.48248553276062, "learning_rate": 2.9987606903682203e-05, "loss": 1.9398, "step": 6645 }, { "epoch": 0.012969360372956741, "grad_norm": 2.7575950622558594, "learning_rate": 2.998755089750822e-05, "loss": 1.8692, "step": 6660 }, { "epoch": 0.012998570644067004, "grad_norm": 2.821286678314209, "learning_rate": 2.9987494765121962e-05, "loss": 2.1104, "step": 6675 }, { "epoch": 0.013027780915177268, "grad_norm": 3.881669521331787, "learning_rate": 2.99874385065239e-05, "loss": 1.8453, "step": 6690 }, { "epoch": 0.013056991186287531, "grad_norm": 2.573246955871582, "learning_rate": 2.9987382121714516e-05, "loss": 1.9538, "step": 6705 }, { "epoch": 0.013086201457397794, "grad_norm": 4.5084052085876465, "learning_rate": 2.9987325610694277e-05, "loss": 1.9216, "step": 6720 }, { "epoch": 0.013115411728508056, "grad_norm": 3.882384777069092, "learning_rate": 2.9987268973463662e-05, "loss": 1.7694, "step": 6735 }, { "epoch": 0.013144621999618319, "grad_norm": 2.769113302230835, "learning_rate": 2.9987212210023147e-05, "loss": 1.9591, "step": 6750 }, { "epoch": 0.013173832270728582, "grad_norm": 3.9864275455474854, "learning_rate": 2.9987155320373207e-05, "loss": 1.9079, "step": 6765 }, { "epoch": 0.013203042541838845, "grad_norm": 1.9832093715667725, "learning_rate": 2.998709830451433e-05, "loss": 1.8263, "step": 6780 }, { "epoch": 0.013232252812949107, "grad_norm": 5.0461106300354, "learning_rate": 2.9987041162446985e-05, "loss": 1.8265, "step": 6795 }, { "epoch": 0.013261463084059372, "grad_norm": 2.5139777660369873, "learning_rate": 2.998698389417166e-05, "loss": 1.8644, "step": 6810 }, { "epoch": 0.013290673355169634, "grad_norm": 2.5377390384674072, "learning_rate": 2.998692649968884e-05, "loss": 1.9956, "step": 6825 }, { "epoch": 0.013319883626279897, "grad_norm": 2.68589186668396, "learning_rate": 2.9986868978998998e-05, "loss": 1.9298, "step": 6840 }, { "epoch": 0.01334909389739016, "grad_norm": 5.362221717834473, "learning_rate": 2.9986811332102624e-05, "loss": 1.8319, "step": 6855 }, { "epoch": 0.013378304168500423, "grad_norm": 3.440687417984009, "learning_rate": 2.9986753559000207e-05, "loss": 1.8854, "step": 6870 }, { "epoch": 0.013407514439610685, "grad_norm": 2.2173657417297363, "learning_rate": 2.9986695659692233e-05, "loss": 1.8672, "step": 6885 }, { "epoch": 0.013436724710720948, "grad_norm": 2.1564152240753174, "learning_rate": 2.998663763417918e-05, "loss": 1.9548, "step": 6900 }, { "epoch": 0.01346593498183121, "grad_norm": 2.339550018310547, "learning_rate": 2.9986579482461552e-05, "loss": 1.7549, "step": 6915 }, { "epoch": 0.013495145252941475, "grad_norm": 8.277188301086426, "learning_rate": 2.9986521204539824e-05, "loss": 1.8958, "step": 6930 }, { "epoch": 0.013524355524051738, "grad_norm": 3.044663190841675, "learning_rate": 2.9986462800414498e-05, "loss": 1.9019, "step": 6945 }, { "epoch": 0.013553565795162, "grad_norm": 2.622844696044922, "learning_rate": 2.9986404270086056e-05, "loss": 1.8349, "step": 6960 }, { "epoch": 0.013582776066272263, "grad_norm": 2.686244010925293, "learning_rate": 2.9986345613554998e-05, "loss": 1.9178, "step": 6975 }, { "epoch": 0.013611986337382526, "grad_norm": 3.487210512161255, "learning_rate": 2.9986286830821817e-05, "loss": 2.0353, "step": 6990 }, { "epoch": 0.013641196608492789, "grad_norm": 3.297060966491699, "learning_rate": 2.9986227921887005e-05, "loss": 2.0344, "step": 7005 }, { "epoch": 0.013670406879603051, "grad_norm": 6.4708571434021, "learning_rate": 2.9986168886751064e-05, "loss": 1.946, "step": 7020 }, { "epoch": 0.013699617150713314, "grad_norm": 3.04331636428833, "learning_rate": 2.9986109725414485e-05, "loss": 1.9229, "step": 7035 }, { "epoch": 0.013728827421823579, "grad_norm": 3.3614730834960938, "learning_rate": 2.9986050437877762e-05, "loss": 1.8943, "step": 7050 }, { "epoch": 0.013758037692933841, "grad_norm": 2.632763624191284, "learning_rate": 2.998599102414141e-05, "loss": 1.907, "step": 7065 }, { "epoch": 0.013787247964044104, "grad_norm": 2.441969871520996, "learning_rate": 2.998593148420592e-05, "loss": 1.9382, "step": 7080 }, { "epoch": 0.013816458235154367, "grad_norm": 3.3013856410980225, "learning_rate": 2.9985871818071784e-05, "loss": 1.7866, "step": 7095 }, { "epoch": 0.01384566850626463, "grad_norm": 3.12922739982605, "learning_rate": 2.9985812025739518e-05, "loss": 1.9155, "step": 7110 }, { "epoch": 0.013874878777374892, "grad_norm": 3.3649489879608154, "learning_rate": 2.998575210720962e-05, "loss": 1.7256, "step": 7125 }, { "epoch": 0.013904089048485155, "grad_norm": 2.381049871444702, "learning_rate": 2.9985692062482603e-05, "loss": 1.9324, "step": 7140 }, { "epoch": 0.013933299319595418, "grad_norm": 2.5328924655914307, "learning_rate": 2.998563189155896e-05, "loss": 2.0635, "step": 7155 }, { "epoch": 0.013962509590705682, "grad_norm": 3.5615475177764893, "learning_rate": 2.99855715944392e-05, "loss": 1.8377, "step": 7170 }, { "epoch": 0.013991719861815945, "grad_norm": 3.7230849266052246, "learning_rate": 2.998551117112384e-05, "loss": 1.839, "step": 7185 }, { "epoch": 0.014020930132926207, "grad_norm": 2.7863929271698, "learning_rate": 2.998545062161338e-05, "loss": 1.7265, "step": 7200 }, { "epoch": 0.01405014040403647, "grad_norm": 2.238398790359497, "learning_rate": 2.9985389945908332e-05, "loss": 1.9747, "step": 7215 }, { "epoch": 0.014079350675146733, "grad_norm": 2.3368685245513916, "learning_rate": 2.998532914400921e-05, "loss": 2.0277, "step": 7230 }, { "epoch": 0.014108560946256996, "grad_norm": 3.667415142059326, "learning_rate": 2.9985268215916523e-05, "loss": 1.8947, "step": 7245 }, { "epoch": 0.014137771217367258, "grad_norm": 5.902375221252441, "learning_rate": 2.9985207161630784e-05, "loss": 2.023, "step": 7260 }, { "epoch": 0.014166981488477521, "grad_norm": 3.5179100036621094, "learning_rate": 2.998514598115251e-05, "loss": 1.9654, "step": 7275 }, { "epoch": 0.014196191759587785, "grad_norm": 4.09523344039917, "learning_rate": 2.9985084674482207e-05, "loss": 1.9822, "step": 7290 }, { "epoch": 0.014225402030698048, "grad_norm": 3.9758706092834473, "learning_rate": 2.9985023241620405e-05, "loss": 1.7241, "step": 7305 }, { "epoch": 0.01425461230180831, "grad_norm": 4.463382720947266, "learning_rate": 2.9984961682567614e-05, "loss": 1.9208, "step": 7320 }, { "epoch": 0.014283822572918574, "grad_norm": 2.2591640949249268, "learning_rate": 2.9984899997324357e-05, "loss": 1.9372, "step": 7335 }, { "epoch": 0.014313032844028836, "grad_norm": 2.958768129348755, "learning_rate": 2.998483818589114e-05, "loss": 1.9092, "step": 7350 }, { "epoch": 0.014342243115139099, "grad_norm": 2.4651241302490234, "learning_rate": 2.99847762482685e-05, "loss": 1.7909, "step": 7365 }, { "epoch": 0.014371453386249362, "grad_norm": 4.785153388977051, "learning_rate": 2.9984714184456948e-05, "loss": 2.0846, "step": 7380 }, { "epoch": 0.014400663657359624, "grad_norm": 5.212136745452881, "learning_rate": 2.9984651994457013e-05, "loss": 1.7623, "step": 7395 }, { "epoch": 0.014429873928469889, "grad_norm": 3.5508365631103516, "learning_rate": 2.9984589678269216e-05, "loss": 1.8922, "step": 7410 }, { "epoch": 0.014459084199580152, "grad_norm": 4.388883113861084, "learning_rate": 2.998452723589408e-05, "loss": 1.9701, "step": 7425 }, { "epoch": 0.014488294470690414, "grad_norm": 2.4577128887176514, "learning_rate": 2.9984464667332135e-05, "loss": 1.8807, "step": 7440 }, { "epoch": 0.014517504741800677, "grad_norm": 3.046642541885376, "learning_rate": 2.99844019725839e-05, "loss": 2.0671, "step": 7455 }, { "epoch": 0.01454671501291094, "grad_norm": 2.639798879623413, "learning_rate": 2.9984339151649913e-05, "loss": 1.7981, "step": 7470 }, { "epoch": 0.014575925284021202, "grad_norm": 4.231945991516113, "learning_rate": 2.9984276204530702e-05, "loss": 1.822, "step": 7485 }, { "epoch": 0.014605135555131465, "grad_norm": 2.3419246673583984, "learning_rate": 2.9984213131226788e-05, "loss": 1.8931, "step": 7500 }, { "epoch": 0.014634345826241728, "grad_norm": 3.034775733947754, "learning_rate": 2.998414993173871e-05, "loss": 1.8925, "step": 7515 }, { "epoch": 0.014663556097351992, "grad_norm": 2.818657875061035, "learning_rate": 2.9984086606066997e-05, "loss": 1.8714, "step": 7530 }, { "epoch": 0.014692766368462255, "grad_norm": 2.0940043926239014, "learning_rate": 2.9984023154212183e-05, "loss": 1.9476, "step": 7545 }, { "epoch": 0.014721976639572518, "grad_norm": 1.9247641563415527, "learning_rate": 2.9983959576174807e-05, "loss": 1.8433, "step": 7560 }, { "epoch": 0.01475118691068278, "grad_norm": 4.496128559112549, "learning_rate": 2.9983895871955397e-05, "loss": 1.8137, "step": 7575 }, { "epoch": 0.014780397181793043, "grad_norm": 1.846083402633667, "learning_rate": 2.998383204155449e-05, "loss": 1.9474, "step": 7590 }, { "epoch": 0.014809607452903306, "grad_norm": 2.063615083694458, "learning_rate": 2.9983768084972626e-05, "loss": 1.9017, "step": 7605 }, { "epoch": 0.014838817724013569, "grad_norm": 3.480355978012085, "learning_rate": 2.9983704002210346e-05, "loss": 1.8554, "step": 7620 }, { "epoch": 0.014868027995123831, "grad_norm": 3.3823037147521973, "learning_rate": 2.9983639793268187e-05, "loss": 2.067, "step": 7635 }, { "epoch": 0.014897238266234096, "grad_norm": 4.111497402191162, "learning_rate": 2.998357545814669e-05, "loss": 1.8451, "step": 7650 }, { "epoch": 0.014926448537344358, "grad_norm": 2.51603102684021, "learning_rate": 2.9983510996846397e-05, "loss": 1.835, "step": 7665 }, { "epoch": 0.014955658808454621, "grad_norm": 2.576188087463379, "learning_rate": 2.9983446409367846e-05, "loss": 2.0444, "step": 7680 }, { "epoch": 0.014984869079564884, "grad_norm": 2.695582151412964, "learning_rate": 2.9983381695711595e-05, "loss": 1.9372, "step": 7695 }, { "epoch": 0.015014079350675147, "grad_norm": 2.580967664718628, "learning_rate": 2.9983316855878172e-05, "loss": 1.7984, "step": 7710 }, { "epoch": 0.01504328962178541, "grad_norm": 2.779932737350464, "learning_rate": 2.9983251889868133e-05, "loss": 2.0286, "step": 7725 }, { "epoch": 0.015072499892895672, "grad_norm": 4.4765849113464355, "learning_rate": 2.998318679768202e-05, "loss": 1.873, "step": 7740 }, { "epoch": 0.015101710164005935, "grad_norm": 5.089080810546875, "learning_rate": 2.9983121579320387e-05, "loss": 1.9452, "step": 7755 }, { "epoch": 0.015130920435116199, "grad_norm": 3.860607862472534, "learning_rate": 2.9983056234783774e-05, "loss": 1.9469, "step": 7770 }, { "epoch": 0.015160130706226462, "grad_norm": 3.014214277267456, "learning_rate": 2.998299076407274e-05, "loss": 2.0488, "step": 7785 }, { "epoch": 0.015189340977336725, "grad_norm": 3.1203925609588623, "learning_rate": 2.998292516718784e-05, "loss": 1.8334, "step": 7800 }, { "epoch": 0.015218551248446987, "grad_norm": 1.6761817932128906, "learning_rate": 2.998285944412961e-05, "loss": 2.0263, "step": 7815 }, { "epoch": 0.01524776151955725, "grad_norm": 2.3997247219085693, "learning_rate": 2.9982793594898623e-05, "loss": 1.9195, "step": 7830 }, { "epoch": 0.015276971790667513, "grad_norm": 2.0805671215057373, "learning_rate": 2.998272761949542e-05, "loss": 1.8461, "step": 7845 }, { "epoch": 0.015306182061777775, "grad_norm": 3.0058650970458984, "learning_rate": 2.998266151792056e-05, "loss": 2.0221, "step": 7860 }, { "epoch": 0.01533539233288804, "grad_norm": 4.376441955566406, "learning_rate": 2.99825952901746e-05, "loss": 1.8942, "step": 7875 }, { "epoch": 0.015364602603998303, "grad_norm": 2.509711503982544, "learning_rate": 2.9982528936258096e-05, "loss": 1.7979, "step": 7890 }, { "epoch": 0.015393812875108565, "grad_norm": 2.763103723526001, "learning_rate": 2.9982462456171605e-05, "loss": 1.9112, "step": 7905 }, { "epoch": 0.015423023146218828, "grad_norm": 2.519554615020752, "learning_rate": 2.9982395849915698e-05, "loss": 1.9375, "step": 7920 }, { "epoch": 0.01545223341732909, "grad_norm": 2.679543972015381, "learning_rate": 2.9982329117490926e-05, "loss": 1.8226, "step": 7935 }, { "epoch": 0.015481443688439353, "grad_norm": 3.619253158569336, "learning_rate": 2.9982262258897855e-05, "loss": 1.8639, "step": 7950 }, { "epoch": 0.015510653959549616, "grad_norm": 1.7691304683685303, "learning_rate": 2.9982195274137042e-05, "loss": 1.7112, "step": 7965 }, { "epoch": 0.015539864230659879, "grad_norm": 3.720994234085083, "learning_rate": 2.9982128163209058e-05, "loss": 1.8901, "step": 7980 }, { "epoch": 0.015569074501770143, "grad_norm": 3.499058723449707, "learning_rate": 2.9982060926114467e-05, "loss": 1.9527, "step": 7995 }, { "epoch": 0.015598284772880406, "grad_norm": 2.401658773422241, "learning_rate": 2.9981993562853833e-05, "loss": 1.9037, "step": 8010 }, { "epoch": 0.015627495043990667, "grad_norm": 3.7763659954071045, "learning_rate": 2.9981926073427724e-05, "loss": 2.0314, "step": 8025 }, { "epoch": 0.01565670531510093, "grad_norm": 3.2521162033081055, "learning_rate": 2.9981858457836707e-05, "loss": 1.7982, "step": 8040 }, { "epoch": 0.015685915586211196, "grad_norm": 3.2938101291656494, "learning_rate": 2.9981790716081353e-05, "loss": 1.9225, "step": 8055 }, { "epoch": 0.015715125857321457, "grad_norm": 2.9045004844665527, "learning_rate": 2.9981722848162233e-05, "loss": 2.04, "step": 8070 }, { "epoch": 0.01574433612843172, "grad_norm": 2.2528388500213623, "learning_rate": 2.9981654854079918e-05, "loss": 1.792, "step": 8085 }, { "epoch": 0.015773546399541982, "grad_norm": 2.9156363010406494, "learning_rate": 2.998158673383498e-05, "loss": 1.9291, "step": 8100 }, { "epoch": 0.015802756670652247, "grad_norm": 2.375291347503662, "learning_rate": 2.9981518487427996e-05, "loss": 1.8216, "step": 8115 }, { "epoch": 0.015831966941762508, "grad_norm": 4.628626823425293, "learning_rate": 2.9981450114859532e-05, "loss": 1.7895, "step": 8130 }, { "epoch": 0.015861177212872772, "grad_norm": 2.200885534286499, "learning_rate": 2.9981381616130172e-05, "loss": 1.9302, "step": 8145 }, { "epoch": 0.015890387483983033, "grad_norm": 3.7578535079956055, "learning_rate": 2.998131299124049e-05, "loss": 1.9202, "step": 8160 }, { "epoch": 0.015919597755093298, "grad_norm": 2.235351800918579, "learning_rate": 2.9981244240191063e-05, "loss": 1.9151, "step": 8175 }, { "epoch": 0.015948808026203562, "grad_norm": 1.905808925628662, "learning_rate": 2.9981175362982473e-05, "loss": 2.0942, "step": 8190 }, { "epoch": 0.015978018297313823, "grad_norm": 2.797210454940796, "learning_rate": 2.99811063596153e-05, "loss": 2.0168, "step": 8205 }, { "epoch": 0.016007228568424087, "grad_norm": 2.7250447273254395, "learning_rate": 2.9981037230090125e-05, "loss": 1.8033, "step": 8220 }, { "epoch": 0.01603643883953435, "grad_norm": 2.6646409034729004, "learning_rate": 2.9980967974407525e-05, "loss": 1.8914, "step": 8235 }, { "epoch": 0.016065649110644613, "grad_norm": 4.621725082397461, "learning_rate": 2.9980898592568086e-05, "loss": 1.9805, "step": 8250 }, { "epoch": 0.016094859381754874, "grad_norm": 7.544195175170898, "learning_rate": 2.9980829084572393e-05, "loss": 1.8876, "step": 8265 }, { "epoch": 0.016124069652865138, "grad_norm": 3.1491341590881348, "learning_rate": 2.9980759450421032e-05, "loss": 1.8096, "step": 8280 }, { "epoch": 0.016153279923975403, "grad_norm": 3.049743175506592, "learning_rate": 2.998068969011459e-05, "loss": 1.9014, "step": 8295 }, { "epoch": 0.016182490195085664, "grad_norm": 3.4776172637939453, "learning_rate": 2.998061980365365e-05, "loss": 1.9011, "step": 8310 }, { "epoch": 0.016211700466195928, "grad_norm": 3.4769961833953857, "learning_rate": 2.9980549791038804e-05, "loss": 2.2367, "step": 8325 }, { "epoch": 0.01624091073730619, "grad_norm": 2.252976894378662, "learning_rate": 2.9980479652270645e-05, "loss": 1.9017, "step": 8340 }, { "epoch": 0.016270121008416454, "grad_norm": 2.3866465091705322, "learning_rate": 2.998040938734976e-05, "loss": 1.8849, "step": 8355 }, { "epoch": 0.016299331279526715, "grad_norm": 4.388668060302734, "learning_rate": 2.998033899627674e-05, "loss": 1.8601, "step": 8370 }, { "epoch": 0.01632854155063698, "grad_norm": 3.2678322792053223, "learning_rate": 2.9980268479052173e-05, "loss": 1.8799, "step": 8385 }, { "epoch": 0.01635775182174724, "grad_norm": 3.3072402477264404, "learning_rate": 2.9980197835676665e-05, "loss": 1.8995, "step": 8400 }, { "epoch": 0.016386962092857504, "grad_norm": 2.1432266235351562, "learning_rate": 2.99801270661508e-05, "loss": 1.9894, "step": 8415 }, { "epoch": 0.01641617236396777, "grad_norm": 4.359271049499512, "learning_rate": 2.998005617047518e-05, "loss": 1.8252, "step": 8430 }, { "epoch": 0.01644538263507803, "grad_norm": 4.683668613433838, "learning_rate": 2.99799851486504e-05, "loss": 1.8865, "step": 8445 }, { "epoch": 0.016474592906188294, "grad_norm": 2.4480679035186768, "learning_rate": 2.997991400067706e-05, "loss": 1.8999, "step": 8460 }, { "epoch": 0.016503803177298555, "grad_norm": 4.169287204742432, "learning_rate": 2.9979842726555753e-05, "loss": 1.8878, "step": 8475 }, { "epoch": 0.01653301344840882, "grad_norm": 2.4626035690307617, "learning_rate": 2.9979771326287084e-05, "loss": 2.0727, "step": 8490 }, { "epoch": 0.01656222371951908, "grad_norm": 3.7514021396636963, "learning_rate": 2.9979699799871658e-05, "loss": 1.9559, "step": 8505 }, { "epoch": 0.016591433990629345, "grad_norm": 3.0994958877563477, "learning_rate": 2.9979628147310068e-05, "loss": 1.8535, "step": 8520 }, { "epoch": 0.01662064426173961, "grad_norm": 5.931899070739746, "learning_rate": 2.9979556368602924e-05, "loss": 1.9244, "step": 8535 }, { "epoch": 0.01664985453284987, "grad_norm": 2.016737699508667, "learning_rate": 2.9979484463750833e-05, "loss": 1.9107, "step": 8550 }, { "epoch": 0.016679064803960135, "grad_norm": 2.3796162605285645, "learning_rate": 2.9979412432754394e-05, "loss": 2.0904, "step": 8565 }, { "epoch": 0.016708275075070396, "grad_norm": 1.870884656906128, "learning_rate": 2.9979340275614217e-05, "loss": 1.926, "step": 8580 }, { "epoch": 0.01673748534618066, "grad_norm": 2.833564281463623, "learning_rate": 2.997926799233091e-05, "loss": 1.871, "step": 8595 }, { "epoch": 0.01676669561729092, "grad_norm": 3.7916762828826904, "learning_rate": 2.9979195582905075e-05, "loss": 1.8166, "step": 8610 }, { "epoch": 0.016795905888401186, "grad_norm": 2.3426475524902344, "learning_rate": 2.997912304733733e-05, "loss": 1.8514, "step": 8625 }, { "epoch": 0.016825116159511447, "grad_norm": 4.586437225341797, "learning_rate": 2.9979050385628286e-05, "loss": 1.8544, "step": 8640 }, { "epoch": 0.01685432643062171, "grad_norm": 4.281703472137451, "learning_rate": 2.997897759777855e-05, "loss": 1.8978, "step": 8655 }, { "epoch": 0.016883536701731976, "grad_norm": 2.1048526763916016, "learning_rate": 2.9978904683788735e-05, "loss": 1.9104, "step": 8670 }, { "epoch": 0.016912746972842237, "grad_norm": 4.526645660400391, "learning_rate": 2.9978831643659462e-05, "loss": 1.9814, "step": 8685 }, { "epoch": 0.0169419572439525, "grad_norm": 1.9656120538711548, "learning_rate": 2.9978758477391334e-05, "loss": 1.878, "step": 8700 }, { "epoch": 0.016971167515062762, "grad_norm": 2.5091657638549805, "learning_rate": 2.997868518498498e-05, "loss": 1.8586, "step": 8715 }, { "epoch": 0.017000377786173027, "grad_norm": 3.0821568965911865, "learning_rate": 2.997861176644101e-05, "loss": 2.1301, "step": 8730 }, { "epoch": 0.017029588057283288, "grad_norm": 2.465061902999878, "learning_rate": 2.997853822176004e-05, "loss": 1.9435, "step": 8745 }, { "epoch": 0.017058798328393552, "grad_norm": 4.7108306884765625, "learning_rate": 2.9978464550942697e-05, "loss": 1.8857, "step": 8760 }, { "epoch": 0.017088008599503816, "grad_norm": 2.838949203491211, "learning_rate": 2.9978390753989597e-05, "loss": 1.9367, "step": 8775 }, { "epoch": 0.017117218870614077, "grad_norm": 6.954312324523926, "learning_rate": 2.9978316830901358e-05, "loss": 1.6954, "step": 8790 }, { "epoch": 0.017146429141724342, "grad_norm": 3.075137138366699, "learning_rate": 2.997824278167861e-05, "loss": 1.8787, "step": 8805 }, { "epoch": 0.017175639412834603, "grad_norm": 2.7041006088256836, "learning_rate": 2.9978168606321975e-05, "loss": 1.8062, "step": 8820 }, { "epoch": 0.017204849683944867, "grad_norm": 2.9021966457366943, "learning_rate": 2.997809430483207e-05, "loss": 1.9083, "step": 8835 }, { "epoch": 0.017234059955055128, "grad_norm": 3.350419521331787, "learning_rate": 2.9978019877209528e-05, "loss": 2.0403, "step": 8850 }, { "epoch": 0.017263270226165393, "grad_norm": 4.803377628326416, "learning_rate": 2.9977945323454977e-05, "loss": 1.8116, "step": 8865 }, { "epoch": 0.017292480497275657, "grad_norm": 4.362671375274658, "learning_rate": 2.997787064356904e-05, "loss": 1.8089, "step": 8880 }, { "epoch": 0.017321690768385918, "grad_norm": 3.1585206985473633, "learning_rate": 2.9977795837552347e-05, "loss": 2.1531, "step": 8895 }, { "epoch": 0.017350901039496183, "grad_norm": 2.9386544227600098, "learning_rate": 2.997772090540553e-05, "loss": 1.8995, "step": 8910 }, { "epoch": 0.017380111310606444, "grad_norm": 4.238663673400879, "learning_rate": 2.9977645847129216e-05, "loss": 1.9374, "step": 8925 }, { "epoch": 0.017409321581716708, "grad_norm": 3.94399356842041, "learning_rate": 2.9977570662724047e-05, "loss": 1.9782, "step": 8940 }, { "epoch": 0.01743853185282697, "grad_norm": 3.2048075199127197, "learning_rate": 2.9977495352190643e-05, "loss": 1.8984, "step": 8955 }, { "epoch": 0.017467742123937233, "grad_norm": 2.4807546138763428, "learning_rate": 2.9977419915529646e-05, "loss": 1.8613, "step": 8970 }, { "epoch": 0.017496952395047494, "grad_norm": 2.518021583557129, "learning_rate": 2.9977344352741686e-05, "loss": 2.0321, "step": 8985 }, { "epoch": 0.01752616266615776, "grad_norm": 4.450172424316406, "learning_rate": 2.9977268663827403e-05, "loss": 1.9419, "step": 9000 }, { "epoch": 0.017555372937268023, "grad_norm": 3.8519856929779053, "learning_rate": 2.9977192848787437e-05, "loss": 1.9083, "step": 9015 }, { "epoch": 0.017584583208378284, "grad_norm": 5.37404727935791, "learning_rate": 2.9977116907622422e-05, "loss": 1.9904, "step": 9030 }, { "epoch": 0.01761379347948855, "grad_norm": 2.9285478591918945, "learning_rate": 2.9977040840333e-05, "loss": 1.7262, "step": 9045 }, { "epoch": 0.01764300375059881, "grad_norm": 3.8375890254974365, "learning_rate": 2.9976964646919814e-05, "loss": 1.945, "step": 9060 }, { "epoch": 0.017672214021709074, "grad_norm": 2.987417459487915, "learning_rate": 2.9976888327383497e-05, "loss": 2.1196, "step": 9075 }, { "epoch": 0.017701424292819335, "grad_norm": 3.6940131187438965, "learning_rate": 2.99768118817247e-05, "loss": 2.002, "step": 9090 }, { "epoch": 0.0177306345639296, "grad_norm": 1.8791909217834473, "learning_rate": 2.997673530994406e-05, "loss": 1.9082, "step": 9105 }, { "epoch": 0.017759844835039864, "grad_norm": 5.242600440979004, "learning_rate": 2.997665861204223e-05, "loss": 1.8396, "step": 9120 }, { "epoch": 0.017789055106150125, "grad_norm": 3.7789740562438965, "learning_rate": 2.997658178801985e-05, "loss": 1.9959, "step": 9135 }, { "epoch": 0.01781826537726039, "grad_norm": 2.6715869903564453, "learning_rate": 2.9976504837877566e-05, "loss": 1.8346, "step": 9150 }, { "epoch": 0.01784747564837065, "grad_norm": 3.229962110519409, "learning_rate": 2.997642776161603e-05, "loss": 2.0002, "step": 9165 }, { "epoch": 0.017876685919480915, "grad_norm": 2.602320671081543, "learning_rate": 2.997635055923589e-05, "loss": 2.0765, "step": 9180 }, { "epoch": 0.017905896190591176, "grad_norm": 3.277393102645874, "learning_rate": 2.9976273230737795e-05, "loss": 1.8345, "step": 9195 }, { "epoch": 0.01793510646170144, "grad_norm": 2.1936373710632324, "learning_rate": 2.9976195776122397e-05, "loss": 1.9265, "step": 9210 }, { "epoch": 0.0179643167328117, "grad_norm": 4.018658638000488, "learning_rate": 2.997611819539035e-05, "loss": 1.9237, "step": 9225 }, { "epoch": 0.017993527003921966, "grad_norm": 3.5876455307006836, "learning_rate": 2.9976040488542304e-05, "loss": 1.8761, "step": 9240 }, { "epoch": 0.01802273727503223, "grad_norm": 2.9479000568389893, "learning_rate": 2.9975962655578915e-05, "loss": 1.9062, "step": 9255 }, { "epoch": 0.01805194754614249, "grad_norm": 3.185248613357544, "learning_rate": 2.9975884696500835e-05, "loss": 1.9958, "step": 9270 }, { "epoch": 0.018081157817252756, "grad_norm": 2.5612637996673584, "learning_rate": 2.9975806611308725e-05, "loss": 2.0438, "step": 9285 }, { "epoch": 0.018110368088363017, "grad_norm": 3.9929094314575195, "learning_rate": 2.9975728400003244e-05, "loss": 1.8404, "step": 9300 }, { "epoch": 0.01813957835947328, "grad_norm": 2.15783953666687, "learning_rate": 2.9975650062585043e-05, "loss": 1.8677, "step": 9315 }, { "epoch": 0.018168788630583542, "grad_norm": 3.7899887561798096, "learning_rate": 2.997557159905479e-05, "loss": 1.9269, "step": 9330 }, { "epoch": 0.018197998901693806, "grad_norm": 4.0455145835876465, "learning_rate": 2.9975493009413144e-05, "loss": 1.8552, "step": 9345 }, { "epoch": 0.01822720917280407, "grad_norm": 2.837963581085205, "learning_rate": 2.9975414293660766e-05, "loss": 2.1462, "step": 9360 }, { "epoch": 0.018256419443914332, "grad_norm": 3.8562536239624023, "learning_rate": 2.9975335451798317e-05, "loss": 1.8625, "step": 9375 }, { "epoch": 0.018285629715024596, "grad_norm": 2.4235100746154785, "learning_rate": 2.9975256483826453e-05, "loss": 1.8512, "step": 9390 }, { "epoch": 0.018314839986134857, "grad_norm": 5.756357192993164, "learning_rate": 2.997517738974586e-05, "loss": 1.84, "step": 9405 }, { "epoch": 0.01834405025724512, "grad_norm": 2.3832759857177734, "learning_rate": 2.9975098169557187e-05, "loss": 1.8721, "step": 9420 }, { "epoch": 0.018373260528355383, "grad_norm": 2.0849883556365967, "learning_rate": 2.9975018823261106e-05, "loss": 1.914, "step": 9435 }, { "epoch": 0.018402470799465647, "grad_norm": 2.3563778400421143, "learning_rate": 2.997493935085829e-05, "loss": 1.7283, "step": 9450 }, { "epoch": 0.018431681070575908, "grad_norm": 2.018721580505371, "learning_rate": 2.9974859752349396e-05, "loss": 1.7748, "step": 9465 }, { "epoch": 0.018460891341686173, "grad_norm": 2.725719451904297, "learning_rate": 2.9974780027735103e-05, "loss": 1.9287, "step": 9480 }, { "epoch": 0.018490101612796437, "grad_norm": 3.2846550941467285, "learning_rate": 2.9974700177016082e-05, "loss": 1.8214, "step": 9495 }, { "epoch": 0.018519311883906698, "grad_norm": 4.153242111206055, "learning_rate": 2.997462020019301e-05, "loss": 1.9377, "step": 9510 }, { "epoch": 0.018548522155016962, "grad_norm": 2.386509656906128, "learning_rate": 2.997454009726655e-05, "loss": 1.7513, "step": 9525 }, { "epoch": 0.018577732426127223, "grad_norm": 3.9288957118988037, "learning_rate": 2.9974459868237384e-05, "loss": 1.8623, "step": 9540 }, { "epoch": 0.018606942697237488, "grad_norm": 2.2002370357513428, "learning_rate": 2.9974379513106184e-05, "loss": 2.0153, "step": 9555 }, { "epoch": 0.01863615296834775, "grad_norm": 2.4271867275238037, "learning_rate": 2.9974299031873625e-05, "loss": 1.8772, "step": 9570 }, { "epoch": 0.018665363239458013, "grad_norm": 2.355729341506958, "learning_rate": 2.9974218424540395e-05, "loss": 1.9947, "step": 9585 }, { "epoch": 0.018694573510568278, "grad_norm": 4.210724830627441, "learning_rate": 2.9974137691107164e-05, "loss": 1.8113, "step": 9600 }, { "epoch": 0.01872378378167854, "grad_norm": 3.092832088470459, "learning_rate": 2.997405683157461e-05, "loss": 1.8457, "step": 9615 }, { "epoch": 0.018752994052788803, "grad_norm": 3.154505491256714, "learning_rate": 2.997397584594342e-05, "loss": 1.9223, "step": 9630 }, { "epoch": 0.018782204323899064, "grad_norm": 2.661449909210205, "learning_rate": 2.997389473421427e-05, "loss": 1.8362, "step": 9645 }, { "epoch": 0.01881141459500933, "grad_norm": 4.499241352081299, "learning_rate": 2.997381349638785e-05, "loss": 1.974, "step": 9660 }, { "epoch": 0.01884062486611959, "grad_norm": 8.712298393249512, "learning_rate": 2.9973732132464838e-05, "loss": 1.793, "step": 9675 }, { "epoch": 0.018869835137229854, "grad_norm": 3.771261215209961, "learning_rate": 2.9973650642445926e-05, "loss": 1.8158, "step": 9690 }, { "epoch": 0.018899045408340115, "grad_norm": 2.3334624767303467, "learning_rate": 2.997356902633179e-05, "loss": 1.9791, "step": 9705 }, { "epoch": 0.01892825567945038, "grad_norm": 3.4070332050323486, "learning_rate": 2.997348728412313e-05, "loss": 1.8621, "step": 9720 }, { "epoch": 0.018957465950560644, "grad_norm": 2.4055604934692383, "learning_rate": 2.997340541582062e-05, "loss": 1.8424, "step": 9735 }, { "epoch": 0.018986676221670905, "grad_norm": 3.209122896194458, "learning_rate": 2.9973323421424962e-05, "loss": 1.8791, "step": 9750 }, { "epoch": 0.01901588649278117, "grad_norm": 2.720518112182617, "learning_rate": 2.9973241300936842e-05, "loss": 1.9752, "step": 9765 }, { "epoch": 0.01904509676389143, "grad_norm": 1.9057116508483887, "learning_rate": 2.9973159054356948e-05, "loss": 1.9658, "step": 9780 }, { "epoch": 0.019074307035001695, "grad_norm": 3.4243197441101074, "learning_rate": 2.9973076681685977e-05, "loss": 1.9394, "step": 9795 }, { "epoch": 0.019103517306111956, "grad_norm": 3.557957410812378, "learning_rate": 2.997299418292462e-05, "loss": 1.8854, "step": 9810 }, { "epoch": 0.01913272757722222, "grad_norm": 2.576314926147461, "learning_rate": 2.9972911558073575e-05, "loss": 1.8975, "step": 9825 }, { "epoch": 0.019161937848332485, "grad_norm": 2.4958183765411377, "learning_rate": 2.9972828807133537e-05, "loss": 1.765, "step": 9840 }, { "epoch": 0.019191148119442746, "grad_norm": 3.1889865398406982, "learning_rate": 2.99727459301052e-05, "loss": 1.9785, "step": 9855 }, { "epoch": 0.01922035839055301, "grad_norm": 4.607937335968018, "learning_rate": 2.9972662926989267e-05, "loss": 1.7931, "step": 9870 }, { "epoch": 0.01924956866166327, "grad_norm": 3.380537271499634, "learning_rate": 2.997257979778643e-05, "loss": 1.8366, "step": 9885 }, { "epoch": 0.019278778932773535, "grad_norm": 1.7773466110229492, "learning_rate": 2.9972496542497393e-05, "loss": 1.841, "step": 9900 }, { "epoch": 0.019307989203883796, "grad_norm": 3.43685245513916, "learning_rate": 2.9972413161122858e-05, "loss": 1.8255, "step": 9915 }, { "epoch": 0.01933719947499406, "grad_norm": 4.238219261169434, "learning_rate": 2.9972329653663525e-05, "loss": 1.8403, "step": 9930 }, { "epoch": 0.019366409746104322, "grad_norm": 3.8438355922698975, "learning_rate": 2.99722460201201e-05, "loss": 1.8005, "step": 9945 }, { "epoch": 0.019395620017214586, "grad_norm": 4.286600589752197, "learning_rate": 2.997216226049328e-05, "loss": 1.8684, "step": 9960 }, { "epoch": 0.01942483028832485, "grad_norm": 1.7342430353164673, "learning_rate": 2.997207837478378e-05, "loss": 1.9657, "step": 9975 }, { "epoch": 0.01945404055943511, "grad_norm": 3.3315911293029785, "learning_rate": 2.9971994362992304e-05, "loss": 1.9156, "step": 9990 }, { "epoch": 0.019483250830545376, "grad_norm": 4.346848011016846, "learning_rate": 2.9971910225119556e-05, "loss": 1.9114, "step": 10005 }, { "epoch": 0.019512461101655637, "grad_norm": 2.8670308589935303, "learning_rate": 2.9971825961166248e-05, "loss": 1.8471, "step": 10020 }, { "epoch": 0.0195416713727659, "grad_norm": 2.1465935707092285, "learning_rate": 2.9971741571133085e-05, "loss": 1.8608, "step": 10035 }, { "epoch": 0.019570881643876162, "grad_norm": 3.3292319774627686, "learning_rate": 2.9971657055020782e-05, "loss": 1.877, "step": 10050 }, { "epoch": 0.019600091914986427, "grad_norm": 2.3058934211730957, "learning_rate": 2.9971572412830045e-05, "loss": 1.9125, "step": 10065 }, { "epoch": 0.01962930218609669, "grad_norm": 4.684175968170166, "learning_rate": 2.9971487644561597e-05, "loss": 1.9237, "step": 10080 }, { "epoch": 0.019658512457206952, "grad_norm": 3.6633639335632324, "learning_rate": 2.9971402750216144e-05, "loss": 1.856, "step": 10095 }, { "epoch": 0.019687722728317217, "grad_norm": 3.837944269180298, "learning_rate": 2.9971317729794404e-05, "loss": 2.0146, "step": 10110 }, { "epoch": 0.019716932999427478, "grad_norm": 4.053643226623535, "learning_rate": 2.997123258329709e-05, "loss": 1.9125, "step": 10125 }, { "epoch": 0.019746143270537742, "grad_norm": 1.9680250883102417, "learning_rate": 2.9971147310724923e-05, "loss": 1.9431, "step": 10140 }, { "epoch": 0.019775353541648003, "grad_norm": 1.941953420639038, "learning_rate": 2.9971061912078615e-05, "loss": 1.8638, "step": 10155 }, { "epoch": 0.019804563812758268, "grad_norm": 3.1855714321136475, "learning_rate": 2.997097638735889e-05, "loss": 1.6231, "step": 10170 }, { "epoch": 0.019833774083868532, "grad_norm": 4.022531509399414, "learning_rate": 2.997089073656647e-05, "loss": 1.8637, "step": 10185 }, { "epoch": 0.019862984354978793, "grad_norm": 4.320540904998779, "learning_rate": 2.997080495970207e-05, "loss": 1.8383, "step": 10200 }, { "epoch": 0.019892194626089058, "grad_norm": 3.0604958534240723, "learning_rate": 2.997071905676642e-05, "loss": 1.8843, "step": 10215 }, { "epoch": 0.01992140489719932, "grad_norm": 3.2216265201568604, "learning_rate": 2.9970633027760235e-05, "loss": 2.004, "step": 10230 }, { "epoch": 0.019950615168309583, "grad_norm": 2.4354753494262695, "learning_rate": 2.997054687268425e-05, "loss": 1.9255, "step": 10245 }, { "epoch": 0.019979825439419844, "grad_norm": 2.3556060791015625, "learning_rate": 2.9970460591539175e-05, "loss": 2.0436, "step": 10260 }, { "epoch": 0.02000903571053011, "grad_norm": 3.6977896690368652, "learning_rate": 2.9970374184325753e-05, "loss": 1.9181, "step": 10275 }, { "epoch": 0.02003824598164037, "grad_norm": 1.909177541732788, "learning_rate": 2.99702876510447e-05, "loss": 1.7653, "step": 10290 }, { "epoch": 0.020067456252750634, "grad_norm": 2.486943006515503, "learning_rate": 2.997020099169675e-05, "loss": 1.9137, "step": 10305 }, { "epoch": 0.020096666523860898, "grad_norm": 3.0627150535583496, "learning_rate": 2.9970114206282634e-05, "loss": 1.8442, "step": 10320 }, { "epoch": 0.02012587679497116, "grad_norm": 2.7819271087646484, "learning_rate": 2.997002729480308e-05, "loss": 2.1325, "step": 10335 }, { "epoch": 0.020155087066081424, "grad_norm": 2.097712755203247, "learning_rate": 2.9969940257258823e-05, "loss": 1.8952, "step": 10350 }, { "epoch": 0.020184297337191685, "grad_norm": 2.317915439605713, "learning_rate": 2.9969853093650592e-05, "loss": 1.8896, "step": 10365 }, { "epoch": 0.02021350760830195, "grad_norm": 3.3799221515655518, "learning_rate": 2.996976580397912e-05, "loss": 1.9032, "step": 10380 }, { "epoch": 0.02024271787941221, "grad_norm": 4.226128578186035, "learning_rate": 2.996967838824515e-05, "loss": 1.9812, "step": 10395 }, { "epoch": 0.020271928150522475, "grad_norm": 2.872182607650757, "learning_rate": 2.996959084644941e-05, "loss": 1.7335, "step": 10410 }, { "epoch": 0.02030113842163274, "grad_norm": 1.931477427482605, "learning_rate": 2.9969503178592638e-05, "loss": 1.8178, "step": 10425 }, { "epoch": 0.020330348692743, "grad_norm": 1.9570348262786865, "learning_rate": 2.9969415384675577e-05, "loss": 1.9652, "step": 10440 }, { "epoch": 0.020359558963853264, "grad_norm": 2.1548566818237305, "learning_rate": 2.996932746469896e-05, "loss": 1.9105, "step": 10455 }, { "epoch": 0.020388769234963525, "grad_norm": 2.138561248779297, "learning_rate": 2.9969239418663538e-05, "loss": 2.2595, "step": 10470 }, { "epoch": 0.02041797950607379, "grad_norm": 3.682020425796509, "learning_rate": 2.9969151246570038e-05, "loss": 1.8828, "step": 10485 }, { "epoch": 0.02044718977718405, "grad_norm": 1.9086365699768066, "learning_rate": 2.9969062948419213e-05, "loss": 1.8974, "step": 10500 }, { "epoch": 0.020476400048294315, "grad_norm": 3.4526236057281494, "learning_rate": 2.9968974524211807e-05, "loss": 1.8972, "step": 10515 }, { "epoch": 0.020505610319404576, "grad_norm": 5.137340068817139, "learning_rate": 2.996888597394856e-05, "loss": 1.7885, "step": 10530 }, { "epoch": 0.02053482059051484, "grad_norm": 2.410789966583252, "learning_rate": 2.9968797297630215e-05, "loss": 1.9339, "step": 10545 }, { "epoch": 0.020564030861625105, "grad_norm": 2.756986618041992, "learning_rate": 2.9968708495257527e-05, "loss": 1.9682, "step": 10560 }, { "epoch": 0.020593241132735366, "grad_norm": 4.116410255432129, "learning_rate": 2.9968619566831238e-05, "loss": 1.8814, "step": 10575 }, { "epoch": 0.02062245140384563, "grad_norm": 3.892730712890625, "learning_rate": 2.9968530512352098e-05, "loss": 1.9744, "step": 10590 }, { "epoch": 0.02065166167495589, "grad_norm": 3.5503933429718018, "learning_rate": 2.9968441331820856e-05, "loss": 2.0472, "step": 10605 }, { "epoch": 0.020680871946066156, "grad_norm": 4.085498809814453, "learning_rate": 2.9968352025238263e-05, "loss": 1.9428, "step": 10620 }, { "epoch": 0.020710082217176417, "grad_norm": 3.806868553161621, "learning_rate": 2.996826259260508e-05, "loss": 1.8212, "step": 10635 }, { "epoch": 0.02073929248828668, "grad_norm": 2.24172306060791, "learning_rate": 2.9968173033922045e-05, "loss": 1.7338, "step": 10650 }, { "epoch": 0.020768502759396946, "grad_norm": 2.516962766647339, "learning_rate": 2.996808334918992e-05, "loss": 1.9185, "step": 10665 }, { "epoch": 0.020797713030507207, "grad_norm": 3.8772926330566406, "learning_rate": 2.9967993538409465e-05, "loss": 1.8053, "step": 10680 }, { "epoch": 0.02082692330161747, "grad_norm": 5.146918296813965, "learning_rate": 2.9967903601581427e-05, "loss": 1.8488, "step": 10695 }, { "epoch": 0.020856133572727732, "grad_norm": 6.216543197631836, "learning_rate": 2.9967813538706568e-05, "loss": 1.9446, "step": 10710 }, { "epoch": 0.020885343843837997, "grad_norm": 3.0393502712249756, "learning_rate": 2.9967723349785648e-05, "loss": 1.8881, "step": 10725 }, { "epoch": 0.020914554114948258, "grad_norm": 2.7038638591766357, "learning_rate": 2.996763303481942e-05, "loss": 1.9454, "step": 10740 }, { "epoch": 0.020943764386058522, "grad_norm": 2.4057178497314453, "learning_rate": 2.9967542593808655e-05, "loss": 2.0256, "step": 10755 }, { "epoch": 0.020972974657168783, "grad_norm": 2.2479588985443115, "learning_rate": 2.9967452026754104e-05, "loss": 1.8835, "step": 10770 }, { "epoch": 0.021002184928279048, "grad_norm": 2.2106525897979736, "learning_rate": 2.996736133365654e-05, "loss": 1.8594, "step": 10785 }, { "epoch": 0.021031395199389312, "grad_norm": 2.273165225982666, "learning_rate": 2.9967270514516718e-05, "loss": 1.772, "step": 10800 }, { "epoch": 0.021060605470499573, "grad_norm": 4.13364315032959, "learning_rate": 2.9967179569335407e-05, "loss": 1.8364, "step": 10815 }, { "epoch": 0.021089815741609837, "grad_norm": 4.664285182952881, "learning_rate": 2.9967088498113368e-05, "loss": 1.7126, "step": 10830 }, { "epoch": 0.0211190260127201, "grad_norm": 3.9353389739990234, "learning_rate": 2.9966997300851376e-05, "loss": 1.8366, "step": 10845 }, { "epoch": 0.021148236283830363, "grad_norm": 3.433561086654663, "learning_rate": 2.996690597755019e-05, "loss": 1.7921, "step": 10860 }, { "epoch": 0.021177446554940624, "grad_norm": 2.0325567722320557, "learning_rate": 2.996681452821059e-05, "loss": 1.88, "step": 10875 }, { "epoch": 0.021206656826050888, "grad_norm": 3.6523592472076416, "learning_rate": 2.9966722952833335e-05, "loss": 1.8464, "step": 10890 }, { "epoch": 0.021235867097161153, "grad_norm": 4.08983039855957, "learning_rate": 2.99666312514192e-05, "loss": 2.0643, "step": 10905 }, { "epoch": 0.021265077368271414, "grad_norm": 4.714212894439697, "learning_rate": 2.9966539423968964e-05, "loss": 1.9458, "step": 10920 }, { "epoch": 0.021294287639381678, "grad_norm": 3.2425897121429443, "learning_rate": 2.996644747048339e-05, "loss": 1.9802, "step": 10935 }, { "epoch": 0.02132349791049194, "grad_norm": 5.025219440460205, "learning_rate": 2.9966355390963258e-05, "loss": 1.8679, "step": 10950 }, { "epoch": 0.021352708181602204, "grad_norm": 3.669241428375244, "learning_rate": 2.9966263185409343e-05, "loss": 1.9634, "step": 10965 }, { "epoch": 0.021381918452712464, "grad_norm": 3.5736284255981445, "learning_rate": 2.996617085382242e-05, "loss": 1.9348, "step": 10980 }, { "epoch": 0.02141112872382273, "grad_norm": 2.8263094425201416, "learning_rate": 2.996607839620327e-05, "loss": 1.7897, "step": 10995 }, { "epoch": 0.02144033899493299, "grad_norm": 4.033946990966797, "learning_rate": 2.996598581255267e-05, "loss": 1.9587, "step": 11010 }, { "epoch": 0.021469549266043254, "grad_norm": 3.7127420902252197, "learning_rate": 2.996589310287139e-05, "loss": 1.874, "step": 11025 }, { "epoch": 0.02149875953715352, "grad_norm": 4.083348751068115, "learning_rate": 2.9965800267160223e-05, "loss": 1.7831, "step": 11040 }, { "epoch": 0.02152796980826378, "grad_norm": 4.666345596313477, "learning_rate": 2.996570730541995e-05, "loss": 1.8516, "step": 11055 }, { "epoch": 0.021557180079374044, "grad_norm": 2.3497631549835205, "learning_rate": 2.996561421765135e-05, "loss": 1.9213, "step": 11070 }, { "epoch": 0.021586390350484305, "grad_norm": 4.355146408081055, "learning_rate": 2.996552100385521e-05, "loss": 1.8072, "step": 11085 }, { "epoch": 0.02161560062159457, "grad_norm": 3.6673879623413086, "learning_rate": 2.996542766403231e-05, "loss": 1.8541, "step": 11100 }, { "epoch": 0.02164481089270483, "grad_norm": 2.2446086406707764, "learning_rate": 2.996533419818344e-05, "loss": 1.9764, "step": 11115 }, { "epoch": 0.021674021163815095, "grad_norm": 3.8175010681152344, "learning_rate": 2.996524060630938e-05, "loss": 1.9664, "step": 11130 }, { "epoch": 0.02170323143492536, "grad_norm": 4.360842704772949, "learning_rate": 2.996514688841093e-05, "loss": 1.9145, "step": 11145 }, { "epoch": 0.02173244170603562, "grad_norm": 3.110860586166382, "learning_rate": 2.996505304448887e-05, "loss": 1.8939, "step": 11160 }, { "epoch": 0.021761651977145885, "grad_norm": 2.5407495498657227, "learning_rate": 2.9964959074544e-05, "loss": 1.8649, "step": 11175 }, { "epoch": 0.021790862248256146, "grad_norm": 2.933225393295288, "learning_rate": 2.9964864978577103e-05, "loss": 1.8987, "step": 11190 }, { "epoch": 0.02182007251936641, "grad_norm": 3.003664493560791, "learning_rate": 2.996477075658897e-05, "loss": 1.936, "step": 11205 }, { "epoch": 0.02184928279047667, "grad_norm": 3.119703531265259, "learning_rate": 2.99646764085804e-05, "loss": 1.7183, "step": 11220 }, { "epoch": 0.021878493061586936, "grad_norm": 2.6627697944641113, "learning_rate": 2.9964581934552182e-05, "loss": 1.9218, "step": 11235 }, { "epoch": 0.0219077033326972, "grad_norm": 4.310539245605469, "learning_rate": 2.9964487334505114e-05, "loss": 2.0666, "step": 11250 }, { "epoch": 0.02193691360380746, "grad_norm": 2.611443519592285, "learning_rate": 2.9964392608439997e-05, "loss": 1.8154, "step": 11265 }, { "epoch": 0.021966123874917726, "grad_norm": 3.391406774520874, "learning_rate": 2.996429775635763e-05, "loss": 1.9308, "step": 11280 }, { "epoch": 0.021995334146027987, "grad_norm": 3.1492297649383545, "learning_rate": 2.9964202778258797e-05, "loss": 1.939, "step": 11295 }, { "epoch": 0.02202454441713825, "grad_norm": 3.680859088897705, "learning_rate": 2.9964107674144313e-05, "loss": 1.9048, "step": 11310 }, { "epoch": 0.022053754688248512, "grad_norm": 2.0522656440734863, "learning_rate": 2.9964012444014972e-05, "loss": 1.9477, "step": 11325 }, { "epoch": 0.022082964959358777, "grad_norm": 3.265316963195801, "learning_rate": 2.996391708787158e-05, "loss": 1.8116, "step": 11340 }, { "epoch": 0.022112175230469037, "grad_norm": 2.0570802688598633, "learning_rate": 2.9963821605714934e-05, "loss": 1.8493, "step": 11355 }, { "epoch": 0.022141385501579302, "grad_norm": 4.1805877685546875, "learning_rate": 2.9963725997545844e-05, "loss": 1.8909, "step": 11370 }, { "epoch": 0.022170595772689566, "grad_norm": 2.6846070289611816, "learning_rate": 2.9963630263365116e-05, "loss": 1.8424, "step": 11385 }, { "epoch": 0.022199806043799827, "grad_norm": 4.480174541473389, "learning_rate": 2.996353440317355e-05, "loss": 1.919, "step": 11400 }, { "epoch": 0.022229016314910092, "grad_norm": 2.179137706756592, "learning_rate": 2.996343841697195e-05, "loss": 1.8023, "step": 11415 }, { "epoch": 0.022258226586020353, "grad_norm": 4.341340065002441, "learning_rate": 2.996334230476114e-05, "loss": 1.8263, "step": 11430 }, { "epoch": 0.022287436857130617, "grad_norm": 1.6666501760482788, "learning_rate": 2.9963246066541913e-05, "loss": 1.8854, "step": 11445 }, { "epoch": 0.022316647128240878, "grad_norm": 2.9793460369110107, "learning_rate": 2.9963149702315093e-05, "loss": 1.9214, "step": 11460 }, { "epoch": 0.022345857399351143, "grad_norm": 4.338296413421631, "learning_rate": 2.996305321208148e-05, "loss": 1.9517, "step": 11475 }, { "epoch": 0.022375067670461407, "grad_norm": 4.449549674987793, "learning_rate": 2.99629565958419e-05, "loss": 2.0761, "step": 11490 }, { "epoch": 0.022404277941571668, "grad_norm": 2.6661598682403564, "learning_rate": 2.9962859853597146e-05, "loss": 1.9029, "step": 11505 }, { "epoch": 0.022433488212681933, "grad_norm": 2.5066778659820557, "learning_rate": 2.996276298534805e-05, "loss": 1.7934, "step": 11520 }, { "epoch": 0.022462698483792193, "grad_norm": 2.2089765071868896, "learning_rate": 2.9962665991095424e-05, "loss": 2.0755, "step": 11535 }, { "epoch": 0.022491908754902458, "grad_norm": 3.543388605117798, "learning_rate": 2.9962568870840078e-05, "loss": 1.8682, "step": 11550 }, { "epoch": 0.02252111902601272, "grad_norm": 3.1596784591674805, "learning_rate": 2.9962471624582838e-05, "loss": 2.0225, "step": 11565 }, { "epoch": 0.022550329297122983, "grad_norm": 3.338447332382202, "learning_rate": 2.9962374252324524e-05, "loss": 1.7248, "step": 11580 }, { "epoch": 0.022579539568233244, "grad_norm": 2.6090617179870605, "learning_rate": 2.996227675406595e-05, "loss": 2.1125, "step": 11595 }, { "epoch": 0.02260874983934351, "grad_norm": 3.618283271789551, "learning_rate": 2.9962179129807936e-05, "loss": 2.0012, "step": 11610 }, { "epoch": 0.022637960110453773, "grad_norm": 2.161893129348755, "learning_rate": 2.996208137955131e-05, "loss": 1.8541, "step": 11625 }, { "epoch": 0.022667170381564034, "grad_norm": 4.789167881011963, "learning_rate": 2.996198350329689e-05, "loss": 1.7935, "step": 11640 }, { "epoch": 0.0226963806526743, "grad_norm": 2.7407031059265137, "learning_rate": 2.9961885501045505e-05, "loss": 1.8944, "step": 11655 }, { "epoch": 0.02272559092378456, "grad_norm": 2.0403008460998535, "learning_rate": 2.9961787372797977e-05, "loss": 1.9245, "step": 11670 }, { "epoch": 0.022754801194894824, "grad_norm": 2.2842493057250977, "learning_rate": 2.996168911855513e-05, "loss": 1.8867, "step": 11685 }, { "epoch": 0.022784011466005085, "grad_norm": 3.4035165309906006, "learning_rate": 2.99615907383178e-05, "loss": 1.8029, "step": 11700 }, { "epoch": 0.02281322173711535, "grad_norm": 4.04712438583374, "learning_rate": 2.996149223208681e-05, "loss": 1.8933, "step": 11715 }, { "epoch": 0.022842432008225614, "grad_norm": 3.2550394535064697, "learning_rate": 2.996139359986299e-05, "loss": 1.8503, "step": 11730 }, { "epoch": 0.022871642279335875, "grad_norm": 2.3266172409057617, "learning_rate": 2.9961294841647164e-05, "loss": 1.939, "step": 11745 }, { "epoch": 0.02290085255044614, "grad_norm": 1.7769925594329834, "learning_rate": 2.9961195957440172e-05, "loss": 2.0939, "step": 11760 }, { "epoch": 0.0229300628215564, "grad_norm": 3.4839985370635986, "learning_rate": 2.9961096947242846e-05, "loss": 1.8933, "step": 11775 }, { "epoch": 0.022959273092666665, "grad_norm": 4.051612377166748, "learning_rate": 2.9960997811056017e-05, "loss": 1.8464, "step": 11790 }, { "epoch": 0.022988483363776926, "grad_norm": 5.2628703117370605, "learning_rate": 2.9960898548880525e-05, "loss": 1.9296, "step": 11805 }, { "epoch": 0.02301769363488719, "grad_norm": 4.6693434715271, "learning_rate": 2.99607991607172e-05, "loss": 2.0126, "step": 11820 }, { "epoch": 0.02304690390599745, "grad_norm": 2.2805821895599365, "learning_rate": 2.996069964656688e-05, "loss": 1.7084, "step": 11835 }, { "epoch": 0.023076114177107716, "grad_norm": 5.130448818206787, "learning_rate": 2.996060000643041e-05, "loss": 1.8437, "step": 11850 }, { "epoch": 0.02310532444821798, "grad_norm": 3.6057186126708984, "learning_rate": 2.9960500240308616e-05, "loss": 2.0847, "step": 11865 }, { "epoch": 0.02313453471932824, "grad_norm": 3.6711442470550537, "learning_rate": 2.9960400348202348e-05, "loss": 1.9817, "step": 11880 }, { "epoch": 0.023163744990438506, "grad_norm": 2.514784097671509, "learning_rate": 2.9960300330112445e-05, "loss": 1.7633, "step": 11895 }, { "epoch": 0.023192955261548766, "grad_norm": 2.1372082233428955, "learning_rate": 2.996020018603975e-05, "loss": 1.7941, "step": 11910 }, { "epoch": 0.02322216553265903, "grad_norm": 2.8542985916137695, "learning_rate": 2.9960099915985104e-05, "loss": 1.7778, "step": 11925 }, { "epoch": 0.023251375803769292, "grad_norm": 2.1213855743408203, "learning_rate": 2.9959999519949354e-05, "loss": 1.8966, "step": 11940 }, { "epoch": 0.023280586074879556, "grad_norm": 2.751647710800171, "learning_rate": 2.995989899793334e-05, "loss": 1.7919, "step": 11955 }, { "epoch": 0.02330979634598982, "grad_norm": 2.2502388954162598, "learning_rate": 2.9959798349937915e-05, "loss": 1.8062, "step": 11970 }, { "epoch": 0.023339006617100082, "grad_norm": 2.5316834449768066, "learning_rate": 2.995969757596392e-05, "loss": 1.9685, "step": 11985 }, { "epoch": 0.023368216888210346, "grad_norm": 2.8897788524627686, "learning_rate": 2.995959667601221e-05, "loss": 1.9659, "step": 12000 }, { "epoch": 0.023397427159320607, "grad_norm": 3.8147826194763184, "learning_rate": 2.9959495650083634e-05, "loss": 1.7452, "step": 12015 }, { "epoch": 0.02342663743043087, "grad_norm": 1.9649070501327515, "learning_rate": 2.9959394498179043e-05, "loss": 1.7987, "step": 12030 }, { "epoch": 0.023455847701541133, "grad_norm": 2.2376818656921387, "learning_rate": 2.9959293220299287e-05, "loss": 1.87, "step": 12045 }, { "epoch": 0.023485057972651397, "grad_norm": 2.3533129692077637, "learning_rate": 2.9959191816445217e-05, "loss": 2.006, "step": 12060 }, { "epoch": 0.023514268243761658, "grad_norm": 2.5106008052825928, "learning_rate": 2.9959090286617686e-05, "loss": 1.8053, "step": 12075 }, { "epoch": 0.023543478514871923, "grad_norm": 2.843824863433838, "learning_rate": 2.9958988630817555e-05, "loss": 2.0062, "step": 12090 }, { "epoch": 0.023572688785982187, "grad_norm": 3.826493501663208, "learning_rate": 2.9958886849045678e-05, "loss": 1.8213, "step": 12105 }, { "epoch": 0.023601899057092448, "grad_norm": 2.669509172439575, "learning_rate": 2.9958784941302908e-05, "loss": 2.0031, "step": 12120 }, { "epoch": 0.023631109328202712, "grad_norm": 5.227283477783203, "learning_rate": 2.995868290759011e-05, "loss": 1.9724, "step": 12135 }, { "epoch": 0.023660319599312973, "grad_norm": 2.67545485496521, "learning_rate": 2.9958580747908134e-05, "loss": 1.7393, "step": 12150 }, { "epoch": 0.023689529870423238, "grad_norm": 4.294065952301025, "learning_rate": 2.9958478462257847e-05, "loss": 2.0262, "step": 12165 }, { "epoch": 0.0237187401415335, "grad_norm": 4.024529933929443, "learning_rate": 2.9958376050640114e-05, "loss": 1.8878, "step": 12180 }, { "epoch": 0.023747950412643763, "grad_norm": 3.88948130607605, "learning_rate": 2.9958273513055785e-05, "loss": 1.9365, "step": 12195 }, { "epoch": 0.023777160683754028, "grad_norm": 2.3247134685516357, "learning_rate": 2.9958170849505736e-05, "loss": 1.9275, "step": 12210 }, { "epoch": 0.02380637095486429, "grad_norm": 2.8689277172088623, "learning_rate": 2.9958068059990827e-05, "loss": 1.9015, "step": 12225 }, { "epoch": 0.023835581225974553, "grad_norm": 2.650768756866455, "learning_rate": 2.995796514451192e-05, "loss": 2.0453, "step": 12240 }, { "epoch": 0.023864791497084814, "grad_norm": 3.2005455493927, "learning_rate": 2.9957862103069886e-05, "loss": 1.8641, "step": 12255 }, { "epoch": 0.02389400176819508, "grad_norm": 3.2274022102355957, "learning_rate": 2.9957758935665592e-05, "loss": 2.0379, "step": 12270 }, { "epoch": 0.02392321203930534, "grad_norm": 2.933011054992676, "learning_rate": 2.9957655642299903e-05, "loss": 2.0304, "step": 12285 }, { "epoch": 0.023952422310415604, "grad_norm": 2.051677942276001, "learning_rate": 2.9957552222973696e-05, "loss": 1.9656, "step": 12300 }, { "epoch": 0.02398163258152587, "grad_norm": 2.488805055618286, "learning_rate": 2.995744867768784e-05, "loss": 1.8997, "step": 12315 }, { "epoch": 0.02401084285263613, "grad_norm": 2.1613032817840576, "learning_rate": 2.99573450064432e-05, "loss": 1.9733, "step": 12330 }, { "epoch": 0.024040053123746394, "grad_norm": 3.817368745803833, "learning_rate": 2.9957241209240656e-05, "loss": 1.74, "step": 12345 }, { "epoch": 0.024069263394856655, "grad_norm": 2.048835515975952, "learning_rate": 2.995713728608108e-05, "loss": 2.0201, "step": 12360 }, { "epoch": 0.02409847366596692, "grad_norm": 3.2480578422546387, "learning_rate": 2.995703323696535e-05, "loss": 1.8652, "step": 12375 }, { "epoch": 0.02412768393707718, "grad_norm": 2.2918214797973633, "learning_rate": 2.9956929061894334e-05, "loss": 1.9089, "step": 12390 }, { "epoch": 0.024156894208187445, "grad_norm": 2.4332637786865234, "learning_rate": 2.995682476086892e-05, "loss": 1.8168, "step": 12405 }, { "epoch": 0.024186104479297706, "grad_norm": 3.9398860931396484, "learning_rate": 2.9956720333889978e-05, "loss": 1.8114, "step": 12420 }, { "epoch": 0.02421531475040797, "grad_norm": 1.3877410888671875, "learning_rate": 2.995661578095839e-05, "loss": 1.7609, "step": 12435 }, { "epoch": 0.024244525021518235, "grad_norm": 3.1245439052581787, "learning_rate": 2.9956511102075043e-05, "loss": 1.9191, "step": 12450 }, { "epoch": 0.024273735292628495, "grad_norm": 4.245307445526123, "learning_rate": 2.9956406297240805e-05, "loss": 1.9551, "step": 12465 }, { "epoch": 0.02430294556373876, "grad_norm": 3.033841371536255, "learning_rate": 2.995630136645657e-05, "loss": 2.063, "step": 12480 }, { "epoch": 0.02433215583484902, "grad_norm": 2.9336037635803223, "learning_rate": 2.9956196309723217e-05, "loss": 1.8383, "step": 12495 }, { "epoch": 0.024361366105959285, "grad_norm": 2.946824312210083, "learning_rate": 2.9956091127041628e-05, "loss": 2.012, "step": 12510 }, { "epoch": 0.024390576377069546, "grad_norm": 1.9165891408920288, "learning_rate": 2.9955985818412695e-05, "loss": 1.8422, "step": 12525 }, { "epoch": 0.02441978664817981, "grad_norm": 2.4142305850982666, "learning_rate": 2.9955880383837304e-05, "loss": 2.027, "step": 12540 }, { "epoch": 0.024448996919290075, "grad_norm": 3.1490895748138428, "learning_rate": 2.9955774823316337e-05, "loss": 1.8485, "step": 12555 }, { "epoch": 0.024478207190400336, "grad_norm": 3.187546968460083, "learning_rate": 2.995566913685069e-05, "loss": 1.8955, "step": 12570 }, { "epoch": 0.0245074174615106, "grad_norm": 4.743402481079102, "learning_rate": 2.9955563324441246e-05, "loss": 1.8438, "step": 12585 }, { "epoch": 0.02453662773262086, "grad_norm": 4.372682094573975, "learning_rate": 2.9955457386088904e-05, "loss": 1.6814, "step": 12600 }, { "epoch": 0.024565838003731126, "grad_norm": 4.228031158447266, "learning_rate": 2.995535132179455e-05, "loss": 1.8358, "step": 12615 }, { "epoch": 0.024595048274841387, "grad_norm": 1.7856305837631226, "learning_rate": 2.9955245131559078e-05, "loss": 1.9314, "step": 12630 }, { "epoch": 0.02462425854595165, "grad_norm": 2.232226610183716, "learning_rate": 2.9955138815383383e-05, "loss": 1.7662, "step": 12645 }, { "epoch": 0.024653468817061912, "grad_norm": 2.812988758087158, "learning_rate": 2.9955032373268366e-05, "loss": 1.8475, "step": 12660 }, { "epoch": 0.024682679088172177, "grad_norm": 4.58977746963501, "learning_rate": 2.995492580521491e-05, "loss": 1.8952, "step": 12675 }, { "epoch": 0.02471188935928244, "grad_norm": 2.619033098220825, "learning_rate": 2.995481911122393e-05, "loss": 1.7343, "step": 12690 }, { "epoch": 0.024741099630392702, "grad_norm": 2.20595645904541, "learning_rate": 2.9954712291296303e-05, "loss": 1.8573, "step": 12705 }, { "epoch": 0.024770309901502967, "grad_norm": 4.261920928955078, "learning_rate": 2.9954605345432948e-05, "loss": 1.9771, "step": 12720 }, { "epoch": 0.024799520172613228, "grad_norm": 4.171009063720703, "learning_rate": 2.995449827363476e-05, "loss": 1.8328, "step": 12735 }, { "epoch": 0.024828730443723492, "grad_norm": 1.9221203327178955, "learning_rate": 2.9954391075902634e-05, "loss": 2.0122, "step": 12750 }, { "epoch": 0.024857940714833753, "grad_norm": 4.801123142242432, "learning_rate": 2.9954283752237478e-05, "loss": 1.9288, "step": 12765 }, { "epoch": 0.024887150985944018, "grad_norm": 5.146918296813965, "learning_rate": 2.99541763026402e-05, "loss": 1.8986, "step": 12780 }, { "epoch": 0.024916361257054282, "grad_norm": 3.8306210041046143, "learning_rate": 2.9954068727111694e-05, "loss": 1.9432, "step": 12795 }, { "epoch": 0.024945571528164543, "grad_norm": 3.569969892501831, "learning_rate": 2.9953961025652875e-05, "loss": 1.802, "step": 12810 }, { "epoch": 0.024974781799274808, "grad_norm": 2.11238956451416, "learning_rate": 2.995385319826465e-05, "loss": 1.7489, "step": 12825 }, { "epoch": 0.02500399207038507, "grad_norm": 2.0586655139923096, "learning_rate": 2.995374524494792e-05, "loss": 1.8308, "step": 12840 }, { "epoch": 0.025033202341495333, "grad_norm": 2.133302688598633, "learning_rate": 2.9953637165703597e-05, "loss": 1.8642, "step": 12855 }, { "epoch": 0.025062412612605594, "grad_norm": 6.014092922210693, "learning_rate": 2.9953528960532594e-05, "loss": 1.838, "step": 12870 }, { "epoch": 0.02509162288371586, "grad_norm": 4.038995742797852, "learning_rate": 2.9953420629435823e-05, "loss": 1.9485, "step": 12885 }, { "epoch": 0.02512083315482612, "grad_norm": 2.016037940979004, "learning_rate": 2.995331217241419e-05, "loss": 1.8532, "step": 12900 }, { "epoch": 0.025150043425936384, "grad_norm": 2.268634796142578, "learning_rate": 2.9953203589468617e-05, "loss": 1.7899, "step": 12915 }, { "epoch": 0.025179253697046648, "grad_norm": 4.676908016204834, "learning_rate": 2.995309488060001e-05, "loss": 1.875, "step": 12930 }, { "epoch": 0.02520846396815691, "grad_norm": 2.0081729888916016, "learning_rate": 2.9952986045809284e-05, "loss": 1.9863, "step": 12945 }, { "epoch": 0.025237674239267174, "grad_norm": 2.9928226470947266, "learning_rate": 2.9952877085097364e-05, "loss": 1.8073, "step": 12960 }, { "epoch": 0.025266884510377435, "grad_norm": 3.514173746109009, "learning_rate": 2.9952767998465164e-05, "loss": 1.849, "step": 12975 }, { "epoch": 0.0252960947814877, "grad_norm": 1.9361449480056763, "learning_rate": 2.99526587859136e-05, "loss": 1.8925, "step": 12990 }, { "epoch": 0.02532530505259796, "grad_norm": 4.396752834320068, "learning_rate": 2.9952549447443595e-05, "loss": 1.8844, "step": 13005 }, { "epoch": 0.025354515323708225, "grad_norm": 2.9122262001037598, "learning_rate": 2.9952439983056066e-05, "loss": 1.9564, "step": 13020 }, { "epoch": 0.02538372559481849, "grad_norm": 2.6632208824157715, "learning_rate": 2.9952330392751935e-05, "loss": 1.8514, "step": 13035 }, { "epoch": 0.02541293586592875, "grad_norm": 2.678126573562622, "learning_rate": 2.995222067653213e-05, "loss": 1.8146, "step": 13050 }, { "epoch": 0.025442146137039014, "grad_norm": 3.901380777359009, "learning_rate": 2.9952110834397572e-05, "loss": 2.0268, "step": 13065 }, { "epoch": 0.025471356408149275, "grad_norm": 2.7066125869750977, "learning_rate": 2.9952000866349185e-05, "loss": 1.8731, "step": 13080 }, { "epoch": 0.02550056667925954, "grad_norm": 4.8974456787109375, "learning_rate": 2.9951890772387897e-05, "loss": 1.908, "step": 13095 }, { "epoch": 0.0255297769503698, "grad_norm": 3.9215996265411377, "learning_rate": 2.995178055251463e-05, "loss": 2.0428, "step": 13110 }, { "epoch": 0.025558987221480065, "grad_norm": 2.519742250442505, "learning_rate": 2.9951670206730318e-05, "loss": 1.7794, "step": 13125 }, { "epoch": 0.025588197492590326, "grad_norm": 2.1564877033233643, "learning_rate": 2.995155973503589e-05, "loss": 1.6714, "step": 13140 }, { "epoch": 0.02561740776370059, "grad_norm": 4.537586688995361, "learning_rate": 2.9951449137432275e-05, "loss": 2.0812, "step": 13155 }, { "epoch": 0.025646618034810855, "grad_norm": 3.659740686416626, "learning_rate": 2.9951338413920403e-05, "loss": 1.995, "step": 13170 }, { "epoch": 0.025675828305921116, "grad_norm": 2.057332992553711, "learning_rate": 2.9951227564501207e-05, "loss": 1.8206, "step": 13185 }, { "epoch": 0.02570503857703138, "grad_norm": 3.4197821617126465, "learning_rate": 2.995111658917562e-05, "loss": 1.9727, "step": 13200 }, { "epoch": 0.02573424884814164, "grad_norm": 2.212815761566162, "learning_rate": 2.995100548794458e-05, "loss": 1.959, "step": 13215 }, { "epoch": 0.025763459119251906, "grad_norm": 2.1396286487579346, "learning_rate": 2.9950894260809015e-05, "loss": 1.9048, "step": 13230 }, { "epoch": 0.025792669390362167, "grad_norm": 2.823305606842041, "learning_rate": 2.995078290776987e-05, "loss": 1.9381, "step": 13245 }, { "epoch": 0.02582187966147243, "grad_norm": 3.0127432346343994, "learning_rate": 2.9950671428828083e-05, "loss": 1.959, "step": 13260 }, { "epoch": 0.025851089932582696, "grad_norm": 2.1536638736724854, "learning_rate": 2.9950559823984583e-05, "loss": 1.7713, "step": 13275 }, { "epoch": 0.025880300203692957, "grad_norm": 2.784940242767334, "learning_rate": 2.9950448093240318e-05, "loss": 1.8586, "step": 13290 }, { "epoch": 0.02590951047480322, "grad_norm": 2.103855848312378, "learning_rate": 2.9950336236596226e-05, "loss": 1.8167, "step": 13305 }, { "epoch": 0.025938720745913482, "grad_norm": 2.465585947036743, "learning_rate": 2.9950224254053254e-05, "loss": 1.9863, "step": 13320 }, { "epoch": 0.025967931017023747, "grad_norm": 3.2060279846191406, "learning_rate": 2.9950112145612335e-05, "loss": 1.838, "step": 13335 }, { "epoch": 0.025997141288134008, "grad_norm": 3.0637335777282715, "learning_rate": 2.9949999911274427e-05, "loss": 1.9325, "step": 13350 }, { "epoch": 0.026026351559244272, "grad_norm": 2.8039963245391846, "learning_rate": 2.994988755104046e-05, "loss": 1.8671, "step": 13365 }, { "epoch": 0.026055561830354537, "grad_norm": 4.239367485046387, "learning_rate": 2.9949775064911388e-05, "loss": 1.9305, "step": 13380 }, { "epoch": 0.026084772101464797, "grad_norm": 4.794821262359619, "learning_rate": 2.994966245288816e-05, "loss": 1.8946, "step": 13395 }, { "epoch": 0.026113982372575062, "grad_norm": 3.5099828243255615, "learning_rate": 2.994954971497172e-05, "loss": 1.919, "step": 13410 }, { "epoch": 0.026143192643685323, "grad_norm": 4.3362603187561035, "learning_rate": 2.994943685116302e-05, "loss": 1.9266, "step": 13425 }, { "epoch": 0.026172402914795587, "grad_norm": 4.010773658752441, "learning_rate": 2.994932386146301e-05, "loss": 1.8712, "step": 13440 }, { "epoch": 0.02620161318590585, "grad_norm": 2.9033546447753906, "learning_rate": 2.9949210745872638e-05, "loss": 1.8179, "step": 13455 }, { "epoch": 0.026230823457016113, "grad_norm": 2.215955972671509, "learning_rate": 2.9949097504392866e-05, "loss": 1.8573, "step": 13470 }, { "epoch": 0.026260033728126374, "grad_norm": 4.64263391494751, "learning_rate": 2.994898413702464e-05, "loss": 1.8504, "step": 13485 }, { "epoch": 0.026289243999236638, "grad_norm": 2.8851144313812256, "learning_rate": 2.9948870643768915e-05, "loss": 1.7891, "step": 13500 }, { "epoch": 0.026318454270346903, "grad_norm": 5.705179214477539, "learning_rate": 2.9948757024626645e-05, "loss": 1.8502, "step": 13515 }, { "epoch": 0.026347664541457164, "grad_norm": 4.427610397338867, "learning_rate": 2.994864327959879e-05, "loss": 1.746, "step": 13530 }, { "epoch": 0.026376874812567428, "grad_norm": 2.9682793617248535, "learning_rate": 2.994852940868631e-05, "loss": 1.8766, "step": 13545 }, { "epoch": 0.02640608508367769, "grad_norm": 2.8406543731689453, "learning_rate": 2.9948415411890164e-05, "loss": 1.8637, "step": 13560 }, { "epoch": 0.026435295354787954, "grad_norm": 2.9661149978637695, "learning_rate": 2.9948301289211308e-05, "loss": 1.7703, "step": 13575 }, { "epoch": 0.026464505625898214, "grad_norm": 2.961155652999878, "learning_rate": 2.99481870406507e-05, "loss": 1.8123, "step": 13590 }, { "epoch": 0.02649371589700848, "grad_norm": 3.7241668701171875, "learning_rate": 2.9948072666209308e-05, "loss": 2.018, "step": 13605 }, { "epoch": 0.026522926168118743, "grad_norm": 2.8102498054504395, "learning_rate": 2.9947958165888096e-05, "loss": 1.8577, "step": 13620 }, { "epoch": 0.026552136439229004, "grad_norm": 2.061007022857666, "learning_rate": 2.9947843539688027e-05, "loss": 1.9684, "step": 13635 }, { "epoch": 0.02658134671033927, "grad_norm": 4.699859619140625, "learning_rate": 2.994772878761006e-05, "loss": 1.9438, "step": 13650 }, { "epoch": 0.02661055698144953, "grad_norm": 5.8805952072143555, "learning_rate": 2.994761390965517e-05, "loss": 1.8862, "step": 13665 }, { "epoch": 0.026639767252559794, "grad_norm": 3.6178531646728516, "learning_rate": 2.994749890582432e-05, "loss": 1.9754, "step": 13680 }, { "epoch": 0.026668977523670055, "grad_norm": 2.891448497772217, "learning_rate": 2.9947383776118482e-05, "loss": 1.8838, "step": 13695 }, { "epoch": 0.02669818779478032, "grad_norm": 2.5380797386169434, "learning_rate": 2.994726852053862e-05, "loss": 2.0006, "step": 13710 }, { "epoch": 0.02672739806589058, "grad_norm": 3.083801031112671, "learning_rate": 2.994715313908571e-05, "loss": 1.9287, "step": 13725 }, { "epoch": 0.026756608337000845, "grad_norm": 3.9220306873321533, "learning_rate": 2.9947037631760717e-05, "loss": 2.0063, "step": 13740 }, { "epoch": 0.02678581860811111, "grad_norm": 2.41329288482666, "learning_rate": 2.994692199856462e-05, "loss": 1.7779, "step": 13755 }, { "epoch": 0.02681502887922137, "grad_norm": 3.137281656265259, "learning_rate": 2.9946806239498392e-05, "loss": 1.7686, "step": 13770 }, { "epoch": 0.026844239150331635, "grad_norm": 3.8897507190704346, "learning_rate": 2.994669035456301e-05, "loss": 1.9879, "step": 13785 }, { "epoch": 0.026873449421441896, "grad_norm": 2.888145685195923, "learning_rate": 2.994657434375944e-05, "loss": 2.0012, "step": 13800 }, { "epoch": 0.02690265969255216, "grad_norm": 2.683145523071289, "learning_rate": 2.9946458207088667e-05, "loss": 1.8579, "step": 13815 }, { "epoch": 0.02693186996366242, "grad_norm": 2.5023186206817627, "learning_rate": 2.9946341944551668e-05, "loss": 1.8899, "step": 13830 }, { "epoch": 0.026961080234772686, "grad_norm": 4.522122383117676, "learning_rate": 2.994622555614942e-05, "loss": 1.8373, "step": 13845 }, { "epoch": 0.02699029050588295, "grad_norm": 1.9197810888290405, "learning_rate": 2.9946109041882902e-05, "loss": 1.874, "step": 13860 }, { "epoch": 0.02701950077699321, "grad_norm": 2.0907135009765625, "learning_rate": 2.9945992401753103e-05, "loss": 1.9878, "step": 13875 }, { "epoch": 0.027048711048103476, "grad_norm": 3.0691592693328857, "learning_rate": 2.9945875635761e-05, "loss": 1.8859, "step": 13890 }, { "epoch": 0.027077921319213737, "grad_norm": 2.0707552433013916, "learning_rate": 2.9945758743907573e-05, "loss": 1.7612, "step": 13905 }, { "epoch": 0.027107131590324, "grad_norm": 2.2770462036132812, "learning_rate": 2.994564172619381e-05, "loss": 1.8028, "step": 13920 }, { "epoch": 0.027136341861434262, "grad_norm": 2.681814193725586, "learning_rate": 2.9945524582620695e-05, "loss": 1.7967, "step": 13935 }, { "epoch": 0.027165552132544526, "grad_norm": 3.0529186725616455, "learning_rate": 2.994540731318922e-05, "loss": 1.7972, "step": 13950 }, { "epoch": 0.027194762403654787, "grad_norm": 3.369091033935547, "learning_rate": 2.9945289917900368e-05, "loss": 1.8092, "step": 13965 }, { "epoch": 0.027223972674765052, "grad_norm": 2.190134048461914, "learning_rate": 2.9945172396755124e-05, "loss": 2.0228, "step": 13980 }, { "epoch": 0.027253182945875316, "grad_norm": 2.805100202560425, "learning_rate": 2.9945054749754483e-05, "loss": 1.9312, "step": 13995 }, { "epoch": 0.027282393216985577, "grad_norm": 2.195697546005249, "learning_rate": 2.9944936976899433e-05, "loss": 1.9791, "step": 14010 }, { "epoch": 0.027311603488095842, "grad_norm": 1.723713755607605, "learning_rate": 2.9944819078190967e-05, "loss": 1.8542, "step": 14025 }, { "epoch": 0.027340813759206103, "grad_norm": 2.633101463317871, "learning_rate": 2.9944701053630075e-05, "loss": 1.8127, "step": 14040 }, { "epoch": 0.027370024030316367, "grad_norm": 1.9390171766281128, "learning_rate": 2.9944582903217756e-05, "loss": 1.9183, "step": 14055 }, { "epoch": 0.027399234301426628, "grad_norm": 3.9491968154907227, "learning_rate": 2.9944464626955003e-05, "loss": 2.0849, "step": 14070 }, { "epoch": 0.027428444572536893, "grad_norm": 2.4679179191589355, "learning_rate": 2.9944346224842812e-05, "loss": 1.9285, "step": 14085 }, { "epoch": 0.027457654843647157, "grad_norm": 2.999509334564209, "learning_rate": 2.994422769688218e-05, "loss": 1.9523, "step": 14100 }, { "epoch": 0.027486865114757418, "grad_norm": 3.8798091411590576, "learning_rate": 2.9944109043074104e-05, "loss": 1.9014, "step": 14115 }, { "epoch": 0.027516075385867683, "grad_norm": 2.5288240909576416, "learning_rate": 2.9943990263419582e-05, "loss": 2.1135, "step": 14130 }, { "epoch": 0.027545285656977943, "grad_norm": 2.2120304107666016, "learning_rate": 2.994387135791962e-05, "loss": 1.7418, "step": 14145 }, { "epoch": 0.027574495928088208, "grad_norm": 2.805328607559204, "learning_rate": 2.994375232657521e-05, "loss": 1.8776, "step": 14160 }, { "epoch": 0.02760370619919847, "grad_norm": 2.8841097354888916, "learning_rate": 2.9943633169387365e-05, "loss": 1.9106, "step": 14175 }, { "epoch": 0.027632916470308733, "grad_norm": 1.8887025117874146, "learning_rate": 2.994351388635708e-05, "loss": 1.8916, "step": 14190 }, { "epoch": 0.027662126741418994, "grad_norm": 2.8623757362365723, "learning_rate": 2.9943394477485363e-05, "loss": 1.8735, "step": 14205 }, { "epoch": 0.02769133701252926, "grad_norm": 3.1046249866485596, "learning_rate": 2.994327494277322e-05, "loss": 1.9124, "step": 14220 }, { "epoch": 0.027720547283639523, "grad_norm": 2.653933525085449, "learning_rate": 2.9943155282221663e-05, "loss": 1.9387, "step": 14235 }, { "epoch": 0.027749757554749784, "grad_norm": 2.975820779800415, "learning_rate": 2.9943035495831688e-05, "loss": 1.8232, "step": 14250 }, { "epoch": 0.02777896782586005, "grad_norm": 5.906015396118164, "learning_rate": 2.9942915583604307e-05, "loss": 1.9167, "step": 14265 }, { "epoch": 0.02780817809697031, "grad_norm": 2.592456102371216, "learning_rate": 2.994279554554054e-05, "loss": 1.7433, "step": 14280 }, { "epoch": 0.027837388368080574, "grad_norm": 5.042680263519287, "learning_rate": 2.994267538164138e-05, "loss": 1.7878, "step": 14295 }, { "epoch": 0.027866598639190835, "grad_norm": 4.092184066772461, "learning_rate": 2.9942555091907853e-05, "loss": 1.6955, "step": 14310 }, { "epoch": 0.0278958089103011, "grad_norm": 4.623755931854248, "learning_rate": 2.994243467634097e-05, "loss": 1.7866, "step": 14325 }, { "epoch": 0.027925019181411364, "grad_norm": 3.042306661605835, "learning_rate": 2.994231413494174e-05, "loss": 1.8891, "step": 14340 }, { "epoch": 0.027954229452521625, "grad_norm": 2.784275531768799, "learning_rate": 2.9942193467711184e-05, "loss": 2.0112, "step": 14355 }, { "epoch": 0.02798343972363189, "grad_norm": 1.9308695793151855, "learning_rate": 2.9942072674650317e-05, "loss": 1.7964, "step": 14370 }, { "epoch": 0.02801264999474215, "grad_norm": 3.7377004623413086, "learning_rate": 2.994195175576015e-05, "loss": 1.8661, "step": 14385 }, { "epoch": 0.028041860265852415, "grad_norm": 2.484870195388794, "learning_rate": 2.994183071104171e-05, "loss": 1.7358, "step": 14400 }, { "epoch": 0.028071070536962676, "grad_norm": 2.6344974040985107, "learning_rate": 2.9941709540496013e-05, "loss": 1.9183, "step": 14415 }, { "epoch": 0.02810028080807294, "grad_norm": 2.168701410293579, "learning_rate": 2.9941588244124072e-05, "loss": 1.999, "step": 14430 }, { "epoch": 0.028129491079183205, "grad_norm": 2.986727476119995, "learning_rate": 2.994146682192692e-05, "loss": 1.8344, "step": 14445 }, { "epoch": 0.028158701350293466, "grad_norm": 3.3715713024139404, "learning_rate": 2.9941345273905573e-05, "loss": 2.0468, "step": 14460 }, { "epoch": 0.02818791162140373, "grad_norm": 2.2077038288116455, "learning_rate": 2.9941223600061054e-05, "loss": 2.0255, "step": 14475 }, { "epoch": 0.02821712189251399, "grad_norm": 3.4651224613189697, "learning_rate": 2.994110180039439e-05, "loss": 1.8604, "step": 14490 }, { "epoch": 0.028246332163624256, "grad_norm": 2.0584287643432617, "learning_rate": 2.994097987490661e-05, "loss": 2.0636, "step": 14505 }, { "epoch": 0.028275542434734516, "grad_norm": 3.1285014152526855, "learning_rate": 2.9940857823598736e-05, "loss": 1.7656, "step": 14520 }, { "epoch": 0.02830475270584478, "grad_norm": 2.541280746459961, "learning_rate": 2.9940735646471793e-05, "loss": 1.8682, "step": 14535 }, { "epoch": 0.028333962976955042, "grad_norm": 2.4748847484588623, "learning_rate": 2.9940613343526817e-05, "loss": 2.0047, "step": 14550 }, { "epoch": 0.028363173248065306, "grad_norm": 3.2105560302734375, "learning_rate": 2.9940490914764834e-05, "loss": 1.7709, "step": 14565 }, { "epoch": 0.02839238351917557, "grad_norm": 3.491591215133667, "learning_rate": 2.9940368360186878e-05, "loss": 1.786, "step": 14580 }, { "epoch": 0.028421593790285832, "grad_norm": 3.315342903137207, "learning_rate": 2.9940245679793978e-05, "loss": 1.917, "step": 14595 }, { "epoch": 0.028450804061396096, "grad_norm": 1.7594997882843018, "learning_rate": 2.9940122873587164e-05, "loss": 1.877, "step": 14610 }, { "epoch": 0.028480014332506357, "grad_norm": 2.442725896835327, "learning_rate": 2.9939999941567474e-05, "loss": 1.9577, "step": 14625 }, { "epoch": 0.02850922460361662, "grad_norm": 3.146977663040161, "learning_rate": 2.993987688373595e-05, "loss": 1.8722, "step": 14640 }, { "epoch": 0.028538434874726883, "grad_norm": 2.6678929328918457, "learning_rate": 2.9939753700093618e-05, "loss": 1.659, "step": 14655 }, { "epoch": 0.028567645145837147, "grad_norm": 2.4769906997680664, "learning_rate": 2.9939630390641518e-05, "loss": 1.8257, "step": 14670 }, { "epoch": 0.02859685541694741, "grad_norm": 2.9314770698547363, "learning_rate": 2.993950695538069e-05, "loss": 1.8004, "step": 14685 }, { "epoch": 0.028626065688057672, "grad_norm": 3.2279980182647705, "learning_rate": 2.993938339431217e-05, "loss": 1.9438, "step": 14700 }, { "epoch": 0.028655275959167937, "grad_norm": 2.8929495811462402, "learning_rate": 2.9939259707437002e-05, "loss": 1.7995, "step": 14715 }, { "epoch": 0.028684486230278198, "grad_norm": 4.861998558044434, "learning_rate": 2.9939135894756232e-05, "loss": 1.8188, "step": 14730 }, { "epoch": 0.028713696501388462, "grad_norm": 5.37394905090332, "learning_rate": 2.9939011956270893e-05, "loss": 2.061, "step": 14745 }, { "epoch": 0.028742906772498723, "grad_norm": 2.2253520488739014, "learning_rate": 2.9938887891982035e-05, "loss": 1.963, "step": 14760 }, { "epoch": 0.028772117043608988, "grad_norm": 3.424954414367676, "learning_rate": 2.99387637018907e-05, "loss": 2.0077, "step": 14775 }, { "epoch": 0.02880132731471925, "grad_norm": 2.8398706912994385, "learning_rate": 2.9938639385997934e-05, "loss": 2.0516, "step": 14790 }, { "epoch": 0.028830537585829513, "grad_norm": 2.371492385864258, "learning_rate": 2.9938514944304788e-05, "loss": 1.8057, "step": 14805 }, { "epoch": 0.028859747856939778, "grad_norm": 1.870301365852356, "learning_rate": 2.9938390376812304e-05, "loss": 1.8335, "step": 14820 }, { "epoch": 0.02888895812805004, "grad_norm": 3.1508800983428955, "learning_rate": 2.9938265683521533e-05, "loss": 2.0272, "step": 14835 }, { "epoch": 0.028918168399160303, "grad_norm": 2.8456640243530273, "learning_rate": 2.9938140864433528e-05, "loss": 1.889, "step": 14850 }, { "epoch": 0.028947378670270564, "grad_norm": 2.3040804862976074, "learning_rate": 2.9938015919549337e-05, "loss": 1.9274, "step": 14865 }, { "epoch": 0.02897658894138083, "grad_norm": 3.075559139251709, "learning_rate": 2.9937890848870012e-05, "loss": 1.9239, "step": 14880 }, { "epoch": 0.02900579921249109, "grad_norm": 2.4840190410614014, "learning_rate": 2.9937765652396608e-05, "loss": 1.9836, "step": 14895 }, { "epoch": 0.029035009483601354, "grad_norm": 2.915515422821045, "learning_rate": 2.9937640330130182e-05, "loss": 2.0196, "step": 14910 }, { "epoch": 0.02906421975471162, "grad_norm": 2.684401035308838, "learning_rate": 2.993751488207178e-05, "loss": 1.9699, "step": 14925 }, { "epoch": 0.02909343002582188, "grad_norm": 1.7906841039657593, "learning_rate": 2.9937389308222468e-05, "loss": 1.9435, "step": 14940 }, { "epoch": 0.029122640296932144, "grad_norm": 2.7629384994506836, "learning_rate": 2.9937263608583297e-05, "loss": 1.9266, "step": 14955 }, { "epoch": 0.029151850568042405, "grad_norm": 3.65447735786438, "learning_rate": 2.9937137783155326e-05, "loss": 1.8818, "step": 14970 }, { "epoch": 0.02918106083915267, "grad_norm": 2.684885025024414, "learning_rate": 2.993701183193962e-05, "loss": 1.744, "step": 14985 }, { "epoch": 0.02921027111026293, "grad_norm": 2.995678424835205, "learning_rate": 2.9936885754937237e-05, "loss": 1.868, "step": 15000 }, { "epoch": 0.029239481381373195, "grad_norm": 1.9314979314804077, "learning_rate": 2.993675955214924e-05, "loss": 1.8982, "step": 15015 }, { "epoch": 0.029268691652483456, "grad_norm": 3.8655660152435303, "learning_rate": 2.993663322357669e-05, "loss": 2.103, "step": 15030 }, { "epoch": 0.02929790192359372, "grad_norm": 3.2133545875549316, "learning_rate": 2.993650676922065e-05, "loss": 1.867, "step": 15045 }, { "epoch": 0.029327112194703985, "grad_norm": 1.7590382099151611, "learning_rate": 2.9936380189082184e-05, "loss": 1.9596, "step": 15060 }, { "epoch": 0.029356322465814245, "grad_norm": 3.5091662406921387, "learning_rate": 2.993625348316236e-05, "loss": 2.0665, "step": 15075 }, { "epoch": 0.02938553273692451, "grad_norm": 2.493689775466919, "learning_rate": 2.9936126651462246e-05, "loss": 1.8168, "step": 15090 }, { "epoch": 0.02941474300803477, "grad_norm": 2.158205509185791, "learning_rate": 2.993599969398291e-05, "loss": 1.9542, "step": 15105 }, { "epoch": 0.029443953279145035, "grad_norm": 1.6943566799163818, "learning_rate": 2.9935872610725415e-05, "loss": 1.9762, "step": 15120 }, { "epoch": 0.029473163550255296, "grad_norm": 4.148080825805664, "learning_rate": 2.993574540169084e-05, "loss": 1.8338, "step": 15135 }, { "epoch": 0.02950237382136556, "grad_norm": 3.9066367149353027, "learning_rate": 2.993561806688025e-05, "loss": 1.875, "step": 15150 }, { "epoch": 0.029531584092475825, "grad_norm": 4.32364559173584, "learning_rate": 2.9935490606294726e-05, "loss": 1.7329, "step": 15165 }, { "epoch": 0.029560794363586086, "grad_norm": 3.791557788848877, "learning_rate": 2.9935363019935327e-05, "loss": 1.8515, "step": 15180 }, { "epoch": 0.02959000463469635, "grad_norm": 2.274827241897583, "learning_rate": 2.9935235307803137e-05, "loss": 1.7744, "step": 15195 }, { "epoch": 0.02961921490580661, "grad_norm": 3.765348196029663, "learning_rate": 2.9935107469899235e-05, "loss": 1.7427, "step": 15210 }, { "epoch": 0.029648425176916876, "grad_norm": 1.8818484544754028, "learning_rate": 2.9934979506224687e-05, "loss": 1.912, "step": 15225 }, { "epoch": 0.029677635448027137, "grad_norm": 3.2676870822906494, "learning_rate": 2.993485141678058e-05, "loss": 2.022, "step": 15240 }, { "epoch": 0.0297068457191374, "grad_norm": 3.329846143722534, "learning_rate": 2.9934723201567986e-05, "loss": 1.9317, "step": 15255 }, { "epoch": 0.029736055990247662, "grad_norm": 4.478529453277588, "learning_rate": 2.993459486058799e-05, "loss": 1.8172, "step": 15270 }, { "epoch": 0.029765266261357927, "grad_norm": 5.912947654724121, "learning_rate": 2.9934466393841667e-05, "loss": 1.8828, "step": 15285 }, { "epoch": 0.02979447653246819, "grad_norm": 3.079585552215576, "learning_rate": 2.9934337801330102e-05, "loss": 1.946, "step": 15300 }, { "epoch": 0.029823686803578452, "grad_norm": 2.770911455154419, "learning_rate": 2.993420908305438e-05, "loss": 1.884, "step": 15315 }, { "epoch": 0.029852897074688717, "grad_norm": 2.8034188747406006, "learning_rate": 2.993408023901558e-05, "loss": 1.7856, "step": 15330 }, { "epoch": 0.029882107345798978, "grad_norm": 4.039456367492676, "learning_rate": 2.9933951269214793e-05, "loss": 1.9302, "step": 15345 }, { "epoch": 0.029911317616909242, "grad_norm": 3.329083204269409, "learning_rate": 2.99338221736531e-05, "loss": 1.9541, "step": 15360 }, { "epoch": 0.029940527888019503, "grad_norm": 2.9838755130767822, "learning_rate": 2.9933692952331593e-05, "loss": 1.8633, "step": 15375 }, { "epoch": 0.029969738159129768, "grad_norm": 3.2724435329437256, "learning_rate": 2.9933563605251356e-05, "loss": 1.9572, "step": 15390 }, { "epoch": 0.029998948430240032, "grad_norm": 1.9584968090057373, "learning_rate": 2.993343413241348e-05, "loss": 1.7946, "step": 15405 }, { "epoch": 0.030028158701350293, "grad_norm": 3.7978711128234863, "learning_rate": 2.9933304533819053e-05, "loss": 1.7179, "step": 15420 }, { "epoch": 0.030057368972460557, "grad_norm": 3.708948850631714, "learning_rate": 2.993317480946917e-05, "loss": 1.7777, "step": 15435 }, { "epoch": 0.03008657924357082, "grad_norm": 2.204556941986084, "learning_rate": 2.993304495936492e-05, "loss": 2.0804, "step": 15450 }, { "epoch": 0.030115789514681083, "grad_norm": 4.719995021820068, "learning_rate": 2.9932914983507398e-05, "loss": 1.8505, "step": 15465 }, { "epoch": 0.030144999785791344, "grad_norm": 2.958409547805786, "learning_rate": 2.9932784881897703e-05, "loss": 1.8833, "step": 15480 }, { "epoch": 0.03017421005690161, "grad_norm": 2.398984909057617, "learning_rate": 2.993265465453692e-05, "loss": 1.6463, "step": 15495 }, { "epoch": 0.03020342032801187, "grad_norm": 2.104973077774048, "learning_rate": 2.9932524301426155e-05, "loss": 1.7249, "step": 15510 }, { "epoch": 0.030232630599122134, "grad_norm": 5.241917610168457, "learning_rate": 2.99323938225665e-05, "loss": 1.7852, "step": 15525 }, { "epoch": 0.030261840870232398, "grad_norm": 2.9521865844726562, "learning_rate": 2.9932263217959064e-05, "loss": 1.8941, "step": 15540 }, { "epoch": 0.03029105114134266, "grad_norm": 3.14367413520813, "learning_rate": 2.9932132487604936e-05, "loss": 1.9962, "step": 15555 }, { "epoch": 0.030320261412452924, "grad_norm": 1.763987421989441, "learning_rate": 2.9932001631505217e-05, "loss": 1.9057, "step": 15570 }, { "epoch": 0.030349471683563185, "grad_norm": 4.6951446533203125, "learning_rate": 2.993187064966101e-05, "loss": 1.8781, "step": 15585 }, { "epoch": 0.03037868195467345, "grad_norm": 2.424650192260742, "learning_rate": 2.993173954207343e-05, "loss": 1.8808, "step": 15600 }, { "epoch": 0.03040789222578371, "grad_norm": 2.743579387664795, "learning_rate": 2.9931608308743562e-05, "loss": 1.899, "step": 15615 }, { "epoch": 0.030437102496893974, "grad_norm": 4.443767070770264, "learning_rate": 2.9931476949672524e-05, "loss": 1.8727, "step": 15630 }, { "epoch": 0.03046631276800424, "grad_norm": 2.6599271297454834, "learning_rate": 2.9931345464861418e-05, "loss": 1.8466, "step": 15645 }, { "epoch": 0.0304955230391145, "grad_norm": 2.598816394805908, "learning_rate": 2.993121385431135e-05, "loss": 2.0863, "step": 15660 }, { "epoch": 0.030524733310224764, "grad_norm": 3.126720666885376, "learning_rate": 2.9931082118023432e-05, "loss": 1.7731, "step": 15675 }, { "epoch": 0.030553943581335025, "grad_norm": 3.7353076934814453, "learning_rate": 2.9930950255998773e-05, "loss": 2.1104, "step": 15690 }, { "epoch": 0.03058315385244529, "grad_norm": 4.932044982910156, "learning_rate": 2.9930818268238483e-05, "loss": 1.8693, "step": 15705 }, { "epoch": 0.03061236412355555, "grad_norm": 2.6772072315216064, "learning_rate": 2.9930686154743666e-05, "loss": 1.8159, "step": 15720 }, { "epoch": 0.030641574394665815, "grad_norm": 3.73004412651062, "learning_rate": 2.9930553915515445e-05, "loss": 1.6705, "step": 15735 }, { "epoch": 0.03067078466577608, "grad_norm": 2.2986795902252197, "learning_rate": 2.993042155055493e-05, "loss": 2.05, "step": 15750 }, { "epoch": 0.03069999493688634, "grad_norm": 2.9536030292510986, "learning_rate": 2.9930289059863234e-05, "loss": 1.9297, "step": 15765 }, { "epoch": 0.030729205207996605, "grad_norm": 1.8823219537734985, "learning_rate": 2.9930156443441477e-05, "loss": 1.9812, "step": 15780 }, { "epoch": 0.030758415479106866, "grad_norm": 2.553921699523926, "learning_rate": 2.993002370129077e-05, "loss": 2.0723, "step": 15795 }, { "epoch": 0.03078762575021713, "grad_norm": 2.391080617904663, "learning_rate": 2.9929890833412233e-05, "loss": 1.8342, "step": 15810 }, { "epoch": 0.03081683602132739, "grad_norm": 1.898431420326233, "learning_rate": 2.9929757839806985e-05, "loss": 1.8282, "step": 15825 }, { "epoch": 0.030846046292437656, "grad_norm": 3.6443545818328857, "learning_rate": 2.9929624720476153e-05, "loss": 1.7823, "step": 15840 }, { "epoch": 0.030875256563547917, "grad_norm": 2.1495354175567627, "learning_rate": 2.9929491475420844e-05, "loss": 1.9565, "step": 15855 }, { "epoch": 0.03090446683465818, "grad_norm": 4.317218780517578, "learning_rate": 2.992935810464219e-05, "loss": 1.9516, "step": 15870 }, { "epoch": 0.030933677105768446, "grad_norm": 2.2863664627075195, "learning_rate": 2.992922460814131e-05, "loss": 1.9399, "step": 15885 }, { "epoch": 0.030962887376878707, "grad_norm": 3.2142059803009033, "learning_rate": 2.9929090985919334e-05, "loss": 2.0122, "step": 15900 }, { "epoch": 0.03099209764798897, "grad_norm": 3.0333306789398193, "learning_rate": 2.992895723797738e-05, "loss": 1.7733, "step": 15915 }, { "epoch": 0.031021307919099232, "grad_norm": 2.6775388717651367, "learning_rate": 2.9928823364316575e-05, "loss": 1.8205, "step": 15930 }, { "epoch": 0.031050518190209497, "grad_norm": 2.1693944931030273, "learning_rate": 2.9928689364938057e-05, "loss": 1.895, "step": 15945 }, { "epoch": 0.031079728461319758, "grad_norm": 2.754948377609253, "learning_rate": 2.992855523984294e-05, "loss": 1.8623, "step": 15960 }, { "epoch": 0.031108938732430022, "grad_norm": 2.659349203109741, "learning_rate": 2.9928420989032357e-05, "loss": 1.9873, "step": 15975 }, { "epoch": 0.031138149003540287, "grad_norm": 3.5560247898101807, "learning_rate": 2.9928286612507445e-05, "loss": 1.9364, "step": 15990 }, { "epoch": 0.031167359274650547, "grad_norm": 4.824398994445801, "learning_rate": 2.9928152110269335e-05, "loss": 1.8708, "step": 16005 }, { "epoch": 0.031196569545760812, "grad_norm": 2.5657832622528076, "learning_rate": 2.992801748231915e-05, "loss": 2.0086, "step": 16020 }, { "epoch": 0.031225779816871073, "grad_norm": 5.117823123931885, "learning_rate": 2.9927882728658036e-05, "loss": 1.7608, "step": 16035 }, { "epoch": 0.031254990087981334, "grad_norm": 1.9517539739608765, "learning_rate": 2.992774784928712e-05, "loss": 1.9367, "step": 16050 }, { "epoch": 0.0312842003590916, "grad_norm": 3.202143669128418, "learning_rate": 2.9927612844207537e-05, "loss": 1.8725, "step": 16065 }, { "epoch": 0.03131341063020186, "grad_norm": 2.5217974185943604, "learning_rate": 2.992747771342043e-05, "loss": 1.9029, "step": 16080 }, { "epoch": 0.031342620901312124, "grad_norm": 1.6550703048706055, "learning_rate": 2.992734245692693e-05, "loss": 1.9016, "step": 16095 }, { "epoch": 0.03137183117242239, "grad_norm": 2.7231898307800293, "learning_rate": 2.9927207074728187e-05, "loss": 1.9735, "step": 16110 }, { "epoch": 0.03140104144353265, "grad_norm": 3.855612277984619, "learning_rate": 2.9927071566825328e-05, "loss": 1.7641, "step": 16125 }, { "epoch": 0.031430251714642914, "grad_norm": 3.016885757446289, "learning_rate": 2.99269359332195e-05, "loss": 1.9789, "step": 16140 }, { "epoch": 0.031459461985753175, "grad_norm": 2.0709967613220215, "learning_rate": 2.9926800173911845e-05, "loss": 1.922, "step": 16155 }, { "epoch": 0.03148867225686344, "grad_norm": 5.23029088973999, "learning_rate": 2.992666428890351e-05, "loss": 1.9681, "step": 16170 }, { "epoch": 0.031517882527973703, "grad_norm": 2.7961294651031494, "learning_rate": 2.9926528278195634e-05, "loss": 1.7172, "step": 16185 }, { "epoch": 0.031547092799083964, "grad_norm": 2.5741660594940186, "learning_rate": 2.992639214178936e-05, "loss": 1.7638, "step": 16200 }, { "epoch": 0.031576303070194225, "grad_norm": 1.9811028242111206, "learning_rate": 2.9926255879685846e-05, "loss": 2.0028, "step": 16215 }, { "epoch": 0.03160551334130449, "grad_norm": 4.054990768432617, "learning_rate": 2.992611949188623e-05, "loss": 1.9305, "step": 16230 }, { "epoch": 0.031634723612414754, "grad_norm": 2.189107894897461, "learning_rate": 2.992598297839166e-05, "loss": 1.7656, "step": 16245 }, { "epoch": 0.031663933883525015, "grad_norm": 2.2795944213867188, "learning_rate": 2.9925846339203285e-05, "loss": 1.8474, "step": 16260 }, { "epoch": 0.03169314415463528, "grad_norm": 5.294840335845947, "learning_rate": 2.9925709574322262e-05, "loss": 1.8983, "step": 16275 }, { "epoch": 0.031722354425745544, "grad_norm": 3.4388887882232666, "learning_rate": 2.9925572683749742e-05, "loss": 1.7128, "step": 16290 }, { "epoch": 0.031751564696855805, "grad_norm": 4.72010612487793, "learning_rate": 2.992543566748687e-05, "loss": 1.9095, "step": 16305 }, { "epoch": 0.031780774967966066, "grad_norm": 3.0153849124908447, "learning_rate": 2.9925298525534807e-05, "loss": 1.8908, "step": 16320 }, { "epoch": 0.031809985239076334, "grad_norm": 2.4210598468780518, "learning_rate": 2.992516125789471e-05, "loss": 2.0611, "step": 16335 }, { "epoch": 0.031839195510186595, "grad_norm": 2.9562766551971436, "learning_rate": 2.9925023864567727e-05, "loss": 1.7784, "step": 16350 }, { "epoch": 0.031868405781296856, "grad_norm": 2.944288969039917, "learning_rate": 2.992488634555502e-05, "loss": 1.8543, "step": 16365 }, { "epoch": 0.031897616052407124, "grad_norm": 3.5835225582122803, "learning_rate": 2.9924748700857747e-05, "loss": 1.9654, "step": 16380 }, { "epoch": 0.031926826323517385, "grad_norm": 3.5023677349090576, "learning_rate": 2.9924610930477062e-05, "loss": 1.7496, "step": 16395 }, { "epoch": 0.031956036594627646, "grad_norm": 2.08420467376709, "learning_rate": 2.9924473034414136e-05, "loss": 1.8234, "step": 16410 }, { "epoch": 0.03198524686573791, "grad_norm": 4.309360504150391, "learning_rate": 2.992433501267012e-05, "loss": 1.9149, "step": 16425 }, { "epoch": 0.032014457136848175, "grad_norm": 3.0357537269592285, "learning_rate": 2.9924196865246175e-05, "loss": 1.9551, "step": 16440 }, { "epoch": 0.032043667407958436, "grad_norm": 4.165322780609131, "learning_rate": 2.9924058592143473e-05, "loss": 1.7488, "step": 16455 }, { "epoch": 0.0320728776790687, "grad_norm": 3.4814274311065674, "learning_rate": 2.9923920193363176e-05, "loss": 1.9659, "step": 16470 }, { "epoch": 0.032102087950178965, "grad_norm": 2.5652506351470947, "learning_rate": 2.992378166890645e-05, "loss": 1.7986, "step": 16485 }, { "epoch": 0.032131298221289226, "grad_norm": 1.9026885032653809, "learning_rate": 2.9923643018774455e-05, "loss": 1.8294, "step": 16500 }, { "epoch": 0.03216050849239949, "grad_norm": 3.861070156097412, "learning_rate": 2.9923504242968365e-05, "loss": 1.7451, "step": 16515 }, { "epoch": 0.03218971876350975, "grad_norm": 1.9591658115386963, "learning_rate": 2.992336534148935e-05, "loss": 1.805, "step": 16530 }, { "epoch": 0.032218929034620016, "grad_norm": 4.189550399780273, "learning_rate": 2.992322631433857e-05, "loss": 1.8049, "step": 16545 }, { "epoch": 0.032248139305730276, "grad_norm": 3.1660592555999756, "learning_rate": 2.9923087161517205e-05, "loss": 1.9415, "step": 16560 }, { "epoch": 0.03227734957684054, "grad_norm": 2.801609754562378, "learning_rate": 2.9922947883026426e-05, "loss": 1.8096, "step": 16575 }, { "epoch": 0.032306559847950805, "grad_norm": 2.699336290359497, "learning_rate": 2.9922808478867403e-05, "loss": 1.723, "step": 16590 }, { "epoch": 0.032335770119061066, "grad_norm": 2.2953224182128906, "learning_rate": 2.992266894904131e-05, "loss": 2.0549, "step": 16605 }, { "epoch": 0.03236498039017133, "grad_norm": 3.8196887969970703, "learning_rate": 2.9922529293549327e-05, "loss": 1.8312, "step": 16620 }, { "epoch": 0.03239419066128159, "grad_norm": 2.974578380584717, "learning_rate": 2.9922389512392622e-05, "loss": 1.9242, "step": 16635 }, { "epoch": 0.032423400932391856, "grad_norm": 4.62038516998291, "learning_rate": 2.9922249605572376e-05, "loss": 1.9786, "step": 16650 }, { "epoch": 0.03245261120350212, "grad_norm": 1.5149112939834595, "learning_rate": 2.992210957308977e-05, "loss": 1.8984, "step": 16665 }, { "epoch": 0.03248182147461238, "grad_norm": 2.2014825344085693, "learning_rate": 2.992196941494598e-05, "loss": 1.7994, "step": 16680 }, { "epoch": 0.032511031745722646, "grad_norm": 4.065785884857178, "learning_rate": 2.9921829131142186e-05, "loss": 1.8157, "step": 16695 }, { "epoch": 0.03254024201683291, "grad_norm": 4.191883087158203, "learning_rate": 2.992168872167957e-05, "loss": 1.8684, "step": 16710 }, { "epoch": 0.03256945228794317, "grad_norm": 3.4441282749176025, "learning_rate": 2.9921548186559314e-05, "loss": 1.8299, "step": 16725 }, { "epoch": 0.03259866255905343, "grad_norm": 4.0237345695495605, "learning_rate": 2.9921407525782604e-05, "loss": 1.9262, "step": 16740 }, { "epoch": 0.0326278728301637, "grad_norm": 2.171717405319214, "learning_rate": 2.992126673935062e-05, "loss": 1.7679, "step": 16755 }, { "epoch": 0.03265708310127396, "grad_norm": 4.097175121307373, "learning_rate": 2.992112582726455e-05, "loss": 1.9499, "step": 16770 }, { "epoch": 0.03268629337238422, "grad_norm": 4.390431880950928, "learning_rate": 2.9920984789525583e-05, "loss": 1.8797, "step": 16785 }, { "epoch": 0.03271550364349448, "grad_norm": 3.068178176879883, "learning_rate": 2.9920843626134907e-05, "loss": 2.0427, "step": 16800 }, { "epoch": 0.03274471391460475, "grad_norm": 3.378275156021118, "learning_rate": 2.9920702337093707e-05, "loss": 1.8853, "step": 16815 }, { "epoch": 0.03277392418571501, "grad_norm": 2.2076075077056885, "learning_rate": 2.992056092240317e-05, "loss": 1.9406, "step": 16830 }, { "epoch": 0.03280313445682527, "grad_norm": 3.0358242988586426, "learning_rate": 2.992041938206449e-05, "loss": 2.0143, "step": 16845 }, { "epoch": 0.03283234472793554, "grad_norm": 4.094939231872559, "learning_rate": 2.9920277716078868e-05, "loss": 1.9757, "step": 16860 }, { "epoch": 0.0328615549990458, "grad_norm": 1.8015364408493042, "learning_rate": 2.9920135924447484e-05, "loss": 2.0698, "step": 16875 }, { "epoch": 0.03289076527015606, "grad_norm": 3.4187324047088623, "learning_rate": 2.9919994007171535e-05, "loss": 1.9332, "step": 16890 }, { "epoch": 0.03291997554126632, "grad_norm": 3.91485595703125, "learning_rate": 2.991985196425222e-05, "loss": 1.9433, "step": 16905 }, { "epoch": 0.03294918581237659, "grad_norm": 3.0797996520996094, "learning_rate": 2.9919709795690732e-05, "loss": 1.7478, "step": 16920 }, { "epoch": 0.03297839608348685, "grad_norm": 4.396271705627441, "learning_rate": 2.9919567501488273e-05, "loss": 1.9191, "step": 16935 }, { "epoch": 0.03300760635459711, "grad_norm": 2.7897489070892334, "learning_rate": 2.9919425081646036e-05, "loss": 1.9495, "step": 16950 }, { "epoch": 0.03303681662570738, "grad_norm": 3.063068151473999, "learning_rate": 2.991928253616522e-05, "loss": 2.1288, "step": 16965 }, { "epoch": 0.03306602689681764, "grad_norm": 5.236184120178223, "learning_rate": 2.991913986504703e-05, "loss": 1.8698, "step": 16980 }, { "epoch": 0.0330952371679279, "grad_norm": 2.758821725845337, "learning_rate": 2.9918997068292666e-05, "loss": 1.8664, "step": 16995 }, { "epoch": 0.03312444743903816, "grad_norm": 3.3068835735321045, "learning_rate": 2.9918854145903326e-05, "loss": 2.0247, "step": 17010 }, { "epoch": 0.03315365771014843, "grad_norm": 5.055883884429932, "learning_rate": 2.991871109788022e-05, "loss": 1.7333, "step": 17025 }, { "epoch": 0.03318286798125869, "grad_norm": 2.9909791946411133, "learning_rate": 2.9918567924224545e-05, "loss": 1.7868, "step": 17040 }, { "epoch": 0.03321207825236895, "grad_norm": 4.3141913414001465, "learning_rate": 2.9918424624937514e-05, "loss": 1.8976, "step": 17055 }, { "epoch": 0.03324128852347922, "grad_norm": 3.3365187644958496, "learning_rate": 2.991828120002033e-05, "loss": 1.8246, "step": 17070 }, { "epoch": 0.03327049879458948, "grad_norm": 1.7029449939727783, "learning_rate": 2.991813764947421e-05, "loss": 1.7464, "step": 17085 }, { "epoch": 0.03329970906569974, "grad_norm": 2.34187650680542, "learning_rate": 2.9917993973300343e-05, "loss": 2.0614, "step": 17100 }, { "epoch": 0.03332891933681, "grad_norm": 2.3299100399017334, "learning_rate": 2.9917850171499957e-05, "loss": 1.9139, "step": 17115 }, { "epoch": 0.03335812960792027, "grad_norm": 2.58627986907959, "learning_rate": 2.9917706244074254e-05, "loss": 2.0092, "step": 17130 }, { "epoch": 0.03338733987903053, "grad_norm": 3.580124855041504, "learning_rate": 2.991756219102445e-05, "loss": 1.8548, "step": 17145 }, { "epoch": 0.03341655015014079, "grad_norm": 1.6242703199386597, "learning_rate": 2.9917418012351755e-05, "loss": 1.8455, "step": 17160 }, { "epoch": 0.03344576042125106, "grad_norm": 3.0739760398864746, "learning_rate": 2.991727370805739e-05, "loss": 1.846, "step": 17175 }, { "epoch": 0.03347497069236132, "grad_norm": 2.6895573139190674, "learning_rate": 2.991712927814256e-05, "loss": 1.8201, "step": 17190 }, { "epoch": 0.03350418096347158, "grad_norm": 3.353482961654663, "learning_rate": 2.9916984722608488e-05, "loss": 1.9598, "step": 17205 }, { "epoch": 0.03353339123458184, "grad_norm": 3.334764242172241, "learning_rate": 2.991684004145639e-05, "loss": 1.8388, "step": 17220 }, { "epoch": 0.03356260150569211, "grad_norm": 2.8904106616973877, "learning_rate": 2.9916695234687484e-05, "loss": 1.9402, "step": 17235 }, { "epoch": 0.03359181177680237, "grad_norm": 2.01975154876709, "learning_rate": 2.991655030230299e-05, "loss": 1.9462, "step": 17250 }, { "epoch": 0.03362102204791263, "grad_norm": 3.490748882293701, "learning_rate": 2.9916405244304123e-05, "loss": 1.9171, "step": 17265 }, { "epoch": 0.033650232319022894, "grad_norm": 3.2388625144958496, "learning_rate": 2.9916260060692114e-05, "loss": 1.8305, "step": 17280 }, { "epoch": 0.03367944259013316, "grad_norm": 3.952791690826416, "learning_rate": 2.991611475146818e-05, "loss": 1.9577, "step": 17295 }, { "epoch": 0.03370865286124342, "grad_norm": 2.0647149085998535, "learning_rate": 2.9915969316633548e-05, "loss": 1.7789, "step": 17310 }, { "epoch": 0.03373786313235368, "grad_norm": 2.9091007709503174, "learning_rate": 2.9915823756189438e-05, "loss": 2.0568, "step": 17325 }, { "epoch": 0.03376707340346395, "grad_norm": 3.5968873500823975, "learning_rate": 2.9915678070137078e-05, "loss": 1.9264, "step": 17340 }, { "epoch": 0.03379628367457421, "grad_norm": 2.7411301136016846, "learning_rate": 2.9915532258477697e-05, "loss": 1.832, "step": 17355 }, { "epoch": 0.03382549394568447, "grad_norm": 3.8037030696868896, "learning_rate": 2.991538632121252e-05, "loss": 1.9751, "step": 17370 }, { "epoch": 0.033854704216794734, "grad_norm": 2.2467167377471924, "learning_rate": 2.9915240258342776e-05, "loss": 1.8501, "step": 17385 }, { "epoch": 0.033883914487905, "grad_norm": 2.01926589012146, "learning_rate": 2.9915094069869696e-05, "loss": 1.8398, "step": 17400 }, { "epoch": 0.03391312475901526, "grad_norm": 3.427839994430542, "learning_rate": 2.9914947755794515e-05, "loss": 2.0322, "step": 17415 }, { "epoch": 0.033942335030125524, "grad_norm": 2.9915285110473633, "learning_rate": 2.991480131611846e-05, "loss": 1.7704, "step": 17430 }, { "epoch": 0.03397154530123579, "grad_norm": 2.0821945667266846, "learning_rate": 2.9914654750842765e-05, "loss": 1.9759, "step": 17445 }, { "epoch": 0.03400075557234605, "grad_norm": 3.167320966720581, "learning_rate": 2.9914508059968664e-05, "loss": 1.8321, "step": 17460 }, { "epoch": 0.034029965843456314, "grad_norm": 3.4027440547943115, "learning_rate": 2.991436124349739e-05, "loss": 1.8643, "step": 17475 }, { "epoch": 0.034059176114566575, "grad_norm": 1.9895273447036743, "learning_rate": 2.9914214301430183e-05, "loss": 2.018, "step": 17490 }, { "epoch": 0.03408838638567684, "grad_norm": 3.4797520637512207, "learning_rate": 2.9914067233768285e-05, "loss": 1.8666, "step": 17505 }, { "epoch": 0.034117596656787104, "grad_norm": 2.578434944152832, "learning_rate": 2.9913920040512925e-05, "loss": 1.8808, "step": 17520 }, { "epoch": 0.034146806927897365, "grad_norm": 3.499577522277832, "learning_rate": 2.991377272166535e-05, "loss": 2.112, "step": 17535 }, { "epoch": 0.03417601719900763, "grad_norm": 2.6072142124176025, "learning_rate": 2.9913625277226795e-05, "loss": 1.7954, "step": 17550 }, { "epoch": 0.034205227470117894, "grad_norm": 4.382633686065674, "learning_rate": 2.9913477707198505e-05, "loss": 1.8943, "step": 17565 }, { "epoch": 0.034234437741228155, "grad_norm": 2.883723497390747, "learning_rate": 2.9913330011581718e-05, "loss": 1.8615, "step": 17580 }, { "epoch": 0.034263648012338416, "grad_norm": 2.309401273727417, "learning_rate": 2.991318219037769e-05, "loss": 1.9813, "step": 17595 }, { "epoch": 0.034292858283448684, "grad_norm": 3.3916573524475098, "learning_rate": 2.991303424358765e-05, "loss": 2.0303, "step": 17610 }, { "epoch": 0.034322068554558945, "grad_norm": 2.0413196086883545, "learning_rate": 2.9912886171212855e-05, "loss": 1.7976, "step": 17625 }, { "epoch": 0.034351278825669206, "grad_norm": 2.3423221111297607, "learning_rate": 2.991273797325454e-05, "loss": 2.0449, "step": 17640 }, { "epoch": 0.034380489096779474, "grad_norm": 2.9180712699890137, "learning_rate": 2.991258964971397e-05, "loss": 1.937, "step": 17655 }, { "epoch": 0.034409699367889734, "grad_norm": 2.8222568035125732, "learning_rate": 2.9912441200592385e-05, "loss": 1.8245, "step": 17670 }, { "epoch": 0.034438909638999995, "grad_norm": 2.49934720993042, "learning_rate": 2.991229262589103e-05, "loss": 1.7475, "step": 17685 }, { "epoch": 0.034468119910110256, "grad_norm": 2.517383098602295, "learning_rate": 2.9912143925611166e-05, "loss": 2.0156, "step": 17700 }, { "epoch": 0.034497330181220524, "grad_norm": 1.9787654876708984, "learning_rate": 2.9911995099754037e-05, "loss": 1.7771, "step": 17715 }, { "epoch": 0.034526540452330785, "grad_norm": 3.3665497303009033, "learning_rate": 2.9911846148320903e-05, "loss": 1.7394, "step": 17730 }, { "epoch": 0.034555750723441046, "grad_norm": 2.7270398139953613, "learning_rate": 2.9911697071313017e-05, "loss": 1.8666, "step": 17745 }, { "epoch": 0.034584960994551314, "grad_norm": 2.7560067176818848, "learning_rate": 2.9911547868731626e-05, "loss": 1.7811, "step": 17760 }, { "epoch": 0.034614171265661575, "grad_norm": 3.9651834964752197, "learning_rate": 2.9911398540577996e-05, "loss": 1.8758, "step": 17775 }, { "epoch": 0.034643381536771836, "grad_norm": 2.7909586429595947, "learning_rate": 2.9911249086853386e-05, "loss": 2.0029, "step": 17790 }, { "epoch": 0.0346725918078821, "grad_norm": 3.109741449356079, "learning_rate": 2.9911099507559045e-05, "loss": 1.9692, "step": 17805 }, { "epoch": 0.034701802078992365, "grad_norm": 1.7844855785369873, "learning_rate": 2.9910949802696244e-05, "loss": 1.7244, "step": 17820 }, { "epoch": 0.034731012350102626, "grad_norm": 1.8406388759613037, "learning_rate": 2.9910799972266232e-05, "loss": 1.8689, "step": 17835 }, { "epoch": 0.03476022262121289, "grad_norm": 2.9145405292510986, "learning_rate": 2.9910650016270278e-05, "loss": 1.8982, "step": 17850 }, { "epoch": 0.03478943289232315, "grad_norm": 1.9398448467254639, "learning_rate": 2.991049993470964e-05, "loss": 1.5994, "step": 17865 }, { "epoch": 0.034818643163433416, "grad_norm": 2.3878304958343506, "learning_rate": 2.991034972758559e-05, "loss": 1.8854, "step": 17880 }, { "epoch": 0.03484785343454368, "grad_norm": 1.8284587860107422, "learning_rate": 2.9910199394899385e-05, "loss": 1.8261, "step": 17895 }, { "epoch": 0.03487706370565394, "grad_norm": 3.306748151779175, "learning_rate": 2.9910048936652294e-05, "loss": 1.8727, "step": 17910 }, { "epoch": 0.034906273976764206, "grad_norm": 3.8391776084899902, "learning_rate": 2.9909898352845585e-05, "loss": 1.8347, "step": 17925 }, { "epoch": 0.03493548424787447, "grad_norm": 2.275381565093994, "learning_rate": 2.9909747643480526e-05, "loss": 1.8268, "step": 17940 }, { "epoch": 0.03496469451898473, "grad_norm": 2.871506929397583, "learning_rate": 2.9909596808558385e-05, "loss": 1.9375, "step": 17955 }, { "epoch": 0.03499390479009499, "grad_norm": 3.8660874366760254, "learning_rate": 2.990944584808043e-05, "loss": 1.678, "step": 17970 }, { "epoch": 0.03502311506120526, "grad_norm": 4.991106033325195, "learning_rate": 2.9909294762047935e-05, "loss": 1.9136, "step": 17985 }, { "epoch": 0.03505232533231552, "grad_norm": 3.8985953330993652, "learning_rate": 2.990914355046217e-05, "loss": 1.7425, "step": 18000 }, { "epoch": 0.03508153560342578, "grad_norm": 2.2547903060913086, "learning_rate": 2.9908992213324413e-05, "loss": 1.9613, "step": 18015 }, { "epoch": 0.035110745874536047, "grad_norm": 4.33575963973999, "learning_rate": 2.9908840750635936e-05, "loss": 2.0328, "step": 18030 }, { "epoch": 0.03513995614564631, "grad_norm": 3.8632030487060547, "learning_rate": 2.9908689162398012e-05, "loss": 1.7863, "step": 18045 }, { "epoch": 0.03516916641675657, "grad_norm": 4.932112216949463, "learning_rate": 2.9908537448611927e-05, "loss": 1.7159, "step": 18060 }, { "epoch": 0.03519837668786683, "grad_norm": 2.3018031120300293, "learning_rate": 2.9908385609278943e-05, "loss": 1.9104, "step": 18075 }, { "epoch": 0.0352275869589771, "grad_norm": 2.7413060665130615, "learning_rate": 2.990823364440035e-05, "loss": 1.6733, "step": 18090 }, { "epoch": 0.03525679723008736, "grad_norm": 3.928980588912964, "learning_rate": 2.9908081553977424e-05, "loss": 1.9619, "step": 18105 }, { "epoch": 0.03528600750119762, "grad_norm": 1.6751407384872437, "learning_rate": 2.9907929338011447e-05, "loss": 1.8283, "step": 18120 }, { "epoch": 0.03531521777230789, "grad_norm": 4.515170574188232, "learning_rate": 2.99077769965037e-05, "loss": 1.7808, "step": 18135 }, { "epoch": 0.03534442804341815, "grad_norm": 3.367410182952881, "learning_rate": 2.9907624529455468e-05, "loss": 1.9427, "step": 18150 }, { "epoch": 0.03537363831452841, "grad_norm": 2.9150290489196777, "learning_rate": 2.990747193686803e-05, "loss": 2.021, "step": 18165 }, { "epoch": 0.03540284858563867, "grad_norm": 3.456367015838623, "learning_rate": 2.9907319218742677e-05, "loss": 1.8711, "step": 18180 }, { "epoch": 0.03543205885674894, "grad_norm": 2.504117727279663, "learning_rate": 2.990716637508069e-05, "loss": 1.9605, "step": 18195 }, { "epoch": 0.0354612691278592, "grad_norm": 3.5525166988372803, "learning_rate": 2.990701340588336e-05, "loss": 1.6192, "step": 18210 }, { "epoch": 0.03549047939896946, "grad_norm": 4.048015594482422, "learning_rate": 2.9906860311151973e-05, "loss": 1.9809, "step": 18225 }, { "epoch": 0.03551968967007973, "grad_norm": 3.0211918354034424, "learning_rate": 2.990670709088782e-05, "loss": 1.8898, "step": 18240 }, { "epoch": 0.03554889994118999, "grad_norm": 2.4371862411499023, "learning_rate": 2.9906553745092184e-05, "loss": 1.8836, "step": 18255 }, { "epoch": 0.03557811021230025, "grad_norm": 3.461329460144043, "learning_rate": 2.990640027376637e-05, "loss": 1.7893, "step": 18270 }, { "epoch": 0.03560732048341051, "grad_norm": 2.6082050800323486, "learning_rate": 2.990624667691166e-05, "loss": 1.94, "step": 18285 }, { "epoch": 0.03563653075452078, "grad_norm": 2.8006784915924072, "learning_rate": 2.990609295452935e-05, "loss": 1.7607, "step": 18300 }, { "epoch": 0.03566574102563104, "grad_norm": 4.304664611816406, "learning_rate": 2.990593910662073e-05, "loss": 1.8973, "step": 18315 }, { "epoch": 0.0356949512967413, "grad_norm": 2.063380479812622, "learning_rate": 2.9905785133187108e-05, "loss": 1.7456, "step": 18330 }, { "epoch": 0.03572416156785156, "grad_norm": 4.626766681671143, "learning_rate": 2.9905631034229772e-05, "loss": 1.8402, "step": 18345 }, { "epoch": 0.03575337183896183, "grad_norm": 4.0876288414001465, "learning_rate": 2.9905476809750017e-05, "loss": 1.8239, "step": 18360 }, { "epoch": 0.03578258211007209, "grad_norm": 2.5533339977264404, "learning_rate": 2.9905322459749148e-05, "loss": 1.9771, "step": 18375 }, { "epoch": 0.03581179238118235, "grad_norm": 2.268286943435669, "learning_rate": 2.990516798422846e-05, "loss": 1.8886, "step": 18390 }, { "epoch": 0.03584100265229262, "grad_norm": 2.343240261077881, "learning_rate": 2.9905013383189257e-05, "loss": 1.8676, "step": 18405 }, { "epoch": 0.03587021292340288, "grad_norm": 3.544220209121704, "learning_rate": 2.990485865663284e-05, "loss": 1.7922, "step": 18420 }, { "epoch": 0.03589942319451314, "grad_norm": 3.608947277069092, "learning_rate": 2.9904703804560516e-05, "loss": 1.7506, "step": 18435 }, { "epoch": 0.0359286334656234, "grad_norm": 2.520986557006836, "learning_rate": 2.990454882697358e-05, "loss": 1.9715, "step": 18450 }, { "epoch": 0.03595784373673367, "grad_norm": 2.598273277282715, "learning_rate": 2.9904393723873342e-05, "loss": 2.0018, "step": 18465 }, { "epoch": 0.03598705400784393, "grad_norm": 3.4648282527923584, "learning_rate": 2.990423849526111e-05, "loss": 1.8609, "step": 18480 }, { "epoch": 0.03601626427895419, "grad_norm": 4.387876510620117, "learning_rate": 2.9904083141138194e-05, "loss": 1.9949, "step": 18495 }, { "epoch": 0.03604547455006446, "grad_norm": 2.121781349182129, "learning_rate": 2.9903927661505888e-05, "loss": 1.91, "step": 18510 }, { "epoch": 0.03607468482117472, "grad_norm": 2.5395514965057373, "learning_rate": 2.9903772056365516e-05, "loss": 1.7663, "step": 18525 }, { "epoch": 0.03610389509228498, "grad_norm": 4.20078706741333, "learning_rate": 2.9903616325718385e-05, "loss": 1.8051, "step": 18540 }, { "epoch": 0.03613310536339524, "grad_norm": 3.0787668228149414, "learning_rate": 2.9903460469565802e-05, "loss": 1.7457, "step": 18555 }, { "epoch": 0.03616231563450551, "grad_norm": 2.493914842605591, "learning_rate": 2.9903304487909084e-05, "loss": 1.841, "step": 18570 }, { "epoch": 0.03619152590561577, "grad_norm": 4.367134094238281, "learning_rate": 2.9903148380749543e-05, "loss": 1.8202, "step": 18585 }, { "epoch": 0.03622073617672603, "grad_norm": 4.713270664215088, "learning_rate": 2.990299214808849e-05, "loss": 1.7427, "step": 18600 }, { "epoch": 0.0362499464478363, "grad_norm": 2.3630452156066895, "learning_rate": 2.9902835789927246e-05, "loss": 1.8379, "step": 18615 }, { "epoch": 0.03627915671894656, "grad_norm": 4.240983009338379, "learning_rate": 2.9902679306267127e-05, "loss": 1.8876, "step": 18630 }, { "epoch": 0.03630836699005682, "grad_norm": 2.1189780235290527, "learning_rate": 2.990252269710945e-05, "loss": 1.7273, "step": 18645 }, { "epoch": 0.036337577261167084, "grad_norm": 4.207765579223633, "learning_rate": 2.9902365962455533e-05, "loss": 1.7961, "step": 18660 }, { "epoch": 0.03636678753227735, "grad_norm": 4.526651859283447, "learning_rate": 2.9902209102306694e-05, "loss": 1.8292, "step": 18675 }, { "epoch": 0.03639599780338761, "grad_norm": 4.371520042419434, "learning_rate": 2.990205211666426e-05, "loss": 1.9646, "step": 18690 }, { "epoch": 0.036425208074497874, "grad_norm": 3.766552209854126, "learning_rate": 2.990189500552955e-05, "loss": 1.6872, "step": 18705 }, { "epoch": 0.03645441834560814, "grad_norm": 3.610987901687622, "learning_rate": 2.9901737768903882e-05, "loss": 1.8095, "step": 18720 }, { "epoch": 0.0364836286167184, "grad_norm": 2.77374529838562, "learning_rate": 2.9901580406788583e-05, "loss": 1.7853, "step": 18735 }, { "epoch": 0.036512838887828664, "grad_norm": 1.9032909870147705, "learning_rate": 2.9901422919184984e-05, "loss": 2.066, "step": 18750 }, { "epoch": 0.036542049158938925, "grad_norm": 6.839906215667725, "learning_rate": 2.9901265306094406e-05, "loss": 2.0159, "step": 18765 }, { "epoch": 0.03657125943004919, "grad_norm": 3.8178064823150635, "learning_rate": 2.9901107567518177e-05, "loss": 1.9257, "step": 18780 }, { "epoch": 0.036600469701159453, "grad_norm": 2.0359580516815186, "learning_rate": 2.990094970345762e-05, "loss": 1.8261, "step": 18795 }, { "epoch": 0.036629679972269714, "grad_norm": 3.5759871006011963, "learning_rate": 2.9900791713914078e-05, "loss": 1.9477, "step": 18810 }, { "epoch": 0.036658890243379975, "grad_norm": 2.471161127090454, "learning_rate": 2.990063359888887e-05, "loss": 1.9519, "step": 18825 }, { "epoch": 0.03668810051449024, "grad_norm": 4.123219966888428, "learning_rate": 2.9900475358383327e-05, "loss": 1.9215, "step": 18840 }, { "epoch": 0.036717310785600504, "grad_norm": 2.517751693725586, "learning_rate": 2.9900316992398793e-05, "loss": 1.9601, "step": 18855 }, { "epoch": 0.036746521056710765, "grad_norm": 3.0052502155303955, "learning_rate": 2.9900158500936587e-05, "loss": 1.8937, "step": 18870 }, { "epoch": 0.03677573132782103, "grad_norm": 3.820679187774658, "learning_rate": 2.9899999883998057e-05, "loss": 1.7623, "step": 18885 }, { "epoch": 0.036804941598931294, "grad_norm": 4.087255954742432, "learning_rate": 2.989984114158453e-05, "loss": 1.7651, "step": 18900 }, { "epoch": 0.036834151870041555, "grad_norm": 2.3399763107299805, "learning_rate": 2.989968227369734e-05, "loss": 1.8389, "step": 18915 }, { "epoch": 0.036863362141151816, "grad_norm": 4.5868072509765625, "learning_rate": 2.9899523280337836e-05, "loss": 1.9238, "step": 18930 }, { "epoch": 0.036892572412262084, "grad_norm": 2.5856025218963623, "learning_rate": 2.989936416150735e-05, "loss": 1.8044, "step": 18945 }, { "epoch": 0.036921782683372345, "grad_norm": 2.635301351547241, "learning_rate": 2.9899204917207222e-05, "loss": 1.9433, "step": 18960 }, { "epoch": 0.036950992954482606, "grad_norm": 2.7841100692749023, "learning_rate": 2.9899045547438792e-05, "loss": 1.8401, "step": 18975 }, { "epoch": 0.036980203225592874, "grad_norm": 3.0509424209594727, "learning_rate": 2.9898886052203407e-05, "loss": 1.8828, "step": 18990 }, { "epoch": 0.037009413496703135, "grad_norm": 2.99225115776062, "learning_rate": 2.9898726431502402e-05, "loss": 1.8158, "step": 19005 }, { "epoch": 0.037038623767813396, "grad_norm": 2.5677740573883057, "learning_rate": 2.9898566685337135e-05, "loss": 1.8706, "step": 19020 }, { "epoch": 0.03706783403892366, "grad_norm": 3.502387762069702, "learning_rate": 2.9898406813708934e-05, "loss": 1.9128, "step": 19035 }, { "epoch": 0.037097044310033925, "grad_norm": 3.5594568252563477, "learning_rate": 2.9898246816619162e-05, "loss": 1.732, "step": 19050 }, { "epoch": 0.037126254581144186, "grad_norm": 2.2263100147247314, "learning_rate": 2.989808669406915e-05, "loss": 1.9151, "step": 19065 }, { "epoch": 0.03715546485225445, "grad_norm": 3.859412670135498, "learning_rate": 2.989792644606026e-05, "loss": 1.9925, "step": 19080 }, { "epoch": 0.037184675123364715, "grad_norm": 2.2860965728759766, "learning_rate": 2.9897766072593834e-05, "loss": 1.9347, "step": 19095 }, { "epoch": 0.037213885394474976, "grad_norm": 3.5833935737609863, "learning_rate": 2.9897605573671224e-05, "loss": 1.7977, "step": 19110 }, { "epoch": 0.03724309566558524, "grad_norm": 3.612175226211548, "learning_rate": 2.989744494929378e-05, "loss": 1.9048, "step": 19125 }, { "epoch": 0.0372723059366955, "grad_norm": 2.6882388591766357, "learning_rate": 2.989728419946286e-05, "loss": 1.814, "step": 19140 }, { "epoch": 0.037301516207805765, "grad_norm": 2.753767728805542, "learning_rate": 2.989712332417982e-05, "loss": 1.8261, "step": 19155 }, { "epoch": 0.037330726478916026, "grad_norm": 3.5201592445373535, "learning_rate": 2.9896962323446004e-05, "loss": 1.9855, "step": 19170 }, { "epoch": 0.03735993675002629, "grad_norm": 2.892357587814331, "learning_rate": 2.9896801197262773e-05, "loss": 1.8325, "step": 19185 }, { "epoch": 0.037389147021136555, "grad_norm": 3.37709903717041, "learning_rate": 2.9896639945631483e-05, "loss": 1.9563, "step": 19200 }, { "epoch": 0.037418357292246816, "grad_norm": 1.9762299060821533, "learning_rate": 2.9896478568553492e-05, "loss": 1.8228, "step": 19215 }, { "epoch": 0.03744756756335708, "grad_norm": 5.543561935424805, "learning_rate": 2.9896317066030162e-05, "loss": 1.7656, "step": 19230 }, { "epoch": 0.03747677783446734, "grad_norm": 1.8155004978179932, "learning_rate": 2.9896155438062852e-05, "loss": 1.8033, "step": 19245 }, { "epoch": 0.037505988105577606, "grad_norm": 3.613931655883789, "learning_rate": 2.989599368465292e-05, "loss": 1.9275, "step": 19260 }, { "epoch": 0.03753519837668787, "grad_norm": 2.2457938194274902, "learning_rate": 2.989583180580173e-05, "loss": 1.9459, "step": 19275 }, { "epoch": 0.03756440864779813, "grad_norm": 2.4014956951141357, "learning_rate": 2.9895669801510646e-05, "loss": 1.7973, "step": 19290 }, { "epoch": 0.037593618918908396, "grad_norm": 2.1578032970428467, "learning_rate": 2.9895507671781032e-05, "loss": 1.8551, "step": 19305 }, { "epoch": 0.03762282919001866, "grad_norm": 2.952676296234131, "learning_rate": 2.9895345416614254e-05, "loss": 1.7808, "step": 19320 }, { "epoch": 0.03765203946112892, "grad_norm": 3.352534770965576, "learning_rate": 2.989518303601167e-05, "loss": 1.8889, "step": 19335 }, { "epoch": 0.03768124973223918, "grad_norm": 5.104668617248535, "learning_rate": 2.9895020529974667e-05, "loss": 1.9522, "step": 19350 }, { "epoch": 0.03771046000334945, "grad_norm": 3.0823004245758057, "learning_rate": 2.9894857898504595e-05, "loss": 1.916, "step": 19365 }, { "epoch": 0.03773967027445971, "grad_norm": 2.544391393661499, "learning_rate": 2.9894695141602824e-05, "loss": 1.9019, "step": 19380 }, { "epoch": 0.03776888054556997, "grad_norm": 2.4823343753814697, "learning_rate": 2.989453225927074e-05, "loss": 1.7927, "step": 19395 }, { "epoch": 0.03779809081668023, "grad_norm": 1.8794801235198975, "learning_rate": 2.98943692515097e-05, "loss": 2.0112, "step": 19410 }, { "epoch": 0.0378273010877905, "grad_norm": 2.5527946949005127, "learning_rate": 2.9894206118321083e-05, "loss": 1.8095, "step": 19425 }, { "epoch": 0.03785651135890076, "grad_norm": 3.4643588066101074, "learning_rate": 2.9894042859706265e-05, "loss": 1.7435, "step": 19440 }, { "epoch": 0.03788572163001102, "grad_norm": 3.913656234741211, "learning_rate": 2.9893879475666613e-05, "loss": 2.0963, "step": 19455 }, { "epoch": 0.03791493190112129, "grad_norm": 5.583499431610107, "learning_rate": 2.9893715966203502e-05, "loss": 1.8735, "step": 19470 }, { "epoch": 0.03794414217223155, "grad_norm": 1.9320791959762573, "learning_rate": 2.989355233131832e-05, "loss": 1.9904, "step": 19485 }, { "epoch": 0.03797335244334181, "grad_norm": 3.6746532917022705, "learning_rate": 2.9893388571012443e-05, "loss": 1.8231, "step": 19500 }, { "epoch": 0.03800256271445207, "grad_norm": 2.002924680709839, "learning_rate": 2.989322468528724e-05, "loss": 1.989, "step": 19515 }, { "epoch": 0.03803177298556234, "grad_norm": 3.4807581901550293, "learning_rate": 2.98930606741441e-05, "loss": 1.849, "step": 19530 }, { "epoch": 0.0380609832566726, "grad_norm": 4.430756568908691, "learning_rate": 2.98928965375844e-05, "loss": 1.9161, "step": 19545 }, { "epoch": 0.03809019352778286, "grad_norm": 3.5762476921081543, "learning_rate": 2.9892732275609525e-05, "loss": 1.8861, "step": 19560 }, { "epoch": 0.03811940379889313, "grad_norm": 2.6605224609375, "learning_rate": 2.9892567888220855e-05, "loss": 1.9023, "step": 19575 }, { "epoch": 0.03814861407000339, "grad_norm": 2.929426431655884, "learning_rate": 2.9892403375419778e-05, "loss": 1.7741, "step": 19590 }, { "epoch": 0.03817782434111365, "grad_norm": 3.075709342956543, "learning_rate": 2.9892238737207677e-05, "loss": 1.9575, "step": 19605 }, { "epoch": 0.03820703461222391, "grad_norm": 3.5462255477905273, "learning_rate": 2.9892073973585942e-05, "loss": 1.8336, "step": 19620 }, { "epoch": 0.03823624488333418, "grad_norm": 2.098604202270508, "learning_rate": 2.9891909084555954e-05, "loss": 1.8161, "step": 19635 }, { "epoch": 0.03826545515444444, "grad_norm": 3.255275249481201, "learning_rate": 2.9891744070119106e-05, "loss": 1.9291, "step": 19650 }, { "epoch": 0.0382946654255547, "grad_norm": 3.4731807708740234, "learning_rate": 2.9891578930276787e-05, "loss": 1.9403, "step": 19665 }, { "epoch": 0.03832387569666497, "grad_norm": 2.0823373794555664, "learning_rate": 2.9891413665030387e-05, "loss": 1.8713, "step": 19680 }, { "epoch": 0.03835308596777523, "grad_norm": 2.6273398399353027, "learning_rate": 2.98912482743813e-05, "loss": 1.8764, "step": 19695 }, { "epoch": 0.03838229623888549, "grad_norm": 3.4412078857421875, "learning_rate": 2.9891082758330915e-05, "loss": 1.8471, "step": 19710 }, { "epoch": 0.03841150650999575, "grad_norm": 2.652684211730957, "learning_rate": 2.9890917116880625e-05, "loss": 2.0607, "step": 19725 }, { "epoch": 0.03844071678110602, "grad_norm": 2.8258442878723145, "learning_rate": 2.989075135003183e-05, "loss": 1.8497, "step": 19740 }, { "epoch": 0.03846992705221628, "grad_norm": 3.239922285079956, "learning_rate": 2.9890585457785923e-05, "loss": 1.7763, "step": 19755 }, { "epoch": 0.03849913732332654, "grad_norm": 2.4102368354797363, "learning_rate": 2.9890419440144303e-05, "loss": 1.9937, "step": 19770 }, { "epoch": 0.03852834759443681, "grad_norm": 3.4451348781585693, "learning_rate": 2.989025329710837e-05, "loss": 1.8446, "step": 19785 }, { "epoch": 0.03855755786554707, "grad_norm": 2.9538424015045166, "learning_rate": 2.9890087028679517e-05, "loss": 1.8478, "step": 19800 }, { "epoch": 0.03858676813665733, "grad_norm": 2.6566977500915527, "learning_rate": 2.9889920634859144e-05, "loss": 1.7005, "step": 19815 }, { "epoch": 0.03861597840776759, "grad_norm": 1.9971015453338623, "learning_rate": 2.988975411564866e-05, "loss": 1.8428, "step": 19830 }, { "epoch": 0.03864518867887786, "grad_norm": 2.648861885070801, "learning_rate": 2.9889587471049456e-05, "loss": 2.0656, "step": 19845 }, { "epoch": 0.03867439894998812, "grad_norm": 4.70402193069458, "learning_rate": 2.9889420701062947e-05, "loss": 1.8098, "step": 19860 }, { "epoch": 0.03870360922109838, "grad_norm": 4.031128883361816, "learning_rate": 2.988925380569053e-05, "loss": 1.9847, "step": 19875 }, { "epoch": 0.038732819492208644, "grad_norm": 3.7528605461120605, "learning_rate": 2.988908678493361e-05, "loss": 1.8671, "step": 19890 }, { "epoch": 0.03876202976331891, "grad_norm": 2.24607253074646, "learning_rate": 2.9888919638793604e-05, "loss": 1.8053, "step": 19905 }, { "epoch": 0.03879124003442917, "grad_norm": 2.6132078170776367, "learning_rate": 2.9888752367271903e-05, "loss": 2.0429, "step": 19920 }, { "epoch": 0.03882045030553943, "grad_norm": 2.334711790084839, "learning_rate": 2.988858497036993e-05, "loss": 1.7526, "step": 19935 }, { "epoch": 0.0388496605766497, "grad_norm": 3.065218210220337, "learning_rate": 2.988841744808909e-05, "loss": 1.9287, "step": 19950 }, { "epoch": 0.03887887084775996, "grad_norm": 4.216036796569824, "learning_rate": 2.9888249800430787e-05, "loss": 1.7889, "step": 19965 }, { "epoch": 0.03890808111887022, "grad_norm": 4.185488224029541, "learning_rate": 2.988808202739644e-05, "loss": 1.865, "step": 19980 }, { "epoch": 0.038937291389980484, "grad_norm": 2.3708608150482178, "learning_rate": 2.9887914128987465e-05, "loss": 1.852, "step": 19995 }, { "epoch": 0.03896650166109075, "grad_norm": 4.136464595794678, "learning_rate": 2.9887746105205264e-05, "loss": 1.8378, "step": 20010 }, { "epoch": 0.03899571193220101, "grad_norm": 3.3061976432800293, "learning_rate": 2.9887577956051263e-05, "loss": 1.8702, "step": 20025 }, { "epoch": 0.039024922203311274, "grad_norm": 3.5822839736938477, "learning_rate": 2.9887409681526876e-05, "loss": 1.87, "step": 20040 }, { "epoch": 0.03905413247442154, "grad_norm": 4.132532119750977, "learning_rate": 2.9887241281633518e-05, "loss": 2.051, "step": 20055 }, { "epoch": 0.0390833427455318, "grad_norm": 3.9095962047576904, "learning_rate": 2.9887072756372606e-05, "loss": 1.8271, "step": 20070 }, { "epoch": 0.039112553016642064, "grad_norm": 3.028393507003784, "learning_rate": 2.988690410574556e-05, "loss": 1.825, "step": 20085 }, { "epoch": 0.039141763287752325, "grad_norm": 1.7903603315353394, "learning_rate": 2.98867353297538e-05, "loss": 1.9565, "step": 20100 }, { "epoch": 0.03917097355886259, "grad_norm": 2.168895959854126, "learning_rate": 2.988656642839875e-05, "loss": 1.8483, "step": 20115 }, { "epoch": 0.039200183829972854, "grad_norm": 7.789179801940918, "learning_rate": 2.988639740168183e-05, "loss": 2.0627, "step": 20130 }, { "epoch": 0.039229394101083115, "grad_norm": 2.033658981323242, "learning_rate": 2.9886228249604464e-05, "loss": 1.8905, "step": 20145 }, { "epoch": 0.03925860437219338, "grad_norm": 3.6176931858062744, "learning_rate": 2.9886058972168076e-05, "loss": 1.8981, "step": 20160 }, { "epoch": 0.039287814643303644, "grad_norm": 3.776540517807007, "learning_rate": 2.9885889569374088e-05, "loss": 1.7417, "step": 20175 }, { "epoch": 0.039317024914413905, "grad_norm": 2.138796329498291, "learning_rate": 2.9885720041223934e-05, "loss": 1.8845, "step": 20190 }, { "epoch": 0.039346235185524166, "grad_norm": 5.0401482582092285, "learning_rate": 2.988555038771904e-05, "loss": 1.8966, "step": 20205 }, { "epoch": 0.039375445456634434, "grad_norm": 2.2886552810668945, "learning_rate": 2.9885380608860827e-05, "loss": 1.8898, "step": 20220 }, { "epoch": 0.039404655727744695, "grad_norm": 2.286959648132324, "learning_rate": 2.9885210704650734e-05, "loss": 1.9597, "step": 20235 }, { "epoch": 0.039433865998854956, "grad_norm": 2.538752794265747, "learning_rate": 2.988504067509019e-05, "loss": 1.8451, "step": 20250 }, { "epoch": 0.039463076269965223, "grad_norm": 3.4922502040863037, "learning_rate": 2.988487052018062e-05, "loss": 1.8717, "step": 20265 }, { "epoch": 0.039492286541075484, "grad_norm": 3.5919086933135986, "learning_rate": 2.9884700239923467e-05, "loss": 1.8182, "step": 20280 }, { "epoch": 0.039521496812185745, "grad_norm": 1.9253063201904297, "learning_rate": 2.988452983432016e-05, "loss": 1.8067, "step": 20295 }, { "epoch": 0.039550707083296006, "grad_norm": 3.168278217315674, "learning_rate": 2.9884359303372127e-05, "loss": 1.9619, "step": 20310 }, { "epoch": 0.039579917354406274, "grad_norm": 2.574296236038208, "learning_rate": 2.9884188647080816e-05, "loss": 1.9037, "step": 20325 }, { "epoch": 0.039609127625516535, "grad_norm": 3.87908673286438, "learning_rate": 2.9884017865447657e-05, "loss": 1.8106, "step": 20340 }, { "epoch": 0.039638337896626796, "grad_norm": 3.060088872909546, "learning_rate": 2.9883846958474093e-05, "loss": 1.8878, "step": 20355 }, { "epoch": 0.039667548167737064, "grad_norm": 2.6092071533203125, "learning_rate": 2.988367592616156e-05, "loss": 2.0189, "step": 20370 }, { "epoch": 0.039696758438847325, "grad_norm": 2.688831329345703, "learning_rate": 2.9883504768511496e-05, "loss": 1.9439, "step": 20385 }, { "epoch": 0.039725968709957586, "grad_norm": 4.126415252685547, "learning_rate": 2.988333348552535e-05, "loss": 1.7334, "step": 20400 }, { "epoch": 0.03975517898106785, "grad_norm": 2.5536105632781982, "learning_rate": 2.988316207720455e-05, "loss": 1.8741, "step": 20415 }, { "epoch": 0.039784389252178115, "grad_norm": 4.146584510803223, "learning_rate": 2.9882990543550557e-05, "loss": 1.839, "step": 20430 }, { "epoch": 0.039813599523288376, "grad_norm": 4.531203746795654, "learning_rate": 2.9882818884564805e-05, "loss": 1.801, "step": 20445 }, { "epoch": 0.03984280979439864, "grad_norm": 4.3206562995910645, "learning_rate": 2.988264710024874e-05, "loss": 1.9127, "step": 20460 }, { "epoch": 0.0398720200655089, "grad_norm": 2.258627414703369, "learning_rate": 2.9882475190603815e-05, "loss": 1.9329, "step": 20475 }, { "epoch": 0.039901230336619166, "grad_norm": 2.874171018600464, "learning_rate": 2.988230315563147e-05, "loss": 1.8632, "step": 20490 }, { "epoch": 0.03993044060772943, "grad_norm": 2.2248058319091797, "learning_rate": 2.988213099533316e-05, "loss": 1.7653, "step": 20505 }, { "epoch": 0.03995965087883969, "grad_norm": 2.2211756706237793, "learning_rate": 2.988195870971033e-05, "loss": 2.0817, "step": 20520 }, { "epoch": 0.039988861149949956, "grad_norm": 2.266099214553833, "learning_rate": 2.9881786298764432e-05, "loss": 2.0993, "step": 20535 }, { "epoch": 0.04001807142106022, "grad_norm": 3.2420692443847656, "learning_rate": 2.988161376249692e-05, "loss": 1.8448, "step": 20550 }, { "epoch": 0.04004728169217048, "grad_norm": 2.1219029426574707, "learning_rate": 2.9881441100909244e-05, "loss": 1.9547, "step": 20565 }, { "epoch": 0.04007649196328074, "grad_norm": 4.585850238800049, "learning_rate": 2.988126831400286e-05, "loss": 1.9619, "step": 20580 }, { "epoch": 0.04010570223439101, "grad_norm": 3.4670188426971436, "learning_rate": 2.9881095401779224e-05, "loss": 2.0, "step": 20595 }, { "epoch": 0.04013491250550127, "grad_norm": 2.0285186767578125, "learning_rate": 2.9880922364239787e-05, "loss": 1.9434, "step": 20610 }, { "epoch": 0.04016412277661153, "grad_norm": 1.873279094696045, "learning_rate": 2.9880749201386014e-05, "loss": 1.8546, "step": 20625 }, { "epoch": 0.040193333047721796, "grad_norm": 2.1878137588500977, "learning_rate": 2.9880575913219354e-05, "loss": 1.8624, "step": 20640 }, { "epoch": 0.04022254331883206, "grad_norm": 3.9366180896759033, "learning_rate": 2.988040249974128e-05, "loss": 1.9143, "step": 20655 }, { "epoch": 0.04025175358994232, "grad_norm": 2.3610141277313232, "learning_rate": 2.9880228960953236e-05, "loss": 1.9616, "step": 20670 }, { "epoch": 0.04028096386105258, "grad_norm": 2.8658604621887207, "learning_rate": 2.9880055296856695e-05, "loss": 1.7601, "step": 20685 }, { "epoch": 0.04031017413216285, "grad_norm": 4.454057216644287, "learning_rate": 2.9879881507453112e-05, "loss": 1.7578, "step": 20700 }, { "epoch": 0.04033938440327311, "grad_norm": 2.244455575942993, "learning_rate": 2.9879707592743957e-05, "loss": 1.8334, "step": 20715 }, { "epoch": 0.04036859467438337, "grad_norm": 1.61302649974823, "learning_rate": 2.987953355273069e-05, "loss": 1.9746, "step": 20730 }, { "epoch": 0.04039780494549364, "grad_norm": 2.8323678970336914, "learning_rate": 2.987935938741478e-05, "loss": 2.1251, "step": 20745 }, { "epoch": 0.0404270152166039, "grad_norm": 2.418241262435913, "learning_rate": 2.987918509679769e-05, "loss": 2.0798, "step": 20760 }, { "epoch": 0.04045622548771416, "grad_norm": 3.465550661087036, "learning_rate": 2.987901068088089e-05, "loss": 1.8706, "step": 20775 }, { "epoch": 0.04048543575882442, "grad_norm": 4.148263931274414, "learning_rate": 2.987883613966585e-05, "loss": 1.7925, "step": 20790 }, { "epoch": 0.04051464602993469, "grad_norm": 2.4039666652679443, "learning_rate": 2.9878661473154037e-05, "loss": 1.8672, "step": 20805 }, { "epoch": 0.04054385630104495, "grad_norm": 2.12880277633667, "learning_rate": 2.9878486681346923e-05, "loss": 1.8978, "step": 20820 }, { "epoch": 0.04057306657215521, "grad_norm": 1.9612979888916016, "learning_rate": 2.987831176424598e-05, "loss": 1.9175, "step": 20835 }, { "epoch": 0.04060227684326548, "grad_norm": 2.161982297897339, "learning_rate": 2.9878136721852682e-05, "loss": 1.7752, "step": 20850 }, { "epoch": 0.04063148711437574, "grad_norm": 4.4879961013793945, "learning_rate": 2.9877961554168498e-05, "loss": 2.0857, "step": 20865 }, { "epoch": 0.040660697385486, "grad_norm": 4.1571364402771, "learning_rate": 2.9877786261194914e-05, "loss": 1.949, "step": 20880 }, { "epoch": 0.04068990765659626, "grad_norm": 3.3120033740997314, "learning_rate": 2.9877610842933397e-05, "loss": 1.8585, "step": 20895 }, { "epoch": 0.04071911792770653, "grad_norm": 3.193117618560791, "learning_rate": 2.9877435299385424e-05, "loss": 1.731, "step": 20910 }, { "epoch": 0.04074832819881679, "grad_norm": 2.375343084335327, "learning_rate": 2.987725963055248e-05, "loss": 1.8269, "step": 20925 }, { "epoch": 0.04077753846992705, "grad_norm": 2.3607242107391357, "learning_rate": 2.9877083836436036e-05, "loss": 1.8305, "step": 20940 }, { "epoch": 0.04080674874103731, "grad_norm": 3.0205342769622803, "learning_rate": 2.987690791703758e-05, "loss": 1.8631, "step": 20955 }, { "epoch": 0.04083595901214758, "grad_norm": 3.530947685241699, "learning_rate": 2.9876731872358585e-05, "loss": 1.8431, "step": 20970 }, { "epoch": 0.04086516928325784, "grad_norm": 2.8419220447540283, "learning_rate": 2.987655570240054e-05, "loss": 1.8519, "step": 20985 }, { "epoch": 0.0408943795543681, "grad_norm": 2.253532886505127, "learning_rate": 2.9876379407164933e-05, "loss": 1.8688, "step": 21000 }, { "epoch": 0.04092358982547837, "grad_norm": 1.9279251098632812, "learning_rate": 2.987620298665324e-05, "loss": 1.9111, "step": 21015 }, { "epoch": 0.04095280009658863, "grad_norm": 3.0645790100097656, "learning_rate": 2.987602644086695e-05, "loss": 1.8359, "step": 21030 }, { "epoch": 0.04098201036769889, "grad_norm": 2.9384896755218506, "learning_rate": 2.9875849769807544e-05, "loss": 1.747, "step": 21045 }, { "epoch": 0.04101122063880915, "grad_norm": 3.7138500213623047, "learning_rate": 2.987567297347652e-05, "loss": 1.8528, "step": 21060 }, { "epoch": 0.04104043090991942, "grad_norm": 4.440821170806885, "learning_rate": 2.987549605187536e-05, "loss": 1.9611, "step": 21075 }, { "epoch": 0.04106964118102968, "grad_norm": 3.0378854274749756, "learning_rate": 2.9875319005005552e-05, "loss": 2.012, "step": 21090 }, { "epoch": 0.04109885145213994, "grad_norm": 4.378201961517334, "learning_rate": 2.9875141832868598e-05, "loss": 1.7366, "step": 21105 }, { "epoch": 0.04112806172325021, "grad_norm": 4.794327735900879, "learning_rate": 2.9874964535465978e-05, "loss": 1.8728, "step": 21120 }, { "epoch": 0.04115727199436047, "grad_norm": 2.5137903690338135, "learning_rate": 2.987478711279919e-05, "loss": 1.9813, "step": 21135 }, { "epoch": 0.04118648226547073, "grad_norm": 2.2024412155151367, "learning_rate": 2.987460956486973e-05, "loss": 1.7721, "step": 21150 }, { "epoch": 0.04121569253658099, "grad_norm": 3.0043609142303467, "learning_rate": 2.987443189167909e-05, "loss": 1.9015, "step": 21165 }, { "epoch": 0.04124490280769126, "grad_norm": 4.627270698547363, "learning_rate": 2.9874254093228763e-05, "loss": 1.9487, "step": 21180 }, { "epoch": 0.04127411307880152, "grad_norm": 3.208395481109619, "learning_rate": 2.987407616952025e-05, "loss": 1.6989, "step": 21195 }, { "epoch": 0.04130332334991178, "grad_norm": 4.102930545806885, "learning_rate": 2.9873898120555055e-05, "loss": 2.0639, "step": 21210 }, { "epoch": 0.04133253362102205, "grad_norm": 3.846593141555786, "learning_rate": 2.987371994633467e-05, "loss": 1.7067, "step": 21225 }, { "epoch": 0.04136174389213231, "grad_norm": 3.6651105880737305, "learning_rate": 2.9873541646860597e-05, "loss": 1.8983, "step": 21240 }, { "epoch": 0.04139095416324257, "grad_norm": 3.715604543685913, "learning_rate": 2.987336322213434e-05, "loss": 1.6676, "step": 21255 }, { "epoch": 0.041420164434352834, "grad_norm": 3.0780601501464844, "learning_rate": 2.9873184672157395e-05, "loss": 1.9342, "step": 21270 }, { "epoch": 0.0414493747054631, "grad_norm": 3.385103225708008, "learning_rate": 2.9873005996931274e-05, "loss": 1.9494, "step": 21285 }, { "epoch": 0.04147858497657336, "grad_norm": 2.9652836322784424, "learning_rate": 2.9872827196457475e-05, "loss": 1.8491, "step": 21300 }, { "epoch": 0.041507795247683624, "grad_norm": 2.346210479736328, "learning_rate": 2.9872648270737507e-05, "loss": 1.7948, "step": 21315 }, { "epoch": 0.04153700551879389, "grad_norm": 2.0421793460845947, "learning_rate": 2.9872469219772877e-05, "loss": 1.7642, "step": 21330 }, { "epoch": 0.04156621578990415, "grad_norm": 3.2347426414489746, "learning_rate": 2.9872290043565094e-05, "loss": 1.9741, "step": 21345 }, { "epoch": 0.041595426061014414, "grad_norm": 2.8749160766601562, "learning_rate": 2.987211074211566e-05, "loss": 1.8364, "step": 21360 }, { "epoch": 0.041624636332124675, "grad_norm": 3.484539031982422, "learning_rate": 2.9871931315426094e-05, "loss": 1.7097, "step": 21375 }, { "epoch": 0.04165384660323494, "grad_norm": 3.105286121368408, "learning_rate": 2.98717517634979e-05, "loss": 1.8239, "step": 21390 }, { "epoch": 0.0416830568743452, "grad_norm": 3.804901361465454, "learning_rate": 2.9871572086332594e-05, "loss": 1.7356, "step": 21405 }, { "epoch": 0.041712267145455464, "grad_norm": 2.4536283016204834, "learning_rate": 2.9871392283931686e-05, "loss": 1.8601, "step": 21420 }, { "epoch": 0.04174147741656573, "grad_norm": 2.8864688873291016, "learning_rate": 2.9871212356296697e-05, "loss": 1.802, "step": 21435 }, { "epoch": 0.04177068768767599, "grad_norm": 3.4735238552093506, "learning_rate": 2.9871032303429133e-05, "loss": 1.9557, "step": 21450 }, { "epoch": 0.041799897958786254, "grad_norm": 4.21823263168335, "learning_rate": 2.9870852125330513e-05, "loss": 1.8513, "step": 21465 }, { "epoch": 0.041829108229896515, "grad_norm": 1.6568303108215332, "learning_rate": 2.9870671822002357e-05, "loss": 1.8443, "step": 21480 }, { "epoch": 0.04185831850100678, "grad_norm": 3.8471972942352295, "learning_rate": 2.9870491393446184e-05, "loss": 1.8831, "step": 21495 }, { "epoch": 0.041887528772117044, "grad_norm": 4.5181803703308105, "learning_rate": 2.987031083966351e-05, "loss": 2.0743, "step": 21510 }, { "epoch": 0.041916739043227305, "grad_norm": 2.7604262828826904, "learning_rate": 2.987013016065586e-05, "loss": 1.7647, "step": 21525 }, { "epoch": 0.041945949314337566, "grad_norm": 1.6995900869369507, "learning_rate": 2.986994935642475e-05, "loss": 1.9221, "step": 21540 }, { "epoch": 0.041975159585447834, "grad_norm": 3.400073766708374, "learning_rate": 2.9869768426971706e-05, "loss": 1.7217, "step": 21555 }, { "epoch": 0.042004369856558095, "grad_norm": 2.7783281803131104, "learning_rate": 2.986958737229825e-05, "loss": 1.8149, "step": 21570 }, { "epoch": 0.042033580127668356, "grad_norm": 1.9334073066711426, "learning_rate": 2.9869406192405904e-05, "loss": 1.7657, "step": 21585 }, { "epoch": 0.042062790398778624, "grad_norm": 3.244271993637085, "learning_rate": 2.9869224887296205e-05, "loss": 1.8122, "step": 21600 }, { "epoch": 0.042092000669888885, "grad_norm": 3.9582252502441406, "learning_rate": 2.9869043456970662e-05, "loss": 1.8296, "step": 21615 }, { "epoch": 0.042121210940999146, "grad_norm": 3.9118635654449463, "learning_rate": 2.986886190143082e-05, "loss": 1.9162, "step": 21630 }, { "epoch": 0.04215042121210941, "grad_norm": 3.3247733116149902, "learning_rate": 2.9868680220678198e-05, "loss": 1.9345, "step": 21645 }, { "epoch": 0.042179631483219675, "grad_norm": 4.189952373504639, "learning_rate": 2.9868498414714332e-05, "loss": 1.7394, "step": 21660 }, { "epoch": 0.042208841754329936, "grad_norm": 4.380448818206787, "learning_rate": 2.986831648354075e-05, "loss": 1.8411, "step": 21675 }, { "epoch": 0.0422380520254402, "grad_norm": 3.3280911445617676, "learning_rate": 2.986813442715898e-05, "loss": 1.7338, "step": 21690 }, { "epoch": 0.042267262296550465, "grad_norm": 2.147976875305176, "learning_rate": 2.9867952245570557e-05, "loss": 1.7418, "step": 21705 }, { "epoch": 0.042296472567660726, "grad_norm": 4.927828788757324, "learning_rate": 2.9867769938777025e-05, "loss": 1.9729, "step": 21720 }, { "epoch": 0.04232568283877099, "grad_norm": 2.1555793285369873, "learning_rate": 2.9867587506779903e-05, "loss": 1.9733, "step": 21735 }, { "epoch": 0.04235489310988125, "grad_norm": 4.826751232147217, "learning_rate": 2.986740494958074e-05, "loss": 1.9141, "step": 21750 }, { "epoch": 0.042384103380991515, "grad_norm": 2.62713360786438, "learning_rate": 2.986722226718107e-05, "loss": 1.7623, "step": 21765 }, { "epoch": 0.042413313652101776, "grad_norm": 2.9087536334991455, "learning_rate": 2.9867039459582422e-05, "loss": 2.0298, "step": 21780 }, { "epoch": 0.04244252392321204, "grad_norm": 3.2233824729919434, "learning_rate": 2.986685652678635e-05, "loss": 1.8063, "step": 21795 }, { "epoch": 0.042471734194322305, "grad_norm": 3.8226282596588135, "learning_rate": 2.9866673468794392e-05, "loss": 1.8173, "step": 21810 }, { "epoch": 0.042500944465432566, "grad_norm": 2.362210273742676, "learning_rate": 2.986649028560808e-05, "loss": 2.0107, "step": 21825 }, { "epoch": 0.04253015473654283, "grad_norm": 1.4686235189437866, "learning_rate": 2.9866306977228964e-05, "loss": 1.8339, "step": 21840 }, { "epoch": 0.04255936500765309, "grad_norm": 3.2174501419067383, "learning_rate": 2.9866123543658585e-05, "loss": 1.9372, "step": 21855 }, { "epoch": 0.042588575278763356, "grad_norm": 4.215010643005371, "learning_rate": 2.9865939984898494e-05, "loss": 1.7492, "step": 21870 }, { "epoch": 0.04261778554987362, "grad_norm": 6.015155792236328, "learning_rate": 2.9865756300950224e-05, "loss": 1.8146, "step": 21885 }, { "epoch": 0.04264699582098388, "grad_norm": 3.102923631668091, "learning_rate": 2.9865572491815336e-05, "loss": 1.9206, "step": 21900 }, { "epoch": 0.042676206092094146, "grad_norm": 3.5606796741485596, "learning_rate": 2.986538855749537e-05, "loss": 1.7442, "step": 21915 }, { "epoch": 0.04270541636320441, "grad_norm": 3.877696990966797, "learning_rate": 2.9865204497991874e-05, "loss": 1.8356, "step": 21930 }, { "epoch": 0.04273462663431467, "grad_norm": 2.7707462310791016, "learning_rate": 2.98650203133064e-05, "loss": 2.1114, "step": 21945 }, { "epoch": 0.04276383690542493, "grad_norm": 3.8471269607543945, "learning_rate": 2.9864836003440496e-05, "loss": 1.974, "step": 21960 }, { "epoch": 0.0427930471765352, "grad_norm": 4.265331268310547, "learning_rate": 2.9864651568395728e-05, "loss": 1.7851, "step": 21975 }, { "epoch": 0.04282225744764546, "grad_norm": 3.578641414642334, "learning_rate": 2.986446700817363e-05, "loss": 1.7996, "step": 21990 }, { "epoch": 0.04285146771875572, "grad_norm": 3.289726495742798, "learning_rate": 2.9864282322775768e-05, "loss": 1.7165, "step": 22005 }, { "epoch": 0.04288067798986598, "grad_norm": 1.883353590965271, "learning_rate": 2.986409751220369e-05, "loss": 1.8633, "step": 22020 }, { "epoch": 0.04290988826097625, "grad_norm": 1.7199418544769287, "learning_rate": 2.9863912576458955e-05, "loss": 1.7696, "step": 22035 }, { "epoch": 0.04293909853208651, "grad_norm": 3.445349931716919, "learning_rate": 2.986372751554313e-05, "loss": 1.8349, "step": 22050 }, { "epoch": 0.04296830880319677, "grad_norm": 2.827427387237549, "learning_rate": 2.986354232945776e-05, "loss": 1.8647, "step": 22065 }, { "epoch": 0.04299751907430704, "grad_norm": 2.4225332736968994, "learning_rate": 2.986335701820441e-05, "loss": 2.0951, "step": 22080 }, { "epoch": 0.0430267293454173, "grad_norm": 2.9156410694122314, "learning_rate": 2.986317158178464e-05, "loss": 1.7989, "step": 22095 }, { "epoch": 0.04305593961652756, "grad_norm": 2.3038766384124756, "learning_rate": 2.986298602020001e-05, "loss": 1.8867, "step": 22110 }, { "epoch": 0.04308514988763782, "grad_norm": 3.085129976272583, "learning_rate": 2.986280033345209e-05, "loss": 1.6969, "step": 22125 }, { "epoch": 0.04311436015874809, "grad_norm": 4.071490287780762, "learning_rate": 2.986261452154243e-05, "loss": 1.7865, "step": 22140 }, { "epoch": 0.04314357042985835, "grad_norm": 2.2542295455932617, "learning_rate": 2.986242858447261e-05, "loss": 1.8687, "step": 22155 }, { "epoch": 0.04317278070096861, "grad_norm": 3.3869330883026123, "learning_rate": 2.9862242522244183e-05, "loss": 1.8037, "step": 22170 }, { "epoch": 0.04320199097207888, "grad_norm": 3.7993576526641846, "learning_rate": 2.9862056334858727e-05, "loss": 1.8819, "step": 22185 }, { "epoch": 0.04323120124318914, "grad_norm": 3.019289255142212, "learning_rate": 2.9861870022317798e-05, "loss": 1.8932, "step": 22200 }, { "epoch": 0.0432604115142994, "grad_norm": 2.831664562225342, "learning_rate": 2.9861683584622976e-05, "loss": 1.8813, "step": 22215 }, { "epoch": 0.04328962178540966, "grad_norm": 4.9506754875183105, "learning_rate": 2.9861497021775825e-05, "loss": 1.7917, "step": 22230 }, { "epoch": 0.04331883205651993, "grad_norm": 2.384033203125, "learning_rate": 2.986131033377792e-05, "loss": 2.0226, "step": 22245 }, { "epoch": 0.04334804232763019, "grad_norm": 1.8661621809005737, "learning_rate": 2.9861123520630828e-05, "loss": 1.817, "step": 22260 }, { "epoch": 0.04337725259874045, "grad_norm": 3.5283803939819336, "learning_rate": 2.9860936582336123e-05, "loss": 1.806, "step": 22275 }, { "epoch": 0.04340646286985072, "grad_norm": 4.240048408508301, "learning_rate": 2.9860749518895386e-05, "loss": 1.7773, "step": 22290 }, { "epoch": 0.04343567314096098, "grad_norm": 3.9326212406158447, "learning_rate": 2.986056233031018e-05, "loss": 1.8494, "step": 22305 }, { "epoch": 0.04346488341207124, "grad_norm": 4.390437602996826, "learning_rate": 2.986037501658209e-05, "loss": 1.7608, "step": 22320 }, { "epoch": 0.0434940936831815, "grad_norm": 2.5305089950561523, "learning_rate": 2.98601875777127e-05, "loss": 1.7295, "step": 22335 }, { "epoch": 0.04352330395429177, "grad_norm": 2.4276509284973145, "learning_rate": 2.9860000013703576e-05, "loss": 2.0082, "step": 22350 }, { "epoch": 0.04355251422540203, "grad_norm": 3.4239320755004883, "learning_rate": 2.9859812324556298e-05, "loss": 1.6979, "step": 22365 }, { "epoch": 0.04358172449651229, "grad_norm": 2.544837474822998, "learning_rate": 2.985962451027245e-05, "loss": 1.7844, "step": 22380 }, { "epoch": 0.04361093476762256, "grad_norm": 5.094317436218262, "learning_rate": 2.985943657085362e-05, "loss": 1.8792, "step": 22395 }, { "epoch": 0.04364014503873282, "grad_norm": 3.235121250152588, "learning_rate": 2.985924850630138e-05, "loss": 1.7795, "step": 22410 }, { "epoch": 0.04366935530984308, "grad_norm": 4.88664436340332, "learning_rate": 2.9859060316617325e-05, "loss": 1.9035, "step": 22425 }, { "epoch": 0.04369856558095334, "grad_norm": 2.401301622390747, "learning_rate": 2.9858872001803025e-05, "loss": 1.9182, "step": 22440 }, { "epoch": 0.04372777585206361, "grad_norm": 2.24088978767395, "learning_rate": 2.9858683561860077e-05, "loss": 1.688, "step": 22455 }, { "epoch": 0.04375698612317387, "grad_norm": 2.521176338195801, "learning_rate": 2.9858494996790065e-05, "loss": 1.962, "step": 22470 }, { "epoch": 0.04378619639428413, "grad_norm": 2.897905111312866, "learning_rate": 2.9858306306594578e-05, "loss": 1.7388, "step": 22485 }, { "epoch": 0.0438154066653944, "grad_norm": 2.3301055431365967, "learning_rate": 2.9858117491275204e-05, "loss": 1.8834, "step": 22500 }, { "epoch": 0.04384461693650466, "grad_norm": 2.078205108642578, "learning_rate": 2.9857928550833533e-05, "loss": 1.8959, "step": 22515 }, { "epoch": 0.04387382720761492, "grad_norm": 2.8938355445861816, "learning_rate": 2.9857739485271153e-05, "loss": 1.8529, "step": 22530 }, { "epoch": 0.04390303747872518, "grad_norm": 2.0363752841949463, "learning_rate": 2.9857550294589663e-05, "loss": 1.9974, "step": 22545 }, { "epoch": 0.04393224774983545, "grad_norm": 3.58372163772583, "learning_rate": 2.9857360978790647e-05, "loss": 1.7921, "step": 22560 }, { "epoch": 0.04396145802094571, "grad_norm": 3.4086432456970215, "learning_rate": 2.985717153787571e-05, "loss": 1.8401, "step": 22575 }, { "epoch": 0.04399066829205597, "grad_norm": 2.3264150619506836, "learning_rate": 2.985698197184644e-05, "loss": 1.8789, "step": 22590 }, { "epoch": 0.044019878563166234, "grad_norm": 2.727571964263916, "learning_rate": 2.9856792280704435e-05, "loss": 1.7859, "step": 22605 }, { "epoch": 0.0440490888342765, "grad_norm": 3.833146095275879, "learning_rate": 2.9856602464451293e-05, "loss": 1.9976, "step": 22620 }, { "epoch": 0.04407829910538676, "grad_norm": 1.8543720245361328, "learning_rate": 2.9856412523088612e-05, "loss": 1.8875, "step": 22635 }, { "epoch": 0.044107509376497024, "grad_norm": 2.6003103256225586, "learning_rate": 2.9856222456617993e-05, "loss": 2.0006, "step": 22650 }, { "epoch": 0.04413671964760729, "grad_norm": 3.0136611461639404, "learning_rate": 2.9856032265041035e-05, "loss": 1.8309, "step": 22665 }, { "epoch": 0.04416592991871755, "grad_norm": 1.948042631149292, "learning_rate": 2.9855841948359337e-05, "loss": 1.8721, "step": 22680 }, { "epoch": 0.044195140189827814, "grad_norm": 3.1908979415893555, "learning_rate": 2.9855651506574507e-05, "loss": 1.8373, "step": 22695 }, { "epoch": 0.044224350460938075, "grad_norm": 3.519826889038086, "learning_rate": 2.985546093968815e-05, "loss": 1.8992, "step": 22710 }, { "epoch": 0.04425356073204834, "grad_norm": 4.259973526000977, "learning_rate": 2.985527024770186e-05, "loss": 1.9466, "step": 22725 }, { "epoch": 0.044282771003158604, "grad_norm": 2.69942569732666, "learning_rate": 2.9855079430617253e-05, "loss": 1.6805, "step": 22740 }, { "epoch": 0.044311981274268865, "grad_norm": 3.0210318565368652, "learning_rate": 2.9854888488435933e-05, "loss": 1.8744, "step": 22755 }, { "epoch": 0.04434119154537913, "grad_norm": 2.448091745376587, "learning_rate": 2.9854697421159505e-05, "loss": 1.8449, "step": 22770 }, { "epoch": 0.044370401816489394, "grad_norm": 3.8297231197357178, "learning_rate": 2.9854506228789586e-05, "loss": 1.8071, "step": 22785 }, { "epoch": 0.044399612087599655, "grad_norm": 4.065975189208984, "learning_rate": 2.9854314911327777e-05, "loss": 1.9335, "step": 22800 }, { "epoch": 0.044428822358709916, "grad_norm": 2.0296108722686768, "learning_rate": 2.9854123468775693e-05, "loss": 1.748, "step": 22815 }, { "epoch": 0.044458032629820184, "grad_norm": 1.7504347562789917, "learning_rate": 2.985393190113495e-05, "loss": 1.9084, "step": 22830 }, { "epoch": 0.044487242900930445, "grad_norm": 2.8802502155303955, "learning_rate": 2.9853740208407152e-05, "loss": 1.8649, "step": 22845 }, { "epoch": 0.044516453172040706, "grad_norm": 3.5376250743865967, "learning_rate": 2.985354839059392e-05, "loss": 1.7617, "step": 22860 }, { "epoch": 0.044545663443150973, "grad_norm": 4.846216201782227, "learning_rate": 2.985335644769687e-05, "loss": 1.7531, "step": 22875 }, { "epoch": 0.044574873714261234, "grad_norm": 1.9456549882888794, "learning_rate": 2.9853164379717615e-05, "loss": 1.9782, "step": 22890 }, { "epoch": 0.044604083985371495, "grad_norm": 3.7463254928588867, "learning_rate": 2.9852972186657774e-05, "loss": 1.9347, "step": 22905 }, { "epoch": 0.044633294256481756, "grad_norm": 1.9641201496124268, "learning_rate": 2.9852779868518967e-05, "loss": 1.8366, "step": 22920 }, { "epoch": 0.044662504527592024, "grad_norm": 3.9136605262756348, "learning_rate": 2.9852587425302812e-05, "loss": 2.0271, "step": 22935 }, { "epoch": 0.044691714798702285, "grad_norm": 2.161766767501831, "learning_rate": 2.9852394857010923e-05, "loss": 1.8681, "step": 22950 }, { "epoch": 0.044720925069812546, "grad_norm": 3.1569862365722656, "learning_rate": 2.9852202163644937e-05, "loss": 1.7996, "step": 22965 }, { "epoch": 0.044750135340922814, "grad_norm": 2.0586421489715576, "learning_rate": 2.9852009345206458e-05, "loss": 1.7727, "step": 22980 }, { "epoch": 0.044779345612033075, "grad_norm": 5.425686359405518, "learning_rate": 2.9851816401697127e-05, "loss": 1.9209, "step": 22995 }, { "epoch": 0.044808555883143336, "grad_norm": 5.385043621063232, "learning_rate": 2.985162333311856e-05, "loss": 1.8473, "step": 23010 }, { "epoch": 0.0448377661542536, "grad_norm": 3.4577736854553223, "learning_rate": 2.985143013947238e-05, "loss": 1.848, "step": 23025 }, { "epoch": 0.044866976425363865, "grad_norm": 3.0630152225494385, "learning_rate": 2.985123682076022e-05, "loss": 1.7272, "step": 23040 }, { "epoch": 0.044896186696474126, "grad_norm": 3.8338427543640137, "learning_rate": 2.985104337698371e-05, "loss": 1.9042, "step": 23055 }, { "epoch": 0.04492539696758439, "grad_norm": 3.1702969074249268, "learning_rate": 2.985084980814447e-05, "loss": 1.9389, "step": 23070 }, { "epoch": 0.04495460723869465, "grad_norm": 2.758162021636963, "learning_rate": 2.985065611424414e-05, "loss": 1.8825, "step": 23085 }, { "epoch": 0.044983817509804916, "grad_norm": 2.1373486518859863, "learning_rate": 2.985046229528434e-05, "loss": 1.9451, "step": 23100 }, { "epoch": 0.04501302778091518, "grad_norm": 2.1302478313446045, "learning_rate": 2.985026835126671e-05, "loss": 2.0208, "step": 23115 }, { "epoch": 0.04504223805202544, "grad_norm": 3.8777012825012207, "learning_rate": 2.985007428219289e-05, "loss": 2.0138, "step": 23130 }, { "epoch": 0.045071448323135706, "grad_norm": 3.428769111633301, "learning_rate": 2.9849880088064497e-05, "loss": 1.9868, "step": 23145 }, { "epoch": 0.04510065859424597, "grad_norm": 5.036014556884766, "learning_rate": 2.9849685768883172e-05, "loss": 1.7348, "step": 23160 }, { "epoch": 0.04512986886535623, "grad_norm": 3.206932306289673, "learning_rate": 2.9849491324650563e-05, "loss": 1.9991, "step": 23175 }, { "epoch": 0.04515907913646649, "grad_norm": 2.5311954021453857, "learning_rate": 2.9849296755368297e-05, "loss": 1.7501, "step": 23190 }, { "epoch": 0.04518828940757676, "grad_norm": 2.1587252616882324, "learning_rate": 2.984910206103801e-05, "loss": 1.7981, "step": 23205 }, { "epoch": 0.04521749967868702, "grad_norm": 2.198859453201294, "learning_rate": 2.984890724166135e-05, "loss": 1.9349, "step": 23220 }, { "epoch": 0.04524670994979728, "grad_norm": 3.513099193572998, "learning_rate": 2.9848712297239955e-05, "loss": 1.7639, "step": 23235 }, { "epoch": 0.045275920220907546, "grad_norm": 4.66519021987915, "learning_rate": 2.984851722777546e-05, "loss": 1.6865, "step": 23250 }, { "epoch": 0.04530513049201781, "grad_norm": 2.0970771312713623, "learning_rate": 2.9848322033269523e-05, "loss": 1.8861, "step": 23265 }, { "epoch": 0.04533434076312807, "grad_norm": 4.110560417175293, "learning_rate": 2.984812671372377e-05, "loss": 1.8482, "step": 23280 }, { "epoch": 0.04536355103423833, "grad_norm": 3.2896182537078857, "learning_rate": 2.9847931269139854e-05, "loss": 1.9021, "step": 23295 }, { "epoch": 0.0453927613053486, "grad_norm": 1.929777979850769, "learning_rate": 2.9847735699519423e-05, "loss": 1.9236, "step": 23310 }, { "epoch": 0.04542197157645886, "grad_norm": 2.989177703857422, "learning_rate": 2.9847540004864115e-05, "loss": 1.8256, "step": 23325 }, { "epoch": 0.04545118184756912, "grad_norm": 2.735318899154663, "learning_rate": 2.984734418517559e-05, "loss": 1.8279, "step": 23340 }, { "epoch": 0.04548039211867939, "grad_norm": 2.597491502761841, "learning_rate": 2.9847148240455495e-05, "loss": 1.8952, "step": 23355 }, { "epoch": 0.04550960238978965, "grad_norm": 3.5635156631469727, "learning_rate": 2.9846952170705473e-05, "loss": 1.7302, "step": 23370 }, { "epoch": 0.04553881266089991, "grad_norm": 6.853522777557373, "learning_rate": 2.9846755975927174e-05, "loss": 2.0572, "step": 23385 }, { "epoch": 0.04556802293201017, "grad_norm": 2.006582498550415, "learning_rate": 2.984655965612226e-05, "loss": 1.939, "step": 23400 }, { "epoch": 0.04559723320312044, "grad_norm": 3.6737654209136963, "learning_rate": 2.984636321129238e-05, "loss": 1.8714, "step": 23415 }, { "epoch": 0.0456264434742307, "grad_norm": 2.5802881717681885, "learning_rate": 2.9846166641439185e-05, "loss": 1.8687, "step": 23430 }, { "epoch": 0.04565565374534096, "grad_norm": 3.748178005218506, "learning_rate": 2.984596994656433e-05, "loss": 1.9146, "step": 23445 }, { "epoch": 0.04568486401645123, "grad_norm": 2.5193021297454834, "learning_rate": 2.9845773126669475e-05, "loss": 1.8845, "step": 23460 }, { "epoch": 0.04571407428756149, "grad_norm": 2.3217403888702393, "learning_rate": 2.984557618175628e-05, "loss": 1.9149, "step": 23475 }, { "epoch": 0.04574328455867175, "grad_norm": 3.9094045162200928, "learning_rate": 2.98453791118264e-05, "loss": 1.8615, "step": 23490 }, { "epoch": 0.04577249482978201, "grad_norm": 2.190436840057373, "learning_rate": 2.9845181916881495e-05, "loss": 1.8181, "step": 23505 }, { "epoch": 0.04580170510089228, "grad_norm": 3.33785343170166, "learning_rate": 2.984498459692322e-05, "loss": 1.9345, "step": 23520 }, { "epoch": 0.04583091537200254, "grad_norm": 4.065570831298828, "learning_rate": 2.9844787151953242e-05, "loss": 1.7801, "step": 23535 }, { "epoch": 0.0458601256431128, "grad_norm": 3.4585118293762207, "learning_rate": 2.984458958197323e-05, "loss": 1.8181, "step": 23550 }, { "epoch": 0.04588933591422307, "grad_norm": 2.1615355014801025, "learning_rate": 2.984439188698484e-05, "loss": 1.8265, "step": 23565 }, { "epoch": 0.04591854618533333, "grad_norm": 3.9168038368225098, "learning_rate": 2.9844194066989737e-05, "loss": 1.8508, "step": 23580 }, { "epoch": 0.04594775645644359, "grad_norm": 2.436638593673706, "learning_rate": 2.9843996121989587e-05, "loss": 1.9247, "step": 23595 }, { "epoch": 0.04597696672755385, "grad_norm": 3.637157678604126, "learning_rate": 2.9843798051986053e-05, "loss": 1.9791, "step": 23610 }, { "epoch": 0.04600617699866412, "grad_norm": 2.9777755737304688, "learning_rate": 2.9843599856980815e-05, "loss": 1.8227, "step": 23625 }, { "epoch": 0.04603538726977438, "grad_norm": 5.862057209014893, "learning_rate": 2.9843401536975533e-05, "loss": 1.9745, "step": 23640 }, { "epoch": 0.04606459754088464, "grad_norm": 2.937347173690796, "learning_rate": 2.9843203091971878e-05, "loss": 1.7575, "step": 23655 }, { "epoch": 0.0460938078119949, "grad_norm": 3.7695438861846924, "learning_rate": 2.984300452197152e-05, "loss": 1.9655, "step": 23670 }, { "epoch": 0.04612301808310517, "grad_norm": 3.1122677326202393, "learning_rate": 2.9842805826976137e-05, "loss": 2.0459, "step": 23685 }, { "epoch": 0.04615222835421543, "grad_norm": 4.503921031951904, "learning_rate": 2.98426070069874e-05, "loss": 1.8319, "step": 23700 }, { "epoch": 0.04618143862532569, "grad_norm": 2.247570514678955, "learning_rate": 2.9842408062006982e-05, "loss": 1.765, "step": 23715 }, { "epoch": 0.04621064889643596, "grad_norm": 3.512427806854248, "learning_rate": 2.9842208992036554e-05, "loss": 1.8118, "step": 23730 }, { "epoch": 0.04623985916754622, "grad_norm": 3.174893379211426, "learning_rate": 2.98420097970778e-05, "loss": 1.8791, "step": 23745 }, { "epoch": 0.04626906943865648, "grad_norm": 5.151320934295654, "learning_rate": 2.9841810477132392e-05, "loss": 1.817, "step": 23760 }, { "epoch": 0.04629827970976674, "grad_norm": 2.6584534645080566, "learning_rate": 2.984161103220201e-05, "loss": 1.8441, "step": 23775 }, { "epoch": 0.04632748998087701, "grad_norm": 1.967596173286438, "learning_rate": 2.9841411462288335e-05, "loss": 2.0349, "step": 23790 }, { "epoch": 0.04635670025198727, "grad_norm": 3.7256250381469727, "learning_rate": 2.9841211767393048e-05, "loss": 1.8131, "step": 23805 }, { "epoch": 0.04638591052309753, "grad_norm": 4.081748962402344, "learning_rate": 2.9841011947517826e-05, "loss": 1.8664, "step": 23820 }, { "epoch": 0.0464151207942078, "grad_norm": 2.527451276779175, "learning_rate": 2.984081200266436e-05, "loss": 1.8916, "step": 23835 }, { "epoch": 0.04644433106531806, "grad_norm": 2.159346103668213, "learning_rate": 2.9840611932834326e-05, "loss": 1.9254, "step": 23850 }, { "epoch": 0.04647354133642832, "grad_norm": 2.636519432067871, "learning_rate": 2.9840411738029412e-05, "loss": 1.8062, "step": 23865 }, { "epoch": 0.046502751607538584, "grad_norm": 2.3750388622283936, "learning_rate": 2.9840211418251303e-05, "loss": 1.7347, "step": 23880 }, { "epoch": 0.04653196187864885, "grad_norm": 2.181845188140869, "learning_rate": 2.9840010973501685e-05, "loss": 1.9207, "step": 23895 }, { "epoch": 0.04656117214975911, "grad_norm": 3.098555564880371, "learning_rate": 2.9839810403782252e-05, "loss": 1.9566, "step": 23910 }, { "epoch": 0.046590382420869374, "grad_norm": 5.142632484436035, "learning_rate": 2.9839609709094685e-05, "loss": 1.8871, "step": 23925 }, { "epoch": 0.04661959269197964, "grad_norm": 1.7162665128707886, "learning_rate": 2.9839408889440678e-05, "loss": 1.8784, "step": 23940 }, { "epoch": 0.0466488029630899, "grad_norm": 3.743401050567627, "learning_rate": 2.9839207944821925e-05, "loss": 1.9179, "step": 23955 }, { "epoch": 0.046678013234200164, "grad_norm": 2.952817440032959, "learning_rate": 2.983900687524011e-05, "loss": 1.9182, "step": 23970 }, { "epoch": 0.046707223505310425, "grad_norm": 2.3484249114990234, "learning_rate": 2.983880568069693e-05, "loss": 1.8336, "step": 23985 }, { "epoch": 0.04673643377642069, "grad_norm": 1.931921362876892, "learning_rate": 2.9838604361194087e-05, "loss": 1.771, "step": 24000 }, { "epoch": 0.04676564404753095, "grad_norm": 5.197054386138916, "learning_rate": 2.9838402916733263e-05, "loss": 1.8558, "step": 24015 }, { "epoch": 0.046794854318641214, "grad_norm": 2.862427234649658, "learning_rate": 2.9838201347316164e-05, "loss": 1.6956, "step": 24030 }, { "epoch": 0.04682406458975148, "grad_norm": 2.6003293991088867, "learning_rate": 2.9837999652944487e-05, "loss": 2.0473, "step": 24045 }, { "epoch": 0.04685327486086174, "grad_norm": 2.068455934524536, "learning_rate": 2.9837797833619926e-05, "loss": 1.9233, "step": 24060 }, { "epoch": 0.046882485131972004, "grad_norm": 3.4722557067871094, "learning_rate": 2.983759588934418e-05, "loss": 2.0052, "step": 24075 }, { "epoch": 0.046911695403082265, "grad_norm": 3.198732852935791, "learning_rate": 2.9837393820118954e-05, "loss": 1.906, "step": 24090 }, { "epoch": 0.04694090567419253, "grad_norm": 2.5050504207611084, "learning_rate": 2.983719162594595e-05, "loss": 1.833, "step": 24105 }, { "epoch": 0.046970115945302794, "grad_norm": 2.1438305377960205, "learning_rate": 2.9836989306826866e-05, "loss": 1.7439, "step": 24120 }, { "epoch": 0.046999326216413055, "grad_norm": 1.5980552434921265, "learning_rate": 2.983678686276341e-05, "loss": 1.8018, "step": 24135 }, { "epoch": 0.047028536487523316, "grad_norm": 2.566882610321045, "learning_rate": 2.9836584293757282e-05, "loss": 1.8807, "step": 24150 }, { "epoch": 0.047057746758633584, "grad_norm": 4.125818252563477, "learning_rate": 2.9836381599810196e-05, "loss": 1.8198, "step": 24165 }, { "epoch": 0.047086957029743845, "grad_norm": 3.1217780113220215, "learning_rate": 2.983617878092385e-05, "loss": 2.0202, "step": 24180 }, { "epoch": 0.047116167300854106, "grad_norm": 3.001835823059082, "learning_rate": 2.9835975837099956e-05, "loss": 1.851, "step": 24195 }, { "epoch": 0.047145377571964374, "grad_norm": 4.060051918029785, "learning_rate": 2.9835772768340225e-05, "loss": 1.8918, "step": 24210 }, { "epoch": 0.047174587843074635, "grad_norm": 4.199372291564941, "learning_rate": 2.9835569574646363e-05, "loss": 1.848, "step": 24225 }, { "epoch": 0.047203798114184896, "grad_norm": 3.1068427562713623, "learning_rate": 2.9835366256020085e-05, "loss": 1.793, "step": 24240 }, { "epoch": 0.04723300838529516, "grad_norm": 3.30556321144104, "learning_rate": 2.9835162812463098e-05, "loss": 1.8914, "step": 24255 }, { "epoch": 0.047262218656405425, "grad_norm": 3.898911476135254, "learning_rate": 2.9834959243977123e-05, "loss": 1.929, "step": 24270 }, { "epoch": 0.047291428927515686, "grad_norm": 2.1169817447662354, "learning_rate": 2.9834755550563865e-05, "loss": 1.9513, "step": 24285 }, { "epoch": 0.04732063919862595, "grad_norm": 3.0540499687194824, "learning_rate": 2.9834551732225044e-05, "loss": 1.8283, "step": 24300 }, { "epoch": 0.047349849469736215, "grad_norm": 2.495234727859497, "learning_rate": 2.9834347788962383e-05, "loss": 1.8268, "step": 24315 }, { "epoch": 0.047379059740846476, "grad_norm": 4.254831790924072, "learning_rate": 2.9834143720777588e-05, "loss": 1.8933, "step": 24330 }, { "epoch": 0.04740827001195674, "grad_norm": 5.430483818054199, "learning_rate": 2.9833939527672384e-05, "loss": 1.8128, "step": 24345 }, { "epoch": 0.047437480283067, "grad_norm": 3.332515239715576, "learning_rate": 2.983373520964849e-05, "loss": 1.9089, "step": 24360 }, { "epoch": 0.047466690554177265, "grad_norm": 2.1588125228881836, "learning_rate": 2.983353076670762e-05, "loss": 1.7578, "step": 24375 }, { "epoch": 0.047495900825287526, "grad_norm": 2.6773619651794434, "learning_rate": 2.9833326198851503e-05, "loss": 1.9486, "step": 24390 }, { "epoch": 0.04752511109639779, "grad_norm": 2.855078935623169, "learning_rate": 2.9833121506081862e-05, "loss": 1.9339, "step": 24405 }, { "epoch": 0.047554321367508055, "grad_norm": 2.539210557937622, "learning_rate": 2.983291668840042e-05, "loss": 1.8479, "step": 24420 }, { "epoch": 0.047583531638618316, "grad_norm": 3.0605132579803467, "learning_rate": 2.9832711745808895e-05, "loss": 1.8924, "step": 24435 }, { "epoch": 0.04761274190972858, "grad_norm": 2.142998218536377, "learning_rate": 2.9832506678309025e-05, "loss": 2.0331, "step": 24450 }, { "epoch": 0.04764195218083884, "grad_norm": 3.082610845565796, "learning_rate": 2.983230148590253e-05, "loss": 1.8299, "step": 24465 }, { "epoch": 0.047671162451949106, "grad_norm": 4.011190414428711, "learning_rate": 2.9832096168591128e-05, "loss": 1.8591, "step": 24480 }, { "epoch": 0.04770037272305937, "grad_norm": 2.326756238937378, "learning_rate": 2.983189072637657e-05, "loss": 1.8687, "step": 24495 }, { "epoch": 0.04772958299416963, "grad_norm": 2.1641993522644043, "learning_rate": 2.9831685159260568e-05, "loss": 1.9402, "step": 24510 }, { "epoch": 0.047758793265279896, "grad_norm": 2.614429473876953, "learning_rate": 2.983147946724486e-05, "loss": 1.8911, "step": 24525 }, { "epoch": 0.04778800353639016, "grad_norm": 2.163760185241699, "learning_rate": 2.983127365033118e-05, "loss": 1.7623, "step": 24540 }, { "epoch": 0.04781721380750042, "grad_norm": 1.837815761566162, "learning_rate": 2.9831067708521257e-05, "loss": 2.0103, "step": 24555 }, { "epoch": 0.04784642407861068, "grad_norm": 4.629458904266357, "learning_rate": 2.9830861641816826e-05, "loss": 1.8473, "step": 24570 }, { "epoch": 0.04787563434972095, "grad_norm": 2.741942882537842, "learning_rate": 2.9830655450219623e-05, "loss": 1.8219, "step": 24585 }, { "epoch": 0.04790484462083121, "grad_norm": 2.6273906230926514, "learning_rate": 2.9830449133731387e-05, "loss": 1.7688, "step": 24600 }, { "epoch": 0.04793405489194147, "grad_norm": 2.2749216556549072, "learning_rate": 2.983024269235385e-05, "loss": 1.9809, "step": 24615 }, { "epoch": 0.04796326516305174, "grad_norm": 4.1970367431640625, "learning_rate": 2.9830036126088754e-05, "loss": 1.7998, "step": 24630 }, { "epoch": 0.047992475434162, "grad_norm": 2.87906813621521, "learning_rate": 2.982982943493784e-05, "loss": 1.8496, "step": 24645 }, { "epoch": 0.04802168570527226, "grad_norm": 1.9954204559326172, "learning_rate": 2.9829622618902848e-05, "loss": 1.8782, "step": 24660 }, { "epoch": 0.04805089597638252, "grad_norm": 3.3936777114868164, "learning_rate": 2.982941567798551e-05, "loss": 1.7, "step": 24675 }, { "epoch": 0.04808010624749279, "grad_norm": 5.30580997467041, "learning_rate": 2.9829208612187585e-05, "loss": 1.8712, "step": 24690 }, { "epoch": 0.04810931651860305, "grad_norm": 1.797080636024475, "learning_rate": 2.982900142151081e-05, "loss": 1.8711, "step": 24705 }, { "epoch": 0.04813852678971331, "grad_norm": 1.7338413000106812, "learning_rate": 2.9828794105956922e-05, "loss": 1.9125, "step": 24720 }, { "epoch": 0.04816773706082357, "grad_norm": 6.833633899688721, "learning_rate": 2.9828586665527677e-05, "loss": 1.8622, "step": 24735 }, { "epoch": 0.04819694733193384, "grad_norm": 2.8855645656585693, "learning_rate": 2.9828379100224814e-05, "loss": 1.7596, "step": 24750 }, { "epoch": 0.0482261576030441, "grad_norm": 3.7425286769866943, "learning_rate": 2.982817141005009e-05, "loss": 1.8452, "step": 24765 }, { "epoch": 0.04825536787415436, "grad_norm": 4.187405109405518, "learning_rate": 2.9827963595005248e-05, "loss": 1.8669, "step": 24780 }, { "epoch": 0.04828457814526463, "grad_norm": 4.476016998291016, "learning_rate": 2.982775565509204e-05, "loss": 1.9221, "step": 24795 }, { "epoch": 0.04831378841637489, "grad_norm": 3.6991500854492188, "learning_rate": 2.9827547590312213e-05, "loss": 1.5434, "step": 24810 }, { "epoch": 0.04834299868748515, "grad_norm": 3.0707576274871826, "learning_rate": 2.9827339400667524e-05, "loss": 1.88, "step": 24825 }, { "epoch": 0.04837220895859541, "grad_norm": 2.604163646697998, "learning_rate": 2.9827131086159723e-05, "loss": 1.9946, "step": 24840 }, { "epoch": 0.04840141922970568, "grad_norm": 4.7181549072265625, "learning_rate": 2.9826922646790568e-05, "loss": 1.8807, "step": 24855 }, { "epoch": 0.04843062950081594, "grad_norm": 3.983146905899048, "learning_rate": 2.9826714082561808e-05, "loss": 1.8593, "step": 24870 }, { "epoch": 0.0484598397719262, "grad_norm": 4.0342912673950195, "learning_rate": 2.982650539347521e-05, "loss": 1.9208, "step": 24885 }, { "epoch": 0.04848905004303647, "grad_norm": 3.092644691467285, "learning_rate": 2.982629657953252e-05, "loss": 1.9611, "step": 24900 }, { "epoch": 0.04851826031414673, "grad_norm": 2.244311809539795, "learning_rate": 2.98260876407355e-05, "loss": 1.7965, "step": 24915 }, { "epoch": 0.04854747058525699, "grad_norm": 4.740140914916992, "learning_rate": 2.9825878577085917e-05, "loss": 2.0418, "step": 24930 }, { "epoch": 0.04857668085636725, "grad_norm": 1.6131473779678345, "learning_rate": 2.9825669388585523e-05, "loss": 1.7781, "step": 24945 }, { "epoch": 0.04860589112747752, "grad_norm": 3.3024797439575195, "learning_rate": 2.9825460075236077e-05, "loss": 1.9176, "step": 24960 }, { "epoch": 0.04863510139858778, "grad_norm": 3.6958138942718506, "learning_rate": 2.9825250637039348e-05, "loss": 1.8613, "step": 24975 }, { "epoch": 0.04866431166969804, "grad_norm": 2.277597427368164, "learning_rate": 2.9825041073997102e-05, "loss": 1.9942, "step": 24990 }, { "epoch": 0.04869352194080831, "grad_norm": 3.3070085048675537, "learning_rate": 2.9824831386111103e-05, "loss": 1.5052, "step": 25005 }, { "epoch": 0.04872273221191857, "grad_norm": 2.769148826599121, "learning_rate": 2.9824621573383107e-05, "loss": 1.8361, "step": 25020 }, { "epoch": 0.04875194248302883, "grad_norm": 2.3901126384735107, "learning_rate": 2.982441163581489e-05, "loss": 1.8346, "step": 25035 }, { "epoch": 0.04878115275413909, "grad_norm": 3.8897323608398438, "learning_rate": 2.9824201573408218e-05, "loss": 1.8186, "step": 25050 }, { "epoch": 0.04881036302524936, "grad_norm": 1.947713017463684, "learning_rate": 2.982399138616486e-05, "loss": 1.8812, "step": 25065 }, { "epoch": 0.04883957329635962, "grad_norm": 1.6136304140090942, "learning_rate": 2.9823781074086582e-05, "loss": 2.0161, "step": 25080 }, { "epoch": 0.04886878356746988, "grad_norm": 2.6880128383636475, "learning_rate": 2.9823570637175166e-05, "loss": 1.8861, "step": 25095 }, { "epoch": 0.04889799383858015, "grad_norm": 4.511691570281982, "learning_rate": 2.982336007543237e-05, "loss": 1.8334, "step": 25110 }, { "epoch": 0.04892720410969041, "grad_norm": 2.540619134902954, "learning_rate": 2.9823149388859975e-05, "loss": 1.8554, "step": 25125 }, { "epoch": 0.04895641438080067, "grad_norm": 2.650416135787964, "learning_rate": 2.982293857745976e-05, "loss": 2.0556, "step": 25140 }, { "epoch": 0.04898562465191093, "grad_norm": 4.075965881347656, "learning_rate": 2.9822727641233488e-05, "loss": 1.7369, "step": 25155 }, { "epoch": 0.0490148349230212, "grad_norm": 4.21481466293335, "learning_rate": 2.9822516580182944e-05, "loss": 1.9624, "step": 25170 }, { "epoch": 0.04904404519413146, "grad_norm": 4.617081642150879, "learning_rate": 2.98223053943099e-05, "loss": 1.7276, "step": 25185 }, { "epoch": 0.04907325546524172, "grad_norm": 3.5099408626556396, "learning_rate": 2.9822094083616145e-05, "loss": 1.8783, "step": 25200 }, { "epoch": 0.049102465736351984, "grad_norm": 4.118253231048584, "learning_rate": 2.9821882648103445e-05, "loss": 1.7848, "step": 25215 }, { "epoch": 0.04913167600746225, "grad_norm": 3.617659091949463, "learning_rate": 2.982167108777359e-05, "loss": 1.8307, "step": 25230 }, { "epoch": 0.04916088627857251, "grad_norm": 3.3717000484466553, "learning_rate": 2.9821459402628357e-05, "loss": 1.6414, "step": 25245 }, { "epoch": 0.049190096549682774, "grad_norm": 3.3341469764709473, "learning_rate": 2.9821247592669526e-05, "loss": 1.9641, "step": 25260 }, { "epoch": 0.04921930682079304, "grad_norm": 3.8818039894104004, "learning_rate": 2.9821035657898886e-05, "loss": 1.8227, "step": 25275 }, { "epoch": 0.0492485170919033, "grad_norm": 2.4647955894470215, "learning_rate": 2.9820823598318226e-05, "loss": 1.9441, "step": 25290 }, { "epoch": 0.049277727363013564, "grad_norm": 5.145657062530518, "learning_rate": 2.9820611413929318e-05, "loss": 1.7515, "step": 25305 }, { "epoch": 0.049306937634123825, "grad_norm": 2.595554828643799, "learning_rate": 2.9820399104733964e-05, "loss": 1.8185, "step": 25320 }, { "epoch": 0.04933614790523409, "grad_norm": 3.0854387283325195, "learning_rate": 2.9820186670733944e-05, "loss": 2.0462, "step": 25335 }, { "epoch": 0.049365358176344354, "grad_norm": 4.086148262023926, "learning_rate": 2.9819974111931045e-05, "loss": 1.923, "step": 25350 }, { "epoch": 0.049394568447454615, "grad_norm": 2.3999664783477783, "learning_rate": 2.9819761428327057e-05, "loss": 1.9243, "step": 25365 }, { "epoch": 0.04942377871856488, "grad_norm": 1.8367825746536255, "learning_rate": 2.981954861992378e-05, "loss": 1.9104, "step": 25380 }, { "epoch": 0.049452988989675144, "grad_norm": 2.3233165740966797, "learning_rate": 2.9819335686722997e-05, "loss": 1.7446, "step": 25395 }, { "epoch": 0.049482199260785405, "grad_norm": 3.7498881816864014, "learning_rate": 2.98191226287265e-05, "loss": 2.0172, "step": 25410 }, { "epoch": 0.049511409531895666, "grad_norm": 2.636087656021118, "learning_rate": 2.9818909445936092e-05, "loss": 1.9958, "step": 25425 }, { "epoch": 0.049540619803005934, "grad_norm": 2.6167049407958984, "learning_rate": 2.9818696138353564e-05, "loss": 1.9746, "step": 25440 }, { "epoch": 0.049569830074116195, "grad_norm": 2.699380874633789, "learning_rate": 2.9818482705980708e-05, "loss": 2.0158, "step": 25455 }, { "epoch": 0.049599040345226456, "grad_norm": 2.8639230728149414, "learning_rate": 2.9818269148819326e-05, "loss": 1.8795, "step": 25470 }, { "epoch": 0.04962825061633672, "grad_norm": 3.6716597080230713, "learning_rate": 2.9818055466871217e-05, "loss": 2.0066, "step": 25485 }, { "epoch": 0.049657460887446984, "grad_norm": 3.0852763652801514, "learning_rate": 2.981784166013818e-05, "loss": 1.8392, "step": 25500 }, { "epoch": 0.049686671158557245, "grad_norm": 2.0497000217437744, "learning_rate": 2.981762772862201e-05, "loss": 2.0587, "step": 25515 }, { "epoch": 0.049715881429667506, "grad_norm": 4.302377700805664, "learning_rate": 2.9817413672324517e-05, "loss": 1.9415, "step": 25530 }, { "epoch": 0.049745091700777774, "grad_norm": 2.478428840637207, "learning_rate": 2.9817199491247495e-05, "loss": 2.0082, "step": 25545 }, { "epoch": 0.049774301971888035, "grad_norm": 3.375516891479492, "learning_rate": 2.9816985185392752e-05, "loss": 2.061, "step": 25560 }, { "epoch": 0.049803512242998296, "grad_norm": 2.3733456134796143, "learning_rate": 2.9816770754762094e-05, "loss": 1.8752, "step": 25575 }, { "epoch": 0.049832722514108564, "grad_norm": 2.8646862506866455, "learning_rate": 2.9816556199357334e-05, "loss": 1.9048, "step": 25590 }, { "epoch": 0.049861932785218825, "grad_norm": 3.711494207382202, "learning_rate": 2.981634151918026e-05, "loss": 1.8555, "step": 25605 }, { "epoch": 0.049891143056329086, "grad_norm": 4.652657985687256, "learning_rate": 2.9816126714232694e-05, "loss": 1.9591, "step": 25620 }, { "epoch": 0.04992035332743935, "grad_norm": 3.5854547023773193, "learning_rate": 2.981591178451644e-05, "loss": 1.8043, "step": 25635 }, { "epoch": 0.049949563598549615, "grad_norm": 2.532128095626831, "learning_rate": 2.981569673003331e-05, "loss": 1.7456, "step": 25650 }, { "epoch": 0.049978773869659876, "grad_norm": 1.8968348503112793, "learning_rate": 2.9815481550785116e-05, "loss": 1.8971, "step": 25665 }, { "epoch": 0.05000798414077014, "grad_norm": 4.662414073944092, "learning_rate": 2.9815266246773663e-05, "loss": 2.1576, "step": 25680 }, { "epoch": 0.050037194411880405, "grad_norm": 2.1178083419799805, "learning_rate": 2.9815050818000773e-05, "loss": 1.9055, "step": 25695 }, { "epoch": 0.050066404682990666, "grad_norm": 3.8748810291290283, "learning_rate": 2.9814835264468254e-05, "loss": 1.8177, "step": 25710 }, { "epoch": 0.05009561495410093, "grad_norm": 3.424405097961426, "learning_rate": 2.9814619586177926e-05, "loss": 1.8804, "step": 25725 }, { "epoch": 0.05012482522521119, "grad_norm": 4.104612827301025, "learning_rate": 2.98144037831316e-05, "loss": 1.973, "step": 25740 }, { "epoch": 0.050154035496321456, "grad_norm": 2.1848320960998535, "learning_rate": 2.98141878553311e-05, "loss": 1.8529, "step": 25755 }, { "epoch": 0.05018324576743172, "grad_norm": 4.191700458526611, "learning_rate": 2.981397180277824e-05, "loss": 1.8094, "step": 25770 }, { "epoch": 0.05021245603854198, "grad_norm": 3.3454208374023438, "learning_rate": 2.981375562547484e-05, "loss": 1.9105, "step": 25785 }, { "epoch": 0.05024166630965224, "grad_norm": 2.848618507385254, "learning_rate": 2.9813539323422717e-05, "loss": 1.9054, "step": 25800 }, { "epoch": 0.05027087658076251, "grad_norm": 1.7352811098098755, "learning_rate": 2.98133228966237e-05, "loss": 1.91, "step": 25815 }, { "epoch": 0.05030008685187277, "grad_norm": 2.177297592163086, "learning_rate": 2.9813106345079604e-05, "loss": 1.8276, "step": 25830 }, { "epoch": 0.05032929712298303, "grad_norm": 2.1822316646575928, "learning_rate": 2.981288966879226e-05, "loss": 1.7961, "step": 25845 }, { "epoch": 0.050358507394093296, "grad_norm": 2.70729398727417, "learning_rate": 2.9812672867763482e-05, "loss": 1.7497, "step": 25860 }, { "epoch": 0.05038771766520356, "grad_norm": 2.869450807571411, "learning_rate": 2.981245594199511e-05, "loss": 1.8309, "step": 25875 }, { "epoch": 0.05041692793631382, "grad_norm": 2.233219623565674, "learning_rate": 2.981223889148896e-05, "loss": 1.8577, "step": 25890 }, { "epoch": 0.05044613820742408, "grad_norm": 5.402493000030518, "learning_rate": 2.981202171624686e-05, "loss": 1.9151, "step": 25905 }, { "epoch": 0.05047534847853435, "grad_norm": 4.127412796020508, "learning_rate": 2.9811804416270648e-05, "loss": 1.8644, "step": 25920 }, { "epoch": 0.05050455874964461, "grad_norm": 2.9654555320739746, "learning_rate": 2.9811586991562145e-05, "loss": 1.99, "step": 25935 }, { "epoch": 0.05053376902075487, "grad_norm": 2.6316134929656982, "learning_rate": 2.981136944212318e-05, "loss": 1.7434, "step": 25950 }, { "epoch": 0.05056297929186514, "grad_norm": 2.6688010692596436, "learning_rate": 2.9811151767955597e-05, "loss": 1.8075, "step": 25965 }, { "epoch": 0.0505921895629754, "grad_norm": 1.7729136943817139, "learning_rate": 2.981093396906122e-05, "loss": 1.7828, "step": 25980 }, { "epoch": 0.05062139983408566, "grad_norm": 3.017512321472168, "learning_rate": 2.9810716045441884e-05, "loss": 1.9602, "step": 25995 }, { "epoch": 0.05065061010519592, "grad_norm": 3.954099416732788, "learning_rate": 2.9810497997099427e-05, "loss": 1.9158, "step": 26010 }, { "epoch": 0.05067982037630619, "grad_norm": 4.384187698364258, "learning_rate": 2.981027982403568e-05, "loss": 2.0172, "step": 26025 }, { "epoch": 0.05070903064741645, "grad_norm": 2.7591800689697266, "learning_rate": 2.9810061526252488e-05, "loss": 1.8372, "step": 26040 }, { "epoch": 0.05073824091852671, "grad_norm": 2.6615748405456543, "learning_rate": 2.980984310375168e-05, "loss": 1.8127, "step": 26055 }, { "epoch": 0.05076745118963698, "grad_norm": 3.1193888187408447, "learning_rate": 2.9809624556535106e-05, "loss": 1.8826, "step": 26070 }, { "epoch": 0.05079666146074724, "grad_norm": 2.338146448135376, "learning_rate": 2.9809405884604594e-05, "loss": 1.9003, "step": 26085 }, { "epoch": 0.0508258717318575, "grad_norm": 2.339162588119507, "learning_rate": 2.9809187087961993e-05, "loss": 1.8444, "step": 26100 }, { "epoch": 0.05085508200296776, "grad_norm": 6.189070224761963, "learning_rate": 2.980896816660915e-05, "loss": 1.8682, "step": 26115 }, { "epoch": 0.05088429227407803, "grad_norm": 3.7556257247924805, "learning_rate": 2.9808749120547898e-05, "loss": 1.8692, "step": 26130 }, { "epoch": 0.05091350254518829, "grad_norm": 2.4181950092315674, "learning_rate": 2.980852994978009e-05, "loss": 1.8497, "step": 26145 }, { "epoch": 0.05094271281629855, "grad_norm": 2.1751251220703125, "learning_rate": 2.9808310654307566e-05, "loss": 1.7236, "step": 26160 }, { "epoch": 0.05097192308740882, "grad_norm": 2.304203748703003, "learning_rate": 2.9808091234132177e-05, "loss": 1.8394, "step": 26175 }, { "epoch": 0.05100113335851908, "grad_norm": 2.03320050239563, "learning_rate": 2.980787168925577e-05, "loss": 2.0082, "step": 26190 }, { "epoch": 0.05103034362962934, "grad_norm": 3.1527299880981445, "learning_rate": 2.9807652019680195e-05, "loss": 1.8783, "step": 26205 }, { "epoch": 0.0510595539007396, "grad_norm": 3.638120412826538, "learning_rate": 2.9807432225407295e-05, "loss": 1.8912, "step": 26220 }, { "epoch": 0.05108876417184987, "grad_norm": 1.9177019596099854, "learning_rate": 2.9807212306438927e-05, "loss": 1.7949, "step": 26235 }, { "epoch": 0.05111797444296013, "grad_norm": 4.763120174407959, "learning_rate": 2.9806992262776945e-05, "loss": 1.9464, "step": 26250 }, { "epoch": 0.05114718471407039, "grad_norm": 4.4592132568359375, "learning_rate": 2.980677209442319e-05, "loss": 1.834, "step": 26265 }, { "epoch": 0.05117639498518065, "grad_norm": 2.6289279460906982, "learning_rate": 2.980655180137953e-05, "loss": 1.7644, "step": 26280 }, { "epoch": 0.05120560525629092, "grad_norm": 2.2794735431671143, "learning_rate": 2.9806331383647816e-05, "loss": 1.9125, "step": 26295 }, { "epoch": 0.05123481552740118, "grad_norm": 3.5884125232696533, "learning_rate": 2.9806110841229904e-05, "loss": 1.925, "step": 26310 }, { "epoch": 0.05126402579851144, "grad_norm": 2.8750176429748535, "learning_rate": 2.9805890174127648e-05, "loss": 1.6463, "step": 26325 }, { "epoch": 0.05129323606962171, "grad_norm": 3.1167027950286865, "learning_rate": 2.980566938234291e-05, "loss": 1.8897, "step": 26340 }, { "epoch": 0.05132244634073197, "grad_norm": 2.8106181621551514, "learning_rate": 2.9805448465877546e-05, "loss": 1.855, "step": 26355 }, { "epoch": 0.05135165661184223, "grad_norm": 3.8905303478240967, "learning_rate": 2.980522742473342e-05, "loss": 1.9127, "step": 26370 }, { "epoch": 0.05138086688295249, "grad_norm": 3.031163215637207, "learning_rate": 2.980500625891239e-05, "loss": 1.9577, "step": 26385 }, { "epoch": 0.05141007715406276, "grad_norm": 1.991543173789978, "learning_rate": 2.980478496841632e-05, "loss": 1.6416, "step": 26400 }, { "epoch": 0.05143928742517302, "grad_norm": 3.90432071685791, "learning_rate": 2.9804563553247076e-05, "loss": 1.7318, "step": 26415 }, { "epoch": 0.05146849769628328, "grad_norm": 3.582280158996582, "learning_rate": 2.980434201340652e-05, "loss": 1.7349, "step": 26430 }, { "epoch": 0.05149770796739355, "grad_norm": 3.1633496284484863, "learning_rate": 2.980412034889651e-05, "loss": 1.8023, "step": 26445 }, { "epoch": 0.05152691823850381, "grad_norm": 3.7939155101776123, "learning_rate": 2.9803898559718927e-05, "loss": 1.6626, "step": 26460 }, { "epoch": 0.05155612850961407, "grad_norm": 3.093492031097412, "learning_rate": 2.9803676645875634e-05, "loss": 1.7914, "step": 26475 }, { "epoch": 0.051585338780724334, "grad_norm": 2.4469592571258545, "learning_rate": 2.9803454607368493e-05, "loss": 1.7484, "step": 26490 }, { "epoch": 0.0516145490518346, "grad_norm": 2.613276243209839, "learning_rate": 2.9803232444199382e-05, "loss": 1.7984, "step": 26505 }, { "epoch": 0.05164375932294486, "grad_norm": 3.1793646812438965, "learning_rate": 2.9803010156370166e-05, "loss": 1.7114, "step": 26520 }, { "epoch": 0.051672969594055124, "grad_norm": 1.9021662473678589, "learning_rate": 2.980278774388272e-05, "loss": 1.6837, "step": 26535 }, { "epoch": 0.05170217986516539, "grad_norm": 2.415710687637329, "learning_rate": 2.9802565206738922e-05, "loss": 1.9818, "step": 26550 }, { "epoch": 0.05173139013627565, "grad_norm": 3.8227622509002686, "learning_rate": 2.9802342544940635e-05, "loss": 1.9691, "step": 26565 }, { "epoch": 0.051760600407385914, "grad_norm": 1.6945210695266724, "learning_rate": 2.980211975848974e-05, "loss": 1.9788, "step": 26580 }, { "epoch": 0.051789810678496175, "grad_norm": 2.1171348094940186, "learning_rate": 2.980189684738811e-05, "loss": 1.8498, "step": 26595 }, { "epoch": 0.05181902094960644, "grad_norm": 3.8129332065582275, "learning_rate": 2.9801673811637628e-05, "loss": 1.8109, "step": 26610 }, { "epoch": 0.0518482312207167, "grad_norm": 3.1358275413513184, "learning_rate": 2.9801450651240173e-05, "loss": 1.7809, "step": 26625 }, { "epoch": 0.051877441491826964, "grad_norm": 3.9287755489349365, "learning_rate": 2.9801227366197614e-05, "loss": 1.7791, "step": 26640 }, { "epoch": 0.05190665176293723, "grad_norm": 3.269742488861084, "learning_rate": 2.980100395651184e-05, "loss": 2.0053, "step": 26655 }, { "epoch": 0.05193586203404749, "grad_norm": 3.23358154296875, "learning_rate": 2.980078042218473e-05, "loss": 1.7863, "step": 26670 }, { "epoch": 0.051965072305157754, "grad_norm": 3.6313607692718506, "learning_rate": 2.980055676321817e-05, "loss": 1.8125, "step": 26685 }, { "epoch": 0.051994282576268015, "grad_norm": 2.303229331970215, "learning_rate": 2.9800332979614035e-05, "loss": 1.8336, "step": 26700 }, { "epoch": 0.05202349284737828, "grad_norm": 4.923130035400391, "learning_rate": 2.9800109071374216e-05, "loss": 1.9392, "step": 26715 }, { "epoch": 0.052052703118488544, "grad_norm": 6.160820484161377, "learning_rate": 2.9799885038500597e-05, "loss": 1.8991, "step": 26730 }, { "epoch": 0.052081913389598805, "grad_norm": 1.9204509258270264, "learning_rate": 2.9799660880995065e-05, "loss": 1.8511, "step": 26745 }, { "epoch": 0.05211112366070907, "grad_norm": 4.1540656089782715, "learning_rate": 2.9799436598859507e-05, "loss": 1.7353, "step": 26760 }, { "epoch": 0.052140333931819334, "grad_norm": 2.5721335411071777, "learning_rate": 2.979921219209581e-05, "loss": 1.6949, "step": 26775 }, { "epoch": 0.052169544202929595, "grad_norm": 2.5524466037750244, "learning_rate": 2.9798987660705867e-05, "loss": 1.9033, "step": 26790 }, { "epoch": 0.052198754474039856, "grad_norm": 2.8870813846588135, "learning_rate": 2.979876300469157e-05, "loss": 1.6899, "step": 26805 }, { "epoch": 0.052227964745150124, "grad_norm": 2.5837631225585938, "learning_rate": 2.9798538224054804e-05, "loss": 1.7715, "step": 26820 }, { "epoch": 0.052257175016260385, "grad_norm": 2.5015158653259277, "learning_rate": 2.979831331879747e-05, "loss": 1.9121, "step": 26835 }, { "epoch": 0.052286385287370646, "grad_norm": 3.3133704662323, "learning_rate": 2.9798088288921457e-05, "loss": 1.87, "step": 26850 }, { "epoch": 0.05231559555848091, "grad_norm": 2.159090042114258, "learning_rate": 2.979786313442866e-05, "loss": 1.9665, "step": 26865 }, { "epoch": 0.052344805829591175, "grad_norm": 3.6745707988739014, "learning_rate": 2.9797637855320977e-05, "loss": 1.8956, "step": 26880 }, { "epoch": 0.052374016100701436, "grad_norm": 2.593538761138916, "learning_rate": 2.9797412451600305e-05, "loss": 1.9206, "step": 26895 }, { "epoch": 0.0524032263718117, "grad_norm": 2.1524672508239746, "learning_rate": 2.979718692326854e-05, "loss": 2.0598, "step": 26910 }, { "epoch": 0.052432436642921965, "grad_norm": 5.403210639953613, "learning_rate": 2.9796961270327583e-05, "loss": 1.9446, "step": 26925 }, { "epoch": 0.052461646914032226, "grad_norm": 2.859320878982544, "learning_rate": 2.9796735492779338e-05, "loss": 1.6969, "step": 26940 }, { "epoch": 0.05249085718514249, "grad_norm": 2.0825371742248535, "learning_rate": 2.9796509590625696e-05, "loss": 1.8951, "step": 26955 }, { "epoch": 0.05252006745625275, "grad_norm": 2.3604981899261475, "learning_rate": 2.979628356386857e-05, "loss": 1.8861, "step": 26970 }, { "epoch": 0.052549277727363015, "grad_norm": 3.510629415512085, "learning_rate": 2.9796057412509856e-05, "loss": 1.9885, "step": 26985 }, { "epoch": 0.052578487998473276, "grad_norm": 1.72383713722229, "learning_rate": 2.9795831136551467e-05, "loss": 1.832, "step": 27000 }, { "epoch": 0.05260769826958354, "grad_norm": 2.767523765563965, "learning_rate": 2.9795604735995297e-05, "loss": 1.8956, "step": 27015 }, { "epoch": 0.052636908540693805, "grad_norm": 2.731154203414917, "learning_rate": 2.979537821084326e-05, "loss": 1.7157, "step": 27030 }, { "epoch": 0.052666118811804066, "grad_norm": 3.17053484916687, "learning_rate": 2.9795151561097265e-05, "loss": 1.9691, "step": 27045 }, { "epoch": 0.05269532908291433, "grad_norm": 3.9647130966186523, "learning_rate": 2.979492478675922e-05, "loss": 1.9736, "step": 27060 }, { "epoch": 0.05272453935402459, "grad_norm": 2.4074623584747314, "learning_rate": 2.9794697887831027e-05, "loss": 1.8325, "step": 27075 }, { "epoch": 0.052753749625134856, "grad_norm": 4.785901069641113, "learning_rate": 2.9794470864314603e-05, "loss": 1.9717, "step": 27090 }, { "epoch": 0.05278295989624512, "grad_norm": 5.4298577308654785, "learning_rate": 2.979424371621186e-05, "loss": 1.8316, "step": 27105 }, { "epoch": 0.05281217016735538, "grad_norm": 2.509413003921509, "learning_rate": 2.9794016443524713e-05, "loss": 1.8792, "step": 27120 }, { "epoch": 0.052841380438465646, "grad_norm": 1.903182029724121, "learning_rate": 2.979378904625507e-05, "loss": 1.8049, "step": 27135 }, { "epoch": 0.05287059070957591, "grad_norm": 3.3434927463531494, "learning_rate": 2.9793561524404846e-05, "loss": 1.7794, "step": 27150 }, { "epoch": 0.05289980098068617, "grad_norm": 5.064967632293701, "learning_rate": 2.9793333877975964e-05, "loss": 1.8726, "step": 27165 }, { "epoch": 0.05292901125179643, "grad_norm": 1.8450191020965576, "learning_rate": 2.9793106106970335e-05, "loss": 1.8586, "step": 27180 }, { "epoch": 0.0529582215229067, "grad_norm": 2.540570020675659, "learning_rate": 2.979287821138988e-05, "loss": 1.8988, "step": 27195 }, { "epoch": 0.05298743179401696, "grad_norm": 2.0893425941467285, "learning_rate": 2.9792650191236516e-05, "loss": 1.7794, "step": 27210 }, { "epoch": 0.05301664206512722, "grad_norm": 2.7562851905822754, "learning_rate": 2.979242204651216e-05, "loss": 1.7025, "step": 27225 }, { "epoch": 0.05304585233623749, "grad_norm": 4.634995937347412, "learning_rate": 2.9792193777218743e-05, "loss": 1.8236, "step": 27240 }, { "epoch": 0.05307506260734775, "grad_norm": 3.1855075359344482, "learning_rate": 2.9791965383358184e-05, "loss": 1.8439, "step": 27255 }, { "epoch": 0.05310427287845801, "grad_norm": 3.3286306858062744, "learning_rate": 2.9791736864932403e-05, "loss": 1.9314, "step": 27270 }, { "epoch": 0.05313348314956827, "grad_norm": 3.1028332710266113, "learning_rate": 2.979150822194332e-05, "loss": 2.0257, "step": 27285 }, { "epoch": 0.05316269342067854, "grad_norm": 4.1933393478393555, "learning_rate": 2.979127945439287e-05, "loss": 1.9908, "step": 27300 }, { "epoch": 0.0531919036917888, "grad_norm": 4.206679344177246, "learning_rate": 2.9791050562282974e-05, "loss": 1.8144, "step": 27315 }, { "epoch": 0.05322111396289906, "grad_norm": 2.231621265411377, "learning_rate": 2.9790821545615562e-05, "loss": 1.824, "step": 27330 }, { "epoch": 0.05325032423400932, "grad_norm": 3.625483512878418, "learning_rate": 2.9790592404392557e-05, "loss": 2.0087, "step": 27345 }, { "epoch": 0.05327953450511959, "grad_norm": 4.063029766082764, "learning_rate": 2.9790363138615902e-05, "loss": 1.8927, "step": 27360 }, { "epoch": 0.05330874477622985, "grad_norm": 4.166107654571533, "learning_rate": 2.979013374828751e-05, "loss": 1.9004, "step": 27375 }, { "epoch": 0.05333795504734011, "grad_norm": 2.737416982650757, "learning_rate": 2.9789904233409326e-05, "loss": 1.9678, "step": 27390 }, { "epoch": 0.05336716531845038, "grad_norm": 2.131272315979004, "learning_rate": 2.9789674593983277e-05, "loss": 1.895, "step": 27405 }, { "epoch": 0.05339637558956064, "grad_norm": 2.983872890472412, "learning_rate": 2.9789444830011302e-05, "loss": 1.7807, "step": 27420 }, { "epoch": 0.0534255858606709, "grad_norm": 5.785390377044678, "learning_rate": 2.978921494149533e-05, "loss": 1.9559, "step": 27435 }, { "epoch": 0.05345479613178116, "grad_norm": 2.7100813388824463, "learning_rate": 2.9788984928437298e-05, "loss": 1.9147, "step": 27450 }, { "epoch": 0.05348400640289143, "grad_norm": 2.0759334564208984, "learning_rate": 2.978875479083914e-05, "loss": 1.7345, "step": 27465 }, { "epoch": 0.05351321667400169, "grad_norm": 4.645893573760986, "learning_rate": 2.9788524528702804e-05, "loss": 1.7246, "step": 27480 }, { "epoch": 0.05354242694511195, "grad_norm": 3.1474320888519287, "learning_rate": 2.9788294142030225e-05, "loss": 1.8679, "step": 27495 }, { "epoch": 0.05357163721622222, "grad_norm": 2.8445558547973633, "learning_rate": 2.9788063630823335e-05, "loss": 1.748, "step": 27510 }, { "epoch": 0.05360084748733248, "grad_norm": 3.176330089569092, "learning_rate": 2.978783299508408e-05, "loss": 1.9698, "step": 27525 }, { "epoch": 0.05363005775844274, "grad_norm": 2.875300884246826, "learning_rate": 2.9787602234814407e-05, "loss": 1.7617, "step": 27540 }, { "epoch": 0.053659268029553, "grad_norm": 4.004800796508789, "learning_rate": 2.978737135001626e-05, "loss": 1.8204, "step": 27555 }, { "epoch": 0.05368847830066327, "grad_norm": 2.3588531017303467, "learning_rate": 2.9787140340691574e-05, "loss": 1.8028, "step": 27570 }, { "epoch": 0.05371768857177353, "grad_norm": 2.293210744857788, "learning_rate": 2.9786909206842297e-05, "loss": 1.9525, "step": 27585 }, { "epoch": 0.05374689884288379, "grad_norm": 3.4703333377838135, "learning_rate": 2.9786677948470382e-05, "loss": 1.8144, "step": 27600 }, { "epoch": 0.05377610911399406, "grad_norm": 2.1387853622436523, "learning_rate": 2.9786446565577772e-05, "loss": 1.7853, "step": 27615 }, { "epoch": 0.05380531938510432, "grad_norm": 1.9415435791015625, "learning_rate": 2.9786215058166417e-05, "loss": 1.8912, "step": 27630 }, { "epoch": 0.05383452965621458, "grad_norm": 3.315534830093384, "learning_rate": 2.978598342623826e-05, "loss": 1.7991, "step": 27645 }, { "epoch": 0.05386373992732484, "grad_norm": 3.2139084339141846, "learning_rate": 2.9785751669795265e-05, "loss": 1.8284, "step": 27660 }, { "epoch": 0.05389295019843511, "grad_norm": 2.0407660007476807, "learning_rate": 2.9785519788839368e-05, "loss": 1.8263, "step": 27675 }, { "epoch": 0.05392216046954537, "grad_norm": 3.6719486713409424, "learning_rate": 2.9785287783372538e-05, "loss": 1.7552, "step": 27690 }, { "epoch": 0.05395137074065563, "grad_norm": 2.3214800357818604, "learning_rate": 2.978505565339671e-05, "loss": 1.7752, "step": 27705 }, { "epoch": 0.0539805810117659, "grad_norm": 3.5290887355804443, "learning_rate": 2.9784823398913856e-05, "loss": 1.6684, "step": 27720 }, { "epoch": 0.05400979128287616, "grad_norm": 2.344564437866211, "learning_rate": 2.978459101992592e-05, "loss": 1.7649, "step": 27735 }, { "epoch": 0.05403900155398642, "grad_norm": 3.5880258083343506, "learning_rate": 2.9784358516434867e-05, "loss": 1.8194, "step": 27750 }, { "epoch": 0.05406821182509668, "grad_norm": 4.46945858001709, "learning_rate": 2.978412588844265e-05, "loss": 1.9552, "step": 27765 }, { "epoch": 0.05409742209620695, "grad_norm": 2.9063570499420166, "learning_rate": 2.978389313595123e-05, "loss": 1.9078, "step": 27780 }, { "epoch": 0.05412663236731721, "grad_norm": 4.740022659301758, "learning_rate": 2.9783660258962568e-05, "loss": 1.8773, "step": 27795 }, { "epoch": 0.05415584263842747, "grad_norm": 5.9590654373168945, "learning_rate": 2.9783427257478623e-05, "loss": 2.0891, "step": 27810 }, { "epoch": 0.05418505290953774, "grad_norm": 3.4905142784118652, "learning_rate": 2.978319413150136e-05, "loss": 1.912, "step": 27825 }, { "epoch": 0.054214263180648, "grad_norm": 4.312283039093018, "learning_rate": 2.978296088103273e-05, "loss": 1.7969, "step": 27840 }, { "epoch": 0.05424347345175826, "grad_norm": 3.419679880142212, "learning_rate": 2.978272750607472e-05, "loss": 2.0149, "step": 27855 }, { "epoch": 0.054272683722868524, "grad_norm": 1.4746593236923218, "learning_rate": 2.9782494006629275e-05, "loss": 1.7991, "step": 27870 }, { "epoch": 0.05430189399397879, "grad_norm": 2.6810593605041504, "learning_rate": 2.9782260382698374e-05, "loss": 1.8557, "step": 27885 }, { "epoch": 0.05433110426508905, "grad_norm": 4.144235134124756, "learning_rate": 2.9782026634283975e-05, "loss": 2.0213, "step": 27900 }, { "epoch": 0.054360314536199314, "grad_norm": 3.5831942558288574, "learning_rate": 2.9781792761388055e-05, "loss": 1.7606, "step": 27915 }, { "epoch": 0.054389524807309575, "grad_norm": 3.201408863067627, "learning_rate": 2.9781558764012573e-05, "loss": 1.7619, "step": 27930 }, { "epoch": 0.05441873507841984, "grad_norm": 2.8114073276519775, "learning_rate": 2.978132464215951e-05, "loss": 1.9014, "step": 27945 }, { "epoch": 0.054447945349530104, "grad_norm": 2.400465726852417, "learning_rate": 2.9781090395830834e-05, "loss": 1.9041, "step": 27960 }, { "epoch": 0.054477155620640365, "grad_norm": 2.1496269702911377, "learning_rate": 2.9780856025028513e-05, "loss": 1.7252, "step": 27975 }, { "epoch": 0.05450636589175063, "grad_norm": 3.477867603302002, "learning_rate": 2.978062152975453e-05, "loss": 1.7466, "step": 27990 }, { "epoch": 0.054535576162860894, "grad_norm": 2.1709723472595215, "learning_rate": 2.978038691001085e-05, "loss": 1.6808, "step": 28005 }, { "epoch": 0.054564786433971155, "grad_norm": 2.13655686378479, "learning_rate": 2.978015216579945e-05, "loss": 1.8089, "step": 28020 }, { "epoch": 0.054593996705081416, "grad_norm": 3.563598394393921, "learning_rate": 2.9779917297122318e-05, "loss": 1.8368, "step": 28035 }, { "epoch": 0.054623206976191684, "grad_norm": 3.1836514472961426, "learning_rate": 2.977968230398142e-05, "loss": 1.8042, "step": 28050 }, { "epoch": 0.054652417247301945, "grad_norm": 2.4931259155273438, "learning_rate": 2.9779447186378738e-05, "loss": 1.8061, "step": 28065 }, { "epoch": 0.054681627518412206, "grad_norm": 3.2399775981903076, "learning_rate": 2.977921194431625e-05, "loss": 1.8723, "step": 28080 }, { "epoch": 0.05471083778952247, "grad_norm": 3.281590461730957, "learning_rate": 2.977897657779594e-05, "loss": 1.8882, "step": 28095 }, { "epoch": 0.054740048060632734, "grad_norm": 2.607039451599121, "learning_rate": 2.9778741086819795e-05, "loss": 1.8796, "step": 28110 }, { "epoch": 0.054769258331742995, "grad_norm": 3.4543254375457764, "learning_rate": 2.977850547138979e-05, "loss": 1.8826, "step": 28125 }, { "epoch": 0.054798468602853256, "grad_norm": 3.8168885707855225, "learning_rate": 2.9778269731507914e-05, "loss": 1.9827, "step": 28140 }, { "epoch": 0.054827678873963524, "grad_norm": 2.7308406829833984, "learning_rate": 2.977803386717615e-05, "loss": 1.768, "step": 28155 }, { "epoch": 0.054856889145073785, "grad_norm": 4.276648998260498, "learning_rate": 2.9777797878396477e-05, "loss": 1.8696, "step": 28170 }, { "epoch": 0.054886099416184046, "grad_norm": 3.1874961853027344, "learning_rate": 2.97775617651709e-05, "loss": 1.881, "step": 28185 }, { "epoch": 0.054915309687294314, "grad_norm": 2.4808239936828613, "learning_rate": 2.977732552750139e-05, "loss": 1.7825, "step": 28200 }, { "epoch": 0.054944519958404575, "grad_norm": 3.0970373153686523, "learning_rate": 2.9777089165389942e-05, "loss": 1.7825, "step": 28215 }, { "epoch": 0.054973730229514836, "grad_norm": 1.757534384727478, "learning_rate": 2.9776852678838555e-05, "loss": 2.0145, "step": 28230 }, { "epoch": 0.0550029405006251, "grad_norm": 2.351555347442627, "learning_rate": 2.977661606784921e-05, "loss": 2.0268, "step": 28245 }, { "epoch": 0.055032150771735365, "grad_norm": 2.4561007022857666, "learning_rate": 2.9776379332423902e-05, "loss": 1.7745, "step": 28260 }, { "epoch": 0.055061361042845626, "grad_norm": 3.8144009113311768, "learning_rate": 2.9776142472564624e-05, "loss": 1.8408, "step": 28275 }, { "epoch": 0.05509057131395589, "grad_norm": 4.969006061553955, "learning_rate": 2.9775905488273373e-05, "loss": 1.9223, "step": 28290 }, { "epoch": 0.055119781585066155, "grad_norm": 5.71866512298584, "learning_rate": 2.9775668379552146e-05, "loss": 1.842, "step": 28305 }, { "epoch": 0.055148991856176416, "grad_norm": 1.9934380054473877, "learning_rate": 2.9775431146402937e-05, "loss": 1.976, "step": 28320 }, { "epoch": 0.05517820212728668, "grad_norm": 1.7895939350128174, "learning_rate": 2.9775193788827743e-05, "loss": 2.0921, "step": 28335 }, { "epoch": 0.05520741239839694, "grad_norm": 4.20900821685791, "learning_rate": 2.9774956306828566e-05, "loss": 1.9333, "step": 28350 }, { "epoch": 0.055236622669507206, "grad_norm": 1.7822163105010986, "learning_rate": 2.97747187004074e-05, "loss": 2.0872, "step": 28365 }, { "epoch": 0.05526583294061747, "grad_norm": 1.7469080686569214, "learning_rate": 2.9774480969566254e-05, "loss": 1.8781, "step": 28380 }, { "epoch": 0.05529504321172773, "grad_norm": 3.0533454418182373, "learning_rate": 2.977424311430712e-05, "loss": 1.7184, "step": 28395 }, { "epoch": 0.05532425348283799, "grad_norm": 2.729780912399292, "learning_rate": 2.977400513463201e-05, "loss": 1.7794, "step": 28410 }, { "epoch": 0.05535346375394826, "grad_norm": 3.2360620498657227, "learning_rate": 2.9773767030542926e-05, "loss": 1.7711, "step": 28425 }, { "epoch": 0.05538267402505852, "grad_norm": 2.3599355220794678, "learning_rate": 2.9773528802041873e-05, "loss": 1.8758, "step": 28440 }, { "epoch": 0.05541188429616878, "grad_norm": 3.3827106952667236, "learning_rate": 2.9773290449130856e-05, "loss": 1.901, "step": 28455 }, { "epoch": 0.055441094567279046, "grad_norm": 2.9506921768188477, "learning_rate": 2.977305197181188e-05, "loss": 1.9286, "step": 28470 }, { "epoch": 0.05547030483838931, "grad_norm": 3.381622791290283, "learning_rate": 2.9772813370086956e-05, "loss": 2.0169, "step": 28485 }, { "epoch": 0.05549951510949957, "grad_norm": 3.6618142127990723, "learning_rate": 2.9772574643958095e-05, "loss": 1.8318, "step": 28500 }, { "epoch": 0.05552872538060983, "grad_norm": 2.906064033508301, "learning_rate": 2.9772335793427304e-05, "loss": 1.8778, "step": 28515 }, { "epoch": 0.0555579356517201, "grad_norm": 1.9280356168746948, "learning_rate": 2.9772096818496592e-05, "loss": 1.9095, "step": 28530 }, { "epoch": 0.05558714592283036, "grad_norm": 2.451441764831543, "learning_rate": 2.977185771916798e-05, "loss": 1.7975, "step": 28545 }, { "epoch": 0.05561635619394062, "grad_norm": 3.4363293647766113, "learning_rate": 2.9771618495443473e-05, "loss": 1.9995, "step": 28560 }, { "epoch": 0.05564556646505089, "grad_norm": 3.402430772781372, "learning_rate": 2.9771379147325095e-05, "loss": 1.879, "step": 28575 }, { "epoch": 0.05567477673616115, "grad_norm": 3.7891762256622314, "learning_rate": 2.977113967481485e-05, "loss": 1.8275, "step": 28590 }, { "epoch": 0.05570398700727141, "grad_norm": 2.45809268951416, "learning_rate": 2.977090007791476e-05, "loss": 1.8131, "step": 28605 }, { "epoch": 0.05573319727838167, "grad_norm": 3.5447278022766113, "learning_rate": 2.9770660356626848e-05, "loss": 1.8373, "step": 28620 }, { "epoch": 0.05576240754949194, "grad_norm": 3.397735595703125, "learning_rate": 2.9770420510953124e-05, "loss": 1.7907, "step": 28635 }, { "epoch": 0.0557916178206022, "grad_norm": 2.185011386871338, "learning_rate": 2.9770180540895613e-05, "loss": 1.8909, "step": 28650 }, { "epoch": 0.05582082809171246, "grad_norm": 3.66780161857605, "learning_rate": 2.9769940446456332e-05, "loss": 1.8898, "step": 28665 }, { "epoch": 0.05585003836282273, "grad_norm": 3.8541507720947266, "learning_rate": 2.9769700227637307e-05, "loss": 1.8156, "step": 28680 }, { "epoch": 0.05587924863393299, "grad_norm": 2.504997491836548, "learning_rate": 2.9769459884440563e-05, "loss": 1.8584, "step": 28695 }, { "epoch": 0.05590845890504325, "grad_norm": 2.811286687850952, "learning_rate": 2.9769219416868114e-05, "loss": 1.8762, "step": 28710 }, { "epoch": 0.05593766917615351, "grad_norm": 3.2774808406829834, "learning_rate": 2.976897882492199e-05, "loss": 1.8637, "step": 28725 }, { "epoch": 0.05596687944726378, "grad_norm": 3.6415061950683594, "learning_rate": 2.9768738108604222e-05, "loss": 2.033, "step": 28740 }, { "epoch": 0.05599608971837404, "grad_norm": 2.5112550258636475, "learning_rate": 2.9768497267916833e-05, "loss": 1.9643, "step": 28755 }, { "epoch": 0.0560252999894843, "grad_norm": 3.3538918495178223, "learning_rate": 2.9768256302861852e-05, "loss": 1.7964, "step": 28770 }, { "epoch": 0.05605451026059457, "grad_norm": 1.7511709928512573, "learning_rate": 2.9768015213441306e-05, "loss": 1.9891, "step": 28785 }, { "epoch": 0.05608372053170483, "grad_norm": 4.007176399230957, "learning_rate": 2.9767773999657225e-05, "loss": 2.0263, "step": 28800 }, { "epoch": 0.05611293080281509, "grad_norm": 2.968200206756592, "learning_rate": 2.9767532661511644e-05, "loss": 1.7171, "step": 28815 }, { "epoch": 0.05614214107392535, "grad_norm": 3.2194066047668457, "learning_rate": 2.9767291199006594e-05, "loss": 1.8825, "step": 28830 }, { "epoch": 0.05617135134503562, "grad_norm": 3.994147539138794, "learning_rate": 2.976704961214411e-05, "loss": 1.6859, "step": 28845 }, { "epoch": 0.05620056161614588, "grad_norm": 2.1359071731567383, "learning_rate": 2.976680790092622e-05, "loss": 1.7796, "step": 28860 }, { "epoch": 0.05622977188725614, "grad_norm": 3.074885606765747, "learning_rate": 2.976656606535497e-05, "loss": 1.9507, "step": 28875 }, { "epoch": 0.05625898215836641, "grad_norm": 3.184913396835327, "learning_rate": 2.9766324105432385e-05, "loss": 1.9536, "step": 28890 }, { "epoch": 0.05628819242947667, "grad_norm": 3.911243438720703, "learning_rate": 2.976608202116051e-05, "loss": 1.8609, "step": 28905 }, { "epoch": 0.05631740270058693, "grad_norm": 4.364305019378662, "learning_rate": 2.9765839812541378e-05, "loss": 1.8573, "step": 28920 }, { "epoch": 0.05634661297169719, "grad_norm": 3.0088484287261963, "learning_rate": 2.9765597479577034e-05, "loss": 2.0879, "step": 28935 }, { "epoch": 0.05637582324280746, "grad_norm": 1.9867300987243652, "learning_rate": 2.9765355022269518e-05, "loss": 1.7799, "step": 28950 }, { "epoch": 0.05640503351391772, "grad_norm": 2.1443519592285156, "learning_rate": 2.9765112440620874e-05, "loss": 1.7201, "step": 28965 }, { "epoch": 0.05643424378502798, "grad_norm": 2.226407051086426, "learning_rate": 2.9764869734633134e-05, "loss": 1.7974, "step": 28980 }, { "epoch": 0.05646345405613824, "grad_norm": 4.08579158782959, "learning_rate": 2.9764626904308354e-05, "loss": 1.8633, "step": 28995 }, { "epoch": 0.05649266432724851, "grad_norm": 4.314965724945068, "learning_rate": 2.9764383949648576e-05, "loss": 1.9242, "step": 29010 }, { "epoch": 0.05652187459835877, "grad_norm": 4.151242733001709, "learning_rate": 2.976414087065584e-05, "loss": 1.942, "step": 29025 }, { "epoch": 0.05655108486946903, "grad_norm": 1.9677321910858154, "learning_rate": 2.97638976673322e-05, "loss": 1.8371, "step": 29040 }, { "epoch": 0.0565802951405793, "grad_norm": 2.814545154571533, "learning_rate": 2.97636543396797e-05, "loss": 1.9929, "step": 29055 }, { "epoch": 0.05660950541168956, "grad_norm": 3.4638845920562744, "learning_rate": 2.976341088770039e-05, "loss": 1.8635, "step": 29070 }, { "epoch": 0.05663871568279982, "grad_norm": 2.0748291015625, "learning_rate": 2.976316731139632e-05, "loss": 1.6827, "step": 29085 }, { "epoch": 0.056667925953910084, "grad_norm": 2.4861667156219482, "learning_rate": 2.9762923610769545e-05, "loss": 1.8022, "step": 29100 }, { "epoch": 0.05669713622502035, "grad_norm": 2.12156343460083, "learning_rate": 2.9762679785822113e-05, "loss": 1.8912, "step": 29115 }, { "epoch": 0.05672634649613061, "grad_norm": 2.3051788806915283, "learning_rate": 2.9762435836556075e-05, "loss": 1.786, "step": 29130 }, { "epoch": 0.056755556767240874, "grad_norm": 3.0045323371887207, "learning_rate": 2.9762191762973492e-05, "loss": 1.9105, "step": 29145 }, { "epoch": 0.05678476703835114, "grad_norm": 2.9411308765411377, "learning_rate": 2.9761947565076413e-05, "loss": 1.7949, "step": 29160 }, { "epoch": 0.0568139773094614, "grad_norm": 3.0440449714660645, "learning_rate": 2.97617032428669e-05, "loss": 2.0024, "step": 29175 }, { "epoch": 0.056843187580571664, "grad_norm": 3.836582660675049, "learning_rate": 2.976145879634701e-05, "loss": 1.9192, "step": 29190 }, { "epoch": 0.056872397851681925, "grad_norm": 3.63226580619812, "learning_rate": 2.9761214225518792e-05, "loss": 1.6728, "step": 29205 }, { "epoch": 0.05690160812279219, "grad_norm": 2.588270902633667, "learning_rate": 2.9760969530384317e-05, "loss": 1.8828, "step": 29220 }, { "epoch": 0.05693081839390245, "grad_norm": 3.4929933547973633, "learning_rate": 2.9760724710945642e-05, "loss": 2.0755, "step": 29235 }, { "epoch": 0.056960028665012714, "grad_norm": 2.916142463684082, "learning_rate": 2.976047976720483e-05, "loss": 1.8955, "step": 29250 }, { "epoch": 0.05698923893612298, "grad_norm": 3.606926918029785, "learning_rate": 2.976023469916394e-05, "loss": 1.9196, "step": 29265 }, { "epoch": 0.05701844920723324, "grad_norm": 4.033385276794434, "learning_rate": 2.9759989506825033e-05, "loss": 1.9555, "step": 29280 }, { "epoch": 0.057047659478343504, "grad_norm": 2.1779022216796875, "learning_rate": 2.9759744190190185e-05, "loss": 1.7735, "step": 29295 }, { "epoch": 0.057076869749453765, "grad_norm": 4.592677116394043, "learning_rate": 2.9759498749261452e-05, "loss": 1.8971, "step": 29310 }, { "epoch": 0.05710608002056403, "grad_norm": 4.797886371612549, "learning_rate": 2.9759253184040906e-05, "loss": 1.8494, "step": 29325 }, { "epoch": 0.057135290291674294, "grad_norm": 2.316049337387085, "learning_rate": 2.9759007494530615e-05, "loss": 1.7288, "step": 29340 }, { "epoch": 0.057164500562784555, "grad_norm": 2.9396653175354004, "learning_rate": 2.975876168073264e-05, "loss": 1.8751, "step": 29355 }, { "epoch": 0.05719371083389482, "grad_norm": 4.224997043609619, "learning_rate": 2.9758515742649063e-05, "loss": 1.78, "step": 29370 }, { "epoch": 0.057222921105005084, "grad_norm": 2.5549795627593994, "learning_rate": 2.9758269680281946e-05, "loss": 1.7375, "step": 29385 }, { "epoch": 0.057252131376115345, "grad_norm": 3.9945852756500244, "learning_rate": 2.9758023493633365e-05, "loss": 1.7078, "step": 29400 }, { "epoch": 0.057281341647225606, "grad_norm": 3.5278730392456055, "learning_rate": 2.975777718270539e-05, "loss": 1.7572, "step": 29415 }, { "epoch": 0.057310551918335874, "grad_norm": 3.9674766063690186, "learning_rate": 2.9757530747500104e-05, "loss": 1.9125, "step": 29430 }, { "epoch": 0.057339762189446135, "grad_norm": 3.498309373855591, "learning_rate": 2.9757284188019573e-05, "loss": 1.8242, "step": 29445 }, { "epoch": 0.057368972460556396, "grad_norm": 3.2913856506347656, "learning_rate": 2.9757037504265874e-05, "loss": 1.8559, "step": 29460 }, { "epoch": 0.05739818273166666, "grad_norm": 2.72259521484375, "learning_rate": 2.9756790696241088e-05, "loss": 1.7855, "step": 29475 }, { "epoch": 0.057427393002776925, "grad_norm": 7.3963704109191895, "learning_rate": 2.9756543763947292e-05, "loss": 1.9272, "step": 29490 }, { "epoch": 0.057456603273887186, "grad_norm": 2.901502847671509, "learning_rate": 2.9756296707386566e-05, "loss": 1.7294, "step": 29505 }, { "epoch": 0.05748581354499745, "grad_norm": 1.9630255699157715, "learning_rate": 2.9756049526560995e-05, "loss": 1.8477, "step": 29520 }, { "epoch": 0.057515023816107715, "grad_norm": 2.449836492538452, "learning_rate": 2.975580222147265e-05, "loss": 1.9282, "step": 29535 }, { "epoch": 0.057544234087217976, "grad_norm": 2.7461347579956055, "learning_rate": 2.9755554792123617e-05, "loss": 1.8883, "step": 29550 }, { "epoch": 0.05757344435832824, "grad_norm": 3.3567450046539307, "learning_rate": 2.9755307238515986e-05, "loss": 1.9717, "step": 29565 }, { "epoch": 0.0576026546294385, "grad_norm": 3.6341872215270996, "learning_rate": 2.975505956065184e-05, "loss": 1.9378, "step": 29580 }, { "epoch": 0.057631864900548765, "grad_norm": 4.170873165130615, "learning_rate": 2.9754811758533253e-05, "loss": 1.9077, "step": 29595 }, { "epoch": 0.057661075171659026, "grad_norm": 1.9185329675674438, "learning_rate": 2.975456383216233e-05, "loss": 1.7987, "step": 29610 }, { "epoch": 0.05769028544276929, "grad_norm": 2.3139781951904297, "learning_rate": 2.9754315781541144e-05, "loss": 1.9518, "step": 29625 }, { "epoch": 0.057719495713879555, "grad_norm": 3.581907033920288, "learning_rate": 2.9754067606671794e-05, "loss": 1.8729, "step": 29640 }, { "epoch": 0.057748705984989816, "grad_norm": 2.490278720855713, "learning_rate": 2.975381930755636e-05, "loss": 1.7576, "step": 29655 }, { "epoch": 0.05777791625610008, "grad_norm": 3.5740153789520264, "learning_rate": 2.9753570884196942e-05, "loss": 1.9749, "step": 29670 }, { "epoch": 0.05780712652721034, "grad_norm": 3.012131929397583, "learning_rate": 2.975332233659563e-05, "loss": 1.8704, "step": 29685 }, { "epoch": 0.057836336798320606, "grad_norm": 4.225703239440918, "learning_rate": 2.9753073664754514e-05, "loss": 1.6134, "step": 29700 }, { "epoch": 0.05786554706943087, "grad_norm": 3.840487480163574, "learning_rate": 2.9752824868675693e-05, "loss": 1.822, "step": 29715 }, { "epoch": 0.05789475734054113, "grad_norm": 2.8694260120391846, "learning_rate": 2.975257594836125e-05, "loss": 1.939, "step": 29730 }, { "epoch": 0.057923967611651396, "grad_norm": 3.4660465717315674, "learning_rate": 2.97523269038133e-05, "loss": 1.9517, "step": 29745 }, { "epoch": 0.05795317788276166, "grad_norm": 3.125666379928589, "learning_rate": 2.9752077735033924e-05, "loss": 1.7034, "step": 29760 }, { "epoch": 0.05798238815387192, "grad_norm": 4.895527362823486, "learning_rate": 2.975182844202523e-05, "loss": 1.8021, "step": 29775 }, { "epoch": 0.05801159842498218, "grad_norm": 3.4364778995513916, "learning_rate": 2.9751579024789314e-05, "loss": 1.9357, "step": 29790 }, { "epoch": 0.05804080869609245, "grad_norm": 3.209791421890259, "learning_rate": 2.9751329483328276e-05, "loss": 1.8969, "step": 29805 }, { "epoch": 0.05807001896720271, "grad_norm": 2.851810932159424, "learning_rate": 2.9751079817644217e-05, "loss": 1.8274, "step": 29820 }, { "epoch": 0.05809922923831297, "grad_norm": 1.9028266668319702, "learning_rate": 2.975083002773924e-05, "loss": 1.7498, "step": 29835 }, { "epoch": 0.05812843950942324, "grad_norm": 4.442659378051758, "learning_rate": 2.9750580113615448e-05, "loss": 1.906, "step": 29850 }, { "epoch": 0.0581576497805335, "grad_norm": 4.513755798339844, "learning_rate": 2.9750330075274948e-05, "loss": 1.7855, "step": 29865 }, { "epoch": 0.05818686005164376, "grad_norm": 2.4421234130859375, "learning_rate": 2.975007991271984e-05, "loss": 1.8355, "step": 29880 }, { "epoch": 0.05821607032275402, "grad_norm": 2.0383033752441406, "learning_rate": 2.974982962595224e-05, "loss": 1.8456, "step": 29895 }, { "epoch": 0.05824528059386429, "grad_norm": 2.9908933639526367, "learning_rate": 2.9749579214974245e-05, "loss": 1.7223, "step": 29910 }, { "epoch": 0.05827449086497455, "grad_norm": 3.1746137142181396, "learning_rate": 2.9749328679787976e-05, "loss": 1.8949, "step": 29925 }, { "epoch": 0.05830370113608481, "grad_norm": 3.351712942123413, "learning_rate": 2.9749078020395526e-05, "loss": 1.8423, "step": 29940 }, { "epoch": 0.05833291140719508, "grad_norm": 4.363383769989014, "learning_rate": 2.9748827236799024e-05, "loss": 2.0718, "step": 29955 }, { "epoch": 0.05836212167830534, "grad_norm": 3.5311388969421387, "learning_rate": 2.974857632900057e-05, "loss": 1.8393, "step": 29970 }, { "epoch": 0.0583913319494156, "grad_norm": 2.739178419113159, "learning_rate": 2.974832529700228e-05, "loss": 1.901, "step": 29985 }, { "epoch": 0.05842054222052586, "grad_norm": 5.376321315765381, "learning_rate": 2.974807414080627e-05, "loss": 1.9742, "step": 30000 } ], "logging_steps": 15, "max_steps": 513518, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.741878296671027e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }