diff --git "a/sft_full/smoe/trainer_state.json" "b/sft_full/smoe/trainer_state.json" new file mode 100644--- /dev/null +++ "b/sft_full/smoe/trainer_state.json" @@ -0,0 +1,97062 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999639262652863, + "eval_steps": 500, + "global_step": 13860, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 7.214746942750983e-05, + "grad_norm": 20.24890133738356, + "learning_rate": 0.0, + "loss": 1.8623, + "step": 1 + }, + { + "epoch": 0.00014429493885501966, + "grad_norm": 16.15630572252238, + "learning_rate": 4.5974687827095554e-07, + "loss": 1.7528, + "step": 2 + }, + { + "epoch": 0.0002164424082825295, + "grad_norm": 4.4837207915112725, + "learning_rate": 7.286815618830787e-07, + "loss": 1.1144, + "step": 3 + }, + { + "epoch": 0.0002885898777100393, + "grad_norm": 14.926939994441469, + "learning_rate": 9.194937565419111e-07, + "loss": 1.6407, + "step": 4 + }, + { + "epoch": 0.00036073734713754915, + "grad_norm": 11.171948412674006, + "learning_rate": 1.0674991931940918e-06, + "loss": 1.5802, + "step": 5 + }, + { + "epoch": 0.000432884816565059, + "grad_norm": 12.177216715390827, + "learning_rate": 1.1884284401540344e-06, + "loss": 1.6265, + "step": 6 + }, + { + "epoch": 0.0005050322859925688, + "grad_norm": 14.14133663178267, + "learning_rate": 1.290672661614585e-06, + "loss": 1.638, + "step": 7 + }, + { + "epoch": 0.0005771797554200786, + "grad_norm": 8.065125503651412, + "learning_rate": 1.3792406348128665e-06, + "loss": 1.4894, + "step": 8 + }, + { + "epoch": 0.0006493272248475885, + "grad_norm": 10.150745578686458, + "learning_rate": 1.4573631237661574e-06, + "loss": 1.4969, + "step": 9 + }, + { + "epoch": 0.0007214746942750983, + "grad_norm": 11.337504199110302, + "learning_rate": 1.5272460714650477e-06, + "loss": 1.4471, + "step": 10 + }, + { + "epoch": 0.0007936221637026081, + "grad_norm": 12.290702618223637, + "learning_rate": 1.5904628872603362e-06, + "loss": 1.4572, + "step": 11 + }, + { + "epoch": 0.000865769633130118, + "grad_norm": 12.37451355873958, + "learning_rate": 1.6481753184249899e-06, + "loss": 1.5115, + "step": 12 + }, + { + "epoch": 0.0009379171025576278, + "grad_norm": 13.906546805996909, + "learning_rate": 1.7012656086452217e-06, + "loss": 1.4329, + "step": 13 + }, + { + "epoch": 0.0010100645719851376, + "grad_norm": 3.2772426294087778, + "learning_rate": 1.7504195398855406e-06, + "loss": 1.1061, + "step": 14 + }, + { + "epoch": 0.0010822120414126475, + "grad_norm": 9.951374085592715, + "learning_rate": 1.7961807550771707e-06, + "loss": 1.3569, + "step": 15 + }, + { + "epoch": 0.0011543595108401573, + "grad_norm": 39.63118389986208, + "learning_rate": 1.8389875130838222e-06, + "loss": 1.3933, + "step": 16 + }, + { + "epoch": 0.0012265069802676672, + "grad_norm": 6.265918144541293, + "learning_rate": 1.879198281313374e-06, + "loss": 1.3566, + "step": 17 + }, + { + "epoch": 0.001298654449695177, + "grad_norm": 7.300873210362515, + "learning_rate": 1.9171100020371126e-06, + "loss": 1.3929, + "step": 18 + }, + { + "epoch": 0.0013708019191226869, + "grad_norm": 2.916471983452135, + "learning_rate": 1.9529714134269907e-06, + "loss": 0.9918, + "step": 19 + }, + { + "epoch": 0.0014429493885501966, + "grad_norm": 10.738422280471674, + "learning_rate": 1.986992949736003e-06, + "loss": 1.3207, + "step": 20 + }, + { + "epoch": 0.0015150968579777065, + "grad_norm": 2.6247499437667328, + "learning_rate": 2.0193542234976638e-06, + "loss": 1.0339, + "step": 21 + }, + { + "epoch": 0.0015872443274052162, + "grad_norm": 7.489660638473027, + "learning_rate": 2.0502097655312917e-06, + "loss": 1.2665, + "step": 22 + }, + { + "epoch": 0.0016593917968327262, + "grad_norm": 4.3887353272004, + "learning_rate": 2.079693487962469e-06, + "loss": 1.1615, + "step": 23 + }, + { + "epoch": 0.001731539266260236, + "grad_norm": 3.9137882585827275, + "learning_rate": 2.1079221966959454e-06, + "loss": 1.2329, + "step": 24 + }, + { + "epoch": 0.0018036867356877458, + "grad_norm": 3.792933926486318, + "learning_rate": 2.1349983863881835e-06, + "loss": 1.2051, + "step": 25 + }, + { + "epoch": 0.0018758342051152556, + "grad_norm": 6.13259603519255, + "learning_rate": 2.1610124869161774e-06, + "loss": 1.107, + "step": 26 + }, + { + "epoch": 0.0019479816745427655, + "grad_norm": 3.411536637589322, + "learning_rate": 2.1860446856492365e-06, + "loss": 1.2325, + "step": 27 + }, + { + "epoch": 0.0020201291439702752, + "grad_norm": 3.6083957945693976, + "learning_rate": 2.210166418156496e-06, + "loss": 1.2265, + "step": 28 + }, + { + "epoch": 0.002092276613397785, + "grad_norm": 4.707086966410392, + "learning_rate": 2.2334415972095315e-06, + "loss": 1.1902, + "step": 29 + }, + { + "epoch": 0.002164424082825295, + "grad_norm": 2.813784835647239, + "learning_rate": 2.255927633348126e-06, + "loss": 1.2112, + "step": 30 + }, + { + "epoch": 0.002236571552252805, + "grad_norm": 3.2743515886990533, + "learning_rate": 2.2776762880418517e-06, + "loss": 1.2511, + "step": 31 + }, + { + "epoch": 0.0023087190216803145, + "grad_norm": 3.4250222978074927, + "learning_rate": 2.2987343913547777e-06, + "loss": 1.1338, + "step": 32 + }, + { + "epoch": 0.0023808664911078243, + "grad_norm": 5.4346276073743045, + "learning_rate": 2.3191444491434147e-06, + "loss": 1.0982, + "step": 33 + }, + { + "epoch": 0.0024530139605353344, + "grad_norm": 3.4781129958501715, + "learning_rate": 2.3389451595843297e-06, + "loss": 1.1569, + "step": 34 + }, + { + "epoch": 0.002525161429962844, + "grad_norm": 3.518469308393763, + "learning_rate": 2.358171854808677e-06, + "loss": 1.2725, + "step": 35 + }, + { + "epoch": 0.002597308899390354, + "grad_norm": 2.926559691012817, + "learning_rate": 2.3768568803080688e-06, + "loss": 1.1346, + "step": 36 + }, + { + "epoch": 0.0026694563688178636, + "grad_norm": 5.5516720083244016, + "learning_rate": 2.3950299223460323e-06, + "loss": 1.179, + "step": 37 + }, + { + "epoch": 0.0027416038382453737, + "grad_norm": 3.2470244945075684, + "learning_rate": 2.4127182916979464e-06, + "loss": 1.2495, + "step": 38 + }, + { + "epoch": 0.0028137513076728835, + "grad_norm": 3.985961717795953, + "learning_rate": 2.4299471705283004e-06, + "loss": 1.1265, + "step": 39 + }, + { + "epoch": 0.002885898777100393, + "grad_norm": 3.8078906853076075, + "learning_rate": 2.4467398280069582e-06, + "loss": 1.1125, + "step": 40 + }, + { + "epoch": 0.002958046246527903, + "grad_norm": 4.155024986146057, + "learning_rate": 2.463117809297464e-06, + "loss": 1.1659, + "step": 41 + }, + { + "epoch": 0.003030193715955413, + "grad_norm": 3.2973191382471128, + "learning_rate": 2.4791011017686195e-06, + "loss": 1.1848, + "step": 42 + }, + { + "epoch": 0.0031023411853829228, + "grad_norm": 1.7728824503099994, + "learning_rate": 2.494708281646002e-06, + "loss": 0.9041, + "step": 43 + }, + { + "epoch": 0.0031744886548104325, + "grad_norm": 3.095045030602152, + "learning_rate": 2.5099566438022474e-06, + "loss": 1.111, + "step": 44 + }, + { + "epoch": 0.003246636124237942, + "grad_norm": 3.8472608953767, + "learning_rate": 2.5248623169602493e-06, + "loss": 1.2013, + "step": 45 + }, + { + "epoch": 0.0033187835936654524, + "grad_norm": 3.23193576995366, + "learning_rate": 2.539440366233425e-06, + "loss": 1.1457, + "step": 46 + }, + { + "epoch": 0.003390931063092962, + "grad_norm": 3.476378703884585, + "learning_rate": 2.553704884637446e-06, + "loss": 1.0829, + "step": 47 + }, + { + "epoch": 0.003463078532520472, + "grad_norm": 3.5263071490838933, + "learning_rate": 2.567669074966901e-06, + "loss": 1.0535, + "step": 48 + }, + { + "epoch": 0.0035352260019479815, + "grad_norm": 4.6547445641407075, + "learning_rate": 2.58134532322917e-06, + "loss": 1.0921, + "step": 49 + }, + { + "epoch": 0.0036073734713754917, + "grad_norm": 33.11859091889516, + "learning_rate": 2.5947452646591396e-06, + "loss": 1.152, + "step": 50 + }, + { + "epoch": 0.0036795209408030014, + "grad_norm": 5.589486143348035, + "learning_rate": 2.6078798431964527e-06, + "loss": 1.1594, + "step": 51 + }, + { + "epoch": 0.003751668410230511, + "grad_norm": 7.854263858210681, + "learning_rate": 2.6207593651871327e-06, + "loss": 1.1842, + "step": 52 + }, + { + "epoch": 0.003823815879658021, + "grad_norm": 3.4783508501646034, + "learning_rate": 2.6333935479697833e-06, + "loss": 1.1684, + "step": 53 + }, + { + "epoch": 0.003895963349085531, + "grad_norm": 3.371837830007202, + "learning_rate": 2.6457915639201917e-06, + "loss": 1.0895, + "step": 54 + }, + { + "epoch": 0.003968110818513041, + "grad_norm": 5.806196835441813, + "learning_rate": 2.6579620804544284e-06, + "loss": 1.1157, + "step": 55 + }, + { + "epoch": 0.0040402582879405504, + "grad_norm": 3.44459813468992, + "learning_rate": 2.669913296427452e-06, + "loss": 1.0614, + "step": 56 + }, + { + "epoch": 0.00411240575736806, + "grad_norm": 3.564867939261908, + "learning_rate": 2.68165297531007e-06, + "loss": 1.1047, + "step": 57 + }, + { + "epoch": 0.00418455322679557, + "grad_norm": 4.3233051661679465, + "learning_rate": 2.693188475480487e-06, + "loss": 1.0711, + "step": 58 + }, + { + "epoch": 0.00425670069622308, + "grad_norm": 3.0394081370268937, + "learning_rate": 2.704526777926441e-06, + "loss": 1.1127, + "step": 59 + }, + { + "epoch": 0.00432884816565059, + "grad_norm": 4.661469471951892, + "learning_rate": 2.7156745116190816e-06, + "loss": 1.0676, + "step": 60 + }, + { + "epoch": 0.0044009956350781, + "grad_norm": 4.8322753844576525, + "learning_rate": 2.7266379767895355e-06, + "loss": 1.0851, + "step": 61 + }, + { + "epoch": 0.00447314310450561, + "grad_norm": 4.551590928507169, + "learning_rate": 2.7374231663128074e-06, + "loss": 1.1078, + "step": 62 + }, + { + "epoch": 0.004545290573933119, + "grad_norm": 3.617512962249542, + "learning_rate": 2.7480357853807425e-06, + "loss": 1.1353, + "step": 63 + }, + { + "epoch": 0.004617438043360629, + "grad_norm": 4.140756511399135, + "learning_rate": 2.758481269625733e-06, + "loss": 1.1427, + "step": 64 + }, + { + "epoch": 0.004689585512788139, + "grad_norm": 3.6780352922647466, + "learning_rate": 2.7687648018393137e-06, + "loss": 1.1744, + "step": 65 + }, + { + "epoch": 0.0047617329822156485, + "grad_norm": 3.7673838453100092, + "learning_rate": 2.7788913274143704e-06, + "loss": 1.1671, + "step": 66 + }, + { + "epoch": 0.004833880451643158, + "grad_norm": 5.502698929694796, + "learning_rate": 2.7888655686261485e-06, + "loss": 0.9874, + "step": 67 + }, + { + "epoch": 0.004906027921070669, + "grad_norm": 4.857187250880113, + "learning_rate": 2.798692037855285e-06, + "loss": 1.125, + "step": 68 + }, + { + "epoch": 0.0049781753904981785, + "grad_norm": 5.280045953319444, + "learning_rate": 2.8083750498455478e-06, + "loss": 1.1318, + "step": 69 + }, + { + "epoch": 0.005050322859925688, + "grad_norm": 5.766227380562243, + "learning_rate": 2.817918733079633e-06, + "loss": 1.0993, + "step": 70 + }, + { + "epoch": 0.005122470329353198, + "grad_norm": 4.893151748745882, + "learning_rate": 2.8273270403480783e-06, + "loss": 1.1006, + "step": 71 + }, + { + "epoch": 0.005194617798780708, + "grad_norm": 5.095532438929385, + "learning_rate": 2.836603758579024e-06, + "loss": 1.0765, + "step": 72 + }, + { + "epoch": 0.005266765268208217, + "grad_norm": 4.829942280995099, + "learning_rate": 2.8457525179899824e-06, + "loss": 1.1323, + "step": 73 + }, + { + "epoch": 0.005338912737635727, + "grad_norm": 4.139887445366448, + "learning_rate": 2.8547768006169884e-06, + "loss": 1.0949, + "step": 74 + }, + { + "epoch": 0.005411060207063237, + "grad_norm": 4.721425087661398, + "learning_rate": 2.863679948271262e-06, + "loss": 1.1147, + "step": 75 + }, + { + "epoch": 0.0054832076764907475, + "grad_norm": 4.902549757561489, + "learning_rate": 2.872465169968902e-06, + "loss": 0.9565, + "step": 76 + }, + { + "epoch": 0.005555355145918257, + "grad_norm": 2.7531917429702513, + "learning_rate": 2.881135548874922e-06, + "loss": 1.0814, + "step": 77 + }, + { + "epoch": 0.005627502615345767, + "grad_norm": 3.527271020758376, + "learning_rate": 2.889694048799256e-06, + "loss": 1.2043, + "step": 78 + }, + { + "epoch": 0.005699650084773277, + "grad_norm": 3.2164006223668737, + "learning_rate": 2.8981435202789713e-06, + "loss": 1.0769, + "step": 79 + }, + { + "epoch": 0.005771797554200786, + "grad_norm": 3.110669810543551, + "learning_rate": 2.906486706277914e-06, + "loss": 1.0959, + "step": 80 + }, + { + "epoch": 0.005843945023628296, + "grad_norm": 3.6180011666265983, + "learning_rate": 2.9147262475323147e-06, + "loss": 1.0859, + "step": 81 + }, + { + "epoch": 0.005916092493055806, + "grad_norm": 6.207074270225365, + "learning_rate": 2.9228646875684196e-06, + "loss": 1.1065, + "step": 82 + }, + { + "epoch": 0.0059882399624833155, + "grad_norm": 4.1745210749790616, + "learning_rate": 2.9309044774159963e-06, + "loss": 1.0494, + "step": 83 + }, + { + "epoch": 0.006060387431910826, + "grad_norm": 1.791268733286679, + "learning_rate": 2.9388479800395748e-06, + "loss": 0.8946, + "step": 84 + }, + { + "epoch": 0.006132534901338336, + "grad_norm": 3.3051171711106564, + "learning_rate": 2.946697474507466e-06, + "loss": 1.1173, + "step": 85 + }, + { + "epoch": 0.0062046823707658455, + "grad_norm": 4.639181715402555, + "learning_rate": 2.954455159916957e-06, + "loss": 0.9877, + "step": 86 + }, + { + "epoch": 0.006276829840193355, + "grad_norm": 3.985540361706613, + "learning_rate": 2.96212315909261e-06, + "loss": 1.0954, + "step": 87 + }, + { + "epoch": 0.006348977309620865, + "grad_norm": 3.569732418557461, + "learning_rate": 2.969703522073203e-06, + "loss": 1.1126, + "step": 88 + }, + { + "epoch": 0.006421124779048375, + "grad_norm": 3.334615476708288, + "learning_rate": 2.9771982294016654e-06, + "loss": 1.0166, + "step": 89 + }, + { + "epoch": 0.006493272248475884, + "grad_norm": 4.377360769657815, + "learning_rate": 2.9846091952312046e-06, + "loss": 1.0903, + "step": 90 + }, + { + "epoch": 0.006565419717903394, + "grad_norm": 2.991240906946666, + "learning_rate": 2.9919382702598064e-06, + "loss": 1.088, + "step": 91 + }, + { + "epoch": 0.006637567187330905, + "grad_norm": 3.779268486448611, + "learning_rate": 2.99918724450438e-06, + "loss": 1.1146, + "step": 92 + }, + { + "epoch": 0.0067097146567584144, + "grad_norm": 3.7516409780356343, + "learning_rate": 3.0063578499249308e-06, + "loss": 1.1475, + "step": 93 + }, + { + "epoch": 0.006781862126185924, + "grad_norm": 7.03255928565136, + "learning_rate": 3.013451762908401e-06, + "loss": 1.0881, + "step": 94 + }, + { + "epoch": 0.006854009595613434, + "grad_norm": 4.574961744471631, + "learning_rate": 3.020470606621083e-06, + "loss": 1.1034, + "step": 95 + }, + { + "epoch": 0.006926157065040944, + "grad_norm": 4.126223064936772, + "learning_rate": 3.0274159532378564e-06, + "loss": 1.0978, + "step": 96 + }, + { + "epoch": 0.006998304534468453, + "grad_norm": 3.5135250347130964, + "learning_rate": 3.0342893260559215e-06, + "loss": 1.0416, + "step": 97 + }, + { + "epoch": 0.007070452003895963, + "grad_norm": 2.8315510099283348, + "learning_rate": 3.0410922015001263e-06, + "loss": 1.0653, + "step": 98 + }, + { + "epoch": 0.007142599473323473, + "grad_norm": 4.666065290848588, + "learning_rate": 3.0478260110264934e-06, + "loss": 1.2194, + "step": 99 + }, + { + "epoch": 0.007214746942750983, + "grad_norm": 4.559071546029565, + "learning_rate": 3.0544921429300953e-06, + "loss": 1.0841, + "step": 100 + }, + { + "epoch": 0.007286894412178493, + "grad_norm": 5.787216818948594, + "learning_rate": 3.061091944062968e-06, + "loss": 1.0957, + "step": 101 + }, + { + "epoch": 0.007359041881606003, + "grad_norm": 2.8854754794263293, + "learning_rate": 3.067626721467408e-06, + "loss": 1.0538, + "step": 102 + }, + { + "epoch": 0.0074311893510335125, + "grad_norm": 4.5284600449179155, + "learning_rate": 3.0740977439295834e-06, + "loss": 1.1195, + "step": 103 + }, + { + "epoch": 0.007503336820461022, + "grad_norm": 3.5594442546694856, + "learning_rate": 3.0805062434580884e-06, + "loss": 0.9759, + "step": 104 + }, + { + "epoch": 0.007575484289888532, + "grad_norm": 12.327769149846782, + "learning_rate": 3.0868534166917558e-06, + "loss": 1.0362, + "step": 105 + }, + { + "epoch": 0.007647631759316042, + "grad_norm": 4.342358482923061, + "learning_rate": 3.093140426240739e-06, + "loss": 1.1431, + "step": 106 + }, + { + "epoch": 0.007719779228743551, + "grad_norm": 4.900287674224967, + "learning_rate": 3.0993684019646337e-06, + "loss": 1.0273, + "step": 107 + }, + { + "epoch": 0.007791926698171062, + "grad_norm": 11.178837837864608, + "learning_rate": 3.1055384421911475e-06, + "loss": 1.1, + "step": 108 + }, + { + "epoch": 0.00786407416759857, + "grad_norm": 3.1587484310072282, + "learning_rate": 3.1116516148786068e-06, + "loss": 1.076, + "step": 109 + }, + { + "epoch": 0.007936221637026081, + "grad_norm": 2.5033178248054173, + "learning_rate": 3.1177089587253837e-06, + "loss": 0.8925, + "step": 110 + }, + { + "epoch": 0.00800836910645359, + "grad_norm": 2.8738080746696744, + "learning_rate": 3.123711484229111e-06, + "loss": 1.0437, + "step": 111 + }, + { + "epoch": 0.008080516575881101, + "grad_norm": 2.4871685708240254, + "learning_rate": 3.129660174698407e-06, + "loss": 1.1319, + "step": 112 + }, + { + "epoch": 0.008152664045308611, + "grad_norm": 3.390180848735623, + "learning_rate": 3.1355559872196274e-06, + "loss": 1.1482, + "step": 113 + }, + { + "epoch": 0.00822481151473612, + "grad_norm": 1.8310705728477599, + "learning_rate": 3.1413998535810255e-06, + "loss": 0.87, + "step": 114 + }, + { + "epoch": 0.008296958984163631, + "grad_norm": 3.1211270556146924, + "learning_rate": 3.147192681156561e-06, + "loss": 0.9994, + "step": 115 + }, + { + "epoch": 0.00836910645359114, + "grad_norm": 4.389285497544569, + "learning_rate": 3.1529353537514425e-06, + "loss": 1.045, + "step": 116 + }, + { + "epoch": 0.00844125392301865, + "grad_norm": 5.0024328035324395, + "learning_rate": 3.158628732411379e-06, + "loss": 1.029, + "step": 117 + }, + { + "epoch": 0.00851340139244616, + "grad_norm": 2.30402232770565, + "learning_rate": 3.164273656197397e-06, + "loss": 0.9387, + "step": 118 + }, + { + "epoch": 0.00858554886187367, + "grad_norm": 4.2703953712258205, + "learning_rate": 3.169870942927959e-06, + "loss": 0.9214, + "step": 119 + }, + { + "epoch": 0.00865769633130118, + "grad_norm": 5.049021733775727, + "learning_rate": 3.175421389890037e-06, + "loss": 1.045, + "step": 120 + }, + { + "epoch": 0.00872984380072869, + "grad_norm": 3.9225454783188916, + "learning_rate": 3.1809257745206724e-06, + "loss": 0.9792, + "step": 121 + }, + { + "epoch": 0.0088019912701562, + "grad_norm": 2.2198979806226586, + "learning_rate": 3.186384855060491e-06, + "loss": 1.0914, + "step": 122 + }, + { + "epoch": 0.008874138739583709, + "grad_norm": 3.247273695463176, + "learning_rate": 3.191799371180542e-06, + "loss": 1.046, + "step": 123 + }, + { + "epoch": 0.00894628620901122, + "grad_norm": 5.9247788528827305, + "learning_rate": 3.197170044583763e-06, + "loss": 1.0812, + "step": 124 + }, + { + "epoch": 0.009018433678438728, + "grad_norm": 3.5401000802930995, + "learning_rate": 3.202497579582276e-06, + "loss": 1.0064, + "step": 125 + }, + { + "epoch": 0.009090581147866239, + "grad_norm": 4.573835322649453, + "learning_rate": 3.207782663651698e-06, + "loss": 1.1355, + "step": 126 + }, + { + "epoch": 0.009162728617293748, + "grad_norm": 5.655152778214421, + "learning_rate": 3.213025967963534e-06, + "loss": 1.0396, + "step": 127 + }, + { + "epoch": 0.009234876086721258, + "grad_norm": 3.0180780583039306, + "learning_rate": 3.2182281478966887e-06, + "loss": 0.9421, + "step": 128 + }, + { + "epoch": 0.009307023556148769, + "grad_norm": 4.525558369787048, + "learning_rate": 3.22338984352908e-06, + "loss": 1.0872, + "step": 129 + }, + { + "epoch": 0.009379171025576278, + "grad_norm": 4.065544513489197, + "learning_rate": 3.228511680110269e-06, + "loss": 1.1756, + "step": 130 + }, + { + "epoch": 0.009451318495003788, + "grad_norm": 2.9725678640050797, + "learning_rate": 3.2335942685159767e-06, + "loss": 1.0972, + "step": 131 + }, + { + "epoch": 0.009523465964431297, + "grad_norm": 3.8222124800693447, + "learning_rate": 3.2386382056853257e-06, + "loss": 1.1156, + "step": 132 + }, + { + "epoch": 0.009595613433858808, + "grad_norm": 2.7514888894385185, + "learning_rate": 3.2436440750415763e-06, + "loss": 1.0802, + "step": 133 + }, + { + "epoch": 0.009667760903286316, + "grad_norm": 2.974578436467061, + "learning_rate": 3.248612446897104e-06, + "loss": 1.0232, + "step": 134 + }, + { + "epoch": 0.009739908372713827, + "grad_norm": 3.644073686568658, + "learning_rate": 3.2535438788433284e-06, + "loss": 1.1101, + "step": 135 + }, + { + "epoch": 0.009812055842141338, + "grad_norm": 3.6230650374376294, + "learning_rate": 3.258438916126241e-06, + "loss": 1.0038, + "step": 136 + }, + { + "epoch": 0.009884203311568847, + "grad_norm": 5.000728172684462, + "learning_rate": 3.2632980920081904e-06, + "loss": 1.1187, + "step": 137 + }, + { + "epoch": 0.009956350780996357, + "grad_norm": 4.116612548060554, + "learning_rate": 3.268121928116504e-06, + "loss": 1.02, + "step": 138 + }, + { + "epoch": 0.010028498250423866, + "grad_norm": 4.131526523402, + "learning_rate": 3.27291093477952e-06, + "loss": 1.1899, + "step": 139 + }, + { + "epoch": 0.010100645719851377, + "grad_norm": 13.743818523518923, + "learning_rate": 3.277665611350588e-06, + "loss": 0.9909, + "step": 140 + }, + { + "epoch": 0.010172793189278885, + "grad_norm": 8.191049519831273, + "learning_rate": 3.282386446520524e-06, + "loss": 1.0842, + "step": 141 + }, + { + "epoch": 0.010244940658706396, + "grad_norm": 3.9739010041360903, + "learning_rate": 3.2870739186190344e-06, + "loss": 1.0692, + "step": 142 + }, + { + "epoch": 0.010317088128133907, + "grad_norm": 3.482319315257509, + "learning_rate": 3.2917284959055577e-06, + "loss": 1.0068, + "step": 143 + }, + { + "epoch": 0.010389235597561415, + "grad_norm": 7.574200115210444, + "learning_rate": 3.2963506368499798e-06, + "loss": 1.1377, + "step": 144 + }, + { + "epoch": 0.010461383066988926, + "grad_norm": 3.663946804668289, + "learning_rate": 3.300940790403623e-06, + "loss": 1.151, + "step": 145 + }, + { + "epoch": 0.010533530536416435, + "grad_norm": 6.083582764132241, + "learning_rate": 3.3054993962609377e-06, + "loss": 1.0065, + "step": 146 + }, + { + "epoch": 0.010605678005843945, + "grad_norm": 3.038767508578339, + "learning_rate": 3.310026885112249e-06, + "loss": 1.0312, + "step": 147 + }, + { + "epoch": 0.010677825475271454, + "grad_norm": 3.0277664775423645, + "learning_rate": 3.3145236788879433e-06, + "loss": 1.0874, + "step": 148 + }, + { + "epoch": 0.010749972944698965, + "grad_norm": 2.7771038912226604, + "learning_rate": 3.3189901909944316e-06, + "loss": 1.0923, + "step": 149 + }, + { + "epoch": 0.010822120414126474, + "grad_norm": 3.7800610146820697, + "learning_rate": 3.323426826542218e-06, + "loss": 1.0158, + "step": 150 + }, + { + "epoch": 0.010894267883553984, + "grad_norm": 3.020721240539967, + "learning_rate": 3.327833982566395e-06, + "loss": 1.0984, + "step": 151 + }, + { + "epoch": 0.010966415352981495, + "grad_norm": 3.3208744067272926, + "learning_rate": 3.3322120482398574e-06, + "loss": 1.0941, + "step": 152 + }, + { + "epoch": 0.011038562822409004, + "grad_norm": 3.1584232515470574, + "learning_rate": 3.3365614050795313e-06, + "loss": 0.9282, + "step": 153 + }, + { + "epoch": 0.011110710291836514, + "grad_norm": 4.851619308556797, + "learning_rate": 3.3408824271458772e-06, + "loss": 1.0208, + "step": 154 + }, + { + "epoch": 0.011182857761264023, + "grad_norm": 3.87078542087715, + "learning_rate": 3.3451754812359436e-06, + "loss": 1.1072, + "step": 155 + }, + { + "epoch": 0.011255005230691534, + "grad_norm": 4.638755344643748, + "learning_rate": 3.3494409270702114e-06, + "loss": 1.0097, + "step": 156 + }, + { + "epoch": 0.011327152700119043, + "grad_norm": 3.5249261185499123, + "learning_rate": 3.3536791174734653e-06, + "loss": 1.0267, + "step": 157 + }, + { + "epoch": 0.011399300169546553, + "grad_norm": 3.8789336634778406, + "learning_rate": 3.3578903985499274e-06, + "loss": 1.1194, + "step": 158 + }, + { + "epoch": 0.011471447638974064, + "grad_norm": 2.6895015271720233, + "learning_rate": 3.362075109852862e-06, + "loss": 1.1209, + "step": 159 + }, + { + "epoch": 0.011543595108401573, + "grad_norm": 3.9178175892294407, + "learning_rate": 3.3662335845488692e-06, + "loss": 1.0819, + "step": 160 + }, + { + "epoch": 0.011615742577829083, + "grad_norm": 1.3585362532449845, + "learning_rate": 3.370366149577054e-06, + "loss": 0.8954, + "step": 161 + }, + { + "epoch": 0.011687890047256592, + "grad_norm": 4.273242450844695, + "learning_rate": 3.37447312580327e-06, + "loss": 1.1388, + "step": 162 + }, + { + "epoch": 0.011760037516684103, + "grad_norm": 3.5844172826932033, + "learning_rate": 3.3785548281696194e-06, + "loss": 1.0554, + "step": 163 + }, + { + "epoch": 0.011832184986111612, + "grad_norm": 2.6490027800588973, + "learning_rate": 3.3826115658393753e-06, + "loss": 0.997, + "step": 164 + }, + { + "epoch": 0.011904332455539122, + "grad_norm": 3.882203083070004, + "learning_rate": 3.3866436423375062e-06, + "loss": 1.0151, + "step": 165 + }, + { + "epoch": 0.011976479924966631, + "grad_norm": 3.502134762003791, + "learning_rate": 3.3906513556869524e-06, + "loss": 1.0426, + "step": 166 + }, + { + "epoch": 0.012048627394394142, + "grad_norm": 1.5807501986852035, + "learning_rate": 3.3946349985408007e-06, + "loss": 0.8422, + "step": 167 + }, + { + "epoch": 0.012120774863821652, + "grad_norm": 1.1340471576862767, + "learning_rate": 3.3985948583105305e-06, + "loss": 0.8783, + "step": 168 + }, + { + "epoch": 0.012192922333249161, + "grad_norm": 2.7494993621109525, + "learning_rate": 3.4025312172904435e-06, + "loss": 0.9344, + "step": 169 + }, + { + "epoch": 0.012265069802676672, + "grad_norm": 3.431956728780276, + "learning_rate": 3.4064443527784217e-06, + "loss": 1.09, + "step": 170 + }, + { + "epoch": 0.01233721727210418, + "grad_norm": 2.7515757121213533, + "learning_rate": 3.4103345371931485e-06, + "loss": 1.0037, + "step": 171 + }, + { + "epoch": 0.012409364741531691, + "grad_norm": 3.346391721680479, + "learning_rate": 3.414202038187913e-06, + "loss": 1.1029, + "step": 172 + }, + { + "epoch": 0.0124815122109592, + "grad_norm": 3.862396663530326, + "learning_rate": 3.4180471187611114e-06, + "loss": 0.9873, + "step": 173 + }, + { + "epoch": 0.01255365968038671, + "grad_norm": 5.504869386265115, + "learning_rate": 3.4218700373635654e-06, + "loss": 1.0463, + "step": 174 + }, + { + "epoch": 0.012625807149814221, + "grad_norm": 3.154721333670414, + "learning_rate": 3.4256710480027695e-06, + "loss": 1.0957, + "step": 175 + }, + { + "epoch": 0.01269795461924173, + "grad_norm": 1.1244819295948476, + "learning_rate": 3.4294504003441584e-06, + "loss": 0.8515, + "step": 176 + }, + { + "epoch": 0.01277010208866924, + "grad_norm": 5.974139112438346, + "learning_rate": 3.4332083398095194e-06, + "loss": 1.1231, + "step": 177 + }, + { + "epoch": 0.01284224955809675, + "grad_norm": 2.5336501433724488, + "learning_rate": 3.4369451076726216e-06, + "loss": 0.9939, + "step": 178 + }, + { + "epoch": 0.01291439702752426, + "grad_norm": 3.0051367016766743, + "learning_rate": 3.4406609411521665e-06, + "loss": 1.0795, + "step": 179 + }, + { + "epoch": 0.012986544496951769, + "grad_norm": 5.052002371961893, + "learning_rate": 3.4443560735021607e-06, + "loss": 1.0056, + "step": 180 + }, + { + "epoch": 0.01305869196637928, + "grad_norm": 5.106329987551966, + "learning_rate": 3.4480307340997692e-06, + "loss": 1.0769, + "step": 181 + }, + { + "epoch": 0.013130839435806788, + "grad_norm": 5.984955682295649, + "learning_rate": 3.4516851485307625e-06, + "loss": 1.0844, + "step": 182 + }, + { + "epoch": 0.013202986905234299, + "grad_norm": 5.825517086277386, + "learning_rate": 3.455319538672614e-06, + "loss": 1.0149, + "step": 183 + }, + { + "epoch": 0.01327513437466181, + "grad_norm": 2.3122870888512765, + "learning_rate": 3.4589341227753358e-06, + "loss": 1.092, + "step": 184 + }, + { + "epoch": 0.013347281844089318, + "grad_norm": 3.0614399535068824, + "learning_rate": 3.462529115540124e-06, + "loss": 0.9476, + "step": 185 + }, + { + "epoch": 0.013419429313516829, + "grad_norm": 5.806183366834588, + "learning_rate": 3.466104728195886e-06, + "loss": 1.0933, + "step": 186 + }, + { + "epoch": 0.013491576782944338, + "grad_norm": 2.911303812383807, + "learning_rate": 3.4696611685737104e-06, + "loss": 1.073, + "step": 187 + }, + { + "epoch": 0.013563724252371848, + "grad_norm": 3.48647433276229, + "learning_rate": 3.4731986411793564e-06, + "loss": 1.0499, + "step": 188 + }, + { + "epoch": 0.013635871721799357, + "grad_norm": 4.102689344835835, + "learning_rate": 3.4767173472638216e-06, + "loss": 0.8867, + "step": 189 + }, + { + "epoch": 0.013708019191226868, + "grad_norm": 3.50799519290399, + "learning_rate": 3.4802174848920384e-06, + "loss": 0.9873, + "step": 190 + }, + { + "epoch": 0.013780166660654378, + "grad_norm": 4.021046107432207, + "learning_rate": 3.4836992490097806e-06, + "loss": 1.0599, + "step": 191 + }, + { + "epoch": 0.013852314130081887, + "grad_norm": 4.511146515726633, + "learning_rate": 3.487162831508812e-06, + "loss": 1.1435, + "step": 192 + }, + { + "epoch": 0.013924461599509398, + "grad_norm": 4.23118645804855, + "learning_rate": 3.490608421290348e-06, + "loss": 0.9764, + "step": 193 + }, + { + "epoch": 0.013996609068936907, + "grad_norm": 2.9552280264462603, + "learning_rate": 3.4940362043268776e-06, + "loss": 1.0319, + "step": 194 + }, + { + "epoch": 0.014068756538364417, + "grad_norm": 3.5907948160809533, + "learning_rate": 3.4974463637223924e-06, + "loss": 1.1516, + "step": 195 + }, + { + "epoch": 0.014140904007791926, + "grad_norm": 4.926866113929812, + "learning_rate": 3.500839079771081e-06, + "loss": 0.9367, + "step": 196 + }, + { + "epoch": 0.014213051477219437, + "grad_norm": 2.289240599073583, + "learning_rate": 3.504214530014526e-06, + "loss": 1.0202, + "step": 197 + }, + { + "epoch": 0.014285198946646946, + "grad_norm": 2.6710874947371184, + "learning_rate": 3.507572889297449e-06, + "loss": 1.1045, + "step": 198 + }, + { + "epoch": 0.014357346416074456, + "grad_norm": 2.2894160808071455, + "learning_rate": 3.5109143298220628e-06, + "loss": 1.0485, + "step": 199 + }, + { + "epoch": 0.014429493885501967, + "grad_norm": 2.9090664250253826, + "learning_rate": 3.5142390212010506e-06, + "loss": 1.0323, + "step": 200 + }, + { + "epoch": 0.014501641354929476, + "grad_norm": 2.827988847858934, + "learning_rate": 3.517547130509227e-06, + "loss": 1.152, + "step": 201 + }, + { + "epoch": 0.014573788824356986, + "grad_norm": 2.991356318394081, + "learning_rate": 3.5208388223339235e-06, + "loss": 1.0548, + "step": 202 + }, + { + "epoch": 0.014645936293784495, + "grad_norm": 3.1015076139960214, + "learning_rate": 3.5241142588241166e-06, + "loss": 1.0381, + "step": 203 + }, + { + "epoch": 0.014718083763212006, + "grad_norm": 4.008769436043613, + "learning_rate": 3.5273735997383636e-06, + "loss": 0.9668, + "step": 204 + }, + { + "epoch": 0.014790231232639514, + "grad_norm": 2.9374696445925754, + "learning_rate": 3.530617002491556e-06, + "loss": 1.0195, + "step": 205 + }, + { + "epoch": 0.014862378702067025, + "grad_norm": 2.713981392658178, + "learning_rate": 3.5338446222005387e-06, + "loss": 0.9781, + "step": 206 + }, + { + "epoch": 0.014934526171494536, + "grad_norm": 4.039583095152469, + "learning_rate": 3.5370566117286265e-06, + "loss": 1.0319, + "step": 207 + }, + { + "epoch": 0.015006673640922044, + "grad_norm": 2.602536955469363, + "learning_rate": 3.5402531217290437e-06, + "loss": 1.0964, + "step": 208 + }, + { + "epoch": 0.015078821110349555, + "grad_norm": 3.5923800303603373, + "learning_rate": 3.543434300687327e-06, + "loss": 1.0397, + "step": 209 + }, + { + "epoch": 0.015150968579777064, + "grad_norm": 5.3959837000256385, + "learning_rate": 3.546600294962711e-06, + "loss": 0.954, + "step": 210 + }, + { + "epoch": 0.015223116049204575, + "grad_norm": 2.6705251038783246, + "learning_rate": 3.549751248828536e-06, + "loss": 0.9806, + "step": 211 + }, + { + "epoch": 0.015295263518632083, + "grad_norm": 5.498993347985295, + "learning_rate": 3.5528873045116943e-06, + "loss": 1.0031, + "step": 212 + }, + { + "epoch": 0.015367410988059594, + "grad_norm": 4.6607484529158425, + "learning_rate": 3.5560086022311574e-06, + "loss": 1.0655, + "step": 213 + }, + { + "epoch": 0.015439558457487103, + "grad_norm": 2.470182848042811, + "learning_rate": 3.559115280235589e-06, + "loss": 1.1498, + "step": 214 + }, + { + "epoch": 0.015511705926914613, + "grad_norm": 5.114211473588282, + "learning_rate": 3.562207474840094e-06, + "loss": 1.0276, + "step": 215 + }, + { + "epoch": 0.015583853396342124, + "grad_norm": 5.833985682750105, + "learning_rate": 3.5652853204621027e-06, + "loss": 0.9857, + "step": 216 + }, + { + "epoch": 0.015656000865769633, + "grad_norm": 1.172306540822889, + "learning_rate": 3.568348949656437e-06, + "loss": 0.8447, + "step": 217 + }, + { + "epoch": 0.01572814833519714, + "grad_norm": 6.327031698090549, + "learning_rate": 3.5713984931495625e-06, + "loss": 1.1086, + "step": 218 + }, + { + "epoch": 0.015800295804624654, + "grad_norm": 4.9364076202722, + "learning_rate": 3.574434079873061e-06, + "loss": 1.0656, + "step": 219 + }, + { + "epoch": 0.015872443274052163, + "grad_norm": 7.728227611435895, + "learning_rate": 3.57745583699634e-06, + "loss": 1.0206, + "step": 220 + }, + { + "epoch": 0.01594459074347967, + "grad_norm": 11.647256526527032, + "learning_rate": 3.5804638899585957e-06, + "loss": 1.0887, + "step": 221 + }, + { + "epoch": 0.01601673821290718, + "grad_norm": 3.241586652485744, + "learning_rate": 3.5834583625000667e-06, + "loss": 1.0928, + "step": 222 + }, + { + "epoch": 0.016088885682334693, + "grad_norm": 1.535446023164367, + "learning_rate": 3.58643937669257e-06, + "loss": 0.8484, + "step": 223 + }, + { + "epoch": 0.016161033151762202, + "grad_norm": 4.430530071019164, + "learning_rate": 3.5894070529693628e-06, + "loss": 0.9539, + "step": 224 + }, + { + "epoch": 0.01623318062118971, + "grad_norm": 6.222712648834183, + "learning_rate": 3.5923615101543413e-06, + "loss": 1.0014, + "step": 225 + }, + { + "epoch": 0.016305328090617223, + "grad_norm": 7.992572197180857, + "learning_rate": 3.5953028654905827e-06, + "loss": 0.9953, + "step": 226 + }, + { + "epoch": 0.016377475560044732, + "grad_norm": 2.982793738584429, + "learning_rate": 3.598231234668268e-06, + "loss": 1.0616, + "step": 227 + }, + { + "epoch": 0.01644962302947224, + "grad_norm": 4.8410662691891195, + "learning_rate": 3.6011467318519812e-06, + "loss": 1.0795, + "step": 228 + }, + { + "epoch": 0.01652177049889975, + "grad_norm": 3.3309680812271822, + "learning_rate": 3.6040494697074194e-06, + "loss": 1.1426, + "step": 229 + }, + { + "epoch": 0.016593917968327262, + "grad_norm": 4.129939041242014, + "learning_rate": 3.6069395594275168e-06, + "loss": 1.1113, + "step": 230 + }, + { + "epoch": 0.01666606543775477, + "grad_norm": 3.007105518830258, + "learning_rate": 3.609817110758e-06, + "loss": 1.0414, + "step": 231 + }, + { + "epoch": 0.01673821290718228, + "grad_norm": 7.84151180599535, + "learning_rate": 3.612682232022398e-06, + "loss": 1.0072, + "step": 232 + }, + { + "epoch": 0.016810360376609792, + "grad_norm": 4.275911441481863, + "learning_rate": 3.6155350301465064e-06, + "loss": 1.0856, + "step": 233 + }, + { + "epoch": 0.0168825078460373, + "grad_norm": 2.901403837123371, + "learning_rate": 3.618375610682335e-06, + "loss": 1.1268, + "step": 234 + }, + { + "epoch": 0.01695465531546481, + "grad_norm": 4.705381948147685, + "learning_rate": 3.6212040778315378e-06, + "loss": 1.0281, + "step": 235 + }, + { + "epoch": 0.01702680278489232, + "grad_norm": 3.651078438494315, + "learning_rate": 3.6240205344683526e-06, + "loss": 1.0444, + "step": 236 + }, + { + "epoch": 0.01709895025431983, + "grad_norm": 5.17070764004109, + "learning_rate": 3.6268250821620504e-06, + "loss": 1.0715, + "step": 237 + }, + { + "epoch": 0.01717109772374734, + "grad_norm": 3.7494552216593435, + "learning_rate": 3.629617821198915e-06, + "loss": 0.9463, + "step": 238 + }, + { + "epoch": 0.01724324519317485, + "grad_norm": 3.4200554456176047, + "learning_rate": 3.6323988506037587e-06, + "loss": 1.0774, + "step": 239 + }, + { + "epoch": 0.01731539266260236, + "grad_norm": 6.503239044699939, + "learning_rate": 3.6351682681609926e-06, + "loss": 1.0738, + "step": 240 + }, + { + "epoch": 0.01738754013202987, + "grad_norm": 3.0125710633313023, + "learning_rate": 3.6379261704352583e-06, + "loss": 1.0409, + "step": 241 + }, + { + "epoch": 0.01745968760145738, + "grad_norm": 5.329979895290798, + "learning_rate": 3.640672652791628e-06, + "loss": 1.0108, + "step": 242 + }, + { + "epoch": 0.017531835070884887, + "grad_norm": 3.8640699979629916, + "learning_rate": 3.6434078094153934e-06, + "loss": 1.0208, + "step": 243 + }, + { + "epoch": 0.0176039825403124, + "grad_norm": 3.7714891794214607, + "learning_rate": 3.6461317333314465e-06, + "loss": 1.1249, + "step": 244 + }, + { + "epoch": 0.01767613000973991, + "grad_norm": 1.900705121354574, + "learning_rate": 3.6488445164232626e-06, + "loss": 1.0056, + "step": 245 + }, + { + "epoch": 0.017748277479167417, + "grad_norm": 5.797970896979114, + "learning_rate": 3.6515462494514983e-06, + "loss": 1.0351, + "step": 246 + }, + { + "epoch": 0.01782042494859493, + "grad_norm": 6.194231475377445, + "learning_rate": 3.654237022072213e-06, + "loss": 1.0411, + "step": 247 + }, + { + "epoch": 0.01789257241802244, + "grad_norm": 3.7091589590789744, + "learning_rate": 3.6569169228547184e-06, + "loss": 1.0609, + "step": 248 + }, + { + "epoch": 0.017964719887449947, + "grad_norm": 4.331373187098185, + "learning_rate": 3.659586039299075e-06, + "loss": 0.9315, + "step": 249 + }, + { + "epoch": 0.018036867356877456, + "grad_norm": 8.726704568141912, + "learning_rate": 3.662244457853231e-06, + "loss": 1.1815, + "step": 250 + }, + { + "epoch": 0.01810901482630497, + "grad_norm": 2.7473859500556586, + "learning_rate": 3.664892263929826e-06, + "loss": 1.0089, + "step": 251 + }, + { + "epoch": 0.018181162295732477, + "grad_norm": 3.4500197866121196, + "learning_rate": 3.667529541922654e-06, + "loss": 1.0611, + "step": 252 + }, + { + "epoch": 0.018253309765159986, + "grad_norm": 6.285393254640359, + "learning_rate": 3.6701563752228055e-06, + "loss": 1.0594, + "step": 253 + }, + { + "epoch": 0.018325457234587495, + "grad_norm": 98.6199400634831, + "learning_rate": 3.6727728462344894e-06, + "loss": 0.8999, + "step": 254 + }, + { + "epoch": 0.018397604704015007, + "grad_norm": 4.1864237754568, + "learning_rate": 3.675379036390544e-06, + "loss": 1.009, + "step": 255 + }, + { + "epoch": 0.018469752173442516, + "grad_norm": 1.2797872986497234, + "learning_rate": 3.6779750261676444e-06, + "loss": 0.9285, + "step": 256 + }, + { + "epoch": 0.018541899642870025, + "grad_norm": 3.0938765374710364, + "learning_rate": 3.6805608951012116e-06, + "loss": 0.9425, + "step": 257 + }, + { + "epoch": 0.018614047112297537, + "grad_norm": 4.0557928030448585, + "learning_rate": 3.6831367218000362e-06, + "loss": 0.9967, + "step": 258 + }, + { + "epoch": 0.018686194581725046, + "grad_norm": 3.9591588949647645, + "learning_rate": 3.6857025839606174e-06, + "loss": 0.9769, + "step": 259 + }, + { + "epoch": 0.018758342051152555, + "grad_norm": 1.1472635302266372, + "learning_rate": 3.6882585583812243e-06, + "loss": 0.913, + "step": 260 + }, + { + "epoch": 0.018830489520580064, + "grad_norm": 2.5158710240963655, + "learning_rate": 3.6908047209756893e-06, + "loss": 1.0906, + "step": 261 + }, + { + "epoch": 0.018902636990007576, + "grad_norm": 3.429475747351534, + "learning_rate": 3.6933411467869324e-06, + "loss": 1.0409, + "step": 262 + }, + { + "epoch": 0.018974784459435085, + "grad_norm": 2.8980585966221537, + "learning_rate": 3.6958679100002413e-06, + "loss": 1.0077, + "step": 263 + }, + { + "epoch": 0.019046931928862594, + "grad_norm": 6.771494796916555, + "learning_rate": 3.6983850839562814e-06, + "loss": 0.9932, + "step": 264 + }, + { + "epoch": 0.019119079398290106, + "grad_norm": 2.9544519743102713, + "learning_rate": 3.7008927411638757e-06, + "loss": 1.0184, + "step": 265 + }, + { + "epoch": 0.019191226867717615, + "grad_norm": 3.002781690966887, + "learning_rate": 3.7033909533125315e-06, + "loss": 1.0991, + "step": 266 + }, + { + "epoch": 0.019263374337145124, + "grad_norm": 3.3037825262898384, + "learning_rate": 3.7058797912847445e-06, + "loss": 1.0957, + "step": 267 + }, + { + "epoch": 0.019335521806572633, + "grad_norm": 3.698175011148869, + "learning_rate": 3.7083593251680595e-06, + "loss": 1.0684, + "step": 268 + }, + { + "epoch": 0.019407669276000145, + "grad_norm": 3.5379832392630344, + "learning_rate": 3.7108296242669196e-06, + "loss": 0.9686, + "step": 269 + }, + { + "epoch": 0.019479816745427654, + "grad_norm": 4.075984934096715, + "learning_rate": 3.7132907571142837e-06, + "loss": 0.9101, + "step": 270 + }, + { + "epoch": 0.019551964214855163, + "grad_norm": 2.651123305427081, + "learning_rate": 3.7157427914830385e-06, + "loss": 1.0309, + "step": 271 + }, + { + "epoch": 0.019624111684282675, + "grad_norm": 2.2205779376680974, + "learning_rate": 3.7181857943971964e-06, + "loss": 1.136, + "step": 272 + }, + { + "epoch": 0.019696259153710184, + "grad_norm": 5.052814024425368, + "learning_rate": 3.7206198321428855e-06, + "loss": 1.0423, + "step": 273 + }, + { + "epoch": 0.019768406623137693, + "grad_norm": 3.077363205572464, + "learning_rate": 3.723044970279146e-06, + "loss": 1.0007, + "step": 274 + }, + { + "epoch": 0.019840554092565202, + "grad_norm": 2.8791073194497594, + "learning_rate": 3.7254612736485204e-06, + "loss": 0.8549, + "step": 275 + }, + { + "epoch": 0.019912701561992714, + "grad_norm": 2.9164330706830777, + "learning_rate": 3.7278688063874583e-06, + "loss": 1.0868, + "step": 276 + }, + { + "epoch": 0.019984849031420223, + "grad_norm": 3.543597484179434, + "learning_rate": 3.730267631936535e-06, + "loss": 1.0259, + "step": 277 + }, + { + "epoch": 0.020056996500847732, + "grad_norm": 3.0496270686550404, + "learning_rate": 3.732657813050475e-06, + "loss": 1.0582, + "step": 278 + }, + { + "epoch": 0.020129143970275244, + "grad_norm": 3.0008445186842625, + "learning_rate": 3.735039411808009e-06, + "loss": 0.7942, + "step": 279 + }, + { + "epoch": 0.020201291439702753, + "grad_norm": 4.465368738956898, + "learning_rate": 3.7374124896215433e-06, + "loss": 0.8898, + "step": 280 + }, + { + "epoch": 0.020273438909130262, + "grad_norm": 3.3162475116603933, + "learning_rate": 3.739777107246667e-06, + "loss": 0.9954, + "step": 281 + }, + { + "epoch": 0.02034558637855777, + "grad_norm": 3.905404571598042, + "learning_rate": 3.7421333247914798e-06, + "loss": 1.1137, + "step": 282 + }, + { + "epoch": 0.020417733847985283, + "grad_norm": 2.797107452450731, + "learning_rate": 3.7444812017257643e-06, + "loss": 1.1285, + "step": 283 + }, + { + "epoch": 0.020489881317412792, + "grad_norm": 3.614637625485329, + "learning_rate": 3.7468207968899897e-06, + "loss": 1.0179, + "step": 284 + }, + { + "epoch": 0.0205620287868403, + "grad_norm": 9.636157022161985, + "learning_rate": 3.749152168504162e-06, + "loss": 1.0274, + "step": 285 + }, + { + "epoch": 0.020634176256267813, + "grad_norm": 3.4628567508389736, + "learning_rate": 3.7514753741765134e-06, + "loss": 0.9997, + "step": 286 + }, + { + "epoch": 0.020706323725695322, + "grad_norm": 1.4084415313344765, + "learning_rate": 3.7537904709120494e-06, + "loss": 0.8383, + "step": 287 + }, + { + "epoch": 0.02077847119512283, + "grad_norm": 3.0406290836444025, + "learning_rate": 3.756097515120935e-06, + "loss": 1.0461, + "step": 288 + }, + { + "epoch": 0.02085061866455034, + "grad_norm": 2.571621454286048, + "learning_rate": 3.758396562626748e-06, + "loss": 1.0839, + "step": 289 + }, + { + "epoch": 0.020922766133977852, + "grad_norm": 4.729557015642055, + "learning_rate": 3.760687668674579e-06, + "loss": 0.9756, + "step": 290 + }, + { + "epoch": 0.02099491360340536, + "grad_norm": 3.3610198496329153, + "learning_rate": 3.7629708879390006e-06, + "loss": 0.996, + "step": 291 + }, + { + "epoch": 0.02106706107283287, + "grad_norm": 1.0399494257881108, + "learning_rate": 3.7652462745318934e-06, + "loss": 0.8852, + "step": 292 + }, + { + "epoch": 0.02113920854226038, + "grad_norm": 3.609034270350309, + "learning_rate": 3.767513882010144e-06, + "loss": 1.0707, + "step": 293 + }, + { + "epoch": 0.02121135601168789, + "grad_norm": 3.2612557270998397, + "learning_rate": 3.7697737633832046e-06, + "loss": 1.0682, + "step": 294 + }, + { + "epoch": 0.0212835034811154, + "grad_norm": 3.176192328579912, + "learning_rate": 3.772025971120533e-06, + "loss": 0.979, + "step": 295 + }, + { + "epoch": 0.02135565095054291, + "grad_norm": 1.3688764766280725, + "learning_rate": 3.774270557158899e-06, + "loss": 0.8708, + "step": 296 + }, + { + "epoch": 0.02142779841997042, + "grad_norm": 3.204106622405331, + "learning_rate": 3.7765075729095725e-06, + "loss": 1.0121, + "step": 297 + }, + { + "epoch": 0.02149994588939793, + "grad_norm": 3.3212273268666923, + "learning_rate": 3.7787370692653873e-06, + "loss": 1.0515, + "step": 298 + }, + { + "epoch": 0.02157209335882544, + "grad_norm": 7.261853754964782, + "learning_rate": 3.7809590966076912e-06, + "loss": 0.9064, + "step": 299 + }, + { + "epoch": 0.021644240828252947, + "grad_norm": 3.0619194881884857, + "learning_rate": 3.7831737048131736e-06, + "loss": 0.9823, + "step": 300 + }, + { + "epoch": 0.02171638829768046, + "grad_norm": 5.0970220061619145, + "learning_rate": 3.785380943260587e-06, + "loss": 1.0074, + "step": 301 + }, + { + "epoch": 0.02178853576710797, + "grad_norm": 1.1789302795671168, + "learning_rate": 3.7875808608373505e-06, + "loss": 0.8139, + "step": 302 + }, + { + "epoch": 0.021860683236535478, + "grad_norm": 23.744758107794155, + "learning_rate": 3.7897735059460465e-06, + "loss": 0.9875, + "step": 303 + }, + { + "epoch": 0.02193283070596299, + "grad_norm": 2.5370979103258096, + "learning_rate": 3.7919589265108135e-06, + "loss": 1.033, + "step": 304 + }, + { + "epoch": 0.0220049781753905, + "grad_norm": 3.269569606396502, + "learning_rate": 3.7941371699836275e-06, + "loss": 0.9654, + "step": 305 + }, + { + "epoch": 0.022077125644818008, + "grad_norm": 4.087955479806845, + "learning_rate": 3.7963082833504866e-06, + "loss": 1.0362, + "step": 306 + }, + { + "epoch": 0.022149273114245516, + "grad_norm": 37.44871982072201, + "learning_rate": 3.798472313137493e-06, + "loss": 1.1171, + "step": 307 + }, + { + "epoch": 0.02222142058367303, + "grad_norm": 14.406859544283444, + "learning_rate": 3.8006293054168325e-06, + "loss": 1.0174, + "step": 308 + }, + { + "epoch": 0.022293568053100538, + "grad_norm": 3.517878780616567, + "learning_rate": 3.802779305812662e-06, + "loss": 1.0076, + "step": 309 + }, + { + "epoch": 0.022365715522528046, + "grad_norm": 2.7773382559720625, + "learning_rate": 3.8049223595068993e-06, + "loss": 0.8796, + "step": 310 + }, + { + "epoch": 0.02243786299195556, + "grad_norm": 3.2357354735153225, + "learning_rate": 3.8070585112449207e-06, + "loss": 1.019, + "step": 311 + }, + { + "epoch": 0.022510010461383068, + "grad_norm": 3.8880546689291218, + "learning_rate": 3.809187805341167e-06, + "loss": 1.1053, + "step": 312 + }, + { + "epoch": 0.022582157930810576, + "grad_norm": 2.898558909708712, + "learning_rate": 3.8113102856846578e-06, + "loss": 0.9959, + "step": 313 + }, + { + "epoch": 0.022654305400238085, + "grad_norm": 6.690546500517423, + "learning_rate": 3.813425995744421e-06, + "loss": 1.0452, + "step": 314 + }, + { + "epoch": 0.022726452869665598, + "grad_norm": 2.884964302675158, + "learning_rate": 3.8155349785748344e-06, + "loss": 1.0239, + "step": 315 + }, + { + "epoch": 0.022798600339093106, + "grad_norm": 2.657082281432996, + "learning_rate": 3.817637276820883e-06, + "loss": 0.9754, + "step": 316 + }, + { + "epoch": 0.022870747808520615, + "grad_norm": 3.512986884531611, + "learning_rate": 3.81973293272333e-06, + "loss": 1.0313, + "step": 317 + }, + { + "epoch": 0.022942895277948128, + "grad_norm": 4.625970274727888, + "learning_rate": 3.821821988123818e-06, + "loss": 1.0954, + "step": 318 + }, + { + "epoch": 0.023015042747375637, + "grad_norm": 3.3895690961435996, + "learning_rate": 3.8239044844698675e-06, + "loss": 0.9693, + "step": 319 + }, + { + "epoch": 0.023087190216803145, + "grad_norm": 3.809884847267497, + "learning_rate": 3.8259804628198245e-06, + "loss": 1.0272, + "step": 320 + }, + { + "epoch": 0.023159337686230654, + "grad_norm": 3.2084664816702486, + "learning_rate": 3.828049963847712e-06, + "loss": 1.0377, + "step": 321 + }, + { + "epoch": 0.023231485155658167, + "grad_norm": 5.012660591425873, + "learning_rate": 3.8301130278480095e-06, + "loss": 0.9691, + "step": 322 + }, + { + "epoch": 0.023303632625085675, + "grad_norm": 5.079003830203966, + "learning_rate": 3.832169694740365e-06, + "loss": 1.1348, + "step": 323 + }, + { + "epoch": 0.023375780094513184, + "grad_norm": 4.677686667945659, + "learning_rate": 3.834220004074225e-06, + "loss": 1.0607, + "step": 324 + }, + { + "epoch": 0.023447927563940693, + "grad_norm": 2.932495887018492, + "learning_rate": 3.836263995033405e-06, + "loss": 0.8702, + "step": 325 + }, + { + "epoch": 0.023520075033368205, + "grad_norm": 3.548269527059312, + "learning_rate": 3.838301706440575e-06, + "loss": 1.1385, + "step": 326 + }, + { + "epoch": 0.023592222502795714, + "grad_norm": 4.4246278423127015, + "learning_rate": 3.8403331767616854e-06, + "loss": 1.0345, + "step": 327 + }, + { + "epoch": 0.023664369972223223, + "grad_norm": 1.1544007127146327, + "learning_rate": 3.8423584441103306e-06, + "loss": 0.8102, + "step": 328 + }, + { + "epoch": 0.023736517441650735, + "grad_norm": 5.651146878670871, + "learning_rate": 3.844377546252031e-06, + "loss": 0.9457, + "step": 329 + }, + { + "epoch": 0.023808664911078244, + "grad_norm": 2.4839963804393697, + "learning_rate": 3.846390520608462e-06, + "loss": 1.1001, + "step": 330 + }, + { + "epoch": 0.023880812380505753, + "grad_norm": 10.1077768280065, + "learning_rate": 3.848397404261619e-06, + "loss": 0.991, + "step": 331 + }, + { + "epoch": 0.023952959849933262, + "grad_norm": 3.143087152635138, + "learning_rate": 3.850398233957907e-06, + "loss": 1.0311, + "step": 332 + }, + { + "epoch": 0.024025107319360774, + "grad_norm": 6.043035663414281, + "learning_rate": 3.85239304611219e-06, + "loss": 1.008, + "step": 333 + }, + { + "epoch": 0.024097254788788283, + "grad_norm": 4.290202560800051, + "learning_rate": 3.854381876811756e-06, + "loss": 1.0652, + "step": 334 + }, + { + "epoch": 0.024169402258215792, + "grad_norm": 3.1495559258262467, + "learning_rate": 3.856364761820241e-06, + "loss": 1.0198, + "step": 335 + }, + { + "epoch": 0.024241549727643304, + "grad_norm": 3.7796240218951946, + "learning_rate": 3.858341736581486e-06, + "loss": 0.9453, + "step": 336 + }, + { + "epoch": 0.024313697197070813, + "grad_norm": 2.116264113746982, + "learning_rate": 3.860312836223339e-06, + "loss": 1.0316, + "step": 337 + }, + { + "epoch": 0.024385844666498322, + "grad_norm": 4.419474135311851, + "learning_rate": 3.862278095561399e-06, + "loss": 1.0204, + "step": 338 + }, + { + "epoch": 0.02445799213592583, + "grad_norm": 3.5735567073846126, + "learning_rate": 3.864237549102705e-06, + "loss": 1.0992, + "step": 339 + }, + { + "epoch": 0.024530139605353343, + "grad_norm": 3.903680062817253, + "learning_rate": 3.866191231049377e-06, + "loss": 0.998, + "step": 340 + }, + { + "epoch": 0.024602287074780852, + "grad_norm": 4.646842516212853, + "learning_rate": 3.868139175302188e-06, + "loss": 1.0899, + "step": 341 + }, + { + "epoch": 0.02467443454420836, + "grad_norm": 4.588710797671604, + "learning_rate": 3.870081415464103e-06, + "loss": 1.0213, + "step": 342 + }, + { + "epoch": 0.024746582013635873, + "grad_norm": 3.4611482922016434, + "learning_rate": 3.872017984843755e-06, + "loss": 0.9409, + "step": 343 + }, + { + "epoch": 0.024818729483063382, + "grad_norm": 4.305407455248909, + "learning_rate": 3.8739489164588685e-06, + "loss": 0.9707, + "step": 344 + }, + { + "epoch": 0.02489087695249089, + "grad_norm": 4.251218733230911, + "learning_rate": 3.87587424303964e-06, + "loss": 1.018, + "step": 345 + }, + { + "epoch": 0.0249630244219184, + "grad_norm": 1.1122340045069452, + "learning_rate": 3.877793997032067e-06, + "loss": 0.786, + "step": 346 + }, + { + "epoch": 0.025035171891345912, + "grad_norm": 3.1818535852320426, + "learning_rate": 3.879708210601229e-06, + "loss": 0.9195, + "step": 347 + }, + { + "epoch": 0.02510731936077342, + "grad_norm": 3.202250440345107, + "learning_rate": 3.881616915634521e-06, + "loss": 1.0037, + "step": 348 + }, + { + "epoch": 0.02517946683020093, + "grad_norm": 3.902404357011841, + "learning_rate": 3.883520143744839e-06, + "loss": 0.948, + "step": 349 + }, + { + "epoch": 0.025251614299628442, + "grad_norm": 4.298734727162936, + "learning_rate": 3.885417926273724e-06, + "loss": 1.1219, + "step": 350 + }, + { + "epoch": 0.02532376176905595, + "grad_norm": 4.010121414743739, + "learning_rate": 3.887310294294458e-06, + "loss": 1.087, + "step": 351 + }, + { + "epoch": 0.02539590923848346, + "grad_norm": 5.107486294108564, + "learning_rate": 3.889197278615114e-06, + "loss": 1.0901, + "step": 352 + }, + { + "epoch": 0.02546805670791097, + "grad_norm": 4.41390521014006, + "learning_rate": 3.89107890978157e-06, + "loss": 0.9954, + "step": 353 + }, + { + "epoch": 0.02554020417733848, + "grad_norm": 2.8912636541101064, + "learning_rate": 3.8929552180804756e-06, + "loss": 1.0338, + "step": 354 + }, + { + "epoch": 0.02561235164676599, + "grad_norm": 1.0247629915090022, + "learning_rate": 3.894826233542171e-06, + "loss": 0.8739, + "step": 355 + }, + { + "epoch": 0.0256844991161935, + "grad_norm": 2.92006998866611, + "learning_rate": 3.896691985943577e-06, + "loss": 1.0112, + "step": 356 + }, + { + "epoch": 0.025756646585621008, + "grad_norm": 3.5601840605998336, + "learning_rate": 3.898552504811037e-06, + "loss": 1.022, + "step": 357 + }, + { + "epoch": 0.02582879405504852, + "grad_norm": 3.528568701469996, + "learning_rate": 3.900407819423122e-06, + "loss": 0.8284, + "step": 358 + }, + { + "epoch": 0.02590094152447603, + "grad_norm": 3.9320039690216926, + "learning_rate": 3.90225795881339e-06, + "loss": 1.0449, + "step": 359 + }, + { + "epoch": 0.025973088993903538, + "grad_norm": 2.8488032120358473, + "learning_rate": 3.9041029517731164e-06, + "loss": 1.0556, + "step": 360 + }, + { + "epoch": 0.02604523646333105, + "grad_norm": 3.004198950507192, + "learning_rate": 3.9059428268539815e-06, + "loss": 1.0396, + "step": 361 + }, + { + "epoch": 0.02611738393275856, + "grad_norm": 4.245302633724043, + "learning_rate": 3.907777612370725e-06, + "loss": 1.0515, + "step": 362 + }, + { + "epoch": 0.026189531402186068, + "grad_norm": 3.310350836516824, + "learning_rate": 3.909607336403751e-06, + "loss": 1.0361, + "step": 363 + }, + { + "epoch": 0.026261678871613577, + "grad_norm": 4.401549300613085, + "learning_rate": 3.911432026801718e-06, + "loss": 1.0992, + "step": 364 + }, + { + "epoch": 0.02633382634104109, + "grad_norm": 3.978145861215125, + "learning_rate": 3.913251711184074e-06, + "loss": 1.1381, + "step": 365 + }, + { + "epoch": 0.026405973810468598, + "grad_norm": 2.111423769045194, + "learning_rate": 3.9150664169435694e-06, + "loss": 0.8374, + "step": 366 + }, + { + "epoch": 0.026478121279896107, + "grad_norm": 4.110730545941669, + "learning_rate": 3.9168761712487285e-06, + "loss": 1.0822, + "step": 367 + }, + { + "epoch": 0.02655026874932362, + "grad_norm": 3.268926676247887, + "learning_rate": 3.9186810010462915e-06, + "loss": 1.0196, + "step": 368 + }, + { + "epoch": 0.026622416218751128, + "grad_norm": 3.028631278626535, + "learning_rate": 3.920480933063622e-06, + "loss": 1.0926, + "step": 369 + }, + { + "epoch": 0.026694563688178637, + "grad_norm": 2.4960269829502, + "learning_rate": 3.92227599381108e-06, + "loss": 1.0175, + "step": 370 + }, + { + "epoch": 0.026766711157606145, + "grad_norm": 3.336331132093101, + "learning_rate": 3.924066209584369e-06, + "loss": 1.0124, + "step": 371 + }, + { + "epoch": 0.026838858627033658, + "grad_norm": 1.435898178608234, + "learning_rate": 3.925851606466841e-06, + "loss": 0.8705, + "step": 372 + }, + { + "epoch": 0.026911006096461167, + "grad_norm": 3.2221861340339792, + "learning_rate": 3.9276322103317845e-06, + "loss": 0.9245, + "step": 373 + }, + { + "epoch": 0.026983153565888676, + "grad_norm": 3.4132333267979296, + "learning_rate": 3.929408046844666e-06, + "loss": 1.0464, + "step": 374 + }, + { + "epoch": 0.027055301035316188, + "grad_norm": 4.047525832227489, + "learning_rate": 3.931179141465354e-06, + "loss": 1.1043, + "step": 375 + }, + { + "epoch": 0.027127448504743697, + "grad_norm": 4.10208748500311, + "learning_rate": 3.932945519450312e-06, + "loss": 0.9937, + "step": 376 + }, + { + "epoch": 0.027199595974171206, + "grad_norm": 2.427680262778126, + "learning_rate": 3.934707205854753e-06, + "loss": 0.9947, + "step": 377 + }, + { + "epoch": 0.027271743443598714, + "grad_norm": 4.9887203779036495, + "learning_rate": 3.936464225534777e-06, + "loss": 1.0136, + "step": 378 + }, + { + "epoch": 0.027343890913026227, + "grad_norm": 2.7079656400261536, + "learning_rate": 3.9382166031494725e-06, + "loss": 1.0383, + "step": 379 + }, + { + "epoch": 0.027416038382453736, + "grad_norm": 4.030320171198939, + "learning_rate": 3.939964363162994e-06, + "loss": 1.0287, + "step": 380 + }, + { + "epoch": 0.027488185851881244, + "grad_norm": 3.310538808987224, + "learning_rate": 3.941707529846612e-06, + "loss": 1.0144, + "step": 381 + }, + { + "epoch": 0.027560333321308757, + "grad_norm": 3.43783040626231, + "learning_rate": 3.943446127280736e-06, + "loss": 1.0916, + "step": 382 + }, + { + "epoch": 0.027632480790736266, + "grad_norm": 2.9633751650912497, + "learning_rate": 3.945180179356906e-06, + "loss": 0.9643, + "step": 383 + }, + { + "epoch": 0.027704628260163774, + "grad_norm": 3.7423861456850664, + "learning_rate": 3.946909709779767e-06, + "loss": 1.0738, + "step": 384 + }, + { + "epoch": 0.027776775729591283, + "grad_norm": 1.127450340767571, + "learning_rate": 3.9486347420690135e-06, + "loss": 0.8144, + "step": 385 + }, + { + "epoch": 0.027848923199018796, + "grad_norm": 2.706248777182217, + "learning_rate": 3.950355299561303e-06, + "loss": 1.0048, + "step": 386 + }, + { + "epoch": 0.027921070668446304, + "grad_norm": 3.669656546006644, + "learning_rate": 3.95207140541216e-06, + "loss": 1.0089, + "step": 387 + }, + { + "epoch": 0.027993218137873813, + "grad_norm": 6.746811290191035, + "learning_rate": 3.953783082597833e-06, + "loss": 0.9817, + "step": 388 + }, + { + "epoch": 0.028065365607301322, + "grad_norm": 3.5594703775409213, + "learning_rate": 3.955490353917153e-06, + "loss": 1.0616, + "step": 389 + }, + { + "epoch": 0.028137513076728835, + "grad_norm": 2.7191579227694667, + "learning_rate": 3.957193241993348e-06, + "loss": 0.9999, + "step": 390 + }, + { + "epoch": 0.028209660546156343, + "grad_norm": 3.871507362726249, + "learning_rate": 3.9588917692758435e-06, + "loss": 1.0558, + "step": 391 + }, + { + "epoch": 0.028281808015583852, + "grad_norm": 2.6146073561903473, + "learning_rate": 3.9605859580420365e-06, + "loss": 1.0193, + "step": 392 + }, + { + "epoch": 0.028353955485011365, + "grad_norm": 3.8182268748989605, + "learning_rate": 3.962275830399056e-06, + "loss": 0.9701, + "step": 393 + }, + { + "epoch": 0.028426102954438873, + "grad_norm": 2.9759671474407248, + "learning_rate": 3.963961408285482e-06, + "loss": 0.9959, + "step": 394 + }, + { + "epoch": 0.028498250423866382, + "grad_norm": 2.25018765408736, + "learning_rate": 3.965642713473064e-06, + "loss": 1.0111, + "step": 395 + }, + { + "epoch": 0.02857039789329389, + "grad_norm": 3.998337757913278, + "learning_rate": 3.967319767568405e-06, + "loss": 1.0172, + "step": 396 + }, + { + "epoch": 0.028642545362721403, + "grad_norm": 2.481415527357153, + "learning_rate": 3.968992592014627e-06, + "loss": 0.9926, + "step": 397 + }, + { + "epoch": 0.028714692832148912, + "grad_norm": 3.559744857046025, + "learning_rate": 3.9706612080930185e-06, + "loss": 1.0701, + "step": 398 + }, + { + "epoch": 0.02878684030157642, + "grad_norm": 9.7566787146557, + "learning_rate": 3.972325636924655e-06, + "loss": 0.9761, + "step": 399 + }, + { + "epoch": 0.028858987771003933, + "grad_norm": 5.0506109340241965, + "learning_rate": 3.973985899472006e-06, + "loss": 0.9505, + "step": 400 + }, + { + "epoch": 0.028931135240431442, + "grad_norm": 3.211852303012956, + "learning_rate": 3.975642016540519e-06, + "loss": 1.0662, + "step": 401 + }, + { + "epoch": 0.02900328270985895, + "grad_norm": 4.511601856913752, + "learning_rate": 3.977294008780183e-06, + "loss": 1.0469, + "step": 402 + }, + { + "epoch": 0.02907543017928646, + "grad_norm": 5.837578926941501, + "learning_rate": 3.978941896687073e-06, + "loss": 0.9349, + "step": 403 + }, + { + "epoch": 0.029147577648713972, + "grad_norm": 3.064219400842411, + "learning_rate": 3.98058570060488e-06, + "loss": 0.9776, + "step": 404 + }, + { + "epoch": 0.02921972511814148, + "grad_norm": 3.4962125449003603, + "learning_rate": 3.982225440726407e-06, + "loss": 1.0209, + "step": 405 + }, + { + "epoch": 0.02929187258756899, + "grad_norm": 3.238033713162308, + "learning_rate": 3.983861137095072e-06, + "loss": 1.0131, + "step": 406 + }, + { + "epoch": 0.029364020056996502, + "grad_norm": 3.628624649537205, + "learning_rate": 3.985492809606368e-06, + "loss": 0.9384, + "step": 407 + }, + { + "epoch": 0.02943616752642401, + "grad_norm": 2.8666183364106055, + "learning_rate": 3.987120478009319e-06, + "loss": 1.0726, + "step": 408 + }, + { + "epoch": 0.02950831499585152, + "grad_norm": 3.1210147623277082, + "learning_rate": 3.988744161907911e-06, + "loss": 1.0157, + "step": 409 + }, + { + "epoch": 0.02958046246527903, + "grad_norm": 2.7674607902857526, + "learning_rate": 3.990363880762511e-06, + "loss": 1.0015, + "step": 410 + }, + { + "epoch": 0.02965260993470654, + "grad_norm": 5.971605826020118, + "learning_rate": 3.991979653891269e-06, + "loss": 0.9931, + "step": 411 + }, + { + "epoch": 0.02972475740413405, + "grad_norm": 1.9420427165612522, + "learning_rate": 3.993591500471494e-06, + "loss": 0.84, + "step": 412 + }, + { + "epoch": 0.02979690487356156, + "grad_norm": 3.348293034671953, + "learning_rate": 3.995199439541027e-06, + "loss": 1.0221, + "step": 413 + }, + { + "epoch": 0.02986905234298907, + "grad_norm": 3.332289262019287, + "learning_rate": 3.996803489999582e-06, + "loss": 1.0708, + "step": 414 + }, + { + "epoch": 0.02994119981241658, + "grad_norm": 3.9712619791639687, + "learning_rate": 3.998403670610088e-06, + "loss": 1.1818, + "step": 415 + }, + { + "epoch": 0.03001334728184409, + "grad_norm": 11.80058858664308, + "learning_rate": 3.999999999999999e-06, + "loss": 1.0393, + "step": 416 + }, + { + "epoch": 0.030085494751271598, + "grad_norm": 3.405653175073639, + "learning_rate": 3.999999945393702e-06, + "loss": 1.0346, + "step": 417 + }, + { + "epoch": 0.03015764222069911, + "grad_norm": 4.134437630914969, + "learning_rate": 3.999999781574811e-06, + "loss": 0.9493, + "step": 418 + }, + { + "epoch": 0.03022978969012662, + "grad_norm": 1.2539534515338224, + "learning_rate": 3.9999995085433345e-06, + "loss": 0.93, + "step": 419 + }, + { + "epoch": 0.030301937159554128, + "grad_norm": 4.188559099319384, + "learning_rate": 3.999999126299289e-06, + "loss": 1.0531, + "step": 420 + }, + { + "epoch": 0.03037408462898164, + "grad_norm": 4.26119798927872, + "learning_rate": 3.999998634842696e-06, + "loss": 1.0623, + "step": 421 + }, + { + "epoch": 0.03044623209840915, + "grad_norm": 2.676190644291528, + "learning_rate": 3.99999803417358e-06, + "loss": 1.0316, + "step": 422 + }, + { + "epoch": 0.030518379567836658, + "grad_norm": 6.500031397817046, + "learning_rate": 3.9999973242919755e-06, + "loss": 1.0062, + "step": 423 + }, + { + "epoch": 0.030590527037264167, + "grad_norm": 3.4153339453935, + "learning_rate": 3.999996505197922e-06, + "loss": 0.9884, + "step": 424 + }, + { + "epoch": 0.03066267450669168, + "grad_norm": 3.086165871197771, + "learning_rate": 3.999995576891461e-06, + "loss": 1.0753, + "step": 425 + }, + { + "epoch": 0.030734821976119188, + "grad_norm": 5.102551865265039, + "learning_rate": 3.999994539372647e-06, + "loss": 1.103, + "step": 426 + }, + { + "epoch": 0.030806969445546697, + "grad_norm": 7.539622864132503, + "learning_rate": 3.999993392641535e-06, + "loss": 0.8564, + "step": 427 + }, + { + "epoch": 0.030879116914974206, + "grad_norm": 4.34897809357256, + "learning_rate": 3.9999921366981865e-06, + "loss": 1.0262, + "step": 428 + }, + { + "epoch": 0.030951264384401718, + "grad_norm": 3.191217834812609, + "learning_rate": 3.9999907715426715e-06, + "loss": 1.1452, + "step": 429 + }, + { + "epoch": 0.031023411853829227, + "grad_norm": 0.8176915031369522, + "learning_rate": 3.999989297175064e-06, + "loss": 0.8528, + "step": 430 + }, + { + "epoch": 0.031095559323256736, + "grad_norm": 5.48906331310706, + "learning_rate": 3.999987713595445e-06, + "loss": 1.071, + "step": 431 + }, + { + "epoch": 0.031167706792684248, + "grad_norm": 2.738626281506756, + "learning_rate": 3.999986020803901e-06, + "loss": 1.0515, + "step": 432 + }, + { + "epoch": 0.031239854262111757, + "grad_norm": 3.7385366067597903, + "learning_rate": 3.999984218800523e-06, + "loss": 1.0892, + "step": 433 + }, + { + "epoch": 0.031312001731539266, + "grad_norm": 3.289426516144531, + "learning_rate": 3.999982307585412e-06, + "loss": 1.0283, + "step": 434 + }, + { + "epoch": 0.03138414920096678, + "grad_norm": 3.464939533273024, + "learning_rate": 3.99998028715867e-06, + "loss": 1.0042, + "step": 435 + }, + { + "epoch": 0.03145629667039428, + "grad_norm": 4.541635122393799, + "learning_rate": 3.999978157520408e-06, + "loss": 1.0089, + "step": 436 + }, + { + "epoch": 0.031528444139821796, + "grad_norm": 6.323686111950883, + "learning_rate": 3.9999759186707424e-06, + "loss": 0.9299, + "step": 437 + }, + { + "epoch": 0.03160059160924931, + "grad_norm": 1.0178896377749584, + "learning_rate": 3.999973570609796e-06, + "loss": 0.8379, + "step": 438 + }, + { + "epoch": 0.03167273907867681, + "grad_norm": 6.217540834275148, + "learning_rate": 3.9999711133376955e-06, + "loss": 1.022, + "step": 439 + }, + { + "epoch": 0.031744886548104326, + "grad_norm": 4.064924719607267, + "learning_rate": 3.999968546854578e-06, + "loss": 1.073, + "step": 440 + }, + { + "epoch": 0.03181703401753184, + "grad_norm": 4.752716492322731, + "learning_rate": 3.999965871160581e-06, + "loss": 0.9734, + "step": 441 + }, + { + "epoch": 0.03188918148695934, + "grad_norm": 3.3747177005959803, + "learning_rate": 3.9999630862558505e-06, + "loss": 1.0126, + "step": 442 + }, + { + "epoch": 0.031961328956386856, + "grad_norm": 3.300453449653189, + "learning_rate": 3.99996019214054e-06, + "loss": 1.0888, + "step": 443 + }, + { + "epoch": 0.03203347642581436, + "grad_norm": 3.3517787481227317, + "learning_rate": 3.9999571888148075e-06, + "loss": 0.9922, + "step": 444 + }, + { + "epoch": 0.032105623895241873, + "grad_norm": 5.495459155961298, + "learning_rate": 3.999954076278817e-06, + "loss": 1.0309, + "step": 445 + }, + { + "epoch": 0.032177771364669386, + "grad_norm": 4.248542760209334, + "learning_rate": 3.999950854532737e-06, + "loss": 1.0402, + "step": 446 + }, + { + "epoch": 0.03224991883409689, + "grad_norm": 6.519577474184545, + "learning_rate": 3.999947523576744e-06, + "loss": 1.0185, + "step": 447 + }, + { + "epoch": 0.032322066303524404, + "grad_norm": 3.640303175949716, + "learning_rate": 3.999944083411021e-06, + "loss": 0.9456, + "step": 448 + }, + { + "epoch": 0.032394213772951916, + "grad_norm": 4.324362320829688, + "learning_rate": 3.999940534035755e-06, + "loss": 1.1152, + "step": 449 + }, + { + "epoch": 0.03246636124237942, + "grad_norm": 3.9426770765442134, + "learning_rate": 3.9999368754511396e-06, + "loss": 1.03, + "step": 450 + }, + { + "epoch": 0.032538508711806934, + "grad_norm": 5.257395666867584, + "learning_rate": 3.999933107657376e-06, + "loss": 0.9601, + "step": 451 + }, + { + "epoch": 0.032610656181234446, + "grad_norm": 3.062878176636242, + "learning_rate": 3.999929230654668e-06, + "loss": 0.9582, + "step": 452 + }, + { + "epoch": 0.03268280365066195, + "grad_norm": 1.069818034430579, + "learning_rate": 3.999925244443229e-06, + "loss": 0.8325, + "step": 453 + }, + { + "epoch": 0.032754951120089464, + "grad_norm": 4.035515236557447, + "learning_rate": 3.999921149023275e-06, + "loss": 1.0167, + "step": 454 + }, + { + "epoch": 0.032827098589516976, + "grad_norm": 4.3630642064275404, + "learning_rate": 3.99991694439503e-06, + "loss": 1.0244, + "step": 455 + }, + { + "epoch": 0.03289924605894448, + "grad_norm": 11.447906595954018, + "learning_rate": 3.999912630558725e-06, + "loss": 0.9174, + "step": 456 + }, + { + "epoch": 0.032971393528371994, + "grad_norm": 3.391898623571024, + "learning_rate": 3.999908207514595e-06, + "loss": 1.034, + "step": 457 + }, + { + "epoch": 0.0330435409977995, + "grad_norm": 4.265342619216159, + "learning_rate": 3.99990367526288e-06, + "loss": 1.057, + "step": 458 + }, + { + "epoch": 0.03311568846722701, + "grad_norm": 7.960273333126566, + "learning_rate": 3.99989903380383e-06, + "loss": 1.0168, + "step": 459 + }, + { + "epoch": 0.033187835936654524, + "grad_norm": 2.8835906345760907, + "learning_rate": 3.999894283137696e-06, + "loss": 1.0474, + "step": 460 + }, + { + "epoch": 0.03325998340608203, + "grad_norm": 4.503827057114915, + "learning_rate": 3.999889423264739e-06, + "loss": 1.0566, + "step": 461 + }, + { + "epoch": 0.03333213087550954, + "grad_norm": 3.204822816042497, + "learning_rate": 3.999884454185224e-06, + "loss": 1.0887, + "step": 462 + }, + { + "epoch": 0.033404278344937054, + "grad_norm": 2.624196974223064, + "learning_rate": 3.999879375899423e-06, + "loss": 1.1138, + "step": 463 + }, + { + "epoch": 0.03347642581436456, + "grad_norm": 2.201122049818738, + "learning_rate": 3.999874188407612e-06, + "loss": 1.1079, + "step": 464 + }, + { + "epoch": 0.03354857328379207, + "grad_norm": 2.922494588019693, + "learning_rate": 3.999868891710075e-06, + "loss": 0.9639, + "step": 465 + }, + { + "epoch": 0.033620720753219584, + "grad_norm": 3.143430756575802, + "learning_rate": 3.9998634858071004e-06, + "loss": 1.0258, + "step": 466 + }, + { + "epoch": 0.03369286822264709, + "grad_norm": 4.3148478875251755, + "learning_rate": 3.9998579706989845e-06, + "loss": 1.0052, + "step": 467 + }, + { + "epoch": 0.0337650156920746, + "grad_norm": 4.312616319055158, + "learning_rate": 3.999852346386028e-06, + "loss": 0.998, + "step": 468 + }, + { + "epoch": 0.033837163161502114, + "grad_norm": 3.610758726744792, + "learning_rate": 3.999846612868539e-06, + "loss": 0.9625, + "step": 469 + }, + { + "epoch": 0.03390931063092962, + "grad_norm": 3.13998573591552, + "learning_rate": 3.999840770146829e-06, + "loss": 0.9108, + "step": 470 + }, + { + "epoch": 0.03398145810035713, + "grad_norm": 9.469446137776485, + "learning_rate": 3.999834818221216e-06, + "loss": 0.9807, + "step": 471 + }, + { + "epoch": 0.03405360556978464, + "grad_norm": 2.3812880345308205, + "learning_rate": 3.999828757092029e-06, + "loss": 1.0083, + "step": 472 + }, + { + "epoch": 0.03412575303921215, + "grad_norm": 3.8782062186958863, + "learning_rate": 3.999822586759595e-06, + "loss": 0.9643, + "step": 473 + }, + { + "epoch": 0.03419790050863966, + "grad_norm": 2.108605138374095, + "learning_rate": 3.999816307224254e-06, + "loss": 1.0659, + "step": 474 + }, + { + "epoch": 0.03427004797806717, + "grad_norm": 4.232846367729726, + "learning_rate": 3.999809918486347e-06, + "loss": 1.0384, + "step": 475 + }, + { + "epoch": 0.03434219544749468, + "grad_norm": 2.5314500517734935, + "learning_rate": 3.999803420546223e-06, + "loss": 0.9496, + "step": 476 + }, + { + "epoch": 0.03441434291692219, + "grad_norm": 5.618179764204331, + "learning_rate": 3.999796813404238e-06, + "loss": 1.0625, + "step": 477 + }, + { + "epoch": 0.0344864903863497, + "grad_norm": 4.0055132064896855, + "learning_rate": 3.999790097060752e-06, + "loss": 1.0049, + "step": 478 + }, + { + "epoch": 0.03455863785577721, + "grad_norm": 5.001487701342929, + "learning_rate": 3.999783271516131e-06, + "loss": 1.04, + "step": 479 + }, + { + "epoch": 0.03463078532520472, + "grad_norm": 4.896007561796245, + "learning_rate": 3.99977633677075e-06, + "loss": 0.9604, + "step": 480 + }, + { + "epoch": 0.03470293279463223, + "grad_norm": 10.510555229874127, + "learning_rate": 3.999769292824984e-06, + "loss": 1.0924, + "step": 481 + }, + { + "epoch": 0.03477508026405974, + "grad_norm": 2.4028565890617894, + "learning_rate": 3.9997621396792215e-06, + "loss": 1.0866, + "step": 482 + }, + { + "epoch": 0.034847227733487245, + "grad_norm": 6.4706512458303616, + "learning_rate": 3.99975487733385e-06, + "loss": 1.0371, + "step": 483 + }, + { + "epoch": 0.03491937520291476, + "grad_norm": 3.379267643471493, + "learning_rate": 3.999747505789269e-06, + "loss": 0.9668, + "step": 484 + }, + { + "epoch": 0.03499152267234227, + "grad_norm": 6.281163760679749, + "learning_rate": 3.999740025045879e-06, + "loss": 0.9936, + "step": 485 + }, + { + "epoch": 0.035063670141769775, + "grad_norm": 19.87897494891352, + "learning_rate": 3.9997324351040885e-06, + "loss": 1.0793, + "step": 486 + }, + { + "epoch": 0.03513581761119729, + "grad_norm": 7.492366211861024, + "learning_rate": 3.999724735964313e-06, + "loss": 0.9857, + "step": 487 + }, + { + "epoch": 0.0352079650806248, + "grad_norm": 1.2838993129053038, + "learning_rate": 3.999716927626972e-06, + "loss": 0.8345, + "step": 488 + }, + { + "epoch": 0.035280112550052305, + "grad_norm": 4.665320860856326, + "learning_rate": 3.999709010092493e-06, + "loss": 0.908, + "step": 489 + }, + { + "epoch": 0.03535226001947982, + "grad_norm": 4.3661003056116465, + "learning_rate": 3.999700983361307e-06, + "loss": 1.069, + "step": 490 + }, + { + "epoch": 0.03542440748890733, + "grad_norm": 4.514880689901226, + "learning_rate": 3.999692847433853e-06, + "loss": 1.0533, + "step": 491 + }, + { + "epoch": 0.035496554958334835, + "grad_norm": 4.197531683270022, + "learning_rate": 3.999684602310575e-06, + "loss": 1.0111, + "step": 492 + }, + { + "epoch": 0.03556870242776235, + "grad_norm": 4.65061989210679, + "learning_rate": 3.999676247991924e-06, + "loss": 0.9687, + "step": 493 + }, + { + "epoch": 0.03564084989718986, + "grad_norm": 5.705921229640086, + "learning_rate": 3.999667784478355e-06, + "loss": 1.0634, + "step": 494 + }, + { + "epoch": 0.035712997366617365, + "grad_norm": 5.89070886203153, + "learning_rate": 3.999659211770331e-06, + "loss": 1.0405, + "step": 495 + }, + { + "epoch": 0.03578514483604488, + "grad_norm": 1.3895919477356844, + "learning_rate": 3.99965052986832e-06, + "loss": 0.8456, + "step": 496 + }, + { + "epoch": 0.03585729230547238, + "grad_norm": 3.2778523610855657, + "learning_rate": 3.999641738772797e-06, + "loss": 1.052, + "step": 497 + }, + { + "epoch": 0.035929439774899895, + "grad_norm": 3.0543593598728953, + "learning_rate": 3.99963283848424e-06, + "loss": 1.0197, + "step": 498 + }, + { + "epoch": 0.03600158724432741, + "grad_norm": 4.307826951825596, + "learning_rate": 3.999623829003136e-06, + "loss": 1.0326, + "step": 499 + }, + { + "epoch": 0.03607373471375491, + "grad_norm": 4.956487146310678, + "learning_rate": 3.999614710329977e-06, + "loss": 1.1097, + "step": 500 + }, + { + "epoch": 0.036145882183182425, + "grad_norm": 1.16630864894381, + "learning_rate": 3.999605482465262e-06, + "loss": 0.857, + "step": 501 + }, + { + "epoch": 0.03621802965260994, + "grad_norm": 4.28332558320726, + "learning_rate": 3.999596145409493e-06, + "loss": 1.0171, + "step": 502 + }, + { + "epoch": 0.03629017712203744, + "grad_norm": 3.900560431683296, + "learning_rate": 3.999586699163181e-06, + "loss": 0.9936, + "step": 503 + }, + { + "epoch": 0.036362324591464955, + "grad_norm": 5.599443489760347, + "learning_rate": 3.999577143726841e-06, + "loss": 1.0557, + "step": 504 + }, + { + "epoch": 0.03643447206089247, + "grad_norm": 4.604581351038871, + "learning_rate": 3.999567479100996e-06, + "loss": 1.0711, + "step": 505 + }, + { + "epoch": 0.03650661953031997, + "grad_norm": 4.263980310863391, + "learning_rate": 3.9995577052861735e-06, + "loss": 0.9088, + "step": 506 + }, + { + "epoch": 0.036578766999747485, + "grad_norm": 4.633449568536902, + "learning_rate": 3.999547822282906e-06, + "loss": 0.9485, + "step": 507 + }, + { + "epoch": 0.03665091446917499, + "grad_norm": 4.9417090522959715, + "learning_rate": 3.999537830091734e-06, + "loss": 0.9968, + "step": 508 + }, + { + "epoch": 0.0367230619386025, + "grad_norm": 3.318654945865755, + "learning_rate": 3.999527728713204e-06, + "loss": 0.9962, + "step": 509 + }, + { + "epoch": 0.036795209408030015, + "grad_norm": 4.5565619300427, + "learning_rate": 3.9995175181478655e-06, + "loss": 0.9247, + "step": 510 + }, + { + "epoch": 0.03686735687745752, + "grad_norm": 5.072178384165034, + "learning_rate": 3.999507198396278e-06, + "loss": 0.9979, + "step": 511 + }, + { + "epoch": 0.03693950434688503, + "grad_norm": 2.696348925364303, + "learning_rate": 3.999496769459005e-06, + "loss": 0.9564, + "step": 512 + }, + { + "epoch": 0.037011651816312545, + "grad_norm": 3.7692420650893803, + "learning_rate": 3.999486231336614e-06, + "loss": 0.9156, + "step": 513 + }, + { + "epoch": 0.03708379928574005, + "grad_norm": 2.65903640904629, + "learning_rate": 3.9994755840296825e-06, + "loss": 1.0128, + "step": 514 + }, + { + "epoch": 0.03715594675516756, + "grad_norm": 1.255474310377777, + "learning_rate": 3.999464827538791e-06, + "loss": 0.8329, + "step": 515 + }, + { + "epoch": 0.037228094224595075, + "grad_norm": 5.9684013544485435, + "learning_rate": 3.999453961864527e-06, + "loss": 1.0381, + "step": 516 + }, + { + "epoch": 0.03730024169402258, + "grad_norm": 3.8797932174480962, + "learning_rate": 3.999442987007484e-06, + "loss": 0.953, + "step": 517 + }, + { + "epoch": 0.03737238916345009, + "grad_norm": 6.351143272988214, + "learning_rate": 3.999431902968261e-06, + "loss": 1.0452, + "step": 518 + }, + { + "epoch": 0.037444536632877605, + "grad_norm": 3.839313955615023, + "learning_rate": 3.999420709747464e-06, + "loss": 0.9952, + "step": 519 + }, + { + "epoch": 0.03751668410230511, + "grad_norm": 11.383393596826627, + "learning_rate": 3.999409407345703e-06, + "loss": 1.0496, + "step": 520 + }, + { + "epoch": 0.03758883157173262, + "grad_norm": 3.5642940621615464, + "learning_rate": 3.999397995763596e-06, + "loss": 0.9403, + "step": 521 + }, + { + "epoch": 0.03766097904116013, + "grad_norm": 6.149828354836036, + "learning_rate": 3.9993864750017664e-06, + "loss": 1.0249, + "step": 522 + }, + { + "epoch": 0.03773312651058764, + "grad_norm": 3.6715455252365152, + "learning_rate": 3.999374845060842e-06, + "loss": 1.1282, + "step": 523 + }, + { + "epoch": 0.03780527398001515, + "grad_norm": 4.725231241693425, + "learning_rate": 3.999363105941459e-06, + "loss": 1.0399, + "step": 524 + }, + { + "epoch": 0.03787742144944266, + "grad_norm": 4.538476902912099, + "learning_rate": 3.999351257644258e-06, + "loss": 0.981, + "step": 525 + }, + { + "epoch": 0.03794956891887017, + "grad_norm": 3.4025199135023128, + "learning_rate": 3.999339300169887e-06, + "loss": 0.9406, + "step": 526 + }, + { + "epoch": 0.03802171638829768, + "grad_norm": 5.535769927276976, + "learning_rate": 3.999327233518997e-06, + "loss": 1.0513, + "step": 527 + }, + { + "epoch": 0.03809386385772519, + "grad_norm": 4.328332338290173, + "learning_rate": 3.999315057692249e-06, + "loss": 0.9966, + "step": 528 + }, + { + "epoch": 0.0381660113271527, + "grad_norm": 3.68016771556309, + "learning_rate": 3.999302772690307e-06, + "loss": 1.0295, + "step": 529 + }, + { + "epoch": 0.03823815879658021, + "grad_norm": 6.170289427462244, + "learning_rate": 3.999290378513841e-06, + "loss": 1.0894, + "step": 530 + }, + { + "epoch": 0.03831030626600772, + "grad_norm": 4.046507829778052, + "learning_rate": 3.9992778751635285e-06, + "loss": 1.018, + "step": 531 + }, + { + "epoch": 0.03838245373543523, + "grad_norm": 3.2027854181179976, + "learning_rate": 3.999265262640053e-06, + "loss": 0.9245, + "step": 532 + }, + { + "epoch": 0.03845460120486274, + "grad_norm": 3.1119414046588783, + "learning_rate": 3.999252540944102e-06, + "loss": 0.966, + "step": 533 + }, + { + "epoch": 0.03852674867429025, + "grad_norm": 9.860828516198774, + "learning_rate": 3.999239710076371e-06, + "loss": 0.9482, + "step": 534 + }, + { + "epoch": 0.03859889614371776, + "grad_norm": 9.5615856257074, + "learning_rate": 3.99922677003756e-06, + "loss": 0.8396, + "step": 535 + }, + { + "epoch": 0.038671043613145266, + "grad_norm": 5.355472489570631, + "learning_rate": 3.999213720828377e-06, + "loss": 0.972, + "step": 536 + }, + { + "epoch": 0.03874319108257278, + "grad_norm": 3.0515745257459534, + "learning_rate": 3.999200562449532e-06, + "loss": 0.9549, + "step": 537 + }, + { + "epoch": 0.03881533855200029, + "grad_norm": 5.357842851954984, + "learning_rate": 3.999187294901747e-06, + "loss": 0.8428, + "step": 538 + }, + { + "epoch": 0.038887486021427796, + "grad_norm": 3.9941134053167855, + "learning_rate": 3.9991739181857435e-06, + "loss": 0.9943, + "step": 539 + }, + { + "epoch": 0.03895963349085531, + "grad_norm": 3.6136766042153683, + "learning_rate": 3.999160432302252e-06, + "loss": 1.0309, + "step": 540 + }, + { + "epoch": 0.03903178096028282, + "grad_norm": 6.313409720861366, + "learning_rate": 3.999146837252011e-06, + "loss": 1.0347, + "step": 541 + }, + { + "epoch": 0.039103928429710326, + "grad_norm": 4.7772613044168635, + "learning_rate": 3.999133133035762e-06, + "loss": 1.046, + "step": 542 + }, + { + "epoch": 0.03917607589913784, + "grad_norm": 3.7581772974163843, + "learning_rate": 3.999119319654252e-06, + "loss": 1.1024, + "step": 543 + }, + { + "epoch": 0.03924822336856535, + "grad_norm": 4.224798091600937, + "learning_rate": 3.999105397108238e-06, + "loss": 1.1123, + "step": 544 + }, + { + "epoch": 0.039320370837992856, + "grad_norm": 4.078171769966484, + "learning_rate": 3.999091365398478e-06, + "loss": 0.9956, + "step": 545 + }, + { + "epoch": 0.03939251830742037, + "grad_norm": 4.149761671093882, + "learning_rate": 3.999077224525739e-06, + "loss": 1.0416, + "step": 546 + }, + { + "epoch": 0.039464665776847874, + "grad_norm": 5.2320212466773866, + "learning_rate": 3.999062974490793e-06, + "loss": 1.0392, + "step": 547 + }, + { + "epoch": 0.039536813246275386, + "grad_norm": 4.5101365258641115, + "learning_rate": 3.999048615294418e-06, + "loss": 1.0623, + "step": 548 + }, + { + "epoch": 0.0396089607157029, + "grad_norm": 3.773540487522841, + "learning_rate": 3.999034146937399e-06, + "loss": 1.0488, + "step": 549 + }, + { + "epoch": 0.039681108185130404, + "grad_norm": 3.7581105579965963, + "learning_rate": 3.999019569420524e-06, + "loss": 1.057, + "step": 550 + }, + { + "epoch": 0.039753255654557916, + "grad_norm": 10.685293226346566, + "learning_rate": 3.999004882744592e-06, + "loss": 1.0765, + "step": 551 + }, + { + "epoch": 0.03982540312398543, + "grad_norm": 4.2196344649389, + "learning_rate": 3.998990086910403e-06, + "loss": 0.9681, + "step": 552 + }, + { + "epoch": 0.039897550593412934, + "grad_norm": 2.3537264274433767, + "learning_rate": 3.998975181918765e-06, + "loss": 1.0075, + "step": 553 + }, + { + "epoch": 0.039969698062840446, + "grad_norm": 6.65531348862215, + "learning_rate": 3.998960167770493e-06, + "loss": 1.0938, + "step": 554 + }, + { + "epoch": 0.04004184553226796, + "grad_norm": 3.46031827532213, + "learning_rate": 3.9989450444664054e-06, + "loss": 1.0666, + "step": 555 + }, + { + "epoch": 0.040113993001695464, + "grad_norm": 3.719762840504702, + "learning_rate": 3.998929812007329e-06, + "loss": 0.9379, + "step": 556 + }, + { + "epoch": 0.040186140471122976, + "grad_norm": 0.9609599847411505, + "learning_rate": 3.998914470394095e-06, + "loss": 0.8298, + "step": 557 + }, + { + "epoch": 0.04025828794055049, + "grad_norm": 3.7170374156514794, + "learning_rate": 3.998899019627542e-06, + "loss": 0.9933, + "step": 558 + }, + { + "epoch": 0.040330435409977994, + "grad_norm": 4.572545945326207, + "learning_rate": 3.998883459708513e-06, + "loss": 0.966, + "step": 559 + }, + { + "epoch": 0.040402582879405506, + "grad_norm": 5.74029318775162, + "learning_rate": 3.9988677906378575e-06, + "loss": 1.0265, + "step": 560 + }, + { + "epoch": 0.04047473034883301, + "grad_norm": 3.362682612682573, + "learning_rate": 3.998852012416432e-06, + "loss": 1.0145, + "step": 561 + }, + { + "epoch": 0.040546877818260524, + "grad_norm": 3.320766139736699, + "learning_rate": 3.998836125045097e-06, + "loss": 0.9858, + "step": 562 + }, + { + "epoch": 0.040619025287688036, + "grad_norm": 3.4942769535223563, + "learning_rate": 3.9988201285247216e-06, + "loss": 1.0983, + "step": 563 + }, + { + "epoch": 0.04069117275711554, + "grad_norm": 3.4356172434077523, + "learning_rate": 3.998804022856178e-06, + "loss": 1.0848, + "step": 564 + }, + { + "epoch": 0.040763320226543054, + "grad_norm": 3.9412019069845496, + "learning_rate": 3.998787808040345e-06, + "loss": 1.0353, + "step": 565 + }, + { + "epoch": 0.040835467695970566, + "grad_norm": 3.6023466038656418, + "learning_rate": 3.998771484078111e-06, + "loss": 1.0336, + "step": 566 + }, + { + "epoch": 0.04090761516539807, + "grad_norm": 5.226644974358626, + "learning_rate": 3.998755050970364e-06, + "loss": 1.0217, + "step": 567 + }, + { + "epoch": 0.040979762634825584, + "grad_norm": 3.952063135160165, + "learning_rate": 3.998738508718004e-06, + "loss": 1.0218, + "step": 568 + }, + { + "epoch": 0.041051910104253096, + "grad_norm": 3.3745710312238786, + "learning_rate": 3.998721857321932e-06, + "loss": 1.1757, + "step": 569 + }, + { + "epoch": 0.0411240575736806, + "grad_norm": 0.9776889560344535, + "learning_rate": 3.998705096783059e-06, + "loss": 0.7644, + "step": 570 + }, + { + "epoch": 0.041196205043108114, + "grad_norm": 4.393985209131501, + "learning_rate": 3.998688227102299e-06, + "loss": 1.1039, + "step": 571 + }, + { + "epoch": 0.041268352512535626, + "grad_norm": 7.256640057261645, + "learning_rate": 3.998671248280575e-06, + "loss": 1.0591, + "step": 572 + }, + { + "epoch": 0.04134049998196313, + "grad_norm": 4.016529500895939, + "learning_rate": 3.9986541603188114e-06, + "loss": 1.0267, + "step": 573 + }, + { + "epoch": 0.041412647451390644, + "grad_norm": 2.7642600791618364, + "learning_rate": 3.998636963217944e-06, + "loss": 1.0918, + "step": 574 + }, + { + "epoch": 0.04148479492081815, + "grad_norm": 3.4880425690844308, + "learning_rate": 3.9986196569789095e-06, + "loss": 1.0379, + "step": 575 + }, + { + "epoch": 0.04155694239024566, + "grad_norm": 3.037659151907756, + "learning_rate": 3.998602241602656e-06, + "loss": 0.9351, + "step": 576 + }, + { + "epoch": 0.041629089859673174, + "grad_norm": 3.5478417511648677, + "learning_rate": 3.998584717090132e-06, + "loss": 1.0594, + "step": 577 + }, + { + "epoch": 0.04170123732910068, + "grad_norm": 2.8365299667370456, + "learning_rate": 3.998567083442295e-06, + "loss": 0.9928, + "step": 578 + }, + { + "epoch": 0.04177338479852819, + "grad_norm": 3.141893121330555, + "learning_rate": 3.998549340660107e-06, + "loss": 1.0496, + "step": 579 + }, + { + "epoch": 0.041845532267955704, + "grad_norm": 2.559204121941517, + "learning_rate": 3.998531488744538e-06, + "loss": 1.0064, + "step": 580 + }, + { + "epoch": 0.04191767973738321, + "grad_norm": 4.0751380943817415, + "learning_rate": 3.998513527696564e-06, + "loss": 1.0838, + "step": 581 + }, + { + "epoch": 0.04198982720681072, + "grad_norm": 1.7671120382128926, + "learning_rate": 3.9984954575171634e-06, + "loss": 0.8538, + "step": 582 + }, + { + "epoch": 0.042061974676238234, + "grad_norm": 3.2650676050825163, + "learning_rate": 3.998477278207325e-06, + "loss": 1.0429, + "step": 583 + }, + { + "epoch": 0.04213412214566574, + "grad_norm": 3.8162985467306303, + "learning_rate": 3.9984589897680405e-06, + "loss": 1.0666, + "step": 584 + }, + { + "epoch": 0.04220626961509325, + "grad_norm": 3.3783853294713446, + "learning_rate": 3.998440592200308e-06, + "loss": 1.101, + "step": 585 + }, + { + "epoch": 0.04227841708452076, + "grad_norm": 4.950360317795158, + "learning_rate": 3.998422085505132e-06, + "loss": 0.7882, + "step": 586 + }, + { + "epoch": 0.04235056455394827, + "grad_norm": 3.3328435696970238, + "learning_rate": 3.998403469683525e-06, + "loss": 1.0567, + "step": 587 + }, + { + "epoch": 0.04242271202337578, + "grad_norm": 3.817008932389523, + "learning_rate": 3.998384744736502e-06, + "loss": 1.0579, + "step": 588 + }, + { + "epoch": 0.04249485949280329, + "grad_norm": 4.7495594824256715, + "learning_rate": 3.998365910665087e-06, + "loss": 0.9897, + "step": 589 + }, + { + "epoch": 0.0425670069622308, + "grad_norm": 4.283314228138947, + "learning_rate": 3.998346967470306e-06, + "loss": 0.9956, + "step": 590 + }, + { + "epoch": 0.04263915443165831, + "grad_norm": 5.1892336970216215, + "learning_rate": 3.9983279151531944e-06, + "loss": 0.9716, + "step": 591 + }, + { + "epoch": 0.04271130190108582, + "grad_norm": 6.764343281269474, + "learning_rate": 3.998308753714794e-06, + "loss": 1.0024, + "step": 592 + }, + { + "epoch": 0.04278344937051333, + "grad_norm": 5.820501910718335, + "learning_rate": 3.998289483156149e-06, + "loss": 0.9369, + "step": 593 + }, + { + "epoch": 0.04285559683994084, + "grad_norm": 4.246360791467616, + "learning_rate": 3.9982701034783136e-06, + "loss": 0.942, + "step": 594 + }, + { + "epoch": 0.04292774430936835, + "grad_norm": 3.473672209723518, + "learning_rate": 3.998250614682345e-06, + "loss": 0.9801, + "step": 595 + }, + { + "epoch": 0.04299989177879586, + "grad_norm": 3.910017470298821, + "learning_rate": 3.9982310167693075e-06, + "loss": 1.018, + "step": 596 + }, + { + "epoch": 0.04307203924822337, + "grad_norm": 3.4093673859816302, + "learning_rate": 3.998211309740272e-06, + "loss": 1.0312, + "step": 597 + }, + { + "epoch": 0.04314418671765088, + "grad_norm": 2.684098686234597, + "learning_rate": 3.9981914935963125e-06, + "loss": 1.087, + "step": 598 + }, + { + "epoch": 0.04321633418707839, + "grad_norm": 4.069698353304536, + "learning_rate": 3.998171568338514e-06, + "loss": 0.9763, + "step": 599 + }, + { + "epoch": 0.043288481656505895, + "grad_norm": 3.137370534710606, + "learning_rate": 3.9981515339679625e-06, + "loss": 0.9527, + "step": 600 + }, + { + "epoch": 0.04336062912593341, + "grad_norm": 3.9033405913283588, + "learning_rate": 3.998131390485753e-06, + "loss": 0.918, + "step": 601 + }, + { + "epoch": 0.04343277659536092, + "grad_norm": 2.0725190693492213, + "learning_rate": 3.9981111378929844e-06, + "loss": 1.0456, + "step": 602 + }, + { + "epoch": 0.043504924064788425, + "grad_norm": 2.5765156664735693, + "learning_rate": 3.998090776190763e-06, + "loss": 1.0852, + "step": 603 + }, + { + "epoch": 0.04357707153421594, + "grad_norm": 6.618068307671053, + "learning_rate": 3.998070305380202e-06, + "loss": 1.0378, + "step": 604 + }, + { + "epoch": 0.04364921900364345, + "grad_norm": 3.74550155398034, + "learning_rate": 3.9980497254624185e-06, + "loss": 0.9773, + "step": 605 + }, + { + "epoch": 0.043721366473070955, + "grad_norm": 3.3296478082948524, + "learning_rate": 3.998029036438536e-06, + "loss": 1.0592, + "step": 606 + }, + { + "epoch": 0.04379351394249847, + "grad_norm": 3.0414272022481, + "learning_rate": 3.998008238309683e-06, + "loss": 0.8745, + "step": 607 + }, + { + "epoch": 0.04386566141192598, + "grad_norm": 6.7726388568238995, + "learning_rate": 3.9979873310769965e-06, + "loss": 0.9808, + "step": 608 + }, + { + "epoch": 0.043937808881353485, + "grad_norm": 5.0061578977877845, + "learning_rate": 3.99796631474162e-06, + "loss": 0.9614, + "step": 609 + }, + { + "epoch": 0.044009956350781, + "grad_norm": 2.8086333079728982, + "learning_rate": 3.997945189304698e-06, + "loss": 1.0307, + "step": 610 + }, + { + "epoch": 0.0440821038202085, + "grad_norm": 4.076014882652957, + "learning_rate": 3.997923954767385e-06, + "loss": 1.0243, + "step": 611 + }, + { + "epoch": 0.044154251289636015, + "grad_norm": 3.2289301795912673, + "learning_rate": 3.997902611130842e-06, + "loss": 1.0241, + "step": 612 + }, + { + "epoch": 0.04422639875906353, + "grad_norm": 1.0535475044674618, + "learning_rate": 3.997881158396233e-06, + "loss": 0.8482, + "step": 613 + }, + { + "epoch": 0.04429854622849103, + "grad_norm": 5.523859463859019, + "learning_rate": 3.997859596564729e-06, + "loss": 0.944, + "step": 614 + }, + { + "epoch": 0.044370693697918545, + "grad_norm": 0.878050866849806, + "learning_rate": 3.997837925637509e-06, + "loss": 0.8215, + "step": 615 + }, + { + "epoch": 0.04444284116734606, + "grad_norm": 3.786569557847638, + "learning_rate": 3.997816145615755e-06, + "loss": 0.9956, + "step": 616 + }, + { + "epoch": 0.04451498863677356, + "grad_norm": 6.502147613210913, + "learning_rate": 3.997794256500658e-06, + "loss": 0.9887, + "step": 617 + }, + { + "epoch": 0.044587136106201075, + "grad_norm": 3.275496461235444, + "learning_rate": 3.997772258293411e-06, + "loss": 0.9806, + "step": 618 + }, + { + "epoch": 0.04465928357562859, + "grad_norm": 3.6411727034866406, + "learning_rate": 3.997750150995218e-06, + "loss": 1.0916, + "step": 619 + }, + { + "epoch": 0.04473143104505609, + "grad_norm": 2.944642008345329, + "learning_rate": 3.997727934607283e-06, + "loss": 0.9302, + "step": 620 + }, + { + "epoch": 0.044803578514483605, + "grad_norm": 3.6852197870887418, + "learning_rate": 3.997705609130822e-06, + "loss": 1.1103, + "step": 621 + }, + { + "epoch": 0.04487572598391112, + "grad_norm": 5.235686818186309, + "learning_rate": 3.997683174567052e-06, + "loss": 1.0081, + "step": 622 + }, + { + "epoch": 0.04494787345333862, + "grad_norm": 3.0727080883209057, + "learning_rate": 3.9976606309172e-06, + "loss": 1.0072, + "step": 623 + }, + { + "epoch": 0.045020020922766135, + "grad_norm": 5.093798116445, + "learning_rate": 3.997637978182496e-06, + "loss": 0.9437, + "step": 624 + }, + { + "epoch": 0.04509216839219364, + "grad_norm": 7.720461759840291, + "learning_rate": 3.9976152163641765e-06, + "loss": 0.9788, + "step": 625 + }, + { + "epoch": 0.04516431586162115, + "grad_norm": 3.2883723388874557, + "learning_rate": 3.9975923454634856e-06, + "loss": 1.0826, + "step": 626 + }, + { + "epoch": 0.045236463331048665, + "grad_norm": 3.87023140148935, + "learning_rate": 3.997569365481671e-06, + "loss": 1.0252, + "step": 627 + }, + { + "epoch": 0.04530861080047617, + "grad_norm": 5.239530569408728, + "learning_rate": 3.997546276419988e-06, + "loss": 0.9208, + "step": 628 + }, + { + "epoch": 0.04538075826990368, + "grad_norm": 3.0589953240530012, + "learning_rate": 3.997523078279697e-06, + "loss": 1.1363, + "step": 629 + }, + { + "epoch": 0.045452905739331195, + "grad_norm": 3.106033753633553, + "learning_rate": 3.997499771062066e-06, + "loss": 1.039, + "step": 630 + }, + { + "epoch": 0.0455250532087587, + "grad_norm": 4.107690269999437, + "learning_rate": 3.997476354768367e-06, + "loss": 1.1214, + "step": 631 + }, + { + "epoch": 0.04559720067818621, + "grad_norm": 2.9033222833620957, + "learning_rate": 3.9974528293998785e-06, + "loss": 1.0534, + "step": 632 + }, + { + "epoch": 0.045669348147613725, + "grad_norm": 12.490410750562747, + "learning_rate": 3.997429194957886e-06, + "loss": 1.098, + "step": 633 + }, + { + "epoch": 0.04574149561704123, + "grad_norm": 5.316275119215377, + "learning_rate": 3.997405451443678e-06, + "loss": 1.0043, + "step": 634 + }, + { + "epoch": 0.04581364308646874, + "grad_norm": 3.839764645635786, + "learning_rate": 3.997381598858554e-06, + "loss": 0.9745, + "step": 635 + }, + { + "epoch": 0.045885790555896255, + "grad_norm": 3.6503164284861898, + "learning_rate": 3.9973576372038135e-06, + "loss": 0.8923, + "step": 636 + }, + { + "epoch": 0.04595793802532376, + "grad_norm": 2.5190816315451032, + "learning_rate": 3.997333566480767e-06, + "loss": 1.0487, + "step": 637 + }, + { + "epoch": 0.04603008549475127, + "grad_norm": 4.107221496018738, + "learning_rate": 3.997309386690729e-06, + "loss": 0.9203, + "step": 638 + }, + { + "epoch": 0.04610223296417878, + "grad_norm": 2.438067932478028, + "learning_rate": 3.997285097835018e-06, + "loss": 1.0269, + "step": 639 + }, + { + "epoch": 0.04617438043360629, + "grad_norm": 3.8980192827791984, + "learning_rate": 3.997260699914962e-06, + "loss": 1.0256, + "step": 640 + }, + { + "epoch": 0.0462465279030338, + "grad_norm": 7.471988247308739, + "learning_rate": 3.9972361929318925e-06, + "loss": 1.0056, + "step": 641 + }, + { + "epoch": 0.04631867537246131, + "grad_norm": 2.4709861382188643, + "learning_rate": 3.9972115768871485e-06, + "loss": 1.0609, + "step": 642 + }, + { + "epoch": 0.04639082284188882, + "grad_norm": 3.1843638231794467, + "learning_rate": 3.997186851782074e-06, + "loss": 1.0011, + "step": 643 + }, + { + "epoch": 0.04646297031131633, + "grad_norm": 3.448918883615455, + "learning_rate": 3.997162017618018e-06, + "loss": 0.9352, + "step": 644 + }, + { + "epoch": 0.04653511778074384, + "grad_norm": 4.3719659366380075, + "learning_rate": 3.997137074396338e-06, + "loss": 1.0442, + "step": 645 + }, + { + "epoch": 0.04660726525017135, + "grad_norm": 4.446691530454592, + "learning_rate": 3.997112022118396e-06, + "loss": 1.0821, + "step": 646 + }, + { + "epoch": 0.04667941271959886, + "grad_norm": 4.008304558305033, + "learning_rate": 3.997086860785558e-06, + "loss": 0.9233, + "step": 647 + }, + { + "epoch": 0.04675156018902637, + "grad_norm": 3.4138152777158597, + "learning_rate": 3.997061590399201e-06, + "loss": 0.8368, + "step": 648 + }, + { + "epoch": 0.04682370765845388, + "grad_norm": 9.379561471672272, + "learning_rate": 3.997036210960703e-06, + "loss": 0.9833, + "step": 649 + }, + { + "epoch": 0.046895855127881386, + "grad_norm": 3.339562191165499, + "learning_rate": 3.99701072247145e-06, + "loss": 0.9825, + "step": 650 + }, + { + "epoch": 0.0469680025973089, + "grad_norm": 7.259741191191731, + "learning_rate": 3.996985124932834e-06, + "loss": 0.9742, + "step": 651 + }, + { + "epoch": 0.04704015006673641, + "grad_norm": 5.636484079108878, + "learning_rate": 3.996959418346253e-06, + "loss": 0.9886, + "step": 652 + }, + { + "epoch": 0.047112297536163916, + "grad_norm": 5.899641588607179, + "learning_rate": 3.9969336027131115e-06, + "loss": 1.0153, + "step": 653 + }, + { + "epoch": 0.04718444500559143, + "grad_norm": 2.5949049067484418, + "learning_rate": 3.996907678034817e-06, + "loss": 1.0788, + "step": 654 + }, + { + "epoch": 0.04725659247501894, + "grad_norm": 3.172485292850009, + "learning_rate": 3.996881644312787e-06, + "loss": 1.0123, + "step": 655 + }, + { + "epoch": 0.047328739944446446, + "grad_norm": 1.557476895989124, + "learning_rate": 3.996855501548443e-06, + "loss": 0.8666, + "step": 656 + }, + { + "epoch": 0.04740088741387396, + "grad_norm": 2.9858939414687367, + "learning_rate": 3.996829249743212e-06, + "loss": 0.998, + "step": 657 + }, + { + "epoch": 0.04747303488330147, + "grad_norm": 3.9701404688892814, + "learning_rate": 3.996802888898527e-06, + "loss": 0.9927, + "step": 658 + }, + { + "epoch": 0.047545182352728976, + "grad_norm": 2.079322297942164, + "learning_rate": 3.99677641901583e-06, + "loss": 1.0652, + "step": 659 + }, + { + "epoch": 0.04761732982215649, + "grad_norm": 3.240935081427329, + "learning_rate": 3.996749840096562e-06, + "loss": 0.9287, + "step": 660 + }, + { + "epoch": 0.047689477291584, + "grad_norm": 4.817769917962239, + "learning_rate": 3.996723152142179e-06, + "loss": 0.9464, + "step": 661 + }, + { + "epoch": 0.047761624761011506, + "grad_norm": 2.978221200868275, + "learning_rate": 3.996696355154136e-06, + "loss": 1.0114, + "step": 662 + }, + { + "epoch": 0.04783377223043902, + "grad_norm": 4.650387753962784, + "learning_rate": 3.996669449133896e-06, + "loss": 1.0063, + "step": 663 + }, + { + "epoch": 0.047905919699866524, + "grad_norm": 2.4112117193573868, + "learning_rate": 3.996642434082929e-06, + "loss": 0.9849, + "step": 664 + }, + { + "epoch": 0.047978067169294036, + "grad_norm": 4.3155502368105445, + "learning_rate": 3.9966153100027095e-06, + "loss": 0.9515, + "step": 665 + }, + { + "epoch": 0.04805021463872155, + "grad_norm": 3.403255721454859, + "learning_rate": 3.9965880768947196e-06, + "loss": 0.9627, + "step": 666 + }, + { + "epoch": 0.048122362108149054, + "grad_norm": 3.615207467991442, + "learning_rate": 3.996560734760446e-06, + "loss": 0.9654, + "step": 667 + }, + { + "epoch": 0.048194509577576566, + "grad_norm": 3.6290842922130913, + "learning_rate": 3.996533283601382e-06, + "loss": 1.0159, + "step": 668 + }, + { + "epoch": 0.04826665704700408, + "grad_norm": 3.773922464549665, + "learning_rate": 3.996505723419025e-06, + "loss": 0.963, + "step": 669 + }, + { + "epoch": 0.048338804516431584, + "grad_norm": 4.614517291811331, + "learning_rate": 3.996478054214883e-06, + "loss": 0.9462, + "step": 670 + }, + { + "epoch": 0.048410951985859096, + "grad_norm": 20.16959950372404, + "learning_rate": 3.9964502759904635e-06, + "loss": 0.9193, + "step": 671 + }, + { + "epoch": 0.04848309945528661, + "grad_norm": 2.761756389528689, + "learning_rate": 3.996422388747286e-06, + "loss": 0.9688, + "step": 672 + }, + { + "epoch": 0.048555246924714114, + "grad_norm": 3.500329683307803, + "learning_rate": 3.996394392486872e-06, + "loss": 1.0495, + "step": 673 + }, + { + "epoch": 0.048627394394141626, + "grad_norm": 5.344990725674268, + "learning_rate": 3.996366287210751e-06, + "loss": 0.9953, + "step": 674 + }, + { + "epoch": 0.04869954186356914, + "grad_norm": 4.3806571352532835, + "learning_rate": 3.996338072920457e-06, + "loss": 0.9622, + "step": 675 + }, + { + "epoch": 0.048771689332996644, + "grad_norm": 4.783026670578968, + "learning_rate": 3.996309749617532e-06, + "loss": 1.1294, + "step": 676 + }, + { + "epoch": 0.048843836802424156, + "grad_norm": 3.6414220371325605, + "learning_rate": 3.996281317303521e-06, + "loss": 0.8888, + "step": 677 + }, + { + "epoch": 0.04891598427185166, + "grad_norm": 2.6994061770695548, + "learning_rate": 3.996252775979976e-06, + "loss": 0.9267, + "step": 678 + }, + { + "epoch": 0.048988131741279174, + "grad_norm": 10.070715920570494, + "learning_rate": 3.996224125648458e-06, + "loss": 0.9635, + "step": 679 + }, + { + "epoch": 0.049060279210706687, + "grad_norm": 8.90422040333745, + "learning_rate": 3.99619536631053e-06, + "loss": 0.9476, + "step": 680 + }, + { + "epoch": 0.04913242668013419, + "grad_norm": 2.9372909146832, + "learning_rate": 3.996166497967764e-06, + "loss": 1.0111, + "step": 681 + }, + { + "epoch": 0.049204574149561704, + "grad_norm": 4.053542135239633, + "learning_rate": 3.9961375206217335e-06, + "loss": 1.0583, + "step": 682 + }, + { + "epoch": 0.04927672161898922, + "grad_norm": 8.561927943367772, + "learning_rate": 3.996108434274022e-06, + "loss": 1.1241, + "step": 683 + }, + { + "epoch": 0.04934886908841672, + "grad_norm": 3.814254107071335, + "learning_rate": 3.9960792389262195e-06, + "loss": 0.8836, + "step": 684 + }, + { + "epoch": 0.049421016557844234, + "grad_norm": 5.120110575256793, + "learning_rate": 3.996049934579919e-06, + "loss": 1.1096, + "step": 685 + }, + { + "epoch": 0.04949316402727175, + "grad_norm": 3.76764279829633, + "learning_rate": 3.99602052123672e-06, + "loss": 0.9489, + "step": 686 + }, + { + "epoch": 0.04956531149669925, + "grad_norm": 4.4988980533523995, + "learning_rate": 3.99599099889823e-06, + "loss": 0.9889, + "step": 687 + }, + { + "epoch": 0.049637458966126764, + "grad_norm": 3.0777513306266036, + "learning_rate": 3.995961367566061e-06, + "loss": 1.0531, + "step": 688 + }, + { + "epoch": 0.04970960643555427, + "grad_norm": 3.052742184990809, + "learning_rate": 3.9959316272418285e-06, + "loss": 0.9644, + "step": 689 + }, + { + "epoch": 0.04978175390498178, + "grad_norm": 3.0870073612719753, + "learning_rate": 3.99590177792716e-06, + "loss": 1.0647, + "step": 690 + }, + { + "epoch": 0.049853901374409294, + "grad_norm": 3.983571049980369, + "learning_rate": 3.995871819623684e-06, + "loss": 1.0925, + "step": 691 + }, + { + "epoch": 0.0499260488438368, + "grad_norm": 4.992463535563446, + "learning_rate": 3.995841752333035e-06, + "loss": 0.8586, + "step": 692 + }, + { + "epoch": 0.04999819631326431, + "grad_norm": 3.78910591651189, + "learning_rate": 3.9958115760568575e-06, + "loss": 0.9323, + "step": 693 + }, + { + "epoch": 0.050070343782691824, + "grad_norm": 5.608382198277682, + "learning_rate": 3.995781290796798e-06, + "loss": 0.872, + "step": 694 + }, + { + "epoch": 0.05014249125211933, + "grad_norm": 3.8285641788096814, + "learning_rate": 3.995750896554511e-06, + "loss": 0.9964, + "step": 695 + }, + { + "epoch": 0.05021463872154684, + "grad_norm": 3.372621051038235, + "learning_rate": 3.995720393331654e-06, + "loss": 1.047, + "step": 696 + }, + { + "epoch": 0.050286786190974354, + "grad_norm": 2.7786262223118623, + "learning_rate": 3.995689781129896e-06, + "loss": 0.9814, + "step": 697 + }, + { + "epoch": 0.05035893366040186, + "grad_norm": 3.890353886606861, + "learning_rate": 3.995659059950905e-06, + "loss": 0.9692, + "step": 698 + }, + { + "epoch": 0.05043108112982937, + "grad_norm": 4.7689233590908975, + "learning_rate": 3.995628229796361e-06, + "loss": 1.0027, + "step": 699 + }, + { + "epoch": 0.050503228599256884, + "grad_norm": 2.853276320389956, + "learning_rate": 3.995597290667948e-06, + "loss": 0.9961, + "step": 700 + }, + { + "epoch": 0.05057537606868439, + "grad_norm": 4.6335230470600575, + "learning_rate": 3.995566242567353e-06, + "loss": 0.9844, + "step": 701 + }, + { + "epoch": 0.0506475235381119, + "grad_norm": 2.4439351160517635, + "learning_rate": 3.9955350854962734e-06, + "loss": 0.8703, + "step": 702 + }, + { + "epoch": 0.05071967100753941, + "grad_norm": 3.3884648768279395, + "learning_rate": 3.995503819456411e-06, + "loss": 0.9681, + "step": 703 + }, + { + "epoch": 0.05079181847696692, + "grad_norm": 2.7244672621987895, + "learning_rate": 3.99547244444947e-06, + "loss": 1.0571, + "step": 704 + }, + { + "epoch": 0.05086396594639443, + "grad_norm": 3.8723955323305335, + "learning_rate": 3.995440960477167e-06, + "loss": 1.1138, + "step": 705 + }, + { + "epoch": 0.05093611341582194, + "grad_norm": 1.1133003099382675, + "learning_rate": 3.995409367541221e-06, + "loss": 0.7949, + "step": 706 + }, + { + "epoch": 0.05100826088524945, + "grad_norm": 3.3718738912981845, + "learning_rate": 3.995377665643355e-06, + "loss": 0.8536, + "step": 707 + }, + { + "epoch": 0.05108040835467696, + "grad_norm": 4.651789634625535, + "learning_rate": 3.995345854785301e-06, + "loss": 1.0053, + "step": 708 + }, + { + "epoch": 0.05115255582410447, + "grad_norm": 3.2746557913867815, + "learning_rate": 3.995313934968797e-06, + "loss": 0.9081, + "step": 709 + }, + { + "epoch": 0.05122470329353198, + "grad_norm": 2.97566766379398, + "learning_rate": 3.995281906195586e-06, + "loss": 1.0428, + "step": 710 + }, + { + "epoch": 0.05129685076295949, + "grad_norm": 3.4173488556172855, + "learning_rate": 3.995249768467415e-06, + "loss": 1.0856, + "step": 711 + }, + { + "epoch": 0.051368998232387, + "grad_norm": 3.5941242852392543, + "learning_rate": 3.995217521786041e-06, + "loss": 0.9586, + "step": 712 + }, + { + "epoch": 0.05144114570181451, + "grad_norm": 2.5423059024447126, + "learning_rate": 3.995185166153223e-06, + "loss": 1.03, + "step": 713 + }, + { + "epoch": 0.051513293171242015, + "grad_norm": 6.058929020885371, + "learning_rate": 3.995152701570731e-06, + "loss": 1.0437, + "step": 714 + }, + { + "epoch": 0.05158544064066953, + "grad_norm": 4.269205567833426, + "learning_rate": 3.995120128040335e-06, + "loss": 0.9656, + "step": 715 + }, + { + "epoch": 0.05165758811009704, + "grad_norm": 0.8541220599056485, + "learning_rate": 3.995087445563814e-06, + "loss": 0.8558, + "step": 716 + }, + { + "epoch": 0.051729735579524545, + "grad_norm": 3.31171638288906, + "learning_rate": 3.995054654142954e-06, + "loss": 0.9175, + "step": 717 + }, + { + "epoch": 0.05180188304895206, + "grad_norm": 3.714516931190985, + "learning_rate": 3.995021753779544e-06, + "loss": 1.0941, + "step": 718 + }, + { + "epoch": 0.05187403051837957, + "grad_norm": 3.739506471883593, + "learning_rate": 3.994988744475382e-06, + "loss": 1.0891, + "step": 719 + }, + { + "epoch": 0.051946177987807075, + "grad_norm": 3.808459282481991, + "learning_rate": 3.99495562623227e-06, + "loss": 1.0649, + "step": 720 + }, + { + "epoch": 0.05201832545723459, + "grad_norm": 4.755223263413834, + "learning_rate": 3.994922399052015e-06, + "loss": 0.9914, + "step": 721 + }, + { + "epoch": 0.0520904729266621, + "grad_norm": 7.628205422958503, + "learning_rate": 3.994889062936434e-06, + "loss": 0.968, + "step": 722 + }, + { + "epoch": 0.052162620396089605, + "grad_norm": 0.9566960896199662, + "learning_rate": 3.994855617887346e-06, + "loss": 0.8469, + "step": 723 + }, + { + "epoch": 0.05223476786551712, + "grad_norm": 3.461836377390012, + "learning_rate": 3.994822063906577e-06, + "loss": 0.9217, + "step": 724 + }, + { + "epoch": 0.05230691533494463, + "grad_norm": 4.417288298615559, + "learning_rate": 3.99478840099596e-06, + "loss": 0.9139, + "step": 725 + }, + { + "epoch": 0.052379062804372135, + "grad_norm": 3.745290723855916, + "learning_rate": 3.994754629157332e-06, + "loss": 0.9089, + "step": 726 + }, + { + "epoch": 0.05245121027379965, + "grad_norm": 4.009464748742796, + "learning_rate": 3.994720748392539e-06, + "loss": 0.9748, + "step": 727 + }, + { + "epoch": 0.05252335774322715, + "grad_norm": 3.059167411081557, + "learning_rate": 3.994686758703429e-06, + "loss": 1.068, + "step": 728 + }, + { + "epoch": 0.052595505212654665, + "grad_norm": 2.748549425579743, + "learning_rate": 3.994652660091861e-06, + "loss": 1.0491, + "step": 729 + }, + { + "epoch": 0.05266765268208218, + "grad_norm": 18.78308685511092, + "learning_rate": 3.994618452559693e-06, + "loss": 0.9094, + "step": 730 + }, + { + "epoch": 0.05273980015150968, + "grad_norm": 3.0133562003030416, + "learning_rate": 3.994584136108797e-06, + "loss": 1.0335, + "step": 731 + }, + { + "epoch": 0.052811947620937195, + "grad_norm": 6.0848182363954555, + "learning_rate": 3.994549710741044e-06, + "loss": 0.9025, + "step": 732 + }, + { + "epoch": 0.05288409509036471, + "grad_norm": 3.331078020705804, + "learning_rate": 3.994515176458316e-06, + "loss": 0.9766, + "step": 733 + }, + { + "epoch": 0.05295624255979221, + "grad_norm": 6.440150335701453, + "learning_rate": 3.994480533262496e-06, + "loss": 1.0591, + "step": 734 + }, + { + "epoch": 0.053028390029219725, + "grad_norm": 4.308116952171098, + "learning_rate": 3.994445781155478e-06, + "loss": 1.0651, + "step": 735 + }, + { + "epoch": 0.05310053749864724, + "grad_norm": 2.630269982147383, + "learning_rate": 3.99441092013916e-06, + "loss": 0.9706, + "step": 736 + }, + { + "epoch": 0.05317268496807474, + "grad_norm": 3.333824900303156, + "learning_rate": 3.994375950215443e-06, + "loss": 1.0597, + "step": 737 + }, + { + "epoch": 0.053244832437502256, + "grad_norm": 3.1039373496605367, + "learning_rate": 3.994340871386241e-06, + "loss": 1.0487, + "step": 738 + }, + { + "epoch": 0.05331697990692977, + "grad_norm": 3.742478902855415, + "learning_rate": 3.994305683653464e-06, + "loss": 0.9666, + "step": 739 + }, + { + "epoch": 0.05338912737635727, + "grad_norm": 3.162085247360081, + "learning_rate": 3.994270387019038e-06, + "loss": 0.9863, + "step": 740 + }, + { + "epoch": 0.053461274845784786, + "grad_norm": 3.9770443967202436, + "learning_rate": 3.994234981484888e-06, + "loss": 1.0237, + "step": 741 + }, + { + "epoch": 0.05353342231521229, + "grad_norm": 3.9820780996253844, + "learning_rate": 3.9941994670529485e-06, + "loss": 1.013, + "step": 742 + }, + { + "epoch": 0.0536055697846398, + "grad_norm": 0.8775368767513084, + "learning_rate": 3.994163843725158e-06, + "loss": 0.7871, + "step": 743 + }, + { + "epoch": 0.053677717254067316, + "grad_norm": 4.32081995599945, + "learning_rate": 3.9941281115034624e-06, + "loss": 1.0268, + "step": 744 + }, + { + "epoch": 0.05374986472349482, + "grad_norm": 2.848468596622976, + "learning_rate": 3.994092270389812e-06, + "loss": 1.0794, + "step": 745 + }, + { + "epoch": 0.05382201219292233, + "grad_norm": 3.0596876660684833, + "learning_rate": 3.994056320386165e-06, + "loss": 0.954, + "step": 746 + }, + { + "epoch": 0.053894159662349846, + "grad_norm": 7.263077352820679, + "learning_rate": 3.994020261494484e-06, + "loss": 0.8868, + "step": 747 + }, + { + "epoch": 0.05396630713177735, + "grad_norm": 2.667766334372865, + "learning_rate": 3.993984093716738e-06, + "loss": 1.0027, + "step": 748 + }, + { + "epoch": 0.05403845460120486, + "grad_norm": 3.189514122196884, + "learning_rate": 3.993947817054903e-06, + "loss": 0.992, + "step": 749 + }, + { + "epoch": 0.054110602070632376, + "grad_norm": 0.9249503676527743, + "learning_rate": 3.993911431510958e-06, + "loss": 0.8948, + "step": 750 + }, + { + "epoch": 0.05418274954005988, + "grad_norm": 2.7333078100206842, + "learning_rate": 3.993874937086891e-06, + "loss": 1.0506, + "step": 751 + }, + { + "epoch": 0.05425489700948739, + "grad_norm": 3.6081069105848913, + "learning_rate": 3.993838333784695e-06, + "loss": 1.0794, + "step": 752 + }, + { + "epoch": 0.0543270444789149, + "grad_norm": 3.1641108756429217, + "learning_rate": 3.993801621606369e-06, + "loss": 0.9705, + "step": 753 + }, + { + "epoch": 0.05439919194834241, + "grad_norm": 5.564871293299273, + "learning_rate": 3.993764800553917e-06, + "loss": 1.0272, + "step": 754 + }, + { + "epoch": 0.05447133941776992, + "grad_norm": 27.331573181544545, + "learning_rate": 3.99372787062935e-06, + "loss": 1.0847, + "step": 755 + }, + { + "epoch": 0.05454348688719743, + "grad_norm": 3.3121736473688554, + "learning_rate": 3.993690831834685e-06, + "loss": 1.0034, + "step": 756 + }, + { + "epoch": 0.05461563435662494, + "grad_norm": 9.20015048028364, + "learning_rate": 3.993653684171942e-06, + "loss": 1.0125, + "step": 757 + }, + { + "epoch": 0.05468778182605245, + "grad_norm": 1.04409015389396, + "learning_rate": 3.9936164276431536e-06, + "loss": 0.7812, + "step": 758 + }, + { + "epoch": 0.05475992929547996, + "grad_norm": 3.46323113830096, + "learning_rate": 3.9935790622503514e-06, + "loss": 1.0093, + "step": 759 + }, + { + "epoch": 0.05483207676490747, + "grad_norm": 7.37445790512979, + "learning_rate": 3.993541587995577e-06, + "loss": 0.9044, + "step": 760 + }, + { + "epoch": 0.05490422423433498, + "grad_norm": 2.4531560033309066, + "learning_rate": 3.993504004880876e-06, + "loss": 1.0328, + "step": 761 + }, + { + "epoch": 0.05497637170376249, + "grad_norm": 4.000370008521477, + "learning_rate": 3.9934663129083e-06, + "loss": 1.0031, + "step": 762 + }, + { + "epoch": 0.05504851917319, + "grad_norm": 3.070767201540144, + "learning_rate": 3.99342851207991e-06, + "loss": 1.0494, + "step": 763 + }, + { + "epoch": 0.055120666642617513, + "grad_norm": 3.625638971247565, + "learning_rate": 3.993390602397768e-06, + "loss": 1.007, + "step": 764 + }, + { + "epoch": 0.05519281411204502, + "grad_norm": 2.75292085373494, + "learning_rate": 3.993352583863943e-06, + "loss": 0.927, + "step": 765 + }, + { + "epoch": 0.05526496158147253, + "grad_norm": 1.175174594650249, + "learning_rate": 3.9933144564805145e-06, + "loss": 0.7834, + "step": 766 + }, + { + "epoch": 0.05533710905090004, + "grad_norm": 4.229977509639462, + "learning_rate": 3.993276220249561e-06, + "loss": 1.0008, + "step": 767 + }, + { + "epoch": 0.05540925652032755, + "grad_norm": 3.3695859929817966, + "learning_rate": 3.993237875173173e-06, + "loss": 1.0021, + "step": 768 + }, + { + "epoch": 0.05548140398975506, + "grad_norm": 3.49206952401393, + "learning_rate": 3.993199421253443e-06, + "loss": 0.9961, + "step": 769 + }, + { + "epoch": 0.05555355145918257, + "grad_norm": 2.731674168382282, + "learning_rate": 3.993160858492471e-06, + "loss": 0.9278, + "step": 770 + }, + { + "epoch": 0.05562569892861008, + "grad_norm": 3.0773413575573016, + "learning_rate": 3.993122186892363e-06, + "loss": 1.0277, + "step": 771 + }, + { + "epoch": 0.05569784639803759, + "grad_norm": 5.394359588654069, + "learning_rate": 3.993083406455231e-06, + "loss": 1.0451, + "step": 772 + }, + { + "epoch": 0.0557699938674651, + "grad_norm": 4.968965537716542, + "learning_rate": 3.993044517183192e-06, + "loss": 1.0446, + "step": 773 + }, + { + "epoch": 0.05584214133689261, + "grad_norm": 1.068044891656939, + "learning_rate": 3.99300551907837e-06, + "loss": 0.832, + "step": 774 + }, + { + "epoch": 0.05591428880632012, + "grad_norm": 3.92217943446157, + "learning_rate": 3.992966412142895e-06, + "loss": 1.051, + "step": 775 + }, + { + "epoch": 0.05598643627574763, + "grad_norm": 3.0112530735454652, + "learning_rate": 3.992927196378901e-06, + "loss": 0.9405, + "step": 776 + }, + { + "epoch": 0.05605858374517514, + "grad_norm": 7.208725082542254, + "learning_rate": 3.992887871788531e-06, + "loss": 0.9979, + "step": 777 + }, + { + "epoch": 0.056130731214602644, + "grad_norm": 5.870564836039256, + "learning_rate": 3.992848438373931e-06, + "loss": 0.9806, + "step": 778 + }, + { + "epoch": 0.05620287868403016, + "grad_norm": 5.467819745098825, + "learning_rate": 3.992808896137256e-06, + "loss": 0.9908, + "step": 779 + }, + { + "epoch": 0.05627502615345767, + "grad_norm": 4.699261164929634, + "learning_rate": 3.992769245080663e-06, + "loss": 1.0355, + "step": 780 + }, + { + "epoch": 0.056347173622885174, + "grad_norm": 4.172509305931031, + "learning_rate": 3.99272948520632e-06, + "loss": 0.9542, + "step": 781 + }, + { + "epoch": 0.05641932109231269, + "grad_norm": 3.302403447532333, + "learning_rate": 3.992689616516396e-06, + "loss": 1.0878, + "step": 782 + }, + { + "epoch": 0.0564914685617402, + "grad_norm": 8.470453407735455, + "learning_rate": 3.992649639013069e-06, + "loss": 0.9745, + "step": 783 + }, + { + "epoch": 0.056563616031167704, + "grad_norm": 3.217208789608635, + "learning_rate": 3.992609552698521e-06, + "loss": 0.8888, + "step": 784 + }, + { + "epoch": 0.05663576350059522, + "grad_norm": 2.663689640387028, + "learning_rate": 3.9925693575749425e-06, + "loss": 1.0357, + "step": 785 + }, + { + "epoch": 0.05670791097002273, + "grad_norm": 5.4940533134392755, + "learning_rate": 3.992529053644527e-06, + "loss": 0.9666, + "step": 786 + }, + { + "epoch": 0.056780058439450234, + "grad_norm": 4.054477693572352, + "learning_rate": 3.992488640909476e-06, + "loss": 1.0545, + "step": 787 + }, + { + "epoch": 0.05685220590887775, + "grad_norm": 3.048920868874862, + "learning_rate": 3.992448119371997e-06, + "loss": 1.0045, + "step": 788 + }, + { + "epoch": 0.05692435337830526, + "grad_norm": 1.0567229797648647, + "learning_rate": 3.992407489034302e-06, + "loss": 0.8639, + "step": 789 + }, + { + "epoch": 0.056996500847732764, + "grad_norm": 2.7734067243560965, + "learning_rate": 3.992366749898609e-06, + "loss": 0.928, + "step": 790 + }, + { + "epoch": 0.05706864831716028, + "grad_norm": 2.7847726351729603, + "learning_rate": 3.992325901967144e-06, + "loss": 0.9443, + "step": 791 + }, + { + "epoch": 0.05714079578658778, + "grad_norm": 3.682927416245468, + "learning_rate": 3.992284945242136e-06, + "loss": 0.9017, + "step": 792 + }, + { + "epoch": 0.057212943256015295, + "grad_norm": 3.314521316788933, + "learning_rate": 3.992243879725822e-06, + "loss": 0.9924, + "step": 793 + }, + { + "epoch": 0.05728509072544281, + "grad_norm": 0.9062947558170793, + "learning_rate": 3.992202705420446e-06, + "loss": 0.8237, + "step": 794 + }, + { + "epoch": 0.05735723819487031, + "grad_norm": 7.658217321381256, + "learning_rate": 3.992161422328255e-06, + "loss": 1.017, + "step": 795 + }, + { + "epoch": 0.057429385664297825, + "grad_norm": 4.179274037668934, + "learning_rate": 3.992120030451503e-06, + "loss": 1.0024, + "step": 796 + }, + { + "epoch": 0.05750153313372534, + "grad_norm": 4.837690340777503, + "learning_rate": 3.99207852979245e-06, + "loss": 0.9941, + "step": 797 + }, + { + "epoch": 0.05757368060315284, + "grad_norm": 3.629404483423813, + "learning_rate": 3.992036920353364e-06, + "loss": 0.9868, + "step": 798 + }, + { + "epoch": 0.057645828072580355, + "grad_norm": 3.4996209620545704, + "learning_rate": 3.991995202136516e-06, + "loss": 0.9996, + "step": 799 + }, + { + "epoch": 0.05771797554200787, + "grad_norm": 3.0547132564151513, + "learning_rate": 3.991953375144183e-06, + "loss": 0.919, + "step": 800 + }, + { + "epoch": 0.05779012301143537, + "grad_norm": 3.885028782250656, + "learning_rate": 3.99191143937865e-06, + "loss": 1.0033, + "step": 801 + }, + { + "epoch": 0.057862270480862885, + "grad_norm": 5.687593564112779, + "learning_rate": 3.991869394842209e-06, + "loss": 1.0481, + "step": 802 + }, + { + "epoch": 0.0579344179502904, + "grad_norm": 3.7385673453334047, + "learning_rate": 3.991827241537153e-06, + "loss": 1.0261, + "step": 803 + }, + { + "epoch": 0.0580065654197179, + "grad_norm": 3.735940517295088, + "learning_rate": 3.9917849794657846e-06, + "loss": 0.9441, + "step": 804 + }, + { + "epoch": 0.058078712889145415, + "grad_norm": 6.463277224762285, + "learning_rate": 3.991742608630412e-06, + "loss": 0.9953, + "step": 805 + }, + { + "epoch": 0.05815086035857292, + "grad_norm": 3.544781978988899, + "learning_rate": 3.991700129033349e-06, + "loss": 0.991, + "step": 806 + }, + { + "epoch": 0.05822300782800043, + "grad_norm": 4.515906302062069, + "learning_rate": 3.991657540676915e-06, + "loss": 0.9137, + "step": 807 + }, + { + "epoch": 0.058295155297427945, + "grad_norm": 4.329564044669455, + "learning_rate": 3.991614843563435e-06, + "loss": 1.0244, + "step": 808 + }, + { + "epoch": 0.05836730276685545, + "grad_norm": 4.4128766382722935, + "learning_rate": 3.991572037695242e-06, + "loss": 1.001, + "step": 809 + }, + { + "epoch": 0.05843945023628296, + "grad_norm": 4.8844151666689575, + "learning_rate": 3.9915291230746716e-06, + "loss": 1.0008, + "step": 810 + }, + { + "epoch": 0.058511597705710475, + "grad_norm": 8.151652125369855, + "learning_rate": 3.9914860997040685e-06, + "loss": 1.0086, + "step": 811 + }, + { + "epoch": 0.05858374517513798, + "grad_norm": 5.279923492513125, + "learning_rate": 3.991442967585782e-06, + "loss": 1.0694, + "step": 812 + }, + { + "epoch": 0.05865589264456549, + "grad_norm": 2.9707914059655534, + "learning_rate": 3.991399726722167e-06, + "loss": 0.8595, + "step": 813 + }, + { + "epoch": 0.058728040113993005, + "grad_norm": 4.281085380118031, + "learning_rate": 3.9913563771155845e-06, + "loss": 0.9747, + "step": 814 + }, + { + "epoch": 0.05880018758342051, + "grad_norm": 4.976572464913493, + "learning_rate": 3.991312918768402e-06, + "loss": 0.9088, + "step": 815 + }, + { + "epoch": 0.05887233505284802, + "grad_norm": 2.9870816883606404, + "learning_rate": 3.991269351682993e-06, + "loss": 1.1066, + "step": 816 + }, + { + "epoch": 0.05894448252227553, + "grad_norm": 2.8881551552514737, + "learning_rate": 3.991225675861736e-06, + "loss": 1.0205, + "step": 817 + }, + { + "epoch": 0.05901662999170304, + "grad_norm": 2.97191924534542, + "learning_rate": 3.991181891307017e-06, + "loss": 0.9674, + "step": 818 + }, + { + "epoch": 0.05908877746113055, + "grad_norm": 3.359557390806352, + "learning_rate": 3.991137998021225e-06, + "loss": 0.9573, + "step": 819 + }, + { + "epoch": 0.05916092493055806, + "grad_norm": 2.971468032585368, + "learning_rate": 3.991093996006757e-06, + "loss": 0.9445, + "step": 820 + }, + { + "epoch": 0.05923307239998557, + "grad_norm": 3.578266124357728, + "learning_rate": 3.991049885266018e-06, + "loss": 1.0258, + "step": 821 + }, + { + "epoch": 0.05930521986941308, + "grad_norm": 3.5757462369585244, + "learning_rate": 3.991005665801415e-06, + "loss": 0.8787, + "step": 822 + }, + { + "epoch": 0.05937736733884059, + "grad_norm": 3.9770987098573776, + "learning_rate": 3.990961337615363e-06, + "loss": 0.9824, + "step": 823 + }, + { + "epoch": 0.0594495148082681, + "grad_norm": 3.6149807289211364, + "learning_rate": 3.990916900710282e-06, + "loss": 1.0168, + "step": 824 + }, + { + "epoch": 0.05952166227769561, + "grad_norm": 4.57088379508962, + "learning_rate": 3.9908723550886005e-06, + "loss": 0.9338, + "step": 825 + }, + { + "epoch": 0.05959380974712312, + "grad_norm": 3.546269436535231, + "learning_rate": 3.990827700752749e-06, + "loss": 1.0299, + "step": 826 + }, + { + "epoch": 0.05966595721655063, + "grad_norm": 3.637253538610992, + "learning_rate": 3.990782937705166e-06, + "loss": 1.0731, + "step": 827 + }, + { + "epoch": 0.05973810468597814, + "grad_norm": 4.025667094990889, + "learning_rate": 3.9907380659482975e-06, + "loss": 0.9737, + "step": 828 + }, + { + "epoch": 0.05981025215540565, + "grad_norm": 3.6258012609539905, + "learning_rate": 3.990693085484592e-06, + "loss": 0.9784, + "step": 829 + }, + { + "epoch": 0.05988239962483316, + "grad_norm": 1.1726458239616042, + "learning_rate": 3.990647996316506e-06, + "loss": 0.8471, + "step": 830 + }, + { + "epoch": 0.059954547094260666, + "grad_norm": 3.14979724534256, + "learning_rate": 3.990602798446503e-06, + "loss": 1.0983, + "step": 831 + }, + { + "epoch": 0.06002669456368818, + "grad_norm": 14.571698995693458, + "learning_rate": 3.99055749187705e-06, + "loss": 1.0458, + "step": 832 + }, + { + "epoch": 0.06009884203311569, + "grad_norm": 2.6620812530039752, + "learning_rate": 3.9905120766106205e-06, + "loss": 0.9572, + "step": 833 + }, + { + "epoch": 0.060170989502543196, + "grad_norm": 3.6226217426616976, + "learning_rate": 3.990466552649696e-06, + "loss": 1.0029, + "step": 834 + }, + { + "epoch": 0.06024313697197071, + "grad_norm": 21.88488406719957, + "learning_rate": 3.9904209199967605e-06, + "loss": 1.0207, + "step": 835 + }, + { + "epoch": 0.06031528444139822, + "grad_norm": 1.123310568991071, + "learning_rate": 3.990375178654307e-06, + "loss": 0.8294, + "step": 836 + }, + { + "epoch": 0.060387431910825726, + "grad_norm": 7.833917176693843, + "learning_rate": 3.9903293286248336e-06, + "loss": 1.0034, + "step": 837 + }, + { + "epoch": 0.06045957938025324, + "grad_norm": 2.983856156070468, + "learning_rate": 3.990283369910844e-06, + "loss": 1.0203, + "step": 838 + }, + { + "epoch": 0.06053172684968075, + "grad_norm": 3.228373538823653, + "learning_rate": 3.9902373025148465e-06, + "loss": 1.0494, + "step": 839 + }, + { + "epoch": 0.060603874319108256, + "grad_norm": 2.951574978415837, + "learning_rate": 3.990191126439357e-06, + "loss": 1.0299, + "step": 840 + }, + { + "epoch": 0.06067602178853577, + "grad_norm": 2.6501991916863976, + "learning_rate": 3.990144841686899e-06, + "loss": 0.9694, + "step": 841 + }, + { + "epoch": 0.06074816925796328, + "grad_norm": 3.4254012380986514, + "learning_rate": 3.990098448259997e-06, + "loss": 1.0302, + "step": 842 + }, + { + "epoch": 0.060820316727390786, + "grad_norm": 4.000106810097388, + "learning_rate": 3.990051946161187e-06, + "loss": 0.9988, + "step": 843 + }, + { + "epoch": 0.0608924641968183, + "grad_norm": 2.251323946215476, + "learning_rate": 3.990005335393007e-06, + "loss": 0.9577, + "step": 844 + }, + { + "epoch": 0.0609646116662458, + "grad_norm": 3.6397723726216054, + "learning_rate": 3.989958615958002e-06, + "loss": 0.9378, + "step": 845 + }, + { + "epoch": 0.061036759135673316, + "grad_norm": 3.921347411638774, + "learning_rate": 3.989911787858724e-06, + "loss": 1.098, + "step": 846 + }, + { + "epoch": 0.06110890660510083, + "grad_norm": 2.6719284498434392, + "learning_rate": 3.989864851097729e-06, + "loss": 1.0102, + "step": 847 + }, + { + "epoch": 0.061181054074528333, + "grad_norm": 4.0269283816777905, + "learning_rate": 3.989817805677581e-06, + "loss": 0.8893, + "step": 848 + }, + { + "epoch": 0.061253201543955846, + "grad_norm": 4.323626111503537, + "learning_rate": 3.989770651600849e-06, + "loss": 1.0084, + "step": 849 + }, + { + "epoch": 0.06132534901338336, + "grad_norm": 3.955516348214687, + "learning_rate": 3.989723388870107e-06, + "loss": 0.9336, + "step": 850 + }, + { + "epoch": 0.061397496482810864, + "grad_norm": 3.0329058342417166, + "learning_rate": 3.9896760174879365e-06, + "loss": 1.0951, + "step": 851 + }, + { + "epoch": 0.061469643952238376, + "grad_norm": 0.8213008557719156, + "learning_rate": 3.989628537456925e-06, + "loss": 0.7858, + "step": 852 + }, + { + "epoch": 0.06154179142166589, + "grad_norm": 5.534671738012888, + "learning_rate": 3.989580948779664e-06, + "loss": 0.9256, + "step": 853 + }, + { + "epoch": 0.061613938891093394, + "grad_norm": 4.023923855616652, + "learning_rate": 3.989533251458752e-06, + "loss": 1.0513, + "step": 854 + }, + { + "epoch": 0.061686086360520906, + "grad_norm": 3.4316303424540027, + "learning_rate": 3.989485445496794e-06, + "loss": 0.867, + "step": 855 + }, + { + "epoch": 0.06175823382994841, + "grad_norm": 3.873992665935264, + "learning_rate": 3.989437530896402e-06, + "loss": 0.8646, + "step": 856 + }, + { + "epoch": 0.061830381299375924, + "grad_norm": 3.879476361284929, + "learning_rate": 3.989389507660191e-06, + "loss": 0.9931, + "step": 857 + }, + { + "epoch": 0.061902528768803436, + "grad_norm": 3.3382765991549923, + "learning_rate": 3.989341375790782e-06, + "loss": 1.0987, + "step": 858 + }, + { + "epoch": 0.06197467623823094, + "grad_norm": 4.8769097133380255, + "learning_rate": 3.989293135290805e-06, + "loss": 1.0088, + "step": 859 + }, + { + "epoch": 0.062046823707658454, + "grad_norm": 2.9963689128146638, + "learning_rate": 3.989244786162895e-06, + "loss": 0.9822, + "step": 860 + }, + { + "epoch": 0.062118971177085966, + "grad_norm": 3.9197338080041035, + "learning_rate": 3.9891963284096904e-06, + "loss": 0.9271, + "step": 861 + }, + { + "epoch": 0.06219111864651347, + "grad_norm": 0.9311054860063745, + "learning_rate": 3.989147762033838e-06, + "loss": 0.8086, + "step": 862 + }, + { + "epoch": 0.062263266115940984, + "grad_norm": 4.215208926922534, + "learning_rate": 3.9890990870379905e-06, + "loss": 0.9946, + "step": 863 + }, + { + "epoch": 0.062335413585368496, + "grad_norm": 3.2411522372942283, + "learning_rate": 3.989050303424805e-06, + "loss": 0.9619, + "step": 864 + }, + { + "epoch": 0.062407561054796, + "grad_norm": 4.191569841504022, + "learning_rate": 3.989001411196946e-06, + "loss": 1.0478, + "step": 865 + }, + { + "epoch": 0.062479708524223514, + "grad_norm": 3.5107229006772034, + "learning_rate": 3.988952410357081e-06, + "loss": 0.9025, + "step": 866 + }, + { + "epoch": 0.06255185599365103, + "grad_norm": 3.070624182802135, + "learning_rate": 3.98890330090789e-06, + "loss": 0.9344, + "step": 867 + }, + { + "epoch": 0.06262400346307853, + "grad_norm": 3.5947944781855146, + "learning_rate": 3.988854082852052e-06, + "loss": 0.9679, + "step": 868 + }, + { + "epoch": 0.06269615093250604, + "grad_norm": 3.6187686636382512, + "learning_rate": 3.988804756192254e-06, + "loss": 1.0523, + "step": 869 + }, + { + "epoch": 0.06276829840193356, + "grad_norm": 3.388100805176853, + "learning_rate": 3.988755320931192e-06, + "loss": 0.9882, + "step": 870 + }, + { + "epoch": 0.06284044587136106, + "grad_norm": 4.471492326039057, + "learning_rate": 3.988705777071563e-06, + "loss": 1.0307, + "step": 871 + }, + { + "epoch": 0.06291259334078857, + "grad_norm": 0.7435291010324907, + "learning_rate": 3.988656124616074e-06, + "loss": 0.8425, + "step": 872 + }, + { + "epoch": 0.06298474081021609, + "grad_norm": 3.281023435490933, + "learning_rate": 3.9886063635674355e-06, + "loss": 1.005, + "step": 873 + }, + { + "epoch": 0.06305688827964359, + "grad_norm": 0.7531706625997092, + "learning_rate": 3.988556493928365e-06, + "loss": 0.8372, + "step": 874 + }, + { + "epoch": 0.0631290357490711, + "grad_norm": 4.749997189169604, + "learning_rate": 3.988506515701586e-06, + "loss": 0.9751, + "step": 875 + }, + { + "epoch": 0.06320118321849862, + "grad_norm": 4.955318506745646, + "learning_rate": 3.9884564288898275e-06, + "loss": 1.0192, + "step": 876 + }, + { + "epoch": 0.06327333068792612, + "grad_norm": 4.907266693710282, + "learning_rate": 3.988406233495824e-06, + "loss": 0.9281, + "step": 877 + }, + { + "epoch": 0.06334547815735363, + "grad_norm": 3.2046095966484205, + "learning_rate": 3.988355929522317e-06, + "loss": 0.9761, + "step": 878 + }, + { + "epoch": 0.06341762562678115, + "grad_norm": 0.9384374699702148, + "learning_rate": 3.988305516972053e-06, + "loss": 0.8541, + "step": 879 + }, + { + "epoch": 0.06348977309620865, + "grad_norm": 3.802861270834744, + "learning_rate": 3.988254995847786e-06, + "loss": 1.095, + "step": 880 + }, + { + "epoch": 0.06356192056563616, + "grad_norm": 16.505665124470806, + "learning_rate": 3.988204366152273e-06, + "loss": 1.0075, + "step": 881 + }, + { + "epoch": 0.06363406803506368, + "grad_norm": 3.3178091148667477, + "learning_rate": 3.988153627888281e-06, + "loss": 1.0181, + "step": 882 + }, + { + "epoch": 0.06370621550449118, + "grad_norm": 3.707310238983089, + "learning_rate": 3.988102781058578e-06, + "loss": 1.0153, + "step": 883 + }, + { + "epoch": 0.06377836297391869, + "grad_norm": 3.156127020828922, + "learning_rate": 3.988051825665942e-06, + "loss": 1.0884, + "step": 884 + }, + { + "epoch": 0.0638505104433462, + "grad_norm": 3.617200550690212, + "learning_rate": 3.988000761713156e-06, + "loss": 1.0817, + "step": 885 + }, + { + "epoch": 0.06392265791277371, + "grad_norm": 3.5061581795105217, + "learning_rate": 3.987949589203007e-06, + "loss": 1.0196, + "step": 886 + }, + { + "epoch": 0.06399480538220122, + "grad_norm": 3.0459244247849395, + "learning_rate": 3.987898308138291e-06, + "loss": 1.0314, + "step": 887 + }, + { + "epoch": 0.06406695285162872, + "grad_norm": 4.704316555413766, + "learning_rate": 3.9878469185218065e-06, + "loss": 1.0636, + "step": 888 + }, + { + "epoch": 0.06413910032105624, + "grad_norm": 4.095570632617962, + "learning_rate": 3.98779542035636e-06, + "loss": 0.9373, + "step": 889 + }, + { + "epoch": 0.06421124779048375, + "grad_norm": 3.9842601086376437, + "learning_rate": 3.987743813644765e-06, + "loss": 1.1849, + "step": 890 + }, + { + "epoch": 0.06428339525991125, + "grad_norm": 2.092169549478983, + "learning_rate": 3.987692098389839e-06, + "loss": 0.9319, + "step": 891 + }, + { + "epoch": 0.06435554272933877, + "grad_norm": 3.726352901682284, + "learning_rate": 3.987640274594405e-06, + "loss": 0.9848, + "step": 892 + }, + { + "epoch": 0.06442769019876628, + "grad_norm": 3.511751881274468, + "learning_rate": 3.9875883422612944e-06, + "loss": 1.0944, + "step": 893 + }, + { + "epoch": 0.06449983766819378, + "grad_norm": 5.22054620771708, + "learning_rate": 3.9875363013933416e-06, + "loss": 0.9337, + "step": 894 + }, + { + "epoch": 0.0645719851376213, + "grad_norm": 3.326920268904407, + "learning_rate": 3.987484151993389e-06, + "loss": 0.9544, + "step": 895 + }, + { + "epoch": 0.06464413260704881, + "grad_norm": 3.3118098277719916, + "learning_rate": 3.987431894064285e-06, + "loss": 1.0213, + "step": 896 + }, + { + "epoch": 0.06471628007647631, + "grad_norm": 4.187884412742571, + "learning_rate": 3.987379527608881e-06, + "loss": 0.9978, + "step": 897 + }, + { + "epoch": 0.06478842754590383, + "grad_norm": 2.3894854054648853, + "learning_rate": 3.987327052630039e-06, + "loss": 0.9356, + "step": 898 + }, + { + "epoch": 0.06486057501533134, + "grad_norm": 3.0635613918263918, + "learning_rate": 3.987274469130624e-06, + "loss": 0.9518, + "step": 899 + }, + { + "epoch": 0.06493272248475884, + "grad_norm": 3.839744031043995, + "learning_rate": 3.987221777113507e-06, + "loss": 0.9763, + "step": 900 + }, + { + "epoch": 0.06500486995418636, + "grad_norm": 2.30877209795841, + "learning_rate": 3.987168976581564e-06, + "loss": 0.9256, + "step": 901 + }, + { + "epoch": 0.06507701742361387, + "grad_norm": 4.282475873764498, + "learning_rate": 3.98711606753768e-06, + "loss": 0.9703, + "step": 902 + }, + { + "epoch": 0.06514916489304137, + "grad_norm": 3.0577790599119226, + "learning_rate": 3.987063049984743e-06, + "loss": 1.0081, + "step": 903 + }, + { + "epoch": 0.06522131236246889, + "grad_norm": 2.6354949597777484, + "learning_rate": 3.987009923925649e-06, + "loss": 0.9129, + "step": 904 + }, + { + "epoch": 0.0652934598318964, + "grad_norm": 4.289682812468982, + "learning_rate": 3.9869566893633e-06, + "loss": 0.9118, + "step": 905 + }, + { + "epoch": 0.0653656073013239, + "grad_norm": 6.166599616983548, + "learning_rate": 3.9869033463006e-06, + "loss": 0.9642, + "step": 906 + }, + { + "epoch": 0.06543775477075142, + "grad_norm": 2.1255953459674735, + "learning_rate": 3.986849894740464e-06, + "loss": 0.8353, + "step": 907 + }, + { + "epoch": 0.06550990224017893, + "grad_norm": 4.0772358008978395, + "learning_rate": 3.9867963346858105e-06, + "loss": 0.9792, + "step": 908 + }, + { + "epoch": 0.06558204970960643, + "grad_norm": 2.7348485046216364, + "learning_rate": 3.986742666139565e-06, + "loss": 0.8341, + "step": 909 + }, + { + "epoch": 0.06565419717903395, + "grad_norm": 2.74021479843662, + "learning_rate": 3.986688889104655e-06, + "loss": 0.967, + "step": 910 + }, + { + "epoch": 0.06572634464846146, + "grad_norm": 4.496565249698908, + "learning_rate": 3.986635003584021e-06, + "loss": 1.0194, + "step": 911 + }, + { + "epoch": 0.06579849211788896, + "grad_norm": 3.7165271943841174, + "learning_rate": 3.986581009580603e-06, + "loss": 1.006, + "step": 912 + }, + { + "epoch": 0.06587063958731647, + "grad_norm": 3.069930113453604, + "learning_rate": 3.98652690709735e-06, + "loss": 1.0017, + "step": 913 + }, + { + "epoch": 0.06594278705674399, + "grad_norm": 2.73582960402945, + "learning_rate": 3.986472696137217e-06, + "loss": 1.1252, + "step": 914 + }, + { + "epoch": 0.06601493452617149, + "grad_norm": 3.4035795050153372, + "learning_rate": 3.986418376703163e-06, + "loss": 1.0424, + "step": 915 + }, + { + "epoch": 0.066087081995599, + "grad_norm": 2.481790794033438, + "learning_rate": 3.9863639487981555e-06, + "loss": 1.0095, + "step": 916 + }, + { + "epoch": 0.06615922946502652, + "grad_norm": 3.607510832560792, + "learning_rate": 3.986309412425166e-06, + "loss": 1.0165, + "step": 917 + }, + { + "epoch": 0.06623137693445402, + "grad_norm": 2.7175979804349577, + "learning_rate": 3.986254767587173e-06, + "loss": 0.9789, + "step": 918 + }, + { + "epoch": 0.06630352440388153, + "grad_norm": 3.388073501744414, + "learning_rate": 3.986200014287159e-06, + "loss": 1.0141, + "step": 919 + }, + { + "epoch": 0.06637567187330905, + "grad_norm": 2.740139449114231, + "learning_rate": 3.986145152528115e-06, + "loss": 0.9768, + "step": 920 + }, + { + "epoch": 0.06644781934273655, + "grad_norm": 3.948250279216603, + "learning_rate": 3.986090182313037e-06, + "loss": 0.9362, + "step": 921 + }, + { + "epoch": 0.06651996681216406, + "grad_norm": 3.862730228019182, + "learning_rate": 3.986035103644926e-06, + "loss": 1.0403, + "step": 922 + }, + { + "epoch": 0.06659211428159158, + "grad_norm": 2.5100166878012526, + "learning_rate": 3.985979916526791e-06, + "loss": 1.0785, + "step": 923 + }, + { + "epoch": 0.06666426175101908, + "grad_norm": 3.445496015786378, + "learning_rate": 3.985924620961644e-06, + "loss": 1.1035, + "step": 924 + }, + { + "epoch": 0.06673640922044659, + "grad_norm": 0.7169661984556426, + "learning_rate": 3.985869216952505e-06, + "loss": 0.7841, + "step": 925 + }, + { + "epoch": 0.06680855668987411, + "grad_norm": 3.8030581265465297, + "learning_rate": 3.9858137045023996e-06, + "loss": 1.0312, + "step": 926 + }, + { + "epoch": 0.06688070415930161, + "grad_norm": 4.8152642176181395, + "learning_rate": 3.9857580836143595e-06, + "loss": 0.9968, + "step": 927 + }, + { + "epoch": 0.06695285162872912, + "grad_norm": 4.560301368525697, + "learning_rate": 3.9857023542914205e-06, + "loss": 0.9711, + "step": 928 + }, + { + "epoch": 0.06702499909815664, + "grad_norm": 2.321399472651615, + "learning_rate": 3.985646516536628e-06, + "loss": 1.028, + "step": 929 + }, + { + "epoch": 0.06709714656758414, + "grad_norm": 3.20235556688476, + "learning_rate": 3.985590570353029e-06, + "loss": 1.0211, + "step": 930 + }, + { + "epoch": 0.06716929403701165, + "grad_norm": 2.932428893180875, + "learning_rate": 3.985534515743679e-06, + "loss": 0.9119, + "step": 931 + }, + { + "epoch": 0.06724144150643917, + "grad_norm": 3.586869527995723, + "learning_rate": 3.98547835271164e-06, + "loss": 1.0912, + "step": 932 + }, + { + "epoch": 0.06731358897586667, + "grad_norm": 3.7078061675649807, + "learning_rate": 3.985422081259978e-06, + "loss": 1.015, + "step": 933 + }, + { + "epoch": 0.06738573644529418, + "grad_norm": 4.439223693477546, + "learning_rate": 3.985365701391766e-06, + "loss": 1.0522, + "step": 934 + }, + { + "epoch": 0.0674578839147217, + "grad_norm": 2.645285562196718, + "learning_rate": 3.985309213110082e-06, + "loss": 1.0016, + "step": 935 + }, + { + "epoch": 0.0675300313841492, + "grad_norm": 3.2223671153439057, + "learning_rate": 3.985252616418012e-06, + "loss": 0.9777, + "step": 936 + }, + { + "epoch": 0.06760217885357671, + "grad_norm": 2.7213506427845506, + "learning_rate": 3.985195911318646e-06, + "loss": 0.9488, + "step": 937 + }, + { + "epoch": 0.06767432632300423, + "grad_norm": 4.596656761979425, + "learning_rate": 3.9851390978150795e-06, + "loss": 0.9359, + "step": 938 + }, + { + "epoch": 0.06774647379243173, + "grad_norm": 3.0313220457973307, + "learning_rate": 3.985082175910416e-06, + "loss": 1.034, + "step": 939 + }, + { + "epoch": 0.06781862126185924, + "grad_norm": 4.143076505631842, + "learning_rate": 3.985025145607763e-06, + "loss": 1.0801, + "step": 940 + }, + { + "epoch": 0.06789076873128674, + "grad_norm": 3.8521596820885544, + "learning_rate": 3.984968006910235e-06, + "loss": 0.8677, + "step": 941 + }, + { + "epoch": 0.06796291620071426, + "grad_norm": 3.5049414492953064, + "learning_rate": 3.984910759820953e-06, + "loss": 0.8867, + "step": 942 + }, + { + "epoch": 0.06803506367014177, + "grad_norm": 6.733705420961706, + "learning_rate": 3.984853404343042e-06, + "loss": 0.9308, + "step": 943 + }, + { + "epoch": 0.06810721113956927, + "grad_norm": 3.8904216667263825, + "learning_rate": 3.984795940479634e-06, + "loss": 0.8805, + "step": 944 + }, + { + "epoch": 0.06817935860899679, + "grad_norm": 4.289613226325857, + "learning_rate": 3.984738368233867e-06, + "loss": 1.0092, + "step": 945 + }, + { + "epoch": 0.0682515060784243, + "grad_norm": 2.787205695109894, + "learning_rate": 3.9846806876088855e-06, + "loss": 1.036, + "step": 946 + }, + { + "epoch": 0.0683236535478518, + "grad_norm": 4.412450229266823, + "learning_rate": 3.984622898607838e-06, + "loss": 1.0841, + "step": 947 + }, + { + "epoch": 0.06839580101727932, + "grad_norm": 2.521412134059062, + "learning_rate": 3.984565001233881e-06, + "loss": 1.033, + "step": 948 + }, + { + "epoch": 0.06846794848670683, + "grad_norm": 2.6018695821895816, + "learning_rate": 3.984506995490177e-06, + "loss": 1.0138, + "step": 949 + }, + { + "epoch": 0.06854009595613433, + "grad_norm": 0.7953499056370413, + "learning_rate": 3.984448881379891e-06, + "loss": 0.8076, + "step": 950 + }, + { + "epoch": 0.06861224342556185, + "grad_norm": 2.4202861865726852, + "learning_rate": 3.984390658906198e-06, + "loss": 1.0352, + "step": 951 + }, + { + "epoch": 0.06868439089498936, + "grad_norm": 4.238843073721614, + "learning_rate": 3.984332328072278e-06, + "loss": 1.0671, + "step": 952 + }, + { + "epoch": 0.06875653836441686, + "grad_norm": 4.467788272687119, + "learning_rate": 3.984273888881314e-06, + "loss": 0.9623, + "step": 953 + }, + { + "epoch": 0.06882868583384438, + "grad_norm": 4.179679970422541, + "learning_rate": 3.9842153413364995e-06, + "loss": 1.0535, + "step": 954 + }, + { + "epoch": 0.06890083330327189, + "grad_norm": 8.96926974076581, + "learning_rate": 3.984156685441031e-06, + "loss": 0.9147, + "step": 955 + }, + { + "epoch": 0.0689729807726994, + "grad_norm": 4.205009860323164, + "learning_rate": 3.984097921198109e-06, + "loss": 0.9216, + "step": 956 + }, + { + "epoch": 0.06904512824212691, + "grad_norm": 6.866272814421267, + "learning_rate": 3.984039048610946e-06, + "loss": 0.9415, + "step": 957 + }, + { + "epoch": 0.06911727571155442, + "grad_norm": 18.63257828731205, + "learning_rate": 3.983980067682755e-06, + "loss": 0.9692, + "step": 958 + }, + { + "epoch": 0.06918942318098192, + "grad_norm": 2.995094102708967, + "learning_rate": 3.983920978416757e-06, + "loss": 0.902, + "step": 959 + }, + { + "epoch": 0.06926157065040944, + "grad_norm": 3.026046218912418, + "learning_rate": 3.983861780816178e-06, + "loss": 0.9835, + "step": 960 + }, + { + "epoch": 0.06933371811983695, + "grad_norm": 3.911683111255105, + "learning_rate": 3.983802474884252e-06, + "loss": 1.0376, + "step": 961 + }, + { + "epoch": 0.06940586558926445, + "grad_norm": 4.772274593667393, + "learning_rate": 3.9837430606242154e-06, + "loss": 1.024, + "step": 962 + }, + { + "epoch": 0.06947801305869197, + "grad_norm": 3.1333546157547008, + "learning_rate": 3.983683538039314e-06, + "loss": 1.014, + "step": 963 + }, + { + "epoch": 0.06955016052811948, + "grad_norm": 4.57733772461041, + "learning_rate": 3.983623907132799e-06, + "loss": 1.0987, + "step": 964 + }, + { + "epoch": 0.06962230799754698, + "grad_norm": 3.1851699390917205, + "learning_rate": 3.983564167907925e-06, + "loss": 0.924, + "step": 965 + }, + { + "epoch": 0.06969445546697449, + "grad_norm": 0.8414549166207121, + "learning_rate": 3.983504320367955e-06, + "loss": 0.8476, + "step": 966 + }, + { + "epoch": 0.06976660293640201, + "grad_norm": 5.386371559865938, + "learning_rate": 3.983444364516156e-06, + "loss": 1.0143, + "step": 967 + }, + { + "epoch": 0.06983875040582951, + "grad_norm": 0.7452228516218533, + "learning_rate": 3.983384300355803e-06, + "loss": 0.8602, + "step": 968 + }, + { + "epoch": 0.06991089787525702, + "grad_norm": 0.8632366117095984, + "learning_rate": 3.983324127890176e-06, + "loss": 0.8659, + "step": 969 + }, + { + "epoch": 0.06998304534468454, + "grad_norm": 5.415020482243822, + "learning_rate": 3.98326384712256e-06, + "loss": 0.9943, + "step": 970 + }, + { + "epoch": 0.07005519281411204, + "grad_norm": 3.1379895136155307, + "learning_rate": 3.983203458056247e-06, + "loss": 0.9437, + "step": 971 + }, + { + "epoch": 0.07012734028353955, + "grad_norm": 3.199640080953488, + "learning_rate": 3.983142960694535e-06, + "loss": 0.9183, + "step": 972 + }, + { + "epoch": 0.07019948775296707, + "grad_norm": 0.6991794724327953, + "learning_rate": 3.9830823550407275e-06, + "loss": 0.8202, + "step": 973 + }, + { + "epoch": 0.07027163522239457, + "grad_norm": 4.320351571153007, + "learning_rate": 3.983021641098134e-06, + "loss": 0.9637, + "step": 974 + }, + { + "epoch": 0.07034378269182208, + "grad_norm": 3.0608785579783944, + "learning_rate": 3.982960818870069e-06, + "loss": 1.0023, + "step": 975 + }, + { + "epoch": 0.0704159301612496, + "grad_norm": 3.14211029306914, + "learning_rate": 3.982899888359854e-06, + "loss": 1.0331, + "step": 976 + }, + { + "epoch": 0.0704880776306771, + "grad_norm": 3.4807589117789686, + "learning_rate": 3.982838849570817e-06, + "loss": 1.0105, + "step": 977 + }, + { + "epoch": 0.07056022510010461, + "grad_norm": 4.862358134340468, + "learning_rate": 3.982777702506291e-06, + "loss": 0.8567, + "step": 978 + }, + { + "epoch": 0.07063237256953213, + "grad_norm": 3.5760276011142116, + "learning_rate": 3.982716447169614e-06, + "loss": 0.9656, + "step": 979 + }, + { + "epoch": 0.07070452003895963, + "grad_norm": 3.8076903225375607, + "learning_rate": 3.982655083564131e-06, + "loss": 0.9367, + "step": 980 + }, + { + "epoch": 0.07077666750838714, + "grad_norm": 2.502939689339409, + "learning_rate": 3.982593611693195e-06, + "loss": 1.0438, + "step": 981 + }, + { + "epoch": 0.07084881497781466, + "grad_norm": 6.331285078548399, + "learning_rate": 3.98253203156016e-06, + "loss": 0.9537, + "step": 982 + }, + { + "epoch": 0.07092096244724216, + "grad_norm": 3.563126759859061, + "learning_rate": 3.98247034316839e-06, + "loss": 1.0455, + "step": 983 + }, + { + "epoch": 0.07099310991666967, + "grad_norm": 3.9641127039249207, + "learning_rate": 3.982408546521254e-06, + "loss": 0.9133, + "step": 984 + }, + { + "epoch": 0.07106525738609719, + "grad_norm": 3.5506281552185746, + "learning_rate": 3.9823466416221254e-06, + "loss": 0.9925, + "step": 985 + }, + { + "epoch": 0.0711374048555247, + "grad_norm": 3.914483266664339, + "learning_rate": 3.9822846284743856e-06, + "loss": 1.0058, + "step": 986 + }, + { + "epoch": 0.0712095523249522, + "grad_norm": 3.111687954906619, + "learning_rate": 3.9822225070814195e-06, + "loss": 1.1021, + "step": 987 + }, + { + "epoch": 0.07128169979437972, + "grad_norm": 4.294958068500863, + "learning_rate": 3.982160277446621e-06, + "loss": 1.0147, + "step": 988 + }, + { + "epoch": 0.07135384726380722, + "grad_norm": 4.193723693856112, + "learning_rate": 3.982097939573388e-06, + "loss": 1.0134, + "step": 989 + }, + { + "epoch": 0.07142599473323473, + "grad_norm": 2.21273665805803, + "learning_rate": 3.982035493465123e-06, + "loss": 0.9743, + "step": 990 + }, + { + "epoch": 0.07149814220266223, + "grad_norm": 3.0021602482222463, + "learning_rate": 3.981972939125237e-06, + "loss": 1.0601, + "step": 991 + }, + { + "epoch": 0.07157028967208975, + "grad_norm": 4.241928963835735, + "learning_rate": 3.981910276557146e-06, + "loss": 0.9165, + "step": 992 + }, + { + "epoch": 0.07164243714151726, + "grad_norm": 4.612178247492773, + "learning_rate": 3.9818475057642715e-06, + "loss": 0.895, + "step": 993 + }, + { + "epoch": 0.07171458461094476, + "grad_norm": 3.0954371004092995, + "learning_rate": 3.981784626750041e-06, + "loss": 1.0228, + "step": 994 + }, + { + "epoch": 0.07178673208037228, + "grad_norm": 6.597668102879377, + "learning_rate": 3.981721639517889e-06, + "loss": 0.9904, + "step": 995 + }, + { + "epoch": 0.07185887954979979, + "grad_norm": 4.994991273309151, + "learning_rate": 3.981658544071254e-06, + "loss": 0.974, + "step": 996 + }, + { + "epoch": 0.0719310270192273, + "grad_norm": 4.244331394196811, + "learning_rate": 3.981595340413582e-06, + "loss": 1.1089, + "step": 997 + }, + { + "epoch": 0.07200317448865481, + "grad_norm": 11.14315891206577, + "learning_rate": 3.981532028548324e-06, + "loss": 0.9918, + "step": 998 + }, + { + "epoch": 0.07207532195808232, + "grad_norm": 3.5716968081423754, + "learning_rate": 3.9814686084789365e-06, + "loss": 0.9438, + "step": 999 + }, + { + "epoch": 0.07214746942750982, + "grad_norm": 4.076133270885117, + "learning_rate": 3.981405080208885e-06, + "loss": 0.9512, + "step": 1000 + }, + { + "epoch": 0.07221961689693734, + "grad_norm": 1.941054928343869, + "learning_rate": 3.981341443741636e-06, + "loss": 1.0282, + "step": 1001 + }, + { + "epoch": 0.07229176436636485, + "grad_norm": 3.5613239421780167, + "learning_rate": 3.981277699080665e-06, + "loss": 0.8629, + "step": 1002 + }, + { + "epoch": 0.07236391183579235, + "grad_norm": 4.20390749316152, + "learning_rate": 3.981213846229454e-06, + "loss": 1.0544, + "step": 1003 + }, + { + "epoch": 0.07243605930521987, + "grad_norm": 3.701668497148382, + "learning_rate": 3.981149885191489e-06, + "loss": 0.9432, + "step": 1004 + }, + { + "epoch": 0.07250820677464738, + "grad_norm": 7.894565074457475, + "learning_rate": 3.9810858159702625e-06, + "loss": 1.0318, + "step": 1005 + }, + { + "epoch": 0.07258035424407489, + "grad_norm": 4.65653181823326, + "learning_rate": 3.981021638569273e-06, + "loss": 1.0506, + "step": 1006 + }, + { + "epoch": 0.0726525017135024, + "grad_norm": 3.8188874564897133, + "learning_rate": 3.9809573529920255e-06, + "loss": 1.0109, + "step": 1007 + }, + { + "epoch": 0.07272464918292991, + "grad_norm": 4.917213000100208, + "learning_rate": 3.980892959242031e-06, + "loss": 1.082, + "step": 1008 + }, + { + "epoch": 0.07279679665235742, + "grad_norm": 5.328286624370017, + "learning_rate": 3.9808284573228045e-06, + "loss": 1.0899, + "step": 1009 + }, + { + "epoch": 0.07286894412178493, + "grad_norm": 4.090592426027558, + "learning_rate": 3.980763847237869e-06, + "loss": 0.876, + "step": 1010 + }, + { + "epoch": 0.07294109159121244, + "grad_norm": 4.084000487548918, + "learning_rate": 3.980699128990752e-06, + "loss": 1.0687, + "step": 1011 + }, + { + "epoch": 0.07301323906063995, + "grad_norm": 4.067112343760922, + "learning_rate": 3.980634302584987e-06, + "loss": 1.0507, + "step": 1012 + }, + { + "epoch": 0.07308538653006746, + "grad_norm": 3.352056293524692, + "learning_rate": 3.980569368024116e-06, + "loss": 1.0226, + "step": 1013 + }, + { + "epoch": 0.07315753399949497, + "grad_norm": 3.7294466705621216, + "learning_rate": 3.980504325311683e-06, + "loss": 1.0381, + "step": 1014 + }, + { + "epoch": 0.07322968146892248, + "grad_norm": 2.7030005453929853, + "learning_rate": 3.980439174451241e-06, + "loss": 0.9418, + "step": 1015 + }, + { + "epoch": 0.07330182893834998, + "grad_norm": 4.744727169509954, + "learning_rate": 3.980373915446346e-06, + "loss": 1.103, + "step": 1016 + }, + { + "epoch": 0.0733739764077775, + "grad_norm": 2.9255493235116234, + "learning_rate": 3.980308548300564e-06, + "loss": 0.9583, + "step": 1017 + }, + { + "epoch": 0.073446123877205, + "grad_norm": 0.8866850052517005, + "learning_rate": 3.980243073017462e-06, + "loss": 0.7143, + "step": 1018 + }, + { + "epoch": 0.07351827134663251, + "grad_norm": 2.981571978250201, + "learning_rate": 3.980177489600616e-06, + "loss": 0.9358, + "step": 1019 + }, + { + "epoch": 0.07359041881606003, + "grad_norm": 0.8581127607079819, + "learning_rate": 3.980111798053609e-06, + "loss": 0.8425, + "step": 1020 + }, + { + "epoch": 0.07366256628548754, + "grad_norm": 3.3189966085013785, + "learning_rate": 3.980045998380026e-06, + "loss": 1.0022, + "step": 1021 + }, + { + "epoch": 0.07373471375491504, + "grad_norm": 11.083129957431714, + "learning_rate": 3.9799800905834604e-06, + "loss": 0.9317, + "step": 1022 + }, + { + "epoch": 0.07380686122434256, + "grad_norm": 5.146115788212356, + "learning_rate": 3.979914074667513e-06, + "loss": 1.0073, + "step": 1023 + }, + { + "epoch": 0.07387900869377007, + "grad_norm": 2.9972887503384245, + "learning_rate": 3.979847950635786e-06, + "loss": 0.9825, + "step": 1024 + }, + { + "epoch": 0.07395115616319757, + "grad_norm": 5.68613761262485, + "learning_rate": 3.979781718491894e-06, + "loss": 0.9446, + "step": 1025 + }, + { + "epoch": 0.07402330363262509, + "grad_norm": 2.6750795655583537, + "learning_rate": 3.9797153782394485e-06, + "loss": 0.9873, + "step": 1026 + }, + { + "epoch": 0.0740954511020526, + "grad_norm": 3.009707639062611, + "learning_rate": 3.979648929882076e-06, + "loss": 1.0436, + "step": 1027 + }, + { + "epoch": 0.0741675985714801, + "grad_norm": 5.204210322827904, + "learning_rate": 3.979582373423404e-06, + "loss": 1.017, + "step": 1028 + }, + { + "epoch": 0.07423974604090762, + "grad_norm": 2.9677043478713494, + "learning_rate": 3.9795157088670665e-06, + "loss": 0.961, + "step": 1029 + }, + { + "epoch": 0.07431189351033513, + "grad_norm": 2.617905338968552, + "learning_rate": 3.979448936216703e-06, + "loss": 1.0288, + "step": 1030 + }, + { + "epoch": 0.07438404097976263, + "grad_norm": 3.252165293144416, + "learning_rate": 3.979382055475962e-06, + "loss": 0.9128, + "step": 1031 + }, + { + "epoch": 0.07445618844919015, + "grad_norm": 6.390403837000536, + "learning_rate": 3.979315066648495e-06, + "loss": 1.0131, + "step": 1032 + }, + { + "epoch": 0.07452833591861766, + "grad_norm": 6.653969817413864, + "learning_rate": 3.979247969737958e-06, + "loss": 0.9263, + "step": 1033 + }, + { + "epoch": 0.07460048338804516, + "grad_norm": 3.930152213373941, + "learning_rate": 3.979180764748017e-06, + "loss": 1.0279, + "step": 1034 + }, + { + "epoch": 0.07467263085747268, + "grad_norm": 6.447833018050491, + "learning_rate": 3.979113451682341e-06, + "loss": 0.9389, + "step": 1035 + }, + { + "epoch": 0.07474477832690019, + "grad_norm": 9.193525142240903, + "learning_rate": 3.9790460305446064e-06, + "loss": 0.9565, + "step": 1036 + }, + { + "epoch": 0.07481692579632769, + "grad_norm": 6.093234152845781, + "learning_rate": 3.978978501338493e-06, + "loss": 1.0246, + "step": 1037 + }, + { + "epoch": 0.07488907326575521, + "grad_norm": 3.972722745955896, + "learning_rate": 3.97891086406769e-06, + "loss": 0.9884, + "step": 1038 + }, + { + "epoch": 0.07496122073518272, + "grad_norm": 2.2731210809589424, + "learning_rate": 3.978843118735891e-06, + "loss": 0.9171, + "step": 1039 + }, + { + "epoch": 0.07503336820461022, + "grad_norm": 4.299971141274776, + "learning_rate": 3.978775265346794e-06, + "loss": 0.8932, + "step": 1040 + }, + { + "epoch": 0.07510551567403774, + "grad_norm": 3.550144386041667, + "learning_rate": 3.978707303904106e-06, + "loss": 1.0094, + "step": 1041 + }, + { + "epoch": 0.07517766314346525, + "grad_norm": 2.565949699866241, + "learning_rate": 3.978639234411536e-06, + "loss": 0.9785, + "step": 1042 + }, + { + "epoch": 0.07524981061289275, + "grad_norm": 2.9128340108024187, + "learning_rate": 3.978571056872803e-06, + "loss": 1.0175, + "step": 1043 + }, + { + "epoch": 0.07532195808232026, + "grad_norm": 4.100232996949376, + "learning_rate": 3.978502771291628e-06, + "loss": 1.0179, + "step": 1044 + }, + { + "epoch": 0.07539410555174778, + "grad_norm": 3.0817541425746278, + "learning_rate": 3.978434377671741e-06, + "loss": 1.0284, + "step": 1045 + }, + { + "epoch": 0.07546625302117528, + "grad_norm": 8.375992901147788, + "learning_rate": 3.9783658760168765e-06, + "loss": 0.9259, + "step": 1046 + }, + { + "epoch": 0.07553840049060279, + "grad_norm": 3.536867113416379, + "learning_rate": 3.978297266330775e-06, + "loss": 0.9932, + "step": 1047 + }, + { + "epoch": 0.0756105479600303, + "grad_norm": 3.089277020165757, + "learning_rate": 3.978228548617184e-06, + "loss": 0.9461, + "step": 1048 + }, + { + "epoch": 0.07568269542945781, + "grad_norm": 7.030825996330291, + "learning_rate": 3.978159722879855e-06, + "loss": 1.001, + "step": 1049 + }, + { + "epoch": 0.07575484289888532, + "grad_norm": 4.15715647972845, + "learning_rate": 3.978090789122546e-06, + "loss": 0.9838, + "step": 1050 + }, + { + "epoch": 0.07582699036831284, + "grad_norm": 3.5901851055909675, + "learning_rate": 3.978021747349022e-06, + "loss": 0.9746, + "step": 1051 + }, + { + "epoch": 0.07589913783774034, + "grad_norm": 5.615287363880417, + "learning_rate": 3.977952597563052e-06, + "loss": 1.0007, + "step": 1052 + }, + { + "epoch": 0.07597128530716785, + "grad_norm": 5.241672086889783, + "learning_rate": 3.977883339768414e-06, + "loss": 1.0147, + "step": 1053 + }, + { + "epoch": 0.07604343277659537, + "grad_norm": 4.789565092662952, + "learning_rate": 3.977813973968888e-06, + "loss": 1.1053, + "step": 1054 + }, + { + "epoch": 0.07611558024602287, + "grad_norm": 3.0382361379517646, + "learning_rate": 3.977744500168262e-06, + "loss": 1.0208, + "step": 1055 + }, + { + "epoch": 0.07618772771545038, + "grad_norm": 7.372845092437524, + "learning_rate": 3.977674918370331e-06, + "loss": 0.9486, + "step": 1056 + }, + { + "epoch": 0.0762598751848779, + "grad_norm": 8.8757027361251, + "learning_rate": 3.977605228578894e-06, + "loss": 0.9416, + "step": 1057 + }, + { + "epoch": 0.0763320226543054, + "grad_norm": 6.222854256697457, + "learning_rate": 3.977535430797756e-06, + "loss": 0.9756, + "step": 1058 + }, + { + "epoch": 0.0764041701237329, + "grad_norm": 4.336601247760748, + "learning_rate": 3.9774655250307285e-06, + "loss": 1.1228, + "step": 1059 + }, + { + "epoch": 0.07647631759316043, + "grad_norm": 3.1724749218586075, + "learning_rate": 3.977395511281629e-06, + "loss": 0.9955, + "step": 1060 + }, + { + "epoch": 0.07654846506258793, + "grad_norm": 3.789151849381455, + "learning_rate": 3.97732538955428e-06, + "loss": 1.0739, + "step": 1061 + }, + { + "epoch": 0.07662061253201544, + "grad_norm": 4.094172099243345, + "learning_rate": 3.977255159852513e-06, + "loss": 1.0248, + "step": 1062 + }, + { + "epoch": 0.07669276000144296, + "grad_norm": 4.935552671766481, + "learning_rate": 3.9771848221801605e-06, + "loss": 0.8899, + "step": 1063 + }, + { + "epoch": 0.07676490747087046, + "grad_norm": 3.707039096450139, + "learning_rate": 3.9771143765410635e-06, + "loss": 0.9116, + "step": 1064 + }, + { + "epoch": 0.07683705494029797, + "grad_norm": 4.222350667929402, + "learning_rate": 3.97704382293907e-06, + "loss": 1.0213, + "step": 1065 + }, + { + "epoch": 0.07690920240972549, + "grad_norm": 6.882877899448089, + "learning_rate": 3.976973161378032e-06, + "loss": 0.9769, + "step": 1066 + }, + { + "epoch": 0.07698134987915299, + "grad_norm": 3.8900619160645027, + "learning_rate": 3.9769023918618085e-06, + "loss": 1.0179, + "step": 1067 + }, + { + "epoch": 0.0770534973485805, + "grad_norm": 6.068878593614573, + "learning_rate": 3.976831514394263e-06, + "loss": 0.9564, + "step": 1068 + }, + { + "epoch": 0.077125644818008, + "grad_norm": 3.9267790631808097, + "learning_rate": 3.976760528979267e-06, + "loss": 0.9397, + "step": 1069 + }, + { + "epoch": 0.07719779228743552, + "grad_norm": 3.4994969687690585, + "learning_rate": 3.976689435620696e-06, + "loss": 0.9514, + "step": 1070 + }, + { + "epoch": 0.07726993975686303, + "grad_norm": 23.163028275815968, + "learning_rate": 3.9766182343224325e-06, + "loss": 0.9995, + "step": 1071 + }, + { + "epoch": 0.07734208722629053, + "grad_norm": 4.363972089235893, + "learning_rate": 3.976546925088364e-06, + "loss": 0.9894, + "step": 1072 + }, + { + "epoch": 0.07741423469571805, + "grad_norm": 0.9633832681038007, + "learning_rate": 3.976475507922385e-06, + "loss": 0.8374, + "step": 1073 + }, + { + "epoch": 0.07748638216514556, + "grad_norm": 6.36566795518153, + "learning_rate": 3.976403982828396e-06, + "loss": 1.0798, + "step": 1074 + }, + { + "epoch": 0.07755852963457306, + "grad_norm": 4.013247487448016, + "learning_rate": 3.9763323498103e-06, + "loss": 0.9749, + "step": 1075 + }, + { + "epoch": 0.07763067710400058, + "grad_norm": 4.131482665742891, + "learning_rate": 3.9762606088720126e-06, + "loss": 0.9696, + "step": 1076 + }, + { + "epoch": 0.07770282457342809, + "grad_norm": 3.324118338120663, + "learning_rate": 3.976188760017448e-06, + "loss": 0.9899, + "step": 1077 + }, + { + "epoch": 0.07777497204285559, + "grad_norm": 2.208239847279732, + "learning_rate": 3.976116803250531e-06, + "loss": 1.0498, + "step": 1078 + }, + { + "epoch": 0.07784711951228311, + "grad_norm": 8.487506885553863, + "learning_rate": 3.976044738575191e-06, + "loss": 1.0919, + "step": 1079 + }, + { + "epoch": 0.07791926698171062, + "grad_norm": 3.877527212690615, + "learning_rate": 3.975972565995363e-06, + "loss": 0.9939, + "step": 1080 + }, + { + "epoch": 0.07799141445113812, + "grad_norm": 4.214846691452287, + "learning_rate": 3.975900285514988e-06, + "loss": 1.026, + "step": 1081 + }, + { + "epoch": 0.07806356192056564, + "grad_norm": 3.448435503289734, + "learning_rate": 3.975827897138012e-06, + "loss": 0.9146, + "step": 1082 + }, + { + "epoch": 0.07813570938999315, + "grad_norm": 3.583232449434746, + "learning_rate": 3.97575540086839e-06, + "loss": 0.9645, + "step": 1083 + }, + { + "epoch": 0.07820785685942065, + "grad_norm": 4.31667181380467, + "learning_rate": 3.975682796710079e-06, + "loss": 0.9693, + "step": 1084 + }, + { + "epoch": 0.07828000432884817, + "grad_norm": 20.492955928215462, + "learning_rate": 3.975610084667044e-06, + "loss": 0.8698, + "step": 1085 + }, + { + "epoch": 0.07835215179827568, + "grad_norm": 4.819397974049807, + "learning_rate": 3.975537264743255e-06, + "loss": 1.0724, + "step": 1086 + }, + { + "epoch": 0.07842429926770318, + "grad_norm": 3.5087912368661507, + "learning_rate": 3.975464336942691e-06, + "loss": 0.9991, + "step": 1087 + }, + { + "epoch": 0.0784964467371307, + "grad_norm": 5.013556413320378, + "learning_rate": 3.975391301269332e-06, + "loss": 0.8969, + "step": 1088 + }, + { + "epoch": 0.0785685942065582, + "grad_norm": 2.549588215060568, + "learning_rate": 3.975318157727166e-06, + "loss": 1.0288, + "step": 1089 + }, + { + "epoch": 0.07864074167598571, + "grad_norm": 3.1732149347555647, + "learning_rate": 3.975244906320188e-06, + "loss": 0.8826, + "step": 1090 + }, + { + "epoch": 0.07871288914541323, + "grad_norm": 2.55674935758804, + "learning_rate": 3.975171547052398e-06, + "loss": 0.8935, + "step": 1091 + }, + { + "epoch": 0.07878503661484074, + "grad_norm": 9.943731309644203, + "learning_rate": 3.975098079927801e-06, + "loss": 0.8948, + "step": 1092 + }, + { + "epoch": 0.07885718408426824, + "grad_norm": 3.573956055706695, + "learning_rate": 3.97502450495041e-06, + "loss": 0.973, + "step": 1093 + }, + { + "epoch": 0.07892933155369575, + "grad_norm": 8.332566442170359, + "learning_rate": 3.974950822124242e-06, + "loss": 0.8586, + "step": 1094 + }, + { + "epoch": 0.07900147902312327, + "grad_norm": 4.07117275878044, + "learning_rate": 3.974877031453321e-06, + "loss": 1.034, + "step": 1095 + }, + { + "epoch": 0.07907362649255077, + "grad_norm": 0.9625765150793753, + "learning_rate": 3.974803132941676e-06, + "loss": 0.8357, + "step": 1096 + }, + { + "epoch": 0.07914577396197828, + "grad_norm": 4.220777010502651, + "learning_rate": 3.974729126593342e-06, + "loss": 1.0172, + "step": 1097 + }, + { + "epoch": 0.0792179214314058, + "grad_norm": 8.670918619841503, + "learning_rate": 3.97465501241236e-06, + "loss": 0.976, + "step": 1098 + }, + { + "epoch": 0.0792900689008333, + "grad_norm": 4.236478276139759, + "learning_rate": 3.974580790402779e-06, + "loss": 0.9853, + "step": 1099 + }, + { + "epoch": 0.07936221637026081, + "grad_norm": 3.9246802764711894, + "learning_rate": 3.97450646056865e-06, + "loss": 1.1044, + "step": 1100 + }, + { + "epoch": 0.07943436383968833, + "grad_norm": 13.494322536632072, + "learning_rate": 3.974432022914032e-06, + "loss": 0.9866, + "step": 1101 + }, + { + "epoch": 0.07950651130911583, + "grad_norm": 5.2930889270835655, + "learning_rate": 3.974357477442991e-06, + "loss": 1.0293, + "step": 1102 + }, + { + "epoch": 0.07957865877854334, + "grad_norm": 5.101282646600062, + "learning_rate": 3.974282824159597e-06, + "loss": 1.0232, + "step": 1103 + }, + { + "epoch": 0.07965080624797086, + "grad_norm": 7.777161531406925, + "learning_rate": 3.974208063067926e-06, + "loss": 0.9534, + "step": 1104 + }, + { + "epoch": 0.07972295371739836, + "grad_norm": 3.4229409982966144, + "learning_rate": 3.974133194172062e-06, + "loss": 1.0576, + "step": 1105 + }, + { + "epoch": 0.07979510118682587, + "grad_norm": 7.200619342445129, + "learning_rate": 3.974058217476091e-06, + "loss": 1.0361, + "step": 1106 + }, + { + "epoch": 0.07986724865625339, + "grad_norm": 4.334674823185, + "learning_rate": 3.97398313298411e-06, + "loss": 0.9226, + "step": 1107 + }, + { + "epoch": 0.07993939612568089, + "grad_norm": 3.7937844339281748, + "learning_rate": 3.973907940700216e-06, + "loss": 0.9693, + "step": 1108 + }, + { + "epoch": 0.0800115435951084, + "grad_norm": 3.088591310939462, + "learning_rate": 3.973832640628517e-06, + "loss": 0.9849, + "step": 1109 + }, + { + "epoch": 0.08008369106453592, + "grad_norm": 4.995770954742719, + "learning_rate": 3.973757232773124e-06, + "loss": 0.962, + "step": 1110 + }, + { + "epoch": 0.08015583853396342, + "grad_norm": 3.677169035146074, + "learning_rate": 3.9736817171381555e-06, + "loss": 0.9289, + "step": 1111 + }, + { + "epoch": 0.08022798600339093, + "grad_norm": 0.7602458701079572, + "learning_rate": 3.973606093727735e-06, + "loss": 0.7577, + "step": 1112 + }, + { + "epoch": 0.08030013347281845, + "grad_norm": 5.200980629522378, + "learning_rate": 3.973530362545992e-06, + "loss": 1.095, + "step": 1113 + }, + { + "epoch": 0.08037228094224595, + "grad_norm": 3.5244524861960635, + "learning_rate": 3.973454523597061e-06, + "loss": 0.9989, + "step": 1114 + }, + { + "epoch": 0.08044442841167346, + "grad_norm": 0.8193068379814632, + "learning_rate": 3.973378576885084e-06, + "loss": 0.8436, + "step": 1115 + }, + { + "epoch": 0.08051657588110098, + "grad_norm": 4.774395386130121, + "learning_rate": 3.973302522414209e-06, + "loss": 0.9353, + "step": 1116 + }, + { + "epoch": 0.08058872335052848, + "grad_norm": 3.8792285612408404, + "learning_rate": 3.973226360188587e-06, + "loss": 0.9122, + "step": 1117 + }, + { + "epoch": 0.08066087081995599, + "grad_norm": 5.632786257361286, + "learning_rate": 3.973150090212379e-06, + "loss": 1.0697, + "step": 1118 + }, + { + "epoch": 0.08073301828938349, + "grad_norm": 6.387594311161626, + "learning_rate": 3.973073712489748e-06, + "loss": 0.9917, + "step": 1119 + }, + { + "epoch": 0.08080516575881101, + "grad_norm": 5.195867018397986, + "learning_rate": 3.9729972270248665e-06, + "loss": 1.1223, + "step": 1120 + }, + { + "epoch": 0.08087731322823852, + "grad_norm": 5.576836103170942, + "learning_rate": 3.972920633821909e-06, + "loss": 1.0129, + "step": 1121 + }, + { + "epoch": 0.08094946069766602, + "grad_norm": 2.7690982883707758, + "learning_rate": 3.9728439328850605e-06, + "loss": 0.9179, + "step": 1122 + }, + { + "epoch": 0.08102160816709354, + "grad_norm": 4.16193866782434, + "learning_rate": 3.972767124218508e-06, + "loss": 0.9381, + "step": 1123 + }, + { + "epoch": 0.08109375563652105, + "grad_norm": 3.89508940313763, + "learning_rate": 3.972690207826446e-06, + "loss": 1.0507, + "step": 1124 + }, + { + "epoch": 0.08116590310594855, + "grad_norm": 2.837433055474493, + "learning_rate": 3.972613183713073e-06, + "loss": 0.9954, + "step": 1125 + }, + { + "epoch": 0.08123805057537607, + "grad_norm": 3.2792880050784596, + "learning_rate": 3.972536051882597e-06, + "loss": 0.9737, + "step": 1126 + }, + { + "epoch": 0.08131019804480358, + "grad_norm": 2.6102816896687924, + "learning_rate": 3.97245881233923e-06, + "loss": 0.8721, + "step": 1127 + }, + { + "epoch": 0.08138234551423108, + "grad_norm": 7.908764827467777, + "learning_rate": 3.972381465087189e-06, + "loss": 1.0363, + "step": 1128 + }, + { + "epoch": 0.0814544929836586, + "grad_norm": 4.723512179143667, + "learning_rate": 3.972304010130696e-06, + "loss": 0.9574, + "step": 1129 + }, + { + "epoch": 0.08152664045308611, + "grad_norm": 4.001712909152204, + "learning_rate": 3.972226447473984e-06, + "loss": 1.0379, + "step": 1130 + }, + { + "epoch": 0.08159878792251361, + "grad_norm": 3.0039954600090333, + "learning_rate": 3.972148777121286e-06, + "loss": 0.9596, + "step": 1131 + }, + { + "epoch": 0.08167093539194113, + "grad_norm": 12.723401018268648, + "learning_rate": 3.972070999076844e-06, + "loss": 0.9932, + "step": 1132 + }, + { + "epoch": 0.08174308286136864, + "grad_norm": 2.7907554980412406, + "learning_rate": 3.971993113344905e-06, + "loss": 1.0077, + "step": 1133 + }, + { + "epoch": 0.08181523033079614, + "grad_norm": 2.836176081938713, + "learning_rate": 3.971915119929722e-06, + "loss": 0.976, + "step": 1134 + }, + { + "epoch": 0.08188737780022366, + "grad_norm": 2.946870346308378, + "learning_rate": 3.971837018835555e-06, + "loss": 1.0273, + "step": 1135 + }, + { + "epoch": 0.08195952526965117, + "grad_norm": 4.0012760511156245, + "learning_rate": 3.9717588100666675e-06, + "loss": 1.0884, + "step": 1136 + }, + { + "epoch": 0.08203167273907867, + "grad_norm": 3.633393534473658, + "learning_rate": 3.97168049362733e-06, + "loss": 0.8567, + "step": 1137 + }, + { + "epoch": 0.08210382020850619, + "grad_norm": 2.9389185625327743, + "learning_rate": 3.97160206952182e-06, + "loss": 0.989, + "step": 1138 + }, + { + "epoch": 0.0821759676779337, + "grad_norm": 3.717109254107312, + "learning_rate": 3.971523537754419e-06, + "loss": 1.1772, + "step": 1139 + }, + { + "epoch": 0.0822481151473612, + "grad_norm": 3.035193169368042, + "learning_rate": 3.971444898329417e-06, + "loss": 0.9435, + "step": 1140 + }, + { + "epoch": 0.08232026261678872, + "grad_norm": 3.2128835011319197, + "learning_rate": 3.971366151251107e-06, + "loss": 1.0009, + "step": 1141 + }, + { + "epoch": 0.08239241008621623, + "grad_norm": 4.1506056929571375, + "learning_rate": 3.971287296523788e-06, + "loss": 1.0267, + "step": 1142 + }, + { + "epoch": 0.08246455755564373, + "grad_norm": 2.799680953922447, + "learning_rate": 3.971208334151768e-06, + "loss": 1.0408, + "step": 1143 + }, + { + "epoch": 0.08253670502507125, + "grad_norm": 2.998046238802864, + "learning_rate": 3.971129264139358e-06, + "loss": 0.9459, + "step": 1144 + }, + { + "epoch": 0.08260885249449876, + "grad_norm": 2.9847066804806563, + "learning_rate": 3.971050086490876e-06, + "loss": 1.0131, + "step": 1145 + }, + { + "epoch": 0.08268099996392626, + "grad_norm": 3.906153929482681, + "learning_rate": 3.970970801210645e-06, + "loss": 1.0281, + "step": 1146 + }, + { + "epoch": 0.08275314743335377, + "grad_norm": 3.6039616667966583, + "learning_rate": 3.970891408302995e-06, + "loss": 1.0016, + "step": 1147 + }, + { + "epoch": 0.08282529490278129, + "grad_norm": 4.059930311459931, + "learning_rate": 3.970811907772262e-06, + "loss": 0.9145, + "step": 1148 + }, + { + "epoch": 0.0828974423722088, + "grad_norm": 3.4679853266633276, + "learning_rate": 3.9707322996227855e-06, + "loss": 0.9357, + "step": 1149 + }, + { + "epoch": 0.0829695898416363, + "grad_norm": 7.09423933988158, + "learning_rate": 3.970652583858913e-06, + "loss": 1.0408, + "step": 1150 + }, + { + "epoch": 0.08304173731106382, + "grad_norm": 3.470447296863564, + "learning_rate": 3.970572760484999e-06, + "loss": 1.0294, + "step": 1151 + }, + { + "epoch": 0.08311388478049132, + "grad_norm": 3.5977294119189565, + "learning_rate": 3.970492829505401e-06, + "loss": 1.0133, + "step": 1152 + }, + { + "epoch": 0.08318603224991883, + "grad_norm": 3.1939425714593357, + "learning_rate": 3.970412790924484e-06, + "loss": 0.8948, + "step": 1153 + }, + { + "epoch": 0.08325817971934635, + "grad_norm": 3.3815288266559804, + "learning_rate": 3.970332644746619e-06, + "loss": 0.9203, + "step": 1154 + }, + { + "epoch": 0.08333032718877385, + "grad_norm": 4.7150502637690055, + "learning_rate": 3.970252390976182e-06, + "loss": 1.0668, + "step": 1155 + }, + { + "epoch": 0.08340247465820136, + "grad_norm": 11.99841616032905, + "learning_rate": 3.970172029617556e-06, + "loss": 1.0448, + "step": 1156 + }, + { + "epoch": 0.08347462212762888, + "grad_norm": 3.353068406906433, + "learning_rate": 3.970091560675128e-06, + "loss": 0.937, + "step": 1157 + }, + { + "epoch": 0.08354676959705638, + "grad_norm": 2.7332534669822532, + "learning_rate": 3.970010984153293e-06, + "loss": 1.0417, + "step": 1158 + }, + { + "epoch": 0.08361891706648389, + "grad_norm": 0.8242742872284308, + "learning_rate": 3.9699303000564515e-06, + "loss": 0.7923, + "step": 1159 + }, + { + "epoch": 0.08369106453591141, + "grad_norm": 3.5579671297576017, + "learning_rate": 3.969849508389008e-06, + "loss": 0.948, + "step": 1160 + }, + { + "epoch": 0.08376321200533891, + "grad_norm": 6.008018539725193, + "learning_rate": 3.969768609155376e-06, + "loss": 0.8752, + "step": 1161 + }, + { + "epoch": 0.08383535947476642, + "grad_norm": 3.450831000824648, + "learning_rate": 3.969687602359971e-06, + "loss": 1.0309, + "step": 1162 + }, + { + "epoch": 0.08390750694419394, + "grad_norm": 9.700301049663436, + "learning_rate": 3.969606488007218e-06, + "loss": 0.9201, + "step": 1163 + }, + { + "epoch": 0.08397965441362144, + "grad_norm": 4.228861805618683, + "learning_rate": 3.9695252661015455e-06, + "loss": 1.0923, + "step": 1164 + }, + { + "epoch": 0.08405180188304895, + "grad_norm": 3.678950987240264, + "learning_rate": 3.9694439366473904e-06, + "loss": 0.9917, + "step": 1165 + }, + { + "epoch": 0.08412394935247647, + "grad_norm": 4.145594195397103, + "learning_rate": 3.9693624996491915e-06, + "loss": 1.0195, + "step": 1166 + }, + { + "epoch": 0.08419609682190397, + "grad_norm": 7.487347675487165, + "learning_rate": 3.9692809551113974e-06, + "loss": 0.9407, + "step": 1167 + }, + { + "epoch": 0.08426824429133148, + "grad_norm": 4.511709238017571, + "learning_rate": 3.969199303038459e-06, + "loss": 0.9544, + "step": 1168 + }, + { + "epoch": 0.084340391760759, + "grad_norm": 3.4814034008697416, + "learning_rate": 3.969117543434838e-06, + "loss": 1.0697, + "step": 1169 + }, + { + "epoch": 0.0844125392301865, + "grad_norm": 14.271743515207495, + "learning_rate": 3.969035676304997e-06, + "loss": 1.0897, + "step": 1170 + }, + { + "epoch": 0.08448468669961401, + "grad_norm": 3.901224428317358, + "learning_rate": 3.968953701653406e-06, + "loss": 1.0506, + "step": 1171 + }, + { + "epoch": 0.08455683416904151, + "grad_norm": 4.347423070804978, + "learning_rate": 3.968871619484543e-06, + "loss": 0.9653, + "step": 1172 + }, + { + "epoch": 0.08462898163846903, + "grad_norm": 3.361635946556768, + "learning_rate": 3.9687894298028895e-06, + "loss": 1.0428, + "step": 1173 + }, + { + "epoch": 0.08470112910789654, + "grad_norm": 4.010202033834811, + "learning_rate": 3.968707132612933e-06, + "loss": 1.045, + "step": 1174 + }, + { + "epoch": 0.08477327657732404, + "grad_norm": 5.41562417586684, + "learning_rate": 3.968624727919169e-06, + "loss": 1.0235, + "step": 1175 + }, + { + "epoch": 0.08484542404675156, + "grad_norm": 4.308222321507237, + "learning_rate": 3.968542215726095e-06, + "loss": 0.9102, + "step": 1176 + }, + { + "epoch": 0.08491757151617907, + "grad_norm": 4.949195434226354, + "learning_rate": 3.968459596038218e-06, + "loss": 1.0407, + "step": 1177 + }, + { + "epoch": 0.08498971898560657, + "grad_norm": 2.942302916740121, + "learning_rate": 3.968376868860049e-06, + "loss": 0.9556, + "step": 1178 + }, + { + "epoch": 0.0850618664550341, + "grad_norm": 5.847117060875726, + "learning_rate": 3.968294034196107e-06, + "loss": 0.9882, + "step": 1179 + }, + { + "epoch": 0.0851340139244616, + "grad_norm": 4.313270168362757, + "learning_rate": 3.968211092050914e-06, + "loss": 0.9228, + "step": 1180 + }, + { + "epoch": 0.0852061613938891, + "grad_norm": 2.9715659188397225, + "learning_rate": 3.968128042428998e-06, + "loss": 0.9299, + "step": 1181 + }, + { + "epoch": 0.08527830886331662, + "grad_norm": 3.70939562437499, + "learning_rate": 3.9680448853348965e-06, + "loss": 0.9755, + "step": 1182 + }, + { + "epoch": 0.08535045633274413, + "grad_norm": 3.7471151063422408, + "learning_rate": 3.967961620773149e-06, + "loss": 1.0637, + "step": 1183 + }, + { + "epoch": 0.08542260380217163, + "grad_norm": 5.5830898896089876, + "learning_rate": 3.967878248748302e-06, + "loss": 1.0056, + "step": 1184 + }, + { + "epoch": 0.08549475127159915, + "grad_norm": 3.363381343331283, + "learning_rate": 3.96779476926491e-06, + "loss": 0.8543, + "step": 1185 + }, + { + "epoch": 0.08556689874102666, + "grad_norm": 2.9397339138626077, + "learning_rate": 3.967711182327529e-06, + "loss": 1.0327, + "step": 1186 + }, + { + "epoch": 0.08563904621045416, + "grad_norm": 3.197908349704593, + "learning_rate": 3.967627487940725e-06, + "loss": 0.9392, + "step": 1187 + }, + { + "epoch": 0.08571119367988168, + "grad_norm": 3.567980783273925, + "learning_rate": 3.967543686109068e-06, + "loss": 0.98, + "step": 1188 + }, + { + "epoch": 0.08578334114930919, + "grad_norm": 5.995788685745298, + "learning_rate": 3.967459776837134e-06, + "loss": 0.9858, + "step": 1189 + }, + { + "epoch": 0.0858554886187367, + "grad_norm": 4.9725360001785095, + "learning_rate": 3.967375760129504e-06, + "loss": 0.9529, + "step": 1190 + }, + { + "epoch": 0.08592763608816421, + "grad_norm": 3.942627610758061, + "learning_rate": 3.967291635990768e-06, + "loss": 0.9265, + "step": 1191 + }, + { + "epoch": 0.08599978355759172, + "grad_norm": 2.5543181826117984, + "learning_rate": 3.967207404425518e-06, + "loss": 1.0017, + "step": 1192 + }, + { + "epoch": 0.08607193102701922, + "grad_norm": 3.110392801679123, + "learning_rate": 3.967123065438353e-06, + "loss": 1.0645, + "step": 1193 + }, + { + "epoch": 0.08614407849644674, + "grad_norm": 3.1891081997831803, + "learning_rate": 3.96703861903388e-06, + "loss": 0.9209, + "step": 1194 + }, + { + "epoch": 0.08621622596587425, + "grad_norm": 1.1785547742451328, + "learning_rate": 3.96695406521671e-06, + "loss": 0.9804, + "step": 1195 + }, + { + "epoch": 0.08628837343530175, + "grad_norm": 7.634989214316834, + "learning_rate": 3.96686940399146e-06, + "loss": 0.9692, + "step": 1196 + }, + { + "epoch": 0.08636052090472926, + "grad_norm": 5.86286126754093, + "learning_rate": 3.966784635362753e-06, + "loss": 1.0064, + "step": 1197 + }, + { + "epoch": 0.08643266837415678, + "grad_norm": 0.7965048135826316, + "learning_rate": 3.966699759335218e-06, + "loss": 0.7876, + "step": 1198 + }, + { + "epoch": 0.08650481584358428, + "grad_norm": 3.0557743880390946, + "learning_rate": 3.966614775913489e-06, + "loss": 0.9444, + "step": 1199 + }, + { + "epoch": 0.08657696331301179, + "grad_norm": 2.4085445610613907, + "learning_rate": 3.966529685102207e-06, + "loss": 1.0158, + "step": 1200 + }, + { + "epoch": 0.08664911078243931, + "grad_norm": 3.4599674148348205, + "learning_rate": 3.966444486906019e-06, + "loss": 0.956, + "step": 1201 + }, + { + "epoch": 0.08672125825186681, + "grad_norm": 4.990677343964695, + "learning_rate": 3.966359181329578e-06, + "loss": 1.0332, + "step": 1202 + }, + { + "epoch": 0.08679340572129432, + "grad_norm": 4.726654909349748, + "learning_rate": 3.9662737683775405e-06, + "loss": 0.9893, + "step": 1203 + }, + { + "epoch": 0.08686555319072184, + "grad_norm": 7.992634720655763, + "learning_rate": 3.966188248054571e-06, + "loss": 1.0305, + "step": 1204 + }, + { + "epoch": 0.08693770066014934, + "grad_norm": 4.327923821686704, + "learning_rate": 3.96610262036534e-06, + "loss": 0.9996, + "step": 1205 + }, + { + "epoch": 0.08700984812957685, + "grad_norm": 2.586838026646573, + "learning_rate": 3.9660168853145245e-06, + "loss": 1.0284, + "step": 1206 + }, + { + "epoch": 0.08708199559900437, + "grad_norm": 3.3374490283974194, + "learning_rate": 3.965931042906804e-06, + "loss": 1.0117, + "step": 1207 + }, + { + "epoch": 0.08715414306843187, + "grad_norm": 1.2149477113845766, + "learning_rate": 3.965845093146866e-06, + "loss": 0.9063, + "step": 1208 + }, + { + "epoch": 0.08722629053785938, + "grad_norm": 4.200303956795005, + "learning_rate": 3.965759036039406e-06, + "loss": 0.9626, + "step": 1209 + }, + { + "epoch": 0.0872984380072869, + "grad_norm": 2.0930652922221236, + "learning_rate": 3.965672871589121e-06, + "loss": 0.9547, + "step": 1210 + }, + { + "epoch": 0.0873705854767144, + "grad_norm": 3.719557345931154, + "learning_rate": 3.965586599800718e-06, + "loss": 1.0911, + "step": 1211 + }, + { + "epoch": 0.08744273294614191, + "grad_norm": 4.273722246854809, + "learning_rate": 3.965500220678907e-06, + "loss": 0.9878, + "step": 1212 + }, + { + "epoch": 0.08751488041556943, + "grad_norm": 4.01571453306377, + "learning_rate": 3.965413734228405e-06, + "loss": 0.9539, + "step": 1213 + }, + { + "epoch": 0.08758702788499693, + "grad_norm": 3.5819540400280188, + "learning_rate": 3.965327140453934e-06, + "loss": 0.9262, + "step": 1214 + }, + { + "epoch": 0.08765917535442444, + "grad_norm": 2.405370477228185, + "learning_rate": 3.965240439360224e-06, + "loss": 1.0604, + "step": 1215 + }, + { + "epoch": 0.08773132282385196, + "grad_norm": 3.1431218939898775, + "learning_rate": 3.965153630952009e-06, + "loss": 1.0009, + "step": 1216 + }, + { + "epoch": 0.08780347029327946, + "grad_norm": 3.8065603295709862, + "learning_rate": 3.9650667152340284e-06, + "loss": 1.005, + "step": 1217 + }, + { + "epoch": 0.08787561776270697, + "grad_norm": 2.7755124349258864, + "learning_rate": 3.964979692211029e-06, + "loss": 1.0379, + "step": 1218 + }, + { + "epoch": 0.08794776523213449, + "grad_norm": 4.002024138431682, + "learning_rate": 3.964892561887762e-06, + "loss": 1.0481, + "step": 1219 + }, + { + "epoch": 0.088019912701562, + "grad_norm": 2.4424512412007964, + "learning_rate": 3.964805324268987e-06, + "loss": 0.9399, + "step": 1220 + }, + { + "epoch": 0.0880920601709895, + "grad_norm": 5.403949225920625, + "learning_rate": 3.964717979359466e-06, + "loss": 1.0396, + "step": 1221 + }, + { + "epoch": 0.088164207640417, + "grad_norm": 2.7370550482067513, + "learning_rate": 3.96463052716397e-06, + "loss": 0.9499, + "step": 1222 + }, + { + "epoch": 0.08823635510984452, + "grad_norm": 3.5831659115262755, + "learning_rate": 3.964542967687273e-06, + "loss": 1.0145, + "step": 1223 + }, + { + "epoch": 0.08830850257927203, + "grad_norm": 2.738173021471418, + "learning_rate": 3.964455300934158e-06, + "loss": 0.9467, + "step": 1224 + }, + { + "epoch": 0.08838065004869954, + "grad_norm": 3.380295308095741, + "learning_rate": 3.96436752690941e-06, + "loss": 0.94, + "step": 1225 + }, + { + "epoch": 0.08845279751812705, + "grad_norm": 2.864029228487527, + "learning_rate": 3.964279645617824e-06, + "loss": 0.8754, + "step": 1226 + }, + { + "epoch": 0.08852494498755456, + "grad_norm": 3.2851505891839605, + "learning_rate": 3.964191657064197e-06, + "loss": 0.9612, + "step": 1227 + }, + { + "epoch": 0.08859709245698207, + "grad_norm": 7.1052207127543285, + "learning_rate": 3.964103561253336e-06, + "loss": 1.1172, + "step": 1228 + }, + { + "epoch": 0.08866923992640958, + "grad_norm": 3.4150655918329003, + "learning_rate": 3.96401535819005e-06, + "loss": 1.0364, + "step": 1229 + }, + { + "epoch": 0.08874138739583709, + "grad_norm": 2.8616584040397157, + "learning_rate": 3.963927047879156e-06, + "loss": 0.9879, + "step": 1230 + }, + { + "epoch": 0.0888135348652646, + "grad_norm": 2.6046929298165793, + "learning_rate": 3.9638386303254764e-06, + "loss": 0.9632, + "step": 1231 + }, + { + "epoch": 0.08888568233469211, + "grad_norm": 2.6454486917766435, + "learning_rate": 3.963750105533838e-06, + "loss": 0.9134, + "step": 1232 + }, + { + "epoch": 0.08895782980411962, + "grad_norm": 2.736287608857565, + "learning_rate": 3.963661473509076e-06, + "loss": 0.9854, + "step": 1233 + }, + { + "epoch": 0.08902997727354713, + "grad_norm": 3.258180885618637, + "learning_rate": 3.963572734256031e-06, + "loss": 0.9907, + "step": 1234 + }, + { + "epoch": 0.08910212474297464, + "grad_norm": 3.243389155138483, + "learning_rate": 3.963483887779547e-06, + "loss": 0.9017, + "step": 1235 + }, + { + "epoch": 0.08917427221240215, + "grad_norm": 1.0179601966106262, + "learning_rate": 3.963394934084478e-06, + "loss": 0.8748, + "step": 1236 + }, + { + "epoch": 0.08924641968182966, + "grad_norm": 2.626050738801055, + "learning_rate": 3.963305873175678e-06, + "loss": 0.994, + "step": 1237 + }, + { + "epoch": 0.08931856715125717, + "grad_norm": 3.0744214699977466, + "learning_rate": 3.963216705058014e-06, + "loss": 1.0573, + "step": 1238 + }, + { + "epoch": 0.08939071462068468, + "grad_norm": 4.362318569662543, + "learning_rate": 3.9631274297363525e-06, + "loss": 1.0406, + "step": 1239 + }, + { + "epoch": 0.08946286209011219, + "grad_norm": 5.174233443388941, + "learning_rate": 3.963038047215569e-06, + "loss": 1.1133, + "step": 1240 + }, + { + "epoch": 0.0895350095595397, + "grad_norm": 4.106741986347772, + "learning_rate": 3.9629485575005444e-06, + "loss": 0.9521, + "step": 1241 + }, + { + "epoch": 0.08960715702896721, + "grad_norm": 4.555002191838262, + "learning_rate": 3.962858960596166e-06, + "loss": 1.0203, + "step": 1242 + }, + { + "epoch": 0.08967930449839472, + "grad_norm": 3.505431456736243, + "learning_rate": 3.962769256507326e-06, + "loss": 0.9931, + "step": 1243 + }, + { + "epoch": 0.08975145196782224, + "grad_norm": 3.1802390180286753, + "learning_rate": 3.9626794452389235e-06, + "loss": 1.0645, + "step": 1244 + }, + { + "epoch": 0.08982359943724974, + "grad_norm": 4.524094607995585, + "learning_rate": 3.9625895267958615e-06, + "loss": 0.9756, + "step": 1245 + }, + { + "epoch": 0.08989574690667725, + "grad_norm": 5.932093678676834, + "learning_rate": 3.962499501183051e-06, + "loss": 0.9124, + "step": 1246 + }, + { + "epoch": 0.08996789437610477, + "grad_norm": 3.581816256029274, + "learning_rate": 3.9624093684054065e-06, + "loss": 0.9468, + "step": 1247 + }, + { + "epoch": 0.09004004184553227, + "grad_norm": 5.855849851048722, + "learning_rate": 3.962319128467852e-06, + "loss": 0.9964, + "step": 1248 + }, + { + "epoch": 0.09011218931495978, + "grad_norm": 3.6016604726709582, + "learning_rate": 3.9622287813753144e-06, + "loss": 1.0467, + "step": 1249 + }, + { + "epoch": 0.09018433678438728, + "grad_norm": 2.380309344150995, + "learning_rate": 3.962138327132726e-06, + "loss": 0.9548, + "step": 1250 + }, + { + "epoch": 0.0902564842538148, + "grad_norm": 3.068527673565135, + "learning_rate": 3.962047765745028e-06, + "loss": 0.9794, + "step": 1251 + }, + { + "epoch": 0.0903286317232423, + "grad_norm": 2.809811557546541, + "learning_rate": 3.961957097217165e-06, + "loss": 0.9433, + "step": 1252 + }, + { + "epoch": 0.09040077919266981, + "grad_norm": 4.448347560668772, + "learning_rate": 3.961866321554087e-06, + "loss": 0.972, + "step": 1253 + }, + { + "epoch": 0.09047292666209733, + "grad_norm": 3.161401302990078, + "learning_rate": 3.9617754387607524e-06, + "loss": 1.0177, + "step": 1254 + }, + { + "epoch": 0.09054507413152484, + "grad_norm": 4.5643010177372805, + "learning_rate": 3.961684448842123e-06, + "loss": 1.0158, + "step": 1255 + }, + { + "epoch": 0.09061722160095234, + "grad_norm": 4.892797992814166, + "learning_rate": 3.961593351803167e-06, + "loss": 1.0059, + "step": 1256 + }, + { + "epoch": 0.09068936907037986, + "grad_norm": 2.7301984006450457, + "learning_rate": 3.9615021476488615e-06, + "loss": 0.8885, + "step": 1257 + }, + { + "epoch": 0.09076151653980737, + "grad_norm": 4.762448712570894, + "learning_rate": 3.9614108363841835e-06, + "loss": 0.9755, + "step": 1258 + }, + { + "epoch": 0.09083366400923487, + "grad_norm": 2.1709988150854755, + "learning_rate": 3.961319418014121e-06, + "loss": 1.0218, + "step": 1259 + }, + { + "epoch": 0.09090581147866239, + "grad_norm": 3.7571374996754225, + "learning_rate": 3.961227892543666e-06, + "loss": 0.937, + "step": 1260 + }, + { + "epoch": 0.0909779589480899, + "grad_norm": 4.086139156113629, + "learning_rate": 3.961136259977815e-06, + "loss": 0.8587, + "step": 1261 + }, + { + "epoch": 0.0910501064175174, + "grad_norm": 4.257494433360736, + "learning_rate": 3.961044520321573e-06, + "loss": 1.021, + "step": 1262 + }, + { + "epoch": 0.09112225388694492, + "grad_norm": 3.873067804800357, + "learning_rate": 3.96095267357995e-06, + "loss": 1.0317, + "step": 1263 + }, + { + "epoch": 0.09119440135637243, + "grad_norm": 3.6759257597291106, + "learning_rate": 3.960860719757961e-06, + "loss": 1.0102, + "step": 1264 + }, + { + "epoch": 0.09126654882579993, + "grad_norm": 3.952483475690993, + "learning_rate": 3.960768658860626e-06, + "loss": 1.0671, + "step": 1265 + }, + { + "epoch": 0.09133869629522745, + "grad_norm": 3.7312152854544176, + "learning_rate": 3.960676490892973e-06, + "loss": 1.0089, + "step": 1266 + }, + { + "epoch": 0.09141084376465496, + "grad_norm": 3.415987707147211, + "learning_rate": 3.960584215860036e-06, + "loss": 1.0296, + "step": 1267 + }, + { + "epoch": 0.09148299123408246, + "grad_norm": 3.832947974254261, + "learning_rate": 3.960491833766852e-06, + "loss": 0.9945, + "step": 1268 + }, + { + "epoch": 0.09155513870350998, + "grad_norm": 3.389785311041992, + "learning_rate": 3.960399344618467e-06, + "loss": 1.0263, + "step": 1269 + }, + { + "epoch": 0.09162728617293749, + "grad_norm": 3.5757325015538366, + "learning_rate": 3.96030674841993e-06, + "loss": 1.0608, + "step": 1270 + }, + { + "epoch": 0.09169943364236499, + "grad_norm": 3.1536795847709196, + "learning_rate": 3.960214045176299e-06, + "loss": 0.9904, + "step": 1271 + }, + { + "epoch": 0.09177158111179251, + "grad_norm": 1.108913540568257, + "learning_rate": 3.960121234892635e-06, + "loss": 0.8749, + "step": 1272 + }, + { + "epoch": 0.09184372858122002, + "grad_norm": 6.545964799123746, + "learning_rate": 3.960028317574007e-06, + "loss": 0.979, + "step": 1273 + }, + { + "epoch": 0.09191587605064752, + "grad_norm": 2.0565674742525624, + "learning_rate": 3.959935293225488e-06, + "loss": 1.0353, + "step": 1274 + }, + { + "epoch": 0.09198802352007503, + "grad_norm": 3.327162769175913, + "learning_rate": 3.959842161852158e-06, + "loss": 1.0915, + "step": 1275 + }, + { + "epoch": 0.09206017098950255, + "grad_norm": 3.3012016073866657, + "learning_rate": 3.959748923459103e-06, + "loss": 0.9929, + "step": 1276 + }, + { + "epoch": 0.09213231845893005, + "grad_norm": 2.6807501551521256, + "learning_rate": 3.959655578051413e-06, + "loss": 0.9108, + "step": 1277 + }, + { + "epoch": 0.09220446592835756, + "grad_norm": 2.780708517448168, + "learning_rate": 3.959562125634188e-06, + "loss": 0.8958, + "step": 1278 + }, + { + "epoch": 0.09227661339778508, + "grad_norm": 3.68788429456082, + "learning_rate": 3.959468566212529e-06, + "loss": 0.9477, + "step": 1279 + }, + { + "epoch": 0.09234876086721258, + "grad_norm": 3.683416400737987, + "learning_rate": 3.959374899791544e-06, + "loss": 0.9095, + "step": 1280 + }, + { + "epoch": 0.09242090833664009, + "grad_norm": 2.6692893919260836, + "learning_rate": 3.95928112637635e-06, + "loss": 1.0177, + "step": 1281 + }, + { + "epoch": 0.0924930558060676, + "grad_norm": 10.797547337401118, + "learning_rate": 3.959187245972066e-06, + "loss": 1.094, + "step": 1282 + }, + { + "epoch": 0.09256520327549511, + "grad_norm": 3.966581578236994, + "learning_rate": 3.95909325858382e-06, + "loss": 1.0161, + "step": 1283 + }, + { + "epoch": 0.09263735074492262, + "grad_norm": 2.830021680607327, + "learning_rate": 3.9589991642167435e-06, + "loss": 0.7972, + "step": 1284 + }, + { + "epoch": 0.09270949821435014, + "grad_norm": 4.02884099381174, + "learning_rate": 3.958904962875975e-06, + "loss": 1.0283, + "step": 1285 + }, + { + "epoch": 0.09278164568377764, + "grad_norm": 3.32152465236002, + "learning_rate": 3.958810654566657e-06, + "loss": 1.0134, + "step": 1286 + }, + { + "epoch": 0.09285379315320515, + "grad_norm": 4.268797424857237, + "learning_rate": 3.958716239293941e-06, + "loss": 1.0095, + "step": 1287 + }, + { + "epoch": 0.09292594062263267, + "grad_norm": 2.8879454694339066, + "learning_rate": 3.9586217170629816e-06, + "loss": 0.9473, + "step": 1288 + }, + { + "epoch": 0.09299808809206017, + "grad_norm": 4.395178337775341, + "learning_rate": 3.958527087878941e-06, + "loss": 0.9761, + "step": 1289 + }, + { + "epoch": 0.09307023556148768, + "grad_norm": 4.307316193824073, + "learning_rate": 3.958432351746986e-06, + "loss": 0.9669, + "step": 1290 + }, + { + "epoch": 0.0931423830309152, + "grad_norm": 2.75136922214888, + "learning_rate": 3.958337508672291e-06, + "loss": 0.9455, + "step": 1291 + }, + { + "epoch": 0.0932145305003427, + "grad_norm": 13.394168985696082, + "learning_rate": 3.9582425586600336e-06, + "loss": 0.9016, + "step": 1292 + }, + { + "epoch": 0.09328667796977021, + "grad_norm": 3.9218187062148226, + "learning_rate": 3.9581475017153995e-06, + "loss": 1.0019, + "step": 1293 + }, + { + "epoch": 0.09335882543919773, + "grad_norm": 3.0760260072062584, + "learning_rate": 3.958052337843579e-06, + "loss": 1.0241, + "step": 1294 + }, + { + "epoch": 0.09343097290862523, + "grad_norm": 4.8779315448257226, + "learning_rate": 3.957957067049769e-06, + "loss": 1.0311, + "step": 1295 + }, + { + "epoch": 0.09350312037805274, + "grad_norm": 3.31997011102298, + "learning_rate": 3.9578616893391716e-06, + "loss": 1.059, + "step": 1296 + }, + { + "epoch": 0.09357526784748026, + "grad_norm": 4.188035389849964, + "learning_rate": 3.957766204716995e-06, + "loss": 0.9685, + "step": 1297 + }, + { + "epoch": 0.09364741531690776, + "grad_norm": 4.240136087860754, + "learning_rate": 3.957670613188453e-06, + "loss": 0.9664, + "step": 1298 + }, + { + "epoch": 0.09371956278633527, + "grad_norm": 5.479394982902958, + "learning_rate": 3.9575749147587665e-06, + "loss": 0.8968, + "step": 1299 + }, + { + "epoch": 0.09379171025576277, + "grad_norm": 3.1037147414885284, + "learning_rate": 3.95747910943316e-06, + "loss": 0.9394, + "step": 1300 + }, + { + "epoch": 0.09386385772519029, + "grad_norm": 4.037621010304405, + "learning_rate": 3.957383197216867e-06, + "loss": 0.9988, + "step": 1301 + }, + { + "epoch": 0.0939360051946178, + "grad_norm": 3.3780136781712575, + "learning_rate": 3.957287178115122e-06, + "loss": 1.1274, + "step": 1302 + }, + { + "epoch": 0.0940081526640453, + "grad_norm": 2.6134037736073292, + "learning_rate": 3.957191052133171e-06, + "loss": 1.0055, + "step": 1303 + }, + { + "epoch": 0.09408030013347282, + "grad_norm": 3.248679699843506, + "learning_rate": 3.957094819276261e-06, + "loss": 0.9794, + "step": 1304 + }, + { + "epoch": 0.09415244760290033, + "grad_norm": 3.486909637251785, + "learning_rate": 3.956998479549649e-06, + "loss": 1.0367, + "step": 1305 + }, + { + "epoch": 0.09422459507232783, + "grad_norm": 0.8477147165693917, + "learning_rate": 3.956902032958593e-06, + "loss": 0.7994, + "step": 1306 + }, + { + "epoch": 0.09429674254175535, + "grad_norm": 3.53132251226193, + "learning_rate": 3.956805479508362e-06, + "loss": 1.0211, + "step": 1307 + }, + { + "epoch": 0.09436889001118286, + "grad_norm": 3.6529302931848666, + "learning_rate": 3.956708819204228e-06, + "loss": 0.9944, + "step": 1308 + }, + { + "epoch": 0.09444103748061036, + "grad_norm": 4.203172647993167, + "learning_rate": 3.956612052051468e-06, + "loss": 1.0691, + "step": 1309 + }, + { + "epoch": 0.09451318495003788, + "grad_norm": 4.723723966875538, + "learning_rate": 3.956515178055367e-06, + "loss": 1.0266, + "step": 1310 + }, + { + "epoch": 0.09458533241946539, + "grad_norm": 8.773627071081942, + "learning_rate": 3.956418197221215e-06, + "loss": 0.9449, + "step": 1311 + }, + { + "epoch": 0.09465747988889289, + "grad_norm": 3.785146927797473, + "learning_rate": 3.9563211095543085e-06, + "loss": 0.9481, + "step": 1312 + }, + { + "epoch": 0.09472962735832041, + "grad_norm": 5.167539686872784, + "learning_rate": 3.956223915059948e-06, + "loss": 1.056, + "step": 1313 + }, + { + "epoch": 0.09480177482774792, + "grad_norm": 4.987403834283483, + "learning_rate": 3.956126613743441e-06, + "loss": 0.8869, + "step": 1314 + }, + { + "epoch": 0.09487392229717542, + "grad_norm": 2.977168304391087, + "learning_rate": 3.956029205610101e-06, + "loss": 1.0495, + "step": 1315 + }, + { + "epoch": 0.09494606976660294, + "grad_norm": 3.1793827595062085, + "learning_rate": 3.955931690665247e-06, + "loss": 0.9403, + "step": 1316 + }, + { + "epoch": 0.09501821723603045, + "grad_norm": 3.1690810352291594, + "learning_rate": 3.955834068914203e-06, + "loss": 0.9858, + "step": 1317 + }, + { + "epoch": 0.09509036470545795, + "grad_norm": 3.520680230477957, + "learning_rate": 3.9557363403623024e-06, + "loss": 0.9406, + "step": 1318 + }, + { + "epoch": 0.09516251217488547, + "grad_norm": 2.937860669659311, + "learning_rate": 3.955638505014879e-06, + "loss": 1.035, + "step": 1319 + }, + { + "epoch": 0.09523465964431298, + "grad_norm": 4.0988025498344385, + "learning_rate": 3.955540562877277e-06, + "loss": 0.896, + "step": 1320 + }, + { + "epoch": 0.09530680711374048, + "grad_norm": 4.206575360120864, + "learning_rate": 3.955442513954843e-06, + "loss": 1.0878, + "step": 1321 + }, + { + "epoch": 0.095378954583168, + "grad_norm": 3.2144649092360553, + "learning_rate": 3.955344358252934e-06, + "loss": 1.0747, + "step": 1322 + }, + { + "epoch": 0.09545110205259551, + "grad_norm": 2.9688409088671435, + "learning_rate": 3.955246095776907e-06, + "loss": 0.8045, + "step": 1323 + }, + { + "epoch": 0.09552324952202301, + "grad_norm": 16.639953771673692, + "learning_rate": 3.955147726532128e-06, + "loss": 0.9054, + "step": 1324 + }, + { + "epoch": 0.09559539699145052, + "grad_norm": 4.9758668230640115, + "learning_rate": 3.95504925052397e-06, + "loss": 1.0085, + "step": 1325 + }, + { + "epoch": 0.09566754446087804, + "grad_norm": 3.0066965386146607, + "learning_rate": 3.95495066775781e-06, + "loss": 1.1375, + "step": 1326 + }, + { + "epoch": 0.09573969193030554, + "grad_norm": 4.376489658151299, + "learning_rate": 3.954851978239031e-06, + "loss": 0.9545, + "step": 1327 + }, + { + "epoch": 0.09581183939973305, + "grad_norm": 3.2160022451288537, + "learning_rate": 3.9547531819730224e-06, + "loss": 0.9651, + "step": 1328 + }, + { + "epoch": 0.09588398686916057, + "grad_norm": 4.519697735224984, + "learning_rate": 3.954654278965179e-06, + "loss": 1.0027, + "step": 1329 + }, + { + "epoch": 0.09595613433858807, + "grad_norm": 3.4240088929212362, + "learning_rate": 3.9545552692209e-06, + "loss": 0.9524, + "step": 1330 + }, + { + "epoch": 0.09602828180801558, + "grad_norm": 4.6631212163259725, + "learning_rate": 3.954456152745594e-06, + "loss": 0.8931, + "step": 1331 + }, + { + "epoch": 0.0961004292774431, + "grad_norm": 3.2914170923061246, + "learning_rate": 3.954356929544672e-06, + "loss": 1.1506, + "step": 1332 + }, + { + "epoch": 0.0961725767468706, + "grad_norm": 3.3701621809514637, + "learning_rate": 3.954257599623554e-06, + "loss": 1.0461, + "step": 1333 + }, + { + "epoch": 0.09624472421629811, + "grad_norm": 6.894057614751113, + "learning_rate": 3.954158162987662e-06, + "loss": 1.0044, + "step": 1334 + }, + { + "epoch": 0.09631687168572563, + "grad_norm": 6.822963693083007, + "learning_rate": 3.954058619642427e-06, + "loss": 0.8441, + "step": 1335 + }, + { + "epoch": 0.09638901915515313, + "grad_norm": 2.889438364966695, + "learning_rate": 3.953958969593285e-06, + "loss": 1.0551, + "step": 1336 + }, + { + "epoch": 0.09646116662458064, + "grad_norm": 3.912264534686743, + "learning_rate": 3.953859212845676e-06, + "loss": 0.9639, + "step": 1337 + }, + { + "epoch": 0.09653331409400816, + "grad_norm": 2.6866829095764366, + "learning_rate": 3.9537593494050494e-06, + "loss": 0.9813, + "step": 1338 + }, + { + "epoch": 0.09660546156343566, + "grad_norm": 5.225861050416641, + "learning_rate": 3.953659379276856e-06, + "loss": 0.9761, + "step": 1339 + }, + { + "epoch": 0.09667760903286317, + "grad_norm": 3.0187265525523297, + "learning_rate": 3.953559302466557e-06, + "loss": 0.8726, + "step": 1340 + }, + { + "epoch": 0.09674975650229069, + "grad_norm": 4.157817803013576, + "learning_rate": 3.953459118979617e-06, + "loss": 0.9896, + "step": 1341 + }, + { + "epoch": 0.09682190397171819, + "grad_norm": 2.8877785354299492, + "learning_rate": 3.9533588288215045e-06, + "loss": 1.0017, + "step": 1342 + }, + { + "epoch": 0.0968940514411457, + "grad_norm": 5.424631610791001, + "learning_rate": 3.9532584319976985e-06, + "loss": 0.9255, + "step": 1343 + }, + { + "epoch": 0.09696619891057322, + "grad_norm": 29.842625142413745, + "learning_rate": 3.953157928513679e-06, + "loss": 0.9778, + "step": 1344 + }, + { + "epoch": 0.09703834638000072, + "grad_norm": 3.692903471374409, + "learning_rate": 3.9530573183749365e-06, + "loss": 1.0255, + "step": 1345 + }, + { + "epoch": 0.09711049384942823, + "grad_norm": 3.1335711618891846, + "learning_rate": 3.952956601586964e-06, + "loss": 1.0073, + "step": 1346 + }, + { + "epoch": 0.09718264131885575, + "grad_norm": 3.5497484037405242, + "learning_rate": 3.9528557781552605e-06, + "loss": 0.9113, + "step": 1347 + }, + { + "epoch": 0.09725478878828325, + "grad_norm": 3.540048895460482, + "learning_rate": 3.952754848085332e-06, + "loss": 1.1251, + "step": 1348 + }, + { + "epoch": 0.09732693625771076, + "grad_norm": 17.948328238927033, + "learning_rate": 3.95265381138269e-06, + "loss": 0.9931, + "step": 1349 + }, + { + "epoch": 0.09739908372713828, + "grad_norm": 3.8811499492139814, + "learning_rate": 3.952552668052853e-06, + "loss": 0.9398, + "step": 1350 + }, + { + "epoch": 0.09747123119656578, + "grad_norm": 2.8739535873796065, + "learning_rate": 3.952451418101342e-06, + "loss": 0.9232, + "step": 1351 + }, + { + "epoch": 0.09754337866599329, + "grad_norm": 5.742513343752353, + "learning_rate": 3.952350061533687e-06, + "loss": 0.9391, + "step": 1352 + }, + { + "epoch": 0.0976155261354208, + "grad_norm": 5.338648273675149, + "learning_rate": 3.952248598355422e-06, + "loss": 0.9389, + "step": 1353 + }, + { + "epoch": 0.09768767360484831, + "grad_norm": 3.7579438311493867, + "learning_rate": 3.952147028572088e-06, + "loss": 0.9901, + "step": 1354 + }, + { + "epoch": 0.09775982107427582, + "grad_norm": 3.2764310782239203, + "learning_rate": 3.952045352189232e-06, + "loss": 1.0493, + "step": 1355 + }, + { + "epoch": 0.09783196854370332, + "grad_norm": 2.089190286452872, + "learning_rate": 3.9519435692124056e-06, + "loss": 0.9589, + "step": 1356 + }, + { + "epoch": 0.09790411601313084, + "grad_norm": 3.1300941146009915, + "learning_rate": 3.951841679647167e-06, + "loss": 1.0229, + "step": 1357 + }, + { + "epoch": 0.09797626348255835, + "grad_norm": 3.3324450898851445, + "learning_rate": 3.9517396834990784e-06, + "loss": 1.018, + "step": 1358 + }, + { + "epoch": 0.09804841095198585, + "grad_norm": 3.0974416574274635, + "learning_rate": 3.951637580773712e-06, + "loss": 0.9632, + "step": 1359 + }, + { + "epoch": 0.09812055842141337, + "grad_norm": 2.5172205065049704, + "learning_rate": 3.951535371476641e-06, + "loss": 1.028, + "step": 1360 + }, + { + "epoch": 0.09819270589084088, + "grad_norm": 2.985184005612264, + "learning_rate": 3.951433055613449e-06, + "loss": 1.025, + "step": 1361 + }, + { + "epoch": 0.09826485336026838, + "grad_norm": 3.381358902564781, + "learning_rate": 3.951330633189721e-06, + "loss": 0.9324, + "step": 1362 + }, + { + "epoch": 0.0983370008296959, + "grad_norm": 3.630492029183152, + "learning_rate": 3.951228104211051e-06, + "loss": 0.9187, + "step": 1363 + }, + { + "epoch": 0.09840914829912341, + "grad_norm": 4.186771841935132, + "learning_rate": 3.951125468683037e-06, + "loss": 0.928, + "step": 1364 + }, + { + "epoch": 0.09848129576855091, + "grad_norm": 2.9630618564561804, + "learning_rate": 3.951022726611284e-06, + "loss": 1.013, + "step": 1365 + }, + { + "epoch": 0.09855344323797843, + "grad_norm": 5.298497674454635, + "learning_rate": 3.950919878001403e-06, + "loss": 0.9801, + "step": 1366 + }, + { + "epoch": 0.09862559070740594, + "grad_norm": 4.092846020778779, + "learning_rate": 3.950816922859009e-06, + "loss": 0.9538, + "step": 1367 + }, + { + "epoch": 0.09869773817683344, + "grad_norm": 3.514451301475019, + "learning_rate": 3.950713861189724e-06, + "loss": 0.9562, + "step": 1368 + }, + { + "epoch": 0.09876988564626096, + "grad_norm": 3.938969655741529, + "learning_rate": 3.950610692999177e-06, + "loss": 0.9948, + "step": 1369 + }, + { + "epoch": 0.09884203311568847, + "grad_norm": 3.3662440843668286, + "learning_rate": 3.950507418293001e-06, + "loss": 1.0557, + "step": 1370 + }, + { + "epoch": 0.09891418058511597, + "grad_norm": 3.1779067843230067, + "learning_rate": 3.950404037076836e-06, + "loss": 0.9708, + "step": 1371 + }, + { + "epoch": 0.0989863280545435, + "grad_norm": 4.520059592605293, + "learning_rate": 3.950300549356326e-06, + "loss": 1.0891, + "step": 1372 + }, + { + "epoch": 0.099058475523971, + "grad_norm": 3.0869210136646665, + "learning_rate": 3.950196955137122e-06, + "loss": 0.9956, + "step": 1373 + }, + { + "epoch": 0.0991306229933985, + "grad_norm": 3.509220783653096, + "learning_rate": 3.950093254424883e-06, + "loss": 0.8419, + "step": 1374 + }, + { + "epoch": 0.09920277046282602, + "grad_norm": 3.3153373514387967, + "learning_rate": 3.94998944722527e-06, + "loss": 0.9851, + "step": 1375 + }, + { + "epoch": 0.09927491793225353, + "grad_norm": 4.343983156821404, + "learning_rate": 3.949885533543951e-06, + "loss": 0.9531, + "step": 1376 + }, + { + "epoch": 0.09934706540168103, + "grad_norm": 2.9041569874263318, + "learning_rate": 3.949781513386602e-06, + "loss": 1.0087, + "step": 1377 + }, + { + "epoch": 0.09941921287110854, + "grad_norm": 8.726409938096815, + "learning_rate": 3.949677386758902e-06, + "loss": 0.981, + "step": 1378 + }, + { + "epoch": 0.09949136034053606, + "grad_norm": 2.4634535748872963, + "learning_rate": 3.949573153666538e-06, + "loss": 0.8827, + "step": 1379 + }, + { + "epoch": 0.09956350780996356, + "grad_norm": 3.8660316049909076, + "learning_rate": 3.9494688141152005e-06, + "loss": 1.037, + "step": 1380 + }, + { + "epoch": 0.09963565527939107, + "grad_norm": 2.4096217126860995, + "learning_rate": 3.949364368110589e-06, + "loss": 0.9582, + "step": 1381 + }, + { + "epoch": 0.09970780274881859, + "grad_norm": 4.999915122265417, + "learning_rate": 3.949259815658404e-06, + "loss": 0.9353, + "step": 1382 + }, + { + "epoch": 0.0997799502182461, + "grad_norm": 2.9894056810148055, + "learning_rate": 3.949155156764357e-06, + "loss": 0.9348, + "step": 1383 + }, + { + "epoch": 0.0998520976876736, + "grad_norm": 3.45460085162778, + "learning_rate": 3.949050391434163e-06, + "loss": 0.9393, + "step": 1384 + }, + { + "epoch": 0.09992424515710112, + "grad_norm": 2.4555480070367306, + "learning_rate": 3.948945519673541e-06, + "loss": 0.867, + "step": 1385 + }, + { + "epoch": 0.09999639262652862, + "grad_norm": 3.277122008104956, + "learning_rate": 3.94884054148822e-06, + "loss": 0.9144, + "step": 1386 + }, + { + "epoch": 0.10006854009595613, + "grad_norm": 3.568856441159748, + "learning_rate": 3.948735456883931e-06, + "loss": 0.9756, + "step": 1387 + }, + { + "epoch": 0.10014068756538365, + "grad_norm": 3.2747578655159817, + "learning_rate": 3.948630265866414e-06, + "loss": 0.8415, + "step": 1388 + }, + { + "epoch": 0.10021283503481115, + "grad_norm": 2.135666164306245, + "learning_rate": 3.9485249684414095e-06, + "loss": 1.0358, + "step": 1389 + }, + { + "epoch": 0.10028498250423866, + "grad_norm": 2.4415810484299403, + "learning_rate": 3.948419564614671e-06, + "loss": 1.0197, + "step": 1390 + }, + { + "epoch": 0.10035712997366618, + "grad_norm": 3.3351862684674054, + "learning_rate": 3.948314054391953e-06, + "loss": 0.8902, + "step": 1391 + }, + { + "epoch": 0.10042927744309368, + "grad_norm": 3.315193952177121, + "learning_rate": 3.948208437779017e-06, + "loss": 0.9535, + "step": 1392 + }, + { + "epoch": 0.10050142491252119, + "grad_norm": 3.0765604603277743, + "learning_rate": 3.948102714781629e-06, + "loss": 0.9207, + "step": 1393 + }, + { + "epoch": 0.10057357238194871, + "grad_norm": 2.8635188858904135, + "learning_rate": 3.947996885405564e-06, + "loss": 0.8706, + "step": 1394 + }, + { + "epoch": 0.10064571985137621, + "grad_norm": 4.9751191014741085, + "learning_rate": 3.9478909496566005e-06, + "loss": 0.982, + "step": 1395 + }, + { + "epoch": 0.10071786732080372, + "grad_norm": 2.377296441767369, + "learning_rate": 3.947784907540523e-06, + "loss": 0.9852, + "step": 1396 + }, + { + "epoch": 0.10079001479023124, + "grad_norm": 2.864846419766703, + "learning_rate": 3.947678759063122e-06, + "loss": 1.0033, + "step": 1397 + }, + { + "epoch": 0.10086216225965874, + "grad_norm": 2.809400333888616, + "learning_rate": 3.947572504230194e-06, + "loss": 0.9611, + "step": 1398 + }, + { + "epoch": 0.10093430972908625, + "grad_norm": 4.163609463951986, + "learning_rate": 3.947466143047542e-06, + "loss": 1.0117, + "step": 1399 + }, + { + "epoch": 0.10100645719851377, + "grad_norm": 3.2905615065952687, + "learning_rate": 3.947359675520973e-06, + "loss": 1.0189, + "step": 1400 + }, + { + "epoch": 0.10107860466794127, + "grad_norm": 2.629894960575823, + "learning_rate": 3.9472531016563e-06, + "loss": 1.0807, + "step": 1401 + }, + { + "epoch": 0.10115075213736878, + "grad_norm": 2.3874284903317533, + "learning_rate": 3.947146421459344e-06, + "loss": 0.9932, + "step": 1402 + }, + { + "epoch": 0.10122289960679628, + "grad_norm": 2.3570518930231064, + "learning_rate": 3.947039634935931e-06, + "loss": 1.0384, + "step": 1403 + }, + { + "epoch": 0.1012950470762238, + "grad_norm": 2.5855240505725074, + "learning_rate": 3.94693274209189e-06, + "loss": 0.927, + "step": 1404 + }, + { + "epoch": 0.10136719454565131, + "grad_norm": 2.9818350483783145, + "learning_rate": 3.94682574293306e-06, + "loss": 0.8975, + "step": 1405 + }, + { + "epoch": 0.10143934201507881, + "grad_norm": 2.647557025039533, + "learning_rate": 3.946718637465282e-06, + "loss": 1.068, + "step": 1406 + }, + { + "epoch": 0.10151148948450633, + "grad_norm": 2.3266773203229696, + "learning_rate": 3.946611425694407e-06, + "loss": 0.9888, + "step": 1407 + }, + { + "epoch": 0.10158363695393384, + "grad_norm": 3.030534797532874, + "learning_rate": 3.946504107626288e-06, + "loss": 1.0254, + "step": 1408 + }, + { + "epoch": 0.10165578442336135, + "grad_norm": 2.783055266083429, + "learning_rate": 3.946396683266785e-06, + "loss": 1.079, + "step": 1409 + }, + { + "epoch": 0.10172793189278886, + "grad_norm": 2.6088187715598448, + "learning_rate": 3.946289152621764e-06, + "loss": 0.9685, + "step": 1410 + }, + { + "epoch": 0.10180007936221637, + "grad_norm": 3.1970717383561027, + "learning_rate": 3.9461815156970975e-06, + "loss": 0.9671, + "step": 1411 + }, + { + "epoch": 0.10187222683164388, + "grad_norm": 2.4472389311069103, + "learning_rate": 3.946073772498664e-06, + "loss": 1.0366, + "step": 1412 + }, + { + "epoch": 0.1019443743010714, + "grad_norm": 2.4090764673431586, + "learning_rate": 3.945965923032345e-06, + "loss": 1.1569, + "step": 1413 + }, + { + "epoch": 0.1020165217704989, + "grad_norm": 2.7383718870130207, + "learning_rate": 3.945857967304031e-06, + "loss": 0.9342, + "step": 1414 + }, + { + "epoch": 0.1020886692399264, + "grad_norm": 3.057286710372278, + "learning_rate": 3.945749905319616e-06, + "loss": 1.0376, + "step": 1415 + }, + { + "epoch": 0.10216081670935392, + "grad_norm": 0.8968635704561665, + "learning_rate": 3.9456417370850015e-06, + "loss": 0.8916, + "step": 1416 + }, + { + "epoch": 0.10223296417878143, + "grad_norm": 2.8396563566525006, + "learning_rate": 3.9455334626060955e-06, + "loss": 0.9273, + "step": 1417 + }, + { + "epoch": 0.10230511164820894, + "grad_norm": 3.2157349029236455, + "learning_rate": 3.945425081888809e-06, + "loss": 1.0001, + "step": 1418 + }, + { + "epoch": 0.10237725911763645, + "grad_norm": 2.304383959420409, + "learning_rate": 3.94531659493906e-06, + "loss": 0.9592, + "step": 1419 + }, + { + "epoch": 0.10244940658706396, + "grad_norm": 3.2970177497513253, + "learning_rate": 3.945208001762773e-06, + "loss": 0.9545, + "step": 1420 + }, + { + "epoch": 0.10252155405649147, + "grad_norm": 3.5601797746421453, + "learning_rate": 3.9450993023658775e-06, + "loss": 0.9467, + "step": 1421 + }, + { + "epoch": 0.10259370152591898, + "grad_norm": 2.971695733504986, + "learning_rate": 3.9449904967543105e-06, + "loss": 0.9486, + "step": 1422 + }, + { + "epoch": 0.10266584899534649, + "grad_norm": 2.9365559136257047, + "learning_rate": 3.9448815849340125e-06, + "loss": 0.8739, + "step": 1423 + }, + { + "epoch": 0.102737996464774, + "grad_norm": 2.4823987285936537, + "learning_rate": 3.944772566910931e-06, + "loss": 0.9351, + "step": 1424 + }, + { + "epoch": 0.10281014393420151, + "grad_norm": 3.3316059087035192, + "learning_rate": 3.944663442691018e-06, + "loss": 0.9287, + "step": 1425 + }, + { + "epoch": 0.10288229140362902, + "grad_norm": 0.7179088646421798, + "learning_rate": 3.944554212280234e-06, + "loss": 0.8027, + "step": 1426 + }, + { + "epoch": 0.10295443887305653, + "grad_norm": 2.5934765223449285, + "learning_rate": 3.9444448756845425e-06, + "loss": 1.0533, + "step": 1427 + }, + { + "epoch": 0.10302658634248403, + "grad_norm": 5.288842696745938, + "learning_rate": 3.944335432909915e-06, + "loss": 0.9995, + "step": 1428 + }, + { + "epoch": 0.10309873381191155, + "grad_norm": 2.3652165331116386, + "learning_rate": 3.944225883962327e-06, + "loss": 1.0016, + "step": 1429 + }, + { + "epoch": 0.10317088128133906, + "grad_norm": 2.7295506233961615, + "learning_rate": 3.944116228847761e-06, + "loss": 0.9492, + "step": 1430 + }, + { + "epoch": 0.10324302875076656, + "grad_norm": 3.7200692024767257, + "learning_rate": 3.9440064675722035e-06, + "loss": 0.9758, + "step": 1431 + }, + { + "epoch": 0.10331517622019408, + "grad_norm": 3.5149266884593966, + "learning_rate": 3.94389660014165e-06, + "loss": 0.9986, + "step": 1432 + }, + { + "epoch": 0.10338732368962159, + "grad_norm": 5.114290165575774, + "learning_rate": 3.9437866265621e-06, + "loss": 0.8861, + "step": 1433 + }, + { + "epoch": 0.10345947115904909, + "grad_norm": 2.2932086149760855, + "learning_rate": 3.943676546839557e-06, + "loss": 0.9891, + "step": 1434 + }, + { + "epoch": 0.10353161862847661, + "grad_norm": 1.9379430079824613, + "learning_rate": 3.943566360980033e-06, + "loss": 1.0413, + "step": 1435 + }, + { + "epoch": 0.10360376609790412, + "grad_norm": 3.182607448828687, + "learning_rate": 3.943456068989546e-06, + "loss": 1.048, + "step": 1436 + }, + { + "epoch": 0.10367591356733162, + "grad_norm": 3.715501536506969, + "learning_rate": 3.943345670874116e-06, + "loss": 1.0396, + "step": 1437 + }, + { + "epoch": 0.10374806103675914, + "grad_norm": 2.242594293251353, + "learning_rate": 3.943235166639774e-06, + "loss": 0.9587, + "step": 1438 + }, + { + "epoch": 0.10382020850618665, + "grad_norm": 2.2413370998667865, + "learning_rate": 3.943124556292553e-06, + "loss": 1.0013, + "step": 1439 + }, + { + "epoch": 0.10389235597561415, + "grad_norm": 2.391511640497646, + "learning_rate": 3.943013839838492e-06, + "loss": 0.9835, + "step": 1440 + }, + { + "epoch": 0.10396450344504167, + "grad_norm": 3.254076675036875, + "learning_rate": 3.942903017283639e-06, + "loss": 0.8447, + "step": 1441 + }, + { + "epoch": 0.10403665091446918, + "grad_norm": 2.3857179032290974, + "learning_rate": 3.942792088634045e-06, + "loss": 0.8694, + "step": 1442 + }, + { + "epoch": 0.10410879838389668, + "grad_norm": 2.4128973164279586, + "learning_rate": 3.942681053895766e-06, + "loss": 1.0033, + "step": 1443 + }, + { + "epoch": 0.1041809458533242, + "grad_norm": 2.4963324347625284, + "learning_rate": 3.9425699130748665e-06, + "loss": 0.9559, + "step": 1444 + }, + { + "epoch": 0.1042530933227517, + "grad_norm": 3.0583640995262478, + "learning_rate": 3.942458666177416e-06, + "loss": 1.0903, + "step": 1445 + }, + { + "epoch": 0.10432524079217921, + "grad_norm": 2.204061755428083, + "learning_rate": 3.9423473132094885e-06, + "loss": 0.9626, + "step": 1446 + }, + { + "epoch": 0.10439738826160673, + "grad_norm": 2.2941322818853167, + "learning_rate": 3.942235854177164e-06, + "loss": 1.0296, + "step": 1447 + }, + { + "epoch": 0.10446953573103424, + "grad_norm": 3.200252826954943, + "learning_rate": 3.942124289086529e-06, + "loss": 1.0703, + "step": 1448 + }, + { + "epoch": 0.10454168320046174, + "grad_norm": 3.3378607679965864, + "learning_rate": 3.942012617943676e-06, + "loss": 0.9485, + "step": 1449 + }, + { + "epoch": 0.10461383066988926, + "grad_norm": 2.815366767447503, + "learning_rate": 3.941900840754704e-06, + "loss": 0.9887, + "step": 1450 + }, + { + "epoch": 0.10468597813931677, + "grad_norm": 3.6263789480978845, + "learning_rate": 3.941788957525715e-06, + "loss": 0.95, + "step": 1451 + }, + { + "epoch": 0.10475812560874427, + "grad_norm": 1.1541677748104426, + "learning_rate": 3.941676968262819e-06, + "loss": 0.8374, + "step": 1452 + }, + { + "epoch": 0.10483027307817178, + "grad_norm": 2.822762986019928, + "learning_rate": 3.941564872972132e-06, + "loss": 1.0056, + "step": 1453 + }, + { + "epoch": 0.1049024205475993, + "grad_norm": 3.3132455814441197, + "learning_rate": 3.941452671659775e-06, + "loss": 0.9581, + "step": 1454 + }, + { + "epoch": 0.1049745680170268, + "grad_norm": 3.1241740851942947, + "learning_rate": 3.941340364331875e-06, + "loss": 1.0379, + "step": 1455 + }, + { + "epoch": 0.1050467154864543, + "grad_norm": 2.9241074789831205, + "learning_rate": 3.941227950994564e-06, + "loss": 0.9513, + "step": 1456 + }, + { + "epoch": 0.10511886295588183, + "grad_norm": 3.3432410022109904, + "learning_rate": 3.94111543165398e-06, + "loss": 1.0039, + "step": 1457 + }, + { + "epoch": 0.10519101042530933, + "grad_norm": 2.4121806392876044, + "learning_rate": 3.941002806316269e-06, + "loss": 0.959, + "step": 1458 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 2.142855630601622, + "learning_rate": 3.9408900749875795e-06, + "loss": 0.964, + "step": 1459 + }, + { + "epoch": 0.10533530536416436, + "grad_norm": 2.2404510473084973, + "learning_rate": 3.940777237674068e-06, + "loss": 1.0574, + "step": 1460 + }, + { + "epoch": 0.10540745283359186, + "grad_norm": 2.398308467811199, + "learning_rate": 3.940664294381896e-06, + "loss": 0.9486, + "step": 1461 + }, + { + "epoch": 0.10547960030301937, + "grad_norm": 0.9083360472180716, + "learning_rate": 3.940551245117231e-06, + "loss": 0.776, + "step": 1462 + }, + { + "epoch": 0.10555174777244689, + "grad_norm": 3.628795609773944, + "learning_rate": 3.940438089886246e-06, + "loss": 0.9653, + "step": 1463 + }, + { + "epoch": 0.10562389524187439, + "grad_norm": 2.91530126673558, + "learning_rate": 3.940324828695121e-06, + "loss": 0.9901, + "step": 1464 + }, + { + "epoch": 0.1056960427113019, + "grad_norm": 3.267941032363255, + "learning_rate": 3.9402114615500394e-06, + "loss": 0.9356, + "step": 1465 + }, + { + "epoch": 0.10576819018072942, + "grad_norm": 2.9210904133877875, + "learning_rate": 3.9400979884571925e-06, + "loss": 0.9768, + "step": 1466 + }, + { + "epoch": 0.10584033765015692, + "grad_norm": 2.7763192692290244, + "learning_rate": 3.939984409422776e-06, + "loss": 1.0442, + "step": 1467 + }, + { + "epoch": 0.10591248511958443, + "grad_norm": 2.990352855007036, + "learning_rate": 3.939870724452993e-06, + "loss": 1.0575, + "step": 1468 + }, + { + "epoch": 0.10598463258901195, + "grad_norm": 2.9899144714778934, + "learning_rate": 3.9397569335540505e-06, + "loss": 0.8808, + "step": 1469 + }, + { + "epoch": 0.10605678005843945, + "grad_norm": 2.1953046805354512, + "learning_rate": 3.939643036732163e-06, + "loss": 0.9789, + "step": 1470 + }, + { + "epoch": 0.10612892752786696, + "grad_norm": 3.7416265461447953, + "learning_rate": 3.939529033993549e-06, + "loss": 1.0317, + "step": 1471 + }, + { + "epoch": 0.10620107499729448, + "grad_norm": 5.192881538872017, + "learning_rate": 3.939414925344435e-06, + "loss": 1.005, + "step": 1472 + }, + { + "epoch": 0.10627322246672198, + "grad_norm": 2.231290809407743, + "learning_rate": 3.939300710791051e-06, + "loss": 0.9405, + "step": 1473 + }, + { + "epoch": 0.10634536993614949, + "grad_norm": 3.2770365956427963, + "learning_rate": 3.939186390339633e-06, + "loss": 0.9493, + "step": 1474 + }, + { + "epoch": 0.106417517405577, + "grad_norm": 2.800160444294924, + "learning_rate": 3.939071963996427e-06, + "loss": 1.1468, + "step": 1475 + }, + { + "epoch": 0.10648966487500451, + "grad_norm": 3.8402827061458065, + "learning_rate": 3.938957431767679e-06, + "loss": 0.9488, + "step": 1476 + }, + { + "epoch": 0.10656181234443202, + "grad_norm": 2.3602597428290353, + "learning_rate": 3.938842793659642e-06, + "loss": 0.9251, + "step": 1477 + }, + { + "epoch": 0.10663395981385954, + "grad_norm": 3.2927246003240387, + "learning_rate": 3.938728049678579e-06, + "loss": 1.0809, + "step": 1478 + }, + { + "epoch": 0.10670610728328704, + "grad_norm": 3.005750231530508, + "learning_rate": 3.938613199830753e-06, + "loss": 1.0042, + "step": 1479 + }, + { + "epoch": 0.10677825475271455, + "grad_norm": 2.053369715954526, + "learning_rate": 3.9384982441224365e-06, + "loss": 0.976, + "step": 1480 + }, + { + "epoch": 0.10685040222214205, + "grad_norm": 2.50474070244421, + "learning_rate": 3.938383182559907e-06, + "loss": 0.9255, + "step": 1481 + }, + { + "epoch": 0.10692254969156957, + "grad_norm": 1.7342486636225531, + "learning_rate": 3.9382680151494486e-06, + "loss": 1.0484, + "step": 1482 + }, + { + "epoch": 0.10699469716099708, + "grad_norm": 1.9706535219692736, + "learning_rate": 3.938152741897349e-06, + "loss": 0.994, + "step": 1483 + }, + { + "epoch": 0.10706684463042458, + "grad_norm": 3.686553445656013, + "learning_rate": 3.938037362809902e-06, + "loss": 0.9684, + "step": 1484 + }, + { + "epoch": 0.1071389920998521, + "grad_norm": 2.7532212724426124, + "learning_rate": 3.937921877893409e-06, + "loss": 0.9971, + "step": 1485 + }, + { + "epoch": 0.1072111395692796, + "grad_norm": 2.0310175836084623, + "learning_rate": 3.937806287154177e-06, + "loss": 0.9489, + "step": 1486 + }, + { + "epoch": 0.10728328703870711, + "grad_norm": 3.671864091572892, + "learning_rate": 3.937690590598516e-06, + "loss": 0.946, + "step": 1487 + }, + { + "epoch": 0.10735543450813463, + "grad_norm": 2.439032830712205, + "learning_rate": 3.937574788232747e-06, + "loss": 0.9802, + "step": 1488 + }, + { + "epoch": 0.10742758197756214, + "grad_norm": 2.3271670834990523, + "learning_rate": 3.937458880063189e-06, + "loss": 1.0255, + "step": 1489 + }, + { + "epoch": 0.10749972944698964, + "grad_norm": 2.1924417534532488, + "learning_rate": 3.937342866096175e-06, + "loss": 0.9111, + "step": 1490 + }, + { + "epoch": 0.10757187691641716, + "grad_norm": 3.0620065408309154, + "learning_rate": 3.937226746338039e-06, + "loss": 0.9966, + "step": 1491 + }, + { + "epoch": 0.10764402438584467, + "grad_norm": 2.667926302079279, + "learning_rate": 3.937110520795122e-06, + "loss": 0.9536, + "step": 1492 + }, + { + "epoch": 0.10771617185527217, + "grad_norm": 3.9431044641172526, + "learning_rate": 3.93699418947377e-06, + "loss": 1.0404, + "step": 1493 + }, + { + "epoch": 0.10778831932469969, + "grad_norm": 2.9235864206833533, + "learning_rate": 3.936877752380336e-06, + "loss": 0.9784, + "step": 1494 + }, + { + "epoch": 0.1078604667941272, + "grad_norm": 2.2975607127406334, + "learning_rate": 3.936761209521178e-06, + "loss": 0.9484, + "step": 1495 + }, + { + "epoch": 0.1079326142635547, + "grad_norm": 2.550367616820468, + "learning_rate": 3.93664456090266e-06, + "loss": 0.8701, + "step": 1496 + }, + { + "epoch": 0.10800476173298222, + "grad_norm": 2.5144861142997685, + "learning_rate": 3.936527806531151e-06, + "loss": 0.9836, + "step": 1497 + }, + { + "epoch": 0.10807690920240973, + "grad_norm": 2.4061491313451353, + "learning_rate": 3.936410946413028e-06, + "loss": 0.9476, + "step": 1498 + }, + { + "epoch": 0.10814905667183723, + "grad_norm": 3.006004523329082, + "learning_rate": 3.9362939805546716e-06, + "loss": 0.9857, + "step": 1499 + }, + { + "epoch": 0.10822120414126475, + "grad_norm": 2.4488041236274443, + "learning_rate": 3.9361769089624685e-06, + "loss": 0.9613, + "step": 1500 + }, + { + "epoch": 0.10829335161069226, + "grad_norm": 2.7444186358282954, + "learning_rate": 3.9360597316428125e-06, + "loss": 0.9496, + "step": 1501 + }, + { + "epoch": 0.10836549908011976, + "grad_norm": 5.149198021526515, + "learning_rate": 3.935942448602101e-06, + "loss": 0.967, + "step": 1502 + }, + { + "epoch": 0.10843764654954728, + "grad_norm": 1.3232252506988045, + "learning_rate": 3.935825059846739e-06, + "loss": 0.829, + "step": 1503 + }, + { + "epoch": 0.10850979401897479, + "grad_norm": 3.12496520976728, + "learning_rate": 3.935707565383137e-06, + "loss": 1.0336, + "step": 1504 + }, + { + "epoch": 0.10858194148840229, + "grad_norm": 2.1271255185100344, + "learning_rate": 3.93558996521771e-06, + "loss": 0.8707, + "step": 1505 + }, + { + "epoch": 0.1086540889578298, + "grad_norm": 7.331718816182229, + "learning_rate": 3.935472259356881e-06, + "loss": 0.9695, + "step": 1506 + }, + { + "epoch": 0.10872623642725732, + "grad_norm": 3.9708611587771685, + "learning_rate": 3.935354447807077e-06, + "loss": 0.9362, + "step": 1507 + }, + { + "epoch": 0.10879838389668482, + "grad_norm": 3.5088257547483117, + "learning_rate": 3.93523653057473e-06, + "loss": 0.9768, + "step": 1508 + }, + { + "epoch": 0.10887053136611233, + "grad_norm": 2.522721321823762, + "learning_rate": 3.9351185076662805e-06, + "loss": 1.0347, + "step": 1509 + }, + { + "epoch": 0.10894267883553985, + "grad_norm": 3.0414381768789673, + "learning_rate": 3.935000379088173e-06, + "loss": 1.0022, + "step": 1510 + }, + { + "epoch": 0.10901482630496735, + "grad_norm": 3.5409295754936947, + "learning_rate": 3.934882144846858e-06, + "loss": 0.9514, + "step": 1511 + }, + { + "epoch": 0.10908697377439486, + "grad_norm": 2.893675313360646, + "learning_rate": 3.934763804948792e-06, + "loss": 0.9671, + "step": 1512 + }, + { + "epoch": 0.10915912124382238, + "grad_norm": 4.64385565615735, + "learning_rate": 3.934645359400436e-06, + "loss": 0.9888, + "step": 1513 + }, + { + "epoch": 0.10923126871324988, + "grad_norm": 6.357351670119277, + "learning_rate": 3.934526808208259e-06, + "loss": 0.9603, + "step": 1514 + }, + { + "epoch": 0.10930341618267739, + "grad_norm": 2.908692184942505, + "learning_rate": 3.934408151378735e-06, + "loss": 0.8338, + "step": 1515 + }, + { + "epoch": 0.1093755636521049, + "grad_norm": 2.5433299662149853, + "learning_rate": 3.934289388918343e-06, + "loss": 1.0546, + "step": 1516 + }, + { + "epoch": 0.10944771112153241, + "grad_norm": 2.609428633635506, + "learning_rate": 3.934170520833567e-06, + "loss": 0.9749, + "step": 1517 + }, + { + "epoch": 0.10951985859095992, + "grad_norm": 3.3474715013836462, + "learning_rate": 3.934051547130899e-06, + "loss": 1.0029, + "step": 1518 + }, + { + "epoch": 0.10959200606038744, + "grad_norm": 0.9257972812471749, + "learning_rate": 3.933932467816836e-06, + "loss": 0.8355, + "step": 1519 + }, + { + "epoch": 0.10966415352981494, + "grad_norm": 2.0289502550008627, + "learning_rate": 3.9338132828978795e-06, + "loss": 1.0062, + "step": 1520 + }, + { + "epoch": 0.10973630099924245, + "grad_norm": 3.022940819133561, + "learning_rate": 3.933693992380539e-06, + "loss": 0.9653, + "step": 1521 + }, + { + "epoch": 0.10980844846866997, + "grad_norm": 2.9902747193249137, + "learning_rate": 3.933574596271328e-06, + "loss": 0.9791, + "step": 1522 + }, + { + "epoch": 0.10988059593809747, + "grad_norm": 3.29993777360913, + "learning_rate": 3.933455094576766e-06, + "loss": 0.9823, + "step": 1523 + }, + { + "epoch": 0.10995274340752498, + "grad_norm": 4.049622063604483, + "learning_rate": 3.933335487303378e-06, + "loss": 0.9224, + "step": 1524 + }, + { + "epoch": 0.1100248908769525, + "grad_norm": 2.580193077373924, + "learning_rate": 3.933215774457697e-06, + "loss": 1.0117, + "step": 1525 + }, + { + "epoch": 0.11009703834638, + "grad_norm": 4.1444477213101445, + "learning_rate": 3.933095956046259e-06, + "loss": 0.9491, + "step": 1526 + }, + { + "epoch": 0.11016918581580751, + "grad_norm": 2.5631442306902192, + "learning_rate": 3.932976032075607e-06, + "loss": 1.072, + "step": 1527 + }, + { + "epoch": 0.11024133328523503, + "grad_norm": 2.401884304865399, + "learning_rate": 3.932856002552289e-06, + "loss": 0.9265, + "step": 1528 + }, + { + "epoch": 0.11031348075466253, + "grad_norm": 2.5204538947763675, + "learning_rate": 3.93273586748286e-06, + "loss": 1.0584, + "step": 1529 + }, + { + "epoch": 0.11038562822409004, + "grad_norm": 3.7845161140775185, + "learning_rate": 3.93261562687388e-06, + "loss": 0.9775, + "step": 1530 + }, + { + "epoch": 0.11045777569351754, + "grad_norm": 2.879849034962139, + "learning_rate": 3.932495280731914e-06, + "loss": 0.9322, + "step": 1531 + }, + { + "epoch": 0.11052992316294506, + "grad_norm": 0.8315789869552758, + "learning_rate": 3.932374829063536e-06, + "loss": 0.8164, + "step": 1532 + }, + { + "epoch": 0.11060207063237257, + "grad_norm": 3.041895940491844, + "learning_rate": 3.9322542718753216e-06, + "loss": 1.0512, + "step": 1533 + }, + { + "epoch": 0.11067421810180007, + "grad_norm": 3.358016906168618, + "learning_rate": 3.932133609173854e-06, + "loss": 1.0269, + "step": 1534 + }, + { + "epoch": 0.11074636557122759, + "grad_norm": 2.2794480196043887, + "learning_rate": 3.932012840965723e-06, + "loss": 1.0959, + "step": 1535 + }, + { + "epoch": 0.1108185130406551, + "grad_norm": 3.743880046438991, + "learning_rate": 3.931891967257522e-06, + "loss": 0.9758, + "step": 1536 + }, + { + "epoch": 0.1108906605100826, + "grad_norm": 2.585934918451317, + "learning_rate": 3.931770988055853e-06, + "loss": 1.0072, + "step": 1537 + }, + { + "epoch": 0.11096280797951012, + "grad_norm": 3.0578966381600297, + "learning_rate": 3.931649903367321e-06, + "loss": 0.9072, + "step": 1538 + }, + { + "epoch": 0.11103495544893763, + "grad_norm": 2.524183228082523, + "learning_rate": 3.931528713198539e-06, + "loss": 1.0608, + "step": 1539 + }, + { + "epoch": 0.11110710291836513, + "grad_norm": 1.1901232205235293, + "learning_rate": 3.931407417556125e-06, + "loss": 0.9126, + "step": 1540 + }, + { + "epoch": 0.11117925038779265, + "grad_norm": 2.648881453042115, + "learning_rate": 3.931286016446701e-06, + "loss": 1.03, + "step": 1541 + }, + { + "epoch": 0.11125139785722016, + "grad_norm": 5.788877002869011, + "learning_rate": 3.931164509876896e-06, + "loss": 1.003, + "step": 1542 + }, + { + "epoch": 0.11132354532664766, + "grad_norm": 2.576900029325724, + "learning_rate": 3.931042897853348e-06, + "loss": 1.0635, + "step": 1543 + }, + { + "epoch": 0.11139569279607518, + "grad_norm": 3.363035257273197, + "learning_rate": 3.930921180382694e-06, + "loss": 0.9727, + "step": 1544 + }, + { + "epoch": 0.11146784026550269, + "grad_norm": 3.0460836312027366, + "learning_rate": 3.930799357471583e-06, + "loss": 0.923, + "step": 1545 + }, + { + "epoch": 0.1115399877349302, + "grad_norm": 2.8065552509359932, + "learning_rate": 3.9306774291266675e-06, + "loss": 0.9454, + "step": 1546 + }, + { + "epoch": 0.11161213520435771, + "grad_norm": 5.2891120851973135, + "learning_rate": 3.930555395354604e-06, + "loss": 1.065, + "step": 1547 + }, + { + "epoch": 0.11168428267378522, + "grad_norm": 2.5773062590814826, + "learning_rate": 3.930433256162057e-06, + "loss": 0.8791, + "step": 1548 + }, + { + "epoch": 0.11175643014321272, + "grad_norm": 3.29998300143401, + "learning_rate": 3.930311011555696e-06, + "loss": 0.8653, + "step": 1549 + }, + { + "epoch": 0.11182857761264024, + "grad_norm": 1.9560429862667985, + "learning_rate": 3.930188661542196e-06, + "loss": 1.0528, + "step": 1550 + }, + { + "epoch": 0.11190072508206775, + "grad_norm": 2.7928981531997774, + "learning_rate": 3.930066206128239e-06, + "loss": 1.0695, + "step": 1551 + }, + { + "epoch": 0.11197287255149525, + "grad_norm": 3.4069147423774218, + "learning_rate": 3.929943645320512e-06, + "loss": 1.0108, + "step": 1552 + }, + { + "epoch": 0.11204502002092277, + "grad_norm": 6.585749540893645, + "learning_rate": 3.929820979125706e-06, + "loss": 1.0214, + "step": 1553 + }, + { + "epoch": 0.11211716749035028, + "grad_norm": 3.546800654636235, + "learning_rate": 3.92969820755052e-06, + "loss": 1.0185, + "step": 1554 + }, + { + "epoch": 0.11218931495977778, + "grad_norm": 3.0364429785382208, + "learning_rate": 3.92957533060166e-06, + "loss": 1.0392, + "step": 1555 + }, + { + "epoch": 0.11226146242920529, + "grad_norm": 2.2438972148576566, + "learning_rate": 3.9294523482858336e-06, + "loss": 1.0488, + "step": 1556 + }, + { + "epoch": 0.11233360989863281, + "grad_norm": 0.8116983346877592, + "learning_rate": 3.929329260609757e-06, + "loss": 0.7912, + "step": 1557 + }, + { + "epoch": 0.11240575736806031, + "grad_norm": 3.3149292873015512, + "learning_rate": 3.929206067580151e-06, + "loss": 0.882, + "step": 1558 + }, + { + "epoch": 0.11247790483748782, + "grad_norm": 0.8282632892232227, + "learning_rate": 3.9290827692037435e-06, + "loss": 0.8225, + "step": 1559 + }, + { + "epoch": 0.11255005230691534, + "grad_norm": 3.6199346207673933, + "learning_rate": 3.9289593654872674e-06, + "loss": 0.945, + "step": 1560 + }, + { + "epoch": 0.11262219977634284, + "grad_norm": 2.85876675165937, + "learning_rate": 3.92883585643746e-06, + "loss": 1.0211, + "step": 1561 + }, + { + "epoch": 0.11269434724577035, + "grad_norm": 2.824717286403705, + "learning_rate": 3.928712242061069e-06, + "loss": 0.9776, + "step": 1562 + }, + { + "epoch": 0.11276649471519787, + "grad_norm": 2.864754707527637, + "learning_rate": 3.928588522364841e-06, + "loss": 0.9668, + "step": 1563 + }, + { + "epoch": 0.11283864218462537, + "grad_norm": 2.773387210013903, + "learning_rate": 3.9284646973555335e-06, + "loss": 0.8833, + "step": 1564 + }, + { + "epoch": 0.11291078965405288, + "grad_norm": 3.283883046510515, + "learning_rate": 3.9283407670399074e-06, + "loss": 0.8895, + "step": 1565 + }, + { + "epoch": 0.1129829371234804, + "grad_norm": 6.051903656892966, + "learning_rate": 3.928216731424731e-06, + "loss": 0.9928, + "step": 1566 + }, + { + "epoch": 0.1130550845929079, + "grad_norm": 4.298144343761001, + "learning_rate": 3.928092590516776e-06, + "loss": 1.0041, + "step": 1567 + }, + { + "epoch": 0.11312723206233541, + "grad_norm": 4.069593369699502, + "learning_rate": 3.927968344322824e-06, + "loss": 0.997, + "step": 1568 + }, + { + "epoch": 0.11319937953176293, + "grad_norm": 2.2557501960525164, + "learning_rate": 3.927843992849656e-06, + "loss": 1.0292, + "step": 1569 + }, + { + "epoch": 0.11327152700119043, + "grad_norm": 2.9253268328269617, + "learning_rate": 3.927719536104065e-06, + "loss": 0.9146, + "step": 1570 + }, + { + "epoch": 0.11334367447061794, + "grad_norm": 2.608339298310533, + "learning_rate": 3.927594974092846e-06, + "loss": 0.8849, + "step": 1571 + }, + { + "epoch": 0.11341582194004546, + "grad_norm": 3.7525338831012895, + "learning_rate": 3.927470306822802e-06, + "loss": 0.9654, + "step": 1572 + }, + { + "epoch": 0.11348796940947296, + "grad_norm": 3.8036935122755553, + "learning_rate": 3.927345534300738e-06, + "loss": 0.9511, + "step": 1573 + }, + { + "epoch": 0.11356011687890047, + "grad_norm": 2.3336613515353726, + "learning_rate": 3.9272206565334705e-06, + "loss": 1.0339, + "step": 1574 + }, + { + "epoch": 0.11363226434832799, + "grad_norm": 2.7863566632602903, + "learning_rate": 3.927095673527817e-06, + "loss": 1.0288, + "step": 1575 + }, + { + "epoch": 0.1137044118177555, + "grad_norm": 2.506094180469249, + "learning_rate": 3.926970585290603e-06, + "loss": 0.9367, + "step": 1576 + }, + { + "epoch": 0.113776559287183, + "grad_norm": 4.313022084353623, + "learning_rate": 3.9268453918286585e-06, + "loss": 0.99, + "step": 1577 + }, + { + "epoch": 0.11384870675661052, + "grad_norm": 2.4215454861836676, + "learning_rate": 3.926720093148819e-06, + "loss": 1.0259, + "step": 1578 + }, + { + "epoch": 0.11392085422603802, + "grad_norm": 2.2006418288935454, + "learning_rate": 3.9265946892579295e-06, + "loss": 0.9092, + "step": 1579 + }, + { + "epoch": 0.11399300169546553, + "grad_norm": 3.5151393978862724, + "learning_rate": 3.9264691801628345e-06, + "loss": 0.9261, + "step": 1580 + }, + { + "epoch": 0.11406514916489305, + "grad_norm": 2.6601843923460904, + "learning_rate": 3.92634356587039e-06, + "loss": 0.9896, + "step": 1581 + }, + { + "epoch": 0.11413729663432055, + "grad_norm": 0.7968363285031013, + "learning_rate": 3.926217846387454e-06, + "loss": 0.78, + "step": 1582 + }, + { + "epoch": 0.11420944410374806, + "grad_norm": 3.0363463985865433, + "learning_rate": 3.926092021720893e-06, + "loss": 0.936, + "step": 1583 + }, + { + "epoch": 0.11428159157317556, + "grad_norm": 2.937782923283049, + "learning_rate": 3.925966091877576e-06, + "loss": 0.912, + "step": 1584 + }, + { + "epoch": 0.11435373904260308, + "grad_norm": 2.3682668510231335, + "learning_rate": 3.92584005686438e-06, + "loss": 1.0426, + "step": 1585 + }, + { + "epoch": 0.11442588651203059, + "grad_norm": 2.828674094496136, + "learning_rate": 3.925713916688189e-06, + "loss": 0.9408, + "step": 1586 + }, + { + "epoch": 0.1144980339814581, + "grad_norm": 3.0014584493002694, + "learning_rate": 3.925587671355889e-06, + "loss": 0.9123, + "step": 1587 + }, + { + "epoch": 0.11457018145088561, + "grad_norm": 3.2553345841402703, + "learning_rate": 3.9254613208743746e-06, + "loss": 0.9652, + "step": 1588 + }, + { + "epoch": 0.11464232892031312, + "grad_norm": 2.985658379668211, + "learning_rate": 3.925334865250545e-06, + "loss": 0.9624, + "step": 1589 + }, + { + "epoch": 0.11471447638974062, + "grad_norm": 2.2918842096822476, + "learning_rate": 3.925208304491306e-06, + "loss": 1.0343, + "step": 1590 + }, + { + "epoch": 0.11478662385916814, + "grad_norm": 1.9104756281360682, + "learning_rate": 3.925081638603568e-06, + "loss": 0.9904, + "step": 1591 + }, + { + "epoch": 0.11485877132859565, + "grad_norm": 3.9994644760230966, + "learning_rate": 3.924954867594249e-06, + "loss": 0.9239, + "step": 1592 + }, + { + "epoch": 0.11493091879802315, + "grad_norm": 2.814547069286764, + "learning_rate": 3.92482799147027e-06, + "loss": 0.8099, + "step": 1593 + }, + { + "epoch": 0.11500306626745067, + "grad_norm": 2.94448298503036, + "learning_rate": 3.92470101023856e-06, + "loss": 0.899, + "step": 1594 + }, + { + "epoch": 0.11507521373687818, + "grad_norm": 4.376949856717321, + "learning_rate": 3.9245739239060525e-06, + "loss": 0.8814, + "step": 1595 + }, + { + "epoch": 0.11514736120630568, + "grad_norm": 3.050509275849037, + "learning_rate": 3.924446732479688e-06, + "loss": 1.1484, + "step": 1596 + }, + { + "epoch": 0.1152195086757332, + "grad_norm": 2.516497632427834, + "learning_rate": 3.924319435966411e-06, + "loss": 1.0534, + "step": 1597 + }, + { + "epoch": 0.11529165614516071, + "grad_norm": 3.6258716193068423, + "learning_rate": 3.924192034373174e-06, + "loss": 0.9681, + "step": 1598 + }, + { + "epoch": 0.11536380361458821, + "grad_norm": 2.1396317231062127, + "learning_rate": 3.924064527706933e-06, + "loss": 0.9797, + "step": 1599 + }, + { + "epoch": 0.11543595108401573, + "grad_norm": 2.501729748273869, + "learning_rate": 3.92393691597465e-06, + "loss": 0.957, + "step": 1600 + }, + { + "epoch": 0.11550809855344324, + "grad_norm": 3.074483353546633, + "learning_rate": 3.923809199183295e-06, + "loss": 0.9971, + "step": 1601 + }, + { + "epoch": 0.11558024602287074, + "grad_norm": 3.3907278018959723, + "learning_rate": 3.9236813773398405e-06, + "loss": 0.9812, + "step": 1602 + }, + { + "epoch": 0.11565239349229826, + "grad_norm": 0.944953724268017, + "learning_rate": 3.923553450451268e-06, + "loss": 0.846, + "step": 1603 + }, + { + "epoch": 0.11572454096172577, + "grad_norm": 2.892673241019297, + "learning_rate": 3.923425418524562e-06, + "loss": 0.9776, + "step": 1604 + }, + { + "epoch": 0.11579668843115327, + "grad_norm": 2.3804485664476016, + "learning_rate": 3.923297281566714e-06, + "loss": 0.8734, + "step": 1605 + }, + { + "epoch": 0.1158688359005808, + "grad_norm": 2.349778927894973, + "learning_rate": 3.923169039584722e-06, + "loss": 1.0016, + "step": 1606 + }, + { + "epoch": 0.1159409833700083, + "grad_norm": 2.3793645207520275, + "learning_rate": 3.923040692585587e-06, + "loss": 0.8763, + "step": 1607 + }, + { + "epoch": 0.1160131308394358, + "grad_norm": 2.527323939438876, + "learning_rate": 3.922912240576319e-06, + "loss": 1.0575, + "step": 1608 + }, + { + "epoch": 0.11608527830886331, + "grad_norm": 2.439765171101188, + "learning_rate": 3.922783683563933e-06, + "loss": 1.0317, + "step": 1609 + }, + { + "epoch": 0.11615742577829083, + "grad_norm": 2.5346355638249554, + "learning_rate": 3.922655021555446e-06, + "loss": 0.9719, + "step": 1610 + }, + { + "epoch": 0.11622957324771833, + "grad_norm": 3.383580638196859, + "learning_rate": 3.922526254557887e-06, + "loss": 1.038, + "step": 1611 + }, + { + "epoch": 0.11630172071714584, + "grad_norm": 0.8472646344795962, + "learning_rate": 3.9223973825782854e-06, + "loss": 0.7702, + "step": 1612 + }, + { + "epoch": 0.11637386818657336, + "grad_norm": 3.655611031281081, + "learning_rate": 3.92226840562368e-06, + "loss": 0.8669, + "step": 1613 + }, + { + "epoch": 0.11644601565600086, + "grad_norm": 3.1645706333879122, + "learning_rate": 3.922139323701113e-06, + "loss": 0.9908, + "step": 1614 + }, + { + "epoch": 0.11651816312542837, + "grad_norm": 3.807361329046272, + "learning_rate": 3.922010136817633e-06, + "loss": 0.9219, + "step": 1615 + }, + { + "epoch": 0.11659031059485589, + "grad_norm": 3.3905612921992234, + "learning_rate": 3.921880844980293e-06, + "loss": 0.9043, + "step": 1616 + }, + { + "epoch": 0.1166624580642834, + "grad_norm": 2.0649204501146077, + "learning_rate": 3.921751448196156e-06, + "loss": 0.9999, + "step": 1617 + }, + { + "epoch": 0.1167346055337109, + "grad_norm": 2.721158944509074, + "learning_rate": 3.9216219464722866e-06, + "loss": 1.0483, + "step": 1618 + }, + { + "epoch": 0.11680675300313842, + "grad_norm": 4.099535864633937, + "learning_rate": 3.921492339815756e-06, + "loss": 0.9947, + "step": 1619 + }, + { + "epoch": 0.11687890047256592, + "grad_norm": 3.132745528040385, + "learning_rate": 3.921362628233642e-06, + "loss": 1.0648, + "step": 1620 + }, + { + "epoch": 0.11695104794199343, + "grad_norm": 2.4106772182783653, + "learning_rate": 3.921232811733027e-06, + "loss": 0.9649, + "step": 1621 + }, + { + "epoch": 0.11702319541142095, + "grad_norm": 1.1476659973021444, + "learning_rate": 3.921102890321001e-06, + "loss": 0.9038, + "step": 1622 + }, + { + "epoch": 0.11709534288084845, + "grad_norm": 3.632652229957599, + "learning_rate": 3.920972864004657e-06, + "loss": 1.0095, + "step": 1623 + }, + { + "epoch": 0.11716749035027596, + "grad_norm": 3.5438569121741983, + "learning_rate": 3.920842732791097e-06, + "loss": 0.9104, + "step": 1624 + }, + { + "epoch": 0.11723963781970348, + "grad_norm": 3.507232006272923, + "learning_rate": 3.920712496687425e-06, + "loss": 0.9584, + "step": 1625 + }, + { + "epoch": 0.11731178528913098, + "grad_norm": 3.4331632133730845, + "learning_rate": 3.9205821557007545e-06, + "loss": 0.8829, + "step": 1626 + }, + { + "epoch": 0.11738393275855849, + "grad_norm": 2.2641451507920114, + "learning_rate": 3.920451709838202e-06, + "loss": 0.9141, + "step": 1627 + }, + { + "epoch": 0.11745608022798601, + "grad_norm": 3.445386820919265, + "learning_rate": 3.920321159106891e-06, + "loss": 0.915, + "step": 1628 + }, + { + "epoch": 0.11752822769741351, + "grad_norm": 2.604954979154229, + "learning_rate": 3.920190503513951e-06, + "loss": 0.9498, + "step": 1629 + }, + { + "epoch": 0.11760037516684102, + "grad_norm": 1.0961443170913223, + "learning_rate": 3.920059743066515e-06, + "loss": 0.8187, + "step": 1630 + }, + { + "epoch": 0.11767252263626854, + "grad_norm": 3.1836533616926626, + "learning_rate": 3.919928877771725e-06, + "loss": 1.0692, + "step": 1631 + }, + { + "epoch": 0.11774467010569604, + "grad_norm": 3.3728663446459324, + "learning_rate": 3.919797907636726e-06, + "loss": 1.0365, + "step": 1632 + }, + { + "epoch": 0.11781681757512355, + "grad_norm": 4.053529665943688, + "learning_rate": 3.919666832668669e-06, + "loss": 1.0878, + "step": 1633 + }, + { + "epoch": 0.11788896504455106, + "grad_norm": 2.970655131156116, + "learning_rate": 3.919535652874714e-06, + "loss": 1.0105, + "step": 1634 + }, + { + "epoch": 0.11796111251397857, + "grad_norm": 3.011744240579951, + "learning_rate": 3.919404368262023e-06, + "loss": 0.9805, + "step": 1635 + }, + { + "epoch": 0.11803325998340608, + "grad_norm": 2.6649139028243582, + "learning_rate": 3.919272978837764e-06, + "loss": 1.0442, + "step": 1636 + }, + { + "epoch": 0.11810540745283359, + "grad_norm": 2.1062957894529526, + "learning_rate": 3.919141484609113e-06, + "loss": 1.0472, + "step": 1637 + }, + { + "epoch": 0.1181775549222611, + "grad_norm": 2.6308880119775875, + "learning_rate": 3.919009885583249e-06, + "loss": 0.9518, + "step": 1638 + }, + { + "epoch": 0.11824970239168861, + "grad_norm": 5.7963304289567885, + "learning_rate": 3.91887818176736e-06, + "loss": 1.0041, + "step": 1639 + }, + { + "epoch": 0.11832184986111612, + "grad_norm": 2.089442476625921, + "learning_rate": 3.9187463731686365e-06, + "loss": 0.9879, + "step": 1640 + }, + { + "epoch": 0.11839399733054363, + "grad_norm": 3.0744065805369223, + "learning_rate": 3.918614459794277e-06, + "loss": 0.8798, + "step": 1641 + }, + { + "epoch": 0.11846614479997114, + "grad_norm": 1.038968418996163, + "learning_rate": 3.918482441651484e-06, + "loss": 0.8571, + "step": 1642 + }, + { + "epoch": 0.11853829226939865, + "grad_norm": 3.030260062057574, + "learning_rate": 3.918350318747468e-06, + "loss": 0.9922, + "step": 1643 + }, + { + "epoch": 0.11861043973882616, + "grad_norm": 3.2885405427586245, + "learning_rate": 3.9182180910894404e-06, + "loss": 0.9114, + "step": 1644 + }, + { + "epoch": 0.11868258720825367, + "grad_norm": 2.724754105418975, + "learning_rate": 3.918085758684625e-06, + "loss": 1.0702, + "step": 1645 + }, + { + "epoch": 0.11875473467768118, + "grad_norm": 3.334866711470845, + "learning_rate": 3.917953321540248e-06, + "loss": 0.9616, + "step": 1646 + }, + { + "epoch": 0.1188268821471087, + "grad_norm": 2.9758846282027696, + "learning_rate": 3.917820779663538e-06, + "loss": 1.0821, + "step": 1647 + }, + { + "epoch": 0.1188990296165362, + "grad_norm": 2.847426333257392, + "learning_rate": 3.917688133061736e-06, + "loss": 0.9721, + "step": 1648 + }, + { + "epoch": 0.1189711770859637, + "grad_norm": 3.51690935198445, + "learning_rate": 3.917555381742083e-06, + "loss": 0.9974, + "step": 1649 + }, + { + "epoch": 0.11904332455539123, + "grad_norm": 3.207522913315027, + "learning_rate": 3.917422525711831e-06, + "loss": 0.8397, + "step": 1650 + }, + { + "epoch": 0.11911547202481873, + "grad_norm": 3.1243083951489243, + "learning_rate": 3.917289564978231e-06, + "loss": 1.0244, + "step": 1651 + }, + { + "epoch": 0.11918761949424624, + "grad_norm": 2.471778653626658, + "learning_rate": 3.917156499548546e-06, + "loss": 0.9861, + "step": 1652 + }, + { + "epoch": 0.11925976696367376, + "grad_norm": 16.143158380564813, + "learning_rate": 3.917023329430042e-06, + "loss": 0.8697, + "step": 1653 + }, + { + "epoch": 0.11933191443310126, + "grad_norm": 2.8401389198175333, + "learning_rate": 3.916890054629991e-06, + "loss": 0.8979, + "step": 1654 + }, + { + "epoch": 0.11940406190252877, + "grad_norm": 3.742803152645504, + "learning_rate": 3.916756675155669e-06, + "loss": 1.0236, + "step": 1655 + }, + { + "epoch": 0.11947620937195629, + "grad_norm": 2.8636124693849365, + "learning_rate": 3.916623191014362e-06, + "loss": 0.854, + "step": 1656 + }, + { + "epoch": 0.11954835684138379, + "grad_norm": 3.3714320785467273, + "learning_rate": 3.9164896022133565e-06, + "loss": 1.0002, + "step": 1657 + }, + { + "epoch": 0.1196205043108113, + "grad_norm": 1.1282185502507136, + "learning_rate": 3.9163559087599495e-06, + "loss": 0.7942, + "step": 1658 + }, + { + "epoch": 0.1196926517802388, + "grad_norm": 3.112526681318402, + "learning_rate": 3.916222110661439e-06, + "loss": 1.1126, + "step": 1659 + }, + { + "epoch": 0.11976479924966632, + "grad_norm": 3.1020369082824724, + "learning_rate": 3.916088207925133e-06, + "loss": 0.9662, + "step": 1660 + }, + { + "epoch": 0.11983694671909383, + "grad_norm": 2.437151957386144, + "learning_rate": 3.915954200558344e-06, + "loss": 0.9878, + "step": 1661 + }, + { + "epoch": 0.11990909418852133, + "grad_norm": 2.3147235181253394, + "learning_rate": 3.915820088568388e-06, + "loss": 0.9811, + "step": 1662 + }, + { + "epoch": 0.11998124165794885, + "grad_norm": 2.6635427555285394, + "learning_rate": 3.91568587196259e-06, + "loss": 0.9591, + "step": 1663 + }, + { + "epoch": 0.12005338912737636, + "grad_norm": 3.0212010828429134, + "learning_rate": 3.915551550748277e-06, + "loss": 0.9323, + "step": 1664 + }, + { + "epoch": 0.12012553659680386, + "grad_norm": 3.257785129203441, + "learning_rate": 3.915417124932785e-06, + "loss": 1.0349, + "step": 1665 + }, + { + "epoch": 0.12019768406623138, + "grad_norm": 2.3518863087508186, + "learning_rate": 3.915282594523455e-06, + "loss": 1.0231, + "step": 1666 + }, + { + "epoch": 0.12026983153565889, + "grad_norm": 3.3463589481463085, + "learning_rate": 3.915147959527632e-06, + "loss": 0.9111, + "step": 1667 + }, + { + "epoch": 0.12034197900508639, + "grad_norm": 3.1938674755849092, + "learning_rate": 3.915013219952669e-06, + "loss": 0.9414, + "step": 1668 + }, + { + "epoch": 0.12041412647451391, + "grad_norm": 3.6044761804118295, + "learning_rate": 3.9148783758059235e-06, + "loss": 0.9201, + "step": 1669 + }, + { + "epoch": 0.12048627394394142, + "grad_norm": 1.2366682069996773, + "learning_rate": 3.9147434270947585e-06, + "loss": 0.8642, + "step": 1670 + }, + { + "epoch": 0.12055842141336892, + "grad_norm": 3.2409774544463423, + "learning_rate": 3.914608373826544e-06, + "loss": 0.9781, + "step": 1671 + }, + { + "epoch": 0.12063056888279644, + "grad_norm": 3.9945397063385335, + "learning_rate": 3.914473216008651e-06, + "loss": 0.9445, + "step": 1672 + }, + { + "epoch": 0.12070271635222395, + "grad_norm": 2.996856950341565, + "learning_rate": 3.9143379536484654e-06, + "loss": 0.9487, + "step": 1673 + }, + { + "epoch": 0.12077486382165145, + "grad_norm": 1.2197140524278391, + "learning_rate": 3.914202586753369e-06, + "loss": 0.8085, + "step": 1674 + }, + { + "epoch": 0.12084701129107897, + "grad_norm": 3.509513323699751, + "learning_rate": 3.914067115330757e-06, + "loss": 0.913, + "step": 1675 + }, + { + "epoch": 0.12091915876050648, + "grad_norm": 2.5357725455270876, + "learning_rate": 3.913931539388024e-06, + "loss": 0.8763, + "step": 1676 + }, + { + "epoch": 0.12099130622993398, + "grad_norm": 2.2271779063181283, + "learning_rate": 3.913795858932576e-06, + "loss": 1.0358, + "step": 1677 + }, + { + "epoch": 0.1210634536993615, + "grad_norm": 1.1080642801464464, + "learning_rate": 3.913660073971821e-06, + "loss": 0.8323, + "step": 1678 + }, + { + "epoch": 0.121135601168789, + "grad_norm": 6.0163465983603, + "learning_rate": 3.913524184513173e-06, + "loss": 1.0831, + "step": 1679 + }, + { + "epoch": 0.12120774863821651, + "grad_norm": 0.9465137762083794, + "learning_rate": 3.913388190564052e-06, + "loss": 0.8316, + "step": 1680 + }, + { + "epoch": 0.12127989610764403, + "grad_norm": 2.3955206514956084, + "learning_rate": 3.913252092131886e-06, + "loss": 1.0207, + "step": 1681 + }, + { + "epoch": 0.12135204357707154, + "grad_norm": 4.075406977451498, + "learning_rate": 3.913115889224105e-06, + "loss": 1.0557, + "step": 1682 + }, + { + "epoch": 0.12142419104649904, + "grad_norm": 3.314012577988416, + "learning_rate": 3.9129795818481485e-06, + "loss": 1.1214, + "step": 1683 + }, + { + "epoch": 0.12149633851592656, + "grad_norm": 3.0572910774572652, + "learning_rate": 3.912843170011458e-06, + "loss": 0.9926, + "step": 1684 + }, + { + "epoch": 0.12156848598535407, + "grad_norm": 3.1204214505891468, + "learning_rate": 3.912706653721483e-06, + "loss": 0.8087, + "step": 1685 + }, + { + "epoch": 0.12164063345478157, + "grad_norm": 3.5673064884988146, + "learning_rate": 3.912570032985679e-06, + "loss": 1.0288, + "step": 1686 + }, + { + "epoch": 0.12171278092420908, + "grad_norm": 3.225954012264528, + "learning_rate": 3.912433307811505e-06, + "loss": 0.8712, + "step": 1687 + }, + { + "epoch": 0.1217849283936366, + "grad_norm": 2.4847855138799795, + "learning_rate": 3.912296478206428e-06, + "loss": 0.9435, + "step": 1688 + }, + { + "epoch": 0.1218570758630641, + "grad_norm": 6.39947359781268, + "learning_rate": 3.912159544177919e-06, + "loss": 0.908, + "step": 1689 + }, + { + "epoch": 0.1219292233324916, + "grad_norm": 2.513706399699774, + "learning_rate": 3.912022505733456e-06, + "loss": 0.8726, + "step": 1690 + }, + { + "epoch": 0.12200137080191913, + "grad_norm": 2.1751944882484744, + "learning_rate": 3.911885362880522e-06, + "loss": 0.948, + "step": 1691 + }, + { + "epoch": 0.12207351827134663, + "grad_norm": 2.904106416081844, + "learning_rate": 3.9117481156266065e-06, + "loss": 0.841, + "step": 1692 + }, + { + "epoch": 0.12214566574077414, + "grad_norm": 6.6044121184199005, + "learning_rate": 3.911610763979202e-06, + "loss": 0.9656, + "step": 1693 + }, + { + "epoch": 0.12221781321020166, + "grad_norm": 3.3201150274457647, + "learning_rate": 3.911473307945812e-06, + "loss": 1.0401, + "step": 1694 + }, + { + "epoch": 0.12228996067962916, + "grad_norm": 3.2047877020308055, + "learning_rate": 3.911335747533939e-06, + "loss": 1.0089, + "step": 1695 + }, + { + "epoch": 0.12236210814905667, + "grad_norm": 2.8628913645171474, + "learning_rate": 3.9111980827510975e-06, + "loss": 0.9693, + "step": 1696 + }, + { + "epoch": 0.12243425561848419, + "grad_norm": 3.1802790510951193, + "learning_rate": 3.911060313604804e-06, + "loss": 0.9674, + "step": 1697 + }, + { + "epoch": 0.12250640308791169, + "grad_norm": 2.6807816387580226, + "learning_rate": 3.91092244010258e-06, + "loss": 0.9837, + "step": 1698 + }, + { + "epoch": 0.1225785505573392, + "grad_norm": 3.0315399031292887, + "learning_rate": 3.910784462251956e-06, + "loss": 1.0957, + "step": 1699 + }, + { + "epoch": 0.12265069802676672, + "grad_norm": 1.800428972520761, + "learning_rate": 3.910646380060467e-06, + "loss": 0.9972, + "step": 1700 + }, + { + "epoch": 0.12272284549619422, + "grad_norm": 3.2279270048621393, + "learning_rate": 3.9105081935356506e-06, + "loss": 1.1187, + "step": 1701 + }, + { + "epoch": 0.12279499296562173, + "grad_norm": 3.1323144378866195, + "learning_rate": 3.910369902685055e-06, + "loss": 1.0618, + "step": 1702 + }, + { + "epoch": 0.12286714043504925, + "grad_norm": 3.0633526607674404, + "learning_rate": 3.910231507516231e-06, + "loss": 0.9502, + "step": 1703 + }, + { + "epoch": 0.12293928790447675, + "grad_norm": 1.0979635548336384, + "learning_rate": 3.910093008036735e-06, + "loss": 0.8617, + "step": 1704 + }, + { + "epoch": 0.12301143537390426, + "grad_norm": 2.6650006832100455, + "learning_rate": 3.909954404254132e-06, + "loss": 0.9647, + "step": 1705 + }, + { + "epoch": 0.12308358284333178, + "grad_norm": 4.215676325322286, + "learning_rate": 3.9098156961759885e-06, + "loss": 0.9796, + "step": 1706 + }, + { + "epoch": 0.12315573031275928, + "grad_norm": 2.413547991888533, + "learning_rate": 3.9096768838098805e-06, + "loss": 0.9813, + "step": 1707 + }, + { + "epoch": 0.12322787778218679, + "grad_norm": 3.4440517064908334, + "learning_rate": 3.9095379671633865e-06, + "loss": 1.038, + "step": 1708 + }, + { + "epoch": 0.1233000252516143, + "grad_norm": 2.20458600454746, + "learning_rate": 3.909398946244094e-06, + "loss": 0.9898, + "step": 1709 + }, + { + "epoch": 0.12337217272104181, + "grad_norm": 2.313475119709871, + "learning_rate": 3.909259821059592e-06, + "loss": 0.9604, + "step": 1710 + }, + { + "epoch": 0.12344432019046932, + "grad_norm": 2.268415009118695, + "learning_rate": 3.90912059161748e-06, + "loss": 0.9182, + "step": 1711 + }, + { + "epoch": 0.12351646765989682, + "grad_norm": 2.546442755780223, + "learning_rate": 3.90898125792536e-06, + "loss": 0.9751, + "step": 1712 + }, + { + "epoch": 0.12358861512932434, + "grad_norm": 4.275948464992131, + "learning_rate": 3.90884181999084e-06, + "loss": 1.0767, + "step": 1713 + }, + { + "epoch": 0.12366076259875185, + "grad_norm": 4.5416899280943985, + "learning_rate": 3.908702277821534e-06, + "loss": 1.0079, + "step": 1714 + }, + { + "epoch": 0.12373291006817935, + "grad_norm": 5.117439275473543, + "learning_rate": 3.908562631425063e-06, + "loss": 0.898, + "step": 1715 + }, + { + "epoch": 0.12380505753760687, + "grad_norm": 4.612672822190535, + "learning_rate": 3.908422880809051e-06, + "loss": 0.9604, + "step": 1716 + }, + { + "epoch": 0.12387720500703438, + "grad_norm": 4.545159819324632, + "learning_rate": 3.908283025981131e-06, + "loss": 0.9859, + "step": 1717 + }, + { + "epoch": 0.12394935247646188, + "grad_norm": 0.8586649996184028, + "learning_rate": 3.9081430669489395e-06, + "loss": 0.7838, + "step": 1718 + }, + { + "epoch": 0.1240214999458894, + "grad_norm": 3.3422911126339363, + "learning_rate": 3.908003003720118e-06, + "loss": 1.0832, + "step": 1719 + }, + { + "epoch": 0.12409364741531691, + "grad_norm": 2.222934055230566, + "learning_rate": 3.907862836302316e-06, + "loss": 0.9474, + "step": 1720 + }, + { + "epoch": 0.12416579488474441, + "grad_norm": 2.595329539078295, + "learning_rate": 3.907722564703188e-06, + "loss": 0.9127, + "step": 1721 + }, + { + "epoch": 0.12423794235417193, + "grad_norm": 4.5829958473567975, + "learning_rate": 3.907582188930392e-06, + "loss": 1.1106, + "step": 1722 + }, + { + "epoch": 0.12431008982359944, + "grad_norm": 2.643347245508875, + "learning_rate": 3.907441708991594e-06, + "loss": 1.047, + "step": 1723 + }, + { + "epoch": 0.12438223729302694, + "grad_norm": 3.9339143684892224, + "learning_rate": 3.907301124894465e-06, + "loss": 1.0524, + "step": 1724 + }, + { + "epoch": 0.12445438476245446, + "grad_norm": 2.8242385040478757, + "learning_rate": 3.907160436646684e-06, + "loss": 0.9938, + "step": 1725 + }, + { + "epoch": 0.12452653223188197, + "grad_norm": 3.5804803417333635, + "learning_rate": 3.90701964425593e-06, + "loss": 0.9288, + "step": 1726 + }, + { + "epoch": 0.12459867970130947, + "grad_norm": 4.106507203568973, + "learning_rate": 3.906878747729893e-06, + "loss": 0.9507, + "step": 1727 + }, + { + "epoch": 0.12467082717073699, + "grad_norm": 1.800804286983205, + "learning_rate": 3.906737747076268e-06, + "loss": 1.0497, + "step": 1728 + }, + { + "epoch": 0.1247429746401645, + "grad_norm": 2.8088884689770692, + "learning_rate": 3.906596642302752e-06, + "loss": 0.9035, + "step": 1729 + }, + { + "epoch": 0.124815122109592, + "grad_norm": 8.334592902042273, + "learning_rate": 3.9064554334170504e-06, + "loss": 0.979, + "step": 1730 + }, + { + "epoch": 0.12488726957901952, + "grad_norm": 3.028066949907977, + "learning_rate": 3.906314120426876e-06, + "loss": 0.9227, + "step": 1731 + }, + { + "epoch": 0.12495941704844703, + "grad_norm": 3.3569247514088087, + "learning_rate": 3.9061727033399446e-06, + "loss": 0.8761, + "step": 1732 + }, + { + "epoch": 0.12503156451787453, + "grad_norm": 2.108030781001815, + "learning_rate": 3.906031182163978e-06, + "loss": 0.9553, + "step": 1733 + }, + { + "epoch": 0.12510371198730205, + "grad_norm": 2.2995040275868583, + "learning_rate": 3.9058895569067055e-06, + "loss": 1.0171, + "step": 1734 + }, + { + "epoch": 0.12517585945672954, + "grad_norm": 2.0129101826868614, + "learning_rate": 3.905747827575858e-06, + "loss": 1.0817, + "step": 1735 + }, + { + "epoch": 0.12524800692615706, + "grad_norm": 4.4037470641439045, + "learning_rate": 3.905605994179178e-06, + "loss": 0.9301, + "step": 1736 + }, + { + "epoch": 0.12532015439558458, + "grad_norm": 2.253921799630883, + "learning_rate": 3.905464056724408e-06, + "loss": 0.8962, + "step": 1737 + }, + { + "epoch": 0.12539230186501207, + "grad_norm": 2.248636256294592, + "learning_rate": 3.9053220152193e-06, + "loss": 1.0514, + "step": 1738 + }, + { + "epoch": 0.1254644493344396, + "grad_norm": 2.71665523113192, + "learning_rate": 3.90517986967161e-06, + "loss": 0.9984, + "step": 1739 + }, + { + "epoch": 0.1255365968038671, + "grad_norm": 2.498887100465974, + "learning_rate": 3.9050376200891e-06, + "loss": 0.9972, + "step": 1740 + }, + { + "epoch": 0.1256087442732946, + "grad_norm": 3.758512245425207, + "learning_rate": 3.904895266479537e-06, + "loss": 0.8679, + "step": 1741 + }, + { + "epoch": 0.12568089174272212, + "grad_norm": 2.3425795620787757, + "learning_rate": 3.904752808850696e-06, + "loss": 1.0542, + "step": 1742 + }, + { + "epoch": 0.12575303921214964, + "grad_norm": 2.753447539090749, + "learning_rate": 3.904610247210355e-06, + "loss": 0.967, + "step": 1743 + }, + { + "epoch": 0.12582518668157713, + "grad_norm": 2.0284070587342304, + "learning_rate": 3.9044675815663e-06, + "loss": 0.9055, + "step": 1744 + }, + { + "epoch": 0.12589733415100465, + "grad_norm": 2.150088809640956, + "learning_rate": 3.904324811926319e-06, + "loss": 1.0146, + "step": 1745 + }, + { + "epoch": 0.12596948162043217, + "grad_norm": 1.2679202612856129, + "learning_rate": 3.90418193829821e-06, + "loss": 0.8573, + "step": 1746 + }, + { + "epoch": 0.12604162908985966, + "grad_norm": 4.424855228792956, + "learning_rate": 3.904038960689775e-06, + "loss": 1.0064, + "step": 1747 + }, + { + "epoch": 0.12611377655928718, + "grad_norm": 4.388851146830807, + "learning_rate": 3.90389587910882e-06, + "loss": 1.0208, + "step": 1748 + }, + { + "epoch": 0.1261859240287147, + "grad_norm": 3.316332633164089, + "learning_rate": 3.903752693563159e-06, + "loss": 0.9277, + "step": 1749 + }, + { + "epoch": 0.1262580714981422, + "grad_norm": 0.9943159207405045, + "learning_rate": 3.903609404060612e-06, + "loss": 0.8219, + "step": 1750 + }, + { + "epoch": 0.1263302189675697, + "grad_norm": 0.9372370351106826, + "learning_rate": 3.903466010609002e-06, + "loss": 0.76, + "step": 1751 + }, + { + "epoch": 0.12640236643699723, + "grad_norm": 2.6051263998926664, + "learning_rate": 3.903322513216159e-06, + "loss": 0.9718, + "step": 1752 + }, + { + "epoch": 0.12647451390642472, + "grad_norm": 3.281525809185803, + "learning_rate": 3.9031789118899195e-06, + "loss": 0.9512, + "step": 1753 + }, + { + "epoch": 0.12654666137585224, + "grad_norm": 3.742218208585184, + "learning_rate": 3.903035206638125e-06, + "loss": 0.8647, + "step": 1754 + }, + { + "epoch": 0.12661880884527976, + "grad_norm": 3.233314819545112, + "learning_rate": 3.902891397468623e-06, + "loss": 0.8621, + "step": 1755 + }, + { + "epoch": 0.12669095631470725, + "grad_norm": 5.261033589159979, + "learning_rate": 3.902747484389266e-06, + "loss": 1.0207, + "step": 1756 + }, + { + "epoch": 0.12676310378413477, + "grad_norm": 2.9715163342384634, + "learning_rate": 3.902603467407912e-06, + "loss": 0.794, + "step": 1757 + }, + { + "epoch": 0.1268352512535623, + "grad_norm": 3.758450713654201, + "learning_rate": 3.902459346532426e-06, + "loss": 1.0445, + "step": 1758 + }, + { + "epoch": 0.12690739872298978, + "grad_norm": 5.031626243028802, + "learning_rate": 3.902315121770679e-06, + "loss": 1.0892, + "step": 1759 + }, + { + "epoch": 0.1269795461924173, + "grad_norm": 2.344458816156837, + "learning_rate": 3.902170793130544e-06, + "loss": 0.9755, + "step": 1760 + }, + { + "epoch": 0.12705169366184482, + "grad_norm": 6.411985035346375, + "learning_rate": 3.902026360619904e-06, + "loss": 1.0204, + "step": 1761 + }, + { + "epoch": 0.1271238411312723, + "grad_norm": 4.0238551247152605, + "learning_rate": 3.901881824246645e-06, + "loss": 0.9897, + "step": 1762 + }, + { + "epoch": 0.12719598860069983, + "grad_norm": 3.3827896645988784, + "learning_rate": 3.901737184018661e-06, + "loss": 1.0294, + "step": 1763 + }, + { + "epoch": 0.12726813607012735, + "grad_norm": 4.641174283898473, + "learning_rate": 3.901592439943849e-06, + "loss": 1.0869, + "step": 1764 + }, + { + "epoch": 0.12734028353955484, + "grad_norm": 3.4139263204480197, + "learning_rate": 3.901447592030113e-06, + "loss": 0.9174, + "step": 1765 + }, + { + "epoch": 0.12741243100898236, + "grad_norm": 3.176352211464726, + "learning_rate": 3.901302640285363e-06, + "loss": 1.1092, + "step": 1766 + }, + { + "epoch": 0.12748457847840988, + "grad_norm": 3.515005370742363, + "learning_rate": 3.901157584717514e-06, + "loss": 0.9165, + "step": 1767 + }, + { + "epoch": 0.12755672594783737, + "grad_norm": 3.2692194235074044, + "learning_rate": 3.901012425334488e-06, + "loss": 0.85, + "step": 1768 + }, + { + "epoch": 0.1276288734172649, + "grad_norm": 2.494407693280192, + "learning_rate": 3.90086716214421e-06, + "loss": 1.0352, + "step": 1769 + }, + { + "epoch": 0.1277010208866924, + "grad_norm": 1.3356014520606974, + "learning_rate": 3.900721795154613e-06, + "loss": 0.8009, + "step": 1770 + }, + { + "epoch": 0.1277731683561199, + "grad_norm": 3.486642416243042, + "learning_rate": 3.900576324373635e-06, + "loss": 0.9679, + "step": 1771 + }, + { + "epoch": 0.12784531582554742, + "grad_norm": 2.3229279339663913, + "learning_rate": 3.900430749809221e-06, + "loss": 0.8427, + "step": 1772 + }, + { + "epoch": 0.12791746329497494, + "grad_norm": 4.4896517554407085, + "learning_rate": 3.9002850714693176e-06, + "loss": 0.8806, + "step": 1773 + }, + { + "epoch": 0.12798961076440243, + "grad_norm": 3.172402323969479, + "learning_rate": 3.900139289361881e-06, + "loss": 0.9058, + "step": 1774 + }, + { + "epoch": 0.12806175823382995, + "grad_norm": 2.970694938813918, + "learning_rate": 3.899993403494872e-06, + "loss": 1.0104, + "step": 1775 + }, + { + "epoch": 0.12813390570325744, + "grad_norm": 3.950522912095524, + "learning_rate": 3.899847413876257e-06, + "loss": 0.9546, + "step": 1776 + }, + { + "epoch": 0.12820605317268496, + "grad_norm": 3.455147543598488, + "learning_rate": 3.899701320514007e-06, + "loss": 1.0329, + "step": 1777 + }, + { + "epoch": 0.12827820064211248, + "grad_norm": 2.5136773762201847, + "learning_rate": 3.899555123416101e-06, + "loss": 1.0034, + "step": 1778 + }, + { + "epoch": 0.12835034811153997, + "grad_norm": 3.661808711615929, + "learning_rate": 3.899408822590522e-06, + "loss": 1.1856, + "step": 1779 + }, + { + "epoch": 0.1284224955809675, + "grad_norm": 3.6931092870032307, + "learning_rate": 3.899262418045259e-06, + "loss": 1.0157, + "step": 1780 + }, + { + "epoch": 0.128494643050395, + "grad_norm": 2.9368658801722827, + "learning_rate": 3.899115909788305e-06, + "loss": 0.9863, + "step": 1781 + }, + { + "epoch": 0.1285667905198225, + "grad_norm": 2.7082619779919312, + "learning_rate": 3.8989692978276615e-06, + "loss": 0.9496, + "step": 1782 + }, + { + "epoch": 0.12863893798925002, + "grad_norm": 3.8436286721038173, + "learning_rate": 3.898822582171335e-06, + "loss": 0.9851, + "step": 1783 + }, + { + "epoch": 0.12871108545867754, + "grad_norm": 2.8026814631873953, + "learning_rate": 3.8986757628273365e-06, + "loss": 0.9403, + "step": 1784 + }, + { + "epoch": 0.12878323292810503, + "grad_norm": 2.8930352137690805, + "learning_rate": 3.898528839803682e-06, + "loss": 1.0487, + "step": 1785 + }, + { + "epoch": 0.12885538039753255, + "grad_norm": 3.0743848666106257, + "learning_rate": 3.898381813108397e-06, + "loss": 0.9011, + "step": 1786 + }, + { + "epoch": 0.12892752786696007, + "grad_norm": 2.9112564772144425, + "learning_rate": 3.8982346827495086e-06, + "loss": 0.9892, + "step": 1787 + }, + { + "epoch": 0.12899967533638756, + "grad_norm": 2.8317207534531206, + "learning_rate": 3.898087448735051e-06, + "loss": 1.0006, + "step": 1788 + }, + { + "epoch": 0.12907182280581508, + "grad_norm": 3.458665916046712, + "learning_rate": 3.897940111073064e-06, + "loss": 1.0446, + "step": 1789 + }, + { + "epoch": 0.1291439702752426, + "grad_norm": 4.172589301633044, + "learning_rate": 3.897792669771593e-06, + "loss": 0.9612, + "step": 1790 + }, + { + "epoch": 0.1292161177446701, + "grad_norm": 2.6974968503804595, + "learning_rate": 3.897645124838691e-06, + "loss": 0.8745, + "step": 1791 + }, + { + "epoch": 0.12928826521409761, + "grad_norm": 0.8259236930186745, + "learning_rate": 3.897497476282413e-06, + "loss": 0.756, + "step": 1792 + }, + { + "epoch": 0.12936041268352513, + "grad_norm": 5.524673431549466, + "learning_rate": 3.897349724110822e-06, + "loss": 0.9549, + "step": 1793 + }, + { + "epoch": 0.12943256015295262, + "grad_norm": 3.9686794725294043, + "learning_rate": 3.897201868331986e-06, + "loss": 0.9165, + "step": 1794 + }, + { + "epoch": 0.12950470762238014, + "grad_norm": 2.46663285266622, + "learning_rate": 3.89705390895398e-06, + "loss": 1.1031, + "step": 1795 + }, + { + "epoch": 0.12957685509180766, + "grad_norm": 2.866827087201563, + "learning_rate": 3.896905845984882e-06, + "loss": 0.907, + "step": 1796 + }, + { + "epoch": 0.12964900256123515, + "grad_norm": 3.3184204455184494, + "learning_rate": 3.896757679432779e-06, + "loss": 0.9814, + "step": 1797 + }, + { + "epoch": 0.12972115003066267, + "grad_norm": 2.49869207501122, + "learning_rate": 3.896609409305759e-06, + "loss": 0.9998, + "step": 1798 + }, + { + "epoch": 0.1297932975000902, + "grad_norm": 10.842021881391059, + "learning_rate": 3.896461035611921e-06, + "loss": 0.9334, + "step": 1799 + }, + { + "epoch": 0.12986544496951768, + "grad_norm": 6.022530216507032, + "learning_rate": 3.896312558359367e-06, + "loss": 0.8663, + "step": 1800 + }, + { + "epoch": 0.1299375924389452, + "grad_norm": 3.360615204166653, + "learning_rate": 3.8961639775562035e-06, + "loss": 0.9719, + "step": 1801 + }, + { + "epoch": 0.13000973990837272, + "grad_norm": 3.6289840377830784, + "learning_rate": 3.8960152932105435e-06, + "loss": 0.925, + "step": 1802 + }, + { + "epoch": 0.13008188737780021, + "grad_norm": 1.1241008025985797, + "learning_rate": 3.895866505330509e-06, + "loss": 0.9434, + "step": 1803 + }, + { + "epoch": 0.13015403484722773, + "grad_norm": 8.08584880688072, + "learning_rate": 3.8957176139242214e-06, + "loss": 1.0624, + "step": 1804 + }, + { + "epoch": 0.13022618231665525, + "grad_norm": 3.0602888571839584, + "learning_rate": 3.895568618999814e-06, + "loss": 0.9259, + "step": 1805 + }, + { + "epoch": 0.13029832978608274, + "grad_norm": 4.848040052984987, + "learning_rate": 3.89541952056542e-06, + "loss": 0.9777, + "step": 1806 + }, + { + "epoch": 0.13037047725551026, + "grad_norm": 2.6034760640280137, + "learning_rate": 3.895270318629184e-06, + "loss": 1.0265, + "step": 1807 + }, + { + "epoch": 0.13044262472493778, + "grad_norm": 4.633883013240139, + "learning_rate": 3.895121013199251e-06, + "loss": 0.9078, + "step": 1808 + }, + { + "epoch": 0.13051477219436527, + "grad_norm": 2.0789928488486833, + "learning_rate": 3.8949716042837746e-06, + "loss": 0.9297, + "step": 1809 + }, + { + "epoch": 0.1305869196637928, + "grad_norm": 3.8458836533935155, + "learning_rate": 3.8948220918909145e-06, + "loss": 0.9622, + "step": 1810 + }, + { + "epoch": 0.1306590671332203, + "grad_norm": 2.275429289002246, + "learning_rate": 3.8946724760288345e-06, + "loss": 1.0592, + "step": 1811 + }, + { + "epoch": 0.1307312146026478, + "grad_norm": 2.3649353795971697, + "learning_rate": 3.894522756705704e-06, + "loss": 1.0158, + "step": 1812 + }, + { + "epoch": 0.13080336207207532, + "grad_norm": 9.112006146061326, + "learning_rate": 3.8943729339296994e-06, + "loss": 0.9962, + "step": 1813 + }, + { + "epoch": 0.13087550954150284, + "grad_norm": 5.543758924750162, + "learning_rate": 3.894223007709001e-06, + "loss": 0.8913, + "step": 1814 + }, + { + "epoch": 0.13094765701093034, + "grad_norm": 3.8889556122157245, + "learning_rate": 3.8940729780517975e-06, + "loss": 1.0118, + "step": 1815 + }, + { + "epoch": 0.13101980448035785, + "grad_norm": 2.8426677614762212, + "learning_rate": 3.893922844966279e-06, + "loss": 0.9664, + "step": 1816 + }, + { + "epoch": 0.13109195194978537, + "grad_norm": 2.161638024149448, + "learning_rate": 3.893772608460646e-06, + "loss": 1.0507, + "step": 1817 + }, + { + "epoch": 0.13116409941921287, + "grad_norm": 2.873472720288669, + "learning_rate": 3.8936222685431e-06, + "loss": 1.0239, + "step": 1818 + }, + { + "epoch": 0.13123624688864038, + "grad_norm": 2.8958794752511134, + "learning_rate": 3.893471825221853e-06, + "loss": 0.9154, + "step": 1819 + }, + { + "epoch": 0.1313083943580679, + "grad_norm": 2.1347939924779213, + "learning_rate": 3.893321278505119e-06, + "loss": 0.8963, + "step": 1820 + }, + { + "epoch": 0.1313805418274954, + "grad_norm": 3.1188209479753377, + "learning_rate": 3.893170628401118e-06, + "loss": 1.0045, + "step": 1821 + }, + { + "epoch": 0.13145268929692291, + "grad_norm": 2.735290025470334, + "learning_rate": 3.893019874918078e-06, + "loss": 0.9961, + "step": 1822 + }, + { + "epoch": 0.13152483676635043, + "grad_norm": 2.569850427898721, + "learning_rate": 3.89286901806423e-06, + "loss": 1.0442, + "step": 1823 + }, + { + "epoch": 0.13159698423577793, + "grad_norm": 3.1969689736672935, + "learning_rate": 3.892718057847813e-06, + "loss": 1.0158, + "step": 1824 + }, + { + "epoch": 0.13166913170520544, + "grad_norm": 3.465434096146029, + "learning_rate": 3.892566994277069e-06, + "loss": 1.0459, + "step": 1825 + }, + { + "epoch": 0.13174127917463294, + "grad_norm": 2.14296677853907, + "learning_rate": 3.892415827360247e-06, + "loss": 0.9643, + "step": 1826 + }, + { + "epoch": 0.13181342664406046, + "grad_norm": 2.525274499759046, + "learning_rate": 3.892264557105602e-06, + "loss": 0.9622, + "step": 1827 + }, + { + "epoch": 0.13188557411348797, + "grad_norm": 3.2857318605225427, + "learning_rate": 3.892113183521395e-06, + "loss": 0.9952, + "step": 1828 + }, + { + "epoch": 0.13195772158291547, + "grad_norm": 12.569226034853315, + "learning_rate": 3.891961706615891e-06, + "loss": 0.9465, + "step": 1829 + }, + { + "epoch": 0.13202986905234299, + "grad_norm": 16.949891684757162, + "learning_rate": 3.891810126397362e-06, + "loss": 0.9478, + "step": 1830 + }, + { + "epoch": 0.1321020165217705, + "grad_norm": 2.8439054341857624, + "learning_rate": 3.8916584428740865e-06, + "loss": 1.0429, + "step": 1831 + }, + { + "epoch": 0.132174163991198, + "grad_norm": 4.126728967084395, + "learning_rate": 3.891506656054344e-06, + "loss": 0.9318, + "step": 1832 + }, + { + "epoch": 0.13224631146062552, + "grad_norm": 2.4019470383509134, + "learning_rate": 3.891354765946427e-06, + "loss": 1.0471, + "step": 1833 + }, + { + "epoch": 0.13231845893005303, + "grad_norm": 2.805966141581437, + "learning_rate": 3.891202772558626e-06, + "loss": 1.0995, + "step": 1834 + }, + { + "epoch": 0.13239060639948053, + "grad_norm": 3.601547539073793, + "learning_rate": 3.891050675899245e-06, + "loss": 0.907, + "step": 1835 + }, + { + "epoch": 0.13246275386890805, + "grad_norm": 2.5620522456801553, + "learning_rate": 3.890898475976585e-06, + "loss": 0.9764, + "step": 1836 + }, + { + "epoch": 0.13253490133833556, + "grad_norm": 2.771830778403549, + "learning_rate": 3.89074617279896e-06, + "loss": 0.9429, + "step": 1837 + }, + { + "epoch": 0.13260704880776306, + "grad_norm": 2.780963411460159, + "learning_rate": 3.890593766374685e-06, + "loss": 1.177, + "step": 1838 + }, + { + "epoch": 0.13267919627719058, + "grad_norm": 11.708655157225373, + "learning_rate": 3.890441256712084e-06, + "loss": 1.0367, + "step": 1839 + }, + { + "epoch": 0.1327513437466181, + "grad_norm": 2.61179326550496, + "learning_rate": 3.8902886438194834e-06, + "loss": 0.9139, + "step": 1840 + }, + { + "epoch": 0.13282349121604559, + "grad_norm": 2.46614265374553, + "learning_rate": 3.890135927705218e-06, + "loss": 0.9228, + "step": 1841 + }, + { + "epoch": 0.1328956386854731, + "grad_norm": 2.9941008105433404, + "learning_rate": 3.889983108377627e-06, + "loss": 1.0162, + "step": 1842 + }, + { + "epoch": 0.13296778615490062, + "grad_norm": 2.384232295054277, + "learning_rate": 3.889830185845054e-06, + "loss": 1.0154, + "step": 1843 + }, + { + "epoch": 0.13303993362432812, + "grad_norm": 2.790563612174087, + "learning_rate": 3.889677160115852e-06, + "loss": 1.0156, + "step": 1844 + }, + { + "epoch": 0.13311208109375564, + "grad_norm": 2.4317903403646803, + "learning_rate": 3.8895240311983745e-06, + "loss": 0.9904, + "step": 1845 + }, + { + "epoch": 0.13318422856318315, + "grad_norm": 2.3860230875808797, + "learning_rate": 3.889370799100984e-06, + "loss": 1.0041, + "step": 1846 + }, + { + "epoch": 0.13325637603261065, + "grad_norm": 3.059845688862175, + "learning_rate": 3.889217463832049e-06, + "loss": 0.9404, + "step": 1847 + }, + { + "epoch": 0.13332852350203817, + "grad_norm": 1.0536422636405651, + "learning_rate": 3.889064025399942e-06, + "loss": 0.8163, + "step": 1848 + }, + { + "epoch": 0.13340067097146568, + "grad_norm": 2.731559044574245, + "learning_rate": 3.888910483813042e-06, + "loss": 0.9718, + "step": 1849 + }, + { + "epoch": 0.13347281844089318, + "grad_norm": 4.788477004635793, + "learning_rate": 3.888756839079733e-06, + "loss": 1.0331, + "step": 1850 + }, + { + "epoch": 0.1335449659103207, + "grad_norm": 2.367710571914557, + "learning_rate": 3.888603091208404e-06, + "loss": 0.943, + "step": 1851 + }, + { + "epoch": 0.13361711337974821, + "grad_norm": 3.511444047106471, + "learning_rate": 3.888449240207452e-06, + "loss": 1.0635, + "step": 1852 + }, + { + "epoch": 0.1336892608491757, + "grad_norm": 4.753908256085238, + "learning_rate": 3.888295286085278e-06, + "loss": 0.8869, + "step": 1853 + }, + { + "epoch": 0.13376140831860323, + "grad_norm": 3.497412951615233, + "learning_rate": 3.888141228850288e-06, + "loss": 0.9402, + "step": 1854 + }, + { + "epoch": 0.13383355578803074, + "grad_norm": 11.978896339794048, + "learning_rate": 3.887987068510896e-06, + "loss": 0.9393, + "step": 1855 + }, + { + "epoch": 0.13390570325745824, + "grad_norm": 0.8579834856231326, + "learning_rate": 3.887832805075519e-06, + "loss": 0.7803, + "step": 1856 + }, + { + "epoch": 0.13397785072688576, + "grad_norm": 2.9823577614158965, + "learning_rate": 3.8876784385525806e-06, + "loss": 0.9539, + "step": 1857 + }, + { + "epoch": 0.13404999819631327, + "grad_norm": 2.399738070976842, + "learning_rate": 3.887523968950512e-06, + "loss": 1.0599, + "step": 1858 + }, + { + "epoch": 0.13412214566574077, + "grad_norm": 2.6381211006011016, + "learning_rate": 3.887369396277745e-06, + "loss": 0.9271, + "step": 1859 + }, + { + "epoch": 0.13419429313516829, + "grad_norm": 3.819515964339364, + "learning_rate": 3.8872147205427235e-06, + "loss": 1.0326, + "step": 1860 + }, + { + "epoch": 0.1342664406045958, + "grad_norm": 3.663965024647924, + "learning_rate": 3.887059941753891e-06, + "loss": 0.9589, + "step": 1861 + }, + { + "epoch": 0.1343385880740233, + "grad_norm": 2.277729714928081, + "learning_rate": 3.8869050599197014e-06, + "loss": 0.9882, + "step": 1862 + }, + { + "epoch": 0.13441073554345082, + "grad_norm": 2.377075192178152, + "learning_rate": 3.886750075048612e-06, + "loss": 0.9278, + "step": 1863 + }, + { + "epoch": 0.13448288301287833, + "grad_norm": 2.8341610391395133, + "learning_rate": 3.8865949871490845e-06, + "loss": 0.9486, + "step": 1864 + }, + { + "epoch": 0.13455503048230583, + "grad_norm": 3.709591912705962, + "learning_rate": 3.88643979622959e-06, + "loss": 0.9984, + "step": 1865 + }, + { + "epoch": 0.13462717795173335, + "grad_norm": 2.262193699119601, + "learning_rate": 3.8862845022986006e-06, + "loss": 0.9708, + "step": 1866 + }, + { + "epoch": 0.13469932542116086, + "grad_norm": 3.1994518525807543, + "learning_rate": 3.886129105364598e-06, + "loss": 1.0395, + "step": 1867 + }, + { + "epoch": 0.13477147289058836, + "grad_norm": 2.209754222822249, + "learning_rate": 3.8859736054360665e-06, + "loss": 0.9418, + "step": 1868 + }, + { + "epoch": 0.13484362036001588, + "grad_norm": 2.0284132883345634, + "learning_rate": 3.885818002521499e-06, + "loss": 0.9899, + "step": 1869 + }, + { + "epoch": 0.1349157678294434, + "grad_norm": 2.2446274723150466, + "learning_rate": 3.88566229662939e-06, + "loss": 0.9397, + "step": 1870 + }, + { + "epoch": 0.1349879152988709, + "grad_norm": 16.7674405980455, + "learning_rate": 3.885506487768244e-06, + "loss": 0.9361, + "step": 1871 + }, + { + "epoch": 0.1350600627682984, + "grad_norm": 1.948857642841197, + "learning_rate": 3.8853505759465696e-06, + "loss": 0.8685, + "step": 1872 + }, + { + "epoch": 0.13513221023772592, + "grad_norm": 1.0087039760220418, + "learning_rate": 3.885194561172878e-06, + "loss": 0.8145, + "step": 1873 + }, + { + "epoch": 0.13520435770715342, + "grad_norm": 2.401715055247813, + "learning_rate": 3.885038443455692e-06, + "loss": 0.8704, + "step": 1874 + }, + { + "epoch": 0.13527650517658094, + "grad_norm": 2.4905858169594164, + "learning_rate": 3.884882222803533e-06, + "loss": 0.9592, + "step": 1875 + }, + { + "epoch": 0.13534865264600845, + "grad_norm": 1.9754832943977791, + "learning_rate": 3.884725899224934e-06, + "loss": 0.9268, + "step": 1876 + }, + { + "epoch": 0.13542080011543595, + "grad_norm": 2.1062879791005122, + "learning_rate": 3.884569472728432e-06, + "loss": 0.9846, + "step": 1877 + }, + { + "epoch": 0.13549294758486347, + "grad_norm": 4.666122927096581, + "learning_rate": 3.884412943322566e-06, + "loss": 1.0286, + "step": 1878 + }, + { + "epoch": 0.13556509505429096, + "grad_norm": 3.7008328969523268, + "learning_rate": 3.884256311015885e-06, + "loss": 0.9557, + "step": 1879 + }, + { + "epoch": 0.13563724252371848, + "grad_norm": 2.6023352222881235, + "learning_rate": 3.884099575816943e-06, + "loss": 0.9468, + "step": 1880 + }, + { + "epoch": 0.135709389993146, + "grad_norm": 2.2840216534284603, + "learning_rate": 3.883942737734297e-06, + "loss": 1.0678, + "step": 1881 + }, + { + "epoch": 0.1357815374625735, + "grad_norm": 2.357973806881236, + "learning_rate": 3.883785796776513e-06, + "loss": 0.8415, + "step": 1882 + }, + { + "epoch": 0.135853684932001, + "grad_norm": 2.498629098765854, + "learning_rate": 3.883628752952159e-06, + "loss": 0.9402, + "step": 1883 + }, + { + "epoch": 0.13592583240142853, + "grad_norm": 3.3383852981075455, + "learning_rate": 3.883471606269813e-06, + "loss": 0.9749, + "step": 1884 + }, + { + "epoch": 0.13599797987085602, + "grad_norm": 2.697446116845282, + "learning_rate": 3.883314356738054e-06, + "loss": 0.9679, + "step": 1885 + }, + { + "epoch": 0.13607012734028354, + "grad_norm": 2.1094670028402382, + "learning_rate": 3.88315700436547e-06, + "loss": 0.9969, + "step": 1886 + }, + { + "epoch": 0.13614227480971106, + "grad_norm": 3.617205560033546, + "learning_rate": 3.882999549160654e-06, + "loss": 1.0084, + "step": 1887 + }, + { + "epoch": 0.13621442227913855, + "grad_norm": 2.2708731014957495, + "learning_rate": 3.882841991132203e-06, + "loss": 0.9881, + "step": 1888 + }, + { + "epoch": 0.13628656974856607, + "grad_norm": 1.9884016379012561, + "learning_rate": 3.88268433028872e-06, + "loss": 1.0218, + "step": 1889 + }, + { + "epoch": 0.13635871721799359, + "grad_norm": 2.2343713720332214, + "learning_rate": 3.882526566638816e-06, + "loss": 1.015, + "step": 1890 + }, + { + "epoch": 0.13643086468742108, + "grad_norm": 2.487712417810883, + "learning_rate": 3.882368700191105e-06, + "loss": 0.9812, + "step": 1891 + }, + { + "epoch": 0.1365030121568486, + "grad_norm": 2.190973521328108, + "learning_rate": 3.882210730954208e-06, + "loss": 0.9675, + "step": 1892 + }, + { + "epoch": 0.13657515962627612, + "grad_norm": 2.568293094142379, + "learning_rate": 3.8820526589367495e-06, + "loss": 0.9756, + "step": 1893 + }, + { + "epoch": 0.1366473070957036, + "grad_norm": 4.988142449248431, + "learning_rate": 3.8818944841473625e-06, + "loss": 0.9396, + "step": 1894 + }, + { + "epoch": 0.13671945456513113, + "grad_norm": 4.065795969905092, + "learning_rate": 3.881736206594686e-06, + "loss": 1.0515, + "step": 1895 + }, + { + "epoch": 0.13679160203455865, + "grad_norm": 2.5670452280492335, + "learning_rate": 3.881577826287359e-06, + "loss": 0.9516, + "step": 1896 + }, + { + "epoch": 0.13686374950398614, + "grad_norm": 2.3060385671733714, + "learning_rate": 3.881419343234034e-06, + "loss": 0.9385, + "step": 1897 + }, + { + "epoch": 0.13693589697341366, + "grad_norm": 3.432312258637885, + "learning_rate": 3.881260757443362e-06, + "loss": 0.9083, + "step": 1898 + }, + { + "epoch": 0.13700804444284118, + "grad_norm": 2.7691061234339425, + "learning_rate": 3.881102068924006e-06, + "loss": 0.9294, + "step": 1899 + }, + { + "epoch": 0.13708019191226867, + "grad_norm": 2.7016432565298065, + "learning_rate": 3.880943277684628e-06, + "loss": 0.9742, + "step": 1900 + }, + { + "epoch": 0.1371523393816962, + "grad_norm": 2.7485372814740185, + "learning_rate": 3.880784383733901e-06, + "loss": 0.9149, + "step": 1901 + }, + { + "epoch": 0.1372244868511237, + "grad_norm": 3.467974876884794, + "learning_rate": 3.880625387080502e-06, + "loss": 0.9553, + "step": 1902 + }, + { + "epoch": 0.1372966343205512, + "grad_norm": 0.9602368599534634, + "learning_rate": 3.880466287733111e-06, + "loss": 0.7591, + "step": 1903 + }, + { + "epoch": 0.13736878178997872, + "grad_norm": 2.284802949117579, + "learning_rate": 3.8803070857004184e-06, + "loss": 0.8841, + "step": 1904 + }, + { + "epoch": 0.13744092925940624, + "grad_norm": 2.0355148145998156, + "learning_rate": 3.880147780991116e-06, + "loss": 1.0434, + "step": 1905 + }, + { + "epoch": 0.13751307672883373, + "grad_norm": 6.302169353644166, + "learning_rate": 3.879988373613903e-06, + "loss": 1.0335, + "step": 1906 + }, + { + "epoch": 0.13758522419826125, + "grad_norm": 2.84900824708237, + "learning_rate": 3.879828863577486e-06, + "loss": 0.9236, + "step": 1907 + }, + { + "epoch": 0.13765737166768877, + "grad_norm": 3.185318892487592, + "learning_rate": 3.8796692508905725e-06, + "loss": 1.1028, + "step": 1908 + }, + { + "epoch": 0.13772951913711626, + "grad_norm": 3.805306950142425, + "learning_rate": 3.8795095355618796e-06, + "loss": 0.9277, + "step": 1909 + }, + { + "epoch": 0.13780166660654378, + "grad_norm": 2.2789671462445678, + "learning_rate": 3.879349717600129e-06, + "loss": 1.0016, + "step": 1910 + }, + { + "epoch": 0.1378738140759713, + "grad_norm": 2.322362232043595, + "learning_rate": 3.8791897970140466e-06, + "loss": 0.9574, + "step": 1911 + }, + { + "epoch": 0.1379459615453988, + "grad_norm": 3.2970048779401484, + "learning_rate": 3.879029773812366e-06, + "loss": 0.9677, + "step": 1912 + }, + { + "epoch": 0.1380181090148263, + "grad_norm": 5.43987844780152, + "learning_rate": 3.878869648003826e-06, + "loss": 0.9298, + "step": 1913 + }, + { + "epoch": 0.13809025648425383, + "grad_norm": 2.4764769145244636, + "learning_rate": 3.87870941959717e-06, + "loss": 1.0586, + "step": 1914 + }, + { + "epoch": 0.13816240395368132, + "grad_norm": 6.761813528365213, + "learning_rate": 3.878549088601147e-06, + "loss": 0.9455, + "step": 1915 + }, + { + "epoch": 0.13823455142310884, + "grad_norm": 2.8160588953675942, + "learning_rate": 3.878388655024513e-06, + "loss": 0.9206, + "step": 1916 + }, + { + "epoch": 0.13830669889253636, + "grad_norm": 3.3722054956793976, + "learning_rate": 3.878228118876027e-06, + "loss": 0.8226, + "step": 1917 + }, + { + "epoch": 0.13837884636196385, + "grad_norm": 1.091020229396808, + "learning_rate": 3.878067480164457e-06, + "loss": 0.9935, + "step": 1918 + }, + { + "epoch": 0.13845099383139137, + "grad_norm": 2.712391839749576, + "learning_rate": 3.877906738898574e-06, + "loss": 1.016, + "step": 1919 + }, + { + "epoch": 0.13852314130081889, + "grad_norm": 5.100047144952417, + "learning_rate": 3.877745895087157e-06, + "loss": 0.9768, + "step": 1920 + }, + { + "epoch": 0.13859528877024638, + "grad_norm": 2.6522984451671965, + "learning_rate": 3.877584948738986e-06, + "loss": 0.9741, + "step": 1921 + }, + { + "epoch": 0.1386674362396739, + "grad_norm": 3.022121251358085, + "learning_rate": 3.877423899862853e-06, + "loss": 1.0691, + "step": 1922 + }, + { + "epoch": 0.13873958370910142, + "grad_norm": 2.1613586280224304, + "learning_rate": 3.87726274846755e-06, + "loss": 1.0121, + "step": 1923 + }, + { + "epoch": 0.1388117311785289, + "grad_norm": 3.970188631049321, + "learning_rate": 3.877101494561878e-06, + "loss": 0.9024, + "step": 1924 + }, + { + "epoch": 0.13888387864795643, + "grad_norm": 1.8465088715369864, + "learning_rate": 3.876940138154642e-06, + "loss": 0.9843, + "step": 1925 + }, + { + "epoch": 0.13895602611738395, + "grad_norm": 2.939148947353469, + "learning_rate": 3.876778679254653e-06, + "loss": 1.0233, + "step": 1926 + }, + { + "epoch": 0.13902817358681144, + "grad_norm": 2.8221643343901777, + "learning_rate": 3.876617117870728e-06, + "loss": 1.0015, + "step": 1927 + }, + { + "epoch": 0.13910032105623896, + "grad_norm": 2.961457299979118, + "learning_rate": 3.8764554540116895e-06, + "loss": 0.9012, + "step": 1928 + }, + { + "epoch": 0.13917246852566645, + "grad_norm": 5.518335816338542, + "learning_rate": 3.876293687686365e-06, + "loss": 0.8612, + "step": 1929 + }, + { + "epoch": 0.13924461599509397, + "grad_norm": 4.569705029406836, + "learning_rate": 3.876131818903588e-06, + "loss": 0.9475, + "step": 1930 + }, + { + "epoch": 0.1393167634645215, + "grad_norm": 3.795193900481046, + "learning_rate": 3.875969847672198e-06, + "loss": 0.9756, + "step": 1931 + }, + { + "epoch": 0.13938891093394898, + "grad_norm": 3.0251775263878495, + "learning_rate": 3.875807774001038e-06, + "loss": 0.8057, + "step": 1932 + }, + { + "epoch": 0.1394610584033765, + "grad_norm": 6.280777794847701, + "learning_rate": 3.875645597898961e-06, + "loss": 0.8834, + "step": 1933 + }, + { + "epoch": 0.13953320587280402, + "grad_norm": 4.164550294628958, + "learning_rate": 3.8754833193748194e-06, + "loss": 0.9784, + "step": 1934 + }, + { + "epoch": 0.1396053533422315, + "grad_norm": 6.231399935312176, + "learning_rate": 3.8753209384374785e-06, + "loss": 0.9496, + "step": 1935 + }, + { + "epoch": 0.13967750081165903, + "grad_norm": 2.8688748667018324, + "learning_rate": 3.875158455095802e-06, + "loss": 0.9974, + "step": 1936 + }, + { + "epoch": 0.13974964828108655, + "grad_norm": 2.961669509305289, + "learning_rate": 3.8749958693586646e-06, + "loss": 1.0432, + "step": 1937 + }, + { + "epoch": 0.13982179575051404, + "grad_norm": 3.4427775704505175, + "learning_rate": 3.874833181234944e-06, + "loss": 1.0448, + "step": 1938 + }, + { + "epoch": 0.13989394321994156, + "grad_norm": 2.909389317986385, + "learning_rate": 3.874670390733523e-06, + "loss": 1.1346, + "step": 1939 + }, + { + "epoch": 0.13996609068936908, + "grad_norm": 2.0723352303711273, + "learning_rate": 3.874507497863292e-06, + "loss": 0.962, + "step": 1940 + }, + { + "epoch": 0.14003823815879657, + "grad_norm": 2.304607118400949, + "learning_rate": 3.8743445026331455e-06, + "loss": 0.8594, + "step": 1941 + }, + { + "epoch": 0.1401103856282241, + "grad_norm": 2.506196544200806, + "learning_rate": 3.874181405051984e-06, + "loss": 0.9425, + "step": 1942 + }, + { + "epoch": 0.1401825330976516, + "grad_norm": 1.015336156999799, + "learning_rate": 3.8740182051287155e-06, + "loss": 0.825, + "step": 1943 + }, + { + "epoch": 0.1402546805670791, + "grad_norm": 3.190780447555418, + "learning_rate": 3.8738549028722495e-06, + "loss": 0.9972, + "step": 1944 + }, + { + "epoch": 0.14032682803650662, + "grad_norm": 2.475515342774266, + "learning_rate": 3.873691498291504e-06, + "loss": 0.9892, + "step": 1945 + }, + { + "epoch": 0.14039897550593414, + "grad_norm": 0.8262329800421097, + "learning_rate": 3.873527991395402e-06, + "loss": 0.8364, + "step": 1946 + }, + { + "epoch": 0.14047112297536163, + "grad_norm": 3.6125944237429954, + "learning_rate": 3.873364382192871e-06, + "loss": 0.8861, + "step": 1947 + }, + { + "epoch": 0.14054327044478915, + "grad_norm": 2.0796778334028256, + "learning_rate": 3.873200670692846e-06, + "loss": 0.9753, + "step": 1948 + }, + { + "epoch": 0.14061541791421667, + "grad_norm": 2.305096399273277, + "learning_rate": 3.8730368569042685e-06, + "loss": 1.0332, + "step": 1949 + }, + { + "epoch": 0.14068756538364416, + "grad_norm": 2.298793069706698, + "learning_rate": 3.872872940836081e-06, + "loss": 0.9962, + "step": 1950 + }, + { + "epoch": 0.14075971285307168, + "grad_norm": 0.8668700315832267, + "learning_rate": 3.872708922497235e-06, + "loss": 0.8401, + "step": 1951 + }, + { + "epoch": 0.1408318603224992, + "grad_norm": 2.3471038219176834, + "learning_rate": 3.872544801896688e-06, + "loss": 0.9723, + "step": 1952 + }, + { + "epoch": 0.1409040077919267, + "grad_norm": 2.4509653680752304, + "learning_rate": 3.872380579043401e-06, + "loss": 1.019, + "step": 1953 + }, + { + "epoch": 0.1409761552613542, + "grad_norm": 2.640384189076293, + "learning_rate": 3.872216253946341e-06, + "loss": 0.9861, + "step": 1954 + }, + { + "epoch": 0.14104830273078173, + "grad_norm": 2.890411534031023, + "learning_rate": 3.8720518266144836e-06, + "loss": 0.8501, + "step": 1955 + }, + { + "epoch": 0.14112045020020922, + "grad_norm": 0.8272256105850448, + "learning_rate": 3.871887297056805e-06, + "loss": 0.7722, + "step": 1956 + }, + { + "epoch": 0.14119259766963674, + "grad_norm": 2.604622081353843, + "learning_rate": 3.871722665282291e-06, + "loss": 1.0372, + "step": 1957 + }, + { + "epoch": 0.14126474513906426, + "grad_norm": 3.046239072508499, + "learning_rate": 3.8715579312999304e-06, + "loss": 0.9943, + "step": 1958 + }, + { + "epoch": 0.14133689260849175, + "grad_norm": 2.5104773317554683, + "learning_rate": 3.87139309511872e-06, + "loss": 0.9069, + "step": 1959 + }, + { + "epoch": 0.14140904007791927, + "grad_norm": 2.828793272866583, + "learning_rate": 3.87122815674766e-06, + "loss": 1.0023, + "step": 1960 + }, + { + "epoch": 0.1414811875473468, + "grad_norm": 3.8118745884955305, + "learning_rate": 3.871063116195758e-06, + "loss": 0.9732, + "step": 1961 + }, + { + "epoch": 0.14155333501677428, + "grad_norm": 3.991060758620934, + "learning_rate": 3.870897973472026e-06, + "loss": 0.9811, + "step": 1962 + }, + { + "epoch": 0.1416254824862018, + "grad_norm": 2.0999358167376747, + "learning_rate": 3.870732728585481e-06, + "loss": 0.9316, + "step": 1963 + }, + { + "epoch": 0.14169762995562932, + "grad_norm": 2.786952227497499, + "learning_rate": 3.870567381545147e-06, + "loss": 0.94, + "step": 1964 + }, + { + "epoch": 0.1417697774250568, + "grad_norm": 3.8543252998217086, + "learning_rate": 3.870401932360053e-06, + "loss": 0.9684, + "step": 1965 + }, + { + "epoch": 0.14184192489448433, + "grad_norm": 2.541091436945544, + "learning_rate": 3.870236381039232e-06, + "loss": 1.0347, + "step": 1966 + }, + { + "epoch": 0.14191407236391185, + "grad_norm": 2.444748883603761, + "learning_rate": 3.870070727591727e-06, + "loss": 0.9849, + "step": 1967 + }, + { + "epoch": 0.14198621983333934, + "grad_norm": 3.175590556332747, + "learning_rate": 3.869904972026582e-06, + "loss": 0.9926, + "step": 1968 + }, + { + "epoch": 0.14205836730276686, + "grad_norm": 2.6625053799713654, + "learning_rate": 3.869739114352849e-06, + "loss": 0.8966, + "step": 1969 + }, + { + "epoch": 0.14213051477219438, + "grad_norm": 1.8849193134797195, + "learning_rate": 3.869573154579585e-06, + "loss": 0.9223, + "step": 1970 + }, + { + "epoch": 0.14220266224162187, + "grad_norm": 2.1416468722050865, + "learning_rate": 3.869407092715851e-06, + "loss": 0.9989, + "step": 1971 + }, + { + "epoch": 0.1422748097110494, + "grad_norm": 6.149095434639687, + "learning_rate": 3.869240928770716e-06, + "loss": 0.9381, + "step": 1972 + }, + { + "epoch": 0.1423469571804769, + "grad_norm": 2.354572992508574, + "learning_rate": 3.869074662753253e-06, + "loss": 1.0096, + "step": 1973 + }, + { + "epoch": 0.1424191046499044, + "grad_norm": 3.812779150576941, + "learning_rate": 3.8689082946725435e-06, + "loss": 0.9952, + "step": 1974 + }, + { + "epoch": 0.14249125211933192, + "grad_norm": 5.587218323163884, + "learning_rate": 3.868741824537669e-06, + "loss": 0.9758, + "step": 1975 + }, + { + "epoch": 0.14256339958875944, + "grad_norm": 2.441686897931784, + "learning_rate": 3.868575252357722e-06, + "loss": 1.0093, + "step": 1976 + }, + { + "epoch": 0.14263554705818693, + "grad_norm": 3.1793862089960996, + "learning_rate": 3.868408578141798e-06, + "loss": 1.0298, + "step": 1977 + }, + { + "epoch": 0.14270769452761445, + "grad_norm": 2.2656479406839645, + "learning_rate": 3.868241801898997e-06, + "loss": 1.0198, + "step": 1978 + }, + { + "epoch": 0.14277984199704197, + "grad_norm": 1.7232140570018513, + "learning_rate": 3.868074923638428e-06, + "loss": 0.9633, + "step": 1979 + }, + { + "epoch": 0.14285198946646946, + "grad_norm": 3.984598190937839, + "learning_rate": 3.8679079433692026e-06, + "loss": 0.9658, + "step": 1980 + }, + { + "epoch": 0.14292413693589698, + "grad_norm": 2.1607239237708873, + "learning_rate": 3.86774086110044e-06, + "loss": 1.1578, + "step": 1981 + }, + { + "epoch": 0.14299628440532447, + "grad_norm": 4.232057056626481, + "learning_rate": 3.867573676841262e-06, + "loss": 1.0587, + "step": 1982 + }, + { + "epoch": 0.143068431874752, + "grad_norm": 2.0333981232665326, + "learning_rate": 3.8674063906008e-06, + "loss": 0.9983, + "step": 1983 + }, + { + "epoch": 0.1431405793441795, + "grad_norm": 2.9884039560693396, + "learning_rate": 3.8672390023881865e-06, + "loss": 0.8947, + "step": 1984 + }, + { + "epoch": 0.143212726813607, + "grad_norm": 2.4110181061971816, + "learning_rate": 3.867071512212564e-06, + "loss": 0.972, + "step": 1985 + }, + { + "epoch": 0.14328487428303452, + "grad_norm": 2.472267638609644, + "learning_rate": 3.866903920083079e-06, + "loss": 0.9226, + "step": 1986 + }, + { + "epoch": 0.14335702175246204, + "grad_norm": 2.7612926796567274, + "learning_rate": 3.86673622600888e-06, + "loss": 0.8983, + "step": 1987 + }, + { + "epoch": 0.14342916922188953, + "grad_norm": 2.1380648931748443, + "learning_rate": 3.866568429999128e-06, + "loss": 0.9155, + "step": 1988 + }, + { + "epoch": 0.14350131669131705, + "grad_norm": 1.925660352026436, + "learning_rate": 3.866400532062983e-06, + "loss": 1.0409, + "step": 1989 + }, + { + "epoch": 0.14357346416074457, + "grad_norm": 2.4908706869330284, + "learning_rate": 3.866232532209614e-06, + "loss": 0.9388, + "step": 1990 + }, + { + "epoch": 0.14364561163017206, + "grad_norm": 2.088295507471555, + "learning_rate": 3.866064430448195e-06, + "loss": 1.0194, + "step": 1991 + }, + { + "epoch": 0.14371775909959958, + "grad_norm": 3.1220538175002166, + "learning_rate": 3.865896226787905e-06, + "loss": 1.0245, + "step": 1992 + }, + { + "epoch": 0.1437899065690271, + "grad_norm": 6.059996727165119, + "learning_rate": 3.86572792123793e-06, + "loss": 0.862, + "step": 1993 + }, + { + "epoch": 0.1438620540384546, + "grad_norm": 2.9931540260333365, + "learning_rate": 3.8655595138074594e-06, + "loss": 0.9905, + "step": 1994 + }, + { + "epoch": 0.1439342015078821, + "grad_norm": 3.420857456755677, + "learning_rate": 3.865391004505691e-06, + "loss": 1.0452, + "step": 1995 + }, + { + "epoch": 0.14400634897730963, + "grad_norm": 4.366332050538704, + "learning_rate": 3.865222393341824e-06, + "loss": 1.02, + "step": 1996 + }, + { + "epoch": 0.14407849644673712, + "grad_norm": 3.2160175169187775, + "learning_rate": 3.865053680325067e-06, + "loss": 1.0724, + "step": 1997 + }, + { + "epoch": 0.14415064391616464, + "grad_norm": 2.687022676884002, + "learning_rate": 3.864884865464633e-06, + "loss": 0.9911, + "step": 1998 + }, + { + "epoch": 0.14422279138559216, + "grad_norm": 3.0191917080480812, + "learning_rate": 3.864715948769741e-06, + "loss": 0.9225, + "step": 1999 + }, + { + "epoch": 0.14429493885501965, + "grad_norm": 3.4655635735422865, + "learning_rate": 3.864546930249612e-06, + "loss": 0.8993, + "step": 2000 + }, + { + "epoch": 0.14436708632444717, + "grad_norm": 0.7525613282011356, + "learning_rate": 3.8643778099134785e-06, + "loss": 0.7174, + "step": 2001 + }, + { + "epoch": 0.1444392337938747, + "grad_norm": 3.204488324579024, + "learning_rate": 3.864208587770575e-06, + "loss": 0.9539, + "step": 2002 + }, + { + "epoch": 0.14451138126330218, + "grad_norm": 4.698294252952778, + "learning_rate": 3.864039263830141e-06, + "loss": 1.0298, + "step": 2003 + }, + { + "epoch": 0.1445835287327297, + "grad_norm": 3.0138501888980156, + "learning_rate": 3.863869838101423e-06, + "loss": 1.0348, + "step": 2004 + }, + { + "epoch": 0.14465567620215722, + "grad_norm": 2.3529812914570267, + "learning_rate": 3.863700310593674e-06, + "loss": 0.9477, + "step": 2005 + }, + { + "epoch": 0.1447278236715847, + "grad_norm": 2.5511360330652892, + "learning_rate": 3.863530681316149e-06, + "loss": 0.9396, + "step": 2006 + }, + { + "epoch": 0.14479997114101223, + "grad_norm": 4.669717245751978, + "learning_rate": 3.863360950278113e-06, + "loss": 0.8727, + "step": 2007 + }, + { + "epoch": 0.14487211861043975, + "grad_norm": 3.0892878248111435, + "learning_rate": 3.863191117488833e-06, + "loss": 0.881, + "step": 2008 + }, + { + "epoch": 0.14494426607986724, + "grad_norm": 3.7931163378051065, + "learning_rate": 3.8630211829575835e-06, + "loss": 1.0069, + "step": 2009 + }, + { + "epoch": 0.14501641354929476, + "grad_norm": 4.072895541059077, + "learning_rate": 3.862851146693644e-06, + "loss": 0.7972, + "step": 2010 + }, + { + "epoch": 0.14508856101872228, + "grad_norm": 13.190473320763788, + "learning_rate": 3.862681008706299e-06, + "loss": 1.0131, + "step": 2011 + }, + { + "epoch": 0.14516070848814977, + "grad_norm": 6.6697991958886265, + "learning_rate": 3.862510769004839e-06, + "loss": 0.956, + "step": 2012 + }, + { + "epoch": 0.1452328559575773, + "grad_norm": 4.121793714622715, + "learning_rate": 3.862340427598561e-06, + "loss": 1.0167, + "step": 2013 + }, + { + "epoch": 0.1453050034270048, + "grad_norm": 2.4302260608465267, + "learning_rate": 3.8621699844967675e-06, + "loss": 0.8684, + "step": 2014 + }, + { + "epoch": 0.1453771508964323, + "grad_norm": 3.8231720938657685, + "learning_rate": 3.861999439708764e-06, + "loss": 0.8908, + "step": 2015 + }, + { + "epoch": 0.14544929836585982, + "grad_norm": 3.0812204355674706, + "learning_rate": 3.861828793243864e-06, + "loss": 0.9762, + "step": 2016 + }, + { + "epoch": 0.14552144583528734, + "grad_norm": 22.906359246840438, + "learning_rate": 3.861658045111386e-06, + "loss": 0.897, + "step": 2017 + }, + { + "epoch": 0.14559359330471483, + "grad_norm": 3.029160556148643, + "learning_rate": 3.861487195320653e-06, + "loss": 0.8548, + "step": 2018 + }, + { + "epoch": 0.14566574077414235, + "grad_norm": 2.1221914243654814, + "learning_rate": 3.861316243880996e-06, + "loss": 0.9482, + "step": 2019 + }, + { + "epoch": 0.14573788824356987, + "grad_norm": 2.0482749519481205, + "learning_rate": 3.861145190801749e-06, + "loss": 1.0489, + "step": 2020 + }, + { + "epoch": 0.14581003571299736, + "grad_norm": 2.724765043016064, + "learning_rate": 3.860974036092253e-06, + "loss": 0.9156, + "step": 2021 + }, + { + "epoch": 0.14588218318242488, + "grad_norm": 2.9304235728968187, + "learning_rate": 3.860802779761854e-06, + "loss": 0.9201, + "step": 2022 + }, + { + "epoch": 0.1459543306518524, + "grad_norm": 0.8711953895015114, + "learning_rate": 3.860631421819904e-06, + "loss": 0.72, + "step": 2023 + }, + { + "epoch": 0.1460264781212799, + "grad_norm": 5.409319937616927, + "learning_rate": 3.860459962275759e-06, + "loss": 0.9333, + "step": 2024 + }, + { + "epoch": 0.1460986255907074, + "grad_norm": 4.105012499921495, + "learning_rate": 3.860288401138783e-06, + "loss": 0.9566, + "step": 2025 + }, + { + "epoch": 0.14617077306013493, + "grad_norm": 3.7735879998374346, + "learning_rate": 3.8601167384183445e-06, + "loss": 0.9595, + "step": 2026 + }, + { + "epoch": 0.14624292052956242, + "grad_norm": 2.820204291552963, + "learning_rate": 3.859944974123816e-06, + "loss": 0.7634, + "step": 2027 + }, + { + "epoch": 0.14631506799898994, + "grad_norm": 4.0587138212357905, + "learning_rate": 3.8597731082645785e-06, + "loss": 0.9837, + "step": 2028 + }, + { + "epoch": 0.14638721546841746, + "grad_norm": 8.20696757099879, + "learning_rate": 3.859601140850016e-06, + "loss": 0.9024, + "step": 2029 + }, + { + "epoch": 0.14645936293784495, + "grad_norm": 3.136895237728367, + "learning_rate": 3.859429071889518e-06, + "loss": 0.959, + "step": 2030 + }, + { + "epoch": 0.14653151040727247, + "grad_norm": 2.698194737384763, + "learning_rate": 3.859256901392484e-06, + "loss": 1.0165, + "step": 2031 + }, + { + "epoch": 0.14660365787669996, + "grad_norm": 2.474656391374587, + "learning_rate": 3.859084629368312e-06, + "loss": 0.9172, + "step": 2032 + }, + { + "epoch": 0.14667580534612748, + "grad_norm": 5.354241317446312, + "learning_rate": 3.85891225582641e-06, + "loss": 1.012, + "step": 2033 + }, + { + "epoch": 0.146747952815555, + "grad_norm": 2.7793168625478146, + "learning_rate": 3.858739780776192e-06, + "loss": 0.9002, + "step": 2034 + }, + { + "epoch": 0.1468201002849825, + "grad_norm": 3.9940984821661196, + "learning_rate": 3.858567204227075e-06, + "loss": 0.9162, + "step": 2035 + }, + { + "epoch": 0.14689224775441, + "grad_norm": 24.329205027808786, + "learning_rate": 3.8583945261884835e-06, + "loss": 0.9539, + "step": 2036 + }, + { + "epoch": 0.14696439522383753, + "grad_norm": 2.428898526851514, + "learning_rate": 3.858221746669847e-06, + "loss": 0.9862, + "step": 2037 + }, + { + "epoch": 0.14703654269326502, + "grad_norm": 2.4140734626002605, + "learning_rate": 3.8580488656805985e-06, + "loss": 0.9224, + "step": 2038 + }, + { + "epoch": 0.14710869016269254, + "grad_norm": 2.7460217744557145, + "learning_rate": 3.85787588323018e-06, + "loss": 0.9181, + "step": 2039 + }, + { + "epoch": 0.14718083763212006, + "grad_norm": 3.4653134205084477, + "learning_rate": 3.857702799328038e-06, + "loss": 1.0943, + "step": 2040 + }, + { + "epoch": 0.14725298510154755, + "grad_norm": 2.6586687631755024, + "learning_rate": 3.857529613983623e-06, + "loss": 0.9501, + "step": 2041 + }, + { + "epoch": 0.14732513257097507, + "grad_norm": 2.1964879266071455, + "learning_rate": 3.857356327206391e-06, + "loss": 0.9167, + "step": 2042 + }, + { + "epoch": 0.1473972800404026, + "grad_norm": 3.768568732360781, + "learning_rate": 3.8571829390058066e-06, + "loss": 1.0332, + "step": 2043 + }, + { + "epoch": 0.14746942750983008, + "grad_norm": 2.5995464589611323, + "learning_rate": 3.857009449391337e-06, + "loss": 1.0012, + "step": 2044 + }, + { + "epoch": 0.1475415749792576, + "grad_norm": 4.371922964033376, + "learning_rate": 3.856835858372455e-06, + "loss": 0.8512, + "step": 2045 + }, + { + "epoch": 0.14761372244868512, + "grad_norm": 2.7139684789312803, + "learning_rate": 3.856662165958641e-06, + "loss": 0.9166, + "step": 2046 + }, + { + "epoch": 0.1476858699181126, + "grad_norm": 4.30901627185128, + "learning_rate": 3.856488372159379e-06, + "loss": 1.0092, + "step": 2047 + }, + { + "epoch": 0.14775801738754013, + "grad_norm": 3.113237445288597, + "learning_rate": 3.856314476984161e-06, + "loss": 0.9504, + "step": 2048 + }, + { + "epoch": 0.14783016485696765, + "grad_norm": 2.2552967245092774, + "learning_rate": 3.8561404804424795e-06, + "loss": 0.9583, + "step": 2049 + }, + { + "epoch": 0.14790231232639514, + "grad_norm": 10.346871357430082, + "learning_rate": 3.855966382543838e-06, + "loss": 0.9118, + "step": 2050 + }, + { + "epoch": 0.14797445979582266, + "grad_norm": 3.826344476908638, + "learning_rate": 3.855792183297743e-06, + "loss": 0.8898, + "step": 2051 + }, + { + "epoch": 0.14804660726525018, + "grad_norm": 2.1554133685671975, + "learning_rate": 3.855617882713707e-06, + "loss": 0.9895, + "step": 2052 + }, + { + "epoch": 0.14811875473467767, + "grad_norm": 0.8845423401941082, + "learning_rate": 3.855443480801247e-06, + "loss": 0.8366, + "step": 2053 + }, + { + "epoch": 0.1481909022041052, + "grad_norm": 3.5518924665583755, + "learning_rate": 3.855268977569888e-06, + "loss": 0.9251, + "step": 2054 + }, + { + "epoch": 0.1482630496735327, + "grad_norm": 0.892273531638992, + "learning_rate": 3.855094373029158e-06, + "loss": 0.8836, + "step": 2055 + }, + { + "epoch": 0.1483351971429602, + "grad_norm": 2.2283923744401233, + "learning_rate": 3.8549196671885915e-06, + "loss": 0.9055, + "step": 2056 + }, + { + "epoch": 0.14840734461238772, + "grad_norm": 4.6910604560711695, + "learning_rate": 3.854744860057728e-06, + "loss": 0.992, + "step": 2057 + }, + { + "epoch": 0.14847949208181524, + "grad_norm": 3.854632100363197, + "learning_rate": 3.8545699516461145e-06, + "loss": 1.0553, + "step": 2058 + }, + { + "epoch": 0.14855163955124273, + "grad_norm": 2.1114396904785893, + "learning_rate": 3.854394941963301e-06, + "loss": 0.9169, + "step": 2059 + }, + { + "epoch": 0.14862378702067025, + "grad_norm": 3.758450713654201, + "learning_rate": 3.854219831018845e-06, + "loss": 0.9592, + "step": 2060 + }, + { + "epoch": 0.14869593449009777, + "grad_norm": 2.0811557324571504, + "learning_rate": 3.854044618822307e-06, + "loss": 0.916, + "step": 2061 + }, + { + "epoch": 0.14876808195952526, + "grad_norm": 3.108179370520404, + "learning_rate": 3.853869305383256e-06, + "loss": 1.0606, + "step": 2062 + }, + { + "epoch": 0.14884022942895278, + "grad_norm": 2.879772371727298, + "learning_rate": 3.853693890711265e-06, + "loss": 0.9527, + "step": 2063 + }, + { + "epoch": 0.1489123768983803, + "grad_norm": 1.9480341367609142, + "learning_rate": 3.853518374815913e-06, + "loss": 1.0214, + "step": 2064 + }, + { + "epoch": 0.1489845243678078, + "grad_norm": 1.9808921587079247, + "learning_rate": 3.853342757706785e-06, + "loss": 0.8617, + "step": 2065 + }, + { + "epoch": 0.1490566718372353, + "grad_norm": 2.07152133414488, + "learning_rate": 3.853167039393469e-06, + "loss": 0.8745, + "step": 2066 + }, + { + "epoch": 0.14912881930666283, + "grad_norm": 2.476734143395154, + "learning_rate": 3.852991219885562e-06, + "loss": 1.0026, + "step": 2067 + }, + { + "epoch": 0.14920096677609032, + "grad_norm": 7.227331568029675, + "learning_rate": 3.852815299192662e-06, + "loss": 1.0218, + "step": 2068 + }, + { + "epoch": 0.14927311424551784, + "grad_norm": 3.131126197301497, + "learning_rate": 3.852639277324379e-06, + "loss": 1.0993, + "step": 2069 + }, + { + "epoch": 0.14934526171494536, + "grad_norm": 3.9864740564604246, + "learning_rate": 3.852463154290323e-06, + "loss": 0.9966, + "step": 2070 + }, + { + "epoch": 0.14941740918437285, + "grad_norm": 3.3346014630566323, + "learning_rate": 3.852286930100112e-06, + "loss": 1.0325, + "step": 2071 + }, + { + "epoch": 0.14948955665380037, + "grad_norm": 2.4216048551452176, + "learning_rate": 3.852110604763368e-06, + "loss": 0.9301, + "step": 2072 + }, + { + "epoch": 0.1495617041232279, + "grad_norm": 2.1960053894657783, + "learning_rate": 3.8519341782897204e-06, + "loss": 0.8601, + "step": 2073 + }, + { + "epoch": 0.14963385159265538, + "grad_norm": 3.712516753804329, + "learning_rate": 3.851757650688803e-06, + "loss": 0.9738, + "step": 2074 + }, + { + "epoch": 0.1497059990620829, + "grad_norm": 2.0857794198132398, + "learning_rate": 3.851581021970255e-06, + "loss": 1.0078, + "step": 2075 + }, + { + "epoch": 0.14977814653151042, + "grad_norm": 2.9719864721080187, + "learning_rate": 3.851404292143723e-06, + "loss": 1.047, + "step": 2076 + }, + { + "epoch": 0.1498502940009379, + "grad_norm": 5.849327974763225, + "learning_rate": 3.851227461218855e-06, + "loss": 1.0515, + "step": 2077 + }, + { + "epoch": 0.14992244147036543, + "grad_norm": 3.134448758566738, + "learning_rate": 3.851050529205309e-06, + "loss": 0.9452, + "step": 2078 + }, + { + "epoch": 0.14999458893979295, + "grad_norm": 3.383538078038595, + "learning_rate": 3.8508734961127454e-06, + "loss": 0.9978, + "step": 2079 + }, + { + "epoch": 0.15006673640922044, + "grad_norm": 2.5040860163628067, + "learning_rate": 3.850696361950832e-06, + "loss": 0.895, + "step": 2080 + }, + { + "epoch": 0.15013888387864796, + "grad_norm": 1.7826251025926434, + "learning_rate": 3.850519126729241e-06, + "loss": 1.0313, + "step": 2081 + }, + { + "epoch": 0.15021103134807548, + "grad_norm": 3.032970137039409, + "learning_rate": 3.8503417904576515e-06, + "loss": 1.0137, + "step": 2082 + }, + { + "epoch": 0.15028317881750297, + "grad_norm": 5.088042824019147, + "learning_rate": 3.850164353145747e-06, + "loss": 1.0599, + "step": 2083 + }, + { + "epoch": 0.1503553262869305, + "grad_norm": 0.8471149175121871, + "learning_rate": 3.8499868148032145e-06, + "loss": 0.8613, + "step": 2084 + }, + { + "epoch": 0.15042747375635798, + "grad_norm": 2.2791587964281774, + "learning_rate": 3.849809175439752e-06, + "loss": 0.8992, + "step": 2085 + }, + { + "epoch": 0.1504996212257855, + "grad_norm": 4.508662680808082, + "learning_rate": 3.849631435065058e-06, + "loss": 0.8771, + "step": 2086 + }, + { + "epoch": 0.15057176869521302, + "grad_norm": 2.3563187364018288, + "learning_rate": 3.8494535936888375e-06, + "loss": 0.8977, + "step": 2087 + }, + { + "epoch": 0.1506439161646405, + "grad_norm": 2.448570932380943, + "learning_rate": 3.849275651320804e-06, + "loss": 0.855, + "step": 2088 + }, + { + "epoch": 0.15071606363406803, + "grad_norm": 2.883046706695589, + "learning_rate": 3.849097607970672e-06, + "loss": 0.9127, + "step": 2089 + }, + { + "epoch": 0.15078821110349555, + "grad_norm": 2.546923397049034, + "learning_rate": 3.848919463648165e-06, + "loss": 0.9149, + "step": 2090 + }, + { + "epoch": 0.15086035857292304, + "grad_norm": 1.7520281075004007, + "learning_rate": 3.848741218363011e-06, + "loss": 0.9812, + "step": 2091 + }, + { + "epoch": 0.15093250604235056, + "grad_norm": 2.6626773040907934, + "learning_rate": 3.848562872124942e-06, + "loss": 0.918, + "step": 2092 + }, + { + "epoch": 0.15100465351177808, + "grad_norm": 2.5649018545131335, + "learning_rate": 3.848384424943699e-06, + "loss": 1.0132, + "step": 2093 + }, + { + "epoch": 0.15107680098120557, + "grad_norm": 3.5982892104025237, + "learning_rate": 3.848205876829024e-06, + "loss": 0.9403, + "step": 2094 + }, + { + "epoch": 0.1511489484506331, + "grad_norm": 1.8229665767558307, + "learning_rate": 3.848027227790668e-06, + "loss": 0.9986, + "step": 2095 + }, + { + "epoch": 0.1512210959200606, + "grad_norm": 1.8190933532667228, + "learning_rate": 3.847848477838387e-06, + "loss": 0.9515, + "step": 2096 + }, + { + "epoch": 0.1512932433894881, + "grad_norm": 2.251849897521595, + "learning_rate": 3.84766962698194e-06, + "loss": 1.0862, + "step": 2097 + }, + { + "epoch": 0.15136539085891562, + "grad_norm": 3.7598488221314437, + "learning_rate": 3.8474906752310945e-06, + "loss": 0.9542, + "step": 2098 + }, + { + "epoch": 0.15143753832834314, + "grad_norm": 2.224740979837465, + "learning_rate": 3.847311622595623e-06, + "loss": 0.8859, + "step": 2099 + }, + { + "epoch": 0.15150968579777063, + "grad_norm": 1.8788564441448068, + "learning_rate": 3.847132469085302e-06, + "loss": 0.9185, + "step": 2100 + }, + { + "epoch": 0.15158183326719815, + "grad_norm": 2.5751838322174896, + "learning_rate": 3.846953214709916e-06, + "loss": 0.9329, + "step": 2101 + }, + { + "epoch": 0.15165398073662567, + "grad_norm": 2.7394568603873095, + "learning_rate": 3.846773859479251e-06, + "loss": 0.9029, + "step": 2102 + }, + { + "epoch": 0.15172612820605316, + "grad_norm": 18.91909590708304, + "learning_rate": 3.846594403403101e-06, + "loss": 1.0287, + "step": 2103 + }, + { + "epoch": 0.15179827567548068, + "grad_norm": 2.2574221791724143, + "learning_rate": 3.846414846491268e-06, + "loss": 0.9151, + "step": 2104 + }, + { + "epoch": 0.1518704231449082, + "grad_norm": 3.825886472813385, + "learning_rate": 3.846235188753555e-06, + "loss": 1.0392, + "step": 2105 + }, + { + "epoch": 0.1519425706143357, + "grad_norm": 2.583644899427766, + "learning_rate": 3.846055430199773e-06, + "loss": 0.9583, + "step": 2106 + }, + { + "epoch": 0.1520147180837632, + "grad_norm": 2.4177858348183148, + "learning_rate": 3.845875570839738e-06, + "loss": 0.8927, + "step": 2107 + }, + { + "epoch": 0.15208686555319073, + "grad_norm": 0.9462889365244369, + "learning_rate": 3.845695610683271e-06, + "loss": 0.8636, + "step": 2108 + }, + { + "epoch": 0.15215901302261822, + "grad_norm": 2.698732809883981, + "learning_rate": 3.8455155497401995e-06, + "loss": 0.9733, + "step": 2109 + }, + { + "epoch": 0.15223116049204574, + "grad_norm": 3.1118850161729683, + "learning_rate": 3.8453353880203545e-06, + "loss": 0.9901, + "step": 2110 + }, + { + "epoch": 0.15230330796147326, + "grad_norm": 4.388472603049919, + "learning_rate": 3.845155125533576e-06, + "loss": 0.9718, + "step": 2111 + }, + { + "epoch": 0.15237545543090075, + "grad_norm": 4.004394501480383, + "learning_rate": 3.8449747622897075e-06, + "loss": 0.9315, + "step": 2112 + }, + { + "epoch": 0.15244760290032827, + "grad_norm": 3.380515078286006, + "learning_rate": 3.844794298298596e-06, + "loss": 0.9639, + "step": 2113 + }, + { + "epoch": 0.1525197503697558, + "grad_norm": 3.698939279635933, + "learning_rate": 3.844613733570097e-06, + "loss": 0.9593, + "step": 2114 + }, + { + "epoch": 0.15259189783918328, + "grad_norm": 3.5749590998090617, + "learning_rate": 3.844433068114072e-06, + "loss": 1.0338, + "step": 2115 + }, + { + "epoch": 0.1526640453086108, + "grad_norm": 3.6933011474538184, + "learning_rate": 3.844252301940384e-06, + "loss": 0.9599, + "step": 2116 + }, + { + "epoch": 0.15273619277803832, + "grad_norm": 7.627119426403976, + "learning_rate": 3.844071435058904e-06, + "loss": 1.0467, + "step": 2117 + }, + { + "epoch": 0.1528083402474658, + "grad_norm": 3.908127356485364, + "learning_rate": 3.843890467479511e-06, + "loss": 0.844, + "step": 2118 + }, + { + "epoch": 0.15288048771689333, + "grad_norm": 2.5947146173158426, + "learning_rate": 3.843709399212084e-06, + "loss": 1.0512, + "step": 2119 + }, + { + "epoch": 0.15295263518632085, + "grad_norm": 5.990279589004453, + "learning_rate": 3.843528230266514e-06, + "loss": 0.9648, + "step": 2120 + }, + { + "epoch": 0.15302478265574834, + "grad_norm": 4.454629423096734, + "learning_rate": 3.84334696065269e-06, + "loss": 0.9403, + "step": 2121 + }, + { + "epoch": 0.15309693012517586, + "grad_norm": 2.69368794451615, + "learning_rate": 3.843165590380512e-06, + "loss": 0.9344, + "step": 2122 + }, + { + "epoch": 0.15316907759460338, + "grad_norm": 6.767380836770794, + "learning_rate": 3.8429841194598855e-06, + "loss": 1.0144, + "step": 2123 + }, + { + "epoch": 0.15324122506403087, + "grad_norm": 13.763282602564217, + "learning_rate": 3.842802547900719e-06, + "loss": 0.958, + "step": 2124 + }, + { + "epoch": 0.1533133725334584, + "grad_norm": 3.366351596957873, + "learning_rate": 3.842620875712926e-06, + "loss": 0.9156, + "step": 2125 + }, + { + "epoch": 0.1533855200028859, + "grad_norm": 3.1806585155482026, + "learning_rate": 3.842439102906429e-06, + "loss": 1.1142, + "step": 2126 + }, + { + "epoch": 0.1534576674723134, + "grad_norm": 2.4940600400854476, + "learning_rate": 3.842257229491152e-06, + "loss": 1.0158, + "step": 2127 + }, + { + "epoch": 0.15352981494174092, + "grad_norm": 5.262283306236255, + "learning_rate": 3.842075255477028e-06, + "loss": 0.9763, + "step": 2128 + }, + { + "epoch": 0.15360196241116844, + "grad_norm": 5.816715147732721, + "learning_rate": 3.841893180873994e-06, + "loss": 0.9366, + "step": 2129 + }, + { + "epoch": 0.15367410988059593, + "grad_norm": 2.439177791551409, + "learning_rate": 3.841711005691992e-06, + "loss": 0.9772, + "step": 2130 + }, + { + "epoch": 0.15374625735002345, + "grad_norm": 3.129870472138309, + "learning_rate": 3.841528729940969e-06, + "loss": 0.9855, + "step": 2131 + }, + { + "epoch": 0.15381840481945097, + "grad_norm": 3.1051386051908034, + "learning_rate": 3.841346353630879e-06, + "loss": 0.9007, + "step": 2132 + }, + { + "epoch": 0.15389055228887846, + "grad_norm": 4.408989021175124, + "learning_rate": 3.841163876771681e-06, + "loss": 1.0787, + "step": 2133 + }, + { + "epoch": 0.15396269975830598, + "grad_norm": 2.8175403782304493, + "learning_rate": 3.84098129937334e-06, + "loss": 0.9572, + "step": 2134 + }, + { + "epoch": 0.15403484722773347, + "grad_norm": 3.362154923781227, + "learning_rate": 3.840798621445825e-06, + "loss": 0.9984, + "step": 2135 + }, + { + "epoch": 0.154106994697161, + "grad_norm": 3.239553388025829, + "learning_rate": 3.840615842999112e-06, + "loss": 0.909, + "step": 2136 + }, + { + "epoch": 0.1541791421665885, + "grad_norm": 3.9791226342978847, + "learning_rate": 3.840432964043182e-06, + "loss": 1.0853, + "step": 2137 + }, + { + "epoch": 0.154251289636016, + "grad_norm": 3.2923106919582805, + "learning_rate": 3.840249984588021e-06, + "loss": 0.9492, + "step": 2138 + }, + { + "epoch": 0.15432343710544352, + "grad_norm": 3.604168195485285, + "learning_rate": 3.84006690464362e-06, + "loss": 0.8908, + "step": 2139 + }, + { + "epoch": 0.15439558457487104, + "grad_norm": 3.612606303093026, + "learning_rate": 3.839883724219976e-06, + "loss": 0.9987, + "step": 2140 + }, + { + "epoch": 0.15446773204429853, + "grad_norm": 4.662251541022073, + "learning_rate": 3.839700443327094e-06, + "loss": 0.9032, + "step": 2141 + }, + { + "epoch": 0.15453987951372605, + "grad_norm": 8.391798793304371, + "learning_rate": 3.839517061974981e-06, + "loss": 0.8987, + "step": 2142 + }, + { + "epoch": 0.15461202698315357, + "grad_norm": 3.275826176690787, + "learning_rate": 3.839333580173651e-06, + "loss": 0.8926, + "step": 2143 + }, + { + "epoch": 0.15468417445258106, + "grad_norm": 5.830712447171577, + "learning_rate": 3.839149997933123e-06, + "loss": 0.9233, + "step": 2144 + }, + { + "epoch": 0.15475632192200858, + "grad_norm": 3.53170894914933, + "learning_rate": 3.838966315263422e-06, + "loss": 0.9239, + "step": 2145 + }, + { + "epoch": 0.1548284693914361, + "grad_norm": 3.5385884689606297, + "learning_rate": 3.838782532174579e-06, + "loss": 0.9881, + "step": 2146 + }, + { + "epoch": 0.1549006168608636, + "grad_norm": 0.7796176259871158, + "learning_rate": 3.838598648676628e-06, + "loss": 0.8511, + "step": 2147 + }, + { + "epoch": 0.1549727643302911, + "grad_norm": 23.218264999561093, + "learning_rate": 3.838414664779611e-06, + "loss": 0.9973, + "step": 2148 + }, + { + "epoch": 0.15504491179971863, + "grad_norm": 4.560733400896476, + "learning_rate": 3.838230580493575e-06, + "loss": 0.9238, + "step": 2149 + }, + { + "epoch": 0.15511705926914612, + "grad_norm": 4.058101679295623, + "learning_rate": 3.838046395828572e-06, + "loss": 1.0594, + "step": 2150 + }, + { + "epoch": 0.15518920673857364, + "grad_norm": 3.8150856130450443, + "learning_rate": 3.837862110794659e-06, + "loss": 0.9276, + "step": 2151 + }, + { + "epoch": 0.15526135420800116, + "grad_norm": 5.813998777701056, + "learning_rate": 3.8376777254018994e-06, + "loss": 0.8965, + "step": 2152 + }, + { + "epoch": 0.15533350167742865, + "grad_norm": 8.470736675270773, + "learning_rate": 3.837493239660363e-06, + "loss": 0.9662, + "step": 2153 + }, + { + "epoch": 0.15540564914685617, + "grad_norm": 4.580017531915284, + "learning_rate": 3.837308653580122e-06, + "loss": 0.8342, + "step": 2154 + }, + { + "epoch": 0.1554777966162837, + "grad_norm": 2.168877158710847, + "learning_rate": 3.837123967171258e-06, + "loss": 0.9995, + "step": 2155 + }, + { + "epoch": 0.15554994408571118, + "grad_norm": 3.574230623486084, + "learning_rate": 3.8369391804438545e-06, + "loss": 0.8807, + "step": 2156 + }, + { + "epoch": 0.1556220915551387, + "grad_norm": 4.354202319034223, + "learning_rate": 3.836754293408002e-06, + "loss": 0.8829, + "step": 2157 + }, + { + "epoch": 0.15569423902456622, + "grad_norm": 2.4987258525252187, + "learning_rate": 3.836569306073798e-06, + "loss": 0.9692, + "step": 2158 + }, + { + "epoch": 0.1557663864939937, + "grad_norm": 2.808284058030088, + "learning_rate": 3.836384218451342e-06, + "loss": 0.9735, + "step": 2159 + }, + { + "epoch": 0.15583853396342123, + "grad_norm": 4.64809322845405, + "learning_rate": 3.8361990305507415e-06, + "loss": 0.8879, + "step": 2160 + }, + { + "epoch": 0.15591068143284875, + "grad_norm": 2.6799998643505005, + "learning_rate": 3.83601374238211e-06, + "loss": 1.0053, + "step": 2161 + }, + { + "epoch": 0.15598282890227624, + "grad_norm": 2.9786278475272705, + "learning_rate": 3.835828353955565e-06, + "loss": 0.916, + "step": 2162 + }, + { + "epoch": 0.15605497637170376, + "grad_norm": 2.2705113827256875, + "learning_rate": 3.8356428652812295e-06, + "loss": 1.0165, + "step": 2163 + }, + { + "epoch": 0.15612712384113128, + "grad_norm": 3.5767510454002087, + "learning_rate": 3.835457276369232e-06, + "loss": 0.9032, + "step": 2164 + }, + { + "epoch": 0.15619927131055877, + "grad_norm": 1.8764199284526604, + "learning_rate": 3.835271587229708e-06, + "loss": 0.9365, + "step": 2165 + }, + { + "epoch": 0.1562714187799863, + "grad_norm": 0.8742011715346363, + "learning_rate": 3.835085797872795e-06, + "loss": 0.8569, + "step": 2166 + }, + { + "epoch": 0.1563435662494138, + "grad_norm": 3.2001616377538595, + "learning_rate": 3.8348999083086414e-06, + "loss": 0.9546, + "step": 2167 + }, + { + "epoch": 0.1564157137188413, + "grad_norm": 12.116891568356007, + "learning_rate": 3.834713918547396e-06, + "loss": 0.9577, + "step": 2168 + }, + { + "epoch": 0.15648786118826882, + "grad_norm": 3.967034156723427, + "learning_rate": 3.834527828599216e-06, + "loss": 0.9296, + "step": 2169 + }, + { + "epoch": 0.15656000865769634, + "grad_norm": 2.8847312434477894, + "learning_rate": 3.834341638474261e-06, + "loss": 0.9874, + "step": 2170 + }, + { + "epoch": 0.15663215612712383, + "grad_norm": 3.725451417746742, + "learning_rate": 3.834155348182701e-06, + "loss": 0.9948, + "step": 2171 + }, + { + "epoch": 0.15670430359655135, + "grad_norm": 5.405915178283407, + "learning_rate": 3.833968957734706e-06, + "loss": 0.9905, + "step": 2172 + }, + { + "epoch": 0.15677645106597887, + "grad_norm": 0.8575548490326588, + "learning_rate": 3.833782467140456e-06, + "loss": 0.865, + "step": 2173 + }, + { + "epoch": 0.15684859853540636, + "grad_norm": 2.7801160751095924, + "learning_rate": 3.833595876410134e-06, + "loss": 1.0001, + "step": 2174 + }, + { + "epoch": 0.15692074600483388, + "grad_norm": 2.980990582901375, + "learning_rate": 3.833409185553929e-06, + "loss": 0.9223, + "step": 2175 + }, + { + "epoch": 0.1569928934742614, + "grad_norm": 3.0327896455338963, + "learning_rate": 3.833222394582035e-06, + "loss": 0.9224, + "step": 2176 + }, + { + "epoch": 0.1570650409436889, + "grad_norm": 3.2148532426955843, + "learning_rate": 3.833035503504653e-06, + "loss": 1.0116, + "step": 2177 + }, + { + "epoch": 0.1571371884131164, + "grad_norm": 1.0645298921393611, + "learning_rate": 3.832848512331987e-06, + "loss": 0.8286, + "step": 2178 + }, + { + "epoch": 0.15720933588254393, + "grad_norm": 3.288977832565229, + "learning_rate": 3.832661421074249e-06, + "loss": 0.9346, + "step": 2179 + }, + { + "epoch": 0.15728148335197142, + "grad_norm": 2.1585056835079413, + "learning_rate": 3.832474229741655e-06, + "loss": 0.9336, + "step": 2180 + }, + { + "epoch": 0.15735363082139894, + "grad_norm": 2.5249192933129843, + "learning_rate": 3.832286938344428e-06, + "loss": 0.9258, + "step": 2181 + }, + { + "epoch": 0.15742577829082646, + "grad_norm": 3.9532980899837344, + "learning_rate": 3.832099546892792e-06, + "loss": 0.9441, + "step": 2182 + }, + { + "epoch": 0.15749792576025395, + "grad_norm": 9.021923066907371, + "learning_rate": 3.831912055396984e-06, + "loss": 1.0677, + "step": 2183 + }, + { + "epoch": 0.15757007322968147, + "grad_norm": 3.0602198306138284, + "learning_rate": 3.831724463867239e-06, + "loss": 1.014, + "step": 2184 + }, + { + "epoch": 0.157642220699109, + "grad_norm": 2.741957695615875, + "learning_rate": 3.8315367723138025e-06, + "loss": 1.0164, + "step": 2185 + }, + { + "epoch": 0.15771436816853648, + "grad_norm": 2.097310355596341, + "learning_rate": 3.831348980746922e-06, + "loss": 1.019, + "step": 2186 + }, + { + "epoch": 0.157786515637964, + "grad_norm": 2.8070127576287356, + "learning_rate": 3.831161089176855e-06, + "loss": 0.9791, + "step": 2187 + }, + { + "epoch": 0.1578586631073915, + "grad_norm": 2.790876124788557, + "learning_rate": 3.830973097613859e-06, + "loss": 1.0166, + "step": 2188 + }, + { + "epoch": 0.15793081057681901, + "grad_norm": 2.862744623251278, + "learning_rate": 3.830785006068199e-06, + "loss": 0.9989, + "step": 2189 + }, + { + "epoch": 0.15800295804624653, + "grad_norm": 2.8012442957536186, + "learning_rate": 3.830596814550148e-06, + "loss": 0.9969, + "step": 2190 + }, + { + "epoch": 0.15807510551567402, + "grad_norm": 4.003359337647337, + "learning_rate": 3.830408523069981e-06, + "loss": 0.9443, + "step": 2191 + }, + { + "epoch": 0.15814725298510154, + "grad_norm": 2.988762152378675, + "learning_rate": 3.830220131637981e-06, + "loss": 0.9283, + "step": 2192 + }, + { + "epoch": 0.15821940045452906, + "grad_norm": 2.635943625216329, + "learning_rate": 3.8300316402644345e-06, + "loss": 0.9044, + "step": 2193 + }, + { + "epoch": 0.15829154792395655, + "grad_norm": 3.11886268672062, + "learning_rate": 3.829843048959635e-06, + "loss": 0.9404, + "step": 2194 + }, + { + "epoch": 0.15836369539338407, + "grad_norm": 3.0933783722207577, + "learning_rate": 3.829654357733881e-06, + "loss": 1.0161, + "step": 2195 + }, + { + "epoch": 0.1584358428628116, + "grad_norm": 3.2641184859087558, + "learning_rate": 3.829465566597474e-06, + "loss": 0.9814, + "step": 2196 + }, + { + "epoch": 0.15850799033223908, + "grad_norm": 5.284025647906101, + "learning_rate": 3.829276675560727e-06, + "loss": 1.0325, + "step": 2197 + }, + { + "epoch": 0.1585801378016666, + "grad_norm": 2.8089831086326846, + "learning_rate": 3.829087684633951e-06, + "loss": 1.0668, + "step": 2198 + }, + { + "epoch": 0.15865228527109412, + "grad_norm": 2.6080553749676607, + "learning_rate": 3.8288985938274675e-06, + "loss": 0.9763, + "step": 2199 + }, + { + "epoch": 0.15872443274052161, + "grad_norm": 3.0325003336051637, + "learning_rate": 3.828709403151603e-06, + "loss": 1.0344, + "step": 2200 + }, + { + "epoch": 0.15879658020994913, + "grad_norm": 2.8184969228064856, + "learning_rate": 3.8285201126166865e-06, + "loss": 0.9752, + "step": 2201 + }, + { + "epoch": 0.15886872767937665, + "grad_norm": 3.7416830021445695, + "learning_rate": 3.8283307222330566e-06, + "loss": 0.9777, + "step": 2202 + }, + { + "epoch": 0.15894087514880414, + "grad_norm": 2.154906033726453, + "learning_rate": 3.828141232011054e-06, + "loss": 0.9454, + "step": 2203 + }, + { + "epoch": 0.15901302261823166, + "grad_norm": 2.6892175730578067, + "learning_rate": 3.827951641961026e-06, + "loss": 1.0261, + "step": 2204 + }, + { + "epoch": 0.15908517008765918, + "grad_norm": 4.080959694134026, + "learning_rate": 3.827761952093326e-06, + "loss": 1.0336, + "step": 2205 + }, + { + "epoch": 0.15915731755708667, + "grad_norm": 4.390202512132462, + "learning_rate": 3.827572162418312e-06, + "loss": 0.9735, + "step": 2206 + }, + { + "epoch": 0.1592294650265142, + "grad_norm": 4.776793952802371, + "learning_rate": 3.827382272946348e-06, + "loss": 0.9237, + "step": 2207 + }, + { + "epoch": 0.1593016124959417, + "grad_norm": 8.49025257179727, + "learning_rate": 3.827192283687801e-06, + "loss": 0.8884, + "step": 2208 + }, + { + "epoch": 0.1593737599653692, + "grad_norm": 4.014424779215996, + "learning_rate": 3.827002194653049e-06, + "loss": 1.0207, + "step": 2209 + }, + { + "epoch": 0.15944590743479672, + "grad_norm": 2.842477031470699, + "learning_rate": 3.826812005852471e-06, + "loss": 0.9237, + "step": 2210 + }, + { + "epoch": 0.15951805490422424, + "grad_norm": 2.6233303573022786, + "learning_rate": 3.826621717296451e-06, + "loss": 0.8676, + "step": 2211 + }, + { + "epoch": 0.15959020237365174, + "grad_norm": 3.04572372204503, + "learning_rate": 3.826431328995381e-06, + "loss": 0.9759, + "step": 2212 + }, + { + "epoch": 0.15966234984307925, + "grad_norm": 2.5411755968151586, + "learning_rate": 3.826240840959658e-06, + "loss": 1.0827, + "step": 2213 + }, + { + "epoch": 0.15973449731250677, + "grad_norm": 4.330699829564956, + "learning_rate": 3.826050253199682e-06, + "loss": 0.9854, + "step": 2214 + }, + { + "epoch": 0.15980664478193427, + "grad_norm": 3.1393007741511383, + "learning_rate": 3.825859565725862e-06, + "loss": 1.0007, + "step": 2215 + }, + { + "epoch": 0.15987879225136178, + "grad_norm": 2.5182433155902335, + "learning_rate": 3.82566877854861e-06, + "loss": 0.9815, + "step": 2216 + }, + { + "epoch": 0.1599509397207893, + "grad_norm": 3.2951464709315395, + "learning_rate": 3.825477891678345e-06, + "loss": 0.9458, + "step": 2217 + }, + { + "epoch": 0.1600230871902168, + "grad_norm": 2.774565905048265, + "learning_rate": 3.82528690512549e-06, + "loss": 0.9789, + "step": 2218 + }, + { + "epoch": 0.16009523465964431, + "grad_norm": 2.5930335652525063, + "learning_rate": 3.825095818900474e-06, + "loss": 0.9755, + "step": 2219 + }, + { + "epoch": 0.16016738212907183, + "grad_norm": 4.925668762035862, + "learning_rate": 3.824904633013731e-06, + "loss": 0.9622, + "step": 2220 + }, + { + "epoch": 0.16023952959849933, + "grad_norm": 2.844590618265212, + "learning_rate": 3.824713347475702e-06, + "loss": 0.9323, + "step": 2221 + }, + { + "epoch": 0.16031167706792684, + "grad_norm": 2.91118915839885, + "learning_rate": 3.824521962296832e-06, + "loss": 0.9887, + "step": 2222 + }, + { + "epoch": 0.16038382453735436, + "grad_norm": 1.0088975373521556, + "learning_rate": 3.824330477487572e-06, + "loss": 0.8775, + "step": 2223 + }, + { + "epoch": 0.16045597200678186, + "grad_norm": 3.3001832679952727, + "learning_rate": 3.824138893058377e-06, + "loss": 0.9257, + "step": 2224 + }, + { + "epoch": 0.16052811947620937, + "grad_norm": 4.199050459927179, + "learning_rate": 3.823947209019712e-06, + "loss": 0.9658, + "step": 2225 + }, + { + "epoch": 0.1606002669456369, + "grad_norm": 2.8927798925241093, + "learning_rate": 3.82375542538204e-06, + "loss": 0.9631, + "step": 2226 + }, + { + "epoch": 0.16067241441506439, + "grad_norm": 4.703923255878266, + "learning_rate": 3.8235635421558365e-06, + "loss": 1.0208, + "step": 2227 + }, + { + "epoch": 0.1607445618844919, + "grad_norm": 4.076773584002993, + "learning_rate": 3.823371559351578e-06, + "loss": 0.9871, + "step": 2228 + }, + { + "epoch": 0.16081670935391942, + "grad_norm": 3.7114287041270067, + "learning_rate": 3.823179476979748e-06, + "loss": 0.9831, + "step": 2229 + }, + { + "epoch": 0.16088885682334692, + "grad_norm": 3.3155667485936524, + "learning_rate": 3.822987295050837e-06, + "loss": 1.0514, + "step": 2230 + }, + { + "epoch": 0.16096100429277443, + "grad_norm": 2.900237395994507, + "learning_rate": 3.822795013575338e-06, + "loss": 0.9627, + "step": 2231 + }, + { + "epoch": 0.16103315176220195, + "grad_norm": 3.0143993041189567, + "learning_rate": 3.822602632563751e-06, + "loss": 1.0485, + "step": 2232 + }, + { + "epoch": 0.16110529923162945, + "grad_norm": 0.8164573178905945, + "learning_rate": 3.822410152026581e-06, + "loss": 0.7719, + "step": 2233 + }, + { + "epoch": 0.16117744670105696, + "grad_norm": 5.291791518933677, + "learning_rate": 3.822217571974339e-06, + "loss": 0.9963, + "step": 2234 + }, + { + "epoch": 0.16124959417048448, + "grad_norm": 3.62718319099008, + "learning_rate": 3.822024892417542e-06, + "loss": 1.1114, + "step": 2235 + }, + { + "epoch": 0.16132174163991198, + "grad_norm": 15.712117974516268, + "learning_rate": 3.82183211336671e-06, + "loss": 0.9051, + "step": 2236 + }, + { + "epoch": 0.1613938891093395, + "grad_norm": 4.0711549557023545, + "learning_rate": 3.82163923483237e-06, + "loss": 0.9197, + "step": 2237 + }, + { + "epoch": 0.16146603657876699, + "grad_norm": 1.1279991756023613, + "learning_rate": 3.821446256825055e-06, + "loss": 0.7652, + "step": 2238 + }, + { + "epoch": 0.1615381840481945, + "grad_norm": 3.4273533275114785, + "learning_rate": 3.821253179355302e-06, + "loss": 0.9758, + "step": 2239 + }, + { + "epoch": 0.16161033151762202, + "grad_norm": 3.673884584519314, + "learning_rate": 3.8210600024336556e-06, + "loss": 0.9728, + "step": 2240 + }, + { + "epoch": 0.16168247898704952, + "grad_norm": 8.483239197973795, + "learning_rate": 3.8208667260706645e-06, + "loss": 0.9639, + "step": 2241 + }, + { + "epoch": 0.16175462645647704, + "grad_norm": 3.7913093119030723, + "learning_rate": 3.820673350276881e-06, + "loss": 1.023, + "step": 2242 + }, + { + "epoch": 0.16182677392590455, + "grad_norm": 4.775241442263331, + "learning_rate": 3.820479875062867e-06, + "loss": 1.0184, + "step": 2243 + }, + { + "epoch": 0.16189892139533205, + "grad_norm": 4.41333152997613, + "learning_rate": 3.820286300439184e-06, + "loss": 0.8571, + "step": 2244 + }, + { + "epoch": 0.16197106886475957, + "grad_norm": 18.925821159013648, + "learning_rate": 3.820092626416407e-06, + "loss": 0.9901, + "step": 2245 + }, + { + "epoch": 0.16204321633418708, + "grad_norm": 3.389399012403704, + "learning_rate": 3.819898853005108e-06, + "loss": 0.9525, + "step": 2246 + }, + { + "epoch": 0.16211536380361458, + "grad_norm": 4.19832044352367, + "learning_rate": 3.8197049802158705e-06, + "loss": 0.9273, + "step": 2247 + }, + { + "epoch": 0.1621875112730421, + "grad_norm": 4.294670954613221, + "learning_rate": 3.81951100805928e-06, + "loss": 1.0107, + "step": 2248 + }, + { + "epoch": 0.16225965874246961, + "grad_norm": 5.949340258013088, + "learning_rate": 3.819316936545929e-06, + "loss": 0.9729, + "step": 2249 + }, + { + "epoch": 0.1623318062118971, + "grad_norm": 3.88265702302839, + "learning_rate": 3.819122765686415e-06, + "loss": 0.911, + "step": 2250 + }, + { + "epoch": 0.16240395368132463, + "grad_norm": 1.9097024265380758, + "learning_rate": 3.818928495491341e-06, + "loss": 0.9031, + "step": 2251 + }, + { + "epoch": 0.16247610115075214, + "grad_norm": 2.752551109262159, + "learning_rate": 3.818734125971315e-06, + "loss": 0.9904, + "step": 2252 + }, + { + "epoch": 0.16254824862017964, + "grad_norm": 2.5611381633687813, + "learning_rate": 3.818539657136951e-06, + "loss": 0.934, + "step": 2253 + }, + { + "epoch": 0.16262039608960716, + "grad_norm": 2.8210427984529276, + "learning_rate": 3.818345088998868e-06, + "loss": 0.9202, + "step": 2254 + }, + { + "epoch": 0.16269254355903467, + "grad_norm": 1.084332714300705, + "learning_rate": 3.818150421567692e-06, + "loss": 0.8663, + "step": 2255 + }, + { + "epoch": 0.16276469102846217, + "grad_norm": 3.131025989308862, + "learning_rate": 3.8179556548540514e-06, + "loss": 0.9072, + "step": 2256 + }, + { + "epoch": 0.16283683849788969, + "grad_norm": 4.6867237719931625, + "learning_rate": 3.817760788868583e-06, + "loss": 1.0367, + "step": 2257 + }, + { + "epoch": 0.1629089859673172, + "grad_norm": 2.381667365395951, + "learning_rate": 3.817565823621926e-06, + "loss": 1.0222, + "step": 2258 + }, + { + "epoch": 0.1629811334367447, + "grad_norm": 4.7434166913047635, + "learning_rate": 3.8173707591247286e-06, + "loss": 0.8421, + "step": 2259 + }, + { + "epoch": 0.16305328090617222, + "grad_norm": 4.169648108328812, + "learning_rate": 3.817175595387641e-06, + "loss": 0.9779, + "step": 2260 + }, + { + "epoch": 0.16312542837559973, + "grad_norm": 4.2001720393231645, + "learning_rate": 3.816980332421322e-06, + "loss": 1.0691, + "step": 2261 + }, + { + "epoch": 0.16319757584502723, + "grad_norm": 2.739949674492216, + "learning_rate": 3.816784970236432e-06, + "loss": 0.9487, + "step": 2262 + }, + { + "epoch": 0.16326972331445475, + "grad_norm": 2.287285332951693, + "learning_rate": 3.816589508843642e-06, + "loss": 1.0243, + "step": 2263 + }, + { + "epoch": 0.16334187078388226, + "grad_norm": 13.65893863425185, + "learning_rate": 3.816393948253623e-06, + "loss": 1.0553, + "step": 2264 + }, + { + "epoch": 0.16341401825330976, + "grad_norm": 3.374835257571408, + "learning_rate": 3.816198288477054e-06, + "loss": 1.0021, + "step": 2265 + }, + { + "epoch": 0.16348616572273728, + "grad_norm": 16.2435853474966, + "learning_rate": 3.81600252952462e-06, + "loss": 1.0498, + "step": 2266 + }, + { + "epoch": 0.1635583131921648, + "grad_norm": 2.680581791628016, + "learning_rate": 3.815806671407011e-06, + "loss": 0.9973, + "step": 2267 + }, + { + "epoch": 0.1636304606615923, + "grad_norm": 3.1278452318445296, + "learning_rate": 3.815610714134921e-06, + "loss": 0.9327, + "step": 2268 + }, + { + "epoch": 0.1637026081310198, + "grad_norm": 2.5098078030259376, + "learning_rate": 3.815414657719051e-06, + "loss": 0.9623, + "step": 2269 + }, + { + "epoch": 0.16377475560044732, + "grad_norm": 5.024552335773483, + "learning_rate": 3.815218502170108e-06, + "loss": 0.9111, + "step": 2270 + }, + { + "epoch": 0.16384690306987482, + "grad_norm": 6.134286496830362, + "learning_rate": 3.8150222474988006e-06, + "loss": 0.9479, + "step": 2271 + }, + { + "epoch": 0.16391905053930234, + "grad_norm": 5.244202637261377, + "learning_rate": 3.814825893715849e-06, + "loss": 0.9951, + "step": 2272 + }, + { + "epoch": 0.16399119800872985, + "grad_norm": 7.102448495250462, + "learning_rate": 3.8146294408319726e-06, + "loss": 0.9738, + "step": 2273 + }, + { + "epoch": 0.16406334547815735, + "grad_norm": 2.9250440088857563, + "learning_rate": 3.8144328888579e-06, + "loss": 1.0326, + "step": 2274 + }, + { + "epoch": 0.16413549294758487, + "grad_norm": 2.3554418255483367, + "learning_rate": 3.814236237804364e-06, + "loss": 0.9038, + "step": 2275 + }, + { + "epoch": 0.16420764041701238, + "grad_norm": 7.267648722698626, + "learning_rate": 3.814039487682104e-06, + "loss": 1.0119, + "step": 2276 + }, + { + "epoch": 0.16427978788643988, + "grad_norm": 4.156262390577161, + "learning_rate": 3.813842638501862e-06, + "loss": 0.9999, + "step": 2277 + }, + { + "epoch": 0.1643519353558674, + "grad_norm": 2.0237752860776856, + "learning_rate": 3.813645690274388e-06, + "loss": 0.9026, + "step": 2278 + }, + { + "epoch": 0.16442408282529491, + "grad_norm": 2.3997101529255986, + "learning_rate": 3.8134486430104373e-06, + "loss": 0.8692, + "step": 2279 + }, + { + "epoch": 0.1644962302947224, + "grad_norm": 2.4646796944215184, + "learning_rate": 3.8132514967207693e-06, + "loss": 0.983, + "step": 2280 + }, + { + "epoch": 0.16456837776414993, + "grad_norm": 2.08810643507267, + "learning_rate": 3.813054251416149e-06, + "loss": 0.9023, + "step": 2281 + }, + { + "epoch": 0.16464052523357744, + "grad_norm": 2.7197854711977523, + "learning_rate": 3.8128569071073487e-06, + "loss": 0.8794, + "step": 2282 + }, + { + "epoch": 0.16471267270300494, + "grad_norm": 3.000598370958911, + "learning_rate": 3.812659463805143e-06, + "loss": 0.9024, + "step": 2283 + }, + { + "epoch": 0.16478482017243246, + "grad_norm": 2.676697689635122, + "learning_rate": 3.8124619215203143e-06, + "loss": 0.9308, + "step": 2284 + }, + { + "epoch": 0.16485696764185997, + "grad_norm": 0.8022511087958548, + "learning_rate": 3.81226428026365e-06, + "loss": 0.8367, + "step": 2285 + }, + { + "epoch": 0.16492911511128747, + "grad_norm": 2.881737485345992, + "learning_rate": 3.812066540045942e-06, + "loss": 0.9806, + "step": 2286 + }, + { + "epoch": 0.16500126258071499, + "grad_norm": 2.4504246518876798, + "learning_rate": 3.8118687008779876e-06, + "loss": 1.0184, + "step": 2287 + }, + { + "epoch": 0.1650734100501425, + "grad_norm": 3.623641055345554, + "learning_rate": 3.8116707627705907e-06, + "loss": 0.9802, + "step": 2288 + }, + { + "epoch": 0.16514555751957, + "grad_norm": 2.1364002722692326, + "learning_rate": 3.811472725734561e-06, + "loss": 0.9375, + "step": 2289 + }, + { + "epoch": 0.16521770498899752, + "grad_norm": 0.7995154075402338, + "learning_rate": 3.8112745897807114e-06, + "loss": 0.7698, + "step": 2290 + }, + { + "epoch": 0.165289852458425, + "grad_norm": 4.045409183812874, + "learning_rate": 3.811076354919861e-06, + "loss": 0.9746, + "step": 2291 + }, + { + "epoch": 0.16536199992785253, + "grad_norm": 3.1413379068821685, + "learning_rate": 3.8108780211628353e-06, + "loss": 0.8973, + "step": 2292 + }, + { + "epoch": 0.16543414739728005, + "grad_norm": 2.3797347406618843, + "learning_rate": 3.8106795885204648e-06, + "loss": 1.0165, + "step": 2293 + }, + { + "epoch": 0.16550629486670754, + "grad_norm": 2.232597123924316, + "learning_rate": 3.810481057003585e-06, + "loss": 0.9836, + "step": 2294 + }, + { + "epoch": 0.16557844233613506, + "grad_norm": 4.116460804804913, + "learning_rate": 3.810282426623036e-06, + "loss": 0.9565, + "step": 2295 + }, + { + "epoch": 0.16565058980556258, + "grad_norm": 3.6185874783878194, + "learning_rate": 3.8100836973896663e-06, + "loss": 1.003, + "step": 2296 + }, + { + "epoch": 0.16572273727499007, + "grad_norm": 2.701513526697658, + "learning_rate": 3.809884869314326e-06, + "loss": 0.9743, + "step": 2297 + }, + { + "epoch": 0.1657948847444176, + "grad_norm": 2.322183798462463, + "learning_rate": 3.809685942407873e-06, + "loss": 0.9915, + "step": 2298 + }, + { + "epoch": 0.1658670322138451, + "grad_norm": 2.5243382219325756, + "learning_rate": 3.8094869166811696e-06, + "loss": 1.0445, + "step": 2299 + }, + { + "epoch": 0.1659391796832726, + "grad_norm": 2.3007304731636955, + "learning_rate": 3.8092877921450847e-06, + "loss": 0.9944, + "step": 2300 + }, + { + "epoch": 0.16601132715270012, + "grad_norm": 2.6083871947588575, + "learning_rate": 3.809088568810491e-06, + "loss": 0.866, + "step": 2301 + }, + { + "epoch": 0.16608347462212764, + "grad_norm": 3.4133643652081522, + "learning_rate": 3.8088892466882674e-06, + "loss": 1.073, + "step": 2302 + }, + { + "epoch": 0.16615562209155513, + "grad_norm": 2.448186386075034, + "learning_rate": 3.8086898257892982e-06, + "loss": 0.8587, + "step": 2303 + }, + { + "epoch": 0.16622776956098265, + "grad_norm": 2.1790592286252086, + "learning_rate": 3.808490306124474e-06, + "loss": 0.9895, + "step": 2304 + }, + { + "epoch": 0.16629991703041017, + "grad_norm": 3.255689189744581, + "learning_rate": 3.808290687704688e-06, + "loss": 1.0426, + "step": 2305 + }, + { + "epoch": 0.16637206449983766, + "grad_norm": 2.913319183536389, + "learning_rate": 3.808090970540842e-06, + "loss": 0.977, + "step": 2306 + }, + { + "epoch": 0.16644421196926518, + "grad_norm": 2.2775587762310354, + "learning_rate": 3.8078911546438415e-06, + "loss": 0.9275, + "step": 2307 + }, + { + "epoch": 0.1665163594386927, + "grad_norm": 5.748297065995767, + "learning_rate": 3.807691240024598e-06, + "loss": 1.1303, + "step": 2308 + }, + { + "epoch": 0.1665885069081202, + "grad_norm": 1.9627879860836241, + "learning_rate": 3.807491226694027e-06, + "loss": 1.0434, + "step": 2309 + }, + { + "epoch": 0.1666606543775477, + "grad_norm": 3.4525125227914435, + "learning_rate": 3.8072911146630516e-06, + "loss": 0.961, + "step": 2310 + }, + { + "epoch": 0.16673280184697523, + "grad_norm": 2.4244079142315598, + "learning_rate": 3.807090903942599e-06, + "loss": 0.8832, + "step": 2311 + }, + { + "epoch": 0.16680494931640272, + "grad_norm": 1.8286833603268844, + "learning_rate": 3.806890594543601e-06, + "loss": 0.9701, + "step": 2312 + }, + { + "epoch": 0.16687709678583024, + "grad_norm": 3.154493390743765, + "learning_rate": 3.8066901864769974e-06, + "loss": 0.9252, + "step": 2313 + }, + { + "epoch": 0.16694924425525776, + "grad_norm": 2.954873510967993, + "learning_rate": 3.8064896797537304e-06, + "loss": 0.9231, + "step": 2314 + }, + { + "epoch": 0.16702139172468525, + "grad_norm": 3.2200043835817436, + "learning_rate": 3.8062890743847493e-06, + "loss": 0.9041, + "step": 2315 + }, + { + "epoch": 0.16709353919411277, + "grad_norm": 3.309199866540833, + "learning_rate": 3.8060883703810088e-06, + "loss": 0.8544, + "step": 2316 + }, + { + "epoch": 0.16716568666354029, + "grad_norm": 2.379278946168444, + "learning_rate": 3.805887567753468e-06, + "loss": 0.8395, + "step": 2317 + }, + { + "epoch": 0.16723783413296778, + "grad_norm": 2.310117680082835, + "learning_rate": 3.8056866665130923e-06, + "loss": 0.9751, + "step": 2318 + }, + { + "epoch": 0.1673099816023953, + "grad_norm": 2.3692452831376327, + "learning_rate": 3.8054856666708528e-06, + "loss": 0.9934, + "step": 2319 + }, + { + "epoch": 0.16738212907182282, + "grad_norm": 2.6041374408353524, + "learning_rate": 3.8052845682377238e-06, + "loss": 0.995, + "step": 2320 + }, + { + "epoch": 0.1674542765412503, + "grad_norm": 2.210892639783406, + "learning_rate": 3.805083371224688e-06, + "loss": 0.911, + "step": 2321 + }, + { + "epoch": 0.16752642401067783, + "grad_norm": 1.872750967954101, + "learning_rate": 3.8048820756427312e-06, + "loss": 0.9291, + "step": 2322 + }, + { + "epoch": 0.16759857148010535, + "grad_norm": 3.243904118928347, + "learning_rate": 3.8046806815028456e-06, + "loss": 0.9053, + "step": 2323 + }, + { + "epoch": 0.16767071894953284, + "grad_norm": 10.472969138719277, + "learning_rate": 3.804479188816029e-06, + "loss": 1.0514, + "step": 2324 + }, + { + "epoch": 0.16774286641896036, + "grad_norm": 2.4165975845811034, + "learning_rate": 3.804277597593284e-06, + "loss": 0.9411, + "step": 2325 + }, + { + "epoch": 0.16781501388838788, + "grad_norm": 4.071380533685502, + "learning_rate": 3.804075907845618e-06, + "loss": 0.9479, + "step": 2326 + }, + { + "epoch": 0.16788716135781537, + "grad_norm": 2.089963649269368, + "learning_rate": 3.803874119584046e-06, + "loss": 1.0339, + "step": 2327 + }, + { + "epoch": 0.1679593088272429, + "grad_norm": 4.966187014023128, + "learning_rate": 3.8036722328195854e-06, + "loss": 1.0109, + "step": 2328 + }, + { + "epoch": 0.1680314562966704, + "grad_norm": 1.7748189242776287, + "learning_rate": 3.803470247563261e-06, + "loss": 1.0439, + "step": 2329 + }, + { + "epoch": 0.1681036037660979, + "grad_norm": 1.916741127834482, + "learning_rate": 3.803268163826103e-06, + "loss": 0.9439, + "step": 2330 + }, + { + "epoch": 0.16817575123552542, + "grad_norm": 2.831519855497686, + "learning_rate": 3.8030659816191457e-06, + "loss": 1.0358, + "step": 2331 + }, + { + "epoch": 0.16824789870495294, + "grad_norm": 2.2241535205340597, + "learning_rate": 3.8028637009534305e-06, + "loss": 0.9041, + "step": 2332 + }, + { + "epoch": 0.16832004617438043, + "grad_norm": 2.741491332626765, + "learning_rate": 3.802661321840002e-06, + "loss": 0.9347, + "step": 2333 + }, + { + "epoch": 0.16839219364380795, + "grad_norm": 1.9003764482304588, + "learning_rate": 3.802458844289912e-06, + "loss": 0.8991, + "step": 2334 + }, + { + "epoch": 0.16846434111323547, + "grad_norm": 2.6883256220682337, + "learning_rate": 3.8022562683142176e-06, + "loss": 0.9278, + "step": 2335 + }, + { + "epoch": 0.16853648858266296, + "grad_norm": 1.9468705399218764, + "learning_rate": 3.8020535939239796e-06, + "loss": 1.0025, + "step": 2336 + }, + { + "epoch": 0.16860863605209048, + "grad_norm": 1.872952551351218, + "learning_rate": 3.801850821130266e-06, + "loss": 0.9788, + "step": 2337 + }, + { + "epoch": 0.168680783521518, + "grad_norm": 2.0546714869118174, + "learning_rate": 3.801647949944149e-06, + "loss": 1.047, + "step": 2338 + }, + { + "epoch": 0.1687529309909455, + "grad_norm": 1.946605696812039, + "learning_rate": 3.8014449803767067e-06, + "loss": 1.0058, + "step": 2339 + }, + { + "epoch": 0.168825078460373, + "grad_norm": 3.5086603648033523, + "learning_rate": 3.801241912439023e-06, + "loss": 0.9391, + "step": 2340 + }, + { + "epoch": 0.1688972259298005, + "grad_norm": 1.7408093531180255, + "learning_rate": 3.8010387461421866e-06, + "loss": 0.9091, + "step": 2341 + }, + { + "epoch": 0.16896937339922802, + "grad_norm": 3.8303258680552448, + "learning_rate": 3.8008354814972913e-06, + "loss": 1.0151, + "step": 2342 + }, + { + "epoch": 0.16904152086865554, + "grad_norm": 3.051007720317344, + "learning_rate": 3.8006321185154373e-06, + "loss": 0.9686, + "step": 2343 + }, + { + "epoch": 0.16911366833808303, + "grad_norm": 2.1263299034215772, + "learning_rate": 3.800428657207728e-06, + "loss": 0.9559, + "step": 2344 + }, + { + "epoch": 0.16918581580751055, + "grad_norm": 5.893354274373083, + "learning_rate": 3.800225097585276e-06, + "loss": 1.0534, + "step": 2345 + }, + { + "epoch": 0.16925796327693807, + "grad_norm": 2.5556273058710404, + "learning_rate": 3.8000214396591945e-06, + "loss": 1.1006, + "step": 2346 + }, + { + "epoch": 0.16933011074636556, + "grad_norm": 2.7075669500327986, + "learning_rate": 3.799817683440606e-06, + "loss": 0.9324, + "step": 2347 + }, + { + "epoch": 0.16940225821579308, + "grad_norm": 0.7243613669533597, + "learning_rate": 3.7996138289406366e-06, + "loss": 0.7765, + "step": 2348 + }, + { + "epoch": 0.1694744056852206, + "grad_norm": 4.366094189650561, + "learning_rate": 3.799409876170418e-06, + "loss": 1.0164, + "step": 2349 + }, + { + "epoch": 0.1695465531546481, + "grad_norm": 2.641125693620884, + "learning_rate": 3.7992058251410876e-06, + "loss": 0.992, + "step": 2350 + }, + { + "epoch": 0.1696187006240756, + "grad_norm": 3.412578757917786, + "learning_rate": 3.7990016758637866e-06, + "loss": 1.0112, + "step": 2351 + }, + { + "epoch": 0.16969084809350313, + "grad_norm": 2.2569413666011333, + "learning_rate": 3.7987974283496645e-06, + "loss": 1.0302, + "step": 2352 + }, + { + "epoch": 0.16976299556293062, + "grad_norm": 0.756117626301169, + "learning_rate": 3.7985930826098735e-06, + "loss": 0.7857, + "step": 2353 + }, + { + "epoch": 0.16983514303235814, + "grad_norm": 2.3057000473684903, + "learning_rate": 3.798388638655572e-06, + "loss": 0.9459, + "step": 2354 + }, + { + "epoch": 0.16990729050178566, + "grad_norm": 2.707767270916174, + "learning_rate": 3.798184096497925e-06, + "loss": 0.7559, + "step": 2355 + }, + { + "epoch": 0.16997943797121315, + "grad_norm": 3.1105818203624596, + "learning_rate": 3.797979456148101e-06, + "loss": 0.982, + "step": 2356 + }, + { + "epoch": 0.17005158544064067, + "grad_norm": 2.688297064816984, + "learning_rate": 3.797774717617275e-06, + "loss": 1.028, + "step": 2357 + }, + { + "epoch": 0.1701237329100682, + "grad_norm": 2.3194935184954772, + "learning_rate": 3.797569880916626e-06, + "loss": 0.9819, + "step": 2358 + }, + { + "epoch": 0.17019588037949568, + "grad_norm": 2.6197905846979674, + "learning_rate": 3.7973649460573407e-06, + "loss": 1.0489, + "step": 2359 + }, + { + "epoch": 0.1702680278489232, + "grad_norm": 3.2134173022441104, + "learning_rate": 3.79715991305061e-06, + "loss": 1.0019, + "step": 2360 + }, + { + "epoch": 0.17034017531835072, + "grad_norm": 3.2147056577742057, + "learning_rate": 3.796954781907628e-06, + "loss": 0.8663, + "step": 2361 + }, + { + "epoch": 0.1704123227877782, + "grad_norm": 3.0163924273470846, + "learning_rate": 3.796749552639598e-06, + "loss": 1.0356, + "step": 2362 + }, + { + "epoch": 0.17048447025720573, + "grad_norm": 3.8434388569761913, + "learning_rate": 3.7965442252577265e-06, + "loss": 0.989, + "step": 2363 + }, + { + "epoch": 0.17055661772663325, + "grad_norm": 2.1756458069097606, + "learning_rate": 3.7963387997732253e-06, + "loss": 0.9514, + "step": 2364 + }, + { + "epoch": 0.17062876519606074, + "grad_norm": 2.824314564283894, + "learning_rate": 3.796133276197312e-06, + "loss": 0.9957, + "step": 2365 + }, + { + "epoch": 0.17070091266548826, + "grad_norm": 2.3890282777539404, + "learning_rate": 3.7959276545412093e-06, + "loss": 1.0431, + "step": 2366 + }, + { + "epoch": 0.17077306013491578, + "grad_norm": 2.708585482866133, + "learning_rate": 3.7957219348161464e-06, + "loss": 1.0101, + "step": 2367 + }, + { + "epoch": 0.17084520760434327, + "grad_norm": 2.7766101152569505, + "learning_rate": 3.7955161170333556e-06, + "loss": 0.9647, + "step": 2368 + }, + { + "epoch": 0.1709173550737708, + "grad_norm": 2.3121252013958786, + "learning_rate": 3.7953102012040764e-06, + "loss": 1.0092, + "step": 2369 + }, + { + "epoch": 0.1709895025431983, + "grad_norm": 2.892443110929475, + "learning_rate": 3.795104187339554e-06, + "loss": 0.8889, + "step": 2370 + }, + { + "epoch": 0.1710616500126258, + "grad_norm": 2.697665218776948, + "learning_rate": 3.7948980754510363e-06, + "loss": 0.9817, + "step": 2371 + }, + { + "epoch": 0.17113379748205332, + "grad_norm": 2.7605362104525946, + "learning_rate": 3.794691865549779e-06, + "loss": 1.0107, + "step": 2372 + }, + { + "epoch": 0.17120594495148084, + "grad_norm": 3.6957781701915176, + "learning_rate": 3.794485557647043e-06, + "loss": 1.0942, + "step": 2373 + }, + { + "epoch": 0.17127809242090833, + "grad_norm": 1.9583693629528418, + "learning_rate": 3.794279151754094e-06, + "loss": 0.9566, + "step": 2374 + }, + { + "epoch": 0.17135023989033585, + "grad_norm": 0.8562375450620661, + "learning_rate": 3.794072647882202e-06, + "loss": 0.8374, + "step": 2375 + }, + { + "epoch": 0.17142238735976337, + "grad_norm": 2.752431661396733, + "learning_rate": 3.793866046042645e-06, + "loss": 1.08, + "step": 2376 + }, + { + "epoch": 0.17149453482919086, + "grad_norm": 2.409934950302842, + "learning_rate": 3.793659346246703e-06, + "loss": 0.9564, + "step": 2377 + }, + { + "epoch": 0.17156668229861838, + "grad_norm": 0.8887877001056381, + "learning_rate": 3.7934525485056643e-06, + "loss": 0.8119, + "step": 2378 + }, + { + "epoch": 0.1716388297680459, + "grad_norm": 2.229926315595616, + "learning_rate": 3.793245652830821e-06, + "loss": 0.9864, + "step": 2379 + }, + { + "epoch": 0.1717109772374734, + "grad_norm": 2.698334434498057, + "learning_rate": 3.793038659233471e-06, + "loss": 0.9815, + "step": 2380 + }, + { + "epoch": 0.1717831247069009, + "grad_norm": 2.541361826975433, + "learning_rate": 3.792831567724918e-06, + "loss": 1.02, + "step": 2381 + }, + { + "epoch": 0.17185527217632843, + "grad_norm": 0.8226993531385581, + "learning_rate": 3.7926243783164693e-06, + "loss": 0.7748, + "step": 2382 + }, + { + "epoch": 0.17192741964575592, + "grad_norm": 2.0002388811502088, + "learning_rate": 3.792417091019439e-06, + "loss": 0.9083, + "step": 2383 + }, + { + "epoch": 0.17199956711518344, + "grad_norm": 5.635460346308839, + "learning_rate": 3.7922097058451477e-06, + "loss": 0.9441, + "step": 2384 + }, + { + "epoch": 0.17207171458461096, + "grad_norm": 2.178740483677403, + "learning_rate": 3.7920022228049186e-06, + "loss": 1.0178, + "step": 2385 + }, + { + "epoch": 0.17214386205403845, + "grad_norm": 2.526362940751891, + "learning_rate": 3.7917946419100816e-06, + "loss": 0.9031, + "step": 2386 + }, + { + "epoch": 0.17221600952346597, + "grad_norm": 2.0437340552031755, + "learning_rate": 3.791586963171972e-06, + "loss": 0.9732, + "step": 2387 + }, + { + "epoch": 0.1722881569928935, + "grad_norm": 2.2431358963390027, + "learning_rate": 3.791379186601931e-06, + "loss": 0.9921, + "step": 2388 + }, + { + "epoch": 0.17236030446232098, + "grad_norm": 2.0851492660479822, + "learning_rate": 3.791171312211304e-06, + "loss": 0.9652, + "step": 2389 + }, + { + "epoch": 0.1724324519317485, + "grad_norm": 3.080758609238711, + "learning_rate": 3.790963340011442e-06, + "loss": 0.9743, + "step": 2390 + }, + { + "epoch": 0.17250459940117602, + "grad_norm": 18.32169504378194, + "learning_rate": 3.7907552700137024e-06, + "loss": 0.8325, + "step": 2391 + }, + { + "epoch": 0.1725767468706035, + "grad_norm": 0.7700362896108582, + "learning_rate": 3.7905471022294463e-06, + "loss": 0.8453, + "step": 2392 + }, + { + "epoch": 0.17264889434003103, + "grad_norm": 3.784574324180539, + "learning_rate": 3.790338836670042e-06, + "loss": 1.0222, + "step": 2393 + }, + { + "epoch": 0.17272104180945852, + "grad_norm": 1.8623701063087097, + "learning_rate": 3.7901304733468607e-06, + "loss": 0.8758, + "step": 2394 + }, + { + "epoch": 0.17279318927888604, + "grad_norm": 1.8741602606668843, + "learning_rate": 3.789922012271281e-06, + "loss": 0.9027, + "step": 2395 + }, + { + "epoch": 0.17286533674831356, + "grad_norm": 2.8605974429676206, + "learning_rate": 3.7897134534546863e-06, + "loss": 0.9921, + "step": 2396 + }, + { + "epoch": 0.17293748421774105, + "grad_norm": 1.62844571343025, + "learning_rate": 3.789504796908465e-06, + "loss": 0.9613, + "step": 2397 + }, + { + "epoch": 0.17300963168716857, + "grad_norm": 2.0290965476117764, + "learning_rate": 3.789296042644012e-06, + "loss": 0.9471, + "step": 2398 + }, + { + "epoch": 0.1730817791565961, + "grad_norm": 2.2550516639424383, + "learning_rate": 3.7890871906727256e-06, + "loss": 0.8853, + "step": 2399 + }, + { + "epoch": 0.17315392662602358, + "grad_norm": 1.9790692623329493, + "learning_rate": 3.788878241006011e-06, + "loss": 0.9138, + "step": 2400 + }, + { + "epoch": 0.1732260740954511, + "grad_norm": 2.2704486929467196, + "learning_rate": 3.788669193655277e-06, + "loss": 1.0501, + "step": 2401 + }, + { + "epoch": 0.17329822156487862, + "grad_norm": 2.8338492615642186, + "learning_rate": 3.7884600486319405e-06, + "loss": 0.9224, + "step": 2402 + }, + { + "epoch": 0.1733703690343061, + "grad_norm": 2.7318675724400716, + "learning_rate": 3.788250805947421e-06, + "loss": 0.8765, + "step": 2403 + }, + { + "epoch": 0.17344251650373363, + "grad_norm": 0.8155968774500029, + "learning_rate": 3.788041465613145e-06, + "loss": 0.8473, + "step": 2404 + }, + { + "epoch": 0.17351466397316115, + "grad_norm": 4.054441940721755, + "learning_rate": 3.787832027640544e-06, + "loss": 1.0605, + "step": 2405 + }, + { + "epoch": 0.17358681144258864, + "grad_norm": 2.9981294204332607, + "learning_rate": 3.787622492041054e-06, + "loss": 0.9992, + "step": 2406 + }, + { + "epoch": 0.17365895891201616, + "grad_norm": 2.268593047626869, + "learning_rate": 3.7874128588261174e-06, + "loss": 0.8754, + "step": 2407 + }, + { + "epoch": 0.17373110638144368, + "grad_norm": 2.176759236379357, + "learning_rate": 3.787203128007181e-06, + "loss": 1.0126, + "step": 2408 + }, + { + "epoch": 0.17380325385087117, + "grad_norm": 2.7929632867079435, + "learning_rate": 3.7869932995956982e-06, + "loss": 0.8121, + "step": 2409 + }, + { + "epoch": 0.1738754013202987, + "grad_norm": 2.6111888862642156, + "learning_rate": 3.786783373603126e-06, + "loss": 1.0296, + "step": 2410 + }, + { + "epoch": 0.1739475487897262, + "grad_norm": 3.954047174661456, + "learning_rate": 3.7865733500409286e-06, + "loss": 0.9531, + "step": 2411 + }, + { + "epoch": 0.1740196962591537, + "grad_norm": 1.9671192378531626, + "learning_rate": 3.7863632289205743e-06, + "loss": 0.8674, + "step": 2412 + }, + { + "epoch": 0.17409184372858122, + "grad_norm": 3.369659436956402, + "learning_rate": 3.786153010253537e-06, + "loss": 0.9914, + "step": 2413 + }, + { + "epoch": 0.17416399119800874, + "grad_norm": 2.3097548816314286, + "learning_rate": 3.7859426940512956e-06, + "loss": 0.8907, + "step": 2414 + }, + { + "epoch": 0.17423613866743623, + "grad_norm": 2.147488804211814, + "learning_rate": 3.785732280325335e-06, + "loss": 0.9137, + "step": 2415 + }, + { + "epoch": 0.17430828613686375, + "grad_norm": 2.394474993444738, + "learning_rate": 3.785521769087145e-06, + "loss": 0.9634, + "step": 2416 + }, + { + "epoch": 0.17438043360629127, + "grad_norm": 3.703031514593319, + "learning_rate": 3.785311160348222e-06, + "loss": 0.9946, + "step": 2417 + }, + { + "epoch": 0.17445258107571876, + "grad_norm": 2.021706094422867, + "learning_rate": 3.7851004541200645e-06, + "loss": 1.0209, + "step": 2418 + }, + { + "epoch": 0.17452472854514628, + "grad_norm": 2.61688995947433, + "learning_rate": 3.78488965041418e-06, + "loss": 0.972, + "step": 2419 + }, + { + "epoch": 0.1745968760145738, + "grad_norm": 1.9367354945866566, + "learning_rate": 3.7846787492420785e-06, + "loss": 0.9366, + "step": 2420 + }, + { + "epoch": 0.1746690234840013, + "grad_norm": 2.7167982791153813, + "learning_rate": 3.7844677506152776e-06, + "loss": 1.0173, + "step": 2421 + }, + { + "epoch": 0.1747411709534288, + "grad_norm": 2.682020678611876, + "learning_rate": 3.7842566545452982e-06, + "loss": 0.9639, + "step": 2422 + }, + { + "epoch": 0.17481331842285633, + "grad_norm": 3.099639680902963, + "learning_rate": 3.7840454610436685e-06, + "loss": 0.9972, + "step": 2423 + }, + { + "epoch": 0.17488546589228382, + "grad_norm": 1.8861578508421444, + "learning_rate": 3.7838341701219204e-06, + "loss": 0.9185, + "step": 2424 + }, + { + "epoch": 0.17495761336171134, + "grad_norm": 3.1165027296500183, + "learning_rate": 3.7836227817915917e-06, + "loss": 0.8482, + "step": 2425 + }, + { + "epoch": 0.17502976083113886, + "grad_norm": 2.7348189511253884, + "learning_rate": 3.7834112960642254e-06, + "loss": 0.9812, + "step": 2426 + }, + { + "epoch": 0.17510190830056635, + "grad_norm": 2.84373641796851, + "learning_rate": 3.7831997129513707e-06, + "loss": 0.9167, + "step": 2427 + }, + { + "epoch": 0.17517405576999387, + "grad_norm": 2.2084002814551313, + "learning_rate": 3.7829880324645804e-06, + "loss": 0.9496, + "step": 2428 + }, + { + "epoch": 0.1752462032394214, + "grad_norm": 1.875994672792106, + "learning_rate": 3.7827762546154144e-06, + "loss": 1.0046, + "step": 2429 + }, + { + "epoch": 0.17531835070884888, + "grad_norm": 2.3307030248330265, + "learning_rate": 3.7825643794154367e-06, + "loss": 0.9985, + "step": 2430 + }, + { + "epoch": 0.1753904981782764, + "grad_norm": 0.8333438673943068, + "learning_rate": 3.7823524068762174e-06, + "loss": 0.8734, + "step": 2431 + }, + { + "epoch": 0.17546264564770392, + "grad_norm": 2.1163604359615977, + "learning_rate": 3.7821403370093303e-06, + "loss": 0.9423, + "step": 2432 + }, + { + "epoch": 0.1755347931171314, + "grad_norm": 2.3256611940088883, + "learning_rate": 3.781928169826357e-06, + "loss": 0.9633, + "step": 2433 + }, + { + "epoch": 0.17560694058655893, + "grad_norm": 2.4679737077731443, + "learning_rate": 3.7817159053388828e-06, + "loss": 0.8569, + "step": 2434 + }, + { + "epoch": 0.17567908805598645, + "grad_norm": 2.4485643111779036, + "learning_rate": 3.781503543558499e-06, + "loss": 0.9454, + "step": 2435 + }, + { + "epoch": 0.17575123552541394, + "grad_norm": 2.199381143052416, + "learning_rate": 3.7812910844968017e-06, + "loss": 0.9801, + "step": 2436 + }, + { + "epoch": 0.17582338299484146, + "grad_norm": 1.9286930749766196, + "learning_rate": 3.7810785281653922e-06, + "loss": 0.9097, + "step": 2437 + }, + { + "epoch": 0.17589553046426898, + "grad_norm": 2.977957491251705, + "learning_rate": 3.7808658745758775e-06, + "loss": 0.97, + "step": 2438 + }, + { + "epoch": 0.17596767793369647, + "grad_norm": 2.0193827295937594, + "learning_rate": 3.7806531237398697e-06, + "loss": 1.006, + "step": 2439 + }, + { + "epoch": 0.176039825403124, + "grad_norm": 2.2850870757759263, + "learning_rate": 3.780440275668987e-06, + "loss": 0.901, + "step": 2440 + }, + { + "epoch": 0.1761119728725515, + "grad_norm": 2.1453396744753905, + "learning_rate": 3.780227330374852e-06, + "loss": 0.9948, + "step": 2441 + }, + { + "epoch": 0.176184120341979, + "grad_norm": 2.1623945177404615, + "learning_rate": 3.780014287869092e-06, + "loss": 0.8765, + "step": 2442 + }, + { + "epoch": 0.17625626781140652, + "grad_norm": 2.9458721246462933, + "learning_rate": 3.7798011481633416e-06, + "loss": 0.9635, + "step": 2443 + }, + { + "epoch": 0.176328415280834, + "grad_norm": 1.8962307258486415, + "learning_rate": 3.779587911269239e-06, + "loss": 0.9915, + "step": 2444 + }, + { + "epoch": 0.17640056275026153, + "grad_norm": 2.4069514057371575, + "learning_rate": 3.7793745771984282e-06, + "loss": 0.8151, + "step": 2445 + }, + { + "epoch": 0.17647271021968905, + "grad_norm": 2.0489961538879133, + "learning_rate": 3.7791611459625587e-06, + "loss": 0.9448, + "step": 2446 + }, + { + "epoch": 0.17654485768911654, + "grad_norm": 2.6809361165171293, + "learning_rate": 3.7789476175732852e-06, + "loss": 0.9027, + "step": 2447 + }, + { + "epoch": 0.17661700515854406, + "grad_norm": 2.6561315173483355, + "learning_rate": 3.7787339920422686e-06, + "loss": 0.9224, + "step": 2448 + }, + { + "epoch": 0.17668915262797158, + "grad_norm": 2.120525361229426, + "learning_rate": 3.7785202693811725e-06, + "loss": 0.9135, + "step": 2449 + }, + { + "epoch": 0.17676130009739907, + "grad_norm": 2.452183925625571, + "learning_rate": 3.778306449601668e-06, + "loss": 0.9724, + "step": 2450 + }, + { + "epoch": 0.1768334475668266, + "grad_norm": 4.81344654633444, + "learning_rate": 3.7780925327154323e-06, + "loss": 0.9698, + "step": 2451 + }, + { + "epoch": 0.1769055950362541, + "grad_norm": 2.3917298599915187, + "learning_rate": 3.7778785187341456e-06, + "loss": 0.8267, + "step": 2452 + }, + { + "epoch": 0.1769777425056816, + "grad_norm": 2.148979201949753, + "learning_rate": 3.7776644076694936e-06, + "loss": 0.8613, + "step": 2453 + }, + { + "epoch": 0.17704988997510912, + "grad_norm": 0.699267273822849, + "learning_rate": 3.7774501995331693e-06, + "loss": 0.8122, + "step": 2454 + }, + { + "epoch": 0.17712203744453664, + "grad_norm": 32.709899113265394, + "learning_rate": 3.77723589433687e-06, + "loss": 0.9527, + "step": 2455 + }, + { + "epoch": 0.17719418491396413, + "grad_norm": 2.1786832512738292, + "learning_rate": 3.7770214920922974e-06, + "loss": 0.881, + "step": 2456 + }, + { + "epoch": 0.17726633238339165, + "grad_norm": 2.1137051818174855, + "learning_rate": 3.776806992811159e-06, + "loss": 1.0904, + "step": 2457 + }, + { + "epoch": 0.17733847985281917, + "grad_norm": 2.577860593241942, + "learning_rate": 3.776592396505169e-06, + "loss": 0.9142, + "step": 2458 + }, + { + "epoch": 0.17741062732224666, + "grad_norm": 1.471311891561737, + "learning_rate": 3.7763777031860445e-06, + "loss": 0.9684, + "step": 2459 + }, + { + "epoch": 0.17748277479167418, + "grad_norm": 1.7433526628977472, + "learning_rate": 3.7761629128655094e-06, + "loss": 0.9273, + "step": 2460 + }, + { + "epoch": 0.1775549222611017, + "grad_norm": 4.290784924243265, + "learning_rate": 3.7759480255552934e-06, + "loss": 0.9994, + "step": 2461 + }, + { + "epoch": 0.1776270697305292, + "grad_norm": 0.7261452656139328, + "learning_rate": 3.775733041267129e-06, + "loss": 0.7741, + "step": 2462 + }, + { + "epoch": 0.1776992171999567, + "grad_norm": 2.2153754563223784, + "learning_rate": 3.7755179600127576e-06, + "loss": 0.935, + "step": 2463 + }, + { + "epoch": 0.17777136466938423, + "grad_norm": 2.6026225178633764, + "learning_rate": 3.7753027818039227e-06, + "loss": 1.0116, + "step": 2464 + }, + { + "epoch": 0.17784351213881172, + "grad_norm": 2.4507536877148426, + "learning_rate": 3.775087506652375e-06, + "loss": 0.9587, + "step": 2465 + }, + { + "epoch": 0.17791565960823924, + "grad_norm": 6.832889945679327, + "learning_rate": 3.7748721345698698e-06, + "loss": 0.9938, + "step": 2466 + }, + { + "epoch": 0.17798780707766676, + "grad_norm": 2.0061821756752476, + "learning_rate": 3.7746566655681675e-06, + "loss": 0.9715, + "step": 2467 + }, + { + "epoch": 0.17805995454709425, + "grad_norm": 1.7964747107424808, + "learning_rate": 3.7744410996590348e-06, + "loss": 1.0317, + "step": 2468 + }, + { + "epoch": 0.17813210201652177, + "grad_norm": 2.6566744296998723, + "learning_rate": 3.7742254368542418e-06, + "loss": 1.0802, + "step": 2469 + }, + { + "epoch": 0.1782042494859493, + "grad_norm": 2.1375045865550666, + "learning_rate": 3.7740096771655658e-06, + "loss": 0.9259, + "step": 2470 + }, + { + "epoch": 0.17827639695537678, + "grad_norm": 2.573446016002246, + "learning_rate": 3.773793820604789e-06, + "loss": 1.0519, + "step": 2471 + }, + { + "epoch": 0.1783485444248043, + "grad_norm": 2.7218342091496255, + "learning_rate": 3.773577867183697e-06, + "loss": 0.8705, + "step": 2472 + }, + { + "epoch": 0.17842069189423182, + "grad_norm": 3.887989300418778, + "learning_rate": 3.7733618169140842e-06, + "loss": 0.9507, + "step": 2473 + }, + { + "epoch": 0.1784928393636593, + "grad_norm": 2.156267912417107, + "learning_rate": 3.7731456698077466e-06, + "loss": 1.011, + "step": 2474 + }, + { + "epoch": 0.17856498683308683, + "grad_norm": 2.1072056670907067, + "learning_rate": 3.772929425876488e-06, + "loss": 0.9885, + "step": 2475 + }, + { + "epoch": 0.17863713430251435, + "grad_norm": 3.5938439398387927, + "learning_rate": 3.7727130851321166e-06, + "loss": 0.9835, + "step": 2476 + }, + { + "epoch": 0.17870928177194184, + "grad_norm": 2.003346385416091, + "learning_rate": 3.7724966475864468e-06, + "loss": 0.9684, + "step": 2477 + }, + { + "epoch": 0.17878142924136936, + "grad_norm": 2.0953037702704287, + "learning_rate": 3.772280113251296e-06, + "loss": 0.9701, + "step": 2478 + }, + { + "epoch": 0.17885357671079688, + "grad_norm": 2.159923354661052, + "learning_rate": 3.772063482138489e-06, + "loss": 0.8731, + "step": 2479 + }, + { + "epoch": 0.17892572418022437, + "grad_norm": 2.3600581108632923, + "learning_rate": 3.7718467542598553e-06, + "loss": 0.9729, + "step": 2480 + }, + { + "epoch": 0.1789978716496519, + "grad_norm": 2.1559109628583806, + "learning_rate": 3.7716299296272292e-06, + "loss": 0.9665, + "step": 2481 + }, + { + "epoch": 0.1790700191190794, + "grad_norm": 1.968795654736327, + "learning_rate": 3.7714130082524516e-06, + "loss": 0.9766, + "step": 2482 + }, + { + "epoch": 0.1791421665885069, + "grad_norm": 1.9778570225671368, + "learning_rate": 3.771195990147367e-06, + "loss": 0.9664, + "step": 2483 + }, + { + "epoch": 0.17921431405793442, + "grad_norm": 2.008634287760052, + "learning_rate": 3.7709788753238255e-06, + "loss": 0.9389, + "step": 2484 + }, + { + "epoch": 0.17928646152736194, + "grad_norm": 2.9502525286159793, + "learning_rate": 3.770761663793684e-06, + "loss": 0.8696, + "step": 2485 + }, + { + "epoch": 0.17935860899678943, + "grad_norm": 2.249044215329553, + "learning_rate": 3.770544355568803e-06, + "loss": 1.0936, + "step": 2486 + }, + { + "epoch": 0.17943075646621695, + "grad_norm": 2.328491182139785, + "learning_rate": 3.770326950661049e-06, + "loss": 0.9719, + "step": 2487 + }, + { + "epoch": 0.17950290393564447, + "grad_norm": 2.371567001750683, + "learning_rate": 3.7701094490822943e-06, + "loss": 0.8831, + "step": 2488 + }, + { + "epoch": 0.17957505140507196, + "grad_norm": 2.3189372601429934, + "learning_rate": 3.769891850844415e-06, + "loss": 0.9805, + "step": 2489 + }, + { + "epoch": 0.17964719887449948, + "grad_norm": 2.1379449035568086, + "learning_rate": 3.7696741559592937e-06, + "loss": 0.9831, + "step": 2490 + }, + { + "epoch": 0.179719346343927, + "grad_norm": 5.053996538578444, + "learning_rate": 3.7694563644388175e-06, + "loss": 0.9218, + "step": 2491 + }, + { + "epoch": 0.1797914938133545, + "grad_norm": 3.2579947221688643, + "learning_rate": 3.76923847629488e-06, + "loss": 0.8433, + "step": 2492 + }, + { + "epoch": 0.179863641282782, + "grad_norm": 1.9320924421431218, + "learning_rate": 3.7690204915393784e-06, + "loss": 0.9047, + "step": 2493 + }, + { + "epoch": 0.17993578875220953, + "grad_norm": 2.1076798020451823, + "learning_rate": 3.7688024101842173e-06, + "loss": 0.9643, + "step": 2494 + }, + { + "epoch": 0.18000793622163702, + "grad_norm": 1.8307629976000028, + "learning_rate": 3.7685842322413034e-06, + "loss": 1.1136, + "step": 2495 + }, + { + "epoch": 0.18008008369106454, + "grad_norm": 1.9786951674613407, + "learning_rate": 3.768365957722552e-06, + "loss": 0.98, + "step": 2496 + }, + { + "epoch": 0.18015223116049203, + "grad_norm": 3.965906641512982, + "learning_rate": 3.768147586639882e-06, + "loss": 0.9688, + "step": 2497 + }, + { + "epoch": 0.18022437862991955, + "grad_norm": 2.7335648562683295, + "learning_rate": 3.7679291190052175e-06, + "loss": 0.9089, + "step": 2498 + }, + { + "epoch": 0.18029652609934707, + "grad_norm": 2.3770539286670087, + "learning_rate": 3.767710554830489e-06, + "loss": 1.0154, + "step": 2499 + }, + { + "epoch": 0.18036867356877456, + "grad_norm": 2.393677501726013, + "learning_rate": 3.767491894127631e-06, + "loss": 1.0186, + "step": 2500 + }, + { + "epoch": 0.18044082103820208, + "grad_norm": 2.2917771630669135, + "learning_rate": 3.7672731369085833e-06, + "loss": 0.8519, + "step": 2501 + }, + { + "epoch": 0.1805129685076296, + "grad_norm": 2.0214760010771604, + "learning_rate": 3.767054283185292e-06, + "loss": 0.9775, + "step": 2502 + }, + { + "epoch": 0.1805851159770571, + "grad_norm": 2.926167807616316, + "learning_rate": 3.766835332969707e-06, + "loss": 1.0209, + "step": 2503 + }, + { + "epoch": 0.1806572634464846, + "grad_norm": 1.801342990335932, + "learning_rate": 3.7666162862737857e-06, + "loss": 0.9881, + "step": 2504 + }, + { + "epoch": 0.18072941091591213, + "grad_norm": 2.697341199695898, + "learning_rate": 3.766397143109489e-06, + "loss": 0.976, + "step": 2505 + }, + { + "epoch": 0.18080155838533962, + "grad_norm": 7.083728854038603, + "learning_rate": 3.766177903488783e-06, + "loss": 0.9271, + "step": 2506 + }, + { + "epoch": 0.18087370585476714, + "grad_norm": 3.3834344939594367, + "learning_rate": 3.76595856742364e-06, + "loss": 0.9649, + "step": 2507 + }, + { + "epoch": 0.18094585332419466, + "grad_norm": 2.348524799361363, + "learning_rate": 3.7657391349260362e-06, + "loss": 1.0963, + "step": 2508 + }, + { + "epoch": 0.18101800079362215, + "grad_norm": 2.1047611215199957, + "learning_rate": 3.765519606007956e-06, + "loss": 1.0267, + "step": 2509 + }, + { + "epoch": 0.18109014826304967, + "grad_norm": 2.3927655578687803, + "learning_rate": 3.7652999806813856e-06, + "loss": 0.9529, + "step": 2510 + }, + { + "epoch": 0.1811622957324772, + "grad_norm": 2.0245192298494055, + "learning_rate": 3.7650802589583173e-06, + "loss": 0.9203, + "step": 2511 + }, + { + "epoch": 0.18123444320190468, + "grad_norm": 1.846910128444936, + "learning_rate": 3.764860440850751e-06, + "loss": 0.925, + "step": 2512 + }, + { + "epoch": 0.1813065906713322, + "grad_norm": 2.4352762887231076, + "learning_rate": 3.764640526370689e-06, + "loss": 0.9853, + "step": 2513 + }, + { + "epoch": 0.18137873814075972, + "grad_norm": 2.7668108795298454, + "learning_rate": 3.7644205155301404e-06, + "loss": 0.9409, + "step": 2514 + }, + { + "epoch": 0.1814508856101872, + "grad_norm": 1.9535716652340769, + "learning_rate": 3.764200408341119e-06, + "loss": 0.9588, + "step": 2515 + }, + { + "epoch": 0.18152303307961473, + "grad_norm": 3.025097925792209, + "learning_rate": 3.7639802048156445e-06, + "loss": 0.9308, + "step": 2516 + }, + { + "epoch": 0.18159518054904225, + "grad_norm": 2.1853875996196392, + "learning_rate": 3.763759904965741e-06, + "loss": 1.0019, + "step": 2517 + }, + { + "epoch": 0.18166732801846974, + "grad_norm": 2.3503091203005653, + "learning_rate": 3.7635395088034376e-06, + "loss": 1.0029, + "step": 2518 + }, + { + "epoch": 0.18173947548789726, + "grad_norm": 1.7874288638171834, + "learning_rate": 3.7633190163407702e-06, + "loss": 0.9856, + "step": 2519 + }, + { + "epoch": 0.18181162295732478, + "grad_norm": 45.78494545665365, + "learning_rate": 3.7630984275897796e-06, + "loss": 0.9371, + "step": 2520 + }, + { + "epoch": 0.18188377042675227, + "grad_norm": 2.630737029971224, + "learning_rate": 3.7628777425625103e-06, + "loss": 0.9162, + "step": 2521 + }, + { + "epoch": 0.1819559178961798, + "grad_norm": 2.0645270356920773, + "learning_rate": 3.762656961271013e-06, + "loss": 0.9677, + "step": 2522 + }, + { + "epoch": 0.1820280653656073, + "grad_norm": 3.641249836359567, + "learning_rate": 3.762436083727345e-06, + "loss": 0.7842, + "step": 2523 + }, + { + "epoch": 0.1821002128350348, + "grad_norm": 4.603698752721614, + "learning_rate": 3.7622151099435666e-06, + "loss": 0.9561, + "step": 2524 + }, + { + "epoch": 0.18217236030446232, + "grad_norm": 1.9718032050022645, + "learning_rate": 3.7619940399317437e-06, + "loss": 1.0257, + "step": 2525 + }, + { + "epoch": 0.18224450777388984, + "grad_norm": 2.528702006843927, + "learning_rate": 3.7617728737039495e-06, + "loss": 0.9716, + "step": 2526 + }, + { + "epoch": 0.18231665524331733, + "grad_norm": 2.917309790051926, + "learning_rate": 3.761551611272261e-06, + "loss": 1.0733, + "step": 2527 + }, + { + "epoch": 0.18238880271274485, + "grad_norm": 0.7250138462322951, + "learning_rate": 3.761330252648759e-06, + "loss": 0.7547, + "step": 2528 + }, + { + "epoch": 0.18246095018217237, + "grad_norm": 2.4214238884933206, + "learning_rate": 3.7611087978455328e-06, + "loss": 0.922, + "step": 2529 + }, + { + "epoch": 0.18253309765159986, + "grad_norm": 1.9905068881987202, + "learning_rate": 3.7608872468746746e-06, + "loss": 1.0061, + "step": 2530 + }, + { + "epoch": 0.18260524512102738, + "grad_norm": 2.395168997390792, + "learning_rate": 3.7606655997482826e-06, + "loss": 0.9698, + "step": 2531 + }, + { + "epoch": 0.1826773925904549, + "grad_norm": 1.951321127436425, + "learning_rate": 3.7604438564784596e-06, + "loss": 0.9995, + "step": 2532 + }, + { + "epoch": 0.1827495400598824, + "grad_norm": 4.781021212420853, + "learning_rate": 3.7602220170773144e-06, + "loss": 0.9417, + "step": 2533 + }, + { + "epoch": 0.1828216875293099, + "grad_norm": 2.9227440505514983, + "learning_rate": 3.760000081556961e-06, + "loss": 0.9117, + "step": 2534 + }, + { + "epoch": 0.18289383499873743, + "grad_norm": 0.7426301138316056, + "learning_rate": 3.759778049929519e-06, + "loss": 0.7621, + "step": 2535 + }, + { + "epoch": 0.18296598246816492, + "grad_norm": 1.7455990493763085, + "learning_rate": 3.759555922207112e-06, + "loss": 1.055, + "step": 2536 + }, + { + "epoch": 0.18303812993759244, + "grad_norm": 2.4192941019166323, + "learning_rate": 3.7593336984018695e-06, + "loss": 0.9633, + "step": 2537 + }, + { + "epoch": 0.18311027740701996, + "grad_norm": 1.9707400844953138, + "learning_rate": 3.7591113785259263e-06, + "loss": 0.9406, + "step": 2538 + }, + { + "epoch": 0.18318242487644745, + "grad_norm": 2.6580691223227078, + "learning_rate": 3.758888962591423e-06, + "loss": 0.9267, + "step": 2539 + }, + { + "epoch": 0.18325457234587497, + "grad_norm": 3.531229069740585, + "learning_rate": 3.7586664506105047e-06, + "loss": 0.9203, + "step": 2540 + }, + { + "epoch": 0.1833267198153025, + "grad_norm": 1.8599016741943373, + "learning_rate": 3.758443842595322e-06, + "loss": 0.902, + "step": 2541 + }, + { + "epoch": 0.18339886728472998, + "grad_norm": 2.542434190856795, + "learning_rate": 3.7582211385580305e-06, + "loss": 0.7251, + "step": 2542 + }, + { + "epoch": 0.1834710147541575, + "grad_norm": 3.120938064454443, + "learning_rate": 3.757998338510791e-06, + "loss": 0.8746, + "step": 2543 + }, + { + "epoch": 0.18354316222358502, + "grad_norm": 2.362433869581744, + "learning_rate": 3.75777544246577e-06, + "loss": 0.9777, + "step": 2544 + }, + { + "epoch": 0.1836153096930125, + "grad_norm": 2.2246255580993353, + "learning_rate": 3.7575524504351402e-06, + "loss": 0.9542, + "step": 2545 + }, + { + "epoch": 0.18368745716244003, + "grad_norm": 2.0831451585345366, + "learning_rate": 3.7573293624310762e-06, + "loss": 1.0156, + "step": 2546 + }, + { + "epoch": 0.18375960463186752, + "grad_norm": 2.459713198372833, + "learning_rate": 3.757106178465762e-06, + "loss": 0.9354, + "step": 2547 + }, + { + "epoch": 0.18383175210129504, + "grad_norm": 1.0285455904375758, + "learning_rate": 3.7568828985513833e-06, + "loss": 0.8413, + "step": 2548 + }, + { + "epoch": 0.18390389957072256, + "grad_norm": 4.463405587767928, + "learning_rate": 3.7566595227001336e-06, + "loss": 0.9787, + "step": 2549 + }, + { + "epoch": 0.18397604704015005, + "grad_norm": 2.3912797417712888, + "learning_rate": 3.75643605092421e-06, + "loss": 0.9477, + "step": 2550 + }, + { + "epoch": 0.18404819450957757, + "grad_norm": 2.2781906516161206, + "learning_rate": 3.7562124832358163e-06, + "loss": 1.0065, + "step": 2551 + }, + { + "epoch": 0.1841203419790051, + "grad_norm": 1.9200508330490393, + "learning_rate": 3.75598881964716e-06, + "loss": 0.9482, + "step": 2552 + }, + { + "epoch": 0.18419248944843258, + "grad_norm": 1.9341614169282793, + "learning_rate": 3.7557650601704544e-06, + "loss": 0.9653, + "step": 2553 + }, + { + "epoch": 0.1842646369178601, + "grad_norm": 2.3564774866354083, + "learning_rate": 3.755541204817919e-06, + "loss": 1.0085, + "step": 2554 + }, + { + "epoch": 0.18433678438728762, + "grad_norm": 2.0026786271147445, + "learning_rate": 3.7553172536017763e-06, + "loss": 1.0243, + "step": 2555 + }, + { + "epoch": 0.1844089318567151, + "grad_norm": 2.3469092883412936, + "learning_rate": 3.755093206534257e-06, + "loss": 0.9731, + "step": 2556 + }, + { + "epoch": 0.18448107932614263, + "grad_norm": 2.5069248613160355, + "learning_rate": 3.7548690636275946e-06, + "loss": 0.8878, + "step": 2557 + }, + { + "epoch": 0.18455322679557015, + "grad_norm": 3.158885044702794, + "learning_rate": 3.7546448248940293e-06, + "loss": 0.9677, + "step": 2558 + }, + { + "epoch": 0.18462537426499764, + "grad_norm": 3.759396288677514, + "learning_rate": 3.754420490345805e-06, + "loss": 0.907, + "step": 2559 + }, + { + "epoch": 0.18469752173442516, + "grad_norm": 2.2245213840354525, + "learning_rate": 3.754196059995173e-06, + "loss": 0.9966, + "step": 2560 + }, + { + "epoch": 0.18476966920385268, + "grad_norm": 2.4786512549695665, + "learning_rate": 3.7539715338543877e-06, + "loss": 1.0026, + "step": 2561 + }, + { + "epoch": 0.18484181667328017, + "grad_norm": 0.8728478053266676, + "learning_rate": 3.75374691193571e-06, + "loss": 0.8839, + "step": 2562 + }, + { + "epoch": 0.1849139641427077, + "grad_norm": 1.7552557088946266, + "learning_rate": 3.7535221942514053e-06, + "loss": 0.9965, + "step": 2563 + }, + { + "epoch": 0.1849861116121352, + "grad_norm": 0.8428061468997385, + "learning_rate": 3.7532973808137453e-06, + "loss": 0.8319, + "step": 2564 + }, + { + "epoch": 0.1850582590815627, + "grad_norm": 2.650135767355835, + "learning_rate": 3.7530724716350057e-06, + "loss": 0.966, + "step": 2565 + }, + { + "epoch": 0.18513040655099022, + "grad_norm": 3.4568482086517323, + "learning_rate": 3.752847466727468e-06, + "loss": 0.9162, + "step": 2566 + }, + { + "epoch": 0.18520255402041774, + "grad_norm": 2.141017682889337, + "learning_rate": 3.7526223661034184e-06, + "loss": 1.0156, + "step": 2567 + }, + { + "epoch": 0.18527470148984523, + "grad_norm": 2.0792655396380164, + "learning_rate": 3.75239716977515e-06, + "loss": 1.0203, + "step": 2568 + }, + { + "epoch": 0.18534684895927275, + "grad_norm": 2.1336519594468806, + "learning_rate": 3.7521718777549594e-06, + "loss": 0.8503, + "step": 2569 + }, + { + "epoch": 0.18541899642870027, + "grad_norm": 3.318401046769686, + "learning_rate": 3.7519464900551485e-06, + "loss": 1.043, + "step": 2570 + }, + { + "epoch": 0.18549114389812776, + "grad_norm": 2.131790419270411, + "learning_rate": 3.751721006688026e-06, + "loss": 1.0362, + "step": 2571 + }, + { + "epoch": 0.18556329136755528, + "grad_norm": 1.7800616265462184, + "learning_rate": 3.7514954276659026e-06, + "loss": 0.997, + "step": 2572 + }, + { + "epoch": 0.1856354388369828, + "grad_norm": 2.3242759889879343, + "learning_rate": 3.751269753001099e-06, + "loss": 1.0175, + "step": 2573 + }, + { + "epoch": 0.1857075863064103, + "grad_norm": 2.480933438725528, + "learning_rate": 3.7510439827059366e-06, + "loss": 1.0723, + "step": 2574 + }, + { + "epoch": 0.1857797337758378, + "grad_norm": 2.793775834000382, + "learning_rate": 3.7508181167927447e-06, + "loss": 0.9554, + "step": 2575 + }, + { + "epoch": 0.18585188124526533, + "grad_norm": 2.268238323089536, + "learning_rate": 3.7505921552738565e-06, + "loss": 1.1098, + "step": 2576 + }, + { + "epoch": 0.18592402871469282, + "grad_norm": 2.5336942762769206, + "learning_rate": 3.750366098161611e-06, + "loss": 0.9857, + "step": 2577 + }, + { + "epoch": 0.18599617618412034, + "grad_norm": 0.8771425650040525, + "learning_rate": 3.750139945468353e-06, + "loss": 0.8226, + "step": 2578 + }, + { + "epoch": 0.18606832365354786, + "grad_norm": 2.872822475940949, + "learning_rate": 3.749913697206431e-06, + "loss": 0.9395, + "step": 2579 + }, + { + "epoch": 0.18614047112297535, + "grad_norm": 2.6754887107706473, + "learning_rate": 3.7496873533882e-06, + "loss": 0.9062, + "step": 2580 + }, + { + "epoch": 0.18621261859240287, + "grad_norm": 1.7885708862240315, + "learning_rate": 3.74946091402602e-06, + "loss": 1.0147, + "step": 2581 + }, + { + "epoch": 0.1862847660618304, + "grad_norm": 2.7968752557338834, + "learning_rate": 3.7492343791322556e-06, + "loss": 0.9823, + "step": 2582 + }, + { + "epoch": 0.18635691353125788, + "grad_norm": 2.687037228507043, + "learning_rate": 3.7490077487192773e-06, + "loss": 1.0151, + "step": 2583 + }, + { + "epoch": 0.1864290610006854, + "grad_norm": 2.574971437758764, + "learning_rate": 3.7487810227994606e-06, + "loss": 0.903, + "step": 2584 + }, + { + "epoch": 0.18650120847011292, + "grad_norm": 2.0272061516256716, + "learning_rate": 3.7485542013851855e-06, + "loss": 0.9854, + "step": 2585 + }, + { + "epoch": 0.18657335593954041, + "grad_norm": 4.620954032585116, + "learning_rate": 3.748327284488839e-06, + "loss": 0.9589, + "step": 2586 + }, + { + "epoch": 0.18664550340896793, + "grad_norm": 2.192252854061629, + "learning_rate": 3.748100272122811e-06, + "loss": 0.9643, + "step": 2587 + }, + { + "epoch": 0.18671765087839545, + "grad_norm": 5.789331344971973, + "learning_rate": 3.7478731642994984e-06, + "loss": 1.0146, + "step": 2588 + }, + { + "epoch": 0.18678979834782294, + "grad_norm": 1.765043584917646, + "learning_rate": 3.747645961031303e-06, + "loss": 0.882, + "step": 2589 + }, + { + "epoch": 0.18686194581725046, + "grad_norm": 2.315368419851842, + "learning_rate": 3.7474186623306317e-06, + "loss": 0.9713, + "step": 2590 + }, + { + "epoch": 0.18693409328667798, + "grad_norm": 3.5840866864228405, + "learning_rate": 3.747191268209895e-06, + "loss": 0.8893, + "step": 2591 + }, + { + "epoch": 0.18700624075610547, + "grad_norm": 3.6231479517571823, + "learning_rate": 3.7469637786815117e-06, + "loss": 0.9488, + "step": 2592 + }, + { + "epoch": 0.187078388225533, + "grad_norm": 2.193973553778941, + "learning_rate": 3.746736193757903e-06, + "loss": 0.9992, + "step": 2593 + }, + { + "epoch": 0.1871505356949605, + "grad_norm": 2.643215646693538, + "learning_rate": 3.7465085134514977e-06, + "loss": 0.999, + "step": 2594 + }, + { + "epoch": 0.187222683164388, + "grad_norm": 2.1426365443622424, + "learning_rate": 3.746280737774727e-06, + "loss": 0.9332, + "step": 2595 + }, + { + "epoch": 0.18729483063381552, + "grad_norm": 3.212583542268161, + "learning_rate": 3.7460528667400306e-06, + "loss": 0.9882, + "step": 2596 + }, + { + "epoch": 0.18736697810324304, + "grad_norm": 2.8848795109649976, + "learning_rate": 3.7458249003598496e-06, + "loss": 0.9128, + "step": 2597 + }, + { + "epoch": 0.18743912557267053, + "grad_norm": 3.137453822065865, + "learning_rate": 3.7455968386466346e-06, + "loss": 0.9401, + "step": 2598 + }, + { + "epoch": 0.18751127304209805, + "grad_norm": 1.8523869348943764, + "learning_rate": 3.745368681612838e-06, + "loss": 0.969, + "step": 2599 + }, + { + "epoch": 0.18758342051152554, + "grad_norm": 2.164500939199588, + "learning_rate": 3.7451404292709184e-06, + "loss": 0.94, + "step": 2600 + }, + { + "epoch": 0.18765556798095306, + "grad_norm": 2.8135383384672665, + "learning_rate": 3.7449120816333405e-06, + "loss": 0.8738, + "step": 2601 + }, + { + "epoch": 0.18772771545038058, + "grad_norm": 2.1813760322077393, + "learning_rate": 3.7446836387125737e-06, + "loss": 0.9109, + "step": 2602 + }, + { + "epoch": 0.18779986291980807, + "grad_norm": 2.1124799693585645, + "learning_rate": 3.7444551005210914e-06, + "loss": 1.0161, + "step": 2603 + }, + { + "epoch": 0.1878720103892356, + "grad_norm": 2.750254879323936, + "learning_rate": 3.744226467071374e-06, + "loss": 0.9659, + "step": 2604 + }, + { + "epoch": 0.1879441578586631, + "grad_norm": 2.6501534003717047, + "learning_rate": 3.743997738375906e-06, + "loss": 0.9089, + "step": 2605 + }, + { + "epoch": 0.1880163053280906, + "grad_norm": 3.138284902686837, + "learning_rate": 3.743768914447178e-06, + "loss": 0.9526, + "step": 2606 + }, + { + "epoch": 0.18808845279751812, + "grad_norm": 0.8670772448033742, + "learning_rate": 3.7435399952976844e-06, + "loss": 0.7789, + "step": 2607 + }, + { + "epoch": 0.18816060026694564, + "grad_norm": 0.8190491166344023, + "learning_rate": 3.7433109809399264e-06, + "loss": 0.8999, + "step": 2608 + }, + { + "epoch": 0.18823274773637313, + "grad_norm": 8.780332469858417, + "learning_rate": 3.743081871386409e-06, + "loss": 1.0949, + "step": 2609 + }, + { + "epoch": 0.18830489520580065, + "grad_norm": 2.743360306648532, + "learning_rate": 3.7428526666496436e-06, + "loss": 1.0215, + "step": 2610 + }, + { + "epoch": 0.18837704267522817, + "grad_norm": 2.118178807686978, + "learning_rate": 3.742623366742145e-06, + "loss": 1.0643, + "step": 2611 + }, + { + "epoch": 0.18844919014465566, + "grad_norm": 2.20593742339262, + "learning_rate": 3.742393971676436e-06, + "loss": 0.9911, + "step": 2612 + }, + { + "epoch": 0.18852133761408318, + "grad_norm": 2.9962930028675774, + "learning_rate": 3.7421644814650424e-06, + "loss": 0.9731, + "step": 2613 + }, + { + "epoch": 0.1885934850835107, + "grad_norm": 2.8241638769198096, + "learning_rate": 3.7419348961204956e-06, + "loss": 0.9011, + "step": 2614 + }, + { + "epoch": 0.1886656325529382, + "grad_norm": 2.3362503097476823, + "learning_rate": 3.7417052156553325e-06, + "loss": 0.8692, + "step": 2615 + }, + { + "epoch": 0.18873778002236571, + "grad_norm": 2.74659657114207, + "learning_rate": 3.7414754400820952e-06, + "loss": 1.009, + "step": 2616 + }, + { + "epoch": 0.18880992749179323, + "grad_norm": 1.7644956234417992, + "learning_rate": 3.7412455694133308e-06, + "loss": 0.9775, + "step": 2617 + }, + { + "epoch": 0.18888207496122073, + "grad_norm": 2.9919994484396035, + "learning_rate": 3.741015603661592e-06, + "loss": 0.9388, + "step": 2618 + }, + { + "epoch": 0.18895422243064824, + "grad_norm": 2.253398130583722, + "learning_rate": 3.740785542839435e-06, + "loss": 1.0247, + "step": 2619 + }, + { + "epoch": 0.18902636990007576, + "grad_norm": 2.2938080117688, + "learning_rate": 3.740555386959425e-06, + "loss": 0.9469, + "step": 2620 + }, + { + "epoch": 0.18909851736950326, + "grad_norm": 3.407836264729007, + "learning_rate": 3.740325136034128e-06, + "loss": 0.972, + "step": 2621 + }, + { + "epoch": 0.18917066483893077, + "grad_norm": 1.8125653090713856, + "learning_rate": 3.740094790076118e-06, + "loss": 0.8866, + "step": 2622 + }, + { + "epoch": 0.1892428123083583, + "grad_norm": 2.3644766314346577, + "learning_rate": 3.7398643490979735e-06, + "loss": 1.0188, + "step": 2623 + }, + { + "epoch": 0.18931495977778579, + "grad_norm": 2.5344621974935198, + "learning_rate": 3.7396338131122762e-06, + "loss": 1.017, + "step": 2624 + }, + { + "epoch": 0.1893871072472133, + "grad_norm": 2.3856377533873334, + "learning_rate": 3.7394031821316175e-06, + "loss": 0.9439, + "step": 2625 + }, + { + "epoch": 0.18945925471664082, + "grad_norm": 2.0090461236378565, + "learning_rate": 3.7391724561685902e-06, + "loss": 0.948, + "step": 2626 + }, + { + "epoch": 0.18953140218606832, + "grad_norm": 2.3046063942296566, + "learning_rate": 3.738941635235793e-06, + "loss": 0.8459, + "step": 2627 + }, + { + "epoch": 0.18960354965549583, + "grad_norm": 2.4185011467086097, + "learning_rate": 3.73871071934583e-06, + "loss": 0.9089, + "step": 2628 + }, + { + "epoch": 0.18967569712492335, + "grad_norm": 2.660506753334747, + "learning_rate": 3.7384797085113114e-06, + "loss": 0.9822, + "step": 2629 + }, + { + "epoch": 0.18974784459435085, + "grad_norm": 0.8351389198896036, + "learning_rate": 3.7382486027448515e-06, + "loss": 0.8495, + "step": 2630 + }, + { + "epoch": 0.18981999206377836, + "grad_norm": 2.392431038208418, + "learning_rate": 3.7380174020590707e-06, + "loss": 0.9314, + "step": 2631 + }, + { + "epoch": 0.18989213953320588, + "grad_norm": 1.9245351353939275, + "learning_rate": 3.737786106466593e-06, + "loss": 0.9275, + "step": 2632 + }, + { + "epoch": 0.18996428700263338, + "grad_norm": 3.2469876340507846, + "learning_rate": 3.737554715980049e-06, + "loss": 0.9811, + "step": 2633 + }, + { + "epoch": 0.1900364344720609, + "grad_norm": 2.3993654723598055, + "learning_rate": 3.737323230612074e-06, + "loss": 0.9688, + "step": 2634 + }, + { + "epoch": 0.1901085819414884, + "grad_norm": 2.0953235691355645, + "learning_rate": 3.7370916503753095e-06, + "loss": 0.9707, + "step": 2635 + }, + { + "epoch": 0.1901807294109159, + "grad_norm": 2.3852684496901815, + "learning_rate": 3.7368599752824e-06, + "loss": 0.9673, + "step": 2636 + }, + { + "epoch": 0.19025287688034342, + "grad_norm": 0.8582819142418039, + "learning_rate": 3.736628205345997e-06, + "loss": 0.8261, + "step": 2637 + }, + { + "epoch": 0.19032502434977094, + "grad_norm": 2.4894349973106267, + "learning_rate": 3.736396340578756e-06, + "loss": 0.9177, + "step": 2638 + }, + { + "epoch": 0.19039717181919844, + "grad_norm": 4.637974248527571, + "learning_rate": 3.7361643809933395e-06, + "loss": 0.9545, + "step": 2639 + }, + { + "epoch": 0.19046931928862595, + "grad_norm": 3.1014144439335336, + "learning_rate": 3.7359323266024126e-06, + "loss": 0.9561, + "step": 2640 + }, + { + "epoch": 0.19054146675805347, + "grad_norm": 3.0204307726394144, + "learning_rate": 3.735700177418648e-06, + "loss": 0.94, + "step": 2641 + }, + { + "epoch": 0.19061361422748097, + "grad_norm": 2.5533232706467603, + "learning_rate": 3.735467933454722e-06, + "loss": 0.956, + "step": 2642 + }, + { + "epoch": 0.19068576169690848, + "grad_norm": 2.073117525985995, + "learning_rate": 3.7352355947233166e-06, + "loss": 0.911, + "step": 2643 + }, + { + "epoch": 0.190757909166336, + "grad_norm": 2.306205740816684, + "learning_rate": 3.7350031612371186e-06, + "loss": 0.8789, + "step": 2644 + }, + { + "epoch": 0.1908300566357635, + "grad_norm": 1.943744686401562, + "learning_rate": 3.7347706330088213e-06, + "loss": 1.041, + "step": 2645 + }, + { + "epoch": 0.19090220410519101, + "grad_norm": 2.1150133293940265, + "learning_rate": 3.7345380100511214e-06, + "loss": 0.9839, + "step": 2646 + }, + { + "epoch": 0.19097435157461853, + "grad_norm": 2.621482854350159, + "learning_rate": 3.734305292376722e-06, + "loss": 0.9735, + "step": 2647 + }, + { + "epoch": 0.19104649904404603, + "grad_norm": 3.163037093922823, + "learning_rate": 3.73407247999833e-06, + "loss": 0.9418, + "step": 2648 + }, + { + "epoch": 0.19111864651347354, + "grad_norm": 2.354647111957668, + "learning_rate": 3.7338395729286595e-06, + "loss": 0.9762, + "step": 2649 + }, + { + "epoch": 0.19119079398290104, + "grad_norm": 3.4552111645881403, + "learning_rate": 3.733606571180429e-06, + "loss": 1.0382, + "step": 2650 + }, + { + "epoch": 0.19126294145232856, + "grad_norm": 1.9270712224906255, + "learning_rate": 3.7333734747663605e-06, + "loss": 1.1189, + "step": 2651 + }, + { + "epoch": 0.19133508892175607, + "grad_norm": 4.58547165436324, + "learning_rate": 3.7331402836991838e-06, + "loss": 0.9332, + "step": 2652 + }, + { + "epoch": 0.19140723639118357, + "grad_norm": 1.9582496855165854, + "learning_rate": 3.7329069979916322e-06, + "loss": 0.8637, + "step": 2653 + }, + { + "epoch": 0.19147938386061109, + "grad_norm": 2.095438034768901, + "learning_rate": 3.732673617656444e-06, + "loss": 0.938, + "step": 2654 + }, + { + "epoch": 0.1915515313300386, + "grad_norm": 3.387782298357144, + "learning_rate": 3.7324401427063634e-06, + "loss": 1.0115, + "step": 2655 + }, + { + "epoch": 0.1916236787994661, + "grad_norm": 2.3784983117621032, + "learning_rate": 3.7322065731541406e-06, + "loss": 0.8485, + "step": 2656 + }, + { + "epoch": 0.19169582626889362, + "grad_norm": 2.2829241945277565, + "learning_rate": 3.7319729090125288e-06, + "loss": 1.0145, + "step": 2657 + }, + { + "epoch": 0.19176797373832113, + "grad_norm": 2.26557206716856, + "learning_rate": 3.731739150294288e-06, + "loss": 0.9015, + "step": 2658 + }, + { + "epoch": 0.19184012120774863, + "grad_norm": 2.0120656131109698, + "learning_rate": 3.731505297012183e-06, + "loss": 1.0308, + "step": 2659 + }, + { + "epoch": 0.19191226867717615, + "grad_norm": 2.673388493064761, + "learning_rate": 3.731271349178984e-06, + "loss": 0.8913, + "step": 2660 + }, + { + "epoch": 0.19198441614660366, + "grad_norm": 2.3010086957492835, + "learning_rate": 3.7310373068074646e-06, + "loss": 0.9348, + "step": 2661 + }, + { + "epoch": 0.19205656361603116, + "grad_norm": 2.2003822298057383, + "learning_rate": 3.730803169910406e-06, + "loss": 0.8001, + "step": 2662 + }, + { + "epoch": 0.19212871108545868, + "grad_norm": 1.724003064208173, + "learning_rate": 3.7305689385005945e-06, + "loss": 0.9403, + "step": 2663 + }, + { + "epoch": 0.1922008585548862, + "grad_norm": 4.483698458262333, + "learning_rate": 3.730334612590819e-06, + "loss": 0.9498, + "step": 2664 + }, + { + "epoch": 0.1922730060243137, + "grad_norm": 0.7616694116508164, + "learning_rate": 3.730100192193876e-06, + "loss": 0.8885, + "step": 2665 + }, + { + "epoch": 0.1923451534937412, + "grad_norm": 0.7293665929733111, + "learning_rate": 3.729865677322566e-06, + "loss": 0.8146, + "step": 2666 + }, + { + "epoch": 0.19241730096316872, + "grad_norm": 2.6410309970398758, + "learning_rate": 3.729631067989695e-06, + "loss": 1.022, + "step": 2667 + }, + { + "epoch": 0.19248944843259622, + "grad_norm": 2.587292457402494, + "learning_rate": 3.7293963642080747e-06, + "loss": 1.097, + "step": 2668 + }, + { + "epoch": 0.19256159590202374, + "grad_norm": 10.238692210251733, + "learning_rate": 3.72916156599052e-06, + "loss": 1.0041, + "step": 2669 + }, + { + "epoch": 0.19263374337145125, + "grad_norm": 2.150138154137697, + "learning_rate": 3.7289266733498543e-06, + "loss": 0.9525, + "step": 2670 + }, + { + "epoch": 0.19270589084087875, + "grad_norm": 1.9828669908038725, + "learning_rate": 3.7286916862989024e-06, + "loss": 1.0135, + "step": 2671 + }, + { + "epoch": 0.19277803831030627, + "grad_norm": 2.004922412122622, + "learning_rate": 3.7284566048504974e-06, + "loss": 0.9837, + "step": 2672 + }, + { + "epoch": 0.19285018577973378, + "grad_norm": 1.9013788490349242, + "learning_rate": 3.7282214290174763e-06, + "loss": 0.8651, + "step": 2673 + }, + { + "epoch": 0.19292233324916128, + "grad_norm": 2.0358962692336733, + "learning_rate": 3.72798615881268e-06, + "loss": 0.991, + "step": 2674 + }, + { + "epoch": 0.1929944807185888, + "grad_norm": 2.9505865903946953, + "learning_rate": 3.727750794248956e-06, + "loss": 0.958, + "step": 2675 + }, + { + "epoch": 0.19306662818801631, + "grad_norm": 2.7138254574933542, + "learning_rate": 3.7275153353391576e-06, + "loss": 0.9632, + "step": 2676 + }, + { + "epoch": 0.1931387756574438, + "grad_norm": 2.418746206985378, + "learning_rate": 3.7272797820961412e-06, + "loss": 0.8668, + "step": 2677 + }, + { + "epoch": 0.19321092312687133, + "grad_norm": 2.521916831802897, + "learning_rate": 3.727044134532771e-06, + "loss": 0.8704, + "step": 2678 + }, + { + "epoch": 0.19328307059629884, + "grad_norm": 2.1868133557165956, + "learning_rate": 3.7268083926619126e-06, + "loss": 0.8359, + "step": 2679 + }, + { + "epoch": 0.19335521806572634, + "grad_norm": 3.813921678968149, + "learning_rate": 3.7265725564964414e-06, + "loss": 0.9807, + "step": 2680 + }, + { + "epoch": 0.19342736553515386, + "grad_norm": 2.503965856166992, + "learning_rate": 3.7263366260492345e-06, + "loss": 0.8758, + "step": 2681 + }, + { + "epoch": 0.19349951300458137, + "grad_norm": 2.1162741404743337, + "learning_rate": 3.726100601333174e-06, + "loss": 1.0501, + "step": 2682 + }, + { + "epoch": 0.19357166047400887, + "grad_norm": 2.3973935119719325, + "learning_rate": 3.7258644823611506e-06, + "loss": 1.014, + "step": 2683 + }, + { + "epoch": 0.19364380794343639, + "grad_norm": 44.69452250328752, + "learning_rate": 3.725628269146056e-06, + "loss": 0.9927, + "step": 2684 + }, + { + "epoch": 0.1937159554128639, + "grad_norm": 1.925687342708329, + "learning_rate": 3.72539196170079e-06, + "loss": 0.8821, + "step": 2685 + }, + { + "epoch": 0.1937881028822914, + "grad_norm": 2.7466633234497513, + "learning_rate": 3.7251555600382563e-06, + "loss": 0.9615, + "step": 2686 + }, + { + "epoch": 0.19386025035171892, + "grad_norm": 2.2947728924380844, + "learning_rate": 3.724919064171364e-06, + "loss": 0.9653, + "step": 2687 + }, + { + "epoch": 0.19393239782114643, + "grad_norm": 1.786946874015621, + "learning_rate": 3.7246824741130266e-06, + "loss": 1.0379, + "step": 2688 + }, + { + "epoch": 0.19400454529057393, + "grad_norm": 2.788228484616707, + "learning_rate": 3.7244457898761635e-06, + "loss": 0.8899, + "step": 2689 + }, + { + "epoch": 0.19407669276000145, + "grad_norm": 2.434409407444734, + "learning_rate": 3.7242090114737002e-06, + "loss": 0.9597, + "step": 2690 + }, + { + "epoch": 0.19414884022942897, + "grad_norm": 1.919918770462584, + "learning_rate": 3.723972138918565e-06, + "loss": 1.0001, + "step": 2691 + }, + { + "epoch": 0.19422098769885646, + "grad_norm": 2.3011424585811846, + "learning_rate": 3.723735172223694e-06, + "loss": 0.9128, + "step": 2692 + }, + { + "epoch": 0.19429313516828398, + "grad_norm": 1.98127390128078, + "learning_rate": 3.723498111402026e-06, + "loss": 0.9196, + "step": 2693 + }, + { + "epoch": 0.1943652826377115, + "grad_norm": 1.8970345544719516, + "learning_rate": 3.723260956466506e-06, + "loss": 1.0281, + "step": 2694 + }, + { + "epoch": 0.194437430107139, + "grad_norm": 1.4774787118995019, + "learning_rate": 3.7230237074300847e-06, + "loss": 0.9345, + "step": 2695 + }, + { + "epoch": 0.1945095775765665, + "grad_norm": 2.7724699669906236, + "learning_rate": 3.7227863643057173e-06, + "loss": 0.9474, + "step": 2696 + }, + { + "epoch": 0.19458172504599403, + "grad_norm": 2.433228295380227, + "learning_rate": 3.7225489271063637e-06, + "loss": 0.915, + "step": 2697 + }, + { + "epoch": 0.19465387251542152, + "grad_norm": 2.104711053004232, + "learning_rate": 3.7223113958449898e-06, + "loss": 0.9252, + "step": 2698 + }, + { + "epoch": 0.19472601998484904, + "grad_norm": 2.5916139930532203, + "learning_rate": 3.7220737705345666e-06, + "loss": 0.9471, + "step": 2699 + }, + { + "epoch": 0.19479816745427656, + "grad_norm": 1.7682127766886595, + "learning_rate": 3.7218360511880694e-06, + "loss": 0.9693, + "step": 2700 + }, + { + "epoch": 0.19487031492370405, + "grad_norm": 2.860937640004178, + "learning_rate": 3.7215982378184794e-06, + "loss": 1.0388, + "step": 2701 + }, + { + "epoch": 0.19494246239313157, + "grad_norm": 2.4747960863979257, + "learning_rate": 3.7213603304387833e-06, + "loss": 1.0067, + "step": 2702 + }, + { + "epoch": 0.19501460986255906, + "grad_norm": 2.4713468775961616, + "learning_rate": 3.7211223290619717e-06, + "loss": 0.8547, + "step": 2703 + }, + { + "epoch": 0.19508675733198658, + "grad_norm": 2.0135820074590804, + "learning_rate": 3.720884233701041e-06, + "loss": 0.9145, + "step": 2704 + }, + { + "epoch": 0.1951589048014141, + "grad_norm": 0.7873721003741698, + "learning_rate": 3.7206460443689927e-06, + "loss": 0.87, + "step": 2705 + }, + { + "epoch": 0.1952310522708416, + "grad_norm": 2.4782905205844905, + "learning_rate": 3.7204077610788334e-06, + "loss": 1.0557, + "step": 2706 + }, + { + "epoch": 0.1953031997402691, + "grad_norm": 3.5372271968401225, + "learning_rate": 3.7201693838435758e-06, + "loss": 0.9847, + "step": 2707 + }, + { + "epoch": 0.19537534720969663, + "grad_norm": 0.7704102583749853, + "learning_rate": 3.719930912676235e-06, + "loss": 0.8015, + "step": 2708 + }, + { + "epoch": 0.19544749467912412, + "grad_norm": 3.9868542904756232, + "learning_rate": 3.7196923475898344e-06, + "loss": 0.9672, + "step": 2709 + }, + { + "epoch": 0.19551964214855164, + "grad_norm": 2.0616407049338723, + "learning_rate": 3.7194536885974008e-06, + "loss": 1.0412, + "step": 2710 + }, + { + "epoch": 0.19559178961797916, + "grad_norm": 2.113498979729324, + "learning_rate": 3.7192149357119663e-06, + "loss": 1.0048, + "step": 2711 + }, + { + "epoch": 0.19566393708740665, + "grad_norm": 1.8603204198918928, + "learning_rate": 3.7189760889465686e-06, + "loss": 1.0327, + "step": 2712 + }, + { + "epoch": 0.19573608455683417, + "grad_norm": 2.1198656098937048, + "learning_rate": 3.7187371483142506e-06, + "loss": 0.907, + "step": 2713 + }, + { + "epoch": 0.19580823202626169, + "grad_norm": 3.364727210087502, + "learning_rate": 3.7184981138280593e-06, + "loss": 0.9693, + "step": 2714 + }, + { + "epoch": 0.19588037949568918, + "grad_norm": 2.851752455145728, + "learning_rate": 3.718258985501047e-06, + "loss": 1.0593, + "step": 2715 + }, + { + "epoch": 0.1959525269651167, + "grad_norm": 2.4054169637192104, + "learning_rate": 3.7180197633462726e-06, + "loss": 0.8925, + "step": 2716 + }, + { + "epoch": 0.19602467443454422, + "grad_norm": 1.662451882490642, + "learning_rate": 3.717780447376799e-06, + "loss": 0.9703, + "step": 2717 + }, + { + "epoch": 0.1960968219039717, + "grad_norm": 3.0029509653717703, + "learning_rate": 3.7175410376056943e-06, + "loss": 0.9372, + "step": 2718 + }, + { + "epoch": 0.19616896937339923, + "grad_norm": 2.512762111067987, + "learning_rate": 3.7173015340460314e-06, + "loss": 0.849, + "step": 2719 + }, + { + "epoch": 0.19624111684282675, + "grad_norm": 2.2273164242366454, + "learning_rate": 3.7170619367108884e-06, + "loss": 0.9707, + "step": 2720 + }, + { + "epoch": 0.19631326431225424, + "grad_norm": 3.588586298531676, + "learning_rate": 3.71682224561335e-06, + "loss": 1.0109, + "step": 2721 + }, + { + "epoch": 0.19638541178168176, + "grad_norm": 2.418484683603648, + "learning_rate": 3.7165824607665047e-06, + "loss": 0.9327, + "step": 2722 + }, + { + "epoch": 0.19645755925110928, + "grad_norm": 2.284251707327097, + "learning_rate": 3.716342582183445e-06, + "loss": 0.9051, + "step": 2723 + }, + { + "epoch": 0.19652970672053677, + "grad_norm": 1.6938064143509035, + "learning_rate": 3.7161026098772707e-06, + "loss": 0.9929, + "step": 2724 + }, + { + "epoch": 0.1966018541899643, + "grad_norm": 2.2781897097425095, + "learning_rate": 3.7158625438610857e-06, + "loss": 1.0291, + "step": 2725 + }, + { + "epoch": 0.1966740016593918, + "grad_norm": 1.971632225217346, + "learning_rate": 3.715622384147999e-06, + "loss": 0.8954, + "step": 2726 + }, + { + "epoch": 0.1967461491288193, + "grad_norm": 2.9099963253037364, + "learning_rate": 3.7153821307511253e-06, + "loss": 1.0182, + "step": 2727 + }, + { + "epoch": 0.19681829659824682, + "grad_norm": 2.603705179650073, + "learning_rate": 3.715141783683584e-06, + "loss": 0.9177, + "step": 2728 + }, + { + "epoch": 0.19689044406767434, + "grad_norm": 1.9547446897390433, + "learning_rate": 3.714901342958498e-06, + "loss": 0.9786, + "step": 2729 + }, + { + "epoch": 0.19696259153710183, + "grad_norm": 3.9680652177819358, + "learning_rate": 3.7146608085889987e-06, + "loss": 0.9368, + "step": 2730 + }, + { + "epoch": 0.19703473900652935, + "grad_norm": 2.0614590185266106, + "learning_rate": 3.7144201805882194e-06, + "loss": 1.0359, + "step": 2731 + }, + { + "epoch": 0.19710688647595687, + "grad_norm": 4.049710845043976, + "learning_rate": 3.714179458969301e-06, + "loss": 1.0107, + "step": 2732 + }, + { + "epoch": 0.19717903394538436, + "grad_norm": 1.688251469326072, + "learning_rate": 3.713938643745389e-06, + "loss": 0.958, + "step": 2733 + }, + { + "epoch": 0.19725118141481188, + "grad_norm": 3.672771977376588, + "learning_rate": 3.7136977349296315e-06, + "loss": 0.9271, + "step": 2734 + }, + { + "epoch": 0.1973233288842394, + "grad_norm": 3.686571165874812, + "learning_rate": 3.713456732535184e-06, + "loss": 0.9452, + "step": 2735 + }, + { + "epoch": 0.1973954763536669, + "grad_norm": 2.2809553609290614, + "learning_rate": 3.7132156365752085e-06, + "loss": 0.9438, + "step": 2736 + }, + { + "epoch": 0.1974676238230944, + "grad_norm": 2.5962860394374654, + "learning_rate": 3.7129744470628685e-06, + "loss": 0.9986, + "step": 2737 + }, + { + "epoch": 0.19753977129252193, + "grad_norm": 3.622983963823746, + "learning_rate": 3.7127331640113355e-06, + "loss": 1.0361, + "step": 2738 + }, + { + "epoch": 0.19761191876194942, + "grad_norm": 2.2932781680073995, + "learning_rate": 3.7124917874337846e-06, + "loss": 0.9179, + "step": 2739 + }, + { + "epoch": 0.19768406623137694, + "grad_norm": 2.989589270254658, + "learning_rate": 3.7122503173433966e-06, + "loss": 0.9828, + "step": 2740 + }, + { + "epoch": 0.19775621370080446, + "grad_norm": 2.3518850922703796, + "learning_rate": 3.712008753753357e-06, + "loss": 0.978, + "step": 2741 + }, + { + "epoch": 0.19782836117023195, + "grad_norm": 3.76793096698608, + "learning_rate": 3.7117670966768575e-06, + "loss": 0.9922, + "step": 2742 + }, + { + "epoch": 0.19790050863965947, + "grad_norm": 2.264888775349326, + "learning_rate": 3.7115253461270933e-06, + "loss": 0.9166, + "step": 2743 + }, + { + "epoch": 0.197972656109087, + "grad_norm": 2.392824445228608, + "learning_rate": 3.7112835021172662e-06, + "loss": 0.9573, + "step": 2744 + }, + { + "epoch": 0.19804480357851448, + "grad_norm": 3.2208448093172026, + "learning_rate": 3.711041564660582e-06, + "loss": 1.0177, + "step": 2745 + }, + { + "epoch": 0.198116951047942, + "grad_norm": 5.263321822775748, + "learning_rate": 3.7107995337702515e-06, + "loss": 1.0623, + "step": 2746 + }, + { + "epoch": 0.19818909851736952, + "grad_norm": 2.4720380109623763, + "learning_rate": 3.7105574094594917e-06, + "loss": 1.0472, + "step": 2747 + }, + { + "epoch": 0.198261245986797, + "grad_norm": 7.661521871594182, + "learning_rate": 3.7103151917415244e-06, + "loss": 0.9122, + "step": 2748 + }, + { + "epoch": 0.19833339345622453, + "grad_norm": 2.590666995381118, + "learning_rate": 3.7100728806295756e-06, + "loss": 0.8427, + "step": 2749 + }, + { + "epoch": 0.19840554092565205, + "grad_norm": 2.5164413549182743, + "learning_rate": 3.7098304761368776e-06, + "loss": 0.959, + "step": 2750 + }, + { + "epoch": 0.19847768839507954, + "grad_norm": 2.0212301932964523, + "learning_rate": 3.709587978276666e-06, + "loss": 0.9839, + "step": 2751 + }, + { + "epoch": 0.19854983586450706, + "grad_norm": 1.6627331649072545, + "learning_rate": 3.7093453870621847e-06, + "loss": 1.1168, + "step": 2752 + }, + { + "epoch": 0.19862198333393455, + "grad_norm": 2.307531328577565, + "learning_rate": 3.709102702506679e-06, + "loss": 1.0025, + "step": 2753 + }, + { + "epoch": 0.19869413080336207, + "grad_norm": 1.8610788440595827, + "learning_rate": 3.708859924623401e-06, + "loss": 0.9368, + "step": 2754 + }, + { + "epoch": 0.1987662782727896, + "grad_norm": 2.029894332390577, + "learning_rate": 3.7086170534256098e-06, + "loss": 0.9824, + "step": 2755 + }, + { + "epoch": 0.19883842574221708, + "grad_norm": 2.3158733404638183, + "learning_rate": 3.7083740889265655e-06, + "loss": 0.8934, + "step": 2756 + }, + { + "epoch": 0.1989105732116446, + "grad_norm": 2.222143881794445, + "learning_rate": 3.708131031139537e-06, + "loss": 1.0139, + "step": 2757 + }, + { + "epoch": 0.19898272068107212, + "grad_norm": 3.7553281443085034, + "learning_rate": 3.707887880077796e-06, + "loss": 1.0713, + "step": 2758 + }, + { + "epoch": 0.1990548681504996, + "grad_norm": 2.5095877855280304, + "learning_rate": 3.7076446357546196e-06, + "loss": 0.9118, + "step": 2759 + }, + { + "epoch": 0.19912701561992713, + "grad_norm": 2.352195781208177, + "learning_rate": 3.7074012981832917e-06, + "loss": 0.9465, + "step": 2760 + }, + { + "epoch": 0.19919916308935465, + "grad_norm": 2.429865594451345, + "learning_rate": 3.7071578673770995e-06, + "loss": 1.0702, + "step": 2761 + }, + { + "epoch": 0.19927131055878214, + "grad_norm": 2.6496698893661312, + "learning_rate": 3.706914343349335e-06, + "loss": 0.9467, + "step": 2762 + }, + { + "epoch": 0.19934345802820966, + "grad_norm": 2.616287213246291, + "learning_rate": 3.706670726113298e-06, + "loss": 0.9017, + "step": 2763 + }, + { + "epoch": 0.19941560549763718, + "grad_norm": 2.161787469155582, + "learning_rate": 3.7064270156822906e-06, + "loss": 0.9999, + "step": 2764 + }, + { + "epoch": 0.19948775296706467, + "grad_norm": 2.4460142061159407, + "learning_rate": 3.70618321206962e-06, + "loss": 0.9324, + "step": 2765 + }, + { + "epoch": 0.1995599004364922, + "grad_norm": 1.9166069090552973, + "learning_rate": 3.705939315288601e-06, + "loss": 0.9769, + "step": 2766 + }, + { + "epoch": 0.1996320479059197, + "grad_norm": 2.133787609946588, + "learning_rate": 3.705695325352551e-06, + "loss": 0.9864, + "step": 2767 + }, + { + "epoch": 0.1997041953753472, + "grad_norm": 0.8063413420396816, + "learning_rate": 3.705451242274793e-06, + "loss": 0.773, + "step": 2768 + }, + { + "epoch": 0.19977634284477472, + "grad_norm": 3.000905218247696, + "learning_rate": 3.705207066068657e-06, + "loss": 1.004, + "step": 2769 + }, + { + "epoch": 0.19984849031420224, + "grad_norm": 1.9114165409796193, + "learning_rate": 3.704962796747475e-06, + "loss": 0.9372, + "step": 2770 + }, + { + "epoch": 0.19992063778362973, + "grad_norm": 0.8613217584975877, + "learning_rate": 3.704718434324586e-06, + "loss": 0.7693, + "step": 2771 + }, + { + "epoch": 0.19999278525305725, + "grad_norm": 2.1939733364394107, + "learning_rate": 3.7044739788133343e-06, + "loss": 1.0299, + "step": 2772 + }, + { + "epoch": 0.20006493272248477, + "grad_norm": 3.2500334517884797, + "learning_rate": 3.7042294302270686e-06, + "loss": 0.971, + "step": 2773 + }, + { + "epoch": 0.20013708019191226, + "grad_norm": 2.2494251788521233, + "learning_rate": 3.7039847885791426e-06, + "loss": 0.9974, + "step": 2774 + }, + { + "epoch": 0.20020922766133978, + "grad_norm": 3.9274721366335883, + "learning_rate": 3.7037400538829147e-06, + "loss": 0.8953, + "step": 2775 + }, + { + "epoch": 0.2002813751307673, + "grad_norm": 2.280809647373601, + "learning_rate": 3.7034952261517494e-06, + "loss": 0.9581, + "step": 2776 + }, + { + "epoch": 0.2003535226001948, + "grad_norm": 2.2564561194952946, + "learning_rate": 3.703250305399016e-06, + "loss": 0.9348, + "step": 2777 + }, + { + "epoch": 0.2004256700696223, + "grad_norm": 2.131250389300448, + "learning_rate": 3.703005291638089e-06, + "loss": 0.9961, + "step": 2778 + }, + { + "epoch": 0.20049781753904983, + "grad_norm": 4.048688445416831, + "learning_rate": 3.702760184882347e-06, + "loss": 0.9257, + "step": 2779 + }, + { + "epoch": 0.20056996500847732, + "grad_norm": 1.8570171761691598, + "learning_rate": 3.702514985145175e-06, + "loss": 1.0434, + "step": 2780 + }, + { + "epoch": 0.20064211247790484, + "grad_norm": 2.268763295704789, + "learning_rate": 3.7022696924399623e-06, + "loss": 0.9237, + "step": 2781 + }, + { + "epoch": 0.20071425994733236, + "grad_norm": 2.584230164755762, + "learning_rate": 3.702024306780103e-06, + "loss": 0.9929, + "step": 2782 + }, + { + "epoch": 0.20078640741675985, + "grad_norm": 1.9157171454367137, + "learning_rate": 3.701778828178997e-06, + "loss": 0.9411, + "step": 2783 + }, + { + "epoch": 0.20085855488618737, + "grad_norm": 1.7236577105142241, + "learning_rate": 3.701533256650049e-06, + "loss": 0.9536, + "step": 2784 + }, + { + "epoch": 0.2009307023556149, + "grad_norm": 2.526922694984356, + "learning_rate": 3.7012875922066688e-06, + "loss": 0.9846, + "step": 2785 + }, + { + "epoch": 0.20100284982504238, + "grad_norm": 3.131883592900212, + "learning_rate": 3.701041834862271e-06, + "loss": 0.8998, + "step": 2786 + }, + { + "epoch": 0.2010749972944699, + "grad_norm": 1.9135157835872365, + "learning_rate": 3.700795984630275e-06, + "loss": 0.9332, + "step": 2787 + }, + { + "epoch": 0.20114714476389742, + "grad_norm": 2.326961252077622, + "learning_rate": 3.7005500415241073e-06, + "loss": 0.9684, + "step": 2788 + }, + { + "epoch": 0.2012192922333249, + "grad_norm": 2.520090061879319, + "learning_rate": 3.7003040055571973e-06, + "loss": 0.9109, + "step": 2789 + }, + { + "epoch": 0.20129143970275243, + "grad_norm": 2.1018046980485634, + "learning_rate": 3.7000578767429792e-06, + "loss": 0.983, + "step": 2790 + }, + { + "epoch": 0.20136358717217995, + "grad_norm": 2.8827678397836376, + "learning_rate": 3.699811655094894e-06, + "loss": 0.7951, + "step": 2791 + }, + { + "epoch": 0.20143573464160744, + "grad_norm": 2.259463541451312, + "learning_rate": 3.699565340626387e-06, + "loss": 0.9953, + "step": 2792 + }, + { + "epoch": 0.20150788211103496, + "grad_norm": 1.9134591533526584, + "learning_rate": 3.6993189333509082e-06, + "loss": 0.9214, + "step": 2793 + }, + { + "epoch": 0.20158002958046248, + "grad_norm": 1.5944426284377995, + "learning_rate": 3.699072433281913e-06, + "loss": 1.0226, + "step": 2794 + }, + { + "epoch": 0.20165217704988997, + "grad_norm": 2.853956414342346, + "learning_rate": 3.698825840432862e-06, + "loss": 0.9577, + "step": 2795 + }, + { + "epoch": 0.2017243245193175, + "grad_norm": 3.849618912699055, + "learning_rate": 3.698579154817221e-06, + "loss": 0.9625, + "step": 2796 + }, + { + "epoch": 0.201796471988745, + "grad_norm": 1.9283801140327022, + "learning_rate": 3.69833237644846e-06, + "loss": 1.006, + "step": 2797 + }, + { + "epoch": 0.2018686194581725, + "grad_norm": 2.3325255108222236, + "learning_rate": 3.698085505340055e-06, + "loss": 0.9962, + "step": 2798 + }, + { + "epoch": 0.20194076692760002, + "grad_norm": 2.3843347908519354, + "learning_rate": 3.6978385415054863e-06, + "loss": 1.0733, + "step": 2799 + }, + { + "epoch": 0.20201291439702754, + "grad_norm": 2.460588024695847, + "learning_rate": 3.6975914849582407e-06, + "loss": 0.9389, + "step": 2800 + }, + { + "epoch": 0.20208506186645503, + "grad_norm": 2.2191409988819326, + "learning_rate": 3.6973443357118077e-06, + "loss": 1.0129, + "step": 2801 + }, + { + "epoch": 0.20215720933588255, + "grad_norm": 0.7850312090701697, + "learning_rate": 3.697097093779684e-06, + "loss": 0.8718, + "step": 2802 + }, + { + "epoch": 0.20222935680531004, + "grad_norm": 2.7270996963635654, + "learning_rate": 3.6968497591753706e-06, + "loss": 0.8913, + "step": 2803 + }, + { + "epoch": 0.20230150427473756, + "grad_norm": 1.9266363570635245, + "learning_rate": 3.696602331912374e-06, + "loss": 1.0546, + "step": 2804 + }, + { + "epoch": 0.20237365174416508, + "grad_norm": 4.858957352430084, + "learning_rate": 3.6963548120042037e-06, + "loss": 0.9512, + "step": 2805 + }, + { + "epoch": 0.20244579921359257, + "grad_norm": 2.461438906013535, + "learning_rate": 3.6961071994643774e-06, + "loss": 0.8663, + "step": 2806 + }, + { + "epoch": 0.2025179466830201, + "grad_norm": 2.4582610577664656, + "learning_rate": 3.6958594943064157e-06, + "loss": 0.9221, + "step": 2807 + }, + { + "epoch": 0.2025900941524476, + "grad_norm": 2.9017854258778186, + "learning_rate": 3.695611696543844e-06, + "loss": 0.9033, + "step": 2808 + }, + { + "epoch": 0.2026622416218751, + "grad_norm": 2.545700083074176, + "learning_rate": 3.6953638061901958e-06, + "loss": 1.0235, + "step": 2809 + }, + { + "epoch": 0.20273438909130262, + "grad_norm": 2.128493522743512, + "learning_rate": 3.695115823259005e-06, + "loss": 0.8923, + "step": 2810 + }, + { + "epoch": 0.20280653656073014, + "grad_norm": 2.0552043772326014, + "learning_rate": 3.694867747763815e-06, + "loss": 0.9527, + "step": 2811 + }, + { + "epoch": 0.20287868403015763, + "grad_norm": 2.0400941073458823, + "learning_rate": 3.6946195797181714e-06, + "loss": 0.913, + "step": 2812 + }, + { + "epoch": 0.20295083149958515, + "grad_norm": 2.5394232390974265, + "learning_rate": 3.6943713191356256e-06, + "loss": 0.8991, + "step": 2813 + }, + { + "epoch": 0.20302297896901267, + "grad_norm": 0.7940213212794685, + "learning_rate": 3.6941229660297343e-06, + "loss": 0.8537, + "step": 2814 + }, + { + "epoch": 0.20309512643844016, + "grad_norm": 2.6334139988612106, + "learning_rate": 3.693874520414059e-06, + "loss": 1.0829, + "step": 2815 + }, + { + "epoch": 0.20316727390786768, + "grad_norm": 2.201157577454675, + "learning_rate": 3.6936259823021673e-06, + "loss": 0.8428, + "step": 2816 + }, + { + "epoch": 0.2032394213772952, + "grad_norm": 1.587099448306084, + "learning_rate": 3.6933773517076297e-06, + "loss": 1.0061, + "step": 2817 + }, + { + "epoch": 0.2033115688467227, + "grad_norm": 1.7407052615715746, + "learning_rate": 3.6931286286440245e-06, + "loss": 0.9389, + "step": 2818 + }, + { + "epoch": 0.2033837163161502, + "grad_norm": 2.4632541953844513, + "learning_rate": 3.692879813124932e-06, + "loss": 1.0161, + "step": 2819 + }, + { + "epoch": 0.20345586378557773, + "grad_norm": 2.0910548613551536, + "learning_rate": 3.6926309051639396e-06, + "loss": 1.0311, + "step": 2820 + }, + { + "epoch": 0.20352801125500522, + "grad_norm": 0.7942522682563397, + "learning_rate": 3.69238190477464e-06, + "loss": 0.8543, + "step": 2821 + }, + { + "epoch": 0.20360015872443274, + "grad_norm": 2.74507740905233, + "learning_rate": 3.6921328119706293e-06, + "loss": 1.0647, + "step": 2822 + }, + { + "epoch": 0.20367230619386026, + "grad_norm": 0.8040307336930358, + "learning_rate": 3.6918836267655093e-06, + "loss": 0.8794, + "step": 2823 + }, + { + "epoch": 0.20374445366328775, + "grad_norm": 2.3493065642882396, + "learning_rate": 3.691634349172888e-06, + "loss": 1.0004, + "step": 2824 + }, + { + "epoch": 0.20381660113271527, + "grad_norm": 2.1381554385028276, + "learning_rate": 3.691384979206377e-06, + "loss": 0.8323, + "step": 2825 + }, + { + "epoch": 0.2038887486021428, + "grad_norm": 0.865536398088677, + "learning_rate": 3.691135516879593e-06, + "loss": 0.7999, + "step": 2826 + }, + { + "epoch": 0.20396089607157028, + "grad_norm": 2.1927261049784255, + "learning_rate": 3.6908859622061596e-06, + "loss": 1.0092, + "step": 2827 + }, + { + "epoch": 0.2040330435409978, + "grad_norm": 0.7017093394329481, + "learning_rate": 3.6906363151997033e-06, + "loss": 0.8142, + "step": 2828 + }, + { + "epoch": 0.20410519101042532, + "grad_norm": 2.0891022980122487, + "learning_rate": 3.6903865758738563e-06, + "loss": 1.0338, + "step": 2829 + }, + { + "epoch": 0.2041773384798528, + "grad_norm": 2.052994062873145, + "learning_rate": 3.6901367442422556e-06, + "loss": 0.9212, + "step": 2830 + }, + { + "epoch": 0.20424948594928033, + "grad_norm": 1.6945940596115423, + "learning_rate": 3.689886820318544e-06, + "loss": 1.0485, + "step": 2831 + }, + { + "epoch": 0.20432163341870785, + "grad_norm": 2.696106459290033, + "learning_rate": 3.689636804116369e-06, + "loss": 0.958, + "step": 2832 + }, + { + "epoch": 0.20439378088813534, + "grad_norm": 3.0124874890332793, + "learning_rate": 3.6893866956493832e-06, + "loss": 0.8923, + "step": 2833 + }, + { + "epoch": 0.20446592835756286, + "grad_norm": 2.48031236050425, + "learning_rate": 3.6891364949312437e-06, + "loss": 0.9966, + "step": 2834 + }, + { + "epoch": 0.20453807582699038, + "grad_norm": 3.410463579275041, + "learning_rate": 3.688886201975613e-06, + "loss": 0.9595, + "step": 2835 + }, + { + "epoch": 0.20461022329641787, + "grad_norm": 3.110360607571228, + "learning_rate": 3.688635816796159e-06, + "loss": 0.9331, + "step": 2836 + }, + { + "epoch": 0.2046823707658454, + "grad_norm": 2.7159414583282016, + "learning_rate": 3.6883853394065546e-06, + "loss": 0.8942, + "step": 2837 + }, + { + "epoch": 0.2047545182352729, + "grad_norm": 2.7120864588085114, + "learning_rate": 3.6881347698204765e-06, + "loss": 0.9329, + "step": 2838 + }, + { + "epoch": 0.2048266657047004, + "grad_norm": 6.698487854170574, + "learning_rate": 3.6878841080516074e-06, + "loss": 0.8922, + "step": 2839 + }, + { + "epoch": 0.20489881317412792, + "grad_norm": 3.8293961886141474, + "learning_rate": 3.687633354113636e-06, + "loss": 0.9559, + "step": 2840 + }, + { + "epoch": 0.20497096064355544, + "grad_norm": 2.2892541382025158, + "learning_rate": 3.6873825080202544e-06, + "loss": 0.8066, + "step": 2841 + }, + { + "epoch": 0.20504310811298293, + "grad_norm": 2.5060091278689316, + "learning_rate": 3.687131569785161e-06, + "loss": 0.9458, + "step": 2842 + }, + { + "epoch": 0.20511525558241045, + "grad_norm": 2.919925141093491, + "learning_rate": 3.6868805394220575e-06, + "loss": 0.9286, + "step": 2843 + }, + { + "epoch": 0.20518740305183797, + "grad_norm": 2.0961347066238343, + "learning_rate": 3.6866294169446526e-06, + "loss": 0.9244, + "step": 2844 + }, + { + "epoch": 0.20525955052126546, + "grad_norm": 3.5002667461654293, + "learning_rate": 3.686378202366659e-06, + "loss": 0.8525, + "step": 2845 + }, + { + "epoch": 0.20533169799069298, + "grad_norm": 2.5281767850459573, + "learning_rate": 3.6861268957017947e-06, + "loss": 0.9881, + "step": 2846 + }, + { + "epoch": 0.2054038454601205, + "grad_norm": 2.1190950590939326, + "learning_rate": 3.6858754969637822e-06, + "loss": 0.9309, + "step": 2847 + }, + { + "epoch": 0.205475992929548, + "grad_norm": 2.6324228966299583, + "learning_rate": 3.6856240061663496e-06, + "loss": 0.9877, + "step": 2848 + }, + { + "epoch": 0.2055481403989755, + "grad_norm": 2.023208664138482, + "learning_rate": 3.6853724233232307e-06, + "loss": 0.988, + "step": 2849 + }, + { + "epoch": 0.20562028786840303, + "grad_norm": 2.316985857081148, + "learning_rate": 3.6851207484481617e-06, + "loss": 1.0278, + "step": 2850 + }, + { + "epoch": 0.20569243533783052, + "grad_norm": 2.0492646922270907, + "learning_rate": 3.6848689815548876e-06, + "loss": 0.9727, + "step": 2851 + }, + { + "epoch": 0.20576458280725804, + "grad_norm": 2.7669584863414047, + "learning_rate": 3.6846171226571547e-06, + "loss": 0.8657, + "step": 2852 + }, + { + "epoch": 0.20583673027668556, + "grad_norm": 3.1120187839343223, + "learning_rate": 3.6843651717687178e-06, + "loss": 0.952, + "step": 2853 + }, + { + "epoch": 0.20590887774611305, + "grad_norm": 2.230766530376793, + "learning_rate": 3.684113128903334e-06, + "loss": 1.0167, + "step": 2854 + }, + { + "epoch": 0.20598102521554057, + "grad_norm": 2.8962482914365983, + "learning_rate": 3.6838609940747665e-06, + "loss": 0.9484, + "step": 2855 + }, + { + "epoch": 0.20605317268496806, + "grad_norm": 2.3629653593438964, + "learning_rate": 3.6836087672967837e-06, + "loss": 0.995, + "step": 2856 + }, + { + "epoch": 0.20612532015439558, + "grad_norm": 3.7091589590789744, + "learning_rate": 3.683356448583158e-06, + "loss": 0.9548, + "step": 2857 + }, + { + "epoch": 0.2061974676238231, + "grad_norm": 2.56138847248566, + "learning_rate": 3.683104037947669e-06, + "loss": 0.963, + "step": 2858 + }, + { + "epoch": 0.2062696150932506, + "grad_norm": 1.9340065871819843, + "learning_rate": 3.682851535404099e-06, + "loss": 0.9314, + "step": 2859 + }, + { + "epoch": 0.2063417625626781, + "grad_norm": 2.332662576813377, + "learning_rate": 3.682598940966236e-06, + "loss": 1.0425, + "step": 2860 + }, + { + "epoch": 0.20641391003210563, + "grad_norm": 2.351786859398052, + "learning_rate": 3.682346254647874e-06, + "loss": 0.8976, + "step": 2861 + }, + { + "epoch": 0.20648605750153312, + "grad_norm": 2.940786430638485, + "learning_rate": 3.6820934764628102e-06, + "loss": 0.9676, + "step": 2862 + }, + { + "epoch": 0.20655820497096064, + "grad_norm": 0.731915718659419, + "learning_rate": 3.681840606424849e-06, + "loss": 0.7828, + "step": 2863 + }, + { + "epoch": 0.20663035244038816, + "grad_norm": 2.669457217276461, + "learning_rate": 3.6815876445477983e-06, + "loss": 0.9424, + "step": 2864 + }, + { + "epoch": 0.20670249990981565, + "grad_norm": 2.1762791171856697, + "learning_rate": 3.6813345908454716e-06, + "loss": 1.0075, + "step": 2865 + }, + { + "epoch": 0.20677464737924317, + "grad_norm": 3.4495803798959783, + "learning_rate": 3.681081445331687e-06, + "loss": 0.9987, + "step": 2866 + }, + { + "epoch": 0.2068467948486707, + "grad_norm": 2.0438267964624566, + "learning_rate": 3.680828208020267e-06, + "loss": 0.931, + "step": 2867 + }, + { + "epoch": 0.20691894231809818, + "grad_norm": 1.984837275448485, + "learning_rate": 3.6805748789250414e-06, + "loss": 0.9537, + "step": 2868 + }, + { + "epoch": 0.2069910897875257, + "grad_norm": 1.898444030499313, + "learning_rate": 3.680321458059843e-06, + "loss": 1.0035, + "step": 2869 + }, + { + "epoch": 0.20706323725695322, + "grad_norm": 2.430430602237488, + "learning_rate": 3.6800679454385096e-06, + "loss": 0.8197, + "step": 2870 + }, + { + "epoch": 0.2071353847263807, + "grad_norm": 2.2681415130695277, + "learning_rate": 3.679814341074886e-06, + "loss": 0.9545, + "step": 2871 + }, + { + "epoch": 0.20720753219580823, + "grad_norm": 2.965171021905473, + "learning_rate": 3.6795606449828185e-06, + "loss": 0.893, + "step": 2872 + }, + { + "epoch": 0.20727967966523575, + "grad_norm": 0.8279523129621462, + "learning_rate": 3.6793068571761624e-06, + "loss": 0.8143, + "step": 2873 + }, + { + "epoch": 0.20735182713466324, + "grad_norm": 2.125988449883509, + "learning_rate": 3.6790529776687753e-06, + "loss": 0.9753, + "step": 2874 + }, + { + "epoch": 0.20742397460409076, + "grad_norm": 4.589961559325837, + "learning_rate": 3.678799006474521e-06, + "loss": 0.9403, + "step": 2875 + }, + { + "epoch": 0.20749612207351828, + "grad_norm": 1.9263080240133705, + "learning_rate": 3.6785449436072672e-06, + "loss": 0.9609, + "step": 2876 + }, + { + "epoch": 0.20756826954294577, + "grad_norm": 2.954456170607052, + "learning_rate": 3.678290789080888e-06, + "loss": 0.975, + "step": 2877 + }, + { + "epoch": 0.2076404170123733, + "grad_norm": 0.7227039218334316, + "learning_rate": 3.678036542909262e-06, + "loss": 0.7805, + "step": 2878 + }, + { + "epoch": 0.2077125644818008, + "grad_norm": 3.0659241509956185, + "learning_rate": 3.677782205106272e-06, + "loss": 0.9468, + "step": 2879 + }, + { + "epoch": 0.2077847119512283, + "grad_norm": 2.390777782469832, + "learning_rate": 3.6775277756858066e-06, + "loss": 0.9855, + "step": 2880 + }, + { + "epoch": 0.20785685942065582, + "grad_norm": 2.79563168643226, + "learning_rate": 3.677273254661759e-06, + "loss": 0.9169, + "step": 2881 + }, + { + "epoch": 0.20792900689008334, + "grad_norm": 2.3699045231254505, + "learning_rate": 3.6770186420480282e-06, + "loss": 0.9678, + "step": 2882 + }, + { + "epoch": 0.20800115435951083, + "grad_norm": 2.335862888943528, + "learning_rate": 3.676763937858518e-06, + "loss": 1.0241, + "step": 2883 + }, + { + "epoch": 0.20807330182893835, + "grad_norm": 1.8314199505474056, + "learning_rate": 3.676509142107136e-06, + "loss": 1.0085, + "step": 2884 + }, + { + "epoch": 0.20814544929836587, + "grad_norm": 3.238220287894968, + "learning_rate": 3.676254254807796e-06, + "loss": 0.9723, + "step": 2885 + }, + { + "epoch": 0.20821759676779336, + "grad_norm": 2.797859657901683, + "learning_rate": 3.6759992759744166e-06, + "loss": 1.0284, + "step": 2886 + }, + { + "epoch": 0.20828974423722088, + "grad_norm": 2.9953750882402836, + "learning_rate": 3.6757442056209204e-06, + "loss": 1.0439, + "step": 2887 + }, + { + "epoch": 0.2083618917066484, + "grad_norm": 3.0263248033192913, + "learning_rate": 3.675489043761237e-06, + "loss": 0.89, + "step": 2888 + }, + { + "epoch": 0.2084340391760759, + "grad_norm": 5.4964294981736606, + "learning_rate": 3.675233790409299e-06, + "loss": 0.9631, + "step": 2889 + }, + { + "epoch": 0.2085061866455034, + "grad_norm": 3.0555144066399964, + "learning_rate": 3.6749784455790457e-06, + "loss": 0.888, + "step": 2890 + }, + { + "epoch": 0.20857833411493093, + "grad_norm": 2.554310062069301, + "learning_rate": 3.67472300928442e-06, + "loss": 0.9756, + "step": 2891 + }, + { + "epoch": 0.20865048158435842, + "grad_norm": 1.9278807425080458, + "learning_rate": 3.674467481539371e-06, + "loss": 0.9102, + "step": 2892 + }, + { + "epoch": 0.20872262905378594, + "grad_norm": 2.610629117276135, + "learning_rate": 3.6742118623578506e-06, + "loss": 1.0275, + "step": 2893 + }, + { + "epoch": 0.20879477652321346, + "grad_norm": 3.7854556820499377, + "learning_rate": 3.673956151753818e-06, + "loss": 0.9659, + "step": 2894 + }, + { + "epoch": 0.20886692399264095, + "grad_norm": 2.334373026774235, + "learning_rate": 3.673700349741237e-06, + "loss": 0.9168, + "step": 2895 + }, + { + "epoch": 0.20893907146206847, + "grad_norm": 0.7504312546953795, + "learning_rate": 3.673444456334076e-06, + "loss": 0.7736, + "step": 2896 + }, + { + "epoch": 0.209011218931496, + "grad_norm": 3.100978844445923, + "learning_rate": 3.673188471546308e-06, + "loss": 1.0204, + "step": 2897 + }, + { + "epoch": 0.20908336640092348, + "grad_norm": 2.7547936707374223, + "learning_rate": 3.6729323953919115e-06, + "loss": 1.0685, + "step": 2898 + }, + { + "epoch": 0.209155513870351, + "grad_norm": 2.760990116846398, + "learning_rate": 3.672676227884869e-06, + "loss": 0.9672, + "step": 2899 + }, + { + "epoch": 0.20922766133977852, + "grad_norm": 2.059131182996917, + "learning_rate": 3.672419969039171e-06, + "loss": 1.0047, + "step": 2900 + }, + { + "epoch": 0.209299808809206, + "grad_norm": 2.6065018420129675, + "learning_rate": 3.6721636188688086e-06, + "loss": 0.9323, + "step": 2901 + }, + { + "epoch": 0.20937195627863353, + "grad_norm": 3.015509094224568, + "learning_rate": 3.6719071773877815e-06, + "loss": 1.05, + "step": 2902 + }, + { + "epoch": 0.20944410374806105, + "grad_norm": 3.2107046372374595, + "learning_rate": 3.671650644610093e-06, + "loss": 0.958, + "step": 2903 + }, + { + "epoch": 0.20951625121748854, + "grad_norm": 0.8384713603493008, + "learning_rate": 3.671394020549751e-06, + "loss": 0.8608, + "step": 2904 + }, + { + "epoch": 0.20958839868691606, + "grad_norm": 2.563563149579332, + "learning_rate": 3.6711373052207686e-06, + "loss": 0.9719, + "step": 2905 + }, + { + "epoch": 0.20966054615634355, + "grad_norm": 2.7997012217377333, + "learning_rate": 3.6708804986371637e-06, + "loss": 0.9193, + "step": 2906 + }, + { + "epoch": 0.20973269362577107, + "grad_norm": 0.8274330991456086, + "learning_rate": 3.670623600812961e-06, + "loss": 0.8354, + "step": 2907 + }, + { + "epoch": 0.2098048410951986, + "grad_norm": 3.226872242245335, + "learning_rate": 3.670366611762188e-06, + "loss": 0.991, + "step": 2908 + }, + { + "epoch": 0.20987698856462608, + "grad_norm": 2.3107762484877137, + "learning_rate": 3.670109531498877e-06, + "loss": 0.963, + "step": 2909 + }, + { + "epoch": 0.2099491360340536, + "grad_norm": 2.827222142551759, + "learning_rate": 3.669852360037067e-06, + "loss": 0.98, + "step": 2910 + }, + { + "epoch": 0.21002128350348112, + "grad_norm": 2.245546808407698, + "learning_rate": 3.6695950973908022e-06, + "loss": 0.9833, + "step": 2911 + }, + { + "epoch": 0.2100934309729086, + "grad_norm": 2.6504328140353017, + "learning_rate": 3.6693377435741295e-06, + "loss": 0.901, + "step": 2912 + }, + { + "epoch": 0.21016557844233613, + "grad_norm": 4.697701910936725, + "learning_rate": 3.669080298601102e-06, + "loss": 0.9224, + "step": 2913 + }, + { + "epoch": 0.21023772591176365, + "grad_norm": 2.1822121058659247, + "learning_rate": 3.6688227624857777e-06, + "loss": 1.0534, + "step": 2914 + }, + { + "epoch": 0.21030987338119114, + "grad_norm": 3.0115297013258515, + "learning_rate": 3.6685651352422207e-06, + "loss": 0.8768, + "step": 2915 + }, + { + "epoch": 0.21038202085061866, + "grad_norm": 2.736997643628289, + "learning_rate": 3.6683074168844984e-06, + "loss": 0.9084, + "step": 2916 + }, + { + "epoch": 0.21045416832004618, + "grad_norm": 0.9074808509634886, + "learning_rate": 3.668049607426684e-06, + "loss": 0.8837, + "step": 2917 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 2.9431989862778813, + "learning_rate": 3.667791706882855e-06, + "loss": 0.9205, + "step": 2918 + }, + { + "epoch": 0.2105984632589012, + "grad_norm": 2.887821632076075, + "learning_rate": 3.6675337152670954e-06, + "loss": 0.9173, + "step": 2919 + }, + { + "epoch": 0.2106706107283287, + "grad_norm": 4.497060875566917, + "learning_rate": 3.667275632593492e-06, + "loss": 0.9078, + "step": 2920 + }, + { + "epoch": 0.2107427581977562, + "grad_norm": 2.583268646742943, + "learning_rate": 3.667017458876139e-06, + "loss": 0.9072, + "step": 2921 + }, + { + "epoch": 0.21081490566718372, + "grad_norm": 2.8320632406598336, + "learning_rate": 3.666759194129133e-06, + "loss": 0.9828, + "step": 2922 + }, + { + "epoch": 0.21088705313661124, + "grad_norm": 2.96677774871427, + "learning_rate": 3.666500838366578e-06, + "loss": 0.8973, + "step": 2923 + }, + { + "epoch": 0.21095920060603873, + "grad_norm": 2.5637497994261484, + "learning_rate": 3.6662423916025812e-06, + "loss": 1.0642, + "step": 2924 + }, + { + "epoch": 0.21103134807546625, + "grad_norm": 2.674104886607868, + "learning_rate": 3.665983853851256e-06, + "loss": 0.9674, + "step": 2925 + }, + { + "epoch": 0.21110349554489377, + "grad_norm": 1.0271124633826196, + "learning_rate": 3.6657252251267193e-06, + "loss": 0.8952, + "step": 2926 + }, + { + "epoch": 0.21117564301432126, + "grad_norm": 3.054004485511109, + "learning_rate": 3.6654665054430946e-06, + "loss": 0.9976, + "step": 2927 + }, + { + "epoch": 0.21124779048374878, + "grad_norm": 0.7617437896525437, + "learning_rate": 3.6652076948145092e-06, + "loss": 0.8145, + "step": 2928 + }, + { + "epoch": 0.2113199379531763, + "grad_norm": 3.0697809977497594, + "learning_rate": 3.664948793255096e-06, + "loss": 0.896, + "step": 2929 + }, + { + "epoch": 0.2113920854226038, + "grad_norm": 2.933386499472179, + "learning_rate": 3.6646898007789934e-06, + "loss": 0.9606, + "step": 2930 + }, + { + "epoch": 0.2114642328920313, + "grad_norm": 2.5558795539284422, + "learning_rate": 3.664430717400342e-06, + "loss": 1.0306, + "step": 2931 + }, + { + "epoch": 0.21153638036145883, + "grad_norm": 2.05069417132835, + "learning_rate": 3.6641715431332914e-06, + "loss": 0.9825, + "step": 2932 + }, + { + "epoch": 0.21160852783088632, + "grad_norm": 2.831738266094876, + "learning_rate": 3.6639122779919935e-06, + "loss": 0.845, + "step": 2933 + }, + { + "epoch": 0.21168067530031384, + "grad_norm": 2.68136170694763, + "learning_rate": 3.6636529219906054e-06, + "loss": 0.878, + "step": 2934 + }, + { + "epoch": 0.21175282276974136, + "grad_norm": 2.2505064500276846, + "learning_rate": 3.6633934751432897e-06, + "loss": 0.9929, + "step": 2935 + }, + { + "epoch": 0.21182497023916885, + "grad_norm": 3.43476203771634, + "learning_rate": 3.6631339374642138e-06, + "loss": 1.0224, + "step": 2936 + }, + { + "epoch": 0.21189711770859637, + "grad_norm": 2.4705739104365874, + "learning_rate": 3.662874308967551e-06, + "loss": 0.9773, + "step": 2937 + }, + { + "epoch": 0.2119692651780239, + "grad_norm": 1.9453921014516593, + "learning_rate": 3.662614589667478e-06, + "loss": 0.9194, + "step": 2938 + }, + { + "epoch": 0.21204141264745138, + "grad_norm": 2.4713008594826804, + "learning_rate": 3.662354779578176e-06, + "loss": 1.1576, + "step": 2939 + }, + { + "epoch": 0.2121135601168789, + "grad_norm": 2.7185539964200482, + "learning_rate": 3.662094878713834e-06, + "loss": 0.9895, + "step": 2940 + }, + { + "epoch": 0.21218570758630642, + "grad_norm": 2.366365396997715, + "learning_rate": 3.6618348870886435e-06, + "loss": 0.8382, + "step": 2941 + }, + { + "epoch": 0.2122578550557339, + "grad_norm": 2.9021802730394333, + "learning_rate": 3.661574804716802e-06, + "loss": 0.9497, + "step": 2942 + }, + { + "epoch": 0.21233000252516143, + "grad_norm": 2.038408313327647, + "learning_rate": 3.6613146316125105e-06, + "loss": 0.9609, + "step": 2943 + }, + { + "epoch": 0.21240214999458895, + "grad_norm": 2.496196046264295, + "learning_rate": 3.661054367789978e-06, + "loss": 0.8733, + "step": 2944 + }, + { + "epoch": 0.21247429746401644, + "grad_norm": 2.7825873085164106, + "learning_rate": 3.660794013263415e-06, + "loss": 1.0266, + "step": 2945 + }, + { + "epoch": 0.21254644493344396, + "grad_norm": 1.926496144781703, + "learning_rate": 3.6605335680470384e-06, + "loss": 1.0036, + "step": 2946 + }, + { + "epoch": 0.21261859240287148, + "grad_norm": 1.816325655554071, + "learning_rate": 3.6602730321550713e-06, + "loss": 0.9873, + "step": 2947 + }, + { + "epoch": 0.21269073987229897, + "grad_norm": 2.0945050600400092, + "learning_rate": 3.6600124056017404e-06, + "loss": 0.9963, + "step": 2948 + }, + { + "epoch": 0.2127628873417265, + "grad_norm": 2.923147648561901, + "learning_rate": 3.659751688401277e-06, + "loss": 0.9258, + "step": 2949 + }, + { + "epoch": 0.212835034811154, + "grad_norm": 5.883284551113859, + "learning_rate": 3.659490880567918e-06, + "loss": 0.8512, + "step": 2950 + }, + { + "epoch": 0.2129071822805815, + "grad_norm": 2.680798536586106, + "learning_rate": 3.6592299821159048e-06, + "loss": 0.9991, + "step": 2951 + }, + { + "epoch": 0.21297932975000902, + "grad_norm": 2.904942043862191, + "learning_rate": 3.658968993059485e-06, + "loss": 0.9932, + "step": 2952 + }, + { + "epoch": 0.21305147721943654, + "grad_norm": 2.4209036571204825, + "learning_rate": 3.65870791341291e-06, + "loss": 1.0122, + "step": 2953 + }, + { + "epoch": 0.21312362468886403, + "grad_norm": 4.214542805955638, + "learning_rate": 3.6584467431904357e-06, + "loss": 0.97, + "step": 2954 + }, + { + "epoch": 0.21319577215829155, + "grad_norm": 2.4653754046336847, + "learning_rate": 3.6581854824063243e-06, + "loss": 0.9652, + "step": 2955 + }, + { + "epoch": 0.21326791962771907, + "grad_norm": 3.1596848333100582, + "learning_rate": 3.657924131074842e-06, + "loss": 0.9841, + "step": 2956 + }, + { + "epoch": 0.21334006709714656, + "grad_norm": 3.8542702463339156, + "learning_rate": 3.6576626892102605e-06, + "loss": 0.8808, + "step": 2957 + }, + { + "epoch": 0.21341221456657408, + "grad_norm": 4.258108594205745, + "learning_rate": 3.6574011568268556e-06, + "loss": 1.0398, + "step": 2958 + }, + { + "epoch": 0.21348436203600157, + "grad_norm": 1.9952121885650864, + "learning_rate": 3.657139533938909e-06, + "loss": 0.9185, + "step": 2959 + }, + { + "epoch": 0.2135565095054291, + "grad_norm": 2.180141934840157, + "learning_rate": 3.656877820560708e-06, + "loss": 1.019, + "step": 2960 + }, + { + "epoch": 0.2136286569748566, + "grad_norm": 2.475763330367659, + "learning_rate": 3.6566160167065415e-06, + "loss": 0.9007, + "step": 2961 + }, + { + "epoch": 0.2137008044442841, + "grad_norm": 2.510442762648149, + "learning_rate": 3.6563541223907076e-06, + "loss": 0.9275, + "step": 2962 + }, + { + "epoch": 0.21377295191371162, + "grad_norm": 4.920054459309202, + "learning_rate": 3.6560921376275068e-06, + "loss": 0.9902, + "step": 2963 + }, + { + "epoch": 0.21384509938313914, + "grad_norm": 2.1661037912131222, + "learning_rate": 3.655830062431244e-06, + "loss": 0.9563, + "step": 2964 + }, + { + "epoch": 0.21391724685256663, + "grad_norm": 3.535248779434782, + "learning_rate": 3.655567896816232e-06, + "loss": 0.9965, + "step": 2965 + }, + { + "epoch": 0.21398939432199415, + "grad_norm": 2.6136124063230644, + "learning_rate": 3.6553056407967858e-06, + "loss": 1.0183, + "step": 2966 + }, + { + "epoch": 0.21406154179142167, + "grad_norm": 2.3432305332532297, + "learning_rate": 3.6550432943872265e-06, + "loss": 0.9604, + "step": 2967 + }, + { + "epoch": 0.21413368926084916, + "grad_norm": 2.2044618487034238, + "learning_rate": 3.6547808576018796e-06, + "loss": 1.032, + "step": 2968 + }, + { + "epoch": 0.21420583673027668, + "grad_norm": 2.718566888364837, + "learning_rate": 3.654518330455075e-06, + "loss": 0.7857, + "step": 2969 + }, + { + "epoch": 0.2142779841997042, + "grad_norm": 2.269757310438941, + "learning_rate": 3.6542557129611505e-06, + "loss": 1.0095, + "step": 2970 + }, + { + "epoch": 0.2143501316691317, + "grad_norm": 2.6486826195276847, + "learning_rate": 3.6539930051344445e-06, + "loss": 1.0222, + "step": 2971 + }, + { + "epoch": 0.2144222791385592, + "grad_norm": 2.866824924920684, + "learning_rate": 3.6537302069893035e-06, + "loss": 1.0697, + "step": 2972 + }, + { + "epoch": 0.21449442660798673, + "grad_norm": 2.6208648809122197, + "learning_rate": 3.6534673185400776e-06, + "loss": 0.8969, + "step": 2973 + }, + { + "epoch": 0.21456657407741422, + "grad_norm": 2.4042837853776504, + "learning_rate": 3.6532043398011227e-06, + "loss": 0.9606, + "step": 2974 + }, + { + "epoch": 0.21463872154684174, + "grad_norm": 2.5286302549679167, + "learning_rate": 3.6529412707867994e-06, + "loss": 0.9811, + "step": 2975 + }, + { + "epoch": 0.21471086901626926, + "grad_norm": 2.1251830975805794, + "learning_rate": 3.6526781115114708e-06, + "loss": 0.8914, + "step": 2976 + }, + { + "epoch": 0.21478301648569675, + "grad_norm": 2.496851655280418, + "learning_rate": 3.65241486198951e-06, + "loss": 0.9, + "step": 2977 + }, + { + "epoch": 0.21485516395512427, + "grad_norm": 2.3246259573035895, + "learning_rate": 3.6521515222352893e-06, + "loss": 1.0112, + "step": 2978 + }, + { + "epoch": 0.2149273114245518, + "grad_norm": 1.9695201154709618, + "learning_rate": 3.651888092263191e-06, + "loss": 0.9432, + "step": 2979 + }, + { + "epoch": 0.21499945889397928, + "grad_norm": 3.2456141001582424, + "learning_rate": 3.6516245720875986e-06, + "loss": 0.9622, + "step": 2980 + }, + { + "epoch": 0.2150716063634068, + "grad_norm": 2.0985470377495035, + "learning_rate": 3.6513609617229024e-06, + "loss": 1.0162, + "step": 2981 + }, + { + "epoch": 0.21514375383283432, + "grad_norm": 2.512481715956627, + "learning_rate": 3.6510972611834976e-06, + "loss": 0.9609, + "step": 2982 + }, + { + "epoch": 0.21521590130226181, + "grad_norm": 2.4968269238480643, + "learning_rate": 3.650833470483783e-06, + "loss": 0.9334, + "step": 2983 + }, + { + "epoch": 0.21528804877168933, + "grad_norm": 2.8541869745204957, + "learning_rate": 3.6505695896381643e-06, + "loss": 0.9364, + "step": 2984 + }, + { + "epoch": 0.21536019624111685, + "grad_norm": 5.374262382107278, + "learning_rate": 3.65030561866105e-06, + "loss": 0.9371, + "step": 2985 + }, + { + "epoch": 0.21543234371054434, + "grad_norm": 2.6619055285811366, + "learning_rate": 3.6500415575668557e-06, + "loss": 1.0556, + "step": 2986 + }, + { + "epoch": 0.21550449117997186, + "grad_norm": 3.2598526601903193, + "learning_rate": 3.6497774063699994e-06, + "loss": 0.9492, + "step": 2987 + }, + { + "epoch": 0.21557663864939938, + "grad_norm": 2.717875986909356, + "learning_rate": 3.6495131650849068e-06, + "loss": 0.9158, + "step": 2988 + }, + { + "epoch": 0.21564878611882687, + "grad_norm": 2.4047889732679097, + "learning_rate": 3.6492488337260065e-06, + "loss": 1.0383, + "step": 2989 + }, + { + "epoch": 0.2157209335882544, + "grad_norm": 2.5793740685740647, + "learning_rate": 3.648984412307732e-06, + "loss": 1.123, + "step": 2990 + }, + { + "epoch": 0.2157930810576819, + "grad_norm": 4.201404091603471, + "learning_rate": 3.648719900844524e-06, + "loss": 1.0423, + "step": 2991 + }, + { + "epoch": 0.2158652285271094, + "grad_norm": 0.8215423810199634, + "learning_rate": 3.648455299350825e-06, + "loss": 0.7954, + "step": 2992 + }, + { + "epoch": 0.21593737599653692, + "grad_norm": 1.8985061947046122, + "learning_rate": 3.648190607841085e-06, + "loss": 0.948, + "step": 2993 + }, + { + "epoch": 0.21600952346596444, + "grad_norm": 0.7721798236979455, + "learning_rate": 3.647925826329757e-06, + "loss": 0.7892, + "step": 2994 + }, + { + "epoch": 0.21608167093539193, + "grad_norm": 1.8978749764389744, + "learning_rate": 3.6476609548312996e-06, + "loss": 0.9734, + "step": 2995 + }, + { + "epoch": 0.21615381840481945, + "grad_norm": 1.8587924461418124, + "learning_rate": 3.6473959933601777e-06, + "loss": 0.9266, + "step": 2996 + }, + { + "epoch": 0.21622596587424697, + "grad_norm": 2.610244058035759, + "learning_rate": 3.6471309419308584e-06, + "loss": 1.0173, + "step": 2997 + }, + { + "epoch": 0.21629811334367446, + "grad_norm": 1.7776875257762645, + "learning_rate": 3.6468658005578163e-06, + "loss": 1.0316, + "step": 2998 + }, + { + "epoch": 0.21637026081310198, + "grad_norm": 1.6525688288633726, + "learning_rate": 3.646600569255529e-06, + "loss": 0.9638, + "step": 2999 + }, + { + "epoch": 0.2164424082825295, + "grad_norm": 2.634525273970522, + "learning_rate": 3.64633524803848e-06, + "loss": 1.0175, + "step": 3000 + }, + { + "epoch": 0.216514555751957, + "grad_norm": 1.7987303733152562, + "learning_rate": 3.6460698369211576e-06, + "loss": 0.9494, + "step": 3001 + }, + { + "epoch": 0.2165867032213845, + "grad_norm": 2.1831364297772384, + "learning_rate": 3.645804335918056e-06, + "loss": 1.031, + "step": 3002 + }, + { + "epoch": 0.21665885069081203, + "grad_norm": 1.9446086912570422, + "learning_rate": 3.6455387450436712e-06, + "loss": 0.918, + "step": 3003 + }, + { + "epoch": 0.21673099816023952, + "grad_norm": 2.2070553398505535, + "learning_rate": 3.645273064312507e-06, + "loss": 1.0342, + "step": 3004 + }, + { + "epoch": 0.21680314562966704, + "grad_norm": 2.0238830782776396, + "learning_rate": 3.6450072937390717e-06, + "loss": 0.9658, + "step": 3005 + }, + { + "epoch": 0.21687529309909456, + "grad_norm": 1.9508914841506482, + "learning_rate": 3.6447414333378775e-06, + "loss": 0.9317, + "step": 3006 + }, + { + "epoch": 0.21694744056852205, + "grad_norm": 2.0575360760590975, + "learning_rate": 3.6444754831234425e-06, + "loss": 0.9785, + "step": 3007 + }, + { + "epoch": 0.21701958803794957, + "grad_norm": 2.4545981484996022, + "learning_rate": 3.6442094431102886e-06, + "loss": 0.8818, + "step": 3008 + }, + { + "epoch": 0.21709173550737706, + "grad_norm": 2.829462230526686, + "learning_rate": 3.6439433133129442e-06, + "loss": 0.9776, + "step": 3009 + }, + { + "epoch": 0.21716388297680458, + "grad_norm": 1.948149546473788, + "learning_rate": 3.643677093745941e-06, + "loss": 1.0444, + "step": 3010 + }, + { + "epoch": 0.2172360304462321, + "grad_norm": 2.497813699321824, + "learning_rate": 3.643410784423816e-06, + "loss": 1.0432, + "step": 3011 + }, + { + "epoch": 0.2173081779156596, + "grad_norm": 2.4582886988149277, + "learning_rate": 3.643144385361112e-06, + "loss": 1.0394, + "step": 3012 + }, + { + "epoch": 0.21738032538508711, + "grad_norm": 3.1439080989209036, + "learning_rate": 3.6428778965723755e-06, + "loss": 0.9604, + "step": 3013 + }, + { + "epoch": 0.21745247285451463, + "grad_norm": 2.927099931518266, + "learning_rate": 3.6426113180721594e-06, + "loss": 1.0319, + "step": 3014 + }, + { + "epoch": 0.21752462032394212, + "grad_norm": 4.001121125462227, + "learning_rate": 3.6423446498750197e-06, + "loss": 0.9075, + "step": 3015 + }, + { + "epoch": 0.21759676779336964, + "grad_norm": 0.7391069427096025, + "learning_rate": 3.6420778919955184e-06, + "loss": 0.7511, + "step": 3016 + }, + { + "epoch": 0.21766891526279716, + "grad_norm": 1.6751266659341997, + "learning_rate": 3.6418110444482226e-06, + "loss": 0.9859, + "step": 3017 + }, + { + "epoch": 0.21774106273222466, + "grad_norm": 2.40326287279617, + "learning_rate": 3.6415441072477027e-06, + "loss": 0.9085, + "step": 3018 + }, + { + "epoch": 0.21781321020165217, + "grad_norm": 2.0127890100931123, + "learning_rate": 3.6412770804085368e-06, + "loss": 0.9941, + "step": 3019 + }, + { + "epoch": 0.2178853576710797, + "grad_norm": 3.633907030174971, + "learning_rate": 3.6410099639453042e-06, + "loss": 0.9044, + "step": 3020 + }, + { + "epoch": 0.21795750514050719, + "grad_norm": 2.7187099234597834, + "learning_rate": 3.640742757872593e-06, + "loss": 0.9059, + "step": 3021 + }, + { + "epoch": 0.2180296526099347, + "grad_norm": 3.142348087253359, + "learning_rate": 3.6404754622049937e-06, + "loss": 0.9933, + "step": 3022 + }, + { + "epoch": 0.21810180007936222, + "grad_norm": 2.254838932315844, + "learning_rate": 3.6402080769571023e-06, + "loss": 0.9789, + "step": 3023 + }, + { + "epoch": 0.21817394754878972, + "grad_norm": 2.2530605904820993, + "learning_rate": 3.639940602143519e-06, + "loss": 1.0475, + "step": 3024 + }, + { + "epoch": 0.21824609501821723, + "grad_norm": 2.059351396208327, + "learning_rate": 3.6396730377788506e-06, + "loss": 0.9837, + "step": 3025 + }, + { + "epoch": 0.21831824248764475, + "grad_norm": 2.299192855020304, + "learning_rate": 3.639405383877708e-06, + "loss": 0.9427, + "step": 3026 + }, + { + "epoch": 0.21839038995707225, + "grad_norm": 2.0105874445297895, + "learning_rate": 3.6391376404547053e-06, + "loss": 0.9571, + "step": 3027 + }, + { + "epoch": 0.21846253742649976, + "grad_norm": 2.547904526147955, + "learning_rate": 3.6388698075244644e-06, + "loss": 0.9821, + "step": 3028 + }, + { + "epoch": 0.21853468489592728, + "grad_norm": 2.8092121828145573, + "learning_rate": 3.6386018851016102e-06, + "loss": 0.9933, + "step": 3029 + }, + { + "epoch": 0.21860683236535478, + "grad_norm": 3.187467986301398, + "learning_rate": 3.638333873200773e-06, + "loss": 0.7693, + "step": 3030 + }, + { + "epoch": 0.2186789798347823, + "grad_norm": 13.607084201933771, + "learning_rate": 3.6380657718365877e-06, + "loss": 0.9183, + "step": 3031 + }, + { + "epoch": 0.2187511273042098, + "grad_norm": 2.8568345755064435, + "learning_rate": 3.6377975810236946e-06, + "loss": 0.9244, + "step": 3032 + }, + { + "epoch": 0.2188232747736373, + "grad_norm": 2.0047195301187886, + "learning_rate": 3.637529300776739e-06, + "loss": 0.9772, + "step": 3033 + }, + { + "epoch": 0.21889542224306482, + "grad_norm": 0.9185709824314149, + "learning_rate": 3.6372609311103694e-06, + "loss": 0.8829, + "step": 3034 + }, + { + "epoch": 0.21896756971249234, + "grad_norm": 2.4081344161553715, + "learning_rate": 3.636992472039242e-06, + "loss": 0.9961, + "step": 3035 + }, + { + "epoch": 0.21903971718191984, + "grad_norm": 2.7223344172066124, + "learning_rate": 3.6367239235780156e-06, + "loss": 0.9981, + "step": 3036 + }, + { + "epoch": 0.21911186465134735, + "grad_norm": 2.8371994531579046, + "learning_rate": 3.636455285741354e-06, + "loss": 0.9819, + "step": 3037 + }, + { + "epoch": 0.21918401212077487, + "grad_norm": 1.8319955915872763, + "learning_rate": 3.636186558543928e-06, + "loss": 0.86, + "step": 3038 + }, + { + "epoch": 0.21925615959020237, + "grad_norm": 2.9950764307831794, + "learning_rate": 3.6359177420004104e-06, + "loss": 0.9848, + "step": 3039 + }, + { + "epoch": 0.21932830705962988, + "grad_norm": 1.993132722300764, + "learning_rate": 3.635648836125481e-06, + "loss": 0.9453, + "step": 3040 + }, + { + "epoch": 0.2194004545290574, + "grad_norm": 2.5747278199278085, + "learning_rate": 3.635379840933824e-06, + "loss": 0.9022, + "step": 3041 + }, + { + "epoch": 0.2194726019984849, + "grad_norm": 2.794856104686705, + "learning_rate": 3.6351107564401274e-06, + "loss": 0.9648, + "step": 3042 + }, + { + "epoch": 0.21954474946791241, + "grad_norm": 2.486048200960959, + "learning_rate": 3.634841582659085e-06, + "loss": 0.9226, + "step": 3043 + }, + { + "epoch": 0.21961689693733993, + "grad_norm": 2.819844045858325, + "learning_rate": 3.6345723196053964e-06, + "loss": 0.8996, + "step": 3044 + }, + { + "epoch": 0.21968904440676743, + "grad_norm": 1.7819067430294666, + "learning_rate": 3.6343029672937645e-06, + "loss": 0.9958, + "step": 3045 + }, + { + "epoch": 0.21976119187619494, + "grad_norm": 2.976380990969698, + "learning_rate": 3.634033525738897e-06, + "loss": 0.8781, + "step": 3046 + }, + { + "epoch": 0.21983333934562246, + "grad_norm": 3.2216329163587387, + "learning_rate": 3.633763994955508e-06, + "loss": 0.8946, + "step": 3047 + }, + { + "epoch": 0.21990548681504996, + "grad_norm": 2.3945245789377405, + "learning_rate": 3.633494374958315e-06, + "loss": 0.9584, + "step": 3048 + }, + { + "epoch": 0.21997763428447747, + "grad_norm": 2.2918432225504626, + "learning_rate": 3.6332246657620415e-06, + "loss": 0.9144, + "step": 3049 + }, + { + "epoch": 0.220049781753905, + "grad_norm": 1.7937993139207173, + "learning_rate": 3.6329548673814148e-06, + "loss": 1.058, + "step": 3050 + }, + { + "epoch": 0.22012192922333249, + "grad_norm": 2.4540133477479893, + "learning_rate": 3.632684979831168e-06, + "loss": 0.8731, + "step": 3051 + }, + { + "epoch": 0.22019407669276, + "grad_norm": 2.0412993457480497, + "learning_rate": 3.6324150031260377e-06, + "loss": 0.9352, + "step": 3052 + }, + { + "epoch": 0.22026622416218752, + "grad_norm": 2.796275154447409, + "learning_rate": 3.632144937280768e-06, + "loss": 0.9561, + "step": 3053 + }, + { + "epoch": 0.22033837163161502, + "grad_norm": 2.0526530706127373, + "learning_rate": 3.631874782310105e-06, + "loss": 1.0841, + "step": 3054 + }, + { + "epoch": 0.22041051910104253, + "grad_norm": 2.6421568450863897, + "learning_rate": 3.6316045382288012e-06, + "loss": 1.0166, + "step": 3055 + }, + { + "epoch": 0.22048266657047005, + "grad_norm": 3.283296074308905, + "learning_rate": 3.6313342050516133e-06, + "loss": 0.9345, + "step": 3056 + }, + { + "epoch": 0.22055481403989755, + "grad_norm": 2.739331010190688, + "learning_rate": 3.6310637827933035e-06, + "loss": 0.9294, + "step": 3057 + }, + { + "epoch": 0.22062696150932506, + "grad_norm": 2.368058953852995, + "learning_rate": 3.6307932714686382e-06, + "loss": 0.8937, + "step": 3058 + }, + { + "epoch": 0.22069910897875258, + "grad_norm": 2.2814955905768866, + "learning_rate": 3.6305226710923897e-06, + "loss": 0.9935, + "step": 3059 + }, + { + "epoch": 0.22077125644818008, + "grad_norm": 2.056817752640745, + "learning_rate": 3.6302519816793344e-06, + "loss": 0.9923, + "step": 3060 + }, + { + "epoch": 0.2208434039176076, + "grad_norm": 1.7515287533860209, + "learning_rate": 3.629981203244253e-06, + "loss": 0.9711, + "step": 3061 + }, + { + "epoch": 0.2209155513870351, + "grad_norm": 2.1926223725951837, + "learning_rate": 3.629710335801932e-06, + "loss": 0.8905, + "step": 3062 + }, + { + "epoch": 0.2209876988564626, + "grad_norm": 3.6658521238763053, + "learning_rate": 3.629439379367163e-06, + "loss": 0.9898, + "step": 3063 + }, + { + "epoch": 0.22105984632589012, + "grad_norm": 2.9949646653780064, + "learning_rate": 3.629168333954741e-06, + "loss": 0.9587, + "step": 3064 + }, + { + "epoch": 0.22113199379531762, + "grad_norm": 2.6864804840259393, + "learning_rate": 3.6288971995794673e-06, + "loss": 0.9562, + "step": 3065 + }, + { + "epoch": 0.22120414126474514, + "grad_norm": 2.0003313743727076, + "learning_rate": 3.6286259762561475e-06, + "loss": 0.9113, + "step": 3066 + }, + { + "epoch": 0.22127628873417265, + "grad_norm": 2.933251087885263, + "learning_rate": 3.6283546639995927e-06, + "loss": 1.0077, + "step": 3067 + }, + { + "epoch": 0.22134843620360015, + "grad_norm": 1.8230738835431706, + "learning_rate": 3.628083262824617e-06, + "loss": 1.0654, + "step": 3068 + }, + { + "epoch": 0.22142058367302767, + "grad_norm": 2.2050009241772575, + "learning_rate": 3.6278117727460414e-06, + "loss": 1.0165, + "step": 3069 + }, + { + "epoch": 0.22149273114245518, + "grad_norm": 2.1886584892728274, + "learning_rate": 3.627540193778691e-06, + "loss": 0.9781, + "step": 3070 + }, + { + "epoch": 0.22156487861188268, + "grad_norm": 2.1420888976840002, + "learning_rate": 3.6272685259373957e-06, + "loss": 0.8735, + "step": 3071 + }, + { + "epoch": 0.2216370260813102, + "grad_norm": 3.0985980339995947, + "learning_rate": 3.6269967692369894e-06, + "loss": 0.9318, + "step": 3072 + }, + { + "epoch": 0.22170917355073771, + "grad_norm": 2.046014036859444, + "learning_rate": 3.626724923692313e-06, + "loss": 0.9572, + "step": 3073 + }, + { + "epoch": 0.2217813210201652, + "grad_norm": 2.4991941106785487, + "learning_rate": 3.6264529893182108e-06, + "loss": 1.0481, + "step": 3074 + }, + { + "epoch": 0.22185346848959273, + "grad_norm": 2.3252971295234657, + "learning_rate": 3.6261809661295316e-06, + "loss": 0.9765, + "step": 3075 + }, + { + "epoch": 0.22192561595902024, + "grad_norm": 2.6109618883803076, + "learning_rate": 3.6259088541411293e-06, + "loss": 0.9937, + "step": 3076 + }, + { + "epoch": 0.22199776342844774, + "grad_norm": 0.7148627190079472, + "learning_rate": 3.625636653367864e-06, + "loss": 0.7715, + "step": 3077 + }, + { + "epoch": 0.22206991089787526, + "grad_norm": 2.4166908153335886, + "learning_rate": 3.625364363824598e-06, + "loss": 1.0197, + "step": 3078 + }, + { + "epoch": 0.22214205836730277, + "grad_norm": 2.7701408457840113, + "learning_rate": 3.6250919855262017e-06, + "loss": 0.9252, + "step": 3079 + }, + { + "epoch": 0.22221420583673027, + "grad_norm": 5.30903332864313, + "learning_rate": 3.624819518487548e-06, + "loss": 0.9382, + "step": 3080 + }, + { + "epoch": 0.22228635330615779, + "grad_norm": 2.413610916121102, + "learning_rate": 3.624546962723515e-06, + "loss": 0.9398, + "step": 3081 + }, + { + "epoch": 0.2223585007755853, + "grad_norm": 2.326018333972892, + "learning_rate": 3.624274318248986e-06, + "loss": 0.9923, + "step": 3082 + }, + { + "epoch": 0.2224306482450128, + "grad_norm": 2.573710413105434, + "learning_rate": 3.62400158507885e-06, + "loss": 0.8848, + "step": 3083 + }, + { + "epoch": 0.22250279571444032, + "grad_norm": 2.2883424635462126, + "learning_rate": 3.6237287632279992e-06, + "loss": 1.0971, + "step": 3084 + }, + { + "epoch": 0.22257494318386783, + "grad_norm": 2.0027775550920937, + "learning_rate": 3.623455852711331e-06, + "loss": 0.8998, + "step": 3085 + }, + { + "epoch": 0.22264709065329533, + "grad_norm": 3.8192686437356644, + "learning_rate": 3.623182853543749e-06, + "loss": 0.8398, + "step": 3086 + }, + { + "epoch": 0.22271923812272285, + "grad_norm": 2.002306680860844, + "learning_rate": 3.62290976574016e-06, + "loss": 0.9737, + "step": 3087 + }, + { + "epoch": 0.22279138559215036, + "grad_norm": 0.90632908574683, + "learning_rate": 3.622636589315476e-06, + "loss": 0.8706, + "step": 3088 + }, + { + "epoch": 0.22286353306157786, + "grad_norm": 0.9821272421499679, + "learning_rate": 3.622363324284615e-06, + "loss": 0.8388, + "step": 3089 + }, + { + "epoch": 0.22293568053100538, + "grad_norm": 3.108773178057129, + "learning_rate": 3.622089970662499e-06, + "loss": 0.9382, + "step": 3090 + }, + { + "epoch": 0.2230078280004329, + "grad_norm": 3.7168610206958195, + "learning_rate": 3.6218165284640546e-06, + "loss": 0.8067, + "step": 3091 + }, + { + "epoch": 0.2230799754698604, + "grad_norm": 0.6608162013363076, + "learning_rate": 3.621542997704212e-06, + "loss": 0.7773, + "step": 3092 + }, + { + "epoch": 0.2231521229392879, + "grad_norm": 3.150828624911163, + "learning_rate": 3.6212693783979103e-06, + "loss": 0.9228, + "step": 3093 + }, + { + "epoch": 0.22322427040871543, + "grad_norm": 7.846871412269565, + "learning_rate": 3.62099567056009e-06, + "loss": 0.8662, + "step": 3094 + }, + { + "epoch": 0.22329641787814292, + "grad_norm": 0.8069106936076281, + "learning_rate": 3.620721874205696e-06, + "loss": 0.8389, + "step": 3095 + }, + { + "epoch": 0.22336856534757044, + "grad_norm": 2.4654755908482655, + "learning_rate": 3.6204479893496798e-06, + "loss": 0.8268, + "step": 3096 + }, + { + "epoch": 0.22344071281699796, + "grad_norm": 1.7585516159860182, + "learning_rate": 3.620174016006998e-06, + "loss": 0.9809, + "step": 3097 + }, + { + "epoch": 0.22351286028642545, + "grad_norm": 3.246465374805277, + "learning_rate": 3.6198999541926114e-06, + "loss": 1.0259, + "step": 3098 + }, + { + "epoch": 0.22358500775585297, + "grad_norm": 2.48186226696705, + "learning_rate": 3.619625803921484e-06, + "loss": 0.8969, + "step": 3099 + }, + { + "epoch": 0.22365715522528049, + "grad_norm": 2.3006432172887727, + "learning_rate": 3.619351565208588e-06, + "loss": 0.7962, + "step": 3100 + }, + { + "epoch": 0.22372930269470798, + "grad_norm": 2.4900796520561315, + "learning_rate": 3.6190772380688973e-06, + "loss": 0.9448, + "step": 3101 + }, + { + "epoch": 0.2238014501641355, + "grad_norm": 1.8979529245083773, + "learning_rate": 3.618802822517392e-06, + "loss": 0.9337, + "step": 3102 + }, + { + "epoch": 0.22387359763356302, + "grad_norm": 3.1743011967145427, + "learning_rate": 3.6185283185690577e-06, + "loss": 1.0323, + "step": 3103 + }, + { + "epoch": 0.2239457451029905, + "grad_norm": 2.1123449821702778, + "learning_rate": 3.618253726238883e-06, + "loss": 0.9659, + "step": 3104 + }, + { + "epoch": 0.22401789257241803, + "grad_norm": 2.716629780087921, + "learning_rate": 3.617979045541863e-06, + "loss": 0.9267, + "step": 3105 + }, + { + "epoch": 0.22409004004184555, + "grad_norm": 1.9970358698703201, + "learning_rate": 3.617704276492997e-06, + "loss": 1.0159, + "step": 3106 + }, + { + "epoch": 0.22416218751127304, + "grad_norm": 2.197349065255973, + "learning_rate": 3.6174294191072886e-06, + "loss": 0.9803, + "step": 3107 + }, + { + "epoch": 0.22423433498070056, + "grad_norm": 1.8614462847996602, + "learning_rate": 3.617154473399748e-06, + "loss": 1.0759, + "step": 3108 + }, + { + "epoch": 0.22430648245012808, + "grad_norm": 4.230581238044524, + "learning_rate": 3.6168794393853874e-06, + "loss": 0.957, + "step": 3109 + }, + { + "epoch": 0.22437862991955557, + "grad_norm": 2.7739107829559546, + "learning_rate": 3.6166043170792265e-06, + "loss": 0.9743, + "step": 3110 + }, + { + "epoch": 0.22445077738898309, + "grad_norm": 5.7090321505228125, + "learning_rate": 3.616329106496288e-06, + "loss": 1.0195, + "step": 3111 + }, + { + "epoch": 0.22452292485841058, + "grad_norm": 1.9283806085794915, + "learning_rate": 3.6160538076516007e-06, + "loss": 0.9177, + "step": 3112 + }, + { + "epoch": 0.2245950723278381, + "grad_norm": 2.02504710389913, + "learning_rate": 3.615778420560197e-06, + "loss": 0.9334, + "step": 3113 + }, + { + "epoch": 0.22466721979726562, + "grad_norm": 1.941788695215905, + "learning_rate": 3.6155029452371154e-06, + "loss": 0.9247, + "step": 3114 + }, + { + "epoch": 0.2247393672666931, + "grad_norm": 1.834555247612611, + "learning_rate": 3.615227381697398e-06, + "loss": 0.8809, + "step": 3115 + }, + { + "epoch": 0.22481151473612063, + "grad_norm": 2.3031076997394457, + "learning_rate": 3.6149517299560933e-06, + "loss": 0.9083, + "step": 3116 + }, + { + "epoch": 0.22488366220554815, + "grad_norm": 2.0335259231704668, + "learning_rate": 3.6146759900282526e-06, + "loss": 0.9697, + "step": 3117 + }, + { + "epoch": 0.22495580967497564, + "grad_norm": 7.587009028177859, + "learning_rate": 3.6144001619289335e-06, + "loss": 0.9488, + "step": 3118 + }, + { + "epoch": 0.22502795714440316, + "grad_norm": 3.566686612961751, + "learning_rate": 3.6141242456731977e-06, + "loss": 0.9408, + "step": 3119 + }, + { + "epoch": 0.22510010461383068, + "grad_norm": 1.9795279591381176, + "learning_rate": 3.613848241276113e-06, + "loss": 0.9749, + "step": 3120 + }, + { + "epoch": 0.22517225208325817, + "grad_norm": 2.762515545279146, + "learning_rate": 3.6135721487527486e-06, + "loss": 0.8958, + "step": 3121 + }, + { + "epoch": 0.2252443995526857, + "grad_norm": 2.4553137084289895, + "learning_rate": 3.6132959681181835e-06, + "loss": 0.9513, + "step": 3122 + }, + { + "epoch": 0.2253165470221132, + "grad_norm": 2.100546098275452, + "learning_rate": 3.6130196993874976e-06, + "loss": 1.002, + "step": 3123 + }, + { + "epoch": 0.2253886944915407, + "grad_norm": 2.348096657064864, + "learning_rate": 3.6127433425757766e-06, + "loss": 0.8683, + "step": 3124 + }, + { + "epoch": 0.22546084196096822, + "grad_norm": 0.8200616362132305, + "learning_rate": 3.612466897698112e-06, + "loss": 0.7981, + "step": 3125 + }, + { + "epoch": 0.22553298943039574, + "grad_norm": 1.9992519410192988, + "learning_rate": 3.6121903647696e-06, + "loss": 0.8902, + "step": 3126 + }, + { + "epoch": 0.22560513689982323, + "grad_norm": 2.283963405616605, + "learning_rate": 3.611913743805339e-06, + "loss": 0.9853, + "step": 3127 + }, + { + "epoch": 0.22567728436925075, + "grad_norm": 2.8146877680749047, + "learning_rate": 3.611637034820437e-06, + "loss": 0.9764, + "step": 3128 + }, + { + "epoch": 0.22574943183867827, + "grad_norm": 1.9271849184496808, + "learning_rate": 3.611360237830002e-06, + "loss": 1.0382, + "step": 3129 + }, + { + "epoch": 0.22582157930810576, + "grad_norm": 6.419964657029645, + "learning_rate": 3.611083352849149e-06, + "loss": 1.0329, + "step": 3130 + }, + { + "epoch": 0.22589372677753328, + "grad_norm": 2.5312971887781686, + "learning_rate": 3.610806379892999e-06, + "loss": 0.8989, + "step": 3131 + }, + { + "epoch": 0.2259658742469608, + "grad_norm": 2.8298482966116643, + "learning_rate": 3.610529318976675e-06, + "loss": 0.886, + "step": 3132 + }, + { + "epoch": 0.2260380217163883, + "grad_norm": 2.031879855584005, + "learning_rate": 3.6102521701153065e-06, + "loss": 0.9914, + "step": 3133 + }, + { + "epoch": 0.2261101691858158, + "grad_norm": 1.9994949060648473, + "learning_rate": 3.609974933324029e-06, + "loss": 0.9792, + "step": 3134 + }, + { + "epoch": 0.22618231665524333, + "grad_norm": 2.2062742842507572, + "learning_rate": 3.6096976086179797e-06, + "loss": 0.9068, + "step": 3135 + }, + { + "epoch": 0.22625446412467082, + "grad_norm": 3.8316423722390627, + "learning_rate": 3.609420196012303e-06, + "loss": 0.932, + "step": 3136 + }, + { + "epoch": 0.22632661159409834, + "grad_norm": 2.327835577057778, + "learning_rate": 3.6091426955221465e-06, + "loss": 0.9424, + "step": 3137 + }, + { + "epoch": 0.22639875906352586, + "grad_norm": 2.4969225061358933, + "learning_rate": 3.608865107162665e-06, + "loss": 0.9247, + "step": 3138 + }, + { + "epoch": 0.22647090653295335, + "grad_norm": 2.094185399574968, + "learning_rate": 3.6085874309490157e-06, + "loss": 0.9152, + "step": 3139 + }, + { + "epoch": 0.22654305400238087, + "grad_norm": 2.0978586678188837, + "learning_rate": 3.608309666896362e-06, + "loss": 0.8719, + "step": 3140 + }, + { + "epoch": 0.2266152014718084, + "grad_norm": 3.060778542753585, + "learning_rate": 3.6080318150198705e-06, + "loss": 0.9615, + "step": 3141 + }, + { + "epoch": 0.22668734894123588, + "grad_norm": 2.326725481523256, + "learning_rate": 3.6077538753347145e-06, + "loss": 1.0466, + "step": 3142 + }, + { + "epoch": 0.2267594964106634, + "grad_norm": 0.8620174011478796, + "learning_rate": 3.6074758478560716e-06, + "loss": 0.8849, + "step": 3143 + }, + { + "epoch": 0.22683164388009092, + "grad_norm": 4.495976026437179, + "learning_rate": 3.6071977325991227e-06, + "loss": 0.9041, + "step": 3144 + }, + { + "epoch": 0.2269037913495184, + "grad_norm": 2.9462482777049517, + "learning_rate": 3.6069195295790556e-06, + "loss": 0.8478, + "step": 3145 + }, + { + "epoch": 0.22697593881894593, + "grad_norm": 2.4644801238874656, + "learning_rate": 3.6066412388110614e-06, + "loss": 1.0019, + "step": 3146 + }, + { + "epoch": 0.22704808628837345, + "grad_norm": 1.8917373663620447, + "learning_rate": 3.606362860310337e-06, + "loss": 1.0437, + "step": 3147 + }, + { + "epoch": 0.22712023375780094, + "grad_norm": 2.5787548798493196, + "learning_rate": 3.606084394092083e-06, + "loss": 0.8641, + "step": 3148 + }, + { + "epoch": 0.22719238122722846, + "grad_norm": 2.3429215047715006, + "learning_rate": 3.605805840171506e-06, + "loss": 0.9368, + "step": 3149 + }, + { + "epoch": 0.22726452869665598, + "grad_norm": 2.525276671255635, + "learning_rate": 3.6055271985638163e-06, + "loss": 0.9621, + "step": 3150 + }, + { + "epoch": 0.22733667616608347, + "grad_norm": 3.449369019013984, + "learning_rate": 3.6052484692842297e-06, + "loss": 0.938, + "step": 3151 + }, + { + "epoch": 0.227408823635511, + "grad_norm": 3.1233806228993175, + "learning_rate": 3.6049696523479673e-06, + "loss": 0.939, + "step": 3152 + }, + { + "epoch": 0.2274809711049385, + "grad_norm": 2.7140033546376423, + "learning_rate": 3.6046907477702524e-06, + "loss": 0.9801, + "step": 3153 + }, + { + "epoch": 0.227553118574366, + "grad_norm": 4.380501693838467, + "learning_rate": 3.6044117555663166e-06, + "loss": 0.9999, + "step": 3154 + }, + { + "epoch": 0.22762526604379352, + "grad_norm": 2.5317807053310273, + "learning_rate": 3.604132675751395e-06, + "loss": 0.9174, + "step": 3155 + }, + { + "epoch": 0.22769741351322104, + "grad_norm": 4.680682705373107, + "learning_rate": 3.6038535083407256e-06, + "loss": 0.9164, + "step": 3156 + }, + { + "epoch": 0.22776956098264853, + "grad_norm": 4.25496338878978, + "learning_rate": 3.6035742533495533e-06, + "loss": 1.0246, + "step": 3157 + }, + { + "epoch": 0.22784170845207605, + "grad_norm": 3.023856358260198, + "learning_rate": 3.6032949107931276e-06, + "loss": 0.8758, + "step": 3158 + }, + { + "epoch": 0.22791385592150357, + "grad_norm": 2.567784326962154, + "learning_rate": 3.6030154806867018e-06, + "loss": 0.8932, + "step": 3159 + }, + { + "epoch": 0.22798600339093106, + "grad_norm": 4.151666623706019, + "learning_rate": 3.6027359630455347e-06, + "loss": 1.022, + "step": 3160 + }, + { + "epoch": 0.22805815086035858, + "grad_norm": 2.452092141584221, + "learning_rate": 3.6024563578848896e-06, + "loss": 0.991, + "step": 3161 + }, + { + "epoch": 0.2281302983297861, + "grad_norm": 4.716623799878477, + "learning_rate": 3.6021766652200354e-06, + "loss": 0.9892, + "step": 3162 + }, + { + "epoch": 0.2282024457992136, + "grad_norm": 3.0921615992729206, + "learning_rate": 3.601896885066244e-06, + "loss": 0.9606, + "step": 3163 + }, + { + "epoch": 0.2282745932686411, + "grad_norm": 1.947866945771126, + "learning_rate": 3.601617017438794e-06, + "loss": 0.9599, + "step": 3164 + }, + { + "epoch": 0.2283467407380686, + "grad_norm": 0.852259289606682, + "learning_rate": 3.601337062352968e-06, + "loss": 0.8364, + "step": 3165 + }, + { + "epoch": 0.22841888820749612, + "grad_norm": 3.0541935592098843, + "learning_rate": 3.6010570198240523e-06, + "loss": 0.9298, + "step": 3166 + }, + { + "epoch": 0.22849103567692364, + "grad_norm": 3.2094194299581003, + "learning_rate": 3.6007768898673404e-06, + "loss": 0.9121, + "step": 3167 + }, + { + "epoch": 0.22856318314635113, + "grad_norm": 5.523089407348255, + "learning_rate": 3.6004966724981277e-06, + "loss": 1.0129, + "step": 3168 + }, + { + "epoch": 0.22863533061577865, + "grad_norm": 2.690539836532623, + "learning_rate": 3.6002163677317166e-06, + "loss": 1.0111, + "step": 3169 + }, + { + "epoch": 0.22870747808520617, + "grad_norm": 3.2297585385892753, + "learning_rate": 3.599935975583414e-06, + "loss": 0.9551, + "step": 3170 + }, + { + "epoch": 0.22877962555463366, + "grad_norm": 5.238138696151723, + "learning_rate": 3.5996554960685304e-06, + "loss": 0.9239, + "step": 3171 + }, + { + "epoch": 0.22885177302406118, + "grad_norm": 9.385251962879755, + "learning_rate": 3.599374929202382e-06, + "loss": 1.0554, + "step": 3172 + }, + { + "epoch": 0.2289239204934887, + "grad_norm": 3.104999012478535, + "learning_rate": 3.599094275000289e-06, + "loss": 1.088, + "step": 3173 + }, + { + "epoch": 0.2289960679629162, + "grad_norm": 4.728413570898906, + "learning_rate": 3.5988135334775775e-06, + "loss": 0.8835, + "step": 3174 + }, + { + "epoch": 0.2290682154323437, + "grad_norm": 2.0220763588211303, + "learning_rate": 3.5985327046495773e-06, + "loss": 0.8852, + "step": 3175 + }, + { + "epoch": 0.22914036290177123, + "grad_norm": 2.033241234970947, + "learning_rate": 3.5982517885316232e-06, + "loss": 0.9419, + "step": 3176 + }, + { + "epoch": 0.22921251037119872, + "grad_norm": 6.71617609445934, + "learning_rate": 3.597970785139057e-06, + "loss": 0.9936, + "step": 3177 + }, + { + "epoch": 0.22928465784062624, + "grad_norm": 5.262472142669086, + "learning_rate": 3.5976896944872205e-06, + "loss": 0.9749, + "step": 3178 + }, + { + "epoch": 0.22935680531005376, + "grad_norm": 3.3982929769018373, + "learning_rate": 3.597408516591464e-06, + "loss": 1.015, + "step": 3179 + }, + { + "epoch": 0.22942895277948125, + "grad_norm": 3.533718090581247, + "learning_rate": 3.5971272514671426e-06, + "loss": 0.8332, + "step": 3180 + }, + { + "epoch": 0.22950110024890877, + "grad_norm": 2.2941610690361633, + "learning_rate": 3.5968458991296144e-06, + "loss": 1.0437, + "step": 3181 + }, + { + "epoch": 0.2295732477183363, + "grad_norm": 2.0852580017441227, + "learning_rate": 3.5965644595942423e-06, + "loss": 0.9653, + "step": 3182 + }, + { + "epoch": 0.22964539518776378, + "grad_norm": 3.600052589456241, + "learning_rate": 3.596282932876396e-06, + "loss": 0.9799, + "step": 3183 + }, + { + "epoch": 0.2297175426571913, + "grad_norm": 2.7716054963843413, + "learning_rate": 3.5960013189914477e-06, + "loss": 0.8649, + "step": 3184 + }, + { + "epoch": 0.22978969012661882, + "grad_norm": 2.345491398158226, + "learning_rate": 3.5957196179547756e-06, + "loss": 0.9518, + "step": 3185 + }, + { + "epoch": 0.2298618375960463, + "grad_norm": 2.2635748696792226, + "learning_rate": 3.5954378297817617e-06, + "loss": 0.9389, + "step": 3186 + }, + { + "epoch": 0.22993398506547383, + "grad_norm": 2.9343667059670455, + "learning_rate": 3.595155954487795e-06, + "loss": 0.9221, + "step": 3187 + }, + { + "epoch": 0.23000613253490135, + "grad_norm": 2.6828703814635046, + "learning_rate": 3.594873992088266e-06, + "loss": 0.9629, + "step": 3188 + }, + { + "epoch": 0.23007828000432884, + "grad_norm": 2.7839119848391496, + "learning_rate": 3.594591942598573e-06, + "loss": 0.9603, + "step": 3189 + }, + { + "epoch": 0.23015042747375636, + "grad_norm": 3.358537862401011, + "learning_rate": 3.5943098060341166e-06, + "loss": 0.9391, + "step": 3190 + }, + { + "epoch": 0.23022257494318388, + "grad_norm": 2.739202020794355, + "learning_rate": 3.5940275824103033e-06, + "loss": 1.0087, + "step": 3191 + }, + { + "epoch": 0.23029472241261137, + "grad_norm": 3.5525260641925738, + "learning_rate": 3.593745271742545e-06, + "loss": 0.9452, + "step": 3192 + }, + { + "epoch": 0.2303668698820389, + "grad_norm": 2.4710905351837984, + "learning_rate": 3.593462874046257e-06, + "loss": 0.9594, + "step": 3193 + }, + { + "epoch": 0.2304390173514664, + "grad_norm": 2.653399924332415, + "learning_rate": 3.5931803893368604e-06, + "loss": 0.9451, + "step": 3194 + }, + { + "epoch": 0.2305111648208939, + "grad_norm": 12.634488911649484, + "learning_rate": 3.592897817629781e-06, + "loss": 0.9636, + "step": 3195 + }, + { + "epoch": 0.23058331229032142, + "grad_norm": 4.521932137318803, + "learning_rate": 3.592615158940448e-06, + "loss": 0.9274, + "step": 3196 + }, + { + "epoch": 0.23065545975974894, + "grad_norm": 0.864315087096284, + "learning_rate": 3.592332413284296e-06, + "loss": 0.8375, + "step": 3197 + }, + { + "epoch": 0.23072760722917643, + "grad_norm": 0.7733556530735047, + "learning_rate": 3.592049580676767e-06, + "loss": 0.8531, + "step": 3198 + }, + { + "epoch": 0.23079975469860395, + "grad_norm": 2.955501992434232, + "learning_rate": 3.5917666611333032e-06, + "loss": 1.0711, + "step": 3199 + }, + { + "epoch": 0.23087190216803147, + "grad_norm": 2.8061361582886493, + "learning_rate": 3.5914836546693547e-06, + "loss": 0.9813, + "step": 3200 + }, + { + "epoch": 0.23094404963745896, + "grad_norm": 13.104229868790519, + "learning_rate": 3.5912005613003755e-06, + "loss": 0.8489, + "step": 3201 + }, + { + "epoch": 0.23101619710688648, + "grad_norm": 3.467806163602267, + "learning_rate": 3.5909173810418236e-06, + "loss": 1.0278, + "step": 3202 + }, + { + "epoch": 0.231088344576314, + "grad_norm": 5.7274910250809485, + "learning_rate": 3.590634113909163e-06, + "loss": 0.9959, + "step": 3203 + }, + { + "epoch": 0.2311604920457415, + "grad_norm": 2.7797086066248275, + "learning_rate": 3.5903507599178616e-06, + "loss": 0.9536, + "step": 3204 + }, + { + "epoch": 0.231232639515169, + "grad_norm": 5.275009712101823, + "learning_rate": 3.5900673190833932e-06, + "loss": 0.8569, + "step": 3205 + }, + { + "epoch": 0.23130478698459653, + "grad_norm": 3.091338942467221, + "learning_rate": 3.5897837914212342e-06, + "loss": 1.077, + "step": 3206 + }, + { + "epoch": 0.23137693445402402, + "grad_norm": 2.8330517423235704, + "learning_rate": 3.589500176946868e-06, + "loss": 0.9783, + "step": 3207 + }, + { + "epoch": 0.23144908192345154, + "grad_norm": 3.7206302024924125, + "learning_rate": 3.5892164756757807e-06, + "loss": 0.8381, + "step": 3208 + }, + { + "epoch": 0.23152122939287906, + "grad_norm": 0.9303300384211193, + "learning_rate": 3.5889326876234643e-06, + "loss": 0.8476, + "step": 3209 + }, + { + "epoch": 0.23159337686230655, + "grad_norm": 2.2434785428977535, + "learning_rate": 3.5886488128054172e-06, + "loss": 0.9442, + "step": 3210 + }, + { + "epoch": 0.23166552433173407, + "grad_norm": 3.522485710207109, + "learning_rate": 3.5883648512371386e-06, + "loss": 0.93, + "step": 3211 + }, + { + "epoch": 0.2317376718011616, + "grad_norm": 2.1188783544860756, + "learning_rate": 3.5880808029341355e-06, + "loss": 0.8687, + "step": 3212 + }, + { + "epoch": 0.23180981927058908, + "grad_norm": 3.1758711873700247, + "learning_rate": 3.5877966679119183e-06, + "loss": 1.0938, + "step": 3213 + }, + { + "epoch": 0.2318819667400166, + "grad_norm": 2.503443920759158, + "learning_rate": 3.5875124461860034e-06, + "loss": 0.8631, + "step": 3214 + }, + { + "epoch": 0.2319541142094441, + "grad_norm": 2.2539174626697918, + "learning_rate": 3.5872281377719105e-06, + "loss": 0.9246, + "step": 3215 + }, + { + "epoch": 0.2320262616788716, + "grad_norm": 2.74301769589937, + "learning_rate": 3.5869437426851645e-06, + "loss": 1.0023, + "step": 3216 + }, + { + "epoch": 0.23209840914829913, + "grad_norm": 2.4351482294542284, + "learning_rate": 3.586659260941295e-06, + "loss": 0.7937, + "step": 3217 + }, + { + "epoch": 0.23217055661772662, + "grad_norm": 2.923100178931077, + "learning_rate": 3.5863746925558376e-06, + "loss": 0.966, + "step": 3218 + }, + { + "epoch": 0.23224270408715414, + "grad_norm": 2.29377100880328, + "learning_rate": 3.5860900375443305e-06, + "loss": 0.9396, + "step": 3219 + }, + { + "epoch": 0.23231485155658166, + "grad_norm": 3.3720038501631544, + "learning_rate": 3.5858052959223184e-06, + "loss": 0.9705, + "step": 3220 + }, + { + "epoch": 0.23238699902600915, + "grad_norm": 3.510006632567657, + "learning_rate": 3.5855204677053494e-06, + "loss": 0.9672, + "step": 3221 + }, + { + "epoch": 0.23245914649543667, + "grad_norm": 3.3934252729370384, + "learning_rate": 3.5852355529089763e-06, + "loss": 1.0329, + "step": 3222 + }, + { + "epoch": 0.2325312939648642, + "grad_norm": 2.6131269697370314, + "learning_rate": 3.5849505515487586e-06, + "loss": 0.9993, + "step": 3223 + }, + { + "epoch": 0.23260344143429168, + "grad_norm": 3.792719823995714, + "learning_rate": 3.5846654636402586e-06, + "loss": 0.9128, + "step": 3224 + }, + { + "epoch": 0.2326755889037192, + "grad_norm": 2.9732270864426336, + "learning_rate": 3.5843802891990438e-06, + "loss": 0.8712, + "step": 3225 + }, + { + "epoch": 0.23274773637314672, + "grad_norm": 4.043828929856013, + "learning_rate": 3.584095028240686e-06, + "loss": 1.016, + "step": 3226 + }, + { + "epoch": 0.2328198838425742, + "grad_norm": 2.822796602030176, + "learning_rate": 3.5838096807807635e-06, + "loss": 0.9163, + "step": 3227 + }, + { + "epoch": 0.23289203131200173, + "grad_norm": 3.025426560269152, + "learning_rate": 3.5835242468348574e-06, + "loss": 0.918, + "step": 3228 + }, + { + "epoch": 0.23296417878142925, + "grad_norm": 2.4854807762027233, + "learning_rate": 3.583238726418554e-06, + "loss": 0.8075, + "step": 3229 + }, + { + "epoch": 0.23303632625085674, + "grad_norm": 3.046455550054363, + "learning_rate": 3.5829531195474443e-06, + "loss": 0.8369, + "step": 3230 + }, + { + "epoch": 0.23310847372028426, + "grad_norm": 2.659673661025784, + "learning_rate": 3.5826674262371254e-06, + "loss": 0.9151, + "step": 3231 + }, + { + "epoch": 0.23318062118971178, + "grad_norm": 2.4439871124202495, + "learning_rate": 3.5823816465031968e-06, + "loss": 0.9154, + "step": 3232 + }, + { + "epoch": 0.23325276865913927, + "grad_norm": 3.688194080268888, + "learning_rate": 3.5820957803612645e-06, + "loss": 0.9512, + "step": 3233 + }, + { + "epoch": 0.2333249161285668, + "grad_norm": 3.318948479013746, + "learning_rate": 3.5818098278269383e-06, + "loss": 0.9995, + "step": 3234 + }, + { + "epoch": 0.2333970635979943, + "grad_norm": 4.701594816675553, + "learning_rate": 3.5815237889158326e-06, + "loss": 0.9803, + "step": 3235 + }, + { + "epoch": 0.2334692110674218, + "grad_norm": 2.8566518020985017, + "learning_rate": 3.5812376636435676e-06, + "loss": 0.9035, + "step": 3236 + }, + { + "epoch": 0.23354135853684932, + "grad_norm": 1.9417921331359635, + "learning_rate": 3.5809514520257673e-06, + "loss": 0.9226, + "step": 3237 + }, + { + "epoch": 0.23361350600627684, + "grad_norm": 4.113306058136845, + "learning_rate": 3.580665154078061e-06, + "loss": 0.9341, + "step": 3238 + }, + { + "epoch": 0.23368565347570433, + "grad_norm": 2.2626500032553523, + "learning_rate": 3.5803787698160824e-06, + "loss": 0.984, + "step": 3239 + }, + { + "epoch": 0.23375780094513185, + "grad_norm": 3.470502668354705, + "learning_rate": 3.580092299255469e-06, + "loss": 0.9668, + "step": 3240 + }, + { + "epoch": 0.23382994841455937, + "grad_norm": 3.4067680594876166, + "learning_rate": 3.5798057424118637e-06, + "loss": 1.0072, + "step": 3241 + }, + { + "epoch": 0.23390209588398686, + "grad_norm": 2.921886913254874, + "learning_rate": 3.579519099300916e-06, + "loss": 0.9662, + "step": 3242 + }, + { + "epoch": 0.23397424335341438, + "grad_norm": 6.939757383651486, + "learning_rate": 3.5792323699382774e-06, + "loss": 1.0313, + "step": 3243 + }, + { + "epoch": 0.2340463908228419, + "grad_norm": 0.8571269368782206, + "learning_rate": 3.578945554339605e-06, + "loss": 0.8537, + "step": 3244 + }, + { + "epoch": 0.2341185382922694, + "grad_norm": 3.2975873855453197, + "learning_rate": 3.5786586525205607e-06, + "loss": 0.9781, + "step": 3245 + }, + { + "epoch": 0.2341906857616969, + "grad_norm": 3.2804406030641577, + "learning_rate": 3.578371664496812e-06, + "loss": 0.9188, + "step": 3246 + }, + { + "epoch": 0.23426283323112443, + "grad_norm": 1.5725624325583463, + "learning_rate": 3.578084590284029e-06, + "loss": 0.9725, + "step": 3247 + }, + { + "epoch": 0.23433498070055192, + "grad_norm": 2.570504410370963, + "learning_rate": 3.577797429897889e-06, + "loss": 1.0128, + "step": 3248 + }, + { + "epoch": 0.23440712816997944, + "grad_norm": 4.096973344101154, + "learning_rate": 3.577510183354072e-06, + "loss": 0.8712, + "step": 3249 + }, + { + "epoch": 0.23447927563940696, + "grad_norm": 2.343916620053951, + "learning_rate": 3.5772228506682636e-06, + "loss": 0.9159, + "step": 3250 + }, + { + "epoch": 0.23455142310883445, + "grad_norm": 2.1630635624195462, + "learning_rate": 3.576935431856154e-06, + "loss": 0.9186, + "step": 3251 + }, + { + "epoch": 0.23462357057826197, + "grad_norm": 2.5446889658027407, + "learning_rate": 3.5766479269334377e-06, + "loss": 0.8484, + "step": 3252 + }, + { + "epoch": 0.2346957180476895, + "grad_norm": 1.9674883841883015, + "learning_rate": 3.576360335915815e-06, + "loss": 0.9302, + "step": 3253 + }, + { + "epoch": 0.23476786551711698, + "grad_norm": 2.1778313722092464, + "learning_rate": 3.57607265881899e-06, + "loss": 0.9558, + "step": 3254 + }, + { + "epoch": 0.2348400129865445, + "grad_norm": 2.8178443993358484, + "learning_rate": 3.575784895658671e-06, + "loss": 0.9174, + "step": 3255 + }, + { + "epoch": 0.23491216045597202, + "grad_norm": 4.57974433251957, + "learning_rate": 3.5754970464505724e-06, + "loss": 0.9957, + "step": 3256 + }, + { + "epoch": 0.2349843079253995, + "grad_norm": 1.863864319619342, + "learning_rate": 3.575209111210413e-06, + "loss": 0.9614, + "step": 3257 + }, + { + "epoch": 0.23505645539482703, + "grad_norm": 2.0446631441876835, + "learning_rate": 3.574921089953914e-06, + "loss": 0.9003, + "step": 3258 + }, + { + "epoch": 0.23512860286425455, + "grad_norm": 1.9217745126644945, + "learning_rate": 3.5746329826968058e-06, + "loss": 1.0234, + "step": 3259 + }, + { + "epoch": 0.23520075033368204, + "grad_norm": 0.7682249726434224, + "learning_rate": 3.5743447894548185e-06, + "loss": 0.8131, + "step": 3260 + }, + { + "epoch": 0.23527289780310956, + "grad_norm": 2.7729737659730755, + "learning_rate": 3.5740565102436908e-06, + "loss": 0.9116, + "step": 3261 + }, + { + "epoch": 0.23534504527253708, + "grad_norm": 3.419576630738971, + "learning_rate": 3.573768145079164e-06, + "loss": 0.9002, + "step": 3262 + }, + { + "epoch": 0.23541719274196457, + "grad_norm": 2.5091016076542134, + "learning_rate": 3.573479693976985e-06, + "loss": 0.9917, + "step": 3263 + }, + { + "epoch": 0.2354893402113921, + "grad_norm": 2.1224541841376525, + "learning_rate": 3.5731911569529036e-06, + "loss": 0.9555, + "step": 3264 + }, + { + "epoch": 0.2355614876808196, + "grad_norm": 2.247554297549799, + "learning_rate": 3.5729025340226774e-06, + "loss": 0.8101, + "step": 3265 + }, + { + "epoch": 0.2356336351502471, + "grad_norm": 2.71174737007954, + "learning_rate": 3.572613825202067e-06, + "loss": 0.8961, + "step": 3266 + }, + { + "epoch": 0.23570578261967462, + "grad_norm": 2.459248765664946, + "learning_rate": 3.572325030506837e-06, + "loss": 0.9166, + "step": 3267 + }, + { + "epoch": 0.2357779300891021, + "grad_norm": 2.0926001508620655, + "learning_rate": 3.5720361499527575e-06, + "loss": 0.9613, + "step": 3268 + }, + { + "epoch": 0.23585007755852963, + "grad_norm": 5.526795916148423, + "learning_rate": 3.571747183555604e-06, + "loss": 0.8419, + "step": 3269 + }, + { + "epoch": 0.23592222502795715, + "grad_norm": 2.046484757125012, + "learning_rate": 3.571458131331154e-06, + "loss": 0.9533, + "step": 3270 + }, + { + "epoch": 0.23599437249738464, + "grad_norm": 2.227148253372186, + "learning_rate": 3.571168993295194e-06, + "loss": 0.9488, + "step": 3271 + }, + { + "epoch": 0.23606651996681216, + "grad_norm": 2.2492032229757783, + "learning_rate": 3.5708797694635106e-06, + "loss": 0.922, + "step": 3272 + }, + { + "epoch": 0.23613866743623968, + "grad_norm": 2.3295237800283406, + "learning_rate": 3.5705904598518983e-06, + "loss": 0.9941, + "step": 3273 + }, + { + "epoch": 0.23621081490566717, + "grad_norm": 2.849303638981254, + "learning_rate": 3.5703010644761557e-06, + "loss": 0.7625, + "step": 3274 + }, + { + "epoch": 0.2362829623750947, + "grad_norm": 4.243744004347868, + "learning_rate": 3.570011583352085e-06, + "loss": 0.9784, + "step": 3275 + }, + { + "epoch": 0.2363551098445222, + "grad_norm": 2.4802565115893254, + "learning_rate": 3.5697220164954924e-06, + "loss": 0.8828, + "step": 3276 + }, + { + "epoch": 0.2364272573139497, + "grad_norm": 2.374797611648004, + "learning_rate": 3.5694323639221926e-06, + "loss": 1.0037, + "step": 3277 + }, + { + "epoch": 0.23649940478337722, + "grad_norm": 3.8288989471945, + "learning_rate": 3.569142625648001e-06, + "loss": 0.9694, + "step": 3278 + }, + { + "epoch": 0.23657155225280474, + "grad_norm": 2.7573151828948146, + "learning_rate": 3.56885280168874e-06, + "loss": 0.9795, + "step": 3279 + }, + { + "epoch": 0.23664369972223223, + "grad_norm": 0.8965877232372453, + "learning_rate": 3.5685628920602343e-06, + "loss": 0.8114, + "step": 3280 + }, + { + "epoch": 0.23671584719165975, + "grad_norm": 2.3101751652479554, + "learning_rate": 3.568272896778316e-06, + "loss": 0.803, + "step": 3281 + }, + { + "epoch": 0.23678799466108727, + "grad_norm": 2.4677095755921106, + "learning_rate": 3.5679828158588203e-06, + "loss": 0.9841, + "step": 3282 + }, + { + "epoch": 0.23686014213051476, + "grad_norm": 2.285477157622166, + "learning_rate": 3.5676926493175875e-06, + "loss": 0.9612, + "step": 3283 + }, + { + "epoch": 0.23693228959994228, + "grad_norm": 1.872996786042494, + "learning_rate": 3.5674023971704625e-06, + "loss": 0.9709, + "step": 3284 + }, + { + "epoch": 0.2370044370693698, + "grad_norm": 2.071753924990637, + "learning_rate": 3.567112059433295e-06, + "loss": 0.9538, + "step": 3285 + }, + { + "epoch": 0.2370765845387973, + "grad_norm": 3.9049465598773625, + "learning_rate": 3.566821636121939e-06, + "loss": 1.0231, + "step": 3286 + }, + { + "epoch": 0.2371487320082248, + "grad_norm": 2.0452232393818215, + "learning_rate": 3.5665311272522536e-06, + "loss": 0.9343, + "step": 3287 + }, + { + "epoch": 0.23722087947765233, + "grad_norm": 3.701736898246926, + "learning_rate": 3.566240532840103e-06, + "loss": 0.8807, + "step": 3288 + }, + { + "epoch": 0.23729302694707982, + "grad_norm": 2.1008662662348394, + "learning_rate": 3.5659498529013545e-06, + "loss": 0.8948, + "step": 3289 + }, + { + "epoch": 0.23736517441650734, + "grad_norm": 1.7997897846600515, + "learning_rate": 3.5656590874518814e-06, + "loss": 0.9077, + "step": 3290 + }, + { + "epoch": 0.23743732188593486, + "grad_norm": 2.4239284546872217, + "learning_rate": 3.5653682365075614e-06, + "loss": 0.9807, + "step": 3291 + }, + { + "epoch": 0.23750946935536235, + "grad_norm": 2.387248728047203, + "learning_rate": 3.565077300084277e-06, + "loss": 0.8946, + "step": 3292 + }, + { + "epoch": 0.23758161682478987, + "grad_norm": 0.8278532122014753, + "learning_rate": 3.564786278197915e-06, + "loss": 0.8262, + "step": 3293 + }, + { + "epoch": 0.2376537642942174, + "grad_norm": 2.438650397725984, + "learning_rate": 3.564495170864367e-06, + "loss": 0.7952, + "step": 3294 + }, + { + "epoch": 0.23772591176364488, + "grad_norm": 2.130176746037838, + "learning_rate": 3.5642039780995293e-06, + "loss": 0.9291, + "step": 3295 + }, + { + "epoch": 0.2377980592330724, + "grad_norm": 1.8156974476002117, + "learning_rate": 3.5639126999193025e-06, + "loss": 1.0194, + "step": 3296 + }, + { + "epoch": 0.23787020670249992, + "grad_norm": 2.518403029858561, + "learning_rate": 3.563621336339593e-06, + "loss": 0.9812, + "step": 3297 + }, + { + "epoch": 0.2379423541719274, + "grad_norm": 2.1623270395602012, + "learning_rate": 3.56332988737631e-06, + "loss": 0.9953, + "step": 3298 + }, + { + "epoch": 0.23801450164135493, + "grad_norm": 2.594729410958871, + "learning_rate": 3.5630383530453693e-06, + "loss": 0.8657, + "step": 3299 + }, + { + "epoch": 0.23808664911078245, + "grad_norm": 2.957378412425401, + "learning_rate": 3.562746733362691e-06, + "loss": 0.9728, + "step": 3300 + }, + { + "epoch": 0.23815879658020994, + "grad_norm": 1.6726608568508488, + "learning_rate": 3.562455028344198e-06, + "loss": 0.8709, + "step": 3301 + }, + { + "epoch": 0.23823094404963746, + "grad_norm": 2.081918452143535, + "learning_rate": 3.56216323800582e-06, + "loss": 0.9718, + "step": 3302 + }, + { + "epoch": 0.23830309151906498, + "grad_norm": 3.321367450743577, + "learning_rate": 3.5618713623634905e-06, + "loss": 0.9421, + "step": 3303 + }, + { + "epoch": 0.23837523898849247, + "grad_norm": 2.8983272726892415, + "learning_rate": 3.5615794014331486e-06, + "loss": 0.792, + "step": 3304 + }, + { + "epoch": 0.23844738645792, + "grad_norm": 2.07707049927976, + "learning_rate": 3.5612873552307352e-06, + "loss": 0.9775, + "step": 3305 + }, + { + "epoch": 0.2385195339273475, + "grad_norm": 6.884625441835252, + "learning_rate": 3.5609952237721993e-06, + "loss": 0.9895, + "step": 3306 + }, + { + "epoch": 0.238591681396775, + "grad_norm": 0.7767961192882691, + "learning_rate": 3.560703007073493e-06, + "loss": 0.7556, + "step": 3307 + }, + { + "epoch": 0.23866382886620252, + "grad_norm": 2.4523804135967904, + "learning_rate": 3.5604107051505732e-06, + "loss": 0.9976, + "step": 3308 + }, + { + "epoch": 0.23873597633563004, + "grad_norm": 3.086785385891719, + "learning_rate": 3.560118318019401e-06, + "loss": 0.9378, + "step": 3309 + }, + { + "epoch": 0.23880812380505753, + "grad_norm": 2.756155926749013, + "learning_rate": 3.559825845695943e-06, + "loss": 1.0694, + "step": 3310 + }, + { + "epoch": 0.23888027127448505, + "grad_norm": 3.027530708692752, + "learning_rate": 3.5595332881961695e-06, + "loss": 0.9694, + "step": 3311 + }, + { + "epoch": 0.23895241874391257, + "grad_norm": 0.8903327596280063, + "learning_rate": 3.5592406455360563e-06, + "loss": 0.8377, + "step": 3312 + }, + { + "epoch": 0.23902456621334006, + "grad_norm": 2.406910892239755, + "learning_rate": 3.5589479177315836e-06, + "loss": 0.9183, + "step": 3313 + }, + { + "epoch": 0.23909671368276758, + "grad_norm": 2.3711977186457154, + "learning_rate": 3.5586551047987365e-06, + "loss": 0.8877, + "step": 3314 + }, + { + "epoch": 0.2391688611521951, + "grad_norm": 2.528979283722739, + "learning_rate": 3.558362206753504e-06, + "loss": 0.9972, + "step": 3315 + }, + { + "epoch": 0.2392410086216226, + "grad_norm": 2.167996353111156, + "learning_rate": 3.5580692236118806e-06, + "loss": 1.027, + "step": 3316 + }, + { + "epoch": 0.2393131560910501, + "grad_norm": 2.0752840054575916, + "learning_rate": 3.557776155389864e-06, + "loss": 0.9602, + "step": 3317 + }, + { + "epoch": 0.2393853035604776, + "grad_norm": 2.223373216998588, + "learning_rate": 3.557483002103458e-06, + "loss": 0.9805, + "step": 3318 + }, + { + "epoch": 0.23945745102990512, + "grad_norm": 3.4763835046529805, + "learning_rate": 3.5571897637686712e-06, + "loss": 0.9488, + "step": 3319 + }, + { + "epoch": 0.23952959849933264, + "grad_norm": 3.4255631311067316, + "learning_rate": 3.556896440401516e-06, + "loss": 0.891, + "step": 3320 + }, + { + "epoch": 0.23960174596876013, + "grad_norm": 3.627176486416237, + "learning_rate": 3.5566030320180104e-06, + "loss": 0.9901, + "step": 3321 + }, + { + "epoch": 0.23967389343818765, + "grad_norm": 2.563391460640973, + "learning_rate": 3.5563095386341747e-06, + "loss": 0.9509, + "step": 3322 + }, + { + "epoch": 0.23974604090761517, + "grad_norm": 1.7769535516691, + "learning_rate": 3.5560159602660363e-06, + "loss": 1.0265, + "step": 3323 + }, + { + "epoch": 0.23981818837704266, + "grad_norm": 2.6117231573808817, + "learning_rate": 3.555722296929627e-06, + "loss": 0.8941, + "step": 3324 + }, + { + "epoch": 0.23989033584647018, + "grad_norm": 0.8319547018021338, + "learning_rate": 3.5554285486409813e-06, + "loss": 0.8324, + "step": 3325 + }, + { + "epoch": 0.2399624833158977, + "grad_norm": 2.4242959013011243, + "learning_rate": 3.5551347154161412e-06, + "loss": 0.9745, + "step": 3326 + }, + { + "epoch": 0.2400346307853252, + "grad_norm": 2.3847968172100114, + "learning_rate": 3.554840797271151e-06, + "loss": 0.9944, + "step": 3327 + }, + { + "epoch": 0.2401067782547527, + "grad_norm": 1.9004746797797352, + "learning_rate": 3.5545467942220604e-06, + "loss": 1.0794, + "step": 3328 + }, + { + "epoch": 0.24017892572418023, + "grad_norm": 2.345532667587569, + "learning_rate": 3.5542527062849247e-06, + "loss": 1.0156, + "step": 3329 + }, + { + "epoch": 0.24025107319360772, + "grad_norm": 2.039926046452149, + "learning_rate": 3.553958533475802e-06, + "loss": 1.0335, + "step": 3330 + }, + { + "epoch": 0.24032322066303524, + "grad_norm": 3.0058960196657636, + "learning_rate": 3.5536642758107567e-06, + "loss": 0.9837, + "step": 3331 + }, + { + "epoch": 0.24039536813246276, + "grad_norm": 2.0002050294687566, + "learning_rate": 3.5533699333058566e-06, + "loss": 1.0193, + "step": 3332 + }, + { + "epoch": 0.24046751560189025, + "grad_norm": 2.742500569874679, + "learning_rate": 3.5530755059771744e-06, + "loss": 0.9591, + "step": 3333 + }, + { + "epoch": 0.24053966307131777, + "grad_norm": 2.207551013879823, + "learning_rate": 3.5527809938407883e-06, + "loss": 1.0148, + "step": 3334 + }, + { + "epoch": 0.2406118105407453, + "grad_norm": 2.3153524591058905, + "learning_rate": 3.5524863969127806e-06, + "loss": 0.9301, + "step": 3335 + }, + { + "epoch": 0.24068395801017278, + "grad_norm": 2.1958000754842373, + "learning_rate": 3.552191715209238e-06, + "loss": 0.9687, + "step": 3336 + }, + { + "epoch": 0.2407561054796003, + "grad_norm": 1.7289978389208003, + "learning_rate": 3.551896948746252e-06, + "loss": 0.9389, + "step": 3337 + }, + { + "epoch": 0.24082825294902782, + "grad_norm": 2.0818875318669394, + "learning_rate": 3.551602097539918e-06, + "loss": 0.931, + "step": 3338 + }, + { + "epoch": 0.2409004004184553, + "grad_norm": 2.0930843149071428, + "learning_rate": 3.5513071616063373e-06, + "loss": 0.9794, + "step": 3339 + }, + { + "epoch": 0.24097254788788283, + "grad_norm": 2.0545642656276875, + "learning_rate": 3.5510121409616157e-06, + "loss": 0.9564, + "step": 3340 + }, + { + "epoch": 0.24104469535731035, + "grad_norm": 2.117786393147508, + "learning_rate": 3.550717035621862e-06, + "loss": 0.9237, + "step": 3341 + }, + { + "epoch": 0.24111684282673784, + "grad_norm": 3.002268251595952, + "learning_rate": 3.550421845603192e-06, + "loss": 0.8562, + "step": 3342 + }, + { + "epoch": 0.24118899029616536, + "grad_norm": 4.625779575939678, + "learning_rate": 3.5501265709217245e-06, + "loss": 0.8934, + "step": 3343 + }, + { + "epoch": 0.24126113776559288, + "grad_norm": 2.4277956548983606, + "learning_rate": 3.5498312115935834e-06, + "loss": 0.913, + "step": 3344 + }, + { + "epoch": 0.24133328523502037, + "grad_norm": 2.1866140478866596, + "learning_rate": 3.5495357676348964e-06, + "loss": 0.9216, + "step": 3345 + }, + { + "epoch": 0.2414054327044479, + "grad_norm": 2.6182449254344005, + "learning_rate": 3.5492402390617985e-06, + "loss": 1.0142, + "step": 3346 + }, + { + "epoch": 0.2414775801738754, + "grad_norm": 3.6586196914786044, + "learning_rate": 3.548944625890425e-06, + "loss": 0.9756, + "step": 3347 + }, + { + "epoch": 0.2415497276433029, + "grad_norm": 2.3449657084676443, + "learning_rate": 3.5486489281369206e-06, + "loss": 0.956, + "step": 3348 + }, + { + "epoch": 0.24162187511273042, + "grad_norm": 2.1804196895163144, + "learning_rate": 3.5483531458174304e-06, + "loss": 0.9804, + "step": 3349 + }, + { + "epoch": 0.24169402258215794, + "grad_norm": 4.358306425654619, + "learning_rate": 3.548057278948107e-06, + "loss": 0.8947, + "step": 3350 + }, + { + "epoch": 0.24176617005158543, + "grad_norm": 2.101919604737601, + "learning_rate": 3.5477613275451065e-06, + "loss": 0.8851, + "step": 3351 + }, + { + "epoch": 0.24183831752101295, + "grad_norm": 2.399713034162999, + "learning_rate": 3.547465291624589e-06, + "loss": 0.9488, + "step": 3352 + }, + { + "epoch": 0.24191046499044047, + "grad_norm": 3.2839298022779144, + "learning_rate": 3.547169171202721e-06, + "loss": 0.9163, + "step": 3353 + }, + { + "epoch": 0.24198261245986796, + "grad_norm": 2.6769354120100175, + "learning_rate": 3.546872966295672e-06, + "loss": 0.8566, + "step": 3354 + }, + { + "epoch": 0.24205475992929548, + "grad_norm": 2.307453629181738, + "learning_rate": 3.546576676919616e-06, + "loss": 0.9531, + "step": 3355 + }, + { + "epoch": 0.242126907398723, + "grad_norm": 2.2129759540587393, + "learning_rate": 3.546280303090733e-06, + "loss": 0.9528, + "step": 3356 + }, + { + "epoch": 0.2421990548681505, + "grad_norm": 1.7566136319710055, + "learning_rate": 3.5459838448252073e-06, + "loss": 1.0233, + "step": 3357 + }, + { + "epoch": 0.242271202337578, + "grad_norm": 2.6786372403733836, + "learning_rate": 3.5456873021392265e-06, + "loss": 1.0081, + "step": 3358 + }, + { + "epoch": 0.24234334980700553, + "grad_norm": 3.6742353928465032, + "learning_rate": 3.5453906750489845e-06, + "loss": 1.0741, + "step": 3359 + }, + { + "epoch": 0.24241549727643302, + "grad_norm": 2.0227141394495662, + "learning_rate": 3.5450939635706782e-06, + "loss": 0.8846, + "step": 3360 + }, + { + "epoch": 0.24248764474586054, + "grad_norm": 2.376779792878989, + "learning_rate": 3.5447971677205103e-06, + "loss": 0.7853, + "step": 3361 + }, + { + "epoch": 0.24255979221528806, + "grad_norm": 2.4063801916002134, + "learning_rate": 3.5445002875146877e-06, + "loss": 0.902, + "step": 3362 + }, + { + "epoch": 0.24263193968471555, + "grad_norm": 1.5571303608344178, + "learning_rate": 3.5442033229694223e-06, + "loss": 1.0189, + "step": 3363 + }, + { + "epoch": 0.24270408715414307, + "grad_norm": 2.429486527057981, + "learning_rate": 3.5439062741009292e-06, + "loss": 1.0225, + "step": 3364 + }, + { + "epoch": 0.2427762346235706, + "grad_norm": 2.382881138547783, + "learning_rate": 3.5436091409254305e-06, + "loss": 0.8559, + "step": 3365 + }, + { + "epoch": 0.24284838209299808, + "grad_norm": 1.862053744727685, + "learning_rate": 3.5433119234591505e-06, + "loss": 0.9033, + "step": 3366 + }, + { + "epoch": 0.2429205295624256, + "grad_norm": 2.427190742790106, + "learning_rate": 3.5430146217183197e-06, + "loss": 0.9452, + "step": 3367 + }, + { + "epoch": 0.24299267703185312, + "grad_norm": 2.147861250844863, + "learning_rate": 3.542717235719172e-06, + "loss": 0.9708, + "step": 3368 + }, + { + "epoch": 0.2430648245012806, + "grad_norm": 2.209988339768801, + "learning_rate": 3.5424197654779474e-06, + "loss": 0.9588, + "step": 3369 + }, + { + "epoch": 0.24313697197070813, + "grad_norm": 0.7134755700305768, + "learning_rate": 3.5421222110108896e-06, + "loss": 0.7546, + "step": 3370 + }, + { + "epoch": 0.24320911944013562, + "grad_norm": 1.877708576771922, + "learning_rate": 3.541824572334246e-06, + "loss": 1.0282, + "step": 3371 + }, + { + "epoch": 0.24328126690956314, + "grad_norm": 0.783249394674399, + "learning_rate": 3.54152684946427e-06, + "loss": 0.7871, + "step": 3372 + }, + { + "epoch": 0.24335341437899066, + "grad_norm": 2.069157576964139, + "learning_rate": 3.54122904241722e-06, + "loss": 0.9842, + "step": 3373 + }, + { + "epoch": 0.24342556184841815, + "grad_norm": 2.043782234520976, + "learning_rate": 3.5409311512093563e-06, + "loss": 0.8704, + "step": 3374 + }, + { + "epoch": 0.24349770931784567, + "grad_norm": 1.9893222923858764, + "learning_rate": 3.540633175856948e-06, + "loss": 0.9279, + "step": 3375 + }, + { + "epoch": 0.2435698567872732, + "grad_norm": 2.1034756145727704, + "learning_rate": 3.540335116376265e-06, + "loss": 0.9051, + "step": 3376 + }, + { + "epoch": 0.24364200425670068, + "grad_norm": 2.1577927971782183, + "learning_rate": 3.5400369727835833e-06, + "loss": 0.9521, + "step": 3377 + }, + { + "epoch": 0.2437141517261282, + "grad_norm": 2.3535346699177566, + "learning_rate": 3.5397387450951824e-06, + "loss": 0.9937, + "step": 3378 + }, + { + "epoch": 0.24378629919555572, + "grad_norm": 2.049367188266898, + "learning_rate": 3.53944043332735e-06, + "loss": 1.0443, + "step": 3379 + }, + { + "epoch": 0.2438584466649832, + "grad_norm": 2.8240795391379625, + "learning_rate": 3.539142037496374e-06, + "loss": 1.0079, + "step": 3380 + }, + { + "epoch": 0.24393059413441073, + "grad_norm": 1.82510171632929, + "learning_rate": 3.538843557618549e-06, + "loss": 0.9444, + "step": 3381 + }, + { + "epoch": 0.24400274160383825, + "grad_norm": 1.8747478951405339, + "learning_rate": 3.5385449937101733e-06, + "loss": 0.9564, + "step": 3382 + }, + { + "epoch": 0.24407488907326574, + "grad_norm": 0.8169247381687197, + "learning_rate": 3.538246345787552e-06, + "loss": 0.8016, + "step": 3383 + }, + { + "epoch": 0.24414703654269326, + "grad_norm": 2.106833275874373, + "learning_rate": 3.5379476138669914e-06, + "loss": 1.0243, + "step": 3384 + }, + { + "epoch": 0.24421918401212078, + "grad_norm": 1.994244459776005, + "learning_rate": 3.537648797964805e-06, + "loss": 0.9451, + "step": 3385 + }, + { + "epoch": 0.24429133148154827, + "grad_norm": 2.134137199860775, + "learning_rate": 3.53734989809731e-06, + "loss": 1.0055, + "step": 3386 + }, + { + "epoch": 0.2443634789509758, + "grad_norm": 2.9584776793890737, + "learning_rate": 3.5370509142808283e-06, + "loss": 0.9998, + "step": 3387 + }, + { + "epoch": 0.2444356264204033, + "grad_norm": 5.594783239998086, + "learning_rate": 3.536751846531686e-06, + "loss": 1.0068, + "step": 3388 + }, + { + "epoch": 0.2445077738898308, + "grad_norm": 2.120070742920286, + "learning_rate": 3.5364526948662145e-06, + "loss": 0.9642, + "step": 3389 + }, + { + "epoch": 0.24457992135925832, + "grad_norm": 3.6762204693253797, + "learning_rate": 3.5361534593007488e-06, + "loss": 1.0041, + "step": 3390 + }, + { + "epoch": 0.24465206882868584, + "grad_norm": 0.9879150918545527, + "learning_rate": 3.5358541398516295e-06, + "loss": 0.8169, + "step": 3391 + }, + { + "epoch": 0.24472421629811333, + "grad_norm": 2.3377485622499083, + "learning_rate": 3.535554736535201e-06, + "loss": 0.9098, + "step": 3392 + }, + { + "epoch": 0.24479636376754085, + "grad_norm": 1.6194353192077584, + "learning_rate": 3.5352552493678127e-06, + "loss": 0.8807, + "step": 3393 + }, + { + "epoch": 0.24486851123696837, + "grad_norm": 3.104306177336792, + "learning_rate": 3.5349556783658187e-06, + "loss": 0.8605, + "step": 3394 + }, + { + "epoch": 0.24494065870639586, + "grad_norm": 2.5297708781534682, + "learning_rate": 3.534656023545577e-06, + "loss": 0.9103, + "step": 3395 + }, + { + "epoch": 0.24501280617582338, + "grad_norm": 2.1430043828423666, + "learning_rate": 3.5343562849234516e-06, + "loss": 0.9371, + "step": 3396 + }, + { + "epoch": 0.2450849536452509, + "grad_norm": 2.0573827667024833, + "learning_rate": 3.534056462515809e-06, + "loss": 0.9015, + "step": 3397 + }, + { + "epoch": 0.2451571011146784, + "grad_norm": 1.7931430042170056, + "learning_rate": 3.5337565563390223e-06, + "loss": 1.0521, + "step": 3398 + }, + { + "epoch": 0.2452292485841059, + "grad_norm": 0.8650477360910481, + "learning_rate": 3.5334565664094675e-06, + "loss": 0.7166, + "step": 3399 + }, + { + "epoch": 0.24530139605353343, + "grad_norm": 0.933347597751151, + "learning_rate": 3.533156492743526e-06, + "loss": 0.7993, + "step": 3400 + }, + { + "epoch": 0.24537354352296092, + "grad_norm": 3.0626829443025567, + "learning_rate": 3.532856335357584e-06, + "loss": 1.0634, + "step": 3401 + }, + { + "epoch": 0.24544569099238844, + "grad_norm": 2.301658579678408, + "learning_rate": 3.532556094268032e-06, + "loss": 0.9422, + "step": 3402 + }, + { + "epoch": 0.24551783846181596, + "grad_norm": 2.2216548976982344, + "learning_rate": 3.532255769491266e-06, + "loss": 1.1552, + "step": 3403 + }, + { + "epoch": 0.24558998593124345, + "grad_norm": 2.1874122602032857, + "learning_rate": 3.5319553610436836e-06, + "loss": 0.9507, + "step": 3404 + }, + { + "epoch": 0.24566213340067097, + "grad_norm": 2.208813944885657, + "learning_rate": 3.5316548689416903e-06, + "loss": 1.0044, + "step": 3405 + }, + { + "epoch": 0.2457342808700985, + "grad_norm": 2.5399294268197883, + "learning_rate": 3.5313542932016945e-06, + "loss": 0.9602, + "step": 3406 + }, + { + "epoch": 0.24580642833952598, + "grad_norm": 8.889496046099199, + "learning_rate": 3.53105363384011e-06, + "loss": 1.034, + "step": 3407 + }, + { + "epoch": 0.2458785758089535, + "grad_norm": 7.442878125022306, + "learning_rate": 3.5307528908733542e-06, + "loss": 0.8463, + "step": 3408 + }, + { + "epoch": 0.24595072327838102, + "grad_norm": 2.461423892440631, + "learning_rate": 3.5304520643178496e-06, + "loss": 0.8659, + "step": 3409 + }, + { + "epoch": 0.24602287074780851, + "grad_norm": 1.8851472300295113, + "learning_rate": 3.530151154190023e-06, + "loss": 0.9758, + "step": 3410 + }, + { + "epoch": 0.24609501821723603, + "grad_norm": 1.9272838866203559, + "learning_rate": 3.5298501605063067e-06, + "loss": 0.9792, + "step": 3411 + }, + { + "epoch": 0.24616716568666355, + "grad_norm": 0.8284168089157344, + "learning_rate": 3.529549083283136e-06, + "loss": 0.7692, + "step": 3412 + }, + { + "epoch": 0.24623931315609104, + "grad_norm": 1.7447539444067255, + "learning_rate": 3.5292479225369524e-06, + "loss": 0.9719, + "step": 3413 + }, + { + "epoch": 0.24631146062551856, + "grad_norm": 1.9459445646570361, + "learning_rate": 3.5289466782842015e-06, + "loss": 0.9312, + "step": 3414 + }, + { + "epoch": 0.24638360809494608, + "grad_norm": 2.527781523770455, + "learning_rate": 3.5286453505413315e-06, + "loss": 0.8269, + "step": 3415 + }, + { + "epoch": 0.24645575556437357, + "grad_norm": 1.8612967425459745, + "learning_rate": 3.5283439393247977e-06, + "loss": 1.0003, + "step": 3416 + }, + { + "epoch": 0.2465279030338011, + "grad_norm": 2.0834975241492337, + "learning_rate": 3.5280424446510593e-06, + "loss": 0.9256, + "step": 3417 + }, + { + "epoch": 0.2466000505032286, + "grad_norm": 2.369439794106769, + "learning_rate": 3.5277408665365796e-06, + "loss": 0.9832, + "step": 3418 + }, + { + "epoch": 0.2466721979726561, + "grad_norm": 4.087707486267349, + "learning_rate": 3.527439204997827e-06, + "loss": 1.0046, + "step": 3419 + }, + { + "epoch": 0.24674434544208362, + "grad_norm": 1.4728693719345367, + "learning_rate": 3.527137460051274e-06, + "loss": 0.7981, + "step": 3420 + }, + { + "epoch": 0.24681649291151112, + "grad_norm": 2.182086895521004, + "learning_rate": 3.526835631713397e-06, + "loss": 0.913, + "step": 3421 + }, + { + "epoch": 0.24688864038093863, + "grad_norm": 1.954662297831168, + "learning_rate": 3.5265337200006784e-06, + "loss": 0.8801, + "step": 3422 + }, + { + "epoch": 0.24696078785036615, + "grad_norm": 2.4338269068435383, + "learning_rate": 3.5262317249296044e-06, + "loss": 0.9403, + "step": 3423 + }, + { + "epoch": 0.24703293531979365, + "grad_norm": 2.380963317755149, + "learning_rate": 3.5259296465166665e-06, + "loss": 0.8856, + "step": 3424 + }, + { + "epoch": 0.24710508278922116, + "grad_norm": 2.0073685804701853, + "learning_rate": 3.525627484778359e-06, + "loss": 1.0166, + "step": 3425 + }, + { + "epoch": 0.24717723025864868, + "grad_norm": 19.872590692706467, + "learning_rate": 3.525325239731182e-06, + "loss": 0.9572, + "step": 3426 + }, + { + "epoch": 0.24724937772807618, + "grad_norm": 1.8860371308135409, + "learning_rate": 3.52502291139164e-06, + "loss": 0.8917, + "step": 3427 + }, + { + "epoch": 0.2473215251975037, + "grad_norm": 2.3432414202437917, + "learning_rate": 3.5247204997762424e-06, + "loss": 0.8417, + "step": 3428 + }, + { + "epoch": 0.2473936726669312, + "grad_norm": 1.6721827143623995, + "learning_rate": 3.524418004901503e-06, + "loss": 1.0445, + "step": 3429 + }, + { + "epoch": 0.2474658201363587, + "grad_norm": 1.4192181592026802, + "learning_rate": 3.524115426783939e-06, + "loss": 0.9509, + "step": 3430 + }, + { + "epoch": 0.24753796760578622, + "grad_norm": 1.695698707291851, + "learning_rate": 3.5238127654400744e-06, + "loss": 1.0975, + "step": 3431 + }, + { + "epoch": 0.24761011507521374, + "grad_norm": 2.7967567631967163, + "learning_rate": 3.5235100208864345e-06, + "loss": 0.9343, + "step": 3432 + }, + { + "epoch": 0.24768226254464124, + "grad_norm": 2.2231459790415062, + "learning_rate": 3.5232071931395535e-06, + "loss": 0.8937, + "step": 3433 + }, + { + "epoch": 0.24775441001406875, + "grad_norm": 2.179349702207454, + "learning_rate": 3.5229042822159657e-06, + "loss": 0.8617, + "step": 3434 + }, + { + "epoch": 0.24782655748349627, + "grad_norm": 1.6413025683012503, + "learning_rate": 3.522601288132212e-06, + "loss": 0.9616, + "step": 3435 + }, + { + "epoch": 0.24789870495292377, + "grad_norm": 1.6840411164934201, + "learning_rate": 3.5222982109048398e-06, + "loss": 0.9491, + "step": 3436 + }, + { + "epoch": 0.24797085242235128, + "grad_norm": 1.7948261687921285, + "learning_rate": 3.521995050550397e-06, + "loss": 0.9839, + "step": 3437 + }, + { + "epoch": 0.2480429998917788, + "grad_norm": 2.1577221917128506, + "learning_rate": 3.521691807085439e-06, + "loss": 0.9711, + "step": 3438 + }, + { + "epoch": 0.2481151473612063, + "grad_norm": 2.9115439159573637, + "learning_rate": 3.521388480526524e-06, + "loss": 0.9409, + "step": 3439 + }, + { + "epoch": 0.24818729483063381, + "grad_norm": 1.5530488853301494, + "learning_rate": 3.5210850708902163e-06, + "loss": 1.0254, + "step": 3440 + }, + { + "epoch": 0.24825944230006133, + "grad_norm": 2.2276658921381927, + "learning_rate": 3.5207815781930834e-06, + "loss": 0.997, + "step": 3441 + }, + { + "epoch": 0.24833158976948883, + "grad_norm": 2.4000405784991186, + "learning_rate": 3.520478002451699e-06, + "loss": 0.997, + "step": 3442 + }, + { + "epoch": 0.24840373723891634, + "grad_norm": 2.1931023926981608, + "learning_rate": 3.520174343682639e-06, + "loss": 0.9078, + "step": 3443 + }, + { + "epoch": 0.24847588470834386, + "grad_norm": 2.035417829166368, + "learning_rate": 3.519870601902486e-06, + "loss": 1.0308, + "step": 3444 + }, + { + "epoch": 0.24854803217777136, + "grad_norm": 1.0685155220921423, + "learning_rate": 3.519566777127825e-06, + "loss": 0.8969, + "step": 3445 + }, + { + "epoch": 0.24862017964719887, + "grad_norm": 3.177340304295679, + "learning_rate": 3.5192628693752485e-06, + "loss": 0.7171, + "step": 3446 + }, + { + "epoch": 0.2486923271166264, + "grad_norm": 2.344563050707461, + "learning_rate": 3.5189588786613504e-06, + "loss": 0.7598, + "step": 3447 + }, + { + "epoch": 0.24876447458605389, + "grad_norm": 2.754968143562807, + "learning_rate": 3.5186548050027306e-06, + "loss": 0.8729, + "step": 3448 + }, + { + "epoch": 0.2488366220554814, + "grad_norm": 2.2041692761391936, + "learning_rate": 3.5183506484159946e-06, + "loss": 0.9622, + "step": 3449 + }, + { + "epoch": 0.24890876952490892, + "grad_norm": 2.033417704139825, + "learning_rate": 3.5180464089177494e-06, + "loss": 0.9851, + "step": 3450 + }, + { + "epoch": 0.24898091699433642, + "grad_norm": 2.379981989431754, + "learning_rate": 3.5177420865246096e-06, + "loss": 0.867, + "step": 3451 + }, + { + "epoch": 0.24905306446376393, + "grad_norm": 0.8818386749529251, + "learning_rate": 3.5174376812531936e-06, + "loss": 0.8126, + "step": 3452 + }, + { + "epoch": 0.24912521193319145, + "grad_norm": 2.0430843984281073, + "learning_rate": 3.517133193120123e-06, + "loss": 0.9991, + "step": 3453 + }, + { + "epoch": 0.24919735940261895, + "grad_norm": 2.498447412944736, + "learning_rate": 3.516828622142025e-06, + "loss": 0.9678, + "step": 3454 + }, + { + "epoch": 0.24926950687204646, + "grad_norm": 1.7273835866432976, + "learning_rate": 3.5165239683355308e-06, + "loss": 0.9652, + "step": 3455 + }, + { + "epoch": 0.24934165434147398, + "grad_norm": 2.88848632876882, + "learning_rate": 3.5162192317172767e-06, + "loss": 1.022, + "step": 3456 + }, + { + "epoch": 0.24941380181090148, + "grad_norm": 3.539185241861253, + "learning_rate": 3.5159144123039034e-06, + "loss": 0.7839, + "step": 3457 + }, + { + "epoch": 0.249485949280329, + "grad_norm": 1.740407813063917, + "learning_rate": 3.5156095101120555e-06, + "loss": 1.0824, + "step": 3458 + }, + { + "epoch": 0.24955809674975651, + "grad_norm": 1.7220519513214247, + "learning_rate": 3.5153045251583827e-06, + "loss": 0.9752, + "step": 3459 + }, + { + "epoch": 0.249630244219184, + "grad_norm": 2.3101263493914326, + "learning_rate": 3.5149994574595398e-06, + "loss": 1.068, + "step": 3460 + }, + { + "epoch": 0.24970239168861152, + "grad_norm": 1.8333776280803107, + "learning_rate": 3.5146943070321845e-06, + "loss": 0.9421, + "step": 3461 + }, + { + "epoch": 0.24977453915803904, + "grad_norm": 1.9979524860479358, + "learning_rate": 3.5143890738929805e-06, + "loss": 1.0024, + "step": 3462 + }, + { + "epoch": 0.24984668662746654, + "grad_norm": 2.0037067157832147, + "learning_rate": 3.514083758058595e-06, + "loss": 0.9904, + "step": 3463 + }, + { + "epoch": 0.24991883409689405, + "grad_norm": 0.8715000317037886, + "learning_rate": 3.5137783595457006e-06, + "loss": 0.834, + "step": 3464 + }, + { + "epoch": 0.24999098156632157, + "grad_norm": 1.5781006763963221, + "learning_rate": 3.5134728783709737e-06, + "loss": 1.0492, + "step": 3465 + }, + { + "epoch": 0.25006312903574907, + "grad_norm": 2.411007920804772, + "learning_rate": 3.513167314551095e-06, + "loss": 0.9947, + "step": 3466 + }, + { + "epoch": 0.2501352765051766, + "grad_norm": 5.169835175297767, + "learning_rate": 3.512861668102752e-06, + "loss": 0.9786, + "step": 3467 + }, + { + "epoch": 0.2502074239746041, + "grad_norm": 1.7018106971385694, + "learning_rate": 3.5125559390426333e-06, + "loss": 1.0392, + "step": 3468 + }, + { + "epoch": 0.2502795714440316, + "grad_norm": 1.8946503591947517, + "learning_rate": 3.5122501273874336e-06, + "loss": 0.9602, + "step": 3469 + }, + { + "epoch": 0.2503517189134591, + "grad_norm": 2.199053526070355, + "learning_rate": 3.511944233153853e-06, + "loss": 0.8733, + "step": 3470 + }, + { + "epoch": 0.2504238663828866, + "grad_norm": 2.2321600973280926, + "learning_rate": 3.511638256358595e-06, + "loss": 0.8371, + "step": 3471 + }, + { + "epoch": 0.2504960138523141, + "grad_norm": 2.302457085788348, + "learning_rate": 3.5113321970183674e-06, + "loss": 1.0306, + "step": 3472 + }, + { + "epoch": 0.25056816132174164, + "grad_norm": 2.5070028453821407, + "learning_rate": 3.5110260551498834e-06, + "loss": 0.9971, + "step": 3473 + }, + { + "epoch": 0.25064030879116916, + "grad_norm": 2.5400338061027887, + "learning_rate": 3.51071983076986e-06, + "loss": 0.9195, + "step": 3474 + }, + { + "epoch": 0.2507124562605967, + "grad_norm": 2.2431664008069006, + "learning_rate": 3.5104135238950194e-06, + "loss": 0.9222, + "step": 3475 + }, + { + "epoch": 0.25078460373002415, + "grad_norm": 2.148823984236092, + "learning_rate": 3.5101071345420874e-06, + "loss": 0.9524, + "step": 3476 + }, + { + "epoch": 0.25085675119945167, + "grad_norm": 4.452082568846753, + "learning_rate": 3.5098006627277956e-06, + "loss": 0.9624, + "step": 3477 + }, + { + "epoch": 0.2509288986688792, + "grad_norm": 2.0059971302426503, + "learning_rate": 3.509494108468878e-06, + "loss": 0.9168, + "step": 3478 + }, + { + "epoch": 0.2510010461383067, + "grad_norm": 0.8216360042014444, + "learning_rate": 3.5091874717820754e-06, + "loss": 0.8016, + "step": 3479 + }, + { + "epoch": 0.2510731936077342, + "grad_norm": 2.2777638383247574, + "learning_rate": 3.508880752684131e-06, + "loss": 0.9914, + "step": 3480 + }, + { + "epoch": 0.25114534107716174, + "grad_norm": 6.4973611610318205, + "learning_rate": 3.5085739511917956e-06, + "loss": 0.8351, + "step": 3481 + }, + { + "epoch": 0.2512174885465892, + "grad_norm": 1.8785257251552854, + "learning_rate": 3.508267067321821e-06, + "loss": 0.9601, + "step": 3482 + }, + { + "epoch": 0.2512896360160167, + "grad_norm": 1.822632191577315, + "learning_rate": 3.5079601010909647e-06, + "loss": 0.9914, + "step": 3483 + }, + { + "epoch": 0.25136178348544425, + "grad_norm": 2.1879418063192073, + "learning_rate": 3.50765305251599e-06, + "loss": 0.8848, + "step": 3484 + }, + { + "epoch": 0.25143393095487176, + "grad_norm": 2.5507197467580527, + "learning_rate": 3.507345921613663e-06, + "loss": 0.9398, + "step": 3485 + }, + { + "epoch": 0.2515060784242993, + "grad_norm": 2.482711811076576, + "learning_rate": 3.5070387084007553e-06, + "loss": 0.9146, + "step": 3486 + }, + { + "epoch": 0.2515782258937268, + "grad_norm": 2.5827751069266958, + "learning_rate": 3.5067314128940427e-06, + "loss": 0.9063, + "step": 3487 + }, + { + "epoch": 0.25165037336315427, + "grad_norm": 1.7827563023715916, + "learning_rate": 3.5064240351103053e-06, + "loss": 0.8416, + "step": 3488 + }, + { + "epoch": 0.2517225208325818, + "grad_norm": 2.106905699862429, + "learning_rate": 3.506116575066328e-06, + "loss": 1.0293, + "step": 3489 + }, + { + "epoch": 0.2517946683020093, + "grad_norm": 2.2135501158309947, + "learning_rate": 3.5058090327788994e-06, + "loss": 0.8967, + "step": 3490 + }, + { + "epoch": 0.2518668157714368, + "grad_norm": 0.7641032529856794, + "learning_rate": 3.505501408264815e-06, + "loss": 0.7651, + "step": 3491 + }, + { + "epoch": 0.25193896324086434, + "grad_norm": 1.9251579219890425, + "learning_rate": 3.5051937015408706e-06, + "loss": 0.9943, + "step": 3492 + }, + { + "epoch": 0.25201111071029186, + "grad_norm": 1.9629858496802572, + "learning_rate": 3.5048859126238714e-06, + "loss": 0.8378, + "step": 3493 + }, + { + "epoch": 0.2520832581797193, + "grad_norm": 2.722522704955425, + "learning_rate": 3.5045780415306227e-06, + "loss": 0.7946, + "step": 3494 + }, + { + "epoch": 0.25215540564914685, + "grad_norm": 2.0078792338283598, + "learning_rate": 3.504270088277937e-06, + "loss": 0.8802, + "step": 3495 + }, + { + "epoch": 0.25222755311857437, + "grad_norm": 2.3443665774261984, + "learning_rate": 3.50396205288263e-06, + "loss": 0.9384, + "step": 3496 + }, + { + "epoch": 0.2522997005880019, + "grad_norm": 2.574269504075721, + "learning_rate": 3.503653935361524e-06, + "loss": 0.9121, + "step": 3497 + }, + { + "epoch": 0.2523718480574294, + "grad_norm": 2.0321013793811478, + "learning_rate": 3.5033457357314417e-06, + "loss": 1.0181, + "step": 3498 + }, + { + "epoch": 0.25244399552685687, + "grad_norm": 3.057005176194151, + "learning_rate": 3.5030374540092143e-06, + "loss": 0.8751, + "step": 3499 + }, + { + "epoch": 0.2525161429962844, + "grad_norm": 2.737728395035587, + "learning_rate": 3.502729090211676e-06, + "loss": 0.9245, + "step": 3500 + }, + { + "epoch": 0.2525882904657119, + "grad_norm": 3.5220918982938043, + "learning_rate": 3.5024206443556645e-06, + "loss": 0.999, + "step": 3501 + }, + { + "epoch": 0.2526604379351394, + "grad_norm": 2.808306725788189, + "learning_rate": 3.502112116458024e-06, + "loss": 0.9255, + "step": 3502 + }, + { + "epoch": 0.25273258540456695, + "grad_norm": 1.0742967057118724, + "learning_rate": 3.501803506535601e-06, + "loss": 0.7667, + "step": 3503 + }, + { + "epoch": 0.25280473287399446, + "grad_norm": 1.990213770287109, + "learning_rate": 3.5014948146052477e-06, + "loss": 1.0525, + "step": 3504 + }, + { + "epoch": 0.25287688034342193, + "grad_norm": 2.278762297060917, + "learning_rate": 3.5011860406838214e-06, + "loss": 0.9817, + "step": 3505 + }, + { + "epoch": 0.25294902781284945, + "grad_norm": 0.9393346953739754, + "learning_rate": 3.500877184788182e-06, + "loss": 0.8671, + "step": 3506 + }, + { + "epoch": 0.25302117528227697, + "grad_norm": 1.020511375311514, + "learning_rate": 3.500568246935196e-06, + "loss": 0.772, + "step": 3507 + }, + { + "epoch": 0.2530933227517045, + "grad_norm": 2.0821447287833625, + "learning_rate": 3.5002592271417334e-06, + "loss": 0.9489, + "step": 3508 + }, + { + "epoch": 0.253165470221132, + "grad_norm": 2.014505829712477, + "learning_rate": 3.4999501254246678e-06, + "loss": 0.9296, + "step": 3509 + }, + { + "epoch": 0.2532376176905595, + "grad_norm": 2.282141733286543, + "learning_rate": 3.4996409418008782e-06, + "loss": 0.9585, + "step": 3510 + }, + { + "epoch": 0.253309765159987, + "grad_norm": 1.9623821757718938, + "learning_rate": 3.4993316762872486e-06, + "loss": 0.9095, + "step": 3511 + }, + { + "epoch": 0.2533819126294145, + "grad_norm": 2.6116228301120925, + "learning_rate": 3.499022328900666e-06, + "loss": 0.9228, + "step": 3512 + }, + { + "epoch": 0.253454060098842, + "grad_norm": 2.081826262554918, + "learning_rate": 3.4987128996580243e-06, + "loss": 1.0035, + "step": 3513 + }, + { + "epoch": 0.25352620756826955, + "grad_norm": 2.256891716235193, + "learning_rate": 3.498403388576218e-06, + "loss": 0.9849, + "step": 3514 + }, + { + "epoch": 0.25359835503769707, + "grad_norm": 2.2223014883633327, + "learning_rate": 3.49809379567215e-06, + "loss": 1.0282, + "step": 3515 + }, + { + "epoch": 0.2536705025071246, + "grad_norm": 2.076090340076436, + "learning_rate": 3.497784120962726e-06, + "loss": 0.9067, + "step": 3516 + }, + { + "epoch": 0.25374264997655205, + "grad_norm": 5.633990753155406, + "learning_rate": 3.497474364464855e-06, + "loss": 0.8448, + "step": 3517 + }, + { + "epoch": 0.25381479744597957, + "grad_norm": 2.900704784911168, + "learning_rate": 3.497164526195453e-06, + "loss": 0.9777, + "step": 3518 + }, + { + "epoch": 0.2538869449154071, + "grad_norm": 1.971332672552919, + "learning_rate": 3.496854606171438e-06, + "loss": 1.0338, + "step": 3519 + }, + { + "epoch": 0.2539590923848346, + "grad_norm": 0.8024701428429715, + "learning_rate": 3.4965446044097347e-06, + "loss": 0.8332, + "step": 3520 + }, + { + "epoch": 0.2540312398542621, + "grad_norm": 2.391228693115897, + "learning_rate": 3.49623452092727e-06, + "loss": 0.9695, + "step": 3521 + }, + { + "epoch": 0.25410338732368964, + "grad_norm": 2.4995798711619748, + "learning_rate": 3.4959243557409774e-06, + "loss": 1.0301, + "step": 3522 + }, + { + "epoch": 0.2541755347931171, + "grad_norm": 3.1622177963094806, + "learning_rate": 3.4956141088677937e-06, + "loss": 1.0394, + "step": 3523 + }, + { + "epoch": 0.2542476822625446, + "grad_norm": 2.0861708835527595, + "learning_rate": 3.4953037803246596e-06, + "loss": 0.9528, + "step": 3524 + }, + { + "epoch": 0.25431982973197215, + "grad_norm": 2.0500469528031573, + "learning_rate": 3.494993370128522e-06, + "loss": 0.9028, + "step": 3525 + }, + { + "epoch": 0.25439197720139967, + "grad_norm": 1.9952548479490433, + "learning_rate": 3.49468287829633e-06, + "loss": 0.9663, + "step": 3526 + }, + { + "epoch": 0.2544641246708272, + "grad_norm": 3.5542439949117464, + "learning_rate": 3.4943723048450397e-06, + "loss": 0.8937, + "step": 3527 + }, + { + "epoch": 0.2545362721402547, + "grad_norm": 4.47147377074887, + "learning_rate": 3.4940616497916097e-06, + "loss": 1.0117, + "step": 3528 + }, + { + "epoch": 0.25460841960968217, + "grad_norm": 2.3049497293546977, + "learning_rate": 3.493750913153003e-06, + "loss": 0.873, + "step": 3529 + }, + { + "epoch": 0.2546805670791097, + "grad_norm": 2.024939491404041, + "learning_rate": 3.49344009494619e-06, + "loss": 0.9524, + "step": 3530 + }, + { + "epoch": 0.2547527145485372, + "grad_norm": 0.9817214501438047, + "learning_rate": 3.4931291951881415e-06, + "loss": 0.8042, + "step": 3531 + }, + { + "epoch": 0.2548248620179647, + "grad_norm": 2.719025608869514, + "learning_rate": 3.492818213895835e-06, + "loss": 1.0142, + "step": 3532 + }, + { + "epoch": 0.25489700948739225, + "grad_norm": 2.1118948120380043, + "learning_rate": 3.4925071510862515e-06, + "loss": 0.8774, + "step": 3533 + }, + { + "epoch": 0.25496915695681976, + "grad_norm": 2.4061615172131994, + "learning_rate": 3.492196006776378e-06, + "loss": 1.029, + "step": 3534 + }, + { + "epoch": 0.25504130442624723, + "grad_norm": 2.4105546767263557, + "learning_rate": 3.4918847809832043e-06, + "loss": 0.9228, + "step": 3535 + }, + { + "epoch": 0.25511345189567475, + "grad_norm": 2.0949589669299296, + "learning_rate": 3.4915734737237257e-06, + "loss": 1.0413, + "step": 3536 + }, + { + "epoch": 0.25518559936510227, + "grad_norm": 2.336080591418466, + "learning_rate": 3.4912620850149416e-06, + "loss": 0.9276, + "step": 3537 + }, + { + "epoch": 0.2552577468345298, + "grad_norm": 2.5727541379264225, + "learning_rate": 3.4909506148738546e-06, + "loss": 0.9539, + "step": 3538 + }, + { + "epoch": 0.2553298943039573, + "grad_norm": 2.368602971569163, + "learning_rate": 3.4906390633174743e-06, + "loss": 0.8727, + "step": 3539 + }, + { + "epoch": 0.2554020417733848, + "grad_norm": 2.527428178983659, + "learning_rate": 3.4903274303628135e-06, + "loss": 1.036, + "step": 3540 + }, + { + "epoch": 0.2554741892428123, + "grad_norm": 1.7175722421836441, + "learning_rate": 3.490015716026888e-06, + "loss": 0.9979, + "step": 3541 + }, + { + "epoch": 0.2555463367122398, + "grad_norm": 2.3615848726669397, + "learning_rate": 3.48970392032672e-06, + "loss": 0.9843, + "step": 3542 + }, + { + "epoch": 0.2556184841816673, + "grad_norm": 2.2946176660076287, + "learning_rate": 3.489392043279336e-06, + "loss": 1.1135, + "step": 3543 + }, + { + "epoch": 0.25569063165109485, + "grad_norm": 3.08488457981633, + "learning_rate": 3.489080084901766e-06, + "loss": 0.8793, + "step": 3544 + }, + { + "epoch": 0.25576277912052237, + "grad_norm": 1.588793508442643, + "learning_rate": 3.488768045211045e-06, + "loss": 0.9731, + "step": 3545 + }, + { + "epoch": 0.2558349265899499, + "grad_norm": 1.7677815188572048, + "learning_rate": 3.488455924224212e-06, + "loss": 0.9329, + "step": 3546 + }, + { + "epoch": 0.25590707405937735, + "grad_norm": 1.9757321992323493, + "learning_rate": 3.4881437219583117e-06, + "loss": 0.8932, + "step": 3547 + }, + { + "epoch": 0.25597922152880487, + "grad_norm": 2.1837892939542964, + "learning_rate": 3.4878314384303908e-06, + "loss": 0.8998, + "step": 3548 + }, + { + "epoch": 0.2560513689982324, + "grad_norm": 3.3144042101944486, + "learning_rate": 3.4875190736575035e-06, + "loss": 0.9987, + "step": 3549 + }, + { + "epoch": 0.2561235164676599, + "grad_norm": 2.3499727166397943, + "learning_rate": 3.4872066276567058e-06, + "loss": 0.8874, + "step": 3550 + }, + { + "epoch": 0.2561956639370874, + "grad_norm": 2.76798083297069, + "learning_rate": 3.4868941004450597e-06, + "loss": 0.9353, + "step": 3551 + }, + { + "epoch": 0.2562678114065149, + "grad_norm": 1.812673823473094, + "learning_rate": 3.4865814920396314e-06, + "loss": 0.9437, + "step": 3552 + }, + { + "epoch": 0.2563399588759424, + "grad_norm": 2.020952031166047, + "learning_rate": 3.4862688024574906e-06, + "loss": 0.9527, + "step": 3553 + }, + { + "epoch": 0.25641210634536993, + "grad_norm": 2.2064826215919324, + "learning_rate": 3.4859560317157124e-06, + "loss": 1.0081, + "step": 3554 + }, + { + "epoch": 0.25648425381479745, + "grad_norm": 2.035685582066146, + "learning_rate": 3.4856431798313757e-06, + "loss": 0.9661, + "step": 3555 + }, + { + "epoch": 0.25655640128422497, + "grad_norm": 1.9876660067727627, + "learning_rate": 3.4853302468215653e-06, + "loss": 0.9729, + "step": 3556 + }, + { + "epoch": 0.2566285487536525, + "grad_norm": 2.23706053355291, + "learning_rate": 3.485017232703369e-06, + "loss": 0.8932, + "step": 3557 + }, + { + "epoch": 0.25670069622307995, + "grad_norm": 2.4229574430210112, + "learning_rate": 3.484704137493878e-06, + "loss": 0.85, + "step": 3558 + }, + { + "epoch": 0.25677284369250747, + "grad_norm": 1.13456942360755, + "learning_rate": 3.4843909612101906e-06, + "loss": 0.8735, + "step": 3559 + }, + { + "epoch": 0.256844991161935, + "grad_norm": 2.2593431399303197, + "learning_rate": 3.484077703869408e-06, + "loss": 0.9189, + "step": 3560 + }, + { + "epoch": 0.2569171386313625, + "grad_norm": 3.0235999406257683, + "learning_rate": 3.4837643654886357e-06, + "loss": 0.9486, + "step": 3561 + }, + { + "epoch": 0.25698928610079, + "grad_norm": 3.510475829821053, + "learning_rate": 3.4834509460849844e-06, + "loss": 1.0002, + "step": 3562 + }, + { + "epoch": 0.25706143357021755, + "grad_norm": 2.7201425778662593, + "learning_rate": 3.4831374456755684e-06, + "loss": 0.8745, + "step": 3563 + }, + { + "epoch": 0.257133581039645, + "grad_norm": 2.70008873617091, + "learning_rate": 3.482823864277507e-06, + "loss": 1.0217, + "step": 3564 + }, + { + "epoch": 0.25720572850907253, + "grad_norm": 2.735894876144811, + "learning_rate": 3.482510201907923e-06, + "loss": 1.0202, + "step": 3565 + }, + { + "epoch": 0.25727787597850005, + "grad_norm": 4.458861394014202, + "learning_rate": 3.4821964585839453e-06, + "loss": 1.0076, + "step": 3566 + }, + { + "epoch": 0.25735002344792757, + "grad_norm": 2.020796772209906, + "learning_rate": 3.481882634322706e-06, + "loss": 0.8265, + "step": 3567 + }, + { + "epoch": 0.2574221709173551, + "grad_norm": 3.1370627474724806, + "learning_rate": 3.4815687291413416e-06, + "loss": 0.7737, + "step": 3568 + }, + { + "epoch": 0.2574943183867826, + "grad_norm": 3.419020206575801, + "learning_rate": 3.4812547430569933e-06, + "loss": 0.9692, + "step": 3569 + }, + { + "epoch": 0.25756646585621007, + "grad_norm": 3.152443885394556, + "learning_rate": 3.480940676086807e-06, + "loss": 0.8749, + "step": 3570 + }, + { + "epoch": 0.2576386133256376, + "grad_norm": 1.780672213891483, + "learning_rate": 3.480626528247933e-06, + "loss": 0.9149, + "step": 3571 + }, + { + "epoch": 0.2577107607950651, + "grad_norm": 3.19006588452264, + "learning_rate": 3.4803122995575258e-06, + "loss": 0.909, + "step": 3572 + }, + { + "epoch": 0.2577829082644926, + "grad_norm": 2.130583552088587, + "learning_rate": 3.4799979900327427e-06, + "loss": 0.9649, + "step": 3573 + }, + { + "epoch": 0.25785505573392015, + "grad_norm": 3.238921135606834, + "learning_rate": 3.479683599690749e-06, + "loss": 0.9306, + "step": 3574 + }, + { + "epoch": 0.25792720320334767, + "grad_norm": 2.1349127074002334, + "learning_rate": 3.4793691285487112e-06, + "loss": 0.9767, + "step": 3575 + }, + { + "epoch": 0.25799935067277513, + "grad_norm": 2.2794191512362763, + "learning_rate": 3.4790545766238015e-06, + "loss": 1.0379, + "step": 3576 + }, + { + "epoch": 0.25807149814220265, + "grad_norm": 2.0759036015526955, + "learning_rate": 3.478739943933197e-06, + "loss": 0.9142, + "step": 3577 + }, + { + "epoch": 0.25814364561163017, + "grad_norm": 2.1236014532326046, + "learning_rate": 3.478425230494078e-06, + "loss": 0.9198, + "step": 3578 + }, + { + "epoch": 0.2582157930810577, + "grad_norm": 2.600870401235836, + "learning_rate": 3.4781104363236305e-06, + "loss": 1.1122, + "step": 3579 + }, + { + "epoch": 0.2582879405504852, + "grad_norm": 2.4966029929719706, + "learning_rate": 3.477795561439043e-06, + "loss": 0.9853, + "step": 3580 + }, + { + "epoch": 0.2583600880199127, + "grad_norm": 3.3677849117924925, + "learning_rate": 3.4774806058575116e-06, + "loss": 0.9276, + "step": 3581 + }, + { + "epoch": 0.2584322354893402, + "grad_norm": 2.022943738395455, + "learning_rate": 3.477165569596233e-06, + "loss": 0.9989, + "step": 3582 + }, + { + "epoch": 0.2585043829587677, + "grad_norm": 2.20052405964692, + "learning_rate": 3.476850452672411e-06, + "loss": 0.9755, + "step": 3583 + }, + { + "epoch": 0.25857653042819523, + "grad_norm": 0.8809831282811147, + "learning_rate": 3.476535255103253e-06, + "loss": 0.8132, + "step": 3584 + }, + { + "epoch": 0.25864867789762275, + "grad_norm": 2.0761711859858925, + "learning_rate": 3.476219976905971e-06, + "loss": 0.8979, + "step": 3585 + }, + { + "epoch": 0.25872082536705027, + "grad_norm": 2.988678390940567, + "learning_rate": 3.4759046180977807e-06, + "loss": 1.0024, + "step": 3586 + }, + { + "epoch": 0.2587929728364778, + "grad_norm": 0.7948603125875605, + "learning_rate": 3.4755891786959025e-06, + "loss": 0.8418, + "step": 3587 + }, + { + "epoch": 0.25886512030590525, + "grad_norm": 3.9327583221353315, + "learning_rate": 3.475273658717562e-06, + "loss": 0.9425, + "step": 3588 + }, + { + "epoch": 0.25893726777533277, + "grad_norm": 2.1566147426661333, + "learning_rate": 3.4749580581799876e-06, + "loss": 0.942, + "step": 3589 + }, + { + "epoch": 0.2590094152447603, + "grad_norm": 4.764116692664877, + "learning_rate": 3.474642377100415e-06, + "loss": 0.9829, + "step": 3590 + }, + { + "epoch": 0.2590815627141878, + "grad_norm": 2.4375703263530157, + "learning_rate": 3.47432661549608e-06, + "loss": 0.8714, + "step": 3591 + }, + { + "epoch": 0.2591537101836153, + "grad_norm": 2.4051534953473634, + "learning_rate": 3.4740107733842265e-06, + "loss": 0.8165, + "step": 3592 + }, + { + "epoch": 0.25922585765304285, + "grad_norm": 1.8992490564373847, + "learning_rate": 3.4736948507821016e-06, + "loss": 0.8855, + "step": 3593 + }, + { + "epoch": 0.2592980051224703, + "grad_norm": 2.114749194125094, + "learning_rate": 3.4733788477069562e-06, + "loss": 0.9883, + "step": 3594 + }, + { + "epoch": 0.25937015259189783, + "grad_norm": 2.1938745533941253, + "learning_rate": 3.4730627641760463e-06, + "loss": 0.9586, + "step": 3595 + }, + { + "epoch": 0.25944230006132535, + "grad_norm": 1.916207928976431, + "learning_rate": 3.4727466002066323e-06, + "loss": 0.9667, + "step": 3596 + }, + { + "epoch": 0.25951444753075287, + "grad_norm": 2.690021751202421, + "learning_rate": 3.4724303558159774e-06, + "loss": 1.0507, + "step": 3597 + }, + { + "epoch": 0.2595865950001804, + "grad_norm": 2.5843832179611157, + "learning_rate": 3.472114031021352e-06, + "loss": 0.9444, + "step": 3598 + }, + { + "epoch": 0.2596587424696079, + "grad_norm": 2.5332836882428285, + "learning_rate": 3.471797625840029e-06, + "loss": 0.9911, + "step": 3599 + }, + { + "epoch": 0.25973088993903537, + "grad_norm": 2.814680568127055, + "learning_rate": 3.4714811402892866e-06, + "loss": 0.9653, + "step": 3600 + }, + { + "epoch": 0.2598030374084629, + "grad_norm": 2.848987660556367, + "learning_rate": 3.4711645743864058e-06, + "loss": 0.8599, + "step": 3601 + }, + { + "epoch": 0.2598751848778904, + "grad_norm": 2.1982438143805063, + "learning_rate": 3.4708479281486734e-06, + "loss": 0.9194, + "step": 3602 + }, + { + "epoch": 0.2599473323473179, + "grad_norm": 2.1289514939286374, + "learning_rate": 3.4705312015933814e-06, + "loss": 0.9658, + "step": 3603 + }, + { + "epoch": 0.26001947981674545, + "grad_norm": 3.7057282516396155, + "learning_rate": 3.470214394737823e-06, + "loss": 0.9063, + "step": 3604 + }, + { + "epoch": 0.2600916272861729, + "grad_norm": 3.1822686743438946, + "learning_rate": 3.4698975075993e-06, + "loss": 1.0241, + "step": 3605 + }, + { + "epoch": 0.26016377475560043, + "grad_norm": 1.966938153887318, + "learning_rate": 3.469580540195115e-06, + "loss": 1.0106, + "step": 3606 + }, + { + "epoch": 0.26023592222502795, + "grad_norm": 2.5160224553184145, + "learning_rate": 3.469263492542577e-06, + "loss": 0.9517, + "step": 3607 + }, + { + "epoch": 0.26030806969445547, + "grad_norm": 2.358491947065705, + "learning_rate": 3.468946364658999e-06, + "loss": 0.9701, + "step": 3608 + }, + { + "epoch": 0.260380217163883, + "grad_norm": 7.243245628775096, + "learning_rate": 3.4686291565616976e-06, + "loss": 0.977, + "step": 3609 + }, + { + "epoch": 0.2604523646333105, + "grad_norm": 2.952985326299694, + "learning_rate": 3.468311868267994e-06, + "loss": 0.903, + "step": 3610 + }, + { + "epoch": 0.26052451210273797, + "grad_norm": 1.7357230231655547, + "learning_rate": 3.467994499795216e-06, + "loss": 0.9159, + "step": 3611 + }, + { + "epoch": 0.2605966595721655, + "grad_norm": 2.2959425713870063, + "learning_rate": 3.467677051160692e-06, + "loss": 1.0097, + "step": 3612 + }, + { + "epoch": 0.260668807041593, + "grad_norm": 3.4004573738781816, + "learning_rate": 3.4673595223817573e-06, + "loss": 1.0522, + "step": 3613 + }, + { + "epoch": 0.26074095451102053, + "grad_norm": 2.6120090552636017, + "learning_rate": 3.4670419134757516e-06, + "loss": 1.0544, + "step": 3614 + }, + { + "epoch": 0.26081310198044805, + "grad_norm": 2.538049208113664, + "learning_rate": 3.4667242244600176e-06, + "loss": 1.0239, + "step": 3615 + }, + { + "epoch": 0.26088524944987557, + "grad_norm": 0.9477223661776818, + "learning_rate": 3.466406455351903e-06, + "loss": 0.8239, + "step": 3616 + }, + { + "epoch": 0.26095739691930303, + "grad_norm": 2.7422640102396176, + "learning_rate": 3.4660886061687604e-06, + "loss": 0.9022, + "step": 3617 + }, + { + "epoch": 0.26102954438873055, + "grad_norm": 2.360146503750358, + "learning_rate": 3.4657706769279465e-06, + "loss": 0.9681, + "step": 3618 + }, + { + "epoch": 0.26110169185815807, + "grad_norm": 2.17539758227334, + "learning_rate": 3.465452667646822e-06, + "loss": 0.8862, + "step": 3619 + }, + { + "epoch": 0.2611738393275856, + "grad_norm": 1.900674200904513, + "learning_rate": 3.4651345783427522e-06, + "loss": 0.8944, + "step": 3620 + }, + { + "epoch": 0.2612459867970131, + "grad_norm": 3.169716537509564, + "learning_rate": 3.4648164090331067e-06, + "loss": 0.9896, + "step": 3621 + }, + { + "epoch": 0.2613181342664406, + "grad_norm": 2.742999790660309, + "learning_rate": 3.4644981597352604e-06, + "loss": 0.916, + "step": 3622 + }, + { + "epoch": 0.2613902817358681, + "grad_norm": 2.760003794377248, + "learning_rate": 3.4641798304665903e-06, + "loss": 0.871, + "step": 3623 + }, + { + "epoch": 0.2614624292052956, + "grad_norm": 2.24760415418261, + "learning_rate": 3.4638614212444797e-06, + "loss": 0.8921, + "step": 3624 + }, + { + "epoch": 0.26153457667472313, + "grad_norm": 2.0464184666880825, + "learning_rate": 3.4635429320863163e-06, + "loss": 0.9489, + "step": 3625 + }, + { + "epoch": 0.26160672414415065, + "grad_norm": 2.0357337176320494, + "learning_rate": 3.463224363009491e-06, + "loss": 0.9985, + "step": 3626 + }, + { + "epoch": 0.26167887161357817, + "grad_norm": 2.231071003377758, + "learning_rate": 3.4629057140314e-06, + "loss": 1.0067, + "step": 3627 + }, + { + "epoch": 0.2617510190830057, + "grad_norm": 2.8157932921316635, + "learning_rate": 3.4625869851694437e-06, + "loss": 0.903, + "step": 3628 + }, + { + "epoch": 0.26182316655243315, + "grad_norm": 2.699597123782974, + "learning_rate": 3.462268176441026e-06, + "loss": 0.9132, + "step": 3629 + }, + { + "epoch": 0.26189531402186067, + "grad_norm": 1.9606981473343434, + "learning_rate": 3.4619492878635565e-06, + "loss": 0.9278, + "step": 3630 + }, + { + "epoch": 0.2619674614912882, + "grad_norm": 2.8678213972718716, + "learning_rate": 3.461630319454448e-06, + "loss": 0.905, + "step": 3631 + }, + { + "epoch": 0.2620396089607157, + "grad_norm": 3.291539942237556, + "learning_rate": 3.4613112712311194e-06, + "loss": 0.8876, + "step": 3632 + }, + { + "epoch": 0.26211175643014323, + "grad_norm": 2.253084611464571, + "learning_rate": 3.460992143210991e-06, + "loss": 0.9766, + "step": 3633 + }, + { + "epoch": 0.26218390389957075, + "grad_norm": 2.303530955671969, + "learning_rate": 3.460672935411491e-06, + "loss": 0.8979, + "step": 3634 + }, + { + "epoch": 0.2622560513689982, + "grad_norm": 3.7591845888288304, + "learning_rate": 3.4603536478500484e-06, + "loss": 1.0092, + "step": 3635 + }, + { + "epoch": 0.26232819883842573, + "grad_norm": 2.239448283064003, + "learning_rate": 3.460034280544099e-06, + "loss": 0.9744, + "step": 3636 + }, + { + "epoch": 0.26240034630785325, + "grad_norm": 3.785199207420391, + "learning_rate": 3.459714833511083e-06, + "loss": 0.8579, + "step": 3637 + }, + { + "epoch": 0.26247249377728077, + "grad_norm": 3.3694934428035603, + "learning_rate": 3.4593953067684435e-06, + "loss": 0.9421, + "step": 3638 + }, + { + "epoch": 0.2625446412467083, + "grad_norm": 1.957872525913848, + "learning_rate": 3.4590757003336285e-06, + "loss": 0.8649, + "step": 3639 + }, + { + "epoch": 0.2626167887161358, + "grad_norm": 3.175906621093536, + "learning_rate": 3.458756014224091e-06, + "loss": 0.9964, + "step": 3640 + }, + { + "epoch": 0.26268893618556327, + "grad_norm": 2.5845598775104848, + "learning_rate": 3.458436248457288e-06, + "loss": 0.8567, + "step": 3641 + }, + { + "epoch": 0.2627610836549908, + "grad_norm": 1.9719959324723209, + "learning_rate": 3.4581164030506804e-06, + "loss": 1.0377, + "step": 3642 + }, + { + "epoch": 0.2628332311244183, + "grad_norm": 3.044403012389007, + "learning_rate": 3.457796478021733e-06, + "loss": 0.8704, + "step": 3643 + }, + { + "epoch": 0.26290537859384583, + "grad_norm": 3.31554776460393, + "learning_rate": 3.457476473387917e-06, + "loss": 0.9516, + "step": 3644 + }, + { + "epoch": 0.26297752606327335, + "grad_norm": 2.6286232783256462, + "learning_rate": 3.4571563891667065e-06, + "loss": 0.9349, + "step": 3645 + }, + { + "epoch": 0.26304967353270087, + "grad_norm": 0.8056179333976925, + "learning_rate": 3.4568362253755796e-06, + "loss": 0.8054, + "step": 3646 + }, + { + "epoch": 0.26312182100212833, + "grad_norm": 2.641971312645086, + "learning_rate": 3.4565159820320196e-06, + "loss": 0.9105, + "step": 3647 + }, + { + "epoch": 0.26319396847155585, + "grad_norm": 3.4682836820564074, + "learning_rate": 3.456195659153513e-06, + "loss": 0.9466, + "step": 3648 + }, + { + "epoch": 0.26326611594098337, + "grad_norm": 1.9155290240441585, + "learning_rate": 3.4558752567575526e-06, + "loss": 0.9463, + "step": 3649 + }, + { + "epoch": 0.2633382634104109, + "grad_norm": 2.0182931672479993, + "learning_rate": 3.4555547748616337e-06, + "loss": 1.0104, + "step": 3650 + }, + { + "epoch": 0.2634104108798384, + "grad_norm": 2.002773745681624, + "learning_rate": 3.455234213483257e-06, + "loss": 0.8981, + "step": 3651 + }, + { + "epoch": 0.26348255834926587, + "grad_norm": 2.154347451135973, + "learning_rate": 3.4549135726399273e-06, + "loss": 0.9724, + "step": 3652 + }, + { + "epoch": 0.2635547058186934, + "grad_norm": 2.0545801635090593, + "learning_rate": 3.4545928523491527e-06, + "loss": 0.9035, + "step": 3653 + }, + { + "epoch": 0.2636268532881209, + "grad_norm": 2.243644958417524, + "learning_rate": 3.454272052628447e-06, + "loss": 0.8705, + "step": 3654 + }, + { + "epoch": 0.26369900075754843, + "grad_norm": 2.5199091664852578, + "learning_rate": 3.4539511734953285e-06, + "loss": 0.9844, + "step": 3655 + }, + { + "epoch": 0.26377114822697595, + "grad_norm": 2.294737567399563, + "learning_rate": 3.453630214967319e-06, + "loss": 1.0294, + "step": 3656 + }, + { + "epoch": 0.26384329569640347, + "grad_norm": 2.21415470983885, + "learning_rate": 3.4533091770619446e-06, + "loss": 0.9842, + "step": 3657 + }, + { + "epoch": 0.26391544316583093, + "grad_norm": 2.17578957794075, + "learning_rate": 3.452988059796736e-06, + "loss": 0.9383, + "step": 3658 + }, + { + "epoch": 0.26398759063525845, + "grad_norm": 2.112666860457452, + "learning_rate": 3.4526668631892273e-06, + "loss": 1.0048, + "step": 3659 + }, + { + "epoch": 0.26405973810468597, + "grad_norm": 2.5018332435540636, + "learning_rate": 3.4523455872569596e-06, + "loss": 0.9162, + "step": 3660 + }, + { + "epoch": 0.2641318855741135, + "grad_norm": 2.290006891931549, + "learning_rate": 3.4520242320174755e-06, + "loss": 0.9559, + "step": 3661 + }, + { + "epoch": 0.264204033043541, + "grad_norm": 2.1840491186843596, + "learning_rate": 3.451702797488323e-06, + "loss": 0.9771, + "step": 3662 + }, + { + "epoch": 0.26427618051296853, + "grad_norm": 2.0274481774044304, + "learning_rate": 3.4513812836870556e-06, + "loss": 0.9563, + "step": 3663 + }, + { + "epoch": 0.264348327982396, + "grad_norm": 1.8556597480848238, + "learning_rate": 3.451059690631229e-06, + "loss": 0.8503, + "step": 3664 + }, + { + "epoch": 0.2644204754518235, + "grad_norm": 2.117165765967492, + "learning_rate": 3.450738018338404e-06, + "loss": 0.9627, + "step": 3665 + }, + { + "epoch": 0.26449262292125103, + "grad_norm": 2.4119603161180576, + "learning_rate": 3.450416266826146e-06, + "loss": 0.9156, + "step": 3666 + }, + { + "epoch": 0.26456477039067855, + "grad_norm": 2.440004578101444, + "learning_rate": 3.4500944361120256e-06, + "loss": 0.9298, + "step": 3667 + }, + { + "epoch": 0.26463691786010607, + "grad_norm": 1.9037807293301012, + "learning_rate": 3.4497725262136154e-06, + "loss": 0.9535, + "step": 3668 + }, + { + "epoch": 0.2647090653295336, + "grad_norm": 2.272720590928399, + "learning_rate": 3.449450537148495e-06, + "loss": 0.93, + "step": 3669 + }, + { + "epoch": 0.26478121279896105, + "grad_norm": 1.9263928043967733, + "learning_rate": 3.449128468934246e-06, + "loss": 0.8822, + "step": 3670 + }, + { + "epoch": 0.26485336026838857, + "grad_norm": 2.330206842696273, + "learning_rate": 3.448806321588456e-06, + "loss": 0.8809, + "step": 3671 + }, + { + "epoch": 0.2649255077378161, + "grad_norm": 2.124886229218304, + "learning_rate": 3.448484095128716e-06, + "loss": 0.8929, + "step": 3672 + }, + { + "epoch": 0.2649976552072436, + "grad_norm": 1.9156681099501183, + "learning_rate": 3.448161789572622e-06, + "loss": 0.958, + "step": 3673 + }, + { + "epoch": 0.26506980267667113, + "grad_norm": 2.527658434089509, + "learning_rate": 3.447839404937773e-06, + "loss": 1.0285, + "step": 3674 + }, + { + "epoch": 0.26514195014609865, + "grad_norm": 2.223933009078652, + "learning_rate": 3.4475169412417745e-06, + "loss": 0.9598, + "step": 3675 + }, + { + "epoch": 0.2652140976155261, + "grad_norm": 1.96558640628643, + "learning_rate": 3.447194398502234e-06, + "loss": 1.0327, + "step": 3676 + }, + { + "epoch": 0.26528624508495363, + "grad_norm": 5.413881769507807, + "learning_rate": 3.446871776736764e-06, + "loss": 0.8908, + "step": 3677 + }, + { + "epoch": 0.26535839255438115, + "grad_norm": 2.350006805572395, + "learning_rate": 3.4465490759629835e-06, + "loss": 0.8594, + "step": 3678 + }, + { + "epoch": 0.26543054002380867, + "grad_norm": 2.2424397077503575, + "learning_rate": 3.446226296198513e-06, + "loss": 1.0521, + "step": 3679 + }, + { + "epoch": 0.2655026874932362, + "grad_norm": 0.8258199821634986, + "learning_rate": 3.4459034374609773e-06, + "loss": 0.8214, + "step": 3680 + }, + { + "epoch": 0.2655748349626637, + "grad_norm": 2.244439140896884, + "learning_rate": 3.445580499768008e-06, + "loss": 0.9887, + "step": 3681 + }, + { + "epoch": 0.26564698243209117, + "grad_norm": 1.9896502325451129, + "learning_rate": 3.445257483137239e-06, + "loss": 1.0264, + "step": 3682 + }, + { + "epoch": 0.2657191299015187, + "grad_norm": 3.866806967152074, + "learning_rate": 3.4449343875863087e-06, + "loss": 0.8788, + "step": 3683 + }, + { + "epoch": 0.2657912773709462, + "grad_norm": 2.0486941807458536, + "learning_rate": 3.4446112131328604e-06, + "loss": 0.9877, + "step": 3684 + }, + { + "epoch": 0.26586342484037373, + "grad_norm": 1.977358691619886, + "learning_rate": 3.444287959794542e-06, + "loss": 0.9367, + "step": 3685 + }, + { + "epoch": 0.26593557230980125, + "grad_norm": 2.9103415615810317, + "learning_rate": 3.4439646275890044e-06, + "loss": 1.0513, + "step": 3686 + }, + { + "epoch": 0.26600771977922877, + "grad_norm": 2.2046942566809338, + "learning_rate": 3.443641216533904e-06, + "loss": 0.9056, + "step": 3687 + }, + { + "epoch": 0.26607986724865623, + "grad_norm": 2.535709361910117, + "learning_rate": 3.4433177266469005e-06, + "loss": 0.96, + "step": 3688 + }, + { + "epoch": 0.26615201471808375, + "grad_norm": 0.6879678997976303, + "learning_rate": 3.4429941579456595e-06, + "loss": 0.7742, + "step": 3689 + }, + { + "epoch": 0.26622416218751127, + "grad_norm": 2.6331207366626344, + "learning_rate": 3.442670510447849e-06, + "loss": 0.9986, + "step": 3690 + }, + { + "epoch": 0.2662963096569388, + "grad_norm": 2.9189531537407225, + "learning_rate": 3.4423467841711425e-06, + "loss": 1.0084, + "step": 3691 + }, + { + "epoch": 0.2663684571263663, + "grad_norm": 2.115231331693621, + "learning_rate": 3.442022979133218e-06, + "loss": 0.9958, + "step": 3692 + }, + { + "epoch": 0.26644060459579383, + "grad_norm": 0.9333803579427755, + "learning_rate": 3.441699095351756e-06, + "loss": 0.8685, + "step": 3693 + }, + { + "epoch": 0.2665127520652213, + "grad_norm": 2.2955171892423674, + "learning_rate": 3.4413751328444438e-06, + "loss": 0.7528, + "step": 3694 + }, + { + "epoch": 0.2665848995346488, + "grad_norm": 2.3759238805233767, + "learning_rate": 3.4410510916289716e-06, + "loss": 1.0093, + "step": 3695 + }, + { + "epoch": 0.26665704700407633, + "grad_norm": 2.891028958524262, + "learning_rate": 3.4407269717230333e-06, + "loss": 0.9444, + "step": 3696 + }, + { + "epoch": 0.26672919447350385, + "grad_norm": 2.4345655139029625, + "learning_rate": 3.4404027731443284e-06, + "loss": 0.9273, + "step": 3697 + }, + { + "epoch": 0.26680134194293137, + "grad_norm": 3.06952779511927, + "learning_rate": 3.440078495910561e-06, + "loss": 0.9383, + "step": 3698 + }, + { + "epoch": 0.2668734894123589, + "grad_norm": 2.2390651961082098, + "learning_rate": 3.4397541400394376e-06, + "loss": 0.8702, + "step": 3699 + }, + { + "epoch": 0.26694563688178635, + "grad_norm": 2.8020107916951575, + "learning_rate": 3.4394297055486704e-06, + "loss": 0.9187, + "step": 3700 + }, + { + "epoch": 0.26701778435121387, + "grad_norm": 1.657174194443656, + "learning_rate": 3.4391051924559756e-06, + "loss": 1.0366, + "step": 3701 + }, + { + "epoch": 0.2670899318206414, + "grad_norm": 1.9408381223749172, + "learning_rate": 3.4387806007790733e-06, + "loss": 0.9333, + "step": 3702 + }, + { + "epoch": 0.2671620792900689, + "grad_norm": 2.6608814925763604, + "learning_rate": 3.4384559305356893e-06, + "loss": 1.0501, + "step": 3703 + }, + { + "epoch": 0.26723422675949643, + "grad_norm": 2.4955844986493005, + "learning_rate": 3.438131181743551e-06, + "loss": 0.982, + "step": 3704 + }, + { + "epoch": 0.2673063742289239, + "grad_norm": 0.8921643389419622, + "learning_rate": 3.437806354420393e-06, + "loss": 0.9002, + "step": 3705 + }, + { + "epoch": 0.2673785216983514, + "grad_norm": 1.996396752817291, + "learning_rate": 3.437481448583953e-06, + "loss": 0.9276, + "step": 3706 + }, + { + "epoch": 0.26745066916777893, + "grad_norm": 2.069292500982005, + "learning_rate": 3.437156464251972e-06, + "loss": 0.9939, + "step": 3707 + }, + { + "epoch": 0.26752281663720645, + "grad_norm": 4.678317014210262, + "learning_rate": 3.4368314014421962e-06, + "loss": 0.8605, + "step": 3708 + }, + { + "epoch": 0.26759496410663397, + "grad_norm": 2.1045131461035833, + "learning_rate": 3.4365062601723774e-06, + "loss": 0.89, + "step": 3709 + }, + { + "epoch": 0.2676671115760615, + "grad_norm": 1.824111445209606, + "learning_rate": 3.4361810404602693e-06, + "loss": 0.9788, + "step": 3710 + }, + { + "epoch": 0.26773925904548895, + "grad_norm": 2.6753227787825167, + "learning_rate": 3.435855742323631e-06, + "loss": 1.0113, + "step": 3711 + }, + { + "epoch": 0.2678114065149165, + "grad_norm": 2.461783902375395, + "learning_rate": 3.4355303657802265e-06, + "loss": 0.9247, + "step": 3712 + }, + { + "epoch": 0.267883553984344, + "grad_norm": 3.1496435841752675, + "learning_rate": 3.435204910847822e-06, + "loss": 0.9612, + "step": 3713 + }, + { + "epoch": 0.2679557014537715, + "grad_norm": 2.584918694462637, + "learning_rate": 3.4348793775441907e-06, + "loss": 0.9416, + "step": 3714 + }, + { + "epoch": 0.26802784892319903, + "grad_norm": 4.938524164140518, + "learning_rate": 3.4345537658871086e-06, + "loss": 0.9473, + "step": 3715 + }, + { + "epoch": 0.26809999639262655, + "grad_norm": 2.4967949349525265, + "learning_rate": 3.434228075894355e-06, + "loss": 0.8445, + "step": 3716 + }, + { + "epoch": 0.268172143862054, + "grad_norm": 2.8116180732596616, + "learning_rate": 3.4339023075837165e-06, + "loss": 0.8931, + "step": 3717 + }, + { + "epoch": 0.26824429133148153, + "grad_norm": 2.4476237236757195, + "learning_rate": 3.4335764609729806e-06, + "loss": 0.9485, + "step": 3718 + }, + { + "epoch": 0.26831643880090905, + "grad_norm": 3.541658842788731, + "learning_rate": 3.4332505360799412e-06, + "loss": 0.9224, + "step": 3719 + }, + { + "epoch": 0.26838858627033657, + "grad_norm": 1.9489710465027954, + "learning_rate": 3.432924532922396e-06, + "loss": 0.8518, + "step": 3720 + }, + { + "epoch": 0.2684607337397641, + "grad_norm": 2.5344028382001262, + "learning_rate": 3.4325984515181457e-06, + "loss": 0.9209, + "step": 3721 + }, + { + "epoch": 0.2685328812091916, + "grad_norm": 1.9785694894490962, + "learning_rate": 3.432272291884998e-06, + "loss": 0.9213, + "step": 3722 + }, + { + "epoch": 0.2686050286786191, + "grad_norm": 2.230506375558306, + "learning_rate": 3.4319460540407623e-06, + "loss": 0.966, + "step": 3723 + }, + { + "epoch": 0.2686771761480466, + "grad_norm": 1.7239169051464196, + "learning_rate": 3.4316197380032536e-06, + "loss": 0.9737, + "step": 3724 + }, + { + "epoch": 0.2687493236174741, + "grad_norm": 1.8799047533216355, + "learning_rate": 3.4312933437902904e-06, + "loss": 1.0044, + "step": 3725 + }, + { + "epoch": 0.26882147108690163, + "grad_norm": 4.6345147902124815, + "learning_rate": 3.4309668714196964e-06, + "loss": 0.8429, + "step": 3726 + }, + { + "epoch": 0.26889361855632915, + "grad_norm": 2.660835257877076, + "learning_rate": 3.4306403209092983e-06, + "loss": 0.8867, + "step": 3727 + }, + { + "epoch": 0.26896576602575667, + "grad_norm": 2.954439385384168, + "learning_rate": 3.4303136922769295e-06, + "loss": 0.9333, + "step": 3728 + }, + { + "epoch": 0.26903791349518413, + "grad_norm": 1.6929791121436386, + "learning_rate": 3.429986985540424e-06, + "loss": 1.0218, + "step": 3729 + }, + { + "epoch": 0.26911006096461165, + "grad_norm": 2.260208178865456, + "learning_rate": 3.429660200717623e-06, + "loss": 0.9147, + "step": 3730 + }, + { + "epoch": 0.26918220843403917, + "grad_norm": 2.7476062760351803, + "learning_rate": 3.429333337826371e-06, + "loss": 0.9934, + "step": 3731 + }, + { + "epoch": 0.2692543559034667, + "grad_norm": 2.221162800448493, + "learning_rate": 3.429006396884516e-06, + "loss": 0.969, + "step": 3732 + }, + { + "epoch": 0.2693265033728942, + "grad_norm": 2.4462673285274548, + "learning_rate": 3.4286793779099124e-06, + "loss": 0.9957, + "step": 3733 + }, + { + "epoch": 0.26939865084232173, + "grad_norm": 2.0850198277324257, + "learning_rate": 3.4283522809204163e-06, + "loss": 1.0113, + "step": 3734 + }, + { + "epoch": 0.2694707983117492, + "grad_norm": 3.8339628587256036, + "learning_rate": 3.4280251059338898e-06, + "loss": 0.9379, + "step": 3735 + }, + { + "epoch": 0.2695429457811767, + "grad_norm": 2.0259799375830485, + "learning_rate": 3.427697852968199e-06, + "loss": 0.9733, + "step": 3736 + }, + { + "epoch": 0.26961509325060423, + "grad_norm": 2.0491197231273417, + "learning_rate": 3.4273705220412137e-06, + "loss": 0.8803, + "step": 3737 + }, + { + "epoch": 0.26968724072003175, + "grad_norm": 2.6512288825000274, + "learning_rate": 3.427043113170807e-06, + "loss": 0.9831, + "step": 3738 + }, + { + "epoch": 0.26975938818945927, + "grad_norm": 2.709496458646463, + "learning_rate": 3.42671562637486e-06, + "loss": 0.9182, + "step": 3739 + }, + { + "epoch": 0.2698315356588868, + "grad_norm": 2.692351553109022, + "learning_rate": 3.426388061671253e-06, + "loss": 0.8674, + "step": 3740 + }, + { + "epoch": 0.26990368312831425, + "grad_norm": 1.8202171464429946, + "learning_rate": 3.426060419077875e-06, + "loss": 1.0098, + "step": 3741 + }, + { + "epoch": 0.2699758305977418, + "grad_norm": 2.634747708021456, + "learning_rate": 3.4257326986126165e-06, + "loss": 0.9076, + "step": 3742 + }, + { + "epoch": 0.2700479780671693, + "grad_norm": 3.5252051814539955, + "learning_rate": 3.425404900293373e-06, + "loss": 0.8999, + "step": 3743 + }, + { + "epoch": 0.2701201255365968, + "grad_norm": 1.9166840331355053, + "learning_rate": 3.4250770241380445e-06, + "loss": 0.9376, + "step": 3744 + }, + { + "epoch": 0.27019227300602433, + "grad_norm": 2.6471297824922875, + "learning_rate": 3.4247490701645356e-06, + "loss": 0.9515, + "step": 3745 + }, + { + "epoch": 0.27026442047545185, + "grad_norm": 3.1305320601967064, + "learning_rate": 3.424421038390754e-06, + "loss": 1.0064, + "step": 3746 + }, + { + "epoch": 0.2703365679448793, + "grad_norm": 2.1420843343009555, + "learning_rate": 3.4240929288346126e-06, + "loss": 1.0191, + "step": 3747 + }, + { + "epoch": 0.27040871541430683, + "grad_norm": 1.8042837244460144, + "learning_rate": 3.423764741514028e-06, + "loss": 1.0091, + "step": 3748 + }, + { + "epoch": 0.27048086288373435, + "grad_norm": 2.607725341510603, + "learning_rate": 3.4234364764469216e-06, + "loss": 0.9548, + "step": 3749 + }, + { + "epoch": 0.27055301035316187, + "grad_norm": 2.5333509791746787, + "learning_rate": 3.423108133651219e-06, + "loss": 0.8951, + "step": 3750 + }, + { + "epoch": 0.2706251578225894, + "grad_norm": 2.425865184173405, + "learning_rate": 3.4227797131448486e-06, + "loss": 0.9109, + "step": 3751 + }, + { + "epoch": 0.2706973052920169, + "grad_norm": 2.247591743166765, + "learning_rate": 3.422451214945745e-06, + "loss": 0.9223, + "step": 3752 + }, + { + "epoch": 0.2707694527614444, + "grad_norm": 2.024749566122061, + "learning_rate": 3.422122639071847e-06, + "loss": 0.996, + "step": 3753 + }, + { + "epoch": 0.2708416002308719, + "grad_norm": 1.986141229799808, + "learning_rate": 3.4217939855410958e-06, + "loss": 0.8906, + "step": 3754 + }, + { + "epoch": 0.2709137477002994, + "grad_norm": 0.8314872360144429, + "learning_rate": 3.421465254371438e-06, + "loss": 0.7919, + "step": 3755 + }, + { + "epoch": 0.27098589516972693, + "grad_norm": 2.532023182273189, + "learning_rate": 3.421136445580825e-06, + "loss": 0.9476, + "step": 3756 + }, + { + "epoch": 0.27105804263915445, + "grad_norm": 2.4747881866138597, + "learning_rate": 3.420807559187212e-06, + "loss": 0.9497, + "step": 3757 + }, + { + "epoch": 0.2711301901085819, + "grad_norm": 2.1241509480404246, + "learning_rate": 3.4204785952085574e-06, + "loss": 1.0023, + "step": 3758 + }, + { + "epoch": 0.27120233757800943, + "grad_norm": 2.0464682138427057, + "learning_rate": 3.420149553662825e-06, + "loss": 0.9046, + "step": 3759 + }, + { + "epoch": 0.27127448504743695, + "grad_norm": 2.33441398210162, + "learning_rate": 3.419820434567983e-06, + "loss": 0.8936, + "step": 3760 + }, + { + "epoch": 0.27134663251686447, + "grad_norm": 5.93460944236421, + "learning_rate": 3.4194912379420035e-06, + "loss": 0.8531, + "step": 3761 + }, + { + "epoch": 0.271418779986292, + "grad_norm": 2.3315310556796436, + "learning_rate": 3.4191619638028623e-06, + "loss": 1.0063, + "step": 3762 + }, + { + "epoch": 0.2714909274557195, + "grad_norm": 3.0399807128796352, + "learning_rate": 3.4188326121685387e-06, + "loss": 0.9977, + "step": 3763 + }, + { + "epoch": 0.271563074925147, + "grad_norm": 2.0855498793130756, + "learning_rate": 3.4185031830570195e-06, + "loss": 0.9977, + "step": 3764 + }, + { + "epoch": 0.2716352223945745, + "grad_norm": 1.791088240055174, + "learning_rate": 3.418173676486292e-06, + "loss": 0.9337, + "step": 3765 + }, + { + "epoch": 0.271707369864002, + "grad_norm": 2.1547808963708457, + "learning_rate": 3.41784409247435e-06, + "loss": 1.0205, + "step": 3766 + }, + { + "epoch": 0.27177951733342953, + "grad_norm": 2.3954770583469696, + "learning_rate": 3.417514431039191e-06, + "loss": 0.8923, + "step": 3767 + }, + { + "epoch": 0.27185166480285705, + "grad_norm": 2.5869265045177863, + "learning_rate": 3.417184692198816e-06, + "loss": 0.7972, + "step": 3768 + }, + { + "epoch": 0.27192381227228457, + "grad_norm": 5.3182774263219095, + "learning_rate": 3.416854875971232e-06, + "loss": 0.9294, + "step": 3769 + }, + { + "epoch": 0.27199595974171203, + "grad_norm": 2.021522470013935, + "learning_rate": 3.4165249823744474e-06, + "loss": 0.9147, + "step": 3770 + }, + { + "epoch": 0.27206810721113955, + "grad_norm": 1.9146549475735366, + "learning_rate": 3.4161950114264775e-06, + "loss": 0.9602, + "step": 3771 + }, + { + "epoch": 0.2721402546805671, + "grad_norm": 2.6528691935696562, + "learning_rate": 3.415864963145341e-06, + "loss": 0.9557, + "step": 3772 + }, + { + "epoch": 0.2722124021499946, + "grad_norm": 0.7620617216649536, + "learning_rate": 3.41553483754906e-06, + "loss": 0.7962, + "step": 3773 + }, + { + "epoch": 0.2722845496194221, + "grad_norm": 2.42004660222604, + "learning_rate": 3.4152046346556615e-06, + "loss": 0.9069, + "step": 3774 + }, + { + "epoch": 0.27235669708884963, + "grad_norm": 2.282919599356325, + "learning_rate": 3.414874354483177e-06, + "loss": 0.8238, + "step": 3775 + }, + { + "epoch": 0.2724288445582771, + "grad_norm": 0.8051428016029535, + "learning_rate": 3.4145439970496416e-06, + "loss": 0.8276, + "step": 3776 + }, + { + "epoch": 0.2725009920277046, + "grad_norm": 6.080778273412079, + "learning_rate": 3.414213562373095e-06, + "loss": 1.0606, + "step": 3777 + }, + { + "epoch": 0.27257313949713213, + "grad_norm": 1.9246692348529597, + "learning_rate": 3.4138830504715807e-06, + "loss": 1.0681, + "step": 3778 + }, + { + "epoch": 0.27264528696655965, + "grad_norm": 4.414602385055719, + "learning_rate": 3.4135524613631476e-06, + "loss": 0.941, + "step": 3779 + }, + { + "epoch": 0.27271743443598717, + "grad_norm": 2.14068436366278, + "learning_rate": 3.413221795065847e-06, + "loss": 1.014, + "step": 3780 + }, + { + "epoch": 0.2727895819054147, + "grad_norm": 2.1992390226849254, + "learning_rate": 3.4128910515977357e-06, + "loss": 0.8486, + "step": 3781 + }, + { + "epoch": 0.27286172937484215, + "grad_norm": 0.810905763155749, + "learning_rate": 3.4125602309768747e-06, + "loss": 0.8221, + "step": 3782 + }, + { + "epoch": 0.2729338768442697, + "grad_norm": 2.3345579839071484, + "learning_rate": 3.4122293332213286e-06, + "loss": 0.9132, + "step": 3783 + }, + { + "epoch": 0.2730060243136972, + "grad_norm": 2.1488129998558763, + "learning_rate": 3.411898358349166e-06, + "loss": 0.9661, + "step": 3784 + }, + { + "epoch": 0.2730781717831247, + "grad_norm": 2.1244647530037706, + "learning_rate": 3.4115673063784614e-06, + "loss": 0.9441, + "step": 3785 + }, + { + "epoch": 0.27315031925255223, + "grad_norm": 2.954771682774603, + "learning_rate": 3.411236177327291e-06, + "loss": 0.8648, + "step": 3786 + }, + { + "epoch": 0.27322246672197975, + "grad_norm": 2.0224213276239507, + "learning_rate": 3.4109049712137375e-06, + "loss": 0.9543, + "step": 3787 + }, + { + "epoch": 0.2732946141914072, + "grad_norm": 2.829959337730924, + "learning_rate": 3.4105736880558867e-06, + "loss": 0.9514, + "step": 3788 + }, + { + "epoch": 0.27336676166083473, + "grad_norm": 1.9793823994629822, + "learning_rate": 3.4102423278718285e-06, + "loss": 0.8956, + "step": 3789 + }, + { + "epoch": 0.27343890913026225, + "grad_norm": 2.6236546111338988, + "learning_rate": 3.409910890679657e-06, + "loss": 1.0436, + "step": 3790 + }, + { + "epoch": 0.2735110565996898, + "grad_norm": 2.3649446544663504, + "learning_rate": 3.409579376497471e-06, + "loss": 1.0859, + "step": 3791 + }, + { + "epoch": 0.2735832040691173, + "grad_norm": 2.058771404439975, + "learning_rate": 3.409247785343374e-06, + "loss": 1.0319, + "step": 3792 + }, + { + "epoch": 0.2736553515385448, + "grad_norm": 2.525615213059435, + "learning_rate": 3.4089161172354718e-06, + "loss": 0.9135, + "step": 3793 + }, + { + "epoch": 0.2737274990079723, + "grad_norm": 1.616731662361341, + "learning_rate": 3.4085843721918766e-06, + "loss": 1.0105, + "step": 3794 + }, + { + "epoch": 0.2737996464773998, + "grad_norm": 3.4600057273332236, + "learning_rate": 3.4082525502307026e-06, + "loss": 1.06, + "step": 3795 + }, + { + "epoch": 0.2738717939468273, + "grad_norm": 1.8434680787838766, + "learning_rate": 3.40792065137007e-06, + "loss": 0.9117, + "step": 3796 + }, + { + "epoch": 0.27394394141625483, + "grad_norm": 6.837724096890376, + "learning_rate": 3.4075886756281028e-06, + "loss": 0.9772, + "step": 3797 + }, + { + "epoch": 0.27401608888568235, + "grad_norm": 2.6771662559481255, + "learning_rate": 3.4072566230229286e-06, + "loss": 1.0109, + "step": 3798 + }, + { + "epoch": 0.27408823635510987, + "grad_norm": 1.8508521540253238, + "learning_rate": 3.40692449357268e-06, + "loss": 0.9189, + "step": 3799 + }, + { + "epoch": 0.27416038382453733, + "grad_norm": 2.2195871412577426, + "learning_rate": 3.4065922872954927e-06, + "loss": 0.9521, + "step": 3800 + }, + { + "epoch": 0.27423253129396485, + "grad_norm": 3.0968210410027828, + "learning_rate": 3.4062600042095077e-06, + "loss": 0.9616, + "step": 3801 + }, + { + "epoch": 0.2743046787633924, + "grad_norm": 2.5304935408176066, + "learning_rate": 3.4059276443328697e-06, + "loss": 0.9375, + "step": 3802 + }, + { + "epoch": 0.2743768262328199, + "grad_norm": 2.205347549832397, + "learning_rate": 3.4055952076837277e-06, + "loss": 0.8775, + "step": 3803 + }, + { + "epoch": 0.2744489737022474, + "grad_norm": 1.888520385599841, + "learning_rate": 3.4052626942802343e-06, + "loss": 0.9908, + "step": 3804 + }, + { + "epoch": 0.27452112117167493, + "grad_norm": 2.13473513512507, + "learning_rate": 3.4049301041405487e-06, + "loss": 0.9761, + "step": 3805 + }, + { + "epoch": 0.2745932686411024, + "grad_norm": 2.3241836673428926, + "learning_rate": 3.4045974372828297e-06, + "loss": 0.9617, + "step": 3806 + }, + { + "epoch": 0.2746654161105299, + "grad_norm": 3.069657505824382, + "learning_rate": 3.4042646937252447e-06, + "loss": 0.93, + "step": 3807 + }, + { + "epoch": 0.27473756357995743, + "grad_norm": 2.484708079916255, + "learning_rate": 3.4039318734859627e-06, + "loss": 0.8967, + "step": 3808 + }, + { + "epoch": 0.27480971104938495, + "grad_norm": 1.5683680399920192, + "learning_rate": 3.403598976583159e-06, + "loss": 0.969, + "step": 3809 + }, + { + "epoch": 0.27488185851881247, + "grad_norm": 1.8776462319186111, + "learning_rate": 3.40326600303501e-06, + "loss": 0.8737, + "step": 3810 + }, + { + "epoch": 0.27495400598823994, + "grad_norm": 2.1436397735462105, + "learning_rate": 3.4029329528597005e-06, + "loss": 0.904, + "step": 3811 + }, + { + "epoch": 0.27502615345766745, + "grad_norm": 2.52340839947907, + "learning_rate": 3.4025998260754154e-06, + "loss": 0.8886, + "step": 3812 + }, + { + "epoch": 0.275098300927095, + "grad_norm": 1.948512191992742, + "learning_rate": 3.402266622700346e-06, + "loss": 0.8942, + "step": 3813 + }, + { + "epoch": 0.2751704483965225, + "grad_norm": 2.0054957460459613, + "learning_rate": 3.4019333427526875e-06, + "loss": 0.9473, + "step": 3814 + }, + { + "epoch": 0.27524259586595, + "grad_norm": 3.260653567416591, + "learning_rate": 3.4015999862506387e-06, + "loss": 0.9412, + "step": 3815 + }, + { + "epoch": 0.27531474333537753, + "grad_norm": 2.5866370047346603, + "learning_rate": 3.4012665532124037e-06, + "loss": 0.9601, + "step": 3816 + }, + { + "epoch": 0.275386890804805, + "grad_norm": 3.048709884198202, + "learning_rate": 3.400933043656189e-06, + "loss": 0.9095, + "step": 3817 + }, + { + "epoch": 0.2754590382742325, + "grad_norm": 3.4801096447300712, + "learning_rate": 3.400599457600207e-06, + "loss": 0.8291, + "step": 3818 + }, + { + "epoch": 0.27553118574366003, + "grad_norm": 2.22650478355156, + "learning_rate": 3.4002657950626736e-06, + "loss": 0.843, + "step": 3819 + }, + { + "epoch": 0.27560333321308755, + "grad_norm": 2.59592227283373, + "learning_rate": 3.399932056061809e-06, + "loss": 0.969, + "step": 3820 + }, + { + "epoch": 0.2756754806825151, + "grad_norm": 0.6676147593741499, + "learning_rate": 3.3995982406158366e-06, + "loss": 0.7768, + "step": 3821 + }, + { + "epoch": 0.2757476281519426, + "grad_norm": 1.9110976319737165, + "learning_rate": 3.399264348742986e-06, + "loss": 1.0417, + "step": 3822 + }, + { + "epoch": 0.27581977562137006, + "grad_norm": 2.1755526574963775, + "learning_rate": 3.398930380461489e-06, + "loss": 1.003, + "step": 3823 + }, + { + "epoch": 0.2758919230907976, + "grad_norm": 2.8653493700828663, + "learning_rate": 3.398596335789582e-06, + "loss": 0.8932, + "step": 3824 + }, + { + "epoch": 0.2759640705602251, + "grad_norm": 2.573128685085724, + "learning_rate": 3.3982622147455072e-06, + "loss": 0.9947, + "step": 3825 + }, + { + "epoch": 0.2760362180296526, + "grad_norm": 1.556249705854641, + "learning_rate": 3.397928017347509e-06, + "loss": 0.9606, + "step": 3826 + }, + { + "epoch": 0.27610836549908013, + "grad_norm": 2.270894519319908, + "learning_rate": 3.3975937436138365e-06, + "loss": 1.0545, + "step": 3827 + }, + { + "epoch": 0.27618051296850765, + "grad_norm": 4.528587874717469, + "learning_rate": 3.3972593935627433e-06, + "loss": 1.0055, + "step": 3828 + }, + { + "epoch": 0.2762526604379351, + "grad_norm": 1.9407497347490394, + "learning_rate": 3.396924967212487e-06, + "loss": 0.9216, + "step": 3829 + }, + { + "epoch": 0.27632480790736264, + "grad_norm": 2.491650849444177, + "learning_rate": 3.39659046458133e-06, + "loss": 0.9291, + "step": 3830 + }, + { + "epoch": 0.27639695537679015, + "grad_norm": 1.7366123374499838, + "learning_rate": 3.3962558856875374e-06, + "loss": 0.9058, + "step": 3831 + }, + { + "epoch": 0.2764691028462177, + "grad_norm": 1.6544605160679735, + "learning_rate": 3.3959212305493797e-06, + "loss": 0.8613, + "step": 3832 + }, + { + "epoch": 0.2765412503156452, + "grad_norm": 2.1456337129153193, + "learning_rate": 3.395586499185131e-06, + "loss": 0.9466, + "step": 3833 + }, + { + "epoch": 0.2766133977850727, + "grad_norm": 2.52503704006108, + "learning_rate": 3.3952516916130703e-06, + "loss": 1.0128, + "step": 3834 + }, + { + "epoch": 0.2766855452545002, + "grad_norm": 3.2318501960954875, + "learning_rate": 3.394916807851479e-06, + "loss": 0.8203, + "step": 3835 + }, + { + "epoch": 0.2767576927239277, + "grad_norm": 2.9359136315512946, + "learning_rate": 3.3945818479186453e-06, + "loss": 1.0285, + "step": 3836 + }, + { + "epoch": 0.2768298401933552, + "grad_norm": 0.938799751832618, + "learning_rate": 3.3942468118328592e-06, + "loss": 0.8712, + "step": 3837 + }, + { + "epoch": 0.27690198766278273, + "grad_norm": 2.4873287467856318, + "learning_rate": 3.3939116996124163e-06, + "loss": 0.9395, + "step": 3838 + }, + { + "epoch": 0.27697413513221025, + "grad_norm": 1.8831336669519858, + "learning_rate": 3.3935765112756152e-06, + "loss": 1.0109, + "step": 3839 + }, + { + "epoch": 0.27704628260163777, + "grad_norm": 2.12715959203645, + "learning_rate": 3.3932412468407597e-06, + "loss": 1.0196, + "step": 3840 + }, + { + "epoch": 0.27711843007106524, + "grad_norm": 2.6786045744525406, + "learning_rate": 3.392905906326158e-06, + "loss": 0.9452, + "step": 3841 + }, + { + "epoch": 0.27719057754049276, + "grad_norm": 2.373543945497124, + "learning_rate": 3.3925704897501204e-06, + "loss": 0.981, + "step": 3842 + }, + { + "epoch": 0.2772627250099203, + "grad_norm": 9.00508101406224, + "learning_rate": 3.3922349971309637e-06, + "loss": 1.0116, + "step": 3843 + }, + { + "epoch": 0.2773348724793478, + "grad_norm": 1.8825732193850588, + "learning_rate": 3.3918994284870072e-06, + "loss": 0.9375, + "step": 3844 + }, + { + "epoch": 0.2774070199487753, + "grad_norm": 5.11826663686925, + "learning_rate": 3.391563783836576e-06, + "loss": 0.9793, + "step": 3845 + }, + { + "epoch": 0.27747916741820283, + "grad_norm": 2.129726203012628, + "learning_rate": 3.391228063197998e-06, + "loss": 0.7409, + "step": 3846 + }, + { + "epoch": 0.2775513148876303, + "grad_norm": 2.6212592037186697, + "learning_rate": 3.3908922665896054e-06, + "loss": 0.7688, + "step": 3847 + }, + { + "epoch": 0.2776234623570578, + "grad_norm": 2.993900774700939, + "learning_rate": 3.390556394029735e-06, + "loss": 0.7731, + "step": 3848 + }, + { + "epoch": 0.27769560982648533, + "grad_norm": 2.23323584908435, + "learning_rate": 3.390220445536728e-06, + "loss": 0.9444, + "step": 3849 + }, + { + "epoch": 0.27776775729591285, + "grad_norm": 1.7729883507455069, + "learning_rate": 3.3898844211289282e-06, + "loss": 0.9225, + "step": 3850 + }, + { + "epoch": 0.2778399047653404, + "grad_norm": 3.6591551886523086, + "learning_rate": 3.3895483208246864e-06, + "loss": 0.8505, + "step": 3851 + }, + { + "epoch": 0.2779120522347679, + "grad_norm": 2.095828490760421, + "learning_rate": 3.3892121446423535e-06, + "loss": 0.8551, + "step": 3852 + }, + { + "epoch": 0.27798419970419536, + "grad_norm": 5.690643447938247, + "learning_rate": 3.3888758926002894e-06, + "loss": 0.9603, + "step": 3853 + }, + { + "epoch": 0.2780563471736229, + "grad_norm": 2.2667815314902153, + "learning_rate": 3.388539564716853e-06, + "loss": 0.95, + "step": 3854 + }, + { + "epoch": 0.2781284946430504, + "grad_norm": 1.8921529097352519, + "learning_rate": 3.3882031610104117e-06, + "loss": 0.9727, + "step": 3855 + }, + { + "epoch": 0.2782006421124779, + "grad_norm": 2.053806826919174, + "learning_rate": 3.3878666814993348e-06, + "loss": 1.0357, + "step": 3856 + }, + { + "epoch": 0.27827278958190543, + "grad_norm": 2.0371300905661114, + "learning_rate": 3.3875301262019957e-06, + "loss": 1.0097, + "step": 3857 + }, + { + "epoch": 0.2783449370513329, + "grad_norm": 2.0447142166754637, + "learning_rate": 3.387193495136773e-06, + "loss": 0.9607, + "step": 3858 + }, + { + "epoch": 0.2784170845207604, + "grad_norm": 2.5600175696008494, + "learning_rate": 3.386856788322049e-06, + "loss": 0.9082, + "step": 3859 + }, + { + "epoch": 0.27848923199018794, + "grad_norm": 4.9526977354779875, + "learning_rate": 3.38652000577621e-06, + "loss": 1.0413, + "step": 3860 + }, + { + "epoch": 0.27856137945961545, + "grad_norm": 1.6054831548262685, + "learning_rate": 3.386183147517646e-06, + "loss": 0.9182, + "step": 3861 + }, + { + "epoch": 0.278633526929043, + "grad_norm": 2.7476470591348274, + "learning_rate": 3.3858462135647513e-06, + "loss": 0.8759, + "step": 3862 + }, + { + "epoch": 0.2787056743984705, + "grad_norm": 2.1293192331655004, + "learning_rate": 3.3855092039359257e-06, + "loss": 0.989, + "step": 3863 + }, + { + "epoch": 0.27877782186789796, + "grad_norm": 2.335187402430001, + "learning_rate": 3.3851721186495714e-06, + "loss": 0.9362, + "step": 3864 + }, + { + "epoch": 0.2788499693373255, + "grad_norm": 3.046863731950318, + "learning_rate": 3.3848349577240953e-06, + "loss": 0.9314, + "step": 3865 + }, + { + "epoch": 0.278922116806753, + "grad_norm": 2.351795679235362, + "learning_rate": 3.3844977211779087e-06, + "loss": 0.9885, + "step": 3866 + }, + { + "epoch": 0.2789942642761805, + "grad_norm": 3.8137273845170854, + "learning_rate": 3.3841604090294268e-06, + "loss": 0.9902, + "step": 3867 + }, + { + "epoch": 0.27906641174560803, + "grad_norm": 2.8200450146917326, + "learning_rate": 3.3838230212970685e-06, + "loss": 1.0233, + "step": 3868 + }, + { + "epoch": 0.27913855921503555, + "grad_norm": 2.396051866820172, + "learning_rate": 3.3834855579992582e-06, + "loss": 1.0201, + "step": 3869 + }, + { + "epoch": 0.279210706684463, + "grad_norm": 2.0218155299406955, + "learning_rate": 3.383148019154423e-06, + "loss": 0.9994, + "step": 3870 + }, + { + "epoch": 0.27928285415389054, + "grad_norm": 2.0338893468910073, + "learning_rate": 3.3828104047809945e-06, + "loss": 0.9015, + "step": 3871 + }, + { + "epoch": 0.27935500162331806, + "grad_norm": 2.3165109234426144, + "learning_rate": 3.382472714897409e-06, + "loss": 0.9998, + "step": 3872 + }, + { + "epoch": 0.2794271490927456, + "grad_norm": 1.9868173662522257, + "learning_rate": 3.3821349495221067e-06, + "loss": 0.9623, + "step": 3873 + }, + { + "epoch": 0.2794992965621731, + "grad_norm": 2.426886805707779, + "learning_rate": 3.38179710867353e-06, + "loss": 0.9097, + "step": 3874 + }, + { + "epoch": 0.2795714440316006, + "grad_norm": 2.3826317890619473, + "learning_rate": 3.38145919237013e-06, + "loss": 0.9046, + "step": 3875 + }, + { + "epoch": 0.2796435915010281, + "grad_norm": 2.4190802415247212, + "learning_rate": 3.3811212006303564e-06, + "loss": 0.9776, + "step": 3876 + }, + { + "epoch": 0.2797157389704556, + "grad_norm": 2.613028521213142, + "learning_rate": 3.3807831334726677e-06, + "loss": 0.9891, + "step": 3877 + }, + { + "epoch": 0.2797878864398831, + "grad_norm": 2.985513040744434, + "learning_rate": 3.3804449909155226e-06, + "loss": 0.9989, + "step": 3878 + }, + { + "epoch": 0.27986003390931063, + "grad_norm": 3.646137174025284, + "learning_rate": 3.380106772977387e-06, + "loss": 0.9658, + "step": 3879 + }, + { + "epoch": 0.27993218137873815, + "grad_norm": 2.306484233260663, + "learning_rate": 3.3797684796767297e-06, + "loss": 0.9903, + "step": 3880 + }, + { + "epoch": 0.2800043288481657, + "grad_norm": 1.8622897727141021, + "learning_rate": 3.3794301110320236e-06, + "loss": 1.0011, + "step": 3881 + }, + { + "epoch": 0.28007647631759314, + "grad_norm": 2.641519789919642, + "learning_rate": 3.3790916670617457e-06, + "loss": 0.9868, + "step": 3882 + }, + { + "epoch": 0.28014862378702066, + "grad_norm": 2.1598727987063953, + "learning_rate": 3.3787531477843768e-06, + "loss": 1.0251, + "step": 3883 + }, + { + "epoch": 0.2802207712564482, + "grad_norm": 2.1374748050030217, + "learning_rate": 3.3784145532184022e-06, + "loss": 0.898, + "step": 3884 + }, + { + "epoch": 0.2802929187258757, + "grad_norm": 20.222686808818903, + "learning_rate": 3.378075883382312e-06, + "loss": 1.0436, + "step": 3885 + }, + { + "epoch": 0.2803650661953032, + "grad_norm": 1.8314803541683071, + "learning_rate": 3.377737138294599e-06, + "loss": 0.9607, + "step": 3886 + }, + { + "epoch": 0.28043721366473073, + "grad_norm": 3.9591567270620796, + "learning_rate": 3.377398317973761e-06, + "loss": 0.9495, + "step": 3887 + }, + { + "epoch": 0.2805093611341582, + "grad_norm": 2.4987232762902356, + "learning_rate": 3.3770594224383004e-06, + "loss": 1.0143, + "step": 3888 + }, + { + "epoch": 0.2805815086035857, + "grad_norm": 2.089896684493152, + "learning_rate": 3.3767204517067223e-06, + "loss": 0.8626, + "step": 3889 + }, + { + "epoch": 0.28065365607301324, + "grad_norm": 0.7428839377402049, + "learning_rate": 3.3763814057975363e-06, + "loss": 0.8093, + "step": 3890 + }, + { + "epoch": 0.28072580354244075, + "grad_norm": 2.1429348477398267, + "learning_rate": 3.376042284729257e-06, + "loss": 0.8925, + "step": 3891 + }, + { + "epoch": 0.2807979510118683, + "grad_norm": 1.907327878793337, + "learning_rate": 3.375703088520403e-06, + "loss": 0.906, + "step": 3892 + }, + { + "epoch": 0.2808700984812958, + "grad_norm": 2.54792567386943, + "learning_rate": 3.3753638171894955e-06, + "loss": 0.9642, + "step": 3893 + }, + { + "epoch": 0.28094224595072326, + "grad_norm": 2.0194592343149833, + "learning_rate": 3.375024470755061e-06, + "loss": 0.9275, + "step": 3894 + }, + { + "epoch": 0.2810143934201508, + "grad_norm": 2.1211161181989886, + "learning_rate": 3.374685049235631e-06, + "loss": 0.8152, + "step": 3895 + }, + { + "epoch": 0.2810865408895783, + "grad_norm": 2.585209585580905, + "learning_rate": 3.374345552649739e-06, + "loss": 0.894, + "step": 3896 + }, + { + "epoch": 0.2811586883590058, + "grad_norm": 2.421557202561674, + "learning_rate": 3.374005981015925e-06, + "loss": 1.0243, + "step": 3897 + }, + { + "epoch": 0.28123083582843333, + "grad_norm": 2.128956981369452, + "learning_rate": 3.3736663343527296e-06, + "loss": 0.8817, + "step": 3898 + }, + { + "epoch": 0.28130298329786085, + "grad_norm": 2.169341551386283, + "learning_rate": 3.3733266126787016e-06, + "loss": 1.0109, + "step": 3899 + }, + { + "epoch": 0.2813751307672883, + "grad_norm": 1.8036765036938904, + "learning_rate": 3.3729868160123906e-06, + "loss": 0.9653, + "step": 3900 + }, + { + "epoch": 0.28144727823671584, + "grad_norm": 2.2546443689229934, + "learning_rate": 3.372646944372352e-06, + "loss": 1.0159, + "step": 3901 + }, + { + "epoch": 0.28151942570614336, + "grad_norm": 2.9238677547727603, + "learning_rate": 3.3723069977771457e-06, + "loss": 0.9343, + "step": 3902 + }, + { + "epoch": 0.2815915731755709, + "grad_norm": 4.724765808114548, + "learning_rate": 3.3719669762453343e-06, + "loss": 0.9951, + "step": 3903 + }, + { + "epoch": 0.2816637206449984, + "grad_norm": 2.582752213694719, + "learning_rate": 3.3716268797954853e-06, + "loss": 0.9245, + "step": 3904 + }, + { + "epoch": 0.2817358681144259, + "grad_norm": 2.5475697886123436, + "learning_rate": 3.3712867084461694e-06, + "loss": 1.0096, + "step": 3905 + }, + { + "epoch": 0.2818080155838534, + "grad_norm": 2.4591066361893774, + "learning_rate": 3.3709464622159633e-06, + "loss": 0.8979, + "step": 3906 + }, + { + "epoch": 0.2818801630532809, + "grad_norm": 2.044647985441234, + "learning_rate": 3.370606141123446e-06, + "loss": 0.9553, + "step": 3907 + }, + { + "epoch": 0.2819523105227084, + "grad_norm": 2.2149601120295777, + "learning_rate": 3.3702657451872e-06, + "loss": 0.9382, + "step": 3908 + }, + { + "epoch": 0.28202445799213594, + "grad_norm": 2.733396780239092, + "learning_rate": 3.369925274425815e-06, + "loss": 0.9382, + "step": 3909 + }, + { + "epoch": 0.28209660546156345, + "grad_norm": 2.079439364217675, + "learning_rate": 3.369584728857882e-06, + "loss": 0.9218, + "step": 3910 + }, + { + "epoch": 0.2821687529309909, + "grad_norm": 1.5441632632722249, + "learning_rate": 3.3692441085019977e-06, + "loss": 1.0122, + "step": 3911 + }, + { + "epoch": 0.28224090040041844, + "grad_norm": 2.5428850245133554, + "learning_rate": 3.3689034133767607e-06, + "loss": 0.9285, + "step": 3912 + }, + { + "epoch": 0.28231304786984596, + "grad_norm": 3.7751767906576528, + "learning_rate": 3.368562643500776e-06, + "loss": 0.9412, + "step": 3913 + }, + { + "epoch": 0.2823851953392735, + "grad_norm": 3.56786183864656, + "learning_rate": 3.3682217988926506e-06, + "loss": 0.9118, + "step": 3914 + }, + { + "epoch": 0.282457342808701, + "grad_norm": 2.0913718082868065, + "learning_rate": 3.3678808795709995e-06, + "loss": 0.9699, + "step": 3915 + }, + { + "epoch": 0.2825294902781285, + "grad_norm": 4.209608105249261, + "learning_rate": 3.3675398855544357e-06, + "loss": 0.902, + "step": 3916 + }, + { + "epoch": 0.282601637747556, + "grad_norm": 3.7728471945831994, + "learning_rate": 3.3671988168615825e-06, + "loss": 0.9794, + "step": 3917 + }, + { + "epoch": 0.2826737852169835, + "grad_norm": 3.077415888091632, + "learning_rate": 3.3668576735110624e-06, + "loss": 1.0128, + "step": 3918 + }, + { + "epoch": 0.282745932686411, + "grad_norm": 2.8112075379092007, + "learning_rate": 3.3665164555215046e-06, + "loss": 0.8321, + "step": 3919 + }, + { + "epoch": 0.28281808015583854, + "grad_norm": 0.8293647123847185, + "learning_rate": 3.366175162911542e-06, + "loss": 0.852, + "step": 3920 + }, + { + "epoch": 0.28289022762526606, + "grad_norm": 2.040357857899799, + "learning_rate": 3.365833795699811e-06, + "loss": 1.0124, + "step": 3921 + }, + { + "epoch": 0.2829623750946936, + "grad_norm": 2.201117500506867, + "learning_rate": 3.365492353904953e-06, + "loss": 0.8349, + "step": 3922 + }, + { + "epoch": 0.28303452256412104, + "grad_norm": 2.4221065287515935, + "learning_rate": 3.3651508375456123e-06, + "loss": 0.9357, + "step": 3923 + }, + { + "epoch": 0.28310667003354856, + "grad_norm": 2.547204774440671, + "learning_rate": 3.364809246640438e-06, + "loss": 0.9081, + "step": 3924 + }, + { + "epoch": 0.2831788175029761, + "grad_norm": 4.2479835663386005, + "learning_rate": 3.3644675812080827e-06, + "loss": 0.8888, + "step": 3925 + }, + { + "epoch": 0.2832509649724036, + "grad_norm": 2.0681841580647315, + "learning_rate": 3.3641258412672043e-06, + "loss": 0.9958, + "step": 3926 + }, + { + "epoch": 0.2833231124418311, + "grad_norm": 2.184918869921638, + "learning_rate": 3.3637840268364634e-06, + "loss": 0.8478, + "step": 3927 + }, + { + "epoch": 0.28339525991125863, + "grad_norm": 2.505600949383942, + "learning_rate": 3.3634421379345252e-06, + "loss": 0.9869, + "step": 3928 + }, + { + "epoch": 0.2834674073806861, + "grad_norm": 2.2537619612339883, + "learning_rate": 3.3631001745800588e-06, + "loss": 0.8573, + "step": 3929 + }, + { + "epoch": 0.2835395548501136, + "grad_norm": 3.110646510250073, + "learning_rate": 3.3627581367917386e-06, + "loss": 0.9282, + "step": 3930 + }, + { + "epoch": 0.28361170231954114, + "grad_norm": 3.400106647165228, + "learning_rate": 3.362416024588241e-06, + "loss": 0.919, + "step": 3931 + }, + { + "epoch": 0.28368384978896866, + "grad_norm": 2.50404098076847, + "learning_rate": 3.362073837988248e-06, + "loss": 0.9244, + "step": 3932 + }, + { + "epoch": 0.2837559972583962, + "grad_norm": 1.7732856450205814, + "learning_rate": 3.3617315770104444e-06, + "loss": 0.9787, + "step": 3933 + }, + { + "epoch": 0.2838281447278237, + "grad_norm": 1.8116048378148488, + "learning_rate": 3.361389241673521e-06, + "loss": 0.9791, + "step": 3934 + }, + { + "epoch": 0.28390029219725116, + "grad_norm": 1.9999613758172288, + "learning_rate": 3.3610468319961703e-06, + "loss": 0.9615, + "step": 3935 + }, + { + "epoch": 0.2839724396666787, + "grad_norm": 2.0866515119853393, + "learning_rate": 3.360704347997091e-06, + "loss": 1.0573, + "step": 3936 + }, + { + "epoch": 0.2840445871361062, + "grad_norm": 1.8623906532293932, + "learning_rate": 3.360361789694984e-06, + "loss": 1.0017, + "step": 3937 + }, + { + "epoch": 0.2841167346055337, + "grad_norm": 2.2795035588060344, + "learning_rate": 3.3600191571085557e-06, + "loss": 0.9843, + "step": 3938 + }, + { + "epoch": 0.28418888207496124, + "grad_norm": 2.5607838350083147, + "learning_rate": 3.359676450256516e-06, + "loss": 0.9725, + "step": 3939 + }, + { + "epoch": 0.28426102954438875, + "grad_norm": 3.4275958171782013, + "learning_rate": 3.3593336691575784e-06, + "loss": 1.0475, + "step": 3940 + }, + { + "epoch": 0.2843331770138162, + "grad_norm": 2.4003376842399837, + "learning_rate": 3.3589908138304616e-06, + "loss": 1.0576, + "step": 3941 + }, + { + "epoch": 0.28440532448324374, + "grad_norm": 2.39211082410362, + "learning_rate": 3.358647884293887e-06, + "loss": 0.9775, + "step": 3942 + }, + { + "epoch": 0.28447747195267126, + "grad_norm": 1.9837044611741506, + "learning_rate": 3.3583048805665808e-06, + "loss": 0.8384, + "step": 3943 + }, + { + "epoch": 0.2845496194220988, + "grad_norm": 1.990465504531137, + "learning_rate": 3.3579618026672735e-06, + "loss": 1.0195, + "step": 3944 + }, + { + "epoch": 0.2846217668915263, + "grad_norm": 2.286980837916406, + "learning_rate": 3.3576186506146994e-06, + "loss": 0.8989, + "step": 3945 + }, + { + "epoch": 0.2846939143609538, + "grad_norm": 3.07125335332977, + "learning_rate": 3.357275424427596e-06, + "loss": 0.9265, + "step": 3946 + }, + { + "epoch": 0.2847660618303813, + "grad_norm": 2.362351214048073, + "learning_rate": 3.356932124124706e-06, + "loss": 0.8925, + "step": 3947 + }, + { + "epoch": 0.2848382092998088, + "grad_norm": 2.1429762353046677, + "learning_rate": 3.3565887497247772e-06, + "loss": 0.9981, + "step": 3948 + }, + { + "epoch": 0.2849103567692363, + "grad_norm": 2.081295262666484, + "learning_rate": 3.356245301246558e-06, + "loss": 0.9193, + "step": 3949 + }, + { + "epoch": 0.28498250423866384, + "grad_norm": 2.0902442619647386, + "learning_rate": 3.3559017787088033e-06, + "loss": 0.9803, + "step": 3950 + }, + { + "epoch": 0.28505465170809136, + "grad_norm": 1.731030099722775, + "learning_rate": 3.355558182130272e-06, + "loss": 0.9484, + "step": 3951 + }, + { + "epoch": 0.2851267991775189, + "grad_norm": 2.071012213162117, + "learning_rate": 3.3552145115297262e-06, + "loss": 0.8932, + "step": 3952 + }, + { + "epoch": 0.28519894664694634, + "grad_norm": 1.547294867975243, + "learning_rate": 3.3548707669259327e-06, + "loss": 0.9742, + "step": 3953 + }, + { + "epoch": 0.28527109411637386, + "grad_norm": 7.143876618792576, + "learning_rate": 3.3545269483376622e-06, + "loss": 0.8699, + "step": 3954 + }, + { + "epoch": 0.2853432415858014, + "grad_norm": 2.2628233325553033, + "learning_rate": 3.35418305578369e-06, + "loss": 0.9565, + "step": 3955 + }, + { + "epoch": 0.2854153890552289, + "grad_norm": 2.743423922199143, + "learning_rate": 3.353839089282794e-06, + "loss": 0.9813, + "step": 3956 + }, + { + "epoch": 0.2854875365246564, + "grad_norm": 2.332234384964323, + "learning_rate": 3.3534950488537567e-06, + "loss": 0.8187, + "step": 3957 + }, + { + "epoch": 0.28555968399408393, + "grad_norm": 2.2163388758017435, + "learning_rate": 3.353150934515365e-06, + "loss": 1.0601, + "step": 3958 + }, + { + "epoch": 0.2856318314635114, + "grad_norm": 3.6419306038912342, + "learning_rate": 3.3528067462864107e-06, + "loss": 0.9522, + "step": 3959 + }, + { + "epoch": 0.2857039789329389, + "grad_norm": 2.4591188522787117, + "learning_rate": 3.3524624841856874e-06, + "loss": 0.9442, + "step": 3960 + }, + { + "epoch": 0.28577612640236644, + "grad_norm": 2.146579785434969, + "learning_rate": 3.3521181482319945e-06, + "loss": 0.9682, + "step": 3961 + }, + { + "epoch": 0.28584827387179396, + "grad_norm": 2.076320696688748, + "learning_rate": 3.351773738444135e-06, + "loss": 0.8496, + "step": 3962 + }, + { + "epoch": 0.2859204213412215, + "grad_norm": 2.1264045785626444, + "learning_rate": 3.351429254840916e-06, + "loss": 0.8441, + "step": 3963 + }, + { + "epoch": 0.28599256881064894, + "grad_norm": 2.5185179572738488, + "learning_rate": 3.3510846974411477e-06, + "loss": 0.9828, + "step": 3964 + }, + { + "epoch": 0.28606471628007646, + "grad_norm": 2.0374901800562584, + "learning_rate": 3.350740066263646e-06, + "loss": 0.8694, + "step": 3965 + }, + { + "epoch": 0.286136863749504, + "grad_norm": 2.6970808779393316, + "learning_rate": 3.3503953613272293e-06, + "loss": 0.935, + "step": 3966 + }, + { + "epoch": 0.2862090112189315, + "grad_norm": 2.376089348175934, + "learning_rate": 3.3500505826507208e-06, + "loss": 1.0016, + "step": 3967 + }, + { + "epoch": 0.286281158688359, + "grad_norm": 2.1623222983670543, + "learning_rate": 3.349705730252948e-06, + "loss": 1.0241, + "step": 3968 + }, + { + "epoch": 0.28635330615778654, + "grad_norm": 2.284110588082124, + "learning_rate": 3.3493608041527414e-06, + "loss": 0.9791, + "step": 3969 + }, + { + "epoch": 0.286425453627214, + "grad_norm": 2.0199170214539928, + "learning_rate": 3.3490158043689373e-06, + "loss": 0.9543, + "step": 3970 + }, + { + "epoch": 0.2864976010966415, + "grad_norm": 5.011858420087108, + "learning_rate": 3.348670730920373e-06, + "loss": 1.0947, + "step": 3971 + }, + { + "epoch": 0.28656974856606904, + "grad_norm": 1.6988915139286065, + "learning_rate": 3.348325583825893e-06, + "loss": 0.9427, + "step": 3972 + }, + { + "epoch": 0.28664189603549656, + "grad_norm": 1.7492745121820437, + "learning_rate": 3.3479803631043443e-06, + "loss": 0.8961, + "step": 3973 + }, + { + "epoch": 0.2867140435049241, + "grad_norm": 2.8784450953582885, + "learning_rate": 3.3476350687745783e-06, + "loss": 0.9572, + "step": 3974 + }, + { + "epoch": 0.2867861909743516, + "grad_norm": 2.0127723083131643, + "learning_rate": 3.3472897008554495e-06, + "loss": 0.9265, + "step": 3975 + }, + { + "epoch": 0.28685833844377906, + "grad_norm": 2.671913637473057, + "learning_rate": 3.346944259365818e-06, + "loss": 0.9004, + "step": 3976 + }, + { + "epoch": 0.2869304859132066, + "grad_norm": 2.1220073945171007, + "learning_rate": 3.3465987443245466e-06, + "loss": 1.0482, + "step": 3977 + }, + { + "epoch": 0.2870026333826341, + "grad_norm": 2.3554844389182628, + "learning_rate": 3.3462531557505026e-06, + "loss": 0.8313, + "step": 3978 + }, + { + "epoch": 0.2870747808520616, + "grad_norm": 1.8028567717920794, + "learning_rate": 3.345907493662557e-06, + "loss": 0.9212, + "step": 3979 + }, + { + "epoch": 0.28714692832148914, + "grad_norm": 0.7718788393983064, + "learning_rate": 3.3455617580795866e-06, + "loss": 0.8323, + "step": 3980 + }, + { + "epoch": 0.28721907579091666, + "grad_norm": 3.560250492284224, + "learning_rate": 3.345215949020469e-06, + "loss": 0.9749, + "step": 3981 + }, + { + "epoch": 0.2872912232603441, + "grad_norm": 2.6463401389142813, + "learning_rate": 3.344870066504088e-06, + "loss": 0.8971, + "step": 3982 + }, + { + "epoch": 0.28736337072977164, + "grad_norm": 2.028672681311697, + "learning_rate": 3.344524110549332e-06, + "loss": 1.1197, + "step": 3983 + }, + { + "epoch": 0.28743551819919916, + "grad_norm": 0.8967641755327093, + "learning_rate": 3.3441780811750907e-06, + "loss": 0.8486, + "step": 3984 + }, + { + "epoch": 0.2875076656686267, + "grad_norm": 2.3077602786055404, + "learning_rate": 3.343831978400261e-06, + "loss": 0.9149, + "step": 3985 + }, + { + "epoch": 0.2875798131380542, + "grad_norm": 3.2888202351486977, + "learning_rate": 3.343485802243741e-06, + "loss": 0.9506, + "step": 3986 + }, + { + "epoch": 0.2876519606074817, + "grad_norm": 2.8036291309142403, + "learning_rate": 3.3431395527244356e-06, + "loss": 0.9711, + "step": 3987 + }, + { + "epoch": 0.2877241080769092, + "grad_norm": 2.5900335688238245, + "learning_rate": 3.342793229861251e-06, + "loss": 0.9347, + "step": 3988 + }, + { + "epoch": 0.2877962555463367, + "grad_norm": 2.498912097735871, + "learning_rate": 3.3424468336730993e-06, + "loss": 1.0456, + "step": 3989 + }, + { + "epoch": 0.2878684030157642, + "grad_norm": 1.9493029008638976, + "learning_rate": 3.3421003641788944e-06, + "loss": 1.0447, + "step": 3990 + }, + { + "epoch": 0.28794055048519174, + "grad_norm": 1.907281252716096, + "learning_rate": 3.3417538213975575e-06, + "loss": 1.0114, + "step": 3991 + }, + { + "epoch": 0.28801269795461926, + "grad_norm": 2.408650179637178, + "learning_rate": 3.3414072053480125e-06, + "loss": 0.8522, + "step": 3992 + }, + { + "epoch": 0.2880848454240468, + "grad_norm": 0.8645716210609915, + "learning_rate": 3.3410605160491842e-06, + "loss": 0.9044, + "step": 3993 + }, + { + "epoch": 0.28815699289347424, + "grad_norm": 3.2334282267957266, + "learning_rate": 3.340713753520006e-06, + "loss": 0.8955, + "step": 3994 + }, + { + "epoch": 0.28822914036290176, + "grad_norm": 2.486256588658137, + "learning_rate": 3.3403669177794134e-06, + "loss": 0.9528, + "step": 3995 + }, + { + "epoch": 0.2883012878323293, + "grad_norm": 0.9374507573228809, + "learning_rate": 3.3400200088463447e-06, + "loss": 0.807, + "step": 3996 + }, + { + "epoch": 0.2883734353017568, + "grad_norm": 2.1716229409153343, + "learning_rate": 3.3396730267397445e-06, + "loss": 0.9521, + "step": 3997 + }, + { + "epoch": 0.2884455827711843, + "grad_norm": 0.7506498064933644, + "learning_rate": 3.3393259714785595e-06, + "loss": 0.7439, + "step": 3998 + }, + { + "epoch": 0.28851773024061184, + "grad_norm": 2.147065213368365, + "learning_rate": 3.3389788430817414e-06, + "loss": 0.9061, + "step": 3999 + }, + { + "epoch": 0.2885898777100393, + "grad_norm": 2.7429086981196993, + "learning_rate": 3.338631641568244e-06, + "loss": 1.0082, + "step": 4000 + }, + { + "epoch": 0.2886620251794668, + "grad_norm": 6.755863327550159, + "learning_rate": 3.3382843669570296e-06, + "loss": 1.0852, + "step": 4001 + }, + { + "epoch": 0.28873417264889434, + "grad_norm": 2.0642871627576254, + "learning_rate": 3.3379370192670593e-06, + "loss": 0.8158, + "step": 4002 + }, + { + "epoch": 0.28880632011832186, + "grad_norm": 2.476677443667856, + "learning_rate": 3.337589598517302e-06, + "loss": 0.9601, + "step": 4003 + }, + { + "epoch": 0.2888784675877494, + "grad_norm": 6.006156306176084, + "learning_rate": 3.3372421047267274e-06, + "loss": 0.8787, + "step": 4004 + }, + { + "epoch": 0.2889506150571769, + "grad_norm": 2.0843388864571453, + "learning_rate": 3.336894537914312e-06, + "loss": 1.0018, + "step": 4005 + }, + { + "epoch": 0.28902276252660436, + "grad_norm": 1.9257784025159603, + "learning_rate": 3.3365468980990353e-06, + "loss": 0.9526, + "step": 4006 + }, + { + "epoch": 0.2890949099960319, + "grad_norm": 2.8657267744821535, + "learning_rate": 3.33619918529988e-06, + "loss": 0.9257, + "step": 4007 + }, + { + "epoch": 0.2891670574654594, + "grad_norm": 2.4459233602649024, + "learning_rate": 3.3358513995358336e-06, + "loss": 1.0183, + "step": 4008 + }, + { + "epoch": 0.2892392049348869, + "grad_norm": 2.310787907440626, + "learning_rate": 3.335503540825888e-06, + "loss": 0.9983, + "step": 4009 + }, + { + "epoch": 0.28931135240431444, + "grad_norm": 2.4281359069657804, + "learning_rate": 3.335155609189037e-06, + "loss": 0.9571, + "step": 4010 + }, + { + "epoch": 0.28938349987374196, + "grad_norm": 3.8112991192718657, + "learning_rate": 3.334807604644281e-06, + "loss": 0.8697, + "step": 4011 + }, + { + "epoch": 0.2894556473431694, + "grad_norm": 1.9414100570180857, + "learning_rate": 3.3344595272106235e-06, + "loss": 1.0459, + "step": 4012 + }, + { + "epoch": 0.28952779481259694, + "grad_norm": 3.616452235063205, + "learning_rate": 3.334111376907071e-06, + "loss": 0.9267, + "step": 4013 + }, + { + "epoch": 0.28959994228202446, + "grad_norm": 1.962909330212492, + "learning_rate": 3.3337631537526347e-06, + "loss": 0.9458, + "step": 4014 + }, + { + "epoch": 0.289672089751452, + "grad_norm": 3.273605888606021, + "learning_rate": 3.3334148577663303e-06, + "loss": 0.8944, + "step": 4015 + }, + { + "epoch": 0.2897442372208795, + "grad_norm": 2.832633960650281, + "learning_rate": 3.333066488967176e-06, + "loss": 0.9735, + "step": 4016 + }, + { + "epoch": 0.28981638469030696, + "grad_norm": 0.7001503646661121, + "learning_rate": 3.3327180473741957e-06, + "loss": 0.8056, + "step": 4017 + }, + { + "epoch": 0.2898885321597345, + "grad_norm": 4.255609064676, + "learning_rate": 3.3323695330064167e-06, + "loss": 1.0527, + "step": 4018 + }, + { + "epoch": 0.289960679629162, + "grad_norm": 1.9754780444308462, + "learning_rate": 3.33202094588287e-06, + "loss": 1.0297, + "step": 4019 + }, + { + "epoch": 0.2900328270985895, + "grad_norm": 2.230595306173192, + "learning_rate": 3.3316722860225906e-06, + "loss": 0.9075, + "step": 4020 + }, + { + "epoch": 0.29010497456801704, + "grad_norm": 2.4618904327505424, + "learning_rate": 3.3313235534446165e-06, + "loss": 0.9059, + "step": 4021 + }, + { + "epoch": 0.29017712203744456, + "grad_norm": 2.172507982842953, + "learning_rate": 3.330974748167992e-06, + "loss": 0.9862, + "step": 4022 + }, + { + "epoch": 0.290249269506872, + "grad_norm": 2.1292340227535167, + "learning_rate": 3.3306258702117633e-06, + "loss": 0.9756, + "step": 4023 + }, + { + "epoch": 0.29032141697629954, + "grad_norm": 2.726527172832564, + "learning_rate": 3.3302769195949817e-06, + "loss": 0.8559, + "step": 4024 + }, + { + "epoch": 0.29039356444572706, + "grad_norm": 2.072971349662657, + "learning_rate": 3.329927896336703e-06, + "loss": 0.9346, + "step": 4025 + }, + { + "epoch": 0.2904657119151546, + "grad_norm": 2.270302305047707, + "learning_rate": 3.3295788004559837e-06, + "loss": 0.8476, + "step": 4026 + }, + { + "epoch": 0.2905378593845821, + "grad_norm": 1.9296939247425942, + "learning_rate": 3.3292296319718893e-06, + "loss": 1.0411, + "step": 4027 + }, + { + "epoch": 0.2906100068540096, + "grad_norm": 2.0202579924627497, + "learning_rate": 3.3288803909034845e-06, + "loss": 0.9713, + "step": 4028 + }, + { + "epoch": 0.2906821543234371, + "grad_norm": 3.4607975158626054, + "learning_rate": 3.328531077269841e-06, + "loss": 0.9122, + "step": 4029 + }, + { + "epoch": 0.2907543017928646, + "grad_norm": 2.8051743788753023, + "learning_rate": 3.328181691090033e-06, + "loss": 0.9632, + "step": 4030 + }, + { + "epoch": 0.2908264492622921, + "grad_norm": 3.102868869195311, + "learning_rate": 3.327832232383141e-06, + "loss": 0.8819, + "step": 4031 + }, + { + "epoch": 0.29089859673171964, + "grad_norm": 2.4457325924906317, + "learning_rate": 3.327482701168245e-06, + "loss": 0.9548, + "step": 4032 + }, + { + "epoch": 0.29097074420114716, + "grad_norm": 3.1375595998992467, + "learning_rate": 3.3271330974644336e-06, + "loss": 0.8837, + "step": 4033 + }, + { + "epoch": 0.2910428916705747, + "grad_norm": 2.01086881886344, + "learning_rate": 3.3267834212907957e-06, + "loss": 0.9804, + "step": 4034 + }, + { + "epoch": 0.29111503914000214, + "grad_norm": 3.2305331056543154, + "learning_rate": 3.3264336726664275e-06, + "loss": 0.8075, + "step": 4035 + }, + { + "epoch": 0.29118718660942966, + "grad_norm": 2.006732694842795, + "learning_rate": 3.326083851610427e-06, + "loss": 0.9663, + "step": 4036 + }, + { + "epoch": 0.2912593340788572, + "grad_norm": 2.120060959065519, + "learning_rate": 3.325733958141896e-06, + "loss": 0.9737, + "step": 4037 + }, + { + "epoch": 0.2913314815482847, + "grad_norm": 2.9306482799579197, + "learning_rate": 3.325383992279941e-06, + "loss": 0.959, + "step": 4038 + }, + { + "epoch": 0.2914036290177122, + "grad_norm": 7.8010678073932205, + "learning_rate": 3.3250339540436727e-06, + "loss": 1.0236, + "step": 4039 + }, + { + "epoch": 0.29147577648713974, + "grad_norm": 2.516596351887553, + "learning_rate": 3.324683843452205e-06, + "loss": 0.8307, + "step": 4040 + }, + { + "epoch": 0.2915479239565672, + "grad_norm": 2.411303081966385, + "learning_rate": 3.3243336605246567e-06, + "loss": 0.9922, + "step": 4041 + }, + { + "epoch": 0.2916200714259947, + "grad_norm": 1.966619519568367, + "learning_rate": 3.32398340528015e-06, + "loss": 1.0413, + "step": 4042 + }, + { + "epoch": 0.29169221889542224, + "grad_norm": 2.298482423261645, + "learning_rate": 3.323633077737811e-06, + "loss": 0.9196, + "step": 4043 + }, + { + "epoch": 0.29176436636484976, + "grad_norm": 2.672447723219952, + "learning_rate": 3.3232826779167685e-06, + "loss": 0.9153, + "step": 4044 + }, + { + "epoch": 0.2918365138342773, + "grad_norm": 2.517086954790413, + "learning_rate": 3.3229322058361585e-06, + "loss": 0.9121, + "step": 4045 + }, + { + "epoch": 0.2919086613037048, + "grad_norm": 2.8812307758476075, + "learning_rate": 3.3225816615151176e-06, + "loss": 0.9782, + "step": 4046 + }, + { + "epoch": 0.29198080877313226, + "grad_norm": 2.279121451013627, + "learning_rate": 3.3222310449727882e-06, + "loss": 0.8202, + "step": 4047 + }, + { + "epoch": 0.2920529562425598, + "grad_norm": 2.0205356601512556, + "learning_rate": 3.3218803562283163e-06, + "loss": 0.861, + "step": 4048 + }, + { + "epoch": 0.2921251037119873, + "grad_norm": 2.231303204241372, + "learning_rate": 3.3215295953008515e-06, + "loss": 0.8078, + "step": 4049 + }, + { + "epoch": 0.2921972511814148, + "grad_norm": 3.1617040062125343, + "learning_rate": 3.3211787622095474e-06, + "loss": 0.934, + "step": 4050 + }, + { + "epoch": 0.29226939865084234, + "grad_norm": 3.797451273823403, + "learning_rate": 3.3208278569735626e-06, + "loss": 0.9134, + "step": 4051 + }, + { + "epoch": 0.29234154612026986, + "grad_norm": 0.8606347993668189, + "learning_rate": 3.3204768796120574e-06, + "loss": 0.8033, + "step": 4052 + }, + { + "epoch": 0.2924136935896973, + "grad_norm": 2.081887989948463, + "learning_rate": 3.3201258301441986e-06, + "loss": 0.9552, + "step": 4053 + }, + { + "epoch": 0.29248584105912484, + "grad_norm": 3.7300925499527997, + "learning_rate": 3.319774708589155e-06, + "loss": 0.9619, + "step": 4054 + }, + { + "epoch": 0.29255798852855236, + "grad_norm": 3.196348884971415, + "learning_rate": 3.3194235149661e-06, + "loss": 0.8417, + "step": 4055 + }, + { + "epoch": 0.2926301359979799, + "grad_norm": 2.715243830120388, + "learning_rate": 3.319072249294211e-06, + "loss": 0.8222, + "step": 4056 + }, + { + "epoch": 0.2927022834674074, + "grad_norm": 1.9263324683753429, + "learning_rate": 3.3187209115926696e-06, + "loss": 0.9147, + "step": 4057 + }, + { + "epoch": 0.2927744309368349, + "grad_norm": 3.3048805559190186, + "learning_rate": 3.3183695018806615e-06, + "loss": 1.019, + "step": 4058 + }, + { + "epoch": 0.2928465784062624, + "grad_norm": 2.4583120722309277, + "learning_rate": 3.3180180201773746e-06, + "loss": 0.995, + "step": 4059 + }, + { + "epoch": 0.2929187258756899, + "grad_norm": 1.9595445647782366, + "learning_rate": 3.3176664665020035e-06, + "loss": 1.05, + "step": 4060 + }, + { + "epoch": 0.2929908733451174, + "grad_norm": 3.4154749552026726, + "learning_rate": 3.3173148408737437e-06, + "loss": 0.9405, + "step": 4061 + }, + { + "epoch": 0.29306302081454494, + "grad_norm": 2.0942612208939773, + "learning_rate": 3.316963143311798e-06, + "loss": 0.9839, + "step": 4062 + }, + { + "epoch": 0.29313516828397246, + "grad_norm": 3.405945510553834, + "learning_rate": 3.3166113738353695e-06, + "loss": 0.9228, + "step": 4063 + }, + { + "epoch": 0.2932073157533999, + "grad_norm": 2.166980305003474, + "learning_rate": 3.316259532463668e-06, + "loss": 1.0027, + "step": 4064 + }, + { + "epoch": 0.29327946322282744, + "grad_norm": 2.9803871553583763, + "learning_rate": 3.3159076192159063e-06, + "loss": 0.8421, + "step": 4065 + }, + { + "epoch": 0.29335161069225496, + "grad_norm": 1.9909733681538169, + "learning_rate": 3.3155556341113e-06, + "loss": 0.9526, + "step": 4066 + }, + { + "epoch": 0.2934237581616825, + "grad_norm": 2.1109211094443703, + "learning_rate": 3.315203577169071e-06, + "loss": 1.001, + "step": 4067 + }, + { + "epoch": 0.29349590563111, + "grad_norm": 2.047105499567886, + "learning_rate": 3.3148514484084434e-06, + "loss": 0.9625, + "step": 4068 + }, + { + "epoch": 0.2935680531005375, + "grad_norm": 2.277430014178794, + "learning_rate": 3.314499247848646e-06, + "loss": 0.7586, + "step": 4069 + }, + { + "epoch": 0.293640200569965, + "grad_norm": 2.6648680959555113, + "learning_rate": 3.314146975508911e-06, + "loss": 0.9423, + "step": 4070 + }, + { + "epoch": 0.2937123480393925, + "grad_norm": 2.1266515708953593, + "learning_rate": 3.3137946314084736e-06, + "loss": 0.9139, + "step": 4071 + }, + { + "epoch": 0.29378449550882, + "grad_norm": 14.847017526875874, + "learning_rate": 3.3134422155665755e-06, + "loss": 0.9357, + "step": 4072 + }, + { + "epoch": 0.29385664297824754, + "grad_norm": 2.7123875326581444, + "learning_rate": 3.3130897280024595e-06, + "loss": 0.9015, + "step": 4073 + }, + { + "epoch": 0.29392879044767506, + "grad_norm": 0.8534467656272672, + "learning_rate": 3.3127371687353747e-06, + "loss": 0.8368, + "step": 4074 + }, + { + "epoch": 0.2940009379171026, + "grad_norm": 2.8692463231466596, + "learning_rate": 3.312384537784573e-06, + "loss": 1.0225, + "step": 4075 + }, + { + "epoch": 0.29407308538653004, + "grad_norm": 2.6142830745942773, + "learning_rate": 3.31203183516931e-06, + "loss": 0.8992, + "step": 4076 + }, + { + "epoch": 0.29414523285595756, + "grad_norm": 2.476842629747926, + "learning_rate": 3.3116790609088446e-06, + "loss": 0.955, + "step": 4077 + }, + { + "epoch": 0.2942173803253851, + "grad_norm": 3.953221135233289, + "learning_rate": 3.3113262150224424e-06, + "loss": 0.9247, + "step": 4078 + }, + { + "epoch": 0.2942895277948126, + "grad_norm": 3.11844710903138, + "learning_rate": 3.3109732975293695e-06, + "loss": 0.9402, + "step": 4079 + }, + { + "epoch": 0.2943616752642401, + "grad_norm": 2.436493396764959, + "learning_rate": 3.310620308448898e-06, + "loss": 0.9113, + "step": 4080 + }, + { + "epoch": 0.29443382273366764, + "grad_norm": 2.537841315404746, + "learning_rate": 3.310267247800303e-06, + "loss": 0.9089, + "step": 4081 + }, + { + "epoch": 0.2945059702030951, + "grad_norm": 0.9719906410033587, + "learning_rate": 3.309914115602864e-06, + "loss": 0.8653, + "step": 4082 + }, + { + "epoch": 0.2945781176725226, + "grad_norm": 4.8501253564109446, + "learning_rate": 3.309560911875865e-06, + "loss": 0.9309, + "step": 4083 + }, + { + "epoch": 0.29465026514195014, + "grad_norm": 2.5457580551114427, + "learning_rate": 3.3092076366385915e-06, + "loss": 0.9182, + "step": 4084 + }, + { + "epoch": 0.29472241261137766, + "grad_norm": 2.9362098620299326, + "learning_rate": 3.308854289910336e-06, + "loss": 0.9459, + "step": 4085 + }, + { + "epoch": 0.2947945600808052, + "grad_norm": 5.822117222299402, + "learning_rate": 3.3085008717103926e-06, + "loss": 0.9026, + "step": 4086 + }, + { + "epoch": 0.2948667075502327, + "grad_norm": 2.697178821976267, + "learning_rate": 3.3081473820580606e-06, + "loss": 0.8216, + "step": 4087 + }, + { + "epoch": 0.29493885501966016, + "grad_norm": 2.8134179206835483, + "learning_rate": 3.3077938209726428e-06, + "loss": 0.9392, + "step": 4088 + }, + { + "epoch": 0.2950110024890877, + "grad_norm": 2.204587951186203, + "learning_rate": 3.3074401884734456e-06, + "loss": 1.0152, + "step": 4089 + }, + { + "epoch": 0.2950831499585152, + "grad_norm": 2.3077207098949235, + "learning_rate": 3.30708648457978e-06, + "loss": 0.9973, + "step": 4090 + }, + { + "epoch": 0.2951552974279427, + "grad_norm": 2.5654304309810496, + "learning_rate": 3.3067327093109598e-06, + "loss": 1.0018, + "step": 4091 + }, + { + "epoch": 0.29522744489737024, + "grad_norm": 3.3003632952379953, + "learning_rate": 3.306378862686304e-06, + "loss": 0.9901, + "step": 4092 + }, + { + "epoch": 0.29529959236679776, + "grad_norm": 2.308120291710521, + "learning_rate": 3.3060249447251344e-06, + "loss": 0.8778, + "step": 4093 + }, + { + "epoch": 0.2953717398362252, + "grad_norm": 2.3288793173905518, + "learning_rate": 3.3056709554467766e-06, + "loss": 0.9289, + "step": 4094 + }, + { + "epoch": 0.29544388730565274, + "grad_norm": 2.1763513116189888, + "learning_rate": 3.3053168948705623e-06, + "loss": 0.9843, + "step": 4095 + }, + { + "epoch": 0.29551603477508026, + "grad_norm": 3.628281653519085, + "learning_rate": 3.3049627630158245e-06, + "loss": 0.8784, + "step": 4096 + }, + { + "epoch": 0.2955881822445078, + "grad_norm": 2.039579830095233, + "learning_rate": 3.3046085599019007e-06, + "loss": 0.9167, + "step": 4097 + }, + { + "epoch": 0.2956603297139353, + "grad_norm": 2.252072121883108, + "learning_rate": 3.3042542855481327e-06, + "loss": 1.0103, + "step": 4098 + }, + { + "epoch": 0.2957324771833628, + "grad_norm": 2.220390036059306, + "learning_rate": 3.303899939973867e-06, + "loss": 1.0198, + "step": 4099 + }, + { + "epoch": 0.2958046246527903, + "grad_norm": 2.5163755067393994, + "learning_rate": 3.303545523198452e-06, + "loss": 1.0068, + "step": 4100 + }, + { + "epoch": 0.2958767721222178, + "grad_norm": 2.5156431019765537, + "learning_rate": 3.303191035241242e-06, + "loss": 0.8751, + "step": 4101 + }, + { + "epoch": 0.2959489195916453, + "grad_norm": 2.2295649045029196, + "learning_rate": 3.3028364761215934e-06, + "loss": 0.8749, + "step": 4102 + }, + { + "epoch": 0.29602106706107284, + "grad_norm": 3.7108383004833265, + "learning_rate": 3.3024818458588675e-06, + "loss": 1.0014, + "step": 4103 + }, + { + "epoch": 0.29609321453050036, + "grad_norm": 2.5832209306279488, + "learning_rate": 3.30212714447243e-06, + "loss": 0.8807, + "step": 4104 + }, + { + "epoch": 0.2961653619999279, + "grad_norm": 2.569738631261087, + "learning_rate": 3.3017723719816495e-06, + "loss": 0.8581, + "step": 4105 + }, + { + "epoch": 0.29623750946935534, + "grad_norm": 2.641043906306569, + "learning_rate": 3.301417528405898e-06, + "loss": 0.9662, + "step": 4106 + }, + { + "epoch": 0.29630965693878286, + "grad_norm": 2.5356960104086115, + "learning_rate": 3.301062613764554e-06, + "loss": 0.8776, + "step": 4107 + }, + { + "epoch": 0.2963818044082104, + "grad_norm": 2.8041667826757837, + "learning_rate": 3.3007076280769967e-06, + "loss": 0.8785, + "step": 4108 + }, + { + "epoch": 0.2964539518776379, + "grad_norm": 3.400781564079356, + "learning_rate": 3.3003525713626108e-06, + "loss": 1.0315, + "step": 4109 + }, + { + "epoch": 0.2965260993470654, + "grad_norm": 0.7213413544950424, + "learning_rate": 3.2999974436407846e-06, + "loss": 0.8197, + "step": 4110 + }, + { + "epoch": 0.29659824681649294, + "grad_norm": 1.9893185171341756, + "learning_rate": 3.2996422449309104e-06, + "loss": 0.9481, + "step": 4111 + }, + { + "epoch": 0.2966703942859204, + "grad_norm": 3.0371423455806834, + "learning_rate": 3.2992869752523847e-06, + "loss": 0.8017, + "step": 4112 + }, + { + "epoch": 0.2967425417553479, + "grad_norm": 2.5356016076456713, + "learning_rate": 3.298931634624607e-06, + "loss": 0.8857, + "step": 4113 + }, + { + "epoch": 0.29681468922477544, + "grad_norm": 2.39856483543929, + "learning_rate": 3.2985762230669807e-06, + "loss": 0.8744, + "step": 4114 + }, + { + "epoch": 0.29688683669420296, + "grad_norm": 3.186536007387865, + "learning_rate": 3.2982207405989144e-06, + "loss": 0.9049, + "step": 4115 + }, + { + "epoch": 0.2969589841636305, + "grad_norm": 2.4730904003670084, + "learning_rate": 3.2978651872398194e-06, + "loss": 0.9627, + "step": 4116 + }, + { + "epoch": 0.29703113163305794, + "grad_norm": 3.265539578867465, + "learning_rate": 3.297509563009111e-06, + "loss": 0.8601, + "step": 4117 + }, + { + "epoch": 0.29710327910248546, + "grad_norm": 3.400074251149536, + "learning_rate": 3.297153867926209e-06, + "loss": 0.8134, + "step": 4118 + }, + { + "epoch": 0.297175426571913, + "grad_norm": 4.364254862489213, + "learning_rate": 3.296798102010536e-06, + "loss": 0.9085, + "step": 4119 + }, + { + "epoch": 0.2972475740413405, + "grad_norm": 8.604926148917277, + "learning_rate": 3.2964422652815184e-06, + "loss": 0.9047, + "step": 4120 + }, + { + "epoch": 0.297319721510768, + "grad_norm": 2.0693965397125362, + "learning_rate": 3.2960863577585884e-06, + "loss": 0.9994, + "step": 4121 + }, + { + "epoch": 0.29739186898019554, + "grad_norm": 3.8440239041128015, + "learning_rate": 3.2957303794611805e-06, + "loss": 0.9184, + "step": 4122 + }, + { + "epoch": 0.297464016449623, + "grad_norm": 2.9129019463978896, + "learning_rate": 3.295374330408733e-06, + "loss": 0.9212, + "step": 4123 + }, + { + "epoch": 0.2975361639190505, + "grad_norm": 0.6182751066198153, + "learning_rate": 3.2950182106206884e-06, + "loss": 0.7652, + "step": 4124 + }, + { + "epoch": 0.29760831138847804, + "grad_norm": 2.4268460355568195, + "learning_rate": 3.2946620201164944e-06, + "loss": 0.9043, + "step": 4125 + }, + { + "epoch": 0.29768045885790556, + "grad_norm": 2.4040841598060623, + "learning_rate": 3.2943057589155993e-06, + "loss": 0.9459, + "step": 4126 + }, + { + "epoch": 0.2977526063273331, + "grad_norm": 5.269261765429034, + "learning_rate": 3.293949427037458e-06, + "loss": 0.9361, + "step": 4127 + }, + { + "epoch": 0.2978247537967606, + "grad_norm": 2.1900956149544757, + "learning_rate": 3.293593024501529e-06, + "loss": 0.9568, + "step": 4128 + }, + { + "epoch": 0.29789690126618806, + "grad_norm": 1.6884055357115009, + "learning_rate": 3.293236551327273e-06, + "loss": 1.0124, + "step": 4129 + }, + { + "epoch": 0.2979690487356156, + "grad_norm": 2.3403938169101375, + "learning_rate": 3.292880007534157e-06, + "loss": 0.964, + "step": 4130 + }, + { + "epoch": 0.2980411962050431, + "grad_norm": 2.9954234819542087, + "learning_rate": 3.29252339314165e-06, + "loss": 0.8995, + "step": 4131 + }, + { + "epoch": 0.2981133436744706, + "grad_norm": 3.1539831802703424, + "learning_rate": 3.2921667081692244e-06, + "loss": 0.9082, + "step": 4132 + }, + { + "epoch": 0.29818549114389814, + "grad_norm": 2.980508586572323, + "learning_rate": 3.2918099526363586e-06, + "loss": 0.9659, + "step": 4133 + }, + { + "epoch": 0.29825763861332566, + "grad_norm": 3.567954321778428, + "learning_rate": 3.2914531265625334e-06, + "loss": 0.9522, + "step": 4134 + }, + { + "epoch": 0.2983297860827531, + "grad_norm": 2.3953461746068383, + "learning_rate": 3.291096229967234e-06, + "loss": 0.9729, + "step": 4135 + }, + { + "epoch": 0.29840193355218064, + "grad_norm": 3.4743690686381985, + "learning_rate": 3.290739262869949e-06, + "loss": 0.825, + "step": 4136 + }, + { + "epoch": 0.29847408102160816, + "grad_norm": 3.004430519259208, + "learning_rate": 3.290382225290171e-06, + "loss": 0.9385, + "step": 4137 + }, + { + "epoch": 0.2985462284910357, + "grad_norm": 2.226465912447247, + "learning_rate": 3.290025117247396e-06, + "loss": 1.0018, + "step": 4138 + }, + { + "epoch": 0.2986183759604632, + "grad_norm": 2.1119265347910408, + "learning_rate": 3.289667938761125e-06, + "loss": 0.9693, + "step": 4139 + }, + { + "epoch": 0.2986905234298907, + "grad_norm": 3.0380495241509555, + "learning_rate": 3.2893106898508625e-06, + "loss": 0.9091, + "step": 4140 + }, + { + "epoch": 0.2987626708993182, + "grad_norm": 2.7490632455765605, + "learning_rate": 3.288953370536116e-06, + "loss": 0.9322, + "step": 4141 + }, + { + "epoch": 0.2988348183687457, + "grad_norm": 3.6668811359682527, + "learning_rate": 3.2885959808363976e-06, + "loss": 0.955, + "step": 4142 + }, + { + "epoch": 0.2989069658381732, + "grad_norm": 2.9409615432523357, + "learning_rate": 3.2882385207712223e-06, + "loss": 0.9779, + "step": 4143 + }, + { + "epoch": 0.29897911330760074, + "grad_norm": 3.655507851946108, + "learning_rate": 3.2878809903601107e-06, + "loss": 0.9276, + "step": 4144 + }, + { + "epoch": 0.29905126077702826, + "grad_norm": 2.8229195757823935, + "learning_rate": 3.2875233896225856e-06, + "loss": 0.8834, + "step": 4145 + }, + { + "epoch": 0.2991234082464558, + "grad_norm": 2.3216805583303657, + "learning_rate": 3.287165718578175e-06, + "loss": 0.924, + "step": 4146 + }, + { + "epoch": 0.29919555571588324, + "grad_norm": 1.781887141266305, + "learning_rate": 3.2868079772464087e-06, + "loss": 0.8685, + "step": 4147 + }, + { + "epoch": 0.29926770318531076, + "grad_norm": 2.7938296825131195, + "learning_rate": 3.2864501656468225e-06, + "loss": 0.9768, + "step": 4148 + }, + { + "epoch": 0.2993398506547383, + "grad_norm": 2.023706013305736, + "learning_rate": 3.2860922837989556e-06, + "loss": 0.745, + "step": 4149 + }, + { + "epoch": 0.2994119981241658, + "grad_norm": 2.8130752822832035, + "learning_rate": 3.2857343317223496e-06, + "loss": 0.9559, + "step": 4150 + }, + { + "epoch": 0.2994841455935933, + "grad_norm": 2.4180359961999027, + "learning_rate": 3.2853763094365513e-06, + "loss": 1.0009, + "step": 4151 + }, + { + "epoch": 0.29955629306302084, + "grad_norm": 1.881607683212949, + "learning_rate": 3.2850182169611112e-06, + "loss": 1.0495, + "step": 4152 + }, + { + "epoch": 0.2996284405324483, + "grad_norm": 4.6668819423249435, + "learning_rate": 3.284660054315583e-06, + "loss": 0.854, + "step": 4153 + }, + { + "epoch": 0.2997005880018758, + "grad_norm": 4.031334454738519, + "learning_rate": 3.2843018215195253e-06, + "loss": 0.8838, + "step": 4154 + }, + { + "epoch": 0.29977273547130334, + "grad_norm": 37.86647426216065, + "learning_rate": 3.2839435185924992e-06, + "loss": 0.9396, + "step": 4155 + }, + { + "epoch": 0.29984488294073086, + "grad_norm": 3.285429699343173, + "learning_rate": 3.2835851455540705e-06, + "loss": 0.9711, + "step": 4156 + }, + { + "epoch": 0.2999170304101584, + "grad_norm": 2.129073894094541, + "learning_rate": 3.2832267024238085e-06, + "loss": 1.0745, + "step": 4157 + }, + { + "epoch": 0.2999891778795859, + "grad_norm": 2.0385940422875977, + "learning_rate": 3.282868189221287e-06, + "loss": 1.0293, + "step": 4158 + }, + { + "epoch": 0.30006132534901336, + "grad_norm": 1.8376198061337774, + "learning_rate": 3.2825096059660825e-06, + "loss": 0.9374, + "step": 4159 + }, + { + "epoch": 0.3001334728184409, + "grad_norm": 2.0614661891402695, + "learning_rate": 3.2821509526777763e-06, + "loss": 0.9956, + "step": 4160 + }, + { + "epoch": 0.3002056202878684, + "grad_norm": 3.406351350448905, + "learning_rate": 3.2817922293759528e-06, + "loss": 0.9111, + "step": 4161 + }, + { + "epoch": 0.3002777677572959, + "grad_norm": 2.5081930376868, + "learning_rate": 3.2814334360802002e-06, + "loss": 0.8969, + "step": 4162 + }, + { + "epoch": 0.30034991522672344, + "grad_norm": 1.203673596834318, + "learning_rate": 3.2810745728101124e-06, + "loss": 0.8024, + "step": 4163 + }, + { + "epoch": 0.30042206269615096, + "grad_norm": 2.425528053915689, + "learning_rate": 3.280715639585284e-06, + "loss": 0.9228, + "step": 4164 + }, + { + "epoch": 0.3004942101655784, + "grad_norm": 2.82675770116661, + "learning_rate": 3.2803566364253154e-06, + "loss": 0.9829, + "step": 4165 + }, + { + "epoch": 0.30056635763500594, + "grad_norm": 2.030897139197956, + "learning_rate": 3.2799975633498103e-06, + "loss": 0.9006, + "step": 4166 + }, + { + "epoch": 0.30063850510443346, + "grad_norm": 4.239400773800283, + "learning_rate": 3.2796384203783776e-06, + "loss": 1.0751, + "step": 4167 + }, + { + "epoch": 0.300710652573861, + "grad_norm": 3.0316485899948518, + "learning_rate": 3.279279207530627e-06, + "loss": 0.8903, + "step": 4168 + }, + { + "epoch": 0.3007828000432885, + "grad_norm": 3.9709165171462644, + "learning_rate": 3.278919924826175e-06, + "loss": 0.9438, + "step": 4169 + }, + { + "epoch": 0.30085494751271596, + "grad_norm": 2.6140096473253145, + "learning_rate": 3.2785605722846407e-06, + "loss": 0.8236, + "step": 4170 + }, + { + "epoch": 0.3009270949821435, + "grad_norm": 4.2859010701121045, + "learning_rate": 3.278201149925646e-06, + "loss": 0.8827, + "step": 4171 + }, + { + "epoch": 0.300999242451571, + "grad_norm": 2.1033827829165395, + "learning_rate": 3.2778416577688184e-06, + "loss": 0.8967, + "step": 4172 + }, + { + "epoch": 0.3010713899209985, + "grad_norm": 2.0544707325354445, + "learning_rate": 3.277482095833788e-06, + "loss": 0.9912, + "step": 4173 + }, + { + "epoch": 0.30114353739042604, + "grad_norm": 2.568150315279485, + "learning_rate": 3.27712246414019e-06, + "loss": 0.9894, + "step": 4174 + }, + { + "epoch": 0.30121568485985356, + "grad_norm": 2.3625454852857213, + "learning_rate": 3.2767627627076617e-06, + "loss": 0.9768, + "step": 4175 + }, + { + "epoch": 0.301287832329281, + "grad_norm": 2.614750607270264, + "learning_rate": 3.2764029915558455e-06, + "loss": 0.9035, + "step": 4176 + }, + { + "epoch": 0.30135997979870854, + "grad_norm": 2.9667864278844203, + "learning_rate": 3.2760431507043863e-06, + "loss": 0.9539, + "step": 4177 + }, + { + "epoch": 0.30143212726813606, + "grad_norm": 0.7523475306208959, + "learning_rate": 3.275683240172935e-06, + "loss": 0.8255, + "step": 4178 + }, + { + "epoch": 0.3015042747375636, + "grad_norm": 4.006743468840706, + "learning_rate": 3.2753232599811445e-06, + "loss": 1.0081, + "step": 4179 + }, + { + "epoch": 0.3015764222069911, + "grad_norm": 2.0753028464785688, + "learning_rate": 3.274963210148672e-06, + "loss": 1.0002, + "step": 4180 + }, + { + "epoch": 0.3016485696764186, + "grad_norm": 3.4046108956754733, + "learning_rate": 3.2746030906951772e-06, + "loss": 1.0345, + "step": 4181 + }, + { + "epoch": 0.3017207171458461, + "grad_norm": 3.0142624697244202, + "learning_rate": 3.2742429016403273e-06, + "loss": 0.9911, + "step": 4182 + }, + { + "epoch": 0.3017928646152736, + "grad_norm": 2.8104915653218243, + "learning_rate": 3.273882643003789e-06, + "loss": 0.9209, + "step": 4183 + }, + { + "epoch": 0.3018650120847011, + "grad_norm": 2.764583585538376, + "learning_rate": 3.2735223148052356e-06, + "loss": 0.8771, + "step": 4184 + }, + { + "epoch": 0.30193715955412864, + "grad_norm": 3.448185628828497, + "learning_rate": 3.273161917064343e-06, + "loss": 0.9315, + "step": 4185 + }, + { + "epoch": 0.30200930702355616, + "grad_norm": 2.1247780627870467, + "learning_rate": 3.272801449800791e-06, + "loss": 0.877, + "step": 4186 + }, + { + "epoch": 0.3020814544929837, + "grad_norm": 2.2311775430834198, + "learning_rate": 3.2724409130342636e-06, + "loss": 0.969, + "step": 4187 + }, + { + "epoch": 0.30215360196241114, + "grad_norm": 2.7520485097302423, + "learning_rate": 3.272080306784448e-06, + "loss": 0.8862, + "step": 4188 + }, + { + "epoch": 0.30222574943183866, + "grad_norm": 3.646526873789904, + "learning_rate": 3.271719631071037e-06, + "loss": 0.9896, + "step": 4189 + }, + { + "epoch": 0.3022978969012662, + "grad_norm": 3.903837389717513, + "learning_rate": 3.2713588859137236e-06, + "loss": 0.9692, + "step": 4190 + }, + { + "epoch": 0.3023700443706937, + "grad_norm": 3.152420288820665, + "learning_rate": 3.270998071332209e-06, + "loss": 0.9698, + "step": 4191 + }, + { + "epoch": 0.3024421918401212, + "grad_norm": 3.0950935605995413, + "learning_rate": 3.2706371873461938e-06, + "loss": 1.0238, + "step": 4192 + }, + { + "epoch": 0.30251433930954874, + "grad_norm": 2.3328070955411904, + "learning_rate": 3.2702762339753855e-06, + "loss": 0.8819, + "step": 4193 + }, + { + "epoch": 0.3025864867789762, + "grad_norm": 2.2555041277033103, + "learning_rate": 3.2699152112394952e-06, + "loss": 0.9136, + "step": 4194 + }, + { + "epoch": 0.3026586342484037, + "grad_norm": 2.821368920452735, + "learning_rate": 3.2695541191582356e-06, + "loss": 0.9405, + "step": 4195 + }, + { + "epoch": 0.30273078171783124, + "grad_norm": 2.379494179420542, + "learning_rate": 3.2691929577513256e-06, + "loss": 0.9769, + "step": 4196 + }, + { + "epoch": 0.30280292918725876, + "grad_norm": 2.416489846787374, + "learning_rate": 3.2688317270384866e-06, + "loss": 0.9493, + "step": 4197 + }, + { + "epoch": 0.3028750766566863, + "grad_norm": 6.466615776307471, + "learning_rate": 3.268470427039444e-06, + "loss": 0.9554, + "step": 4198 + }, + { + "epoch": 0.3029472241261138, + "grad_norm": 3.428172409021898, + "learning_rate": 3.2681090577739266e-06, + "loss": 0.9365, + "step": 4199 + }, + { + "epoch": 0.30301937159554126, + "grad_norm": 2.0720882070169435, + "learning_rate": 3.2677476192616685e-06, + "loss": 0.9557, + "step": 4200 + }, + { + "epoch": 0.3030915190649688, + "grad_norm": 1.7110411033442108, + "learning_rate": 3.2673861115224063e-06, + "loss": 0.9361, + "step": 4201 + }, + { + "epoch": 0.3031636665343963, + "grad_norm": 2.2924969527329355, + "learning_rate": 3.2670245345758796e-06, + "loss": 0.8791, + "step": 4202 + }, + { + "epoch": 0.3032358140038238, + "grad_norm": 0.719275282602731, + "learning_rate": 3.266662888441834e-06, + "loss": 0.8154, + "step": 4203 + }, + { + "epoch": 0.30330796147325134, + "grad_norm": 2.604701259418798, + "learning_rate": 3.266301173140016e-06, + "loss": 0.9491, + "step": 4204 + }, + { + "epoch": 0.30338010894267886, + "grad_norm": 2.581794762566228, + "learning_rate": 3.26593938869018e-06, + "loss": 0.9713, + "step": 4205 + }, + { + "epoch": 0.3034522564121063, + "grad_norm": 2.5825601056352494, + "learning_rate": 3.2655775351120795e-06, + "loss": 0.8598, + "step": 4206 + }, + { + "epoch": 0.30352440388153384, + "grad_norm": 2.7131800981158642, + "learning_rate": 3.2652156124254752e-06, + "loss": 0.9415, + "step": 4207 + }, + { + "epoch": 0.30359655135096136, + "grad_norm": 6.037479010440837, + "learning_rate": 3.2648536206501293e-06, + "loss": 0.939, + "step": 4208 + }, + { + "epoch": 0.3036686988203889, + "grad_norm": 2.326037194030899, + "learning_rate": 3.26449155980581e-06, + "loss": 0.9769, + "step": 4209 + }, + { + "epoch": 0.3037408462898164, + "grad_norm": 0.8321886585326372, + "learning_rate": 3.2641294299122877e-06, + "loss": 0.8662, + "step": 4210 + }, + { + "epoch": 0.3038129937592439, + "grad_norm": 2.109910911017145, + "learning_rate": 3.2637672309893365e-06, + "loss": 0.96, + "step": 4211 + }, + { + "epoch": 0.3038851412286714, + "grad_norm": 2.554316782520106, + "learning_rate": 3.263404963056736e-06, + "loss": 0.8981, + "step": 4212 + }, + { + "epoch": 0.3039572886980989, + "grad_norm": 1.90095604133249, + "learning_rate": 3.2630426261342663e-06, + "loss": 0.8678, + "step": 4213 + }, + { + "epoch": 0.3040294361675264, + "grad_norm": 2.507026905829927, + "learning_rate": 3.2626802202417148e-06, + "loss": 0.9827, + "step": 4214 + }, + { + "epoch": 0.30410158363695394, + "grad_norm": 2.674216956211283, + "learning_rate": 3.2623177453988708e-06, + "loss": 0.9688, + "step": 4215 + }, + { + "epoch": 0.30417373110638146, + "grad_norm": 4.68022099225848, + "learning_rate": 3.261955201625528e-06, + "loss": 0.9451, + "step": 4216 + }, + { + "epoch": 0.304245878575809, + "grad_norm": 3.1630446315611103, + "learning_rate": 3.2615925889414827e-06, + "loss": 0.9523, + "step": 4217 + }, + { + "epoch": 0.30431802604523644, + "grad_norm": 2.2470194890832764, + "learning_rate": 3.261229907366537e-06, + "loss": 0.9613, + "step": 4218 + }, + { + "epoch": 0.30439017351466396, + "grad_norm": 2.7607872672190568, + "learning_rate": 3.2608671569204943e-06, + "loss": 0.9698, + "step": 4219 + }, + { + "epoch": 0.3044623209840915, + "grad_norm": 2.1473879938370337, + "learning_rate": 3.260504337623164e-06, + "loss": 1.0615, + "step": 4220 + }, + { + "epoch": 0.304534468453519, + "grad_norm": 3.708934492325171, + "learning_rate": 3.260141449494358e-06, + "loss": 1.0349, + "step": 4221 + }, + { + "epoch": 0.3046066159229465, + "grad_norm": 4.925326442065854, + "learning_rate": 3.2597784925538927e-06, + "loss": 0.9862, + "step": 4222 + }, + { + "epoch": 0.304678763392374, + "grad_norm": 2.599075090516116, + "learning_rate": 3.2594154668215873e-06, + "loss": 0.9337, + "step": 4223 + }, + { + "epoch": 0.3047509108618015, + "grad_norm": 9.443839867881884, + "learning_rate": 3.2590523723172654e-06, + "loss": 0.9436, + "step": 4224 + }, + { + "epoch": 0.304823058331229, + "grad_norm": 2.1377508540494747, + "learning_rate": 3.258689209060754e-06, + "loss": 0.9871, + "step": 4225 + }, + { + "epoch": 0.30489520580065654, + "grad_norm": 1.8987744821121564, + "learning_rate": 3.2583259770718846e-06, + "loss": 0.9782, + "step": 4226 + }, + { + "epoch": 0.30496735327008406, + "grad_norm": 2.5265280868083373, + "learning_rate": 3.257962676370492e-06, + "loss": 0.8654, + "step": 4227 + }, + { + "epoch": 0.3050395007395116, + "grad_norm": 2.274735437457465, + "learning_rate": 3.257599306976414e-06, + "loss": 0.8986, + "step": 4228 + }, + { + "epoch": 0.30511164820893905, + "grad_norm": 2.6342248948526352, + "learning_rate": 3.2572358689094936e-06, + "loss": 0.8219, + "step": 4229 + }, + { + "epoch": 0.30518379567836657, + "grad_norm": 2.3098206334020808, + "learning_rate": 3.2568723621895766e-06, + "loss": 0.9651, + "step": 4230 + }, + { + "epoch": 0.3052559431477941, + "grad_norm": 2.0744872413329336, + "learning_rate": 3.2565087868365123e-06, + "loss": 0.9655, + "step": 4231 + }, + { + "epoch": 0.3053280906172216, + "grad_norm": 2.0170597618074275, + "learning_rate": 3.256145142870155e-06, + "loss": 1.0329, + "step": 4232 + }, + { + "epoch": 0.3054002380866491, + "grad_norm": 2.8460665479266383, + "learning_rate": 3.2557814303103613e-06, + "loss": 0.8685, + "step": 4233 + }, + { + "epoch": 0.30547238555607664, + "grad_norm": 2.524985107529034, + "learning_rate": 3.2554176491769928e-06, + "loss": 0.9055, + "step": 4234 + }, + { + "epoch": 0.3055445330255041, + "grad_norm": 2.0491504397479554, + "learning_rate": 3.2550537994899135e-06, + "loss": 1.0289, + "step": 4235 + }, + { + "epoch": 0.3056166804949316, + "grad_norm": 3.0450983589710376, + "learning_rate": 3.2546898812689927e-06, + "loss": 0.8583, + "step": 4236 + }, + { + "epoch": 0.30568882796435914, + "grad_norm": 2.424798493025968, + "learning_rate": 3.2543258945341013e-06, + "loss": 1.0161, + "step": 4237 + }, + { + "epoch": 0.30576097543378666, + "grad_norm": 0.8230642757739877, + "learning_rate": 3.253961839305117e-06, + "loss": 0.8355, + "step": 4238 + }, + { + "epoch": 0.3058331229032142, + "grad_norm": 2.6598997286490493, + "learning_rate": 3.2535977156019185e-06, + "loss": 0.8993, + "step": 4239 + }, + { + "epoch": 0.3059052703726417, + "grad_norm": 3.611229622355241, + "learning_rate": 3.2532335234443893e-06, + "loss": 1.033, + "step": 4240 + }, + { + "epoch": 0.30597741784206917, + "grad_norm": 3.1226324649808754, + "learning_rate": 3.2528692628524166e-06, + "loss": 1.001, + "step": 4241 + }, + { + "epoch": 0.3060495653114967, + "grad_norm": 2.9000075373058367, + "learning_rate": 3.2525049338458918e-06, + "loss": 1.0253, + "step": 4242 + }, + { + "epoch": 0.3061217127809242, + "grad_norm": 5.254090305210098, + "learning_rate": 3.2521405364447086e-06, + "loss": 0.998, + "step": 4243 + }, + { + "epoch": 0.3061938602503517, + "grad_norm": 2.884317144968845, + "learning_rate": 3.251776070668766e-06, + "loss": 0.903, + "step": 4244 + }, + { + "epoch": 0.30626600771977924, + "grad_norm": 2.4196233311228754, + "learning_rate": 3.251411536537967e-06, + "loss": 0.9728, + "step": 4245 + }, + { + "epoch": 0.30633815518920676, + "grad_norm": 3.3557867714362724, + "learning_rate": 3.2510469340722156e-06, + "loss": 0.9525, + "step": 4246 + }, + { + "epoch": 0.3064103026586342, + "grad_norm": 8.05942257840866, + "learning_rate": 3.2506822632914227e-06, + "loss": 0.8588, + "step": 4247 + }, + { + "epoch": 0.30648245012806175, + "grad_norm": 2.3735518809037126, + "learning_rate": 3.250317524215501e-06, + "loss": 1.0237, + "step": 4248 + }, + { + "epoch": 0.30655459759748926, + "grad_norm": 2.5712528074079515, + "learning_rate": 3.249952716864368e-06, + "loss": 0.9392, + "step": 4249 + }, + { + "epoch": 0.3066267450669168, + "grad_norm": 3.363029018606987, + "learning_rate": 3.2495878412579444e-06, + "loss": 0.9674, + "step": 4250 + }, + { + "epoch": 0.3066988925363443, + "grad_norm": 2.886714455236149, + "learning_rate": 3.249222897416155e-06, + "loss": 0.8653, + "step": 4251 + }, + { + "epoch": 0.3067710400057718, + "grad_norm": 2.015593772468856, + "learning_rate": 3.2488578853589268e-06, + "loss": 0.8951, + "step": 4252 + }, + { + "epoch": 0.3068431874751993, + "grad_norm": 2.695746876088366, + "learning_rate": 3.248492805106193e-06, + "loss": 1.04, + "step": 4253 + }, + { + "epoch": 0.3069153349446268, + "grad_norm": 3.092230375435049, + "learning_rate": 3.248127656677889e-06, + "loss": 0.7971, + "step": 4254 + }, + { + "epoch": 0.3069874824140543, + "grad_norm": 8.58944405618524, + "learning_rate": 3.247762440093954e-06, + "loss": 1.0002, + "step": 4255 + }, + { + "epoch": 0.30705962988348184, + "grad_norm": 5.594646531190327, + "learning_rate": 3.2473971553743314e-06, + "loss": 1.0155, + "step": 4256 + }, + { + "epoch": 0.30713177735290936, + "grad_norm": 2.5481757838278867, + "learning_rate": 3.2470318025389674e-06, + "loss": 0.971, + "step": 4257 + }, + { + "epoch": 0.3072039248223369, + "grad_norm": 3.732038205727046, + "learning_rate": 3.246666381607813e-06, + "loss": 0.913, + "step": 4258 + }, + { + "epoch": 0.30727607229176435, + "grad_norm": 2.858975575383114, + "learning_rate": 3.2463008926008225e-06, + "loss": 0.9412, + "step": 4259 + }, + { + "epoch": 0.30734821976119187, + "grad_norm": 3.1920701717226034, + "learning_rate": 3.2459353355379545e-06, + "loss": 1.0223, + "step": 4260 + }, + { + "epoch": 0.3074203672306194, + "grad_norm": 2.410652393979607, + "learning_rate": 3.245569710439169e-06, + "loss": 0.9726, + "step": 4261 + }, + { + "epoch": 0.3074925147000469, + "grad_norm": 2.371944168147788, + "learning_rate": 3.245204017324434e-06, + "loss": 0.8839, + "step": 4262 + }, + { + "epoch": 0.3075646621694744, + "grad_norm": 2.4954628781443113, + "learning_rate": 3.2448382562137163e-06, + "loss": 1.002, + "step": 4263 + }, + { + "epoch": 0.30763680963890194, + "grad_norm": 2.312034353852372, + "learning_rate": 3.2444724271269898e-06, + "loss": 0.9256, + "step": 4264 + }, + { + "epoch": 0.3077089571083294, + "grad_norm": 3.318825063197797, + "learning_rate": 3.244106530084231e-06, + "loss": 0.9944, + "step": 4265 + }, + { + "epoch": 0.3077811045777569, + "grad_norm": 2.579507353122773, + "learning_rate": 3.24374056510542e-06, + "loss": 1.0146, + "step": 4266 + }, + { + "epoch": 0.30785325204718444, + "grad_norm": 4.197174511502337, + "learning_rate": 3.243374532210541e-06, + "loss": 0.9586, + "step": 4267 + }, + { + "epoch": 0.30792539951661196, + "grad_norm": 3.4952609812555346, + "learning_rate": 3.2430084314195823e-06, + "loss": 0.905, + "step": 4268 + }, + { + "epoch": 0.3079975469860395, + "grad_norm": 2.270186469195743, + "learning_rate": 3.2426422627525336e-06, + "loss": 0.9388, + "step": 4269 + }, + { + "epoch": 0.30806969445546695, + "grad_norm": 3.1247169366429124, + "learning_rate": 3.2422760262293913e-06, + "loss": 0.8339, + "step": 4270 + }, + { + "epoch": 0.30814184192489447, + "grad_norm": 2.586254365529229, + "learning_rate": 3.241909721870154e-06, + "loss": 0.9137, + "step": 4271 + }, + { + "epoch": 0.308213989394322, + "grad_norm": 2.973287387572739, + "learning_rate": 3.2415433496948243e-06, + "loss": 0.7641, + "step": 4272 + }, + { + "epoch": 0.3082861368637495, + "grad_norm": 2.0942838756803757, + "learning_rate": 3.241176909723408e-06, + "loss": 0.965, + "step": 4273 + }, + { + "epoch": 0.308358284333177, + "grad_norm": 2.4805169048290767, + "learning_rate": 3.2408104019759157e-06, + "loss": 0.9163, + "step": 4274 + }, + { + "epoch": 0.30843043180260454, + "grad_norm": 2.6227464539370566, + "learning_rate": 3.2404438264723605e-06, + "loss": 0.9695, + "step": 4275 + }, + { + "epoch": 0.308502579272032, + "grad_norm": 9.025894213728735, + "learning_rate": 3.2400771832327596e-06, + "loss": 1.0273, + "step": 4276 + }, + { + "epoch": 0.3085747267414595, + "grad_norm": 2.1420793256986186, + "learning_rate": 3.239710472277135e-06, + "loss": 0.9508, + "step": 4277 + }, + { + "epoch": 0.30864687421088705, + "grad_norm": 0.7879638199100731, + "learning_rate": 3.2393436936255106e-06, + "loss": 0.7437, + "step": 4278 + }, + { + "epoch": 0.30871902168031456, + "grad_norm": 2.6763468123791125, + "learning_rate": 3.2389768472979144e-06, + "loss": 0.9562, + "step": 4279 + }, + { + "epoch": 0.3087911691497421, + "grad_norm": 3.2031872254817166, + "learning_rate": 3.23860993331438e-06, + "loss": 0.9356, + "step": 4280 + }, + { + "epoch": 0.3088633166191696, + "grad_norm": 2.559003817479043, + "learning_rate": 3.238242951694942e-06, + "loss": 1.017, + "step": 4281 + }, + { + "epoch": 0.30893546408859707, + "grad_norm": 2.9596773953809565, + "learning_rate": 3.2378759024596395e-06, + "loss": 0.964, + "step": 4282 + }, + { + "epoch": 0.3090076115580246, + "grad_norm": 4.7966994461590176, + "learning_rate": 3.237508785628517e-06, + "loss": 0.8686, + "step": 4283 + }, + { + "epoch": 0.3090797590274521, + "grad_norm": 12.331152714588681, + "learning_rate": 3.237141601221621e-06, + "loss": 1.0107, + "step": 4284 + }, + { + "epoch": 0.3091519064968796, + "grad_norm": 2.5931036270154486, + "learning_rate": 3.2367743492590016e-06, + "loss": 1.009, + "step": 4285 + }, + { + "epoch": 0.30922405396630714, + "grad_norm": 2.3024246745260646, + "learning_rate": 3.2364070297607135e-06, + "loss": 0.9401, + "step": 4286 + }, + { + "epoch": 0.30929620143573466, + "grad_norm": 2.392401440383077, + "learning_rate": 3.2360396427468146e-06, + "loss": 0.9031, + "step": 4287 + }, + { + "epoch": 0.3093683489051621, + "grad_norm": 1.8490172327537306, + "learning_rate": 3.2356721882373663e-06, + "loss": 1.0173, + "step": 4288 + }, + { + "epoch": 0.30944049637458965, + "grad_norm": 3.056788041822559, + "learning_rate": 3.2353046662524336e-06, + "loss": 0.8774, + "step": 4289 + }, + { + "epoch": 0.30951264384401717, + "grad_norm": 5.978475426604485, + "learning_rate": 3.2349370768120877e-06, + "loss": 0.9531, + "step": 4290 + }, + { + "epoch": 0.3095847913134447, + "grad_norm": 2.527411199079441, + "learning_rate": 3.234569419936398e-06, + "loss": 0.9204, + "step": 4291 + }, + { + "epoch": 0.3096569387828722, + "grad_norm": 2.775391140427028, + "learning_rate": 3.234201695645443e-06, + "loss": 0.9229, + "step": 4292 + }, + { + "epoch": 0.3097290862522997, + "grad_norm": 2.267189485429437, + "learning_rate": 3.2338339039593026e-06, + "loss": 0.8758, + "step": 4293 + }, + { + "epoch": 0.3098012337217272, + "grad_norm": 2.5643763301882174, + "learning_rate": 3.233466044898059e-06, + "loss": 0.9597, + "step": 4294 + }, + { + "epoch": 0.3098733811911547, + "grad_norm": 2.493543393556265, + "learning_rate": 3.2330981184818027e-06, + "loss": 0.8418, + "step": 4295 + }, + { + "epoch": 0.3099455286605822, + "grad_norm": 3.23424024807988, + "learning_rate": 3.2327301247306214e-06, + "loss": 0.8137, + "step": 4296 + }, + { + "epoch": 0.31001767613000974, + "grad_norm": 0.9294234469830638, + "learning_rate": 3.232362063664612e-06, + "loss": 0.8687, + "step": 4297 + }, + { + "epoch": 0.31008982359943726, + "grad_norm": 2.787657226097065, + "learning_rate": 3.2319939353038725e-06, + "loss": 1.0483, + "step": 4298 + }, + { + "epoch": 0.3101619710688648, + "grad_norm": 4.647921083118501, + "learning_rate": 3.2316257396685042e-06, + "loss": 0.837, + "step": 4299 + }, + { + "epoch": 0.31023411853829225, + "grad_norm": 2.9089500436489644, + "learning_rate": 3.231257476778614e-06, + "loss": 0.9434, + "step": 4300 + }, + { + "epoch": 0.31030626600771977, + "grad_norm": 4.11688659842389, + "learning_rate": 3.230889146654311e-06, + "loss": 0.78, + "step": 4301 + }, + { + "epoch": 0.3103784134771473, + "grad_norm": 3.361177182839869, + "learning_rate": 3.2305207493157083e-06, + "loss": 0.8219, + "step": 4302 + }, + { + "epoch": 0.3104505609465748, + "grad_norm": 2.612475352193837, + "learning_rate": 3.230152284782923e-06, + "loss": 0.7866, + "step": 4303 + }, + { + "epoch": 0.3105227084160023, + "grad_norm": 2.949272428112501, + "learning_rate": 3.2297837530760744e-06, + "loss": 1.1049, + "step": 4304 + }, + { + "epoch": 0.31059485588542984, + "grad_norm": 2.9170412277221858, + "learning_rate": 3.2294151542152887e-06, + "loss": 0.998, + "step": 4305 + }, + { + "epoch": 0.3106670033548573, + "grad_norm": 1.8762448946523111, + "learning_rate": 3.2290464882206915e-06, + "loss": 0.9057, + "step": 4306 + }, + { + "epoch": 0.3107391508242848, + "grad_norm": 2.75955724233183, + "learning_rate": 3.228677755112416e-06, + "loss": 0.8915, + "step": 4307 + }, + { + "epoch": 0.31081129829371235, + "grad_norm": 2.235879884799926, + "learning_rate": 3.228308954910597e-06, + "loss": 0.9943, + "step": 4308 + }, + { + "epoch": 0.31088344576313987, + "grad_norm": 4.906131232701114, + "learning_rate": 3.227940087635372e-06, + "loss": 0.9916, + "step": 4309 + }, + { + "epoch": 0.3109555932325674, + "grad_norm": 3.439834079733171, + "learning_rate": 3.2275711533068856e-06, + "loss": 0.917, + "step": 4310 + }, + { + "epoch": 0.3110277407019949, + "grad_norm": 3.896014666629577, + "learning_rate": 3.227202151945282e-06, + "loss": 0.9632, + "step": 4311 + }, + { + "epoch": 0.31109988817142237, + "grad_norm": 2.8761718683684467, + "learning_rate": 3.226833083570712e-06, + "loss": 0.932, + "step": 4312 + }, + { + "epoch": 0.3111720356408499, + "grad_norm": 2.3706732036055618, + "learning_rate": 3.2264639482033294e-06, + "loss": 0.9259, + "step": 4313 + }, + { + "epoch": 0.3112441831102774, + "grad_norm": 2.414991576533505, + "learning_rate": 3.22609474586329e-06, + "loss": 0.9969, + "step": 4314 + }, + { + "epoch": 0.3113163305797049, + "grad_norm": 2.2316186082188927, + "learning_rate": 3.2257254765707558e-06, + "loss": 0.9678, + "step": 4315 + }, + { + "epoch": 0.31138847804913244, + "grad_norm": 2.5554723438131544, + "learning_rate": 3.2253561403458908e-06, + "loss": 0.9485, + "step": 4316 + }, + { + "epoch": 0.31146062551855996, + "grad_norm": 2.2902589346739823, + "learning_rate": 3.224986737208863e-06, + "loss": 1.0239, + "step": 4317 + }, + { + "epoch": 0.3115327729879874, + "grad_norm": 0.7674802373149662, + "learning_rate": 3.2246172671798446e-06, + "loss": 0.7843, + "step": 4318 + }, + { + "epoch": 0.31160492045741495, + "grad_norm": 2.4257464567628575, + "learning_rate": 3.2242477302790106e-06, + "loss": 1.0448, + "step": 4319 + }, + { + "epoch": 0.31167706792684247, + "grad_norm": 3.77299240993987, + "learning_rate": 3.223878126526539e-06, + "loss": 0.8336, + "step": 4320 + }, + { + "epoch": 0.31174921539627, + "grad_norm": 3.261216103204717, + "learning_rate": 3.2235084559426147e-06, + "loss": 0.9167, + "step": 4321 + }, + { + "epoch": 0.3118213628656975, + "grad_norm": 2.177833123812139, + "learning_rate": 3.2231387185474227e-06, + "loss": 0.9904, + "step": 4322 + }, + { + "epoch": 0.31189351033512497, + "grad_norm": 3.1379417989863487, + "learning_rate": 3.2227689143611532e-06, + "loss": 0.8747, + "step": 4323 + }, + { + "epoch": 0.3119656578045525, + "grad_norm": 2.049587287296711, + "learning_rate": 3.2223990434039994e-06, + "loss": 0.9454, + "step": 4324 + }, + { + "epoch": 0.31203780527398, + "grad_norm": 2.3299160410935245, + "learning_rate": 3.2220291056961593e-06, + "loss": 0.9567, + "step": 4325 + }, + { + "epoch": 0.3121099527434075, + "grad_norm": 2.7135071462435554, + "learning_rate": 3.221659101257833e-06, + "loss": 0.9321, + "step": 4326 + }, + { + "epoch": 0.31218210021283505, + "grad_norm": 1.8257694555774104, + "learning_rate": 3.221289030109227e-06, + "loss": 0.9407, + "step": 4327 + }, + { + "epoch": 0.31225424768226256, + "grad_norm": 2.2200734664170603, + "learning_rate": 3.220918892270547e-06, + "loss": 1.0267, + "step": 4328 + }, + { + "epoch": 0.31232639515169003, + "grad_norm": 3.2653955990467565, + "learning_rate": 3.2205486877620057e-06, + "loss": 0.9483, + "step": 4329 + }, + { + "epoch": 0.31239854262111755, + "grad_norm": 2.7845723742605646, + "learning_rate": 3.2201784166038195e-06, + "loss": 0.9994, + "step": 4330 + }, + { + "epoch": 0.31247069009054507, + "grad_norm": 2.656551837601336, + "learning_rate": 3.219808078816207e-06, + "loss": 0.9852, + "step": 4331 + }, + { + "epoch": 0.3125428375599726, + "grad_norm": 1.884103988972694, + "learning_rate": 3.2194376744193907e-06, + "loss": 0.9299, + "step": 4332 + }, + { + "epoch": 0.3126149850294001, + "grad_norm": 2.1926791324317185, + "learning_rate": 3.219067203433597e-06, + "loss": 0.9268, + "step": 4333 + }, + { + "epoch": 0.3126871324988276, + "grad_norm": 3.0078142438611017, + "learning_rate": 3.218696665879056e-06, + "loss": 0.9334, + "step": 4334 + }, + { + "epoch": 0.3127592799682551, + "grad_norm": 3.600871981874139, + "learning_rate": 3.218326061776002e-06, + "loss": 0.8193, + "step": 4335 + }, + { + "epoch": 0.3128314274376826, + "grad_norm": 2.217450150678894, + "learning_rate": 3.217955391144672e-06, + "loss": 0.8747, + "step": 4336 + }, + { + "epoch": 0.3129035749071101, + "grad_norm": 2.244258655070345, + "learning_rate": 3.217584654005306e-06, + "loss": 0.9276, + "step": 4337 + }, + { + "epoch": 0.31297572237653765, + "grad_norm": 2.3175976216023217, + "learning_rate": 3.21721385037815e-06, + "loss": 0.8851, + "step": 4338 + }, + { + "epoch": 0.31304786984596517, + "grad_norm": 3.328644474661764, + "learning_rate": 3.216842980283452e-06, + "loss": 0.9558, + "step": 4339 + }, + { + "epoch": 0.3131200173153927, + "grad_norm": 2.067834832640307, + "learning_rate": 3.216472043741463e-06, + "loss": 0.8993, + "step": 4340 + }, + { + "epoch": 0.31319216478482015, + "grad_norm": 3.2305459471108042, + "learning_rate": 3.2161010407724385e-06, + "loss": 0.9364, + "step": 4341 + }, + { + "epoch": 0.31326431225424767, + "grad_norm": 3.4894627944093193, + "learning_rate": 3.215729971396638e-06, + "loss": 0.9232, + "step": 4342 + }, + { + "epoch": 0.3133364597236752, + "grad_norm": 2.502238797057059, + "learning_rate": 3.215358835634325e-06, + "loss": 1.0048, + "step": 4343 + }, + { + "epoch": 0.3134086071931027, + "grad_norm": 2.9779110554464836, + "learning_rate": 3.2149876335057647e-06, + "loss": 0.9383, + "step": 4344 + }, + { + "epoch": 0.3134807546625302, + "grad_norm": 0.8966762362351219, + "learning_rate": 3.2146163650312276e-06, + "loss": 0.8186, + "step": 4345 + }, + { + "epoch": 0.31355290213195774, + "grad_norm": 3.670417528854259, + "learning_rate": 3.2142450302309873e-06, + "loss": 0.7895, + "step": 4346 + }, + { + "epoch": 0.3136250496013852, + "grad_norm": 3.265249422369324, + "learning_rate": 3.21387362912532e-06, + "loss": 1.0267, + "step": 4347 + }, + { + "epoch": 0.3136971970708127, + "grad_norm": 3.40169171152298, + "learning_rate": 3.2135021617345084e-06, + "loss": 0.9672, + "step": 4348 + }, + { + "epoch": 0.31376934454024025, + "grad_norm": 2.435284414581, + "learning_rate": 3.2131306280788354e-06, + "loss": 0.8819, + "step": 4349 + }, + { + "epoch": 0.31384149200966777, + "grad_norm": 2.433152650195386, + "learning_rate": 3.21275902817859e-06, + "loss": 0.8917, + "step": 4350 + }, + { + "epoch": 0.3139136394790953, + "grad_norm": 2.229710331394253, + "learning_rate": 3.2123873620540636e-06, + "loss": 0.9409, + "step": 4351 + }, + { + "epoch": 0.3139857869485228, + "grad_norm": 5.443760917643867, + "learning_rate": 3.2120156297255517e-06, + "loss": 0.9031, + "step": 4352 + }, + { + "epoch": 0.31405793441795027, + "grad_norm": 2.538045168788096, + "learning_rate": 3.2116438312133517e-06, + "loss": 0.9062, + "step": 4353 + }, + { + "epoch": 0.3141300818873778, + "grad_norm": 2.828577837861081, + "learning_rate": 3.2112719665377687e-06, + "loss": 0.9846, + "step": 4354 + }, + { + "epoch": 0.3142022293568053, + "grad_norm": 3.4840919824451637, + "learning_rate": 3.210900035719107e-06, + "loss": 0.9424, + "step": 4355 + }, + { + "epoch": 0.3142743768262328, + "grad_norm": 2.4243526459873186, + "learning_rate": 3.2105280387776773e-06, + "loss": 0.9986, + "step": 4356 + }, + { + "epoch": 0.31434652429566035, + "grad_norm": 3.430253036552171, + "learning_rate": 3.2101559757337925e-06, + "loss": 1.0004, + "step": 4357 + }, + { + "epoch": 0.31441867176508786, + "grad_norm": 3.5418424824476866, + "learning_rate": 3.2097838466077697e-06, + "loss": 0.9095, + "step": 4358 + }, + { + "epoch": 0.31449081923451533, + "grad_norm": 2.117681016447155, + "learning_rate": 3.2094116514199294e-06, + "loss": 0.9879, + "step": 4359 + }, + { + "epoch": 0.31456296670394285, + "grad_norm": 1.9674520301060539, + "learning_rate": 3.209039390190596e-06, + "loss": 0.9529, + "step": 4360 + }, + { + "epoch": 0.31463511417337037, + "grad_norm": 2.606262452166699, + "learning_rate": 3.2086670629400978e-06, + "loss": 1.0102, + "step": 4361 + }, + { + "epoch": 0.3147072616427979, + "grad_norm": 2.4885013309327273, + "learning_rate": 3.2082946696887643e-06, + "loss": 0.8564, + "step": 4362 + }, + { + "epoch": 0.3147794091122254, + "grad_norm": 2.2405488409615466, + "learning_rate": 3.2079222104569325e-06, + "loss": 0.8494, + "step": 4363 + }, + { + "epoch": 0.3148515565816529, + "grad_norm": 2.4415595654985447, + "learning_rate": 3.207549685264941e-06, + "loss": 0.9337, + "step": 4364 + }, + { + "epoch": 0.3149237040510804, + "grad_norm": 2.3429746235325157, + "learning_rate": 3.2071770941331304e-06, + "loss": 1.0014, + "step": 4365 + }, + { + "epoch": 0.3149958515205079, + "grad_norm": 7.6032775637822185, + "learning_rate": 3.2068044370818478e-06, + "loss": 0.9953, + "step": 4366 + }, + { + "epoch": 0.3150679989899354, + "grad_norm": 5.25256394040073, + "learning_rate": 3.206431714131443e-06, + "loss": 1.019, + "step": 4367 + }, + { + "epoch": 0.31514014645936295, + "grad_norm": 2.7728855497208293, + "learning_rate": 3.2060589253022676e-06, + "loss": 0.9663, + "step": 4368 + }, + { + "epoch": 0.31521229392879047, + "grad_norm": 2.829060268133647, + "learning_rate": 3.2056860706146793e-06, + "loss": 1.0272, + "step": 4369 + }, + { + "epoch": 0.315284441398218, + "grad_norm": 2.653547550336755, + "learning_rate": 3.2053131500890372e-06, + "loss": 0.9899, + "step": 4370 + }, + { + "epoch": 0.31535658886764545, + "grad_norm": 2.246453032575927, + "learning_rate": 3.204940163745707e-06, + "loss": 0.9059, + "step": 4371 + }, + { + "epoch": 0.31542873633707297, + "grad_norm": 2.357922542726707, + "learning_rate": 3.2045671116050547e-06, + "loss": 0.9596, + "step": 4372 + }, + { + "epoch": 0.3155008838065005, + "grad_norm": 0.779438359699107, + "learning_rate": 3.2041939936874514e-06, + "loss": 0.8348, + "step": 4373 + }, + { + "epoch": 0.315573031275928, + "grad_norm": 2.296576448097909, + "learning_rate": 3.2038208100132718e-06, + "loss": 0.9716, + "step": 4374 + }, + { + "epoch": 0.3156451787453555, + "grad_norm": 2.4727599979468873, + "learning_rate": 3.203447560602894e-06, + "loss": 0.9486, + "step": 4375 + }, + { + "epoch": 0.315717326214783, + "grad_norm": 2.012837456316162, + "learning_rate": 3.2030742454767003e-06, + "loss": 1.0041, + "step": 4376 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 0.8550453706047074, + "learning_rate": 3.2027008646550754e-06, + "loss": 0.8711, + "step": 4377 + }, + { + "epoch": 0.31586162115363803, + "grad_norm": 0.7178595458819438, + "learning_rate": 3.202327418158409e-06, + "loss": 0.8245, + "step": 4378 + }, + { + "epoch": 0.31593376862306555, + "grad_norm": 2.1481825798692853, + "learning_rate": 3.201953906007093e-06, + "loss": 0.8892, + "step": 4379 + }, + { + "epoch": 0.31600591609249307, + "grad_norm": 2.068990033103127, + "learning_rate": 3.2015803282215234e-06, + "loss": 0.9494, + "step": 4380 + }, + { + "epoch": 0.3160780635619206, + "grad_norm": 2.23566394254762, + "learning_rate": 3.2012066848221e-06, + "loss": 0.9705, + "step": 4381 + }, + { + "epoch": 0.31615021103134805, + "grad_norm": 2.3257584801053577, + "learning_rate": 3.200832975829227e-06, + "loss": 0.847, + "step": 4382 + }, + { + "epoch": 0.31622235850077557, + "grad_norm": 2.1312675050257104, + "learning_rate": 3.2004592012633097e-06, + "loss": 0.9308, + "step": 4383 + }, + { + "epoch": 0.3162945059702031, + "grad_norm": 1.0224693902803956, + "learning_rate": 3.20008536114476e-06, + "loss": 0.8717, + "step": 4384 + }, + { + "epoch": 0.3163666534396306, + "grad_norm": 2.0380609032812043, + "learning_rate": 3.1997114554939907e-06, + "loss": 0.9258, + "step": 4385 + }, + { + "epoch": 0.3164388009090581, + "grad_norm": 2.9526586416650367, + "learning_rate": 3.1993374843314203e-06, + "loss": 0.9736, + "step": 4386 + }, + { + "epoch": 0.31651094837848565, + "grad_norm": 2.994567561955762, + "learning_rate": 3.19896344767747e-06, + "loss": 0.8371, + "step": 4387 + }, + { + "epoch": 0.3165830958479131, + "grad_norm": 2.294557816868246, + "learning_rate": 3.198589345552564e-06, + "loss": 0.9201, + "step": 4388 + }, + { + "epoch": 0.31665524331734063, + "grad_norm": 1.883844243294972, + "learning_rate": 3.198215177977131e-06, + "loss": 1.0119, + "step": 4389 + }, + { + "epoch": 0.31672739078676815, + "grad_norm": 1.8249388749846993, + "learning_rate": 3.1978409449716023e-06, + "loss": 0.9592, + "step": 4390 + }, + { + "epoch": 0.31679953825619567, + "grad_norm": 2.995506258862967, + "learning_rate": 3.197466646556414e-06, + "loss": 0.9695, + "step": 4391 + }, + { + "epoch": 0.3168716857256232, + "grad_norm": 3.139986343354091, + "learning_rate": 3.197092282752005e-06, + "loss": 0.9424, + "step": 4392 + }, + { + "epoch": 0.3169438331950507, + "grad_norm": 3.313968404956555, + "learning_rate": 3.196717853578818e-06, + "loss": 0.9785, + "step": 4393 + }, + { + "epoch": 0.31701598066447817, + "grad_norm": 3.583645755248262, + "learning_rate": 3.1963433590572993e-06, + "loss": 0.8936, + "step": 4394 + }, + { + "epoch": 0.3170881281339057, + "grad_norm": 2.1645465405898285, + "learning_rate": 3.1959687992078987e-06, + "loss": 0.9558, + "step": 4395 + }, + { + "epoch": 0.3171602756033332, + "grad_norm": 4.563894816357558, + "learning_rate": 3.1955941740510687e-06, + "loss": 0.9087, + "step": 4396 + }, + { + "epoch": 0.3172324230727607, + "grad_norm": 0.7049003228950816, + "learning_rate": 3.1952194836072667e-06, + "loss": 0.8382, + "step": 4397 + }, + { + "epoch": 0.31730457054218825, + "grad_norm": 1.854896216149779, + "learning_rate": 3.1948447278969537e-06, + "loss": 1.0438, + "step": 4398 + }, + { + "epoch": 0.31737671801161577, + "grad_norm": 2.043529542937632, + "learning_rate": 3.194469906940593e-06, + "loss": 0.9834, + "step": 4399 + }, + { + "epoch": 0.31744886548104323, + "grad_norm": 2.994891585742556, + "learning_rate": 3.1940950207586533e-06, + "loss": 0.9523, + "step": 4400 + }, + { + "epoch": 0.31752101295047075, + "grad_norm": 2.4375230836997903, + "learning_rate": 3.193720069371604e-06, + "loss": 1.006, + "step": 4401 + }, + { + "epoch": 0.31759316041989827, + "grad_norm": 3.3565346708991766, + "learning_rate": 3.1933450527999205e-06, + "loss": 0.9437, + "step": 4402 + }, + { + "epoch": 0.3176653078893258, + "grad_norm": 3.0841683640295425, + "learning_rate": 3.192969971064082e-06, + "loss": 1.0405, + "step": 4403 + }, + { + "epoch": 0.3177374553587533, + "grad_norm": 2.1763146121684005, + "learning_rate": 3.192594824184569e-06, + "loss": 1.0017, + "step": 4404 + }, + { + "epoch": 0.3178096028281808, + "grad_norm": 2.6836638462290474, + "learning_rate": 3.1922196121818682e-06, + "loss": 0.9557, + "step": 4405 + }, + { + "epoch": 0.3178817502976083, + "grad_norm": 4.1470379381286495, + "learning_rate": 3.1918443350764673e-06, + "loss": 0.8186, + "step": 4406 + }, + { + "epoch": 0.3179538977670358, + "grad_norm": 2.0497827042577152, + "learning_rate": 3.1914689928888595e-06, + "loss": 0.8602, + "step": 4407 + }, + { + "epoch": 0.31802604523646333, + "grad_norm": 2.2872395726778354, + "learning_rate": 3.1910935856395405e-06, + "loss": 0.87, + "step": 4408 + }, + { + "epoch": 0.31809819270589085, + "grad_norm": 10.802421679787788, + "learning_rate": 3.1907181133490104e-06, + "loss": 0.9023, + "step": 4409 + }, + { + "epoch": 0.31817034017531837, + "grad_norm": 2.8821971205487, + "learning_rate": 3.1903425760377714e-06, + "loss": 1.0284, + "step": 4410 + }, + { + "epoch": 0.3182424876447459, + "grad_norm": 2.079178851237808, + "learning_rate": 3.1899669737263315e-06, + "loss": 0.9858, + "step": 4411 + }, + { + "epoch": 0.31831463511417335, + "grad_norm": 2.2474348492880574, + "learning_rate": 3.1895913064352e-06, + "loss": 0.9144, + "step": 4412 + }, + { + "epoch": 0.31838678258360087, + "grad_norm": 3.357152159077938, + "learning_rate": 3.1892155741848906e-06, + "loss": 0.8737, + "step": 4413 + }, + { + "epoch": 0.3184589300530284, + "grad_norm": 2.3387644321112573, + "learning_rate": 3.1888397769959216e-06, + "loss": 0.8144, + "step": 4414 + }, + { + "epoch": 0.3185310775224559, + "grad_norm": 7.794367385064012, + "learning_rate": 3.188463914888813e-06, + "loss": 0.9867, + "step": 4415 + }, + { + "epoch": 0.3186032249918834, + "grad_norm": 2.666554468496654, + "learning_rate": 3.18808798788409e-06, + "loss": 0.9421, + "step": 4416 + }, + { + "epoch": 0.31867537246131095, + "grad_norm": 0.9667519915402154, + "learning_rate": 3.1877119960022797e-06, + "loss": 0.8993, + "step": 4417 + }, + { + "epoch": 0.3187475199307384, + "grad_norm": 2.9818193767655172, + "learning_rate": 3.1873359392639134e-06, + "loss": 1.006, + "step": 4418 + }, + { + "epoch": 0.31881966740016593, + "grad_norm": 2.0853226001825895, + "learning_rate": 3.1869598176895284e-06, + "loss": 1.1041, + "step": 4419 + }, + { + "epoch": 0.31889181486959345, + "grad_norm": 3.3147139438204785, + "learning_rate": 3.1865836312996605e-06, + "loss": 0.9701, + "step": 4420 + }, + { + "epoch": 0.31896396233902097, + "grad_norm": 4.439340129308793, + "learning_rate": 3.1862073801148536e-06, + "loss": 0.9742, + "step": 4421 + }, + { + "epoch": 0.3190361098084485, + "grad_norm": 7.219376970287071, + "learning_rate": 3.1858310641556526e-06, + "loss": 1.0344, + "step": 4422 + }, + { + "epoch": 0.319108257277876, + "grad_norm": 5.455835756218118, + "learning_rate": 3.185454683442607e-06, + "loss": 0.915, + "step": 4423 + }, + { + "epoch": 0.31918040474730347, + "grad_norm": 4.126575978093523, + "learning_rate": 3.1850782379962695e-06, + "loss": 0.9518, + "step": 4424 + }, + { + "epoch": 0.319252552216731, + "grad_norm": 2.754335626987704, + "learning_rate": 3.184701727837197e-06, + "loss": 0.9677, + "step": 4425 + }, + { + "epoch": 0.3193246996861585, + "grad_norm": 2.777328040484975, + "learning_rate": 3.184325152985948e-06, + "loss": 0.9712, + "step": 4426 + }, + { + "epoch": 0.31939684715558603, + "grad_norm": 3.247106731468696, + "learning_rate": 3.183948513463087e-06, + "loss": 0.9083, + "step": 4427 + }, + { + "epoch": 0.31946899462501355, + "grad_norm": 3.2438938292734156, + "learning_rate": 3.1835718092891805e-06, + "loss": 0.8718, + "step": 4428 + }, + { + "epoch": 0.319541142094441, + "grad_norm": 2.2149999385170682, + "learning_rate": 3.183195040484799e-06, + "loss": 0.9838, + "step": 4429 + }, + { + "epoch": 0.31961328956386853, + "grad_norm": 3.3428220798297748, + "learning_rate": 3.1828182070705163e-06, + "loss": 0.9419, + "step": 4430 + }, + { + "epoch": 0.31968543703329605, + "grad_norm": 2.9516198899367803, + "learning_rate": 3.1824413090669097e-06, + "loss": 0.9643, + "step": 4431 + }, + { + "epoch": 0.31975758450272357, + "grad_norm": 2.8708422907526674, + "learning_rate": 3.1820643464945606e-06, + "loss": 0.904, + "step": 4432 + }, + { + "epoch": 0.3198297319721511, + "grad_norm": 2.6491811602846878, + "learning_rate": 3.1816873193740534e-06, + "loss": 1.0369, + "step": 4433 + }, + { + "epoch": 0.3199018794415786, + "grad_norm": 4.126081844995511, + "learning_rate": 3.1813102277259766e-06, + "loss": 0.8056, + "step": 4434 + }, + { + "epoch": 0.31997402691100607, + "grad_norm": 0.9748847587655592, + "learning_rate": 3.180933071570921e-06, + "loss": 0.7917, + "step": 4435 + }, + { + "epoch": 0.3200461743804336, + "grad_norm": 2.191012151575625, + "learning_rate": 3.1805558509294813e-06, + "loss": 0.9272, + "step": 4436 + }, + { + "epoch": 0.3201183218498611, + "grad_norm": 2.982099374498161, + "learning_rate": 3.1801785658222578e-06, + "loss": 0.9401, + "step": 4437 + }, + { + "epoch": 0.32019046931928863, + "grad_norm": 2.8559210923885323, + "learning_rate": 3.1798012162698518e-06, + "loss": 0.9764, + "step": 4438 + }, + { + "epoch": 0.32026261678871615, + "grad_norm": 2.210306893180727, + "learning_rate": 3.179423802292868e-06, + "loss": 0.9079, + "step": 4439 + }, + { + "epoch": 0.32033476425814367, + "grad_norm": 2.294095492004273, + "learning_rate": 3.1790463239119173e-06, + "loss": 1.0699, + "step": 4440 + }, + { + "epoch": 0.32040691172757113, + "grad_norm": 3.4724784264879625, + "learning_rate": 3.178668781147611e-06, + "loss": 0.8132, + "step": 4441 + }, + { + "epoch": 0.32047905919699865, + "grad_norm": 2.0890412403014977, + "learning_rate": 3.1782911740205656e-06, + "loss": 0.9948, + "step": 4442 + }, + { + "epoch": 0.32055120666642617, + "grad_norm": 1.8636684695140562, + "learning_rate": 3.1779135025514012e-06, + "loss": 1.0164, + "step": 4443 + }, + { + "epoch": 0.3206233541358537, + "grad_norm": 3.0690943507274246, + "learning_rate": 3.1775357667607415e-06, + "loss": 0.8495, + "step": 4444 + }, + { + "epoch": 0.3206955016052812, + "grad_norm": 2.9904352939112915, + "learning_rate": 3.1771579666692124e-06, + "loss": 0.9678, + "step": 4445 + }, + { + "epoch": 0.3207676490747087, + "grad_norm": 5.336919333969263, + "learning_rate": 3.176780102297444e-06, + "loss": 0.9316, + "step": 4446 + }, + { + "epoch": 0.3208397965441362, + "grad_norm": 2.3221743528045, + "learning_rate": 3.176402173666071e-06, + "loss": 0.9286, + "step": 4447 + }, + { + "epoch": 0.3209119440135637, + "grad_norm": 2.205701363188262, + "learning_rate": 3.17602418079573e-06, + "loss": 0.9429, + "step": 4448 + }, + { + "epoch": 0.32098409148299123, + "grad_norm": 2.953433227600553, + "learning_rate": 3.1756461237070627e-06, + "loss": 0.9701, + "step": 4449 + }, + { + "epoch": 0.32105623895241875, + "grad_norm": 2.3667847948636465, + "learning_rate": 3.1752680024207116e-06, + "loss": 0.8174, + "step": 4450 + }, + { + "epoch": 0.32112838642184627, + "grad_norm": 0.7601495866003194, + "learning_rate": 3.1748898169573257e-06, + "loss": 0.8332, + "step": 4451 + }, + { + "epoch": 0.3212005338912738, + "grad_norm": 2.078349165908397, + "learning_rate": 3.1745115673375566e-06, + "loss": 0.9675, + "step": 4452 + }, + { + "epoch": 0.32127268136070125, + "grad_norm": 4.529875237639454, + "learning_rate": 3.1741332535820587e-06, + "loss": 0.9571, + "step": 4453 + }, + { + "epoch": 0.32134482883012877, + "grad_norm": 1.8820229928804246, + "learning_rate": 3.1737548757114906e-06, + "loss": 0.924, + "step": 4454 + }, + { + "epoch": 0.3214169762995563, + "grad_norm": 2.14716215253894, + "learning_rate": 3.173376433746513e-06, + "loss": 0.9334, + "step": 4455 + }, + { + "epoch": 0.3214891237689838, + "grad_norm": 5.164976192098218, + "learning_rate": 3.1729979277077927e-06, + "loss": 0.9336, + "step": 4456 + }, + { + "epoch": 0.32156127123841133, + "grad_norm": 2.1626398250820698, + "learning_rate": 3.1726193576159972e-06, + "loss": 0.8622, + "step": 4457 + }, + { + "epoch": 0.32163341870783885, + "grad_norm": 4.963023214359321, + "learning_rate": 3.1722407234918e-06, + "loss": 1.0163, + "step": 4458 + }, + { + "epoch": 0.3217055661772663, + "grad_norm": 2.0645270356920773, + "learning_rate": 3.1718620253558765e-06, + "loss": 0.9172, + "step": 4459 + }, + { + "epoch": 0.32177771364669383, + "grad_norm": 2.4281032094873667, + "learning_rate": 3.1714832632289056e-06, + "loss": 0.9185, + "step": 4460 + }, + { + "epoch": 0.32184986111612135, + "grad_norm": 2.6706931193935546, + "learning_rate": 3.1711044371315704e-06, + "loss": 0.9641, + "step": 4461 + }, + { + "epoch": 0.32192200858554887, + "grad_norm": 1.9439406249354778, + "learning_rate": 3.170725547084557e-06, + "loss": 0.9683, + "step": 4462 + }, + { + "epoch": 0.3219941560549764, + "grad_norm": 2.133571391929609, + "learning_rate": 3.170346593108555e-06, + "loss": 0.905, + "step": 4463 + }, + { + "epoch": 0.3220663035244039, + "grad_norm": 2.841754758332666, + "learning_rate": 3.169967575224259e-06, + "loss": 0.9424, + "step": 4464 + }, + { + "epoch": 0.32213845099383137, + "grad_norm": 0.8727381286185231, + "learning_rate": 3.169588493452364e-06, + "loss": 0.8521, + "step": 4465 + }, + { + "epoch": 0.3222105984632589, + "grad_norm": 3.1516625331271246, + "learning_rate": 3.169209347813572e-06, + "loss": 0.9203, + "step": 4466 + }, + { + "epoch": 0.3222827459326864, + "grad_norm": 2.8832502988939024, + "learning_rate": 3.168830138328585e-06, + "loss": 0.9116, + "step": 4467 + }, + { + "epoch": 0.32235489340211393, + "grad_norm": 2.5540745551056903, + "learning_rate": 3.168450865018111e-06, + "loss": 0.9795, + "step": 4468 + }, + { + "epoch": 0.32242704087154145, + "grad_norm": 2.3048384277580216, + "learning_rate": 3.1680715279028613e-06, + "loss": 0.8698, + "step": 4469 + }, + { + "epoch": 0.32249918834096897, + "grad_norm": 2.461298937216588, + "learning_rate": 3.1676921270035487e-06, + "loss": 0.8231, + "step": 4470 + }, + { + "epoch": 0.32257133581039643, + "grad_norm": 2.0405833685514514, + "learning_rate": 3.1673126623408924e-06, + "loss": 0.8793, + "step": 4471 + }, + { + "epoch": 0.32264348327982395, + "grad_norm": 1.8073920919199953, + "learning_rate": 3.1669331339356126e-06, + "loss": 1.0328, + "step": 4472 + }, + { + "epoch": 0.32271563074925147, + "grad_norm": 3.629397914335249, + "learning_rate": 3.166553541808434e-06, + "loss": 0.8942, + "step": 4473 + }, + { + "epoch": 0.322787778218679, + "grad_norm": 3.668711886622259, + "learning_rate": 3.166173885980085e-06, + "loss": 0.9194, + "step": 4474 + }, + { + "epoch": 0.3228599256881065, + "grad_norm": 2.3283107606653326, + "learning_rate": 3.1657941664712972e-06, + "loss": 0.9473, + "step": 4475 + }, + { + "epoch": 0.32293207315753397, + "grad_norm": 2.2935371279359003, + "learning_rate": 3.1654143833028054e-06, + "loss": 0.9367, + "step": 4476 + }, + { + "epoch": 0.3230042206269615, + "grad_norm": 2.80796125609141, + "learning_rate": 3.1650345364953488e-06, + "loss": 0.9693, + "step": 4477 + }, + { + "epoch": 0.323076368096389, + "grad_norm": 2.2995192688800064, + "learning_rate": 3.1646546260696684e-06, + "loss": 0.9462, + "step": 4478 + }, + { + "epoch": 0.32314851556581653, + "grad_norm": 2.9703061492547014, + "learning_rate": 3.164274652046511e-06, + "loss": 1.0669, + "step": 4479 + }, + { + "epoch": 0.32322066303524405, + "grad_norm": 2.2170012123610467, + "learning_rate": 3.163894614446624e-06, + "loss": 1.0412, + "step": 4480 + }, + { + "epoch": 0.32329281050467157, + "grad_norm": 2.6684620793496396, + "learning_rate": 3.163514513290761e-06, + "loss": 0.965, + "step": 4481 + }, + { + "epoch": 0.32336495797409903, + "grad_norm": 3.559894612927659, + "learning_rate": 3.163134348599678e-06, + "loss": 0.9558, + "step": 4482 + }, + { + "epoch": 0.32343710544352655, + "grad_norm": 2.0901298541016144, + "learning_rate": 3.162754120394134e-06, + "loss": 0.9394, + "step": 4483 + }, + { + "epoch": 0.32350925291295407, + "grad_norm": 3.334385673984936, + "learning_rate": 3.1623738286948914e-06, + "loss": 0.886, + "step": 4484 + }, + { + "epoch": 0.3235814003823816, + "grad_norm": 2.9021529985999557, + "learning_rate": 3.1619934735227166e-06, + "loss": 1.0165, + "step": 4485 + }, + { + "epoch": 0.3236535478518091, + "grad_norm": 1.84130347915542, + "learning_rate": 3.161613054898381e-06, + "loss": 0.9733, + "step": 4486 + }, + { + "epoch": 0.32372569532123663, + "grad_norm": 2.5852220358019062, + "learning_rate": 3.161232572842656e-06, + "loss": 0.9638, + "step": 4487 + }, + { + "epoch": 0.3237978427906641, + "grad_norm": 3.370636415014844, + "learning_rate": 3.160852027376319e-06, + "loss": 1.0016, + "step": 4488 + }, + { + "epoch": 0.3238699902600916, + "grad_norm": 2.3017940655953906, + "learning_rate": 3.1604714185201497e-06, + "loss": 0.8739, + "step": 4489 + }, + { + "epoch": 0.32394213772951913, + "grad_norm": 2.6380312669302297, + "learning_rate": 3.1600907462949323e-06, + "loss": 0.7881, + "step": 4490 + }, + { + "epoch": 0.32401428519894665, + "grad_norm": 0.9139285193169474, + "learning_rate": 3.1597100107214536e-06, + "loss": 0.8476, + "step": 4491 + }, + { + "epoch": 0.32408643266837417, + "grad_norm": 2.8329704463668155, + "learning_rate": 3.1593292118205045e-06, + "loss": 1.0439, + "step": 4492 + }, + { + "epoch": 0.3241585801378017, + "grad_norm": 2.8491086670278483, + "learning_rate": 3.1589483496128795e-06, + "loss": 0.9785, + "step": 4493 + }, + { + "epoch": 0.32423072760722915, + "grad_norm": 2.329949093209339, + "learning_rate": 3.1585674241193746e-06, + "loss": 0.9418, + "step": 4494 + }, + { + "epoch": 0.32430287507665667, + "grad_norm": 6.533347053253923, + "learning_rate": 3.1581864353607917e-06, + "loss": 0.8544, + "step": 4495 + }, + { + "epoch": 0.3243750225460842, + "grad_norm": 2.3522846722540383, + "learning_rate": 3.157805383357935e-06, + "loss": 0.863, + "step": 4496 + }, + { + "epoch": 0.3244471700155117, + "grad_norm": 2.522765173381311, + "learning_rate": 3.1574242681316124e-06, + "loss": 0.824, + "step": 4497 + }, + { + "epoch": 0.32451931748493923, + "grad_norm": 2.1095688342292407, + "learning_rate": 3.1570430897026354e-06, + "loss": 0.953, + "step": 4498 + }, + { + "epoch": 0.32459146495436675, + "grad_norm": 4.872892853590487, + "learning_rate": 3.1566618480918187e-06, + "loss": 0.901, + "step": 4499 + }, + { + "epoch": 0.3246636124237942, + "grad_norm": 2.286577666230923, + "learning_rate": 3.15628054331998e-06, + "loss": 0.8809, + "step": 4500 + }, + { + "epoch": 0.32473575989322173, + "grad_norm": 1.8221701119629727, + "learning_rate": 3.1558991754079406e-06, + "loss": 0.9284, + "step": 4501 + }, + { + "epoch": 0.32480790736264925, + "grad_norm": 3.1017987908358218, + "learning_rate": 3.155517744376527e-06, + "loss": 0.8697, + "step": 4502 + }, + { + "epoch": 0.32488005483207677, + "grad_norm": 2.309202472193954, + "learning_rate": 3.1551362502465663e-06, + "loss": 1.0149, + "step": 4503 + }, + { + "epoch": 0.3249522023015043, + "grad_norm": 2.645764377526453, + "learning_rate": 3.154754693038892e-06, + "loss": 0.9874, + "step": 4504 + }, + { + "epoch": 0.3250243497709318, + "grad_norm": 2.5977384855404155, + "learning_rate": 3.154373072774338e-06, + "loss": 0.8721, + "step": 4505 + }, + { + "epoch": 0.3250964972403593, + "grad_norm": 2.3389525077405007, + "learning_rate": 3.153991389473744e-06, + "loss": 0.891, + "step": 4506 + }, + { + "epoch": 0.3251686447097868, + "grad_norm": 2.2965846494508986, + "learning_rate": 3.1536096431579523e-06, + "loss": 0.8895, + "step": 4507 + }, + { + "epoch": 0.3252407921792143, + "grad_norm": 2.68868495633713, + "learning_rate": 3.1532278338478086e-06, + "loss": 0.8528, + "step": 4508 + }, + { + "epoch": 0.32531293964864183, + "grad_norm": 2.294763541745514, + "learning_rate": 3.1528459615641615e-06, + "loss": 0.8296, + "step": 4509 + }, + { + "epoch": 0.32538508711806935, + "grad_norm": 1.695250127673373, + "learning_rate": 3.152464026327865e-06, + "loss": 0.8757, + "step": 4510 + }, + { + "epoch": 0.32545723458749687, + "grad_norm": 1.8258973593594723, + "learning_rate": 3.152082028159773e-06, + "loss": 0.9629, + "step": 4511 + }, + { + "epoch": 0.32552938205692433, + "grad_norm": 2.019990201114267, + "learning_rate": 3.1516999670807475e-06, + "loss": 0.9659, + "step": 4512 + }, + { + "epoch": 0.32560152952635185, + "grad_norm": 2.387882030339923, + "learning_rate": 3.1513178431116498e-06, + "loss": 1.0341, + "step": 4513 + }, + { + "epoch": 0.32567367699577937, + "grad_norm": 2.321971159184996, + "learning_rate": 3.1509356562733464e-06, + "loss": 1.0882, + "step": 4514 + }, + { + "epoch": 0.3257458244652069, + "grad_norm": 2.848667462347116, + "learning_rate": 3.150553406586708e-06, + "loss": 0.8597, + "step": 4515 + }, + { + "epoch": 0.3258179719346344, + "grad_norm": 2.275082337283303, + "learning_rate": 3.150171094072607e-06, + "loss": 0.9054, + "step": 4516 + }, + { + "epoch": 0.32589011940406193, + "grad_norm": 2.4504912020236, + "learning_rate": 3.1497887187519203e-06, + "loss": 0.9941, + "step": 4517 + }, + { + "epoch": 0.3259622668734894, + "grad_norm": 2.3314921971729854, + "learning_rate": 3.149406280645528e-06, + "loss": 0.9554, + "step": 4518 + }, + { + "epoch": 0.3260344143429169, + "grad_norm": 2.7094435739254, + "learning_rate": 3.149023779774314e-06, + "loss": 0.9616, + "step": 4519 + }, + { + "epoch": 0.32610656181234443, + "grad_norm": 2.458369971395633, + "learning_rate": 3.1486412161591645e-06, + "loss": 0.8758, + "step": 4520 + }, + { + "epoch": 0.32617870928177195, + "grad_norm": 1.9603160466132121, + "learning_rate": 3.1482585898209706e-06, + "loss": 1.0532, + "step": 4521 + }, + { + "epoch": 0.32625085675119947, + "grad_norm": 2.017369779333626, + "learning_rate": 3.1478759007806258e-06, + "loss": 0.9906, + "step": 4522 + }, + { + "epoch": 0.326323004220627, + "grad_norm": 1.9012501293134676, + "learning_rate": 3.147493149059027e-06, + "loss": 1.015, + "step": 4523 + }, + { + "epoch": 0.32639515169005445, + "grad_norm": 2.6219461024467576, + "learning_rate": 3.1471103346770757e-06, + "loss": 0.9679, + "step": 4524 + }, + { + "epoch": 0.32646729915948197, + "grad_norm": 2.850963623703256, + "learning_rate": 3.1467274576556753e-06, + "loss": 0.8823, + "step": 4525 + }, + { + "epoch": 0.3265394466289095, + "grad_norm": 2.0428612646124202, + "learning_rate": 3.1463445180157334e-06, + "loss": 0.9435, + "step": 4526 + }, + { + "epoch": 0.326611594098337, + "grad_norm": 2.252568474341657, + "learning_rate": 3.145961515778161e-06, + "loss": 1.0424, + "step": 4527 + }, + { + "epoch": 0.32668374156776453, + "grad_norm": 5.007360381428717, + "learning_rate": 3.1455784509638726e-06, + "loss": 0.9177, + "step": 4528 + }, + { + "epoch": 0.326755889037192, + "grad_norm": 2.003649243387511, + "learning_rate": 3.1451953235937854e-06, + "loss": 0.9514, + "step": 4529 + }, + { + "epoch": 0.3268280365066195, + "grad_norm": 3.052089044524351, + "learning_rate": 3.1448121336888214e-06, + "loss": 0.8079, + "step": 4530 + }, + { + "epoch": 0.32690018397604703, + "grad_norm": 2.1920972200538844, + "learning_rate": 3.1444288812699045e-06, + "loss": 0.8691, + "step": 4531 + }, + { + "epoch": 0.32697233144547455, + "grad_norm": 2.028821226739399, + "learning_rate": 3.1440455663579627e-06, + "loss": 0.9754, + "step": 4532 + }, + { + "epoch": 0.32704447891490207, + "grad_norm": 2.203365556286039, + "learning_rate": 3.143662188973928e-06, + "loss": 0.8069, + "step": 4533 + }, + { + "epoch": 0.3271166263843296, + "grad_norm": 2.3830950460044633, + "learning_rate": 3.1432787491387346e-06, + "loss": 0.8811, + "step": 4534 + }, + { + "epoch": 0.32718877385375705, + "grad_norm": 2.4840238310980705, + "learning_rate": 3.142895246873321e-06, + "loss": 1.0347, + "step": 4535 + }, + { + "epoch": 0.3272609213231846, + "grad_norm": 0.74179114750825, + "learning_rate": 3.142511682198629e-06, + "loss": 0.7932, + "step": 4536 + }, + { + "epoch": 0.3273330687926121, + "grad_norm": 2.7077510696900693, + "learning_rate": 3.1421280551356034e-06, + "loss": 0.8996, + "step": 4537 + }, + { + "epoch": 0.3274052162620396, + "grad_norm": 2.6336212274458375, + "learning_rate": 3.141744365705193e-06, + "loss": 0.9504, + "step": 4538 + }, + { + "epoch": 0.32747736373146713, + "grad_norm": 2.493244389496223, + "learning_rate": 3.1413606139283492e-06, + "loss": 0.9721, + "step": 4539 + }, + { + "epoch": 0.32754951120089465, + "grad_norm": 5.0022673234488515, + "learning_rate": 3.140976799826027e-06, + "loss": 0.8812, + "step": 4540 + }, + { + "epoch": 0.3276216586703221, + "grad_norm": 3.3325001788014093, + "learning_rate": 3.1405929234191855e-06, + "loss": 1.0133, + "step": 4541 + }, + { + "epoch": 0.32769380613974963, + "grad_norm": 2.2663636154588227, + "learning_rate": 3.1402089847287876e-06, + "loss": 1.0544, + "step": 4542 + }, + { + "epoch": 0.32776595360917715, + "grad_norm": 2.9746244433917424, + "learning_rate": 3.1398249837757976e-06, + "loss": 0.8855, + "step": 4543 + }, + { + "epoch": 0.32783810107860467, + "grad_norm": 3.9311315715763615, + "learning_rate": 3.1394409205811846e-06, + "loss": 0.8991, + "step": 4544 + }, + { + "epoch": 0.3279102485480322, + "grad_norm": 2.054365125538973, + "learning_rate": 3.1390567951659204e-06, + "loss": 0.8957, + "step": 4545 + }, + { + "epoch": 0.3279823960174597, + "grad_norm": 4.088543325371213, + "learning_rate": 3.138672607550982e-06, + "loss": 0.9004, + "step": 4546 + }, + { + "epoch": 0.3280545434868872, + "grad_norm": 2.2579203121223688, + "learning_rate": 3.138288357757348e-06, + "loss": 1.0091, + "step": 4547 + }, + { + "epoch": 0.3281266909563147, + "grad_norm": 2.2159457686085333, + "learning_rate": 3.1379040458060004e-06, + "loss": 0.801, + "step": 4548 + }, + { + "epoch": 0.3281988384257422, + "grad_norm": 2.8324418815986814, + "learning_rate": 3.1375196717179247e-06, + "loss": 0.9972, + "step": 4549 + }, + { + "epoch": 0.32827098589516973, + "grad_norm": 1.947614418556805, + "learning_rate": 3.137135235514111e-06, + "loss": 0.9926, + "step": 4550 + }, + { + "epoch": 0.32834313336459725, + "grad_norm": 2.1665377456261075, + "learning_rate": 3.1367507372155518e-06, + "loss": 1.004, + "step": 4551 + }, + { + "epoch": 0.32841528083402477, + "grad_norm": 0.6998991399718035, + "learning_rate": 3.1363661768432423e-06, + "loss": 0.7628, + "step": 4552 + }, + { + "epoch": 0.32848742830345223, + "grad_norm": 2.967656626886362, + "learning_rate": 3.1359815544181835e-06, + "loss": 0.9089, + "step": 4553 + }, + { + "epoch": 0.32855957577287975, + "grad_norm": 2.638882666017211, + "learning_rate": 3.1355968699613765e-06, + "loss": 0.9512, + "step": 4554 + }, + { + "epoch": 0.32863172324230727, + "grad_norm": 3.9037959820875865, + "learning_rate": 3.1352121234938284e-06, + "loss": 0.9486, + "step": 4555 + }, + { + "epoch": 0.3287038707117348, + "grad_norm": 2.3938267030702693, + "learning_rate": 3.134827315036549e-06, + "loss": 0.9625, + "step": 4556 + }, + { + "epoch": 0.3287760181811623, + "grad_norm": 2.165519471783184, + "learning_rate": 3.134442444610551e-06, + "loss": 1.0232, + "step": 4557 + }, + { + "epoch": 0.32884816565058983, + "grad_norm": 2.4598688620992757, + "learning_rate": 3.13405751223685e-06, + "loss": 0.8585, + "step": 4558 + }, + { + "epoch": 0.3289203131200173, + "grad_norm": 10.280527941732476, + "learning_rate": 3.133672517936467e-06, + "loss": 0.9817, + "step": 4559 + }, + { + "epoch": 0.3289924605894448, + "grad_norm": 2.08947351331279, + "learning_rate": 3.1332874617304243e-06, + "loss": 1.022, + "step": 4560 + }, + { + "epoch": 0.32906460805887233, + "grad_norm": 2.9904933346392, + "learning_rate": 3.132902343639748e-06, + "loss": 0.8146, + "step": 4561 + }, + { + "epoch": 0.32913675552829985, + "grad_norm": 3.125818069191579, + "learning_rate": 3.1325171636854696e-06, + "loss": 0.9874, + "step": 4562 + }, + { + "epoch": 0.32920890299772737, + "grad_norm": 2.3321411515861623, + "learning_rate": 3.1321319218886207e-06, + "loss": 0.9924, + "step": 4563 + }, + { + "epoch": 0.3292810504671549, + "grad_norm": 0.7591852555440026, + "learning_rate": 3.131746618270239e-06, + "loss": 0.8346, + "step": 4564 + }, + { + "epoch": 0.32935319793658235, + "grad_norm": 0.7265081487856296, + "learning_rate": 3.1313612528513637e-06, + "loss": 0.8045, + "step": 4565 + }, + { + "epoch": 0.3294253454060099, + "grad_norm": 2.0002057446510495, + "learning_rate": 3.1309758256530387e-06, + "loss": 0.7577, + "step": 4566 + }, + { + "epoch": 0.3294974928754374, + "grad_norm": 3.1543919599155372, + "learning_rate": 3.1305903366963105e-06, + "loss": 0.906, + "step": 4567 + }, + { + "epoch": 0.3295696403448649, + "grad_norm": 4.414767210753754, + "learning_rate": 3.1302047860022293e-06, + "loss": 0.8987, + "step": 4568 + }, + { + "epoch": 0.32964178781429243, + "grad_norm": 1.8778297687357983, + "learning_rate": 3.1298191735918487e-06, + "loss": 0.9666, + "step": 4569 + }, + { + "epoch": 0.32971393528371995, + "grad_norm": 1.9414497846484842, + "learning_rate": 3.1294334994862254e-06, + "loss": 0.9224, + "step": 4570 + }, + { + "epoch": 0.3297860827531474, + "grad_norm": 3.0684223132170927, + "learning_rate": 3.1290477637064197e-06, + "loss": 0.9136, + "step": 4571 + }, + { + "epoch": 0.32985823022257493, + "grad_norm": 1.8511795098525032, + "learning_rate": 3.1286619662734956e-06, + "loss": 0.8952, + "step": 4572 + }, + { + "epoch": 0.32993037769200245, + "grad_norm": 2.7007691806548912, + "learning_rate": 3.1282761072085188e-06, + "loss": 0.9358, + "step": 4573 + }, + { + "epoch": 0.33000252516142997, + "grad_norm": 0.8779956763124127, + "learning_rate": 3.1278901865325613e-06, + "loss": 0.7816, + "step": 4574 + }, + { + "epoch": 0.3300746726308575, + "grad_norm": 2.2865505561970245, + "learning_rate": 3.127504204266696e-06, + "loss": 0.8703, + "step": 4575 + }, + { + "epoch": 0.330146820100285, + "grad_norm": 2.596293661368454, + "learning_rate": 3.1271181604319994e-06, + "loss": 0.9869, + "step": 4576 + }, + { + "epoch": 0.3302189675697125, + "grad_norm": 2.9560014551698566, + "learning_rate": 3.1267320550495527e-06, + "loss": 0.9497, + "step": 4577 + }, + { + "epoch": 0.33029111503914, + "grad_norm": 2.5917472920853015, + "learning_rate": 3.12634588814044e-06, + "loss": 1.0652, + "step": 4578 + }, + { + "epoch": 0.3303632625085675, + "grad_norm": 2.605454932409247, + "learning_rate": 3.1259596597257477e-06, + "loss": 0.8943, + "step": 4579 + }, + { + "epoch": 0.33043540997799503, + "grad_norm": 0.7318530505999304, + "learning_rate": 3.125573369826566e-06, + "loss": 0.7557, + "step": 4580 + }, + { + "epoch": 0.33050755744742255, + "grad_norm": 2.2899279732183886, + "learning_rate": 3.12518701846399e-06, + "loss": 0.8138, + "step": 4581 + }, + { + "epoch": 0.33057970491685, + "grad_norm": 4.82843826413434, + "learning_rate": 3.1248006056591163e-06, + "loss": 1.0071, + "step": 4582 + }, + { + "epoch": 0.33065185238627753, + "grad_norm": 2.227486616066606, + "learning_rate": 3.1244141314330445e-06, + "loss": 1.0051, + "step": 4583 + }, + { + "epoch": 0.33072399985570505, + "grad_norm": 2.2558354892539563, + "learning_rate": 3.1240275958068805e-06, + "loss": 1.0158, + "step": 4584 + }, + { + "epoch": 0.3307961473251326, + "grad_norm": 2.7238723906652935, + "learning_rate": 3.12364099880173e-06, + "loss": 0.9686, + "step": 4585 + }, + { + "epoch": 0.3308682947945601, + "grad_norm": 0.781823176885681, + "learning_rate": 3.123254340438704e-06, + "loss": 0.8267, + "step": 4586 + }, + { + "epoch": 0.3309404422639876, + "grad_norm": 0.7719116958976463, + "learning_rate": 3.122867620738917e-06, + "loss": 0.8587, + "step": 4587 + }, + { + "epoch": 0.3310125897334151, + "grad_norm": 3.2818010140024962, + "learning_rate": 3.122480839723486e-06, + "loss": 0.9528, + "step": 4588 + }, + { + "epoch": 0.3310847372028426, + "grad_norm": 2.972716883518453, + "learning_rate": 3.1220939974135307e-06, + "loss": 0.9361, + "step": 4589 + }, + { + "epoch": 0.3311568846722701, + "grad_norm": 0.7981765440067297, + "learning_rate": 3.1217070938301765e-06, + "loss": 0.8282, + "step": 4590 + }, + { + "epoch": 0.33122903214169763, + "grad_norm": 2.4620096444592576, + "learning_rate": 3.1213201289945508e-06, + "loss": 0.9523, + "step": 4591 + }, + { + "epoch": 0.33130117961112515, + "grad_norm": 1.9747911439389143, + "learning_rate": 3.1209331029277836e-06, + "loss": 0.9765, + "step": 4592 + }, + { + "epoch": 0.33137332708055267, + "grad_norm": 2.7381310523792797, + "learning_rate": 3.1205460156510094e-06, + "loss": 0.9403, + "step": 4593 + }, + { + "epoch": 0.33144547454998013, + "grad_norm": 2.9743697131888065, + "learning_rate": 3.120158867185365e-06, + "loss": 1.029, + "step": 4594 + }, + { + "epoch": 0.33151762201940765, + "grad_norm": 2.480826861144894, + "learning_rate": 3.1197716575519916e-06, + "loss": 0.8474, + "step": 4595 + }, + { + "epoch": 0.3315897694888352, + "grad_norm": 2.0716226139228646, + "learning_rate": 3.119384386772033e-06, + "loss": 0.9707, + "step": 4596 + }, + { + "epoch": 0.3316619169582627, + "grad_norm": 3.1114824523774254, + "learning_rate": 3.118997054866637e-06, + "loss": 0.9149, + "step": 4597 + }, + { + "epoch": 0.3317340644276902, + "grad_norm": 3.163892501220189, + "learning_rate": 3.1186096618569545e-06, + "loss": 0.9916, + "step": 4598 + }, + { + "epoch": 0.33180621189711773, + "grad_norm": 2.9422669386306843, + "learning_rate": 3.118222207764139e-06, + "loss": 0.9342, + "step": 4599 + }, + { + "epoch": 0.3318783593665452, + "grad_norm": 2.283824460841558, + "learning_rate": 3.117834692609349e-06, + "loss": 0.9391, + "step": 4600 + }, + { + "epoch": 0.3319505068359727, + "grad_norm": 1.9410610083888022, + "learning_rate": 3.1174471164137436e-06, + "loss": 1.0218, + "step": 4601 + }, + { + "epoch": 0.33202265430540023, + "grad_norm": 4.903059389685605, + "learning_rate": 3.117059479198488e-06, + "loss": 1.0118, + "step": 4602 + }, + { + "epoch": 0.33209480177482775, + "grad_norm": 3.168459251051484, + "learning_rate": 3.1166717809847498e-06, + "loss": 0.8569, + "step": 4603 + }, + { + "epoch": 0.33216694924425527, + "grad_norm": 2.2586841418698578, + "learning_rate": 3.1162840217936996e-06, + "loss": 1.0028, + "step": 4604 + }, + { + "epoch": 0.3322390967136828, + "grad_norm": 2.213267793355282, + "learning_rate": 3.1158962016465112e-06, + "loss": 0.8414, + "step": 4605 + }, + { + "epoch": 0.33231124418311025, + "grad_norm": 5.329365784377751, + "learning_rate": 3.1155083205643617e-06, + "loss": 0.996, + "step": 4606 + }, + { + "epoch": 0.3323833916525378, + "grad_norm": 7.759194856934633, + "learning_rate": 3.115120378568432e-06, + "loss": 0.9623, + "step": 4607 + }, + { + "epoch": 0.3324555391219653, + "grad_norm": 2.0514828598939054, + "learning_rate": 3.1147323756799072e-06, + "loss": 0.9782, + "step": 4608 + }, + { + "epoch": 0.3325276865913928, + "grad_norm": 3.3233175165481703, + "learning_rate": 3.114344311919974e-06, + "loss": 1.0587, + "step": 4609 + }, + { + "epoch": 0.33259983406082033, + "grad_norm": 3.8484757068764845, + "learning_rate": 3.1139561873098224e-06, + "loss": 0.9944, + "step": 4610 + }, + { + "epoch": 0.33267198153024785, + "grad_norm": 2.2236082589155473, + "learning_rate": 3.1135680018706474e-06, + "loss": 0.9537, + "step": 4611 + }, + { + "epoch": 0.3327441289996753, + "grad_norm": 4.361557062327004, + "learning_rate": 3.1131797556236467e-06, + "loss": 0.9066, + "step": 4612 + }, + { + "epoch": 0.33281627646910283, + "grad_norm": 2.548856748222902, + "learning_rate": 3.11279144859002e-06, + "loss": 0.8822, + "step": 4613 + }, + { + "epoch": 0.33288842393853035, + "grad_norm": 4.6250610347535215, + "learning_rate": 3.1124030807909714e-06, + "loss": 0.9413, + "step": 4614 + }, + { + "epoch": 0.3329605714079579, + "grad_norm": 2.6306162200005403, + "learning_rate": 3.1120146522477087e-06, + "loss": 0.9159, + "step": 4615 + }, + { + "epoch": 0.3330327188773854, + "grad_norm": 2.5105660316278913, + "learning_rate": 3.1116261629814425e-06, + "loss": 0.9483, + "step": 4616 + }, + { + "epoch": 0.3331048663468129, + "grad_norm": 2.879211658028724, + "learning_rate": 3.1112376130133864e-06, + "loss": 0.9154, + "step": 4617 + }, + { + "epoch": 0.3331770138162404, + "grad_norm": 2.727823222839907, + "learning_rate": 3.110849002364758e-06, + "loss": 0.8295, + "step": 4618 + }, + { + "epoch": 0.3332491612856679, + "grad_norm": 1.730041520055965, + "learning_rate": 3.110460331056778e-06, + "loss": 0.9419, + "step": 4619 + }, + { + "epoch": 0.3333213087550954, + "grad_norm": 2.5233506698550365, + "learning_rate": 3.110071599110671e-06, + "loss": 1.0521, + "step": 4620 + }, + { + "epoch": 0.33339345622452293, + "grad_norm": 2.9561303402835803, + "learning_rate": 3.1096828065476616e-06, + "loss": 0.9508, + "step": 4621 + }, + { + "epoch": 0.33346560369395045, + "grad_norm": 2.6966323937908987, + "learning_rate": 3.109293953388983e-06, + "loss": 1.0034, + "step": 4622 + }, + { + "epoch": 0.33353775116337797, + "grad_norm": 2.7040789029104317, + "learning_rate": 3.108905039655868e-06, + "loss": 0.903, + "step": 4623 + }, + { + "epoch": 0.33360989863280543, + "grad_norm": 3.003626538809969, + "learning_rate": 3.108516065369553e-06, + "loss": 0.9085, + "step": 4624 + }, + { + "epoch": 0.33368204610223295, + "grad_norm": 0.786692127735749, + "learning_rate": 3.10812703055128e-06, + "loss": 0.818, + "step": 4625 + }, + { + "epoch": 0.3337541935716605, + "grad_norm": 2.0326666953409123, + "learning_rate": 3.1077379352222923e-06, + "loss": 0.8824, + "step": 4626 + }, + { + "epoch": 0.333826341041088, + "grad_norm": 3.251565996101256, + "learning_rate": 3.1073487794038364e-06, + "loss": 0.9684, + "step": 4627 + }, + { + "epoch": 0.3338984885105155, + "grad_norm": 2.181574616419717, + "learning_rate": 3.1069595631171625e-06, + "loss": 0.9132, + "step": 4628 + }, + { + "epoch": 0.333970635979943, + "grad_norm": 3.3308048831226356, + "learning_rate": 3.106570286383525e-06, + "loss": 1.0479, + "step": 4629 + }, + { + "epoch": 0.3340427834493705, + "grad_norm": 2.820275811038628, + "learning_rate": 3.1061809492241802e-06, + "loss": 0.8678, + "step": 4630 + }, + { + "epoch": 0.334114930918798, + "grad_norm": 3.578250932767852, + "learning_rate": 3.1057915516603896e-06, + "loss": 0.9444, + "step": 4631 + }, + { + "epoch": 0.33418707838822553, + "grad_norm": 2.720571499698757, + "learning_rate": 3.1054020937134154e-06, + "loss": 0.8997, + "step": 4632 + }, + { + "epoch": 0.33425922585765305, + "grad_norm": 2.502751838595765, + "learning_rate": 3.105012575404524e-06, + "loss": 0.9016, + "step": 4633 + }, + { + "epoch": 0.33433137332708057, + "grad_norm": 3.864143792148388, + "learning_rate": 3.104622996754988e-06, + "loss": 0.9427, + "step": 4634 + }, + { + "epoch": 0.33440352079650804, + "grad_norm": 1.866203527670402, + "learning_rate": 3.104233357786079e-06, + "loss": 0.9212, + "step": 4635 + }, + { + "epoch": 0.33447566826593556, + "grad_norm": 3.757800318327449, + "learning_rate": 3.1038436585190732e-06, + "loss": 0.9085, + "step": 4636 + }, + { + "epoch": 0.3345478157353631, + "grad_norm": 3.0030322645082, + "learning_rate": 3.1034538989752524e-06, + "loss": 0.9152, + "step": 4637 + }, + { + "epoch": 0.3346199632047906, + "grad_norm": 2.860994807804307, + "learning_rate": 3.1030640791758985e-06, + "loss": 0.9086, + "step": 4638 + }, + { + "epoch": 0.3346921106742181, + "grad_norm": 11.56515620883547, + "learning_rate": 3.1026741991422986e-06, + "loss": 0.8375, + "step": 4639 + }, + { + "epoch": 0.33476425814364563, + "grad_norm": 2.882930598127219, + "learning_rate": 3.102284258895742e-06, + "loss": 0.9485, + "step": 4640 + }, + { + "epoch": 0.3348364056130731, + "grad_norm": 3.2373731787155156, + "learning_rate": 3.101894258457524e-06, + "loss": 0.8295, + "step": 4641 + }, + { + "epoch": 0.3349085530825006, + "grad_norm": 3.083256041786648, + "learning_rate": 3.101504197848939e-06, + "loss": 1.0007, + "step": 4642 + }, + { + "epoch": 0.33498070055192813, + "grad_norm": 4.311429982884386, + "learning_rate": 3.1011140770912873e-06, + "loss": 0.8578, + "step": 4643 + }, + { + "epoch": 0.33505284802135565, + "grad_norm": 2.8873149994213985, + "learning_rate": 3.100723896205872e-06, + "loss": 0.9008, + "step": 4644 + }, + { + "epoch": 0.3351249954907832, + "grad_norm": 2.340889265678473, + "learning_rate": 3.1003336552139995e-06, + "loss": 0.979, + "step": 4645 + }, + { + "epoch": 0.3351971429602107, + "grad_norm": 2.9553880850432033, + "learning_rate": 3.0999433541369793e-06, + "loss": 0.8309, + "step": 4646 + }, + { + "epoch": 0.33526929042963816, + "grad_norm": 2.73914744652888, + "learning_rate": 3.0995529929961244e-06, + "loss": 0.8223, + "step": 4647 + }, + { + "epoch": 0.3353414378990657, + "grad_norm": 2.7795675096350685, + "learning_rate": 3.0991625718127516e-06, + "loss": 0.9881, + "step": 4648 + }, + { + "epoch": 0.3354135853684932, + "grad_norm": 2.4434485598356654, + "learning_rate": 3.098772090608179e-06, + "loss": 0.8924, + "step": 4649 + }, + { + "epoch": 0.3354857328379207, + "grad_norm": 2.886593042922188, + "learning_rate": 3.0983815494037304e-06, + "loss": 0.9054, + "step": 4650 + }, + { + "epoch": 0.33555788030734823, + "grad_norm": 2.7902259430269925, + "learning_rate": 3.0979909482207304e-06, + "loss": 0.9453, + "step": 4651 + }, + { + "epoch": 0.33563002777677575, + "grad_norm": 2.4943623873612313, + "learning_rate": 3.09760028708051e-06, + "loss": 0.9726, + "step": 4652 + }, + { + "epoch": 0.3357021752462032, + "grad_norm": 2.4678270570389182, + "learning_rate": 3.0972095660044015e-06, + "loss": 0.8699, + "step": 4653 + }, + { + "epoch": 0.33577432271563074, + "grad_norm": 2.1688580312948353, + "learning_rate": 3.096818785013741e-06, + "loss": 0.9317, + "step": 4654 + }, + { + "epoch": 0.33584647018505825, + "grad_norm": 3.3382648863102333, + "learning_rate": 3.096427944129866e-06, + "loss": 0.9237, + "step": 4655 + }, + { + "epoch": 0.3359186176544858, + "grad_norm": 2.0218882871306185, + "learning_rate": 3.0960370433741195e-06, + "loss": 0.9772, + "step": 4656 + }, + { + "epoch": 0.3359907651239133, + "grad_norm": 0.7922887658397029, + "learning_rate": 3.095646082767848e-06, + "loss": 0.8802, + "step": 4657 + }, + { + "epoch": 0.3360629125933408, + "grad_norm": 3.043694033881725, + "learning_rate": 3.0952550623324e-06, + "loss": 0.9127, + "step": 4658 + }, + { + "epoch": 0.3361350600627683, + "grad_norm": 4.013928006578118, + "learning_rate": 3.0948639820891267e-06, + "loss": 0.8659, + "step": 4659 + }, + { + "epoch": 0.3362072075321958, + "grad_norm": 2.8619848129221395, + "learning_rate": 3.094472842059385e-06, + "loss": 1.0126, + "step": 4660 + }, + { + "epoch": 0.3362793550016233, + "grad_norm": 2.3678567777443105, + "learning_rate": 3.094081642264533e-06, + "loss": 1.0072, + "step": 4661 + }, + { + "epoch": 0.33635150247105083, + "grad_norm": 0.7800281124673129, + "learning_rate": 3.093690382725933e-06, + "loss": 0.808, + "step": 4662 + }, + { + "epoch": 0.33642364994047835, + "grad_norm": 2.0443465362591886, + "learning_rate": 3.093299063464949e-06, + "loss": 0.7988, + "step": 4663 + }, + { + "epoch": 0.3364957974099059, + "grad_norm": 2.922791525966273, + "learning_rate": 3.0929076845029512e-06, + "loss": 0.949, + "step": 4664 + }, + { + "epoch": 0.33656794487933334, + "grad_norm": 2.9950251656877653, + "learning_rate": 3.09251624586131e-06, + "loss": 0.9357, + "step": 4665 + }, + { + "epoch": 0.33664009234876086, + "grad_norm": 2.960902875632401, + "learning_rate": 3.0921247475614013e-06, + "loss": 0.8959, + "step": 4666 + }, + { + "epoch": 0.3367122398181884, + "grad_norm": 3.405439647909891, + "learning_rate": 3.091733189624603e-06, + "loss": 1.0169, + "step": 4667 + }, + { + "epoch": 0.3367843872876159, + "grad_norm": 2.8220191082324377, + "learning_rate": 3.0913415720722968e-06, + "loss": 0.9961, + "step": 4668 + }, + { + "epoch": 0.3368565347570434, + "grad_norm": 2.30752419935372, + "learning_rate": 3.0909498949258666e-06, + "loss": 1.0067, + "step": 4669 + }, + { + "epoch": 0.33692868222647093, + "grad_norm": 3.480508069844886, + "learning_rate": 3.0905581582067025e-06, + "loss": 0.8611, + "step": 4670 + }, + { + "epoch": 0.3370008296958984, + "grad_norm": 3.5684284606493057, + "learning_rate": 3.0901663619361937e-06, + "loss": 0.9016, + "step": 4671 + }, + { + "epoch": 0.3370729771653259, + "grad_norm": 4.483616356029111, + "learning_rate": 3.089774506135735e-06, + "loss": 0.8995, + "step": 4672 + }, + { + "epoch": 0.33714512463475343, + "grad_norm": 1.9342896721528342, + "learning_rate": 3.0893825908267255e-06, + "loss": 0.9191, + "step": 4673 + }, + { + "epoch": 0.33721727210418095, + "grad_norm": 2.144000767757506, + "learning_rate": 3.0889906160305654e-06, + "loss": 0.8956, + "step": 4674 + }, + { + "epoch": 0.3372894195736085, + "grad_norm": 2.3917792033480767, + "learning_rate": 3.088598581768659e-06, + "loss": 0.9867, + "step": 4675 + }, + { + "epoch": 0.337361567043036, + "grad_norm": 3.6887436402181044, + "learning_rate": 3.088206488062414e-06, + "loss": 0.8961, + "step": 4676 + }, + { + "epoch": 0.33743371451246346, + "grad_norm": 2.2296112068880114, + "learning_rate": 3.087814334933241e-06, + "loss": 0.9501, + "step": 4677 + }, + { + "epoch": 0.337505861981891, + "grad_norm": 2.155881214394103, + "learning_rate": 3.087422122402554e-06, + "loss": 0.8555, + "step": 4678 + }, + { + "epoch": 0.3375780094513185, + "grad_norm": 2.1088733217803517, + "learning_rate": 3.0870298504917703e-06, + "loss": 0.9823, + "step": 4679 + }, + { + "epoch": 0.337650156920746, + "grad_norm": 2.1437957002534196, + "learning_rate": 3.086637519222311e-06, + "loss": 0.7956, + "step": 4680 + }, + { + "epoch": 0.33772230439017353, + "grad_norm": 3.0765829338452955, + "learning_rate": 3.086245128615599e-06, + "loss": 0.9241, + "step": 4681 + }, + { + "epoch": 0.337794451859601, + "grad_norm": 2.8110024597906302, + "learning_rate": 3.0858526786930615e-06, + "loss": 1.0351, + "step": 4682 + }, + { + "epoch": 0.3378665993290285, + "grad_norm": 3.2559974792152038, + "learning_rate": 3.0854601694761293e-06, + "loss": 0.9117, + "step": 4683 + }, + { + "epoch": 0.33793874679845604, + "grad_norm": 2.340260260038784, + "learning_rate": 3.0850676009862353e-06, + "loss": 0.8243, + "step": 4684 + }, + { + "epoch": 0.33801089426788355, + "grad_norm": 2.4770202213735573, + "learning_rate": 3.0846749732448163e-06, + "loss": 0.9748, + "step": 4685 + }, + { + "epoch": 0.3380830417373111, + "grad_norm": 3.323921378581878, + "learning_rate": 3.0842822862733123e-06, + "loss": 0.9301, + "step": 4686 + }, + { + "epoch": 0.3381551892067386, + "grad_norm": 1.941378126957712, + "learning_rate": 3.0838895400931664e-06, + "loss": 0.885, + "step": 4687 + }, + { + "epoch": 0.33822733667616606, + "grad_norm": 0.8968892564869948, + "learning_rate": 3.083496734725826e-06, + "loss": 0.8493, + "step": 4688 + }, + { + "epoch": 0.3382994841455936, + "grad_norm": 5.415059051614368, + "learning_rate": 3.0831038701927394e-06, + "loss": 0.8593, + "step": 4689 + }, + { + "epoch": 0.3383716316150211, + "grad_norm": 2.607106667657499, + "learning_rate": 3.0827109465153597e-06, + "loss": 0.8337, + "step": 4690 + }, + { + "epoch": 0.3384437790844486, + "grad_norm": 2.3967163670055056, + "learning_rate": 3.0823179637151437e-06, + "loss": 0.9912, + "step": 4691 + }, + { + "epoch": 0.33851592655387613, + "grad_norm": 2.5638336804820856, + "learning_rate": 3.0819249218135495e-06, + "loss": 0.9072, + "step": 4692 + }, + { + "epoch": 0.33858807402330365, + "grad_norm": 1.8190156303518548, + "learning_rate": 3.081531820832041e-06, + "loss": 0.9887, + "step": 4693 + }, + { + "epoch": 0.3386602214927311, + "grad_norm": 2.39281587626127, + "learning_rate": 3.081138660792084e-06, + "loss": 1.0137, + "step": 4694 + }, + { + "epoch": 0.33873236896215864, + "grad_norm": 0.7545928200670079, + "learning_rate": 3.080745441715146e-06, + "loss": 0.8397, + "step": 4695 + }, + { + "epoch": 0.33880451643158616, + "grad_norm": 3.1890246167098564, + "learning_rate": 3.0803521636227005e-06, + "loss": 0.9325, + "step": 4696 + }, + { + "epoch": 0.3388766639010137, + "grad_norm": 3.8647728365408445, + "learning_rate": 3.0799588265362228e-06, + "loss": 0.9199, + "step": 4697 + }, + { + "epoch": 0.3389488113704412, + "grad_norm": 3.1884391747425127, + "learning_rate": 3.079565430477192e-06, + "loss": 0.9327, + "step": 4698 + }, + { + "epoch": 0.3390209588398687, + "grad_norm": 3.926983306717174, + "learning_rate": 3.079171975467089e-06, + "loss": 0.9844, + "step": 4699 + }, + { + "epoch": 0.3390931063092962, + "grad_norm": 2.5280377763146897, + "learning_rate": 3.078778461527399e-06, + "loss": 0.9003, + "step": 4700 + }, + { + "epoch": 0.3391652537787237, + "grad_norm": 0.8459847490117391, + "learning_rate": 3.078384888679611e-06, + "loss": 0.8122, + "step": 4701 + }, + { + "epoch": 0.3392374012481512, + "grad_norm": 0.7796999622640959, + "learning_rate": 3.0779912569452164e-06, + "loss": 0.846, + "step": 4702 + }, + { + "epoch": 0.33930954871757874, + "grad_norm": 3.7951333403752794, + "learning_rate": 3.0775975663457096e-06, + "loss": 0.8522, + "step": 4703 + }, + { + "epoch": 0.33938169618700625, + "grad_norm": 2.672451737827201, + "learning_rate": 3.077203816902589e-06, + "loss": 0.8705, + "step": 4704 + }, + { + "epoch": 0.3394538436564338, + "grad_norm": 3.4936538109819595, + "learning_rate": 3.0768100086373557e-06, + "loss": 1.01, + "step": 4705 + }, + { + "epoch": 0.33952599112586124, + "grad_norm": 2.292597518061506, + "learning_rate": 3.0764161415715133e-06, + "loss": 0.9926, + "step": 4706 + }, + { + "epoch": 0.33959813859528876, + "grad_norm": 3.338724370391745, + "learning_rate": 3.0760222157265705e-06, + "loss": 0.8989, + "step": 4707 + }, + { + "epoch": 0.3396702860647163, + "grad_norm": 2.462333452818166, + "learning_rate": 3.075628231124039e-06, + "loss": 1.035, + "step": 4708 + }, + { + "epoch": 0.3397424335341438, + "grad_norm": 2.3887520228466688, + "learning_rate": 3.0752341877854306e-06, + "loss": 1.0553, + "step": 4709 + }, + { + "epoch": 0.3398145810035713, + "grad_norm": 3.0677835469289723, + "learning_rate": 3.074840085732264e-06, + "loss": 0.8741, + "step": 4710 + }, + { + "epoch": 0.33988672847299883, + "grad_norm": 2.671646822282368, + "learning_rate": 3.074445924986059e-06, + "loss": 0.897, + "step": 4711 + }, + { + "epoch": 0.3399588759424263, + "grad_norm": 1.7436950045974862, + "learning_rate": 3.074051705568339e-06, + "loss": 1.0078, + "step": 4712 + }, + { + "epoch": 0.3400310234118538, + "grad_norm": 2.631144280926225, + "learning_rate": 3.0736574275006318e-06, + "loss": 1.0738, + "step": 4713 + }, + { + "epoch": 0.34010317088128134, + "grad_norm": 2.9868449430675543, + "learning_rate": 3.073263090804468e-06, + "loss": 0.9671, + "step": 4714 + }, + { + "epoch": 0.34017531835070886, + "grad_norm": 4.59283583491395, + "learning_rate": 3.072868695501378e-06, + "loss": 0.9648, + "step": 4715 + }, + { + "epoch": 0.3402474658201364, + "grad_norm": 2.8323167959494198, + "learning_rate": 3.0724742416129015e-06, + "loss": 0.951, + "step": 4716 + }, + { + "epoch": 0.3403196132895639, + "grad_norm": 0.8821178167417435, + "learning_rate": 3.0720797291605757e-06, + "loss": 0.8871, + "step": 4717 + }, + { + "epoch": 0.34039176075899136, + "grad_norm": 2.5120349168901965, + "learning_rate": 3.0716851581659455e-06, + "loss": 0.9859, + "step": 4718 + }, + { + "epoch": 0.3404639082284189, + "grad_norm": 3.1547715151795512, + "learning_rate": 3.0712905286505565e-06, + "loss": 0.8588, + "step": 4719 + }, + { + "epoch": 0.3405360556978464, + "grad_norm": 2.4161348305778074, + "learning_rate": 3.0708958406359564e-06, + "loss": 1.0496, + "step": 4720 + }, + { + "epoch": 0.3406082031672739, + "grad_norm": 0.7358339305657814, + "learning_rate": 3.070501094143699e-06, + "loss": 0.8315, + "step": 4721 + }, + { + "epoch": 0.34068035063670143, + "grad_norm": 3.283786338369848, + "learning_rate": 3.0701062891953405e-06, + "loss": 0.9834, + "step": 4722 + }, + { + "epoch": 0.34075249810612895, + "grad_norm": 0.7294328657795187, + "learning_rate": 3.0697114258124383e-06, + "loss": 0.77, + "step": 4723 + }, + { + "epoch": 0.3408246455755564, + "grad_norm": 2.53958603376621, + "learning_rate": 3.069316504016555e-06, + "loss": 0.9351, + "step": 4724 + }, + { + "epoch": 0.34089679304498394, + "grad_norm": 2.7892894492256053, + "learning_rate": 3.068921523829257e-06, + "loss": 1.0134, + "step": 4725 + }, + { + "epoch": 0.34096894051441146, + "grad_norm": 2.255972881681715, + "learning_rate": 3.0685264852721104e-06, + "loss": 0.9838, + "step": 4726 + }, + { + "epoch": 0.341041087983839, + "grad_norm": 2.610912760737775, + "learning_rate": 3.068131388366688e-06, + "loss": 0.9037, + "step": 4727 + }, + { + "epoch": 0.3411132354532665, + "grad_norm": 6.703787390785852, + "learning_rate": 3.067736233134565e-06, + "loss": 0.9334, + "step": 4728 + }, + { + "epoch": 0.341185382922694, + "grad_norm": 2.119137812318122, + "learning_rate": 3.067341019597318e-06, + "loss": 0.8887, + "step": 4729 + }, + { + "epoch": 0.3412575303921215, + "grad_norm": 2.7299130028357603, + "learning_rate": 3.0669457477765304e-06, + "loss": 0.8977, + "step": 4730 + }, + { + "epoch": 0.341329677861549, + "grad_norm": 2.187912493369833, + "learning_rate": 3.066550417693785e-06, + "loss": 0.8978, + "step": 4731 + }, + { + "epoch": 0.3414018253309765, + "grad_norm": 0.9252979610580198, + "learning_rate": 3.0661550293706686e-06, + "loss": 0.8514, + "step": 4732 + }, + { + "epoch": 0.34147397280040404, + "grad_norm": 2.0328347699692295, + "learning_rate": 3.0657595828287735e-06, + "loss": 1.0118, + "step": 4733 + }, + { + "epoch": 0.34154612026983155, + "grad_norm": 3.683661968938492, + "learning_rate": 3.065364078089693e-06, + "loss": 0.9549, + "step": 4734 + }, + { + "epoch": 0.341618267739259, + "grad_norm": 2.402574779287496, + "learning_rate": 3.064968515175024e-06, + "loss": 0.7979, + "step": 4735 + }, + { + "epoch": 0.34169041520868654, + "grad_norm": 3.4230205412551196, + "learning_rate": 3.064572894106366e-06, + "loss": 0.9895, + "step": 4736 + }, + { + "epoch": 0.34176256267811406, + "grad_norm": 2.5288157119547674, + "learning_rate": 3.0641772149053244e-06, + "loss": 0.929, + "step": 4737 + }, + { + "epoch": 0.3418347101475416, + "grad_norm": 2.109982438398562, + "learning_rate": 3.063781477593504e-06, + "loss": 1.0266, + "step": 4738 + }, + { + "epoch": 0.3419068576169691, + "grad_norm": 2.5092704551648244, + "learning_rate": 3.0633856821925146e-06, + "loss": 0.9799, + "step": 4739 + }, + { + "epoch": 0.3419790050863966, + "grad_norm": 2.1655147375795707, + "learning_rate": 3.06298982872397e-06, + "loss": 0.9699, + "step": 4740 + }, + { + "epoch": 0.3420511525558241, + "grad_norm": 2.6478801129507006, + "learning_rate": 3.062593917209486e-06, + "loss": 0.9071, + "step": 4741 + }, + { + "epoch": 0.3421233000252516, + "grad_norm": 3.7829814131035864, + "learning_rate": 3.0621979476706815e-06, + "loss": 0.9082, + "step": 4742 + }, + { + "epoch": 0.3421954474946791, + "grad_norm": 4.623132689581656, + "learning_rate": 3.0618019201291795e-06, + "loss": 0.9177, + "step": 4743 + }, + { + "epoch": 0.34226759496410664, + "grad_norm": 2.1106047260009926, + "learning_rate": 3.0614058346066046e-06, + "loss": 0.9547, + "step": 4744 + }, + { + "epoch": 0.34233974243353416, + "grad_norm": 0.8145939181551513, + "learning_rate": 3.0610096911245872e-06, + "loss": 0.8045, + "step": 4745 + }, + { + "epoch": 0.3424118899029617, + "grad_norm": 3.661840615137943, + "learning_rate": 3.0606134897047586e-06, + "loss": 0.8518, + "step": 4746 + }, + { + "epoch": 0.34248403737238914, + "grad_norm": 2.5581745765294177, + "learning_rate": 3.060217230368752e-06, + "loss": 1.0117, + "step": 4747 + }, + { + "epoch": 0.34255618484181666, + "grad_norm": 2.7135271790765088, + "learning_rate": 3.0598209131382085e-06, + "loss": 1.0481, + "step": 4748 + }, + { + "epoch": 0.3426283323112442, + "grad_norm": 3.265202107037892, + "learning_rate": 3.059424538034768e-06, + "loss": 0.9474, + "step": 4749 + }, + { + "epoch": 0.3427004797806717, + "grad_norm": 2.40382292574884, + "learning_rate": 3.059028105080075e-06, + "loss": 0.9964, + "step": 4750 + }, + { + "epoch": 0.3427726272500992, + "grad_norm": 2.9916001981047358, + "learning_rate": 3.058631614295778e-06, + "loss": 0.9631, + "step": 4751 + }, + { + "epoch": 0.34284477471952673, + "grad_norm": 2.6189601076459095, + "learning_rate": 3.0582350657035276e-06, + "loss": 0.9438, + "step": 4752 + }, + { + "epoch": 0.3429169221889542, + "grad_norm": 2.720302796058941, + "learning_rate": 3.0578384593249774e-06, + "loss": 1.0207, + "step": 4753 + }, + { + "epoch": 0.3429890696583817, + "grad_norm": 3.676130709925854, + "learning_rate": 3.057441795181785e-06, + "loss": 0.8624, + "step": 4754 + }, + { + "epoch": 0.34306121712780924, + "grad_norm": 2.4755778476199177, + "learning_rate": 3.05704507329561e-06, + "loss": 0.9197, + "step": 4755 + }, + { + "epoch": 0.34313336459723676, + "grad_norm": 2.458964789375436, + "learning_rate": 3.056648293688117e-06, + "loss": 0.9462, + "step": 4756 + }, + { + "epoch": 0.3432055120666643, + "grad_norm": 2.1272913976441314, + "learning_rate": 3.056251456380973e-06, + "loss": 0.9061, + "step": 4757 + }, + { + "epoch": 0.3432776595360918, + "grad_norm": 2.387092823238978, + "learning_rate": 3.055854561395846e-06, + "loss": 0.9544, + "step": 4758 + }, + { + "epoch": 0.34334980700551926, + "grad_norm": 2.0886876411192525, + "learning_rate": 3.055457608754411e-06, + "loss": 0.9576, + "step": 4759 + }, + { + "epoch": 0.3434219544749468, + "grad_norm": 2.9851455093266437, + "learning_rate": 3.0550605984783426e-06, + "loss": 0.9334, + "step": 4760 + }, + { + "epoch": 0.3434941019443743, + "grad_norm": 3.7574128951625596, + "learning_rate": 3.054663530589321e-06, + "loss": 0.9212, + "step": 4761 + }, + { + "epoch": 0.3435662494138018, + "grad_norm": 4.018900325923985, + "learning_rate": 3.054266405109028e-06, + "loss": 0.8662, + "step": 4762 + }, + { + "epoch": 0.34363839688322934, + "grad_norm": 3.8636847151316442, + "learning_rate": 3.053869222059149e-06, + "loss": 0.9906, + "step": 4763 + }, + { + "epoch": 0.34371054435265685, + "grad_norm": 4.071431128900246, + "learning_rate": 3.0534719814613737e-06, + "loss": 0.9741, + "step": 4764 + }, + { + "epoch": 0.3437826918220843, + "grad_norm": 3.5309352228083224, + "learning_rate": 3.0530746833373933e-06, + "loss": 0.9947, + "step": 4765 + }, + { + "epoch": 0.34385483929151184, + "grad_norm": 2.932084793915158, + "learning_rate": 3.0526773277089022e-06, + "loss": 0.931, + "step": 4766 + }, + { + "epoch": 0.34392698676093936, + "grad_norm": 3.595510433075258, + "learning_rate": 3.0522799145975993e-06, + "loss": 0.9682, + "step": 4767 + }, + { + "epoch": 0.3439991342303669, + "grad_norm": 2.5143945184301377, + "learning_rate": 3.051882444025186e-06, + "loss": 1.0002, + "step": 4768 + }, + { + "epoch": 0.3440712816997944, + "grad_norm": 1.8414097821117217, + "learning_rate": 3.051484916013366e-06, + "loss": 0.886, + "step": 4769 + }, + { + "epoch": 0.3441434291692219, + "grad_norm": 3.014807872059533, + "learning_rate": 3.051087330583847e-06, + "loss": 0.872, + "step": 4770 + }, + { + "epoch": 0.3442155766386494, + "grad_norm": 5.24583487819965, + "learning_rate": 3.0506896877583404e-06, + "loss": 0.9915, + "step": 4771 + }, + { + "epoch": 0.3442877241080769, + "grad_norm": 3.3118639641250542, + "learning_rate": 3.0502919875585595e-06, + "loss": 0.9064, + "step": 4772 + }, + { + "epoch": 0.3443598715775044, + "grad_norm": 2.178871357482138, + "learning_rate": 3.0498942300062213e-06, + "loss": 0.9749, + "step": 4773 + }, + { + "epoch": 0.34443201904693194, + "grad_norm": 2.061767101118395, + "learning_rate": 3.0494964151230455e-06, + "loss": 0.9678, + "step": 4774 + }, + { + "epoch": 0.34450416651635946, + "grad_norm": 2.6209821378818945, + "learning_rate": 3.049098542930755e-06, + "loss": 0.9851, + "step": 4775 + }, + { + "epoch": 0.344576313985787, + "grad_norm": 2.7538233101982965, + "learning_rate": 3.0487006134510774e-06, + "loss": 0.9872, + "step": 4776 + }, + { + "epoch": 0.34464846145521444, + "grad_norm": 5.8095229533758435, + "learning_rate": 3.048302626705742e-06, + "loss": 0.8601, + "step": 4777 + }, + { + "epoch": 0.34472060892464196, + "grad_norm": 3.2403039815553263, + "learning_rate": 3.0479045827164797e-06, + "loss": 0.8911, + "step": 4778 + }, + { + "epoch": 0.3447927563940695, + "grad_norm": 2.4173619706080713, + "learning_rate": 3.047506481505028e-06, + "loss": 0.8621, + "step": 4779 + }, + { + "epoch": 0.344864903863497, + "grad_norm": 3.3808441426216644, + "learning_rate": 3.047108323093125e-06, + "loss": 0.9102, + "step": 4780 + }, + { + "epoch": 0.3449370513329245, + "grad_norm": 2.3934176221720134, + "learning_rate": 3.0467101075025124e-06, + "loss": 0.9361, + "step": 4781 + }, + { + "epoch": 0.34500919880235204, + "grad_norm": 2.587046866429989, + "learning_rate": 3.0463118347549354e-06, + "loss": 0.9368, + "step": 4782 + }, + { + "epoch": 0.3450813462717795, + "grad_norm": 2.299990143962143, + "learning_rate": 3.045913504872143e-06, + "loss": 0.9493, + "step": 4783 + }, + { + "epoch": 0.345153493741207, + "grad_norm": 2.9726458234767046, + "learning_rate": 3.045515117875886e-06, + "loss": 0.8706, + "step": 4784 + }, + { + "epoch": 0.34522564121063454, + "grad_norm": 3.317231164430149, + "learning_rate": 3.0451166737879186e-06, + "loss": 0.9697, + "step": 4785 + }, + { + "epoch": 0.34529778868006206, + "grad_norm": 2.818056086283279, + "learning_rate": 3.0447181726299987e-06, + "loss": 0.995, + "step": 4786 + }, + { + "epoch": 0.3453699361494896, + "grad_norm": 2.4545168481630473, + "learning_rate": 3.0443196144238866e-06, + "loss": 0.986, + "step": 4787 + }, + { + "epoch": 0.34544208361891704, + "grad_norm": 5.06908318380561, + "learning_rate": 3.0439209991913463e-06, + "loss": 0.957, + "step": 4788 + }, + { + "epoch": 0.34551423108834456, + "grad_norm": 4.160457689709896, + "learning_rate": 3.043522326954145e-06, + "loss": 0.962, + "step": 4789 + }, + { + "epoch": 0.3455863785577721, + "grad_norm": 2.451781275404541, + "learning_rate": 3.043123597734052e-06, + "loss": 1.0266, + "step": 4790 + }, + { + "epoch": 0.3456585260271996, + "grad_norm": 3.083385638918852, + "learning_rate": 3.042724811552841e-06, + "loss": 0.9994, + "step": 4791 + }, + { + "epoch": 0.3457306734966271, + "grad_norm": 2.1616704507332383, + "learning_rate": 3.0423259684322878e-06, + "loss": 0.9372, + "step": 4792 + }, + { + "epoch": 0.34580282096605464, + "grad_norm": 3.0083612907427937, + "learning_rate": 3.0419270683941723e-06, + "loss": 0.8573, + "step": 4793 + }, + { + "epoch": 0.3458749684354821, + "grad_norm": 50.04494074079817, + "learning_rate": 3.0415281114602765e-06, + "loss": 0.982, + "step": 4794 + }, + { + "epoch": 0.3459471159049096, + "grad_norm": 3.1348220582439805, + "learning_rate": 3.041129097652386e-06, + "loss": 0.9246, + "step": 4795 + }, + { + "epoch": 0.34601926337433714, + "grad_norm": 3.7743764147540273, + "learning_rate": 3.0407300269922904e-06, + "loss": 0.9316, + "step": 4796 + }, + { + "epoch": 0.34609141084376466, + "grad_norm": 3.0390651673449893, + "learning_rate": 3.04033089950178e-06, + "loss": 0.8781, + "step": 4797 + }, + { + "epoch": 0.3461635583131922, + "grad_norm": 1.9850471139308372, + "learning_rate": 3.0399317152026504e-06, + "loss": 0.9547, + "step": 4798 + }, + { + "epoch": 0.3462357057826197, + "grad_norm": 5.707441645493955, + "learning_rate": 3.0395324741166997e-06, + "loss": 1.0363, + "step": 4799 + }, + { + "epoch": 0.34630785325204716, + "grad_norm": 2.970139188491211, + "learning_rate": 3.0391331762657285e-06, + "loss": 0.9196, + "step": 4800 + }, + { + "epoch": 0.3463800007214747, + "grad_norm": 2.623890415194527, + "learning_rate": 3.038733821671542e-06, + "loss": 0.9134, + "step": 4801 + }, + { + "epoch": 0.3464521481909022, + "grad_norm": 0.7709676436525752, + "learning_rate": 3.0383344103559464e-06, + "loss": 0.8332, + "step": 4802 + }, + { + "epoch": 0.3465242956603297, + "grad_norm": 2.9711477133830306, + "learning_rate": 3.037934942340752e-06, + "loss": 1.0151, + "step": 4803 + }, + { + "epoch": 0.34659644312975724, + "grad_norm": 1.9526892824531437, + "learning_rate": 3.0375354176477733e-06, + "loss": 0.9216, + "step": 4804 + }, + { + "epoch": 0.34666859059918476, + "grad_norm": 0.8562986276559682, + "learning_rate": 3.0371358362988257e-06, + "loss": 0.8711, + "step": 4805 + }, + { + "epoch": 0.3467407380686122, + "grad_norm": 2.8210866610560252, + "learning_rate": 3.0367361983157297e-06, + "loss": 0.9512, + "step": 4806 + }, + { + "epoch": 0.34681288553803974, + "grad_norm": 2.7190203477463606, + "learning_rate": 3.0363365037203084e-06, + "loss": 0.7889, + "step": 4807 + }, + { + "epoch": 0.34688503300746726, + "grad_norm": 2.1185597832288674, + "learning_rate": 3.0359367525343862e-06, + "loss": 0.8952, + "step": 4808 + }, + { + "epoch": 0.3469571804768948, + "grad_norm": 2.3712956501252083, + "learning_rate": 3.0355369447797927e-06, + "loss": 0.9724, + "step": 4809 + }, + { + "epoch": 0.3470293279463223, + "grad_norm": 2.42150334613702, + "learning_rate": 3.0351370804783604e-06, + "loss": 0.9563, + "step": 4810 + }, + { + "epoch": 0.3471014754157498, + "grad_norm": 2.296622022334206, + "learning_rate": 3.034737159651924e-06, + "loss": 0.8661, + "step": 4811 + }, + { + "epoch": 0.3471736228851773, + "grad_norm": 3.3204742033234536, + "learning_rate": 3.034337182322322e-06, + "loss": 0.9549, + "step": 4812 + }, + { + "epoch": 0.3472457703546048, + "grad_norm": 2.7661140138872597, + "learning_rate": 3.0339371485113957e-06, + "loss": 0.9938, + "step": 4813 + }, + { + "epoch": 0.3473179178240323, + "grad_norm": 2.3962881789849826, + "learning_rate": 3.0335370582409887e-06, + "loss": 0.8196, + "step": 4814 + }, + { + "epoch": 0.34739006529345984, + "grad_norm": 2.2483933328475056, + "learning_rate": 3.0331369115329493e-06, + "loss": 0.9028, + "step": 4815 + }, + { + "epoch": 0.34746221276288736, + "grad_norm": 1.7404626767201261, + "learning_rate": 3.0327367084091275e-06, + "loss": 0.9216, + "step": 4816 + }, + { + "epoch": 0.3475343602323149, + "grad_norm": 2.6986432269319836, + "learning_rate": 3.032336448891377e-06, + "loss": 0.9756, + "step": 4817 + }, + { + "epoch": 0.34760650770174234, + "grad_norm": 2.392160857268407, + "learning_rate": 3.0319361330015554e-06, + "loss": 1.0082, + "step": 4818 + }, + { + "epoch": 0.34767865517116986, + "grad_norm": 2.561226411992893, + "learning_rate": 3.0315357607615205e-06, + "loss": 0.8738, + "step": 4819 + }, + { + "epoch": 0.3477508026405974, + "grad_norm": 1.9496830633779123, + "learning_rate": 3.0311353321931366e-06, + "loss": 1.0178, + "step": 4820 + }, + { + "epoch": 0.3478229501100249, + "grad_norm": 2.4000969032751747, + "learning_rate": 3.03073484731827e-06, + "loss": 0.994, + "step": 4821 + }, + { + "epoch": 0.3478950975794524, + "grad_norm": 4.610742191636749, + "learning_rate": 3.0303343061587884e-06, + "loss": 1.0098, + "step": 4822 + }, + { + "epoch": 0.34796724504887994, + "grad_norm": 2.15436028865332, + "learning_rate": 3.0299337087365645e-06, + "loss": 0.8894, + "step": 4823 + }, + { + "epoch": 0.3480393925183074, + "grad_norm": 2.9248280010026604, + "learning_rate": 3.0295330550734743e-06, + "loss": 0.9364, + "step": 4824 + }, + { + "epoch": 0.3481115399877349, + "grad_norm": 2.4718079761624905, + "learning_rate": 3.029132345191394e-06, + "loss": 0.9199, + "step": 4825 + }, + { + "epoch": 0.34818368745716244, + "grad_norm": 2.0891496593065595, + "learning_rate": 3.0287315791122075e-06, + "loss": 1.0119, + "step": 4826 + }, + { + "epoch": 0.34825583492658996, + "grad_norm": 2.2190320547572724, + "learning_rate": 3.0283307568577963e-06, + "loss": 0.8804, + "step": 4827 + }, + { + "epoch": 0.3483279823960175, + "grad_norm": 2.6328644789819813, + "learning_rate": 3.02792987845005e-06, + "loss": 0.921, + "step": 4828 + }, + { + "epoch": 0.348400129865445, + "grad_norm": 0.8264008333285064, + "learning_rate": 3.027528943910858e-06, + "loss": 0.899, + "step": 4829 + }, + { + "epoch": 0.34847227733487246, + "grad_norm": 2.9134440647907676, + "learning_rate": 3.027127953262114e-06, + "loss": 0.9206, + "step": 4830 + }, + { + "epoch": 0.3485444248043, + "grad_norm": 0.7384669156214985, + "learning_rate": 3.026726906525715e-06, + "loss": 0.8331, + "step": 4831 + }, + { + "epoch": 0.3486165722737275, + "grad_norm": 2.267857996765594, + "learning_rate": 3.0263258037235604e-06, + "loss": 0.9843, + "step": 4832 + }, + { + "epoch": 0.348688719743155, + "grad_norm": 1.9863014181886685, + "learning_rate": 3.025924644877553e-06, + "loss": 0.9513, + "step": 4833 + }, + { + "epoch": 0.34876086721258254, + "grad_norm": 2.3412907222825488, + "learning_rate": 3.0255234300095983e-06, + "loss": 1.0129, + "step": 4834 + }, + { + "epoch": 0.34883301468201, + "grad_norm": 2.4623916447225676, + "learning_rate": 3.025122159141606e-06, + "loss": 0.9666, + "step": 4835 + }, + { + "epoch": 0.3489051621514375, + "grad_norm": 2.8201715745878877, + "learning_rate": 3.024720832295487e-06, + "loss": 0.8854, + "step": 4836 + }, + { + "epoch": 0.34897730962086504, + "grad_norm": 1.8370894044686519, + "learning_rate": 3.0243194494931567e-06, + "loss": 1.0708, + "step": 4837 + }, + { + "epoch": 0.34904945709029256, + "grad_norm": 0.7948821336714355, + "learning_rate": 3.0239180107565337e-06, + "loss": 0.8251, + "step": 4838 + }, + { + "epoch": 0.3491216045597201, + "grad_norm": 2.929607420780569, + "learning_rate": 3.023516516107538e-06, + "loss": 0.9246, + "step": 4839 + }, + { + "epoch": 0.3491937520291476, + "grad_norm": 2.875701984452031, + "learning_rate": 3.0231149655680947e-06, + "loss": 0.9361, + "step": 4840 + }, + { + "epoch": 0.34926589949857506, + "grad_norm": 2.1710155076255413, + "learning_rate": 3.0227133591601305e-06, + "loss": 0.9435, + "step": 4841 + }, + { + "epoch": 0.3493380469680026, + "grad_norm": 3.015777900258732, + "learning_rate": 3.022311696905576e-06, + "loss": 0.8636, + "step": 4842 + }, + { + "epoch": 0.3494101944374301, + "grad_norm": 2.2356865508127717, + "learning_rate": 3.0219099788263634e-06, + "loss": 0.8783, + "step": 4843 + }, + { + "epoch": 0.3494823419068576, + "grad_norm": 1.832906745640408, + "learning_rate": 3.02150820494443e-06, + "loss": 0.9889, + "step": 4844 + }, + { + "epoch": 0.34955448937628514, + "grad_norm": 1.801031860654342, + "learning_rate": 3.0211063752817152e-06, + "loss": 0.9318, + "step": 4845 + }, + { + "epoch": 0.34962663684571266, + "grad_norm": 3.2386171100050225, + "learning_rate": 3.0207044898601613e-06, + "loss": 0.9811, + "step": 4846 + }, + { + "epoch": 0.3496987843151401, + "grad_norm": 4.220591051554615, + "learning_rate": 3.0203025487017143e-06, + "loss": 0.9732, + "step": 4847 + }, + { + "epoch": 0.34977093178456764, + "grad_norm": 2.7767500745863116, + "learning_rate": 3.0199005518283212e-06, + "loss": 0.9366, + "step": 4848 + }, + { + "epoch": 0.34984307925399516, + "grad_norm": 2.6576447359363047, + "learning_rate": 3.0194984992619346e-06, + "loss": 0.8827, + "step": 4849 + }, + { + "epoch": 0.3499152267234227, + "grad_norm": 2.1780106852455856, + "learning_rate": 3.01909639102451e-06, + "loss": 0.9326, + "step": 4850 + }, + { + "epoch": 0.3499873741928502, + "grad_norm": 0.7321206248958488, + "learning_rate": 3.0186942271380028e-06, + "loss": 0.7789, + "step": 4851 + }, + { + "epoch": 0.3500595216622777, + "grad_norm": 2.093185462568185, + "learning_rate": 3.0182920076243754e-06, + "loss": 0.9215, + "step": 4852 + }, + { + "epoch": 0.3501316691317052, + "grad_norm": 2.3352561135505554, + "learning_rate": 3.0178897325055914e-06, + "loss": 0.9053, + "step": 4853 + }, + { + "epoch": 0.3502038166011327, + "grad_norm": 2.95667630563546, + "learning_rate": 3.017487401803617e-06, + "loss": 0.9622, + "step": 4854 + }, + { + "epoch": 0.3502759640705602, + "grad_norm": 2.072423010340568, + "learning_rate": 3.017085015540422e-06, + "loss": 0.9396, + "step": 4855 + }, + { + "epoch": 0.35034811153998774, + "grad_norm": 0.7413828855155299, + "learning_rate": 3.0166825737379796e-06, + "loss": 0.8254, + "step": 4856 + }, + { + "epoch": 0.35042025900941526, + "grad_norm": 4.411324899480247, + "learning_rate": 3.0162800764182653e-06, + "loss": 0.9149, + "step": 4857 + }, + { + "epoch": 0.3504924064788428, + "grad_norm": 3.005234443291083, + "learning_rate": 3.0158775236032587e-06, + "loss": 0.8975, + "step": 4858 + }, + { + "epoch": 0.35056455394827024, + "grad_norm": 2.9679289636156807, + "learning_rate": 3.0154749153149403e-06, + "loss": 0.8619, + "step": 4859 + }, + { + "epoch": 0.35063670141769776, + "grad_norm": 2.1330795400376417, + "learning_rate": 3.0150722515752963e-06, + "loss": 0.9736, + "step": 4860 + }, + { + "epoch": 0.3507088488871253, + "grad_norm": 1.9680607512602972, + "learning_rate": 3.014669532406315e-06, + "loss": 0.9395, + "step": 4861 + }, + { + "epoch": 0.3507809963565528, + "grad_norm": 0.9133129796372963, + "learning_rate": 3.014266757829986e-06, + "loss": 0.767, + "step": 4862 + }, + { + "epoch": 0.3508531438259803, + "grad_norm": 2.590443169225737, + "learning_rate": 3.0138639278683047e-06, + "loss": 0.9248, + "step": 4863 + }, + { + "epoch": 0.35092529129540784, + "grad_norm": 2.408018081869628, + "learning_rate": 3.013461042543266e-06, + "loss": 0.9537, + "step": 4864 + }, + { + "epoch": 0.3509974387648353, + "grad_norm": 3.132757400446723, + "learning_rate": 3.013058101876873e-06, + "loss": 0.8862, + "step": 4865 + }, + { + "epoch": 0.3510695862342628, + "grad_norm": 2.362099192695, + "learning_rate": 3.0126551058911266e-06, + "loss": 1.0347, + "step": 4866 + }, + { + "epoch": 0.35114173370369034, + "grad_norm": 0.9111846549279291, + "learning_rate": 3.0122520546080336e-06, + "loss": 0.8124, + "step": 4867 + }, + { + "epoch": 0.35121388117311786, + "grad_norm": 1.4828568575528305, + "learning_rate": 3.0118489480496037e-06, + "loss": 0.9201, + "step": 4868 + }, + { + "epoch": 0.3512860286425454, + "grad_norm": 2.7469809172154616, + "learning_rate": 3.011445786237848e-06, + "loss": 0.9267, + "step": 4869 + }, + { + "epoch": 0.3513581761119729, + "grad_norm": 3.2484791938781266, + "learning_rate": 3.011042569194782e-06, + "loss": 0.9531, + "step": 4870 + }, + { + "epoch": 0.35143032358140036, + "grad_norm": 2.3164078968164468, + "learning_rate": 3.010639296942424e-06, + "loss": 0.8847, + "step": 4871 + }, + { + "epoch": 0.3515024710508279, + "grad_norm": 2.9281776708468183, + "learning_rate": 3.0102359695027953e-06, + "loss": 0.9936, + "step": 4872 + }, + { + "epoch": 0.3515746185202554, + "grad_norm": 2.2075447497902867, + "learning_rate": 3.009832586897921e-06, + "loss": 0.9546, + "step": 4873 + }, + { + "epoch": 0.3516467659896829, + "grad_norm": 3.141761232634221, + "learning_rate": 3.0094291491498267e-06, + "loss": 0.9344, + "step": 4874 + }, + { + "epoch": 0.35171891345911044, + "grad_norm": 2.4812037542919567, + "learning_rate": 3.009025656280543e-06, + "loss": 0.942, + "step": 4875 + }, + { + "epoch": 0.35179106092853796, + "grad_norm": 1.862316977705906, + "learning_rate": 3.0086221083121033e-06, + "loss": 1.0156, + "step": 4876 + }, + { + "epoch": 0.3518632083979654, + "grad_norm": 2.239872178960727, + "learning_rate": 3.008218505266545e-06, + "loss": 0.9508, + "step": 4877 + }, + { + "epoch": 0.35193535586739294, + "grad_norm": 4.475210574859138, + "learning_rate": 3.007814847165906e-06, + "loss": 0.9441, + "step": 4878 + }, + { + "epoch": 0.35200750333682046, + "grad_norm": 0.8763574901067235, + "learning_rate": 3.007411134032229e-06, + "loss": 0.8039, + "step": 4879 + }, + { + "epoch": 0.352079650806248, + "grad_norm": 2.7760086391600374, + "learning_rate": 3.0070073658875593e-06, + "loss": 0.8417, + "step": 4880 + }, + { + "epoch": 0.3521517982756755, + "grad_norm": 1.9665983643497744, + "learning_rate": 3.006603542753945e-06, + "loss": 0.9558, + "step": 4881 + }, + { + "epoch": 0.352223945745103, + "grad_norm": 2.3968232031128163, + "learning_rate": 3.0061996646534374e-06, + "loss": 0.9281, + "step": 4882 + }, + { + "epoch": 0.3522960932145305, + "grad_norm": 2.235575426856866, + "learning_rate": 3.0057957316080913e-06, + "loss": 0.88, + "step": 4883 + }, + { + "epoch": 0.352368240683958, + "grad_norm": 2.674750314628539, + "learning_rate": 3.005391743639964e-06, + "loss": 0.9297, + "step": 4884 + }, + { + "epoch": 0.3524403881533855, + "grad_norm": 2.323782948901105, + "learning_rate": 3.0049877007711145e-06, + "loss": 1.0023, + "step": 4885 + }, + { + "epoch": 0.35251253562281304, + "grad_norm": 2.031695155735984, + "learning_rate": 3.0045836030236076e-06, + "loss": 0.858, + "step": 4886 + }, + { + "epoch": 0.35258468309224056, + "grad_norm": 1.972546806698718, + "learning_rate": 3.0041794504195085e-06, + "loss": 0.882, + "step": 4887 + }, + { + "epoch": 0.352656830561668, + "grad_norm": 2.5613994561112663, + "learning_rate": 3.003775242980887e-06, + "loss": 1.0647, + "step": 4888 + }, + { + "epoch": 0.35272897803109554, + "grad_norm": 1.9319128880261105, + "learning_rate": 3.0033709807298164e-06, + "loss": 0.9007, + "step": 4889 + }, + { + "epoch": 0.35280112550052306, + "grad_norm": 1.9031239501749786, + "learning_rate": 3.00296666368837e-06, + "loss": 1.0028, + "step": 4890 + }, + { + "epoch": 0.3528732729699506, + "grad_norm": 2.2320077800751865, + "learning_rate": 3.0025622918786272e-06, + "loss": 0.8406, + "step": 4891 + }, + { + "epoch": 0.3529454204393781, + "grad_norm": 2.663698233034734, + "learning_rate": 3.002157865322669e-06, + "loss": 0.9792, + "step": 4892 + }, + { + "epoch": 0.3530175679088056, + "grad_norm": 9.23320957839972, + "learning_rate": 3.0017533840425787e-06, + "loss": 0.9041, + "step": 4893 + }, + { + "epoch": 0.3530897153782331, + "grad_norm": 2.634469979272072, + "learning_rate": 3.0013488480604457e-06, + "loss": 0.8621, + "step": 4894 + }, + { + "epoch": 0.3531618628476606, + "grad_norm": 2.7472586406183437, + "learning_rate": 3.000944257398359e-06, + "loss": 0.9779, + "step": 4895 + }, + { + "epoch": 0.3532340103170881, + "grad_norm": 0.9842859560944246, + "learning_rate": 3.0005396120784107e-06, + "loss": 0.8831, + "step": 4896 + }, + { + "epoch": 0.35330615778651564, + "grad_norm": 3.2108991857317193, + "learning_rate": 3.000134912122699e-06, + "loss": 0.9517, + "step": 4897 + }, + { + "epoch": 0.35337830525594316, + "grad_norm": 3.2521122157500293, + "learning_rate": 2.999730157553321e-06, + "loss": 0.8505, + "step": 4898 + }, + { + "epoch": 0.3534504527253707, + "grad_norm": 2.178823210842597, + "learning_rate": 2.999325348392381e-06, + "loss": 0.906, + "step": 4899 + }, + { + "epoch": 0.35352260019479814, + "grad_norm": 0.8918352771298945, + "learning_rate": 2.998920484661983e-06, + "loss": 0.8482, + "step": 4900 + }, + { + "epoch": 0.35359474766422566, + "grad_norm": 3.602141559248638, + "learning_rate": 2.998515566384235e-06, + "loss": 0.9092, + "step": 4901 + }, + { + "epoch": 0.3536668951336532, + "grad_norm": 2.520530230582452, + "learning_rate": 2.9981105935812483e-06, + "loss": 0.9202, + "step": 4902 + }, + { + "epoch": 0.3537390426030807, + "grad_norm": 2.679181554425662, + "learning_rate": 2.9977055662751372e-06, + "loss": 0.9718, + "step": 4903 + }, + { + "epoch": 0.3538111900725082, + "grad_norm": 3.3450159546339044, + "learning_rate": 2.9973004844880186e-06, + "loss": 0.8887, + "step": 4904 + }, + { + "epoch": 0.35388333754193574, + "grad_norm": 3.1174934256208244, + "learning_rate": 2.996895348242012e-06, + "loss": 0.8555, + "step": 4905 + }, + { + "epoch": 0.3539554850113632, + "grad_norm": 2.080980332691794, + "learning_rate": 2.996490157559241e-06, + "loss": 0.989, + "step": 4906 + }, + { + "epoch": 0.3540276324807907, + "grad_norm": 1.9474941414192821, + "learning_rate": 2.9960849124618315e-06, + "loss": 0.9898, + "step": 4907 + }, + { + "epoch": 0.35409977995021824, + "grad_norm": 1.9747372971630952, + "learning_rate": 2.9956796129719116e-06, + "loss": 1.012, + "step": 4908 + }, + { + "epoch": 0.35417192741964576, + "grad_norm": 2.6129142834297627, + "learning_rate": 2.995274259111615e-06, + "loss": 0.9483, + "step": 4909 + }, + { + "epoch": 0.3542440748890733, + "grad_norm": 3.598151919403546, + "learning_rate": 2.9948688509030745e-06, + "loss": 0.9111, + "step": 4910 + }, + { + "epoch": 0.3543162223585008, + "grad_norm": 2.2358482145747156, + "learning_rate": 2.9944633883684293e-06, + "loss": 0.7646, + "step": 4911 + }, + { + "epoch": 0.35438836982792826, + "grad_norm": 2.3701820694727127, + "learning_rate": 2.9940578715298204e-06, + "loss": 1.0019, + "step": 4912 + }, + { + "epoch": 0.3544605172973558, + "grad_norm": 1.918797704246536, + "learning_rate": 2.99365230040939e-06, + "loss": 0.9679, + "step": 4913 + }, + { + "epoch": 0.3545326647667833, + "grad_norm": 1.5542100336128202, + "learning_rate": 2.9932466750292866e-06, + "loss": 0.9709, + "step": 4914 + }, + { + "epoch": 0.3546048122362108, + "grad_norm": 2.1472978378191923, + "learning_rate": 2.992840995411659e-06, + "loss": 0.9226, + "step": 4915 + }, + { + "epoch": 0.35467695970563834, + "grad_norm": 0.91885067394668, + "learning_rate": 2.9924352615786597e-06, + "loss": 0.8321, + "step": 4916 + }, + { + "epoch": 0.35474910717506586, + "grad_norm": 3.3566939190007976, + "learning_rate": 2.992029473552445e-06, + "loss": 0.8905, + "step": 4917 + }, + { + "epoch": 0.3548212546444933, + "grad_norm": 1.9266922287526718, + "learning_rate": 2.991623631355173e-06, + "loss": 0.9492, + "step": 4918 + }, + { + "epoch": 0.35489340211392084, + "grad_norm": 1.8092290368114263, + "learning_rate": 2.991217735009005e-06, + "loss": 0.9175, + "step": 4919 + }, + { + "epoch": 0.35496554958334836, + "grad_norm": 2.1107259309745725, + "learning_rate": 2.9908117845361064e-06, + "loss": 0.9695, + "step": 4920 + }, + { + "epoch": 0.3550376970527759, + "grad_norm": 2.3509915187784935, + "learning_rate": 2.990405779958644e-06, + "loss": 0.98, + "step": 4921 + }, + { + "epoch": 0.3551098445222034, + "grad_norm": 2.861509599887196, + "learning_rate": 2.9899997212987882e-06, + "loss": 0.9165, + "step": 4922 + }, + { + "epoch": 0.3551819919916309, + "grad_norm": 1.7149519831883702, + "learning_rate": 2.9895936085787126e-06, + "loss": 0.8225, + "step": 4923 + }, + { + "epoch": 0.3552541394610584, + "grad_norm": 2.502501285487863, + "learning_rate": 2.9891874418205934e-06, + "loss": 1.003, + "step": 4924 + }, + { + "epoch": 0.3553262869304859, + "grad_norm": 2.280693090733844, + "learning_rate": 2.9887812210466097e-06, + "loss": 0.9519, + "step": 4925 + }, + { + "epoch": 0.3553984343999134, + "grad_norm": 2.8196547314459317, + "learning_rate": 2.988374946278943e-06, + "loss": 1.0137, + "step": 4926 + }, + { + "epoch": 0.35547058186934094, + "grad_norm": 2.1991328871449003, + "learning_rate": 2.987968617539781e-06, + "loss": 0.8381, + "step": 4927 + }, + { + "epoch": 0.35554272933876846, + "grad_norm": 1.722545248248412, + "learning_rate": 2.9875622348513096e-06, + "loss": 0.9104, + "step": 4928 + }, + { + "epoch": 0.355614876808196, + "grad_norm": 2.322560772054517, + "learning_rate": 2.9871557982357194e-06, + "loss": 0.902, + "step": 4929 + }, + { + "epoch": 0.35568702427762344, + "grad_norm": 2.0893514178032415, + "learning_rate": 2.9867493077152066e-06, + "loss": 0.926, + "step": 4930 + }, + { + "epoch": 0.35575917174705096, + "grad_norm": 1.9094586492967118, + "learning_rate": 2.9863427633119664e-06, + "loss": 1.0133, + "step": 4931 + }, + { + "epoch": 0.3558313192164785, + "grad_norm": 2.0612385678133336, + "learning_rate": 2.9859361650481994e-06, + "loss": 0.8926, + "step": 4932 + }, + { + "epoch": 0.355903466685906, + "grad_norm": 0.7836527306826448, + "learning_rate": 2.985529512946108e-06, + "loss": 0.7808, + "step": 4933 + }, + { + "epoch": 0.3559756141553335, + "grad_norm": 0.7299858169614909, + "learning_rate": 2.985122807027898e-06, + "loss": 0.7973, + "step": 4934 + }, + { + "epoch": 0.35604776162476104, + "grad_norm": 1.861980311964701, + "learning_rate": 2.984716047315779e-06, + "loss": 0.9203, + "step": 4935 + }, + { + "epoch": 0.3561199090941885, + "grad_norm": 1.692981928697949, + "learning_rate": 2.984309233831961e-06, + "loss": 0.9864, + "step": 4936 + }, + { + "epoch": 0.356192056563616, + "grad_norm": 2.30542921641887, + "learning_rate": 2.9839023665986603e-06, + "loss": 0.9331, + "step": 4937 + }, + { + "epoch": 0.35626420403304354, + "grad_norm": 1.7587552403613917, + "learning_rate": 2.9834954456380934e-06, + "loss": 0.9167, + "step": 4938 + }, + { + "epoch": 0.35633635150247106, + "grad_norm": 2.892830167372136, + "learning_rate": 2.9830884709724812e-06, + "loss": 0.9722, + "step": 4939 + }, + { + "epoch": 0.3564084989718986, + "grad_norm": 2.1543815367898023, + "learning_rate": 2.9826814426240465e-06, + "loss": 0.9605, + "step": 4940 + }, + { + "epoch": 0.35648064644132604, + "grad_norm": 3.84655064223282, + "learning_rate": 2.982274360615016e-06, + "loss": 0.8366, + "step": 4941 + }, + { + "epoch": 0.35655279391075356, + "grad_norm": 2.4566052284304964, + "learning_rate": 2.9818672249676194e-06, + "loss": 0.9151, + "step": 4942 + }, + { + "epoch": 0.3566249413801811, + "grad_norm": 7.541407656217496, + "learning_rate": 2.9814600357040882e-06, + "loss": 0.8401, + "step": 4943 + }, + { + "epoch": 0.3566970888496086, + "grad_norm": 2.3731360901439578, + "learning_rate": 2.9810527928466577e-06, + "loss": 1.0114, + "step": 4944 + }, + { + "epoch": 0.3567692363190361, + "grad_norm": 1.9304436194568364, + "learning_rate": 2.9806454964175654e-06, + "loss": 1.0012, + "step": 4945 + }, + { + "epoch": 0.35684138378846364, + "grad_norm": 2.068091125671993, + "learning_rate": 2.9802381464390536e-06, + "loss": 0.9606, + "step": 4946 + }, + { + "epoch": 0.3569135312578911, + "grad_norm": 0.7513971824995254, + "learning_rate": 2.979830742933365e-06, + "loss": 0.8033, + "step": 4947 + }, + { + "epoch": 0.3569856787273186, + "grad_norm": 2.426418546298744, + "learning_rate": 2.9794232859227466e-06, + "loss": 1.0107, + "step": 4948 + }, + { + "epoch": 0.35705782619674614, + "grad_norm": 2.324088777386122, + "learning_rate": 2.9790157754294477e-06, + "loss": 0.9856, + "step": 4949 + }, + { + "epoch": 0.35712997366617366, + "grad_norm": 2.0893428594391947, + "learning_rate": 2.9786082114757225e-06, + "loss": 0.9408, + "step": 4950 + }, + { + "epoch": 0.3572021211356012, + "grad_norm": 2.744570226742742, + "learning_rate": 2.978200594083825e-06, + "loss": 0.9009, + "step": 4951 + }, + { + "epoch": 0.3572742686050287, + "grad_norm": 2.1487939157486458, + "learning_rate": 2.977792923276014e-06, + "loss": 1.0219, + "step": 4952 + }, + { + "epoch": 0.35734641607445616, + "grad_norm": 2.2239086732226028, + "learning_rate": 2.977385199074552e-06, + "loss": 1.0114, + "step": 4953 + }, + { + "epoch": 0.3574185635438837, + "grad_norm": 2.5780532364538624, + "learning_rate": 2.9769774215017016e-06, + "loss": 1.0001, + "step": 4954 + }, + { + "epoch": 0.3574907110133112, + "grad_norm": 2.2441360566546735, + "learning_rate": 2.9765695905797314e-06, + "loss": 0.8545, + "step": 4955 + }, + { + "epoch": 0.3575628584827387, + "grad_norm": 3.1528849273115194, + "learning_rate": 2.9761617063309106e-06, + "loss": 0.9991, + "step": 4956 + }, + { + "epoch": 0.35763500595216624, + "grad_norm": 3.6725299656880885, + "learning_rate": 2.975753768777513e-06, + "loss": 1.0257, + "step": 4957 + }, + { + "epoch": 0.35770715342159376, + "grad_norm": 2.235399238208, + "learning_rate": 2.975345777941814e-06, + "loss": 0.9068, + "step": 4958 + }, + { + "epoch": 0.3577793008910212, + "grad_norm": 1.9302787955924887, + "learning_rate": 2.974937733846093e-06, + "loss": 0.9767, + "step": 4959 + }, + { + "epoch": 0.35785144836044874, + "grad_norm": 3.4291637255322684, + "learning_rate": 2.9745296365126303e-06, + "loss": 1.025, + "step": 4960 + }, + { + "epoch": 0.35792359582987626, + "grad_norm": 1.9280638872103468, + "learning_rate": 2.9741214859637134e-06, + "loss": 0.9285, + "step": 4961 + }, + { + "epoch": 0.3579957432993038, + "grad_norm": 3.8781969972085886, + "learning_rate": 2.973713282221627e-06, + "loss": 0.9955, + "step": 4962 + }, + { + "epoch": 0.3580678907687313, + "grad_norm": 3.492434226244783, + "learning_rate": 2.9733050253086634e-06, + "loss": 0.9637, + "step": 4963 + }, + { + "epoch": 0.3581400382381588, + "grad_norm": 2.3518524498103, + "learning_rate": 2.972896715247115e-06, + "loss": 0.9332, + "step": 4964 + }, + { + "epoch": 0.3582121857075863, + "grad_norm": 4.822464374511503, + "learning_rate": 2.9724883520592788e-06, + "loss": 1.017, + "step": 4965 + }, + { + "epoch": 0.3582843331770138, + "grad_norm": 2.5293328831293045, + "learning_rate": 2.9720799357674534e-06, + "loss": 0.9073, + "step": 4966 + }, + { + "epoch": 0.3583564806464413, + "grad_norm": 1.9819623442728675, + "learning_rate": 2.971671466393942e-06, + "loss": 0.9249, + "step": 4967 + }, + { + "epoch": 0.35842862811586884, + "grad_norm": 2.235800015256605, + "learning_rate": 2.971262943961048e-06, + "loss": 1.0564, + "step": 4968 + }, + { + "epoch": 0.35850077558529636, + "grad_norm": 3.74844124504815, + "learning_rate": 2.9708543684910808e-06, + "loss": 0.9656, + "step": 4969 + }, + { + "epoch": 0.3585729230547239, + "grad_norm": 2.412974090639427, + "learning_rate": 2.97044574000635e-06, + "loss": 1.0505, + "step": 4970 + }, + { + "epoch": 0.35864507052415134, + "grad_norm": 2.566344749607958, + "learning_rate": 2.97003705852917e-06, + "loss": 1.0119, + "step": 4971 + }, + { + "epoch": 0.35871721799357886, + "grad_norm": 2.5146200885841363, + "learning_rate": 2.9696283240818574e-06, + "loss": 1.0016, + "step": 4972 + }, + { + "epoch": 0.3587893654630064, + "grad_norm": 1.8524797959969836, + "learning_rate": 2.969219536686731e-06, + "loss": 0.9331, + "step": 4973 + }, + { + "epoch": 0.3588615129324339, + "grad_norm": 1.4465525488223774, + "learning_rate": 2.968810696366114e-06, + "loss": 0.9786, + "step": 4974 + }, + { + "epoch": 0.3589336604018614, + "grad_norm": 2.10897156424817, + "learning_rate": 2.9684018031423313e-06, + "loss": 0.9815, + "step": 4975 + }, + { + "epoch": 0.35900580787128894, + "grad_norm": 2.106905699862429, + "learning_rate": 2.9679928570377108e-06, + "loss": 0.9931, + "step": 4976 + }, + { + "epoch": 0.3590779553407164, + "grad_norm": 3.5067799159233166, + "learning_rate": 2.9675838580745837e-06, + "loss": 1.0179, + "step": 4977 + }, + { + "epoch": 0.3591501028101439, + "grad_norm": 1.680521629278293, + "learning_rate": 2.967174806275284e-06, + "loss": 0.9431, + "step": 4978 + }, + { + "epoch": 0.35922225027957144, + "grad_norm": 2.406764483256006, + "learning_rate": 2.9667657016621486e-06, + "loss": 0.8382, + "step": 4979 + }, + { + "epoch": 0.35929439774899896, + "grad_norm": 2.0319955484611834, + "learning_rate": 2.9663565442575175e-06, + "loss": 0.8884, + "step": 4980 + }, + { + "epoch": 0.3593665452184265, + "grad_norm": 2.0362520105247803, + "learning_rate": 2.9659473340837324e-06, + "loss": 0.9055, + "step": 4981 + }, + { + "epoch": 0.359438692687854, + "grad_norm": 7.729545673081689, + "learning_rate": 2.9655380711631393e-06, + "loss": 0.9376, + "step": 4982 + }, + { + "epoch": 0.35951084015728146, + "grad_norm": 2.362772938942406, + "learning_rate": 2.965128755518086e-06, + "loss": 0.8397, + "step": 4983 + }, + { + "epoch": 0.359582987626709, + "grad_norm": 2.932068043251027, + "learning_rate": 2.964719387170925e-06, + "loss": 0.9342, + "step": 4984 + }, + { + "epoch": 0.3596551350961365, + "grad_norm": 3.2982057895708823, + "learning_rate": 2.964309966144009e-06, + "loss": 0.9843, + "step": 4985 + }, + { + "epoch": 0.359727282565564, + "grad_norm": 2.577131507647357, + "learning_rate": 2.963900492459695e-06, + "loss": 0.9841, + "step": 4986 + }, + { + "epoch": 0.35979943003499154, + "grad_norm": 3.1933849088715403, + "learning_rate": 2.963490966140344e-06, + "loss": 0.8528, + "step": 4987 + }, + { + "epoch": 0.35987157750441906, + "grad_norm": 2.4555466477226298, + "learning_rate": 2.963081387208318e-06, + "loss": 1.0072, + "step": 4988 + }, + { + "epoch": 0.3599437249738465, + "grad_norm": 4.425700659157021, + "learning_rate": 2.962671755685983e-06, + "loss": 0.9389, + "step": 4989 + }, + { + "epoch": 0.36001587244327404, + "grad_norm": 2.068101155387718, + "learning_rate": 2.9622620715957066e-06, + "loss": 0.9727, + "step": 4990 + }, + { + "epoch": 0.36008801991270156, + "grad_norm": 2.0810136723873502, + "learning_rate": 2.9618523349598603e-06, + "loss": 0.9989, + "step": 4991 + }, + { + "epoch": 0.3601601673821291, + "grad_norm": 0.8655085419906019, + "learning_rate": 2.9614425458008195e-06, + "loss": 0.8057, + "step": 4992 + }, + { + "epoch": 0.3602323148515566, + "grad_norm": 0.7917312963774087, + "learning_rate": 2.96103270414096e-06, + "loss": 0.8849, + "step": 4993 + }, + { + "epoch": 0.36030446232098406, + "grad_norm": 1.9795524087410863, + "learning_rate": 2.960622810002662e-06, + "loss": 1.0062, + "step": 4994 + }, + { + "epoch": 0.3603766097904116, + "grad_norm": 2.3897459121734586, + "learning_rate": 2.9602128634083087e-06, + "loss": 0.9545, + "step": 4995 + }, + { + "epoch": 0.3604487572598391, + "grad_norm": 2.2061774568815724, + "learning_rate": 2.959802864380286e-06, + "loss": 0.9316, + "step": 4996 + }, + { + "epoch": 0.3605209047292666, + "grad_norm": 1.8293100412380858, + "learning_rate": 2.9593928129409814e-06, + "loss": 0.9804, + "step": 4997 + }, + { + "epoch": 0.36059305219869414, + "grad_norm": 2.015493344287676, + "learning_rate": 2.9589827091127865e-06, + "loss": 1.0357, + "step": 4998 + }, + { + "epoch": 0.36066519966812166, + "grad_norm": 2.399060991492225, + "learning_rate": 2.958572552918096e-06, + "loss": 1.0673, + "step": 4999 + }, + { + "epoch": 0.3607373471375491, + "grad_norm": 2.4834818644129295, + "learning_rate": 2.9581623443793074e-06, + "loss": 0.9377, + "step": 5000 + }, + { + "epoch": 0.36080949460697664, + "grad_norm": 2.0051352615583755, + "learning_rate": 2.9577520835188198e-06, + "loss": 0.8995, + "step": 5001 + }, + { + "epoch": 0.36088164207640416, + "grad_norm": 2.1282296041584323, + "learning_rate": 2.9573417703590362e-06, + "loss": 0.9571, + "step": 5002 + }, + { + "epoch": 0.3609537895458317, + "grad_norm": 0.7164756616477077, + "learning_rate": 2.956931404922363e-06, + "loss": 0.8658, + "step": 5003 + }, + { + "epoch": 0.3610259370152592, + "grad_norm": 2.08536318755833, + "learning_rate": 2.9565209872312074e-06, + "loss": 0.9822, + "step": 5004 + }, + { + "epoch": 0.3610980844846867, + "grad_norm": 3.9820673224881116, + "learning_rate": 2.9561105173079828e-06, + "loss": 0.986, + "step": 5005 + }, + { + "epoch": 0.3611702319541142, + "grad_norm": 2.931388340920671, + "learning_rate": 2.955699995175101e-06, + "loss": 0.8318, + "step": 5006 + }, + { + "epoch": 0.3612423794235417, + "grad_norm": 2.3349580784696005, + "learning_rate": 2.9552894208549816e-06, + "loss": 0.8703, + "step": 5007 + }, + { + "epoch": 0.3613145268929692, + "grad_norm": 1.7393171674055847, + "learning_rate": 2.9548787943700422e-06, + "loss": 0.9716, + "step": 5008 + }, + { + "epoch": 0.36138667436239674, + "grad_norm": 2.058078766600951, + "learning_rate": 2.9544681157427076e-06, + "loss": 0.8834, + "step": 5009 + }, + { + "epoch": 0.36145882183182426, + "grad_norm": 1.9919684196136642, + "learning_rate": 2.954057384995402e-06, + "loss": 0.9876, + "step": 5010 + }, + { + "epoch": 0.3615309693012518, + "grad_norm": 2.19397779189548, + "learning_rate": 2.9536466021505544e-06, + "loss": 0.9254, + "step": 5011 + }, + { + "epoch": 0.36160311677067924, + "grad_norm": 1.8447593253898111, + "learning_rate": 2.953235767230596e-06, + "loss": 0.9267, + "step": 5012 + }, + { + "epoch": 0.36167526424010676, + "grad_norm": 3.666119288999516, + "learning_rate": 2.9528248802579613e-06, + "loss": 0.9449, + "step": 5013 + }, + { + "epoch": 0.3617474117095343, + "grad_norm": 2.0113024827215704, + "learning_rate": 2.952413941255087e-06, + "loss": 1.0539, + "step": 5014 + }, + { + "epoch": 0.3618195591789618, + "grad_norm": 2.1848409567766462, + "learning_rate": 2.9520029502444132e-06, + "loss": 0.8693, + "step": 5015 + }, + { + "epoch": 0.3618917066483893, + "grad_norm": 3.3243670671414622, + "learning_rate": 2.951591907248383e-06, + "loss": 0.8897, + "step": 5016 + }, + { + "epoch": 0.36196385411781684, + "grad_norm": 1.9530913693393648, + "learning_rate": 2.951180812289441e-06, + "loss": 0.8979, + "step": 5017 + }, + { + "epoch": 0.3620360015872443, + "grad_norm": 3.2589056801785174, + "learning_rate": 2.9507696653900355e-06, + "loss": 0.8898, + "step": 5018 + }, + { + "epoch": 0.3621081490566718, + "grad_norm": 2.1493644795294813, + "learning_rate": 2.950358466572618e-06, + "loss": 0.8818, + "step": 5019 + }, + { + "epoch": 0.36218029652609934, + "grad_norm": 2.7044739637258837, + "learning_rate": 2.949947215859644e-06, + "loss": 0.859, + "step": 5020 + }, + { + "epoch": 0.36225244399552686, + "grad_norm": 2.5587780602150683, + "learning_rate": 2.949535913273569e-06, + "loss": 0.714, + "step": 5021 + }, + { + "epoch": 0.3623245914649544, + "grad_norm": 1.9914473411537903, + "learning_rate": 2.949124558836852e-06, + "loss": 0.8765, + "step": 5022 + }, + { + "epoch": 0.3623967389343819, + "grad_norm": 1.9958483998706198, + "learning_rate": 2.948713152571957e-06, + "loss": 0.9652, + "step": 5023 + }, + { + "epoch": 0.36246888640380936, + "grad_norm": 1.9096966836165732, + "learning_rate": 2.948301694501348e-06, + "loss": 0.9741, + "step": 5024 + }, + { + "epoch": 0.3625410338732369, + "grad_norm": 2.2128913791853857, + "learning_rate": 2.9478901846474946e-06, + "loss": 1.0032, + "step": 5025 + }, + { + "epoch": 0.3626131813426644, + "grad_norm": 2.4196901372091886, + "learning_rate": 2.947478623032867e-06, + "loss": 0.9838, + "step": 5026 + }, + { + "epoch": 0.3626853288120919, + "grad_norm": 2.0359385445768496, + "learning_rate": 2.947067009679939e-06, + "loss": 0.9545, + "step": 5027 + }, + { + "epoch": 0.36275747628151944, + "grad_norm": 2.153899418125103, + "learning_rate": 2.9466553446111875e-06, + "loss": 0.9695, + "step": 5028 + }, + { + "epoch": 0.36282962375094696, + "grad_norm": 2.018900139956399, + "learning_rate": 2.9462436278490923e-06, + "loss": 0.9935, + "step": 5029 + }, + { + "epoch": 0.3629017712203744, + "grad_norm": 0.9545796035256374, + "learning_rate": 2.9458318594161357e-06, + "loss": 0.8549, + "step": 5030 + }, + { + "epoch": 0.36297391868980194, + "grad_norm": 6.762161601796322, + "learning_rate": 2.9454200393348022e-06, + "loss": 0.9578, + "step": 5031 + }, + { + "epoch": 0.36304606615922946, + "grad_norm": 3.6221227078981983, + "learning_rate": 2.9450081676275805e-06, + "loss": 0.954, + "step": 5032 + }, + { + "epoch": 0.363118213628657, + "grad_norm": 2.3207990861850796, + "learning_rate": 2.944596244316961e-06, + "loss": 0.9611, + "step": 5033 + }, + { + "epoch": 0.3631903610980845, + "grad_norm": 2.86355069130014, + "learning_rate": 2.944184269425437e-06, + "loss": 0.9799, + "step": 5034 + }, + { + "epoch": 0.363262508567512, + "grad_norm": 2.0322780648432484, + "learning_rate": 2.943772242975506e-06, + "loss": 0.8766, + "step": 5035 + }, + { + "epoch": 0.3633346560369395, + "grad_norm": 2.093876621346139, + "learning_rate": 2.943360164989666e-06, + "loss": 0.9562, + "step": 5036 + }, + { + "epoch": 0.363406803506367, + "grad_norm": 2.413348243502227, + "learning_rate": 2.94294803549042e-06, + "loss": 0.9095, + "step": 5037 + }, + { + "epoch": 0.3634789509757945, + "grad_norm": 2.137322879167589, + "learning_rate": 2.9425358545002717e-06, + "loss": 0.9201, + "step": 5038 + }, + { + "epoch": 0.36355109844522204, + "grad_norm": 1.8544851808179577, + "learning_rate": 2.94212362204173e-06, + "loss": 0.9097, + "step": 5039 + }, + { + "epoch": 0.36362324591464956, + "grad_norm": 1.9090863373317721, + "learning_rate": 2.941711338137305e-06, + "loss": 0.9173, + "step": 5040 + }, + { + "epoch": 0.363695393384077, + "grad_norm": 3.5201294927186715, + "learning_rate": 2.94129900280951e-06, + "loss": 0.9101, + "step": 5041 + }, + { + "epoch": 0.36376754085350455, + "grad_norm": 2.107151017623506, + "learning_rate": 2.94088661608086e-06, + "loss": 0.9322, + "step": 5042 + }, + { + "epoch": 0.36383968832293206, + "grad_norm": 2.0606824785060116, + "learning_rate": 2.940474177973876e-06, + "loss": 0.8893, + "step": 5043 + }, + { + "epoch": 0.3639118357923596, + "grad_norm": 4.327234279820215, + "learning_rate": 2.940061688511079e-06, + "loss": 0.8061, + "step": 5044 + }, + { + "epoch": 0.3639839832617871, + "grad_norm": 2.277033527896396, + "learning_rate": 2.9396491477149922e-06, + "loss": 0.973, + "step": 5045 + }, + { + "epoch": 0.3640561307312146, + "grad_norm": 2.8108765897147103, + "learning_rate": 2.9392365556081446e-06, + "loss": 0.9399, + "step": 5046 + }, + { + "epoch": 0.3641282782006421, + "grad_norm": 2.6967067484790554, + "learning_rate": 2.9388239122130653e-06, + "loss": 0.9771, + "step": 5047 + }, + { + "epoch": 0.3642004256700696, + "grad_norm": 2.1205355926743095, + "learning_rate": 2.9384112175522875e-06, + "loss": 0.9132, + "step": 5048 + }, + { + "epoch": 0.3642725731394971, + "grad_norm": 2.398859042918985, + "learning_rate": 2.937998471648347e-06, + "loss": 0.974, + "step": 5049 + }, + { + "epoch": 0.36434472060892464, + "grad_norm": 2.4741229728659975, + "learning_rate": 2.9375856745237827e-06, + "loss": 0.9097, + "step": 5050 + }, + { + "epoch": 0.36441686807835216, + "grad_norm": 1.8142030541563436, + "learning_rate": 2.937172826201135e-06, + "loss": 0.9625, + "step": 5051 + }, + { + "epoch": 0.3644890155477797, + "grad_norm": 2.607107673601712, + "learning_rate": 2.9367599267029494e-06, + "loss": 0.988, + "step": 5052 + }, + { + "epoch": 0.36456116301720715, + "grad_norm": 2.0995979060369896, + "learning_rate": 2.936346976051771e-06, + "loss": 0.9085, + "step": 5053 + }, + { + "epoch": 0.36463331048663467, + "grad_norm": 2.4350135054008173, + "learning_rate": 2.9359339742701505e-06, + "loss": 0.9256, + "step": 5054 + }, + { + "epoch": 0.3647054579560622, + "grad_norm": 2.3635143827093943, + "learning_rate": 2.9355209213806414e-06, + "loss": 1.0177, + "step": 5055 + }, + { + "epoch": 0.3647776054254897, + "grad_norm": 4.071907536339335, + "learning_rate": 2.935107817405797e-06, + "loss": 0.951, + "step": 5056 + }, + { + "epoch": 0.3648497528949172, + "grad_norm": 2.2188830268632582, + "learning_rate": 2.9346946623681764e-06, + "loss": 0.9082, + "step": 5057 + }, + { + "epoch": 0.36492190036434474, + "grad_norm": 1.9316501906479215, + "learning_rate": 2.934281456290341e-06, + "loss": 0.906, + "step": 5058 + }, + { + "epoch": 0.3649940478337722, + "grad_norm": 1.7581703330446026, + "learning_rate": 2.933868199194854e-06, + "loss": 0.9828, + "step": 5059 + }, + { + "epoch": 0.3650661953031997, + "grad_norm": 2.2966151706844613, + "learning_rate": 2.9334548911042807e-06, + "loss": 0.9502, + "step": 5060 + }, + { + "epoch": 0.36513834277262724, + "grad_norm": 2.0438311126198374, + "learning_rate": 2.933041532041192e-06, + "loss": 0.9976, + "step": 5061 + }, + { + "epoch": 0.36521049024205476, + "grad_norm": 2.578124260179818, + "learning_rate": 2.9326281220281594e-06, + "loss": 0.9743, + "step": 5062 + }, + { + "epoch": 0.3652826377114823, + "grad_norm": 2.0788845882555917, + "learning_rate": 2.932214661087757e-06, + "loss": 0.901, + "step": 5063 + }, + { + "epoch": 0.3653547851809098, + "grad_norm": 1.936904323438382, + "learning_rate": 2.9318011492425626e-06, + "loss": 0.9065, + "step": 5064 + }, + { + "epoch": 0.36542693265033727, + "grad_norm": 2.405845608109002, + "learning_rate": 2.9313875865151575e-06, + "loss": 0.9667, + "step": 5065 + }, + { + "epoch": 0.3654990801197648, + "grad_norm": 2.700278310207243, + "learning_rate": 2.9309739729281245e-06, + "loss": 0.9709, + "step": 5066 + }, + { + "epoch": 0.3655712275891923, + "grad_norm": 2.3453934057666577, + "learning_rate": 2.9305603085040485e-06, + "loss": 0.9925, + "step": 5067 + }, + { + "epoch": 0.3656433750586198, + "grad_norm": 1.9805573513218444, + "learning_rate": 2.930146593265519e-06, + "loss": 1.0457, + "step": 5068 + }, + { + "epoch": 0.36571552252804734, + "grad_norm": 0.9556908699421789, + "learning_rate": 2.9297328272351273e-06, + "loss": 0.8433, + "step": 5069 + }, + { + "epoch": 0.36578766999747486, + "grad_norm": 2.5244844231309123, + "learning_rate": 2.929319010435468e-06, + "loss": 0.8475, + "step": 5070 + }, + { + "epoch": 0.3658598174669023, + "grad_norm": 0.7126176285267942, + "learning_rate": 2.9289051428891375e-06, + "loss": 0.8306, + "step": 5071 + }, + { + "epoch": 0.36593196493632985, + "grad_norm": 2.252345769684806, + "learning_rate": 2.9284912246187355e-06, + "loss": 1.0096, + "step": 5072 + }, + { + "epoch": 0.36600411240575736, + "grad_norm": 3.8885726724460277, + "learning_rate": 2.9280772556468657e-06, + "loss": 0.896, + "step": 5073 + }, + { + "epoch": 0.3660762598751849, + "grad_norm": 3.568784424303446, + "learning_rate": 2.927663235996132e-06, + "loss": 0.862, + "step": 5074 + }, + { + "epoch": 0.3661484073446124, + "grad_norm": 5.279806086418389, + "learning_rate": 2.9272491656891435e-06, + "loss": 0.8772, + "step": 5075 + }, + { + "epoch": 0.3662205548140399, + "grad_norm": 2.503038181509032, + "learning_rate": 2.92683504474851e-06, + "loss": 1.066, + "step": 5076 + }, + { + "epoch": 0.3662927022834674, + "grad_norm": 2.576833690577738, + "learning_rate": 2.9264208731968465e-06, + "loss": 0.9342, + "step": 5077 + }, + { + "epoch": 0.3663648497528949, + "grad_norm": 2.169773319903631, + "learning_rate": 2.926006651056768e-06, + "loss": 1.0358, + "step": 5078 + }, + { + "epoch": 0.3664369972223224, + "grad_norm": 2.28000376901817, + "learning_rate": 2.9255923783508945e-06, + "loss": 1.0141, + "step": 5079 + }, + { + "epoch": 0.36650914469174994, + "grad_norm": 2.2707693691752624, + "learning_rate": 2.9251780551018473e-06, + "loss": 0.9528, + "step": 5080 + }, + { + "epoch": 0.36658129216117746, + "grad_norm": 2.245683450126622, + "learning_rate": 2.9247636813322514e-06, + "loss": 0.9036, + "step": 5081 + }, + { + "epoch": 0.366653439630605, + "grad_norm": 2.4568626940333407, + "learning_rate": 2.924349257064735e-06, + "loss": 0.9266, + "step": 5082 + }, + { + "epoch": 0.36672558710003245, + "grad_norm": 2.580356348992686, + "learning_rate": 2.923934782321927e-06, + "loss": 1.0242, + "step": 5083 + }, + { + "epoch": 0.36679773456945997, + "grad_norm": 3.225484672617592, + "learning_rate": 2.9235202571264605e-06, + "loss": 1.0091, + "step": 5084 + }, + { + "epoch": 0.3668698820388875, + "grad_norm": 2.763512877079361, + "learning_rate": 2.9231056815009723e-06, + "loss": 0.964, + "step": 5085 + }, + { + "epoch": 0.366942029508315, + "grad_norm": 2.2385778846372886, + "learning_rate": 2.9226910554681e-06, + "loss": 0.9765, + "step": 5086 + }, + { + "epoch": 0.3670141769777425, + "grad_norm": 2.22585652184937, + "learning_rate": 2.922276379050484e-06, + "loss": 0.9628, + "step": 5087 + }, + { + "epoch": 0.36708632444717004, + "grad_norm": 2.730169320776842, + "learning_rate": 2.9218616522707695e-06, + "loss": 0.8557, + "step": 5088 + }, + { + "epoch": 0.3671584719165975, + "grad_norm": 2.071208946912411, + "learning_rate": 2.9214468751516032e-06, + "loss": 0.948, + "step": 5089 + }, + { + "epoch": 0.367230619386025, + "grad_norm": 3.7022814856318718, + "learning_rate": 2.9210320477156335e-06, + "loss": 0.9501, + "step": 5090 + }, + { + "epoch": 0.36730276685545254, + "grad_norm": 2.5201704766875164, + "learning_rate": 2.920617169985513e-06, + "loss": 0.9706, + "step": 5091 + }, + { + "epoch": 0.36737491432488006, + "grad_norm": 4.054409010185702, + "learning_rate": 2.9202022419838974e-06, + "loss": 0.9036, + "step": 5092 + }, + { + "epoch": 0.3674470617943076, + "grad_norm": 1.9585034587363077, + "learning_rate": 2.919787263733444e-06, + "loss": 0.9296, + "step": 5093 + }, + { + "epoch": 0.36751920926373505, + "grad_norm": 2.5380222478422034, + "learning_rate": 2.9193722352568125e-06, + "loss": 0.9287, + "step": 5094 + }, + { + "epoch": 0.36759135673316257, + "grad_norm": 3.8945343109394317, + "learning_rate": 2.9189571565766667e-06, + "loss": 0.9064, + "step": 5095 + }, + { + "epoch": 0.3676635042025901, + "grad_norm": 2.778543677175671, + "learning_rate": 2.9185420277156727e-06, + "loss": 0.9858, + "step": 5096 + }, + { + "epoch": 0.3677356516720176, + "grad_norm": 0.8249080361058886, + "learning_rate": 2.9181268486964996e-06, + "loss": 0.8063, + "step": 5097 + }, + { + "epoch": 0.3678077991414451, + "grad_norm": 2.0460226599369777, + "learning_rate": 2.917711619541817e-06, + "loss": 1.0063, + "step": 5098 + }, + { + "epoch": 0.36787994661087264, + "grad_norm": 4.198681833407194, + "learning_rate": 2.9172963402743005e-06, + "loss": 0.8664, + "step": 5099 + }, + { + "epoch": 0.3679520940803001, + "grad_norm": 2.1566927911315057, + "learning_rate": 2.9168810109166268e-06, + "loss": 1.0003, + "step": 5100 + }, + { + "epoch": 0.3680242415497276, + "grad_norm": 1.8420511598099156, + "learning_rate": 2.9164656314914747e-06, + "loss": 0.915, + "step": 5101 + }, + { + "epoch": 0.36809638901915515, + "grad_norm": 2.737275596025843, + "learning_rate": 2.9160502020215277e-06, + "loss": 0.9004, + "step": 5102 + }, + { + "epoch": 0.36816853648858267, + "grad_norm": 2.6944237169689815, + "learning_rate": 2.9156347225294698e-06, + "loss": 0.8688, + "step": 5103 + }, + { + "epoch": 0.3682406839580102, + "grad_norm": 2.0914592452632608, + "learning_rate": 2.9152191930379895e-06, + "loss": 0.7858, + "step": 5104 + }, + { + "epoch": 0.3683128314274377, + "grad_norm": 0.7004604928123288, + "learning_rate": 2.914803613569777e-06, + "loss": 0.8199, + "step": 5105 + }, + { + "epoch": 0.36838497889686517, + "grad_norm": 2.3718862701670957, + "learning_rate": 2.9143879841475256e-06, + "loss": 0.9107, + "step": 5106 + }, + { + "epoch": 0.3684571263662927, + "grad_norm": 2.554825338675604, + "learning_rate": 2.9139723047939315e-06, + "loss": 1.045, + "step": 5107 + }, + { + "epoch": 0.3685292738357202, + "grad_norm": 2.362887868376688, + "learning_rate": 2.9135565755316932e-06, + "loss": 0.8847, + "step": 5108 + }, + { + "epoch": 0.3686014213051477, + "grad_norm": 3.2589333341978026, + "learning_rate": 2.9131407963835122e-06, + "loss": 0.9381, + "step": 5109 + }, + { + "epoch": 0.36867356877457524, + "grad_norm": 2.084760469541417, + "learning_rate": 2.9127249673720924e-06, + "loss": 0.8982, + "step": 5110 + }, + { + "epoch": 0.36874571624400276, + "grad_norm": 2.4618685459769982, + "learning_rate": 2.912309088520141e-06, + "loss": 0.8773, + "step": 5111 + }, + { + "epoch": 0.3688178637134302, + "grad_norm": 2.425431230880645, + "learning_rate": 2.911893159850368e-06, + "loss": 0.9129, + "step": 5112 + }, + { + "epoch": 0.36889001118285775, + "grad_norm": 2.1302513982497167, + "learning_rate": 2.911477181385485e-06, + "loss": 0.9774, + "step": 5113 + }, + { + "epoch": 0.36896215865228527, + "grad_norm": 0.7649999129850051, + "learning_rate": 2.911061153148207e-06, + "loss": 0.7722, + "step": 5114 + }, + { + "epoch": 0.3690343061217128, + "grad_norm": 1.9780013686079916, + "learning_rate": 2.9106450751612525e-06, + "loss": 1.0203, + "step": 5115 + }, + { + "epoch": 0.3691064535911403, + "grad_norm": 3.0765714646223823, + "learning_rate": 2.9102289474473407e-06, + "loss": 0.8727, + "step": 5116 + }, + { + "epoch": 0.3691786010605678, + "grad_norm": 4.9541043537777965, + "learning_rate": 2.9098127700291964e-06, + "loss": 0.9503, + "step": 5117 + }, + { + "epoch": 0.3692507485299953, + "grad_norm": 2.7530421856170952, + "learning_rate": 2.909396542929545e-06, + "loss": 0.9006, + "step": 5118 + }, + { + "epoch": 0.3693228959994228, + "grad_norm": 2.9843682893208254, + "learning_rate": 2.908980266171114e-06, + "loss": 0.8389, + "step": 5119 + }, + { + "epoch": 0.3693950434688503, + "grad_norm": 2.539495249329758, + "learning_rate": 2.9085639397766358e-06, + "loss": 0.9756, + "step": 5120 + }, + { + "epoch": 0.36946719093827785, + "grad_norm": 2.437906475898838, + "learning_rate": 2.908147563768845e-06, + "loss": 0.9607, + "step": 5121 + }, + { + "epoch": 0.36953933840770536, + "grad_norm": 36.08578950364759, + "learning_rate": 2.9077311381704766e-06, + "loss": 0.8922, + "step": 5122 + }, + { + "epoch": 0.3696114858771329, + "grad_norm": 2.630913023766283, + "learning_rate": 2.9073146630042716e-06, + "loss": 0.8696, + "step": 5123 + }, + { + "epoch": 0.36968363334656035, + "grad_norm": 2.260199529060063, + "learning_rate": 2.906898138292972e-06, + "loss": 1.0075, + "step": 5124 + }, + { + "epoch": 0.36975578081598787, + "grad_norm": 1.8337820833255998, + "learning_rate": 2.9064815640593216e-06, + "loss": 0.9388, + "step": 5125 + }, + { + "epoch": 0.3698279282854154, + "grad_norm": 6.5123361753626865, + "learning_rate": 2.9060649403260686e-06, + "loss": 0.8883, + "step": 5126 + }, + { + "epoch": 0.3699000757548429, + "grad_norm": 2.0507964796681524, + "learning_rate": 2.905648267115964e-06, + "loss": 0.9807, + "step": 5127 + }, + { + "epoch": 0.3699722232242704, + "grad_norm": 2.0836870147577278, + "learning_rate": 2.9052315444517597e-06, + "loss": 0.8741, + "step": 5128 + }, + { + "epoch": 0.37004437069369794, + "grad_norm": 2.3354411028338755, + "learning_rate": 2.9048147723562116e-06, + "loss": 0.9461, + "step": 5129 + }, + { + "epoch": 0.3701165181631254, + "grad_norm": 2.321377288335747, + "learning_rate": 2.904397950852079e-06, + "loss": 0.9042, + "step": 5130 + }, + { + "epoch": 0.3701886656325529, + "grad_norm": 2.154534804899866, + "learning_rate": 2.903981079962121e-06, + "loss": 0.8589, + "step": 5131 + }, + { + "epoch": 0.37026081310198045, + "grad_norm": 4.894219877989886, + "learning_rate": 2.9035641597091044e-06, + "loss": 0.8001, + "step": 5132 + }, + { + "epoch": 0.37033296057140797, + "grad_norm": 3.00701687681272, + "learning_rate": 2.903147190115793e-06, + "loss": 0.8264, + "step": 5133 + }, + { + "epoch": 0.3704051080408355, + "grad_norm": 3.828367762750553, + "learning_rate": 2.902730171204956e-06, + "loss": 0.8267, + "step": 5134 + }, + { + "epoch": 0.370477255510263, + "grad_norm": 2.1712451268450526, + "learning_rate": 2.9023131029993676e-06, + "loss": 0.974, + "step": 5135 + }, + { + "epoch": 0.37054940297969047, + "grad_norm": 4.480907841046951, + "learning_rate": 2.9018959855218e-06, + "loss": 0.9748, + "step": 5136 + }, + { + "epoch": 0.370621550449118, + "grad_norm": 2.262227319123764, + "learning_rate": 2.9014788187950316e-06, + "loss": 0.8524, + "step": 5137 + }, + { + "epoch": 0.3706936979185455, + "grad_norm": 2.5413472855649597, + "learning_rate": 2.9010616028418422e-06, + "loss": 0.9895, + "step": 5138 + }, + { + "epoch": 0.370765845387973, + "grad_norm": 1.9511451757549843, + "learning_rate": 2.9006443376850146e-06, + "loss": 0.9576, + "step": 5139 + }, + { + "epoch": 0.37083799285740054, + "grad_norm": 4.007630937118513, + "learning_rate": 2.9002270233473336e-06, + "loss": 0.8617, + "step": 5140 + }, + { + "epoch": 0.37091014032682806, + "grad_norm": 2.2862086293694137, + "learning_rate": 2.8998096598515868e-06, + "loss": 0.8722, + "step": 5141 + }, + { + "epoch": 0.3709822877962555, + "grad_norm": 2.4460279496686668, + "learning_rate": 2.8993922472205653e-06, + "loss": 0.9321, + "step": 5142 + }, + { + "epoch": 0.37105443526568305, + "grad_norm": 2.643859146787006, + "learning_rate": 2.898974785477064e-06, + "loss": 0.9485, + "step": 5143 + }, + { + "epoch": 0.37112658273511057, + "grad_norm": 5.99366584854003, + "learning_rate": 2.8985572746438762e-06, + "loss": 0.8837, + "step": 5144 + }, + { + "epoch": 0.3711987302045381, + "grad_norm": 3.964362655541044, + "learning_rate": 2.8981397147438026e-06, + "loss": 0.9974, + "step": 5145 + }, + { + "epoch": 0.3712708776739656, + "grad_norm": 1.8826928318346963, + "learning_rate": 2.8977221057996434e-06, + "loss": 0.992, + "step": 5146 + }, + { + "epoch": 0.37134302514339307, + "grad_norm": 2.0501999966826907, + "learning_rate": 2.8973044478342037e-06, + "loss": 1.0802, + "step": 5147 + }, + { + "epoch": 0.3714151726128206, + "grad_norm": 3.930567982133169, + "learning_rate": 2.8968867408702903e-06, + "loss": 0.9053, + "step": 5148 + }, + { + "epoch": 0.3714873200822481, + "grad_norm": 2.6438875527817274, + "learning_rate": 2.8964689849307116e-06, + "loss": 0.9723, + "step": 5149 + }, + { + "epoch": 0.3715594675516756, + "grad_norm": 2.443431581791776, + "learning_rate": 2.89605118003828e-06, + "loss": 0.9615, + "step": 5150 + }, + { + "epoch": 0.37163161502110315, + "grad_norm": 2.279928896113163, + "learning_rate": 2.895633326215811e-06, + "loss": 1.0205, + "step": 5151 + }, + { + "epoch": 0.37170376249053066, + "grad_norm": 2.6409669011449415, + "learning_rate": 2.8952154234861215e-06, + "loss": 1.1246, + "step": 5152 + }, + { + "epoch": 0.37177590995995813, + "grad_norm": 2.291092424983526, + "learning_rate": 2.8947974718720315e-06, + "loss": 0.9535, + "step": 5153 + }, + { + "epoch": 0.37184805742938565, + "grad_norm": 2.099385775702834, + "learning_rate": 2.8943794713963644e-06, + "loss": 0.8096, + "step": 5154 + }, + { + "epoch": 0.37192020489881317, + "grad_norm": 3.9971433213922274, + "learning_rate": 2.8939614220819447e-06, + "loss": 0.865, + "step": 5155 + }, + { + "epoch": 0.3719923523682407, + "grad_norm": 0.8995639539441972, + "learning_rate": 2.8935433239516018e-06, + "loss": 0.8083, + "step": 5156 + }, + { + "epoch": 0.3720644998376682, + "grad_norm": 1.8854834894242913, + "learning_rate": 2.8931251770281647e-06, + "loss": 1.0011, + "step": 5157 + }, + { + "epoch": 0.3721366473070957, + "grad_norm": 2.8101220144477685, + "learning_rate": 2.892706981334469e-06, + "loss": 0.7961, + "step": 5158 + }, + { + "epoch": 0.3722087947765232, + "grad_norm": 2.8524818924004003, + "learning_rate": 2.892288736893349e-06, + "loss": 0.9085, + "step": 5159 + }, + { + "epoch": 0.3722809422459507, + "grad_norm": 2.520125823134375, + "learning_rate": 2.8918704437276448e-06, + "loss": 0.9013, + "step": 5160 + }, + { + "epoch": 0.3723530897153782, + "grad_norm": 2.316151596918835, + "learning_rate": 2.8914521018601962e-06, + "loss": 0.8782, + "step": 5161 + }, + { + "epoch": 0.37242523718480575, + "grad_norm": 2.013310249187855, + "learning_rate": 2.8910337113138494e-06, + "loss": 0.9413, + "step": 5162 + }, + { + "epoch": 0.37249738465423327, + "grad_norm": 2.508718261670044, + "learning_rate": 2.89061527211145e-06, + "loss": 0.9803, + "step": 5163 + }, + { + "epoch": 0.3725695321236608, + "grad_norm": 1.9132749844779888, + "learning_rate": 2.8901967842758474e-06, + "loss": 0.9403, + "step": 5164 + }, + { + "epoch": 0.37264167959308825, + "grad_norm": 2.9867772524944116, + "learning_rate": 2.8897782478298935e-06, + "loss": 0.9548, + "step": 5165 + }, + { + "epoch": 0.37271382706251577, + "grad_norm": 3.147738925873457, + "learning_rate": 2.8893596627964437e-06, + "loss": 0.9344, + "step": 5166 + }, + { + "epoch": 0.3727859745319433, + "grad_norm": 3.0905301070808933, + "learning_rate": 2.888941029198355e-06, + "loss": 0.9787, + "step": 5167 + }, + { + "epoch": 0.3728581220013708, + "grad_norm": 4.51027735024389, + "learning_rate": 2.8885223470584866e-06, + "loss": 0.8843, + "step": 5168 + }, + { + "epoch": 0.3729302694707983, + "grad_norm": 1.9995845720854724, + "learning_rate": 2.888103616399703e-06, + "loss": 0.9333, + "step": 5169 + }, + { + "epoch": 0.37300241694022584, + "grad_norm": 2.0388453576236394, + "learning_rate": 2.8876848372448686e-06, + "loss": 0.9012, + "step": 5170 + }, + { + "epoch": 0.3730745644096533, + "grad_norm": 3.263560249250545, + "learning_rate": 2.887266009616851e-06, + "loss": 0.8736, + "step": 5171 + }, + { + "epoch": 0.37314671187908083, + "grad_norm": 2.6787025710200187, + "learning_rate": 2.886847133538521e-06, + "loss": 0.9451, + "step": 5172 + }, + { + "epoch": 0.37321885934850835, + "grad_norm": 2.0330095153150594, + "learning_rate": 2.8864282090327524e-06, + "loss": 0.8935, + "step": 5173 + }, + { + "epoch": 0.37329100681793587, + "grad_norm": 2.642390727343503, + "learning_rate": 2.88600923612242e-06, + "loss": 0.9586, + "step": 5174 + }, + { + "epoch": 0.3733631542873634, + "grad_norm": 2.3233838032185092, + "learning_rate": 2.8855902148304042e-06, + "loss": 1.06, + "step": 5175 + }, + { + "epoch": 0.3734353017567909, + "grad_norm": 2.3639166364026214, + "learning_rate": 2.8851711451795843e-06, + "loss": 0.9031, + "step": 5176 + }, + { + "epoch": 0.37350744922621837, + "grad_norm": 0.8588753374743364, + "learning_rate": 2.8847520271928457e-06, + "loss": 0.8457, + "step": 5177 + }, + { + "epoch": 0.3735795966956459, + "grad_norm": 5.375825330666896, + "learning_rate": 2.8843328608930737e-06, + "loss": 1.0015, + "step": 5178 + }, + { + "epoch": 0.3736517441650734, + "grad_norm": 4.400821669758791, + "learning_rate": 2.8839136463031575e-06, + "loss": 0.9705, + "step": 5179 + }, + { + "epoch": 0.3737238916345009, + "grad_norm": 3.8524296463391354, + "learning_rate": 2.8834943834459896e-06, + "loss": 0.9383, + "step": 5180 + }, + { + "epoch": 0.37379603910392845, + "grad_norm": 2.142337309136487, + "learning_rate": 2.8830750723444643e-06, + "loss": 0.9322, + "step": 5181 + }, + { + "epoch": 0.37386818657335597, + "grad_norm": 2.2842936656578012, + "learning_rate": 2.882655713021478e-06, + "loss": 1.0012, + "step": 5182 + }, + { + "epoch": 0.37394033404278343, + "grad_norm": 2.1622016700956244, + "learning_rate": 2.8822363054999303e-06, + "loss": 0.8764, + "step": 5183 + }, + { + "epoch": 0.37401248151221095, + "grad_norm": 2.022621844644798, + "learning_rate": 2.8818168498027243e-06, + "loss": 0.8805, + "step": 5184 + }, + { + "epoch": 0.37408462898163847, + "grad_norm": 2.0341296398142976, + "learning_rate": 2.8813973459527643e-06, + "loss": 0.9012, + "step": 5185 + }, + { + "epoch": 0.374156776451066, + "grad_norm": 3.1567716261894128, + "learning_rate": 2.880977793972958e-06, + "loss": 0.8692, + "step": 5186 + }, + { + "epoch": 0.3742289239204935, + "grad_norm": 1.7171106323032088, + "learning_rate": 2.8805581938862165e-06, + "loss": 1.0264, + "step": 5187 + }, + { + "epoch": 0.374301071389921, + "grad_norm": 2.5594773492155505, + "learning_rate": 2.880138545715451e-06, + "loss": 0.9928, + "step": 5188 + }, + { + "epoch": 0.3743732188593485, + "grad_norm": 1.5825442389019224, + "learning_rate": 2.879718849483578e-06, + "loss": 0.9546, + "step": 5189 + }, + { + "epoch": 0.374445366328776, + "grad_norm": 2.784312244739892, + "learning_rate": 2.8792991052135147e-06, + "loss": 0.9027, + "step": 5190 + }, + { + "epoch": 0.3745175137982035, + "grad_norm": 3.511177063643105, + "learning_rate": 2.878879312928183e-06, + "loss": 0.852, + "step": 5191 + }, + { + "epoch": 0.37458966126763105, + "grad_norm": 3.011716850072358, + "learning_rate": 2.878459472650505e-06, + "loss": 0.911, + "step": 5192 + }, + { + "epoch": 0.37466180873705857, + "grad_norm": 3.8586876137364152, + "learning_rate": 2.8780395844034077e-06, + "loss": 0.9273, + "step": 5193 + }, + { + "epoch": 0.3747339562064861, + "grad_norm": 3.639085829034751, + "learning_rate": 2.8776196482098184e-06, + "loss": 0.8524, + "step": 5194 + }, + { + "epoch": 0.37480610367591355, + "grad_norm": 2.3586337712554566, + "learning_rate": 2.8771996640926686e-06, + "loss": 0.8829, + "step": 5195 + }, + { + "epoch": 0.37487825114534107, + "grad_norm": 2.5996396401905586, + "learning_rate": 2.8767796320748938e-06, + "loss": 0.9982, + "step": 5196 + }, + { + "epoch": 0.3749503986147686, + "grad_norm": 2.2919292530705047, + "learning_rate": 2.8763595521794278e-06, + "loss": 1.0229, + "step": 5197 + }, + { + "epoch": 0.3750225460841961, + "grad_norm": 1.677382622149785, + "learning_rate": 2.8759394244292116e-06, + "loss": 0.9561, + "step": 5198 + }, + { + "epoch": 0.3750946935536236, + "grad_norm": 1.9275094532023567, + "learning_rate": 2.8755192488471856e-06, + "loss": 1.0112, + "step": 5199 + }, + { + "epoch": 0.3751668410230511, + "grad_norm": 2.3736753283060814, + "learning_rate": 2.8750990254562945e-06, + "loss": 0.9145, + "step": 5200 + }, + { + "epoch": 0.3752389884924786, + "grad_norm": 2.133233332773723, + "learning_rate": 2.8746787542794857e-06, + "loss": 0.915, + "step": 5201 + }, + { + "epoch": 0.37531113596190613, + "grad_norm": 2.712581432895671, + "learning_rate": 2.8742584353397075e-06, + "loss": 1.0339, + "step": 5202 + }, + { + "epoch": 0.37538328343133365, + "grad_norm": 5.371739840402164, + "learning_rate": 2.8738380686599123e-06, + "loss": 0.9854, + "step": 5203 + }, + { + "epoch": 0.37545543090076117, + "grad_norm": 0.8176199182872377, + "learning_rate": 2.873417654263056e-06, + "loss": 0.794, + "step": 5204 + }, + { + "epoch": 0.3755275783701887, + "grad_norm": 3.015001459833048, + "learning_rate": 2.8729971921720945e-06, + "loss": 0.9731, + "step": 5205 + }, + { + "epoch": 0.37559972583961615, + "grad_norm": 3.0091926875997017, + "learning_rate": 2.8725766824099874e-06, + "loss": 0.9306, + "step": 5206 + }, + { + "epoch": 0.37567187330904367, + "grad_norm": 2.638149749126444, + "learning_rate": 2.8721561249996982e-06, + "loss": 0.9665, + "step": 5207 + }, + { + "epoch": 0.3757440207784712, + "grad_norm": 2.957126389135784, + "learning_rate": 2.8717355199641917e-06, + "loss": 0.8501, + "step": 5208 + }, + { + "epoch": 0.3758161682478987, + "grad_norm": 2.545378918422243, + "learning_rate": 2.8713148673264357e-06, + "loss": 1.0224, + "step": 5209 + }, + { + "epoch": 0.3758883157173262, + "grad_norm": 2.313066980087286, + "learning_rate": 2.8708941671094003e-06, + "loss": 0.9187, + "step": 5210 + }, + { + "epoch": 0.37596046318675375, + "grad_norm": 2.971422137263951, + "learning_rate": 2.870473419336058e-06, + "loss": 0.9608, + "step": 5211 + }, + { + "epoch": 0.3760326106561812, + "grad_norm": 2.649864420031629, + "learning_rate": 2.8700526240293848e-06, + "loss": 0.9553, + "step": 5212 + }, + { + "epoch": 0.37610475812560873, + "grad_norm": 2.433646946994565, + "learning_rate": 2.8696317812123588e-06, + "loss": 0.9647, + "step": 5213 + }, + { + "epoch": 0.37617690559503625, + "grad_norm": 2.554078102339539, + "learning_rate": 2.8692108909079603e-06, + "loss": 1.0153, + "step": 5214 + }, + { + "epoch": 0.37624905306446377, + "grad_norm": 3.425225555343833, + "learning_rate": 2.8687899531391724e-06, + "loss": 0.8682, + "step": 5215 + }, + { + "epoch": 0.3763212005338913, + "grad_norm": 3.0874387541962816, + "learning_rate": 2.8683689679289822e-06, + "loss": 0.9736, + "step": 5216 + }, + { + "epoch": 0.3763933480033188, + "grad_norm": 5.3409328954524025, + "learning_rate": 2.8679479353003765e-06, + "loss": 0.9566, + "step": 5217 + }, + { + "epoch": 0.37646549547274627, + "grad_norm": 5.847918161197911, + "learning_rate": 2.8675268552763477e-06, + "loss": 0.9311, + "step": 5218 + }, + { + "epoch": 0.3765376429421738, + "grad_norm": 2.5378296661454005, + "learning_rate": 2.8671057278798883e-06, + "loss": 0.8735, + "step": 5219 + }, + { + "epoch": 0.3766097904116013, + "grad_norm": 2.414478550091874, + "learning_rate": 2.8666845531339957e-06, + "loss": 0.8987, + "step": 5220 + }, + { + "epoch": 0.3766819378810288, + "grad_norm": 4.119416822250977, + "learning_rate": 2.866263331061667e-06, + "loss": 0.9646, + "step": 5221 + }, + { + "epoch": 0.37675408535045635, + "grad_norm": 2.8586690062411306, + "learning_rate": 2.865842061685905e-06, + "loss": 0.7954, + "step": 5222 + }, + { + "epoch": 0.37682623281988387, + "grad_norm": 2.139820142478564, + "learning_rate": 2.8654207450297134e-06, + "loss": 1.0055, + "step": 5223 + }, + { + "epoch": 0.37689838028931133, + "grad_norm": 2.3940203123131227, + "learning_rate": 2.864999381116098e-06, + "loss": 0.9487, + "step": 5224 + }, + { + "epoch": 0.37697052775873885, + "grad_norm": 2.250575309952778, + "learning_rate": 2.86457796996807e-06, + "loss": 1.011, + "step": 5225 + }, + { + "epoch": 0.37704267522816637, + "grad_norm": 4.529482371754003, + "learning_rate": 2.8641565116086384e-06, + "loss": 0.9407, + "step": 5226 + }, + { + "epoch": 0.3771148226975939, + "grad_norm": 7.501614460430464, + "learning_rate": 2.8637350060608186e-06, + "loss": 0.9747, + "step": 5227 + }, + { + "epoch": 0.3771869701670214, + "grad_norm": 2.6864886487887496, + "learning_rate": 2.8633134533476284e-06, + "loss": 0.8991, + "step": 5228 + }, + { + "epoch": 0.3772591176364489, + "grad_norm": 2.8651513293712347, + "learning_rate": 2.8628918534920856e-06, + "loss": 0.9597, + "step": 5229 + }, + { + "epoch": 0.3773312651058764, + "grad_norm": 2.3680319711872433, + "learning_rate": 2.862470206517213e-06, + "loss": 1.0076, + "step": 5230 + }, + { + "epoch": 0.3774034125753039, + "grad_norm": 1.8897228137615718, + "learning_rate": 2.862048512446036e-06, + "loss": 0.9575, + "step": 5231 + }, + { + "epoch": 0.37747556004473143, + "grad_norm": 2.0969376851702246, + "learning_rate": 2.8616267713015806e-06, + "loss": 1.0467, + "step": 5232 + }, + { + "epoch": 0.37754770751415895, + "grad_norm": 2.2845115860837044, + "learning_rate": 2.8612049831068767e-06, + "loss": 1.0802, + "step": 5233 + }, + { + "epoch": 0.37761985498358647, + "grad_norm": 2.010152558947131, + "learning_rate": 2.860783147884956e-06, + "loss": 0.9094, + "step": 5234 + }, + { + "epoch": 0.377692002453014, + "grad_norm": 2.9669521308742772, + "learning_rate": 2.860361265658855e-06, + "loss": 0.9267, + "step": 5235 + }, + { + "epoch": 0.37776414992244145, + "grad_norm": 2.528749243169983, + "learning_rate": 2.8599393364516103e-06, + "loss": 0.8755, + "step": 5236 + }, + { + "epoch": 0.37783629739186897, + "grad_norm": 2.0578770701966773, + "learning_rate": 2.8595173602862613e-06, + "loss": 0.9921, + "step": 5237 + }, + { + "epoch": 0.3779084448612965, + "grad_norm": 3.4628114470731663, + "learning_rate": 2.8590953371858515e-06, + "loss": 0.9458, + "step": 5238 + }, + { + "epoch": 0.377980592330724, + "grad_norm": 1.7115900197584393, + "learning_rate": 2.858673267173425e-06, + "loss": 1.0102, + "step": 5239 + }, + { + "epoch": 0.3780527398001515, + "grad_norm": 2.1125104418615015, + "learning_rate": 2.8582511502720313e-06, + "loss": 0.9343, + "step": 5240 + }, + { + "epoch": 0.37812488726957905, + "grad_norm": 2.290698719789971, + "learning_rate": 2.8578289865047184e-06, + "loss": 0.9536, + "step": 5241 + }, + { + "epoch": 0.3781970347390065, + "grad_norm": 2.6643571886419832, + "learning_rate": 2.85740677589454e-06, + "loss": 0.8681, + "step": 5242 + }, + { + "epoch": 0.37826918220843403, + "grad_norm": 2.8373828073911196, + "learning_rate": 2.8569845184645526e-06, + "loss": 0.9762, + "step": 5243 + }, + { + "epoch": 0.37834132967786155, + "grad_norm": 3.743162151667592, + "learning_rate": 2.856562214237812e-06, + "loss": 0.8517, + "step": 5244 + }, + { + "epoch": 0.37841347714728907, + "grad_norm": 1.9334200694369896, + "learning_rate": 2.8561398632373805e-06, + "loss": 1.0115, + "step": 5245 + }, + { + "epoch": 0.3784856246167166, + "grad_norm": 5.410700735518535, + "learning_rate": 2.85571746548632e-06, + "loss": 0.9273, + "step": 5246 + }, + { + "epoch": 0.37855777208614405, + "grad_norm": 2.764469832213141, + "learning_rate": 2.8552950210076966e-06, + "loss": 0.8932, + "step": 5247 + }, + { + "epoch": 0.37862991955557157, + "grad_norm": 2.437499510936199, + "learning_rate": 2.8548725298245783e-06, + "loss": 1.0238, + "step": 5248 + }, + { + "epoch": 0.3787020670249991, + "grad_norm": 2.4293849548182993, + "learning_rate": 2.8544499919600355e-06, + "loss": 0.9007, + "step": 5249 + }, + { + "epoch": 0.3787742144944266, + "grad_norm": 2.3604464087726593, + "learning_rate": 2.854027407437142e-06, + "loss": 0.9176, + "step": 5250 + }, + { + "epoch": 0.37884636196385413, + "grad_norm": 5.6061497503905215, + "learning_rate": 2.8536047762789733e-06, + "loss": 1.0011, + "step": 5251 + }, + { + "epoch": 0.37891850943328165, + "grad_norm": 3.247355191920446, + "learning_rate": 2.853182098508608e-06, + "loss": 1.0232, + "step": 5252 + }, + { + "epoch": 0.3789906569027091, + "grad_norm": 1.768910549681888, + "learning_rate": 2.8527593741491255e-06, + "loss": 0.9478, + "step": 5253 + }, + { + "epoch": 0.37906280437213663, + "grad_norm": 2.0094671296494506, + "learning_rate": 2.8523366032236117e-06, + "loss": 0.9758, + "step": 5254 + }, + { + "epoch": 0.37913495184156415, + "grad_norm": 5.80859375, + "learning_rate": 2.8519137857551507e-06, + "loss": 0.8991, + "step": 5255 + }, + { + "epoch": 0.37920709931099167, + "grad_norm": 2.316874310452462, + "learning_rate": 2.8514909217668318e-06, + "loss": 0.9928, + "step": 5256 + }, + { + "epoch": 0.3792792467804192, + "grad_norm": 2.51537819356554, + "learning_rate": 2.851068011281745e-06, + "loss": 0.9888, + "step": 5257 + }, + { + "epoch": 0.3793513942498467, + "grad_norm": 3.772528434226618, + "learning_rate": 2.8506450543229856e-06, + "loss": 0.9513, + "step": 5258 + }, + { + "epoch": 0.37942354171927417, + "grad_norm": 4.749255674168128, + "learning_rate": 2.8502220509136483e-06, + "loss": 0.8809, + "step": 5259 + }, + { + "epoch": 0.3794956891887017, + "grad_norm": 3.0834549201155457, + "learning_rate": 2.849799001076832e-06, + "loss": 0.9324, + "step": 5260 + }, + { + "epoch": 0.3795678366581292, + "grad_norm": 2.701542826768982, + "learning_rate": 2.8493759048356387e-06, + "loss": 0.9911, + "step": 5261 + }, + { + "epoch": 0.37963998412755673, + "grad_norm": 7.792891407824224, + "learning_rate": 2.8489527622131708e-06, + "loss": 0.8465, + "step": 5262 + }, + { + "epoch": 0.37971213159698425, + "grad_norm": 0.7678382568634853, + "learning_rate": 2.8485295732325355e-06, + "loss": 0.7967, + "step": 5263 + }, + { + "epoch": 0.37978427906641177, + "grad_norm": 2.34101148197922, + "learning_rate": 2.8481063379168415e-06, + "loss": 0.9032, + "step": 5264 + }, + { + "epoch": 0.37985642653583923, + "grad_norm": 4.29410066685789, + "learning_rate": 2.8476830562892e-06, + "loss": 0.9759, + "step": 5265 + }, + { + "epoch": 0.37992857400526675, + "grad_norm": 0.7503103567757714, + "learning_rate": 2.847259728372725e-06, + "loss": 0.8224, + "step": 5266 + }, + { + "epoch": 0.38000072147469427, + "grad_norm": 3.838611548661963, + "learning_rate": 2.8468363541905324e-06, + "loss": 0.9792, + "step": 5267 + }, + { + "epoch": 0.3800728689441218, + "grad_norm": 2.584057819132937, + "learning_rate": 2.8464129337657413e-06, + "loss": 1.0439, + "step": 5268 + }, + { + "epoch": 0.3801450164135493, + "grad_norm": 3.356187878820537, + "learning_rate": 2.8459894671214742e-06, + "loss": 1.0237, + "step": 5269 + }, + { + "epoch": 0.3802171638829768, + "grad_norm": 2.308139298015388, + "learning_rate": 2.845565954280853e-06, + "loss": 0.8871, + "step": 5270 + }, + { + "epoch": 0.3802893113524043, + "grad_norm": 2.756366469502337, + "learning_rate": 2.845142395267005e-06, + "loss": 0.7772, + "step": 5271 + }, + { + "epoch": 0.3803614588218318, + "grad_norm": 2.5812315748540815, + "learning_rate": 2.8447187901030596e-06, + "loss": 0.8012, + "step": 5272 + }, + { + "epoch": 0.38043360629125933, + "grad_norm": 2.395249624602863, + "learning_rate": 2.8442951388121487e-06, + "loss": 0.9414, + "step": 5273 + }, + { + "epoch": 0.38050575376068685, + "grad_norm": 2.098649512606599, + "learning_rate": 2.8438714414174048e-06, + "loss": 0.8899, + "step": 5274 + }, + { + "epoch": 0.38057790123011437, + "grad_norm": 3.944427693171743, + "learning_rate": 2.8434476979419666e-06, + "loss": 1.0402, + "step": 5275 + }, + { + "epoch": 0.3806500486995419, + "grad_norm": 2.501064074086377, + "learning_rate": 2.8430239084089713e-06, + "loss": 0.878, + "step": 5276 + }, + { + "epoch": 0.38072219616896935, + "grad_norm": 3.0017742632344944, + "learning_rate": 2.8426000728415606e-06, + "loss": 0.8884, + "step": 5277 + }, + { + "epoch": 0.38079434363839687, + "grad_norm": 2.175016368058977, + "learning_rate": 2.8421761912628794e-06, + "loss": 0.8278, + "step": 5278 + }, + { + "epoch": 0.3808664911078244, + "grad_norm": 2.071364455804616, + "learning_rate": 2.841752263696075e-06, + "loss": 0.8687, + "step": 5279 + }, + { + "epoch": 0.3809386385772519, + "grad_norm": 0.7490106653668848, + "learning_rate": 2.8413282901642945e-06, + "loss": 0.7517, + "step": 5280 + }, + { + "epoch": 0.38101078604667943, + "grad_norm": 2.4409418503626434, + "learning_rate": 2.8409042706906912e-06, + "loss": 0.9286, + "step": 5281 + }, + { + "epoch": 0.38108293351610695, + "grad_norm": 2.40216232483129, + "learning_rate": 2.8404802052984184e-06, + "loss": 0.8906, + "step": 5282 + }, + { + "epoch": 0.3811550809855344, + "grad_norm": 3.1402788113770397, + "learning_rate": 2.8400560940106327e-06, + "loss": 0.9502, + "step": 5283 + }, + { + "epoch": 0.38122722845496193, + "grad_norm": 2.4200721182993234, + "learning_rate": 2.839631936850493e-06, + "loss": 0.9992, + "step": 5284 + }, + { + "epoch": 0.38129937592438945, + "grad_norm": 3.7039866028578436, + "learning_rate": 2.8392077338411624e-06, + "loss": 0.983, + "step": 5285 + }, + { + "epoch": 0.38137152339381697, + "grad_norm": 4.619437300528143, + "learning_rate": 2.8387834850058042e-06, + "loss": 0.9695, + "step": 5286 + }, + { + "epoch": 0.3814436708632445, + "grad_norm": 3.73342794116808, + "learning_rate": 2.838359190367584e-06, + "loss": 0.848, + "step": 5287 + }, + { + "epoch": 0.381515818332672, + "grad_norm": 3.7467209785117856, + "learning_rate": 2.8379348499496727e-06, + "loss": 0.9812, + "step": 5288 + }, + { + "epoch": 0.38158796580209947, + "grad_norm": 2.5171615930759668, + "learning_rate": 2.837510463775241e-06, + "loss": 0.9898, + "step": 5289 + }, + { + "epoch": 0.381660113271527, + "grad_norm": 2.777964590996319, + "learning_rate": 2.8370860318674627e-06, + "loss": 0.9468, + "step": 5290 + }, + { + "epoch": 0.3817322607409545, + "grad_norm": 2.241113704658251, + "learning_rate": 2.836661554249516e-06, + "loss": 0.8929, + "step": 5291 + }, + { + "epoch": 0.38180440821038203, + "grad_norm": 3.0777082596818475, + "learning_rate": 2.836237030944578e-06, + "loss": 0.8267, + "step": 5292 + }, + { + "epoch": 0.38187655567980955, + "grad_norm": 2.8600067941544935, + "learning_rate": 2.835812461975832e-06, + "loss": 0.9689, + "step": 5293 + }, + { + "epoch": 0.38194870314923707, + "grad_norm": 2.0668870910899932, + "learning_rate": 2.8353878473664612e-06, + "loss": 0.981, + "step": 5294 + }, + { + "epoch": 0.38202085061866453, + "grad_norm": 3.4748742279784106, + "learning_rate": 2.834963187139653e-06, + "loss": 0.9296, + "step": 5295 + }, + { + "epoch": 0.38209299808809205, + "grad_norm": 2.8562928979102167, + "learning_rate": 2.8345384813185956e-06, + "loss": 0.973, + "step": 5296 + }, + { + "epoch": 0.38216514555751957, + "grad_norm": 2.171242271853207, + "learning_rate": 2.834113729926481e-06, + "loss": 0.9228, + "step": 5297 + }, + { + "epoch": 0.3822372930269471, + "grad_norm": 3.227794199683092, + "learning_rate": 2.833688932986504e-06, + "loss": 0.8983, + "step": 5298 + }, + { + "epoch": 0.3823094404963746, + "grad_norm": 5.252930686030068, + "learning_rate": 2.83326409052186e-06, + "loss": 0.9543, + "step": 5299 + }, + { + "epoch": 0.38238158796580207, + "grad_norm": 2.747143735965537, + "learning_rate": 2.832839202555749e-06, + "loss": 0.9941, + "step": 5300 + }, + { + "epoch": 0.3824537354352296, + "grad_norm": 1.803838819298357, + "learning_rate": 2.8324142691113723e-06, + "loss": 0.9851, + "step": 5301 + }, + { + "epoch": 0.3825258829046571, + "grad_norm": 2.233311433352614, + "learning_rate": 2.831989290211933e-06, + "loss": 0.9383, + "step": 5302 + }, + { + "epoch": 0.38259803037408463, + "grad_norm": 2.4891140440963775, + "learning_rate": 2.83156426588064e-06, + "loss": 1.0099, + "step": 5303 + }, + { + "epoch": 0.38267017784351215, + "grad_norm": 2.2245445342490173, + "learning_rate": 2.8311391961406996e-06, + "loss": 0.9524, + "step": 5304 + }, + { + "epoch": 0.38274232531293967, + "grad_norm": 1.98427725723922, + "learning_rate": 2.830714081015325e-06, + "loss": 0.8966, + "step": 5305 + }, + { + "epoch": 0.38281447278236713, + "grad_norm": 2.8904382593819276, + "learning_rate": 2.8302889205277293e-06, + "loss": 0.9532, + "step": 5306 + }, + { + "epoch": 0.38288662025179465, + "grad_norm": 2.8758130789697267, + "learning_rate": 2.829863714701129e-06, + "loss": 0.926, + "step": 5307 + }, + { + "epoch": 0.38295876772122217, + "grad_norm": 2.5747716191316923, + "learning_rate": 2.8294384635587443e-06, + "loss": 0.8645, + "step": 5308 + }, + { + "epoch": 0.3830309151906497, + "grad_norm": 2.2849876406813245, + "learning_rate": 2.829013167123795e-06, + "loss": 0.9543, + "step": 5309 + }, + { + "epoch": 0.3831030626600772, + "grad_norm": 2.2534937753872826, + "learning_rate": 2.828587825419506e-06, + "loss": 0.9396, + "step": 5310 + }, + { + "epoch": 0.38317521012950473, + "grad_norm": 2.800587922770018, + "learning_rate": 2.828162438469102e-06, + "loss": 0.9958, + "step": 5311 + }, + { + "epoch": 0.3832473575989322, + "grad_norm": 2.4637911282478497, + "learning_rate": 2.8277370062958134e-06, + "loss": 0.9846, + "step": 5312 + }, + { + "epoch": 0.3833195050683597, + "grad_norm": 0.8597603367491627, + "learning_rate": 2.8273115289228715e-06, + "loss": 0.8399, + "step": 5313 + }, + { + "epoch": 0.38339165253778723, + "grad_norm": 2.5833871487170192, + "learning_rate": 2.8268860063735096e-06, + "loss": 0.9281, + "step": 5314 + }, + { + "epoch": 0.38346380000721475, + "grad_norm": 4.332867059319424, + "learning_rate": 2.826460438670964e-06, + "loss": 0.8988, + "step": 5315 + }, + { + "epoch": 0.38353594747664227, + "grad_norm": 2.5458666905924807, + "learning_rate": 2.826034825838473e-06, + "loss": 0.9428, + "step": 5316 + }, + { + "epoch": 0.3836080949460698, + "grad_norm": 5.260333472875043, + "learning_rate": 2.825609167899278e-06, + "loss": 0.8445, + "step": 5317 + }, + { + "epoch": 0.38368024241549725, + "grad_norm": 2.503722471256295, + "learning_rate": 2.825183464876623e-06, + "loss": 1.0623, + "step": 5318 + }, + { + "epoch": 0.38375238988492477, + "grad_norm": 1.8581039828880312, + "learning_rate": 2.8247577167937533e-06, + "loss": 0.8776, + "step": 5319 + }, + { + "epoch": 0.3838245373543523, + "grad_norm": 2.537051205338978, + "learning_rate": 2.8243319236739186e-06, + "loss": 0.9469, + "step": 5320 + }, + { + "epoch": 0.3838966848237798, + "grad_norm": 2.078317618944136, + "learning_rate": 2.823906085540368e-06, + "loss": 0.9406, + "step": 5321 + }, + { + "epoch": 0.38396883229320733, + "grad_norm": 0.7783635796327407, + "learning_rate": 2.8234802024163566e-06, + "loss": 0.8462, + "step": 5322 + }, + { + "epoch": 0.38404097976263485, + "grad_norm": 2.494149419400118, + "learning_rate": 2.8230542743251396e-06, + "loss": 0.8326, + "step": 5323 + }, + { + "epoch": 0.3841131272320623, + "grad_norm": 2.6737420769743947, + "learning_rate": 2.8226283012899757e-06, + "loss": 1.0026, + "step": 5324 + }, + { + "epoch": 0.38418527470148983, + "grad_norm": 2.3318789127248523, + "learning_rate": 2.822202283334126e-06, + "loss": 0.9526, + "step": 5325 + }, + { + "epoch": 0.38425742217091735, + "grad_norm": 2.661764636146661, + "learning_rate": 2.8217762204808525e-06, + "loss": 1.0235, + "step": 5326 + }, + { + "epoch": 0.38432956964034487, + "grad_norm": 2.2889881317244294, + "learning_rate": 2.821350112753422e-06, + "loss": 0.9319, + "step": 5327 + }, + { + "epoch": 0.3844017171097724, + "grad_norm": 6.453461054071375, + "learning_rate": 2.8209239601751025e-06, + "loss": 0.9215, + "step": 5328 + }, + { + "epoch": 0.3844738645791999, + "grad_norm": 2.2354736828058015, + "learning_rate": 2.820497762769164e-06, + "loss": 0.8936, + "step": 5329 + }, + { + "epoch": 0.3845460120486274, + "grad_norm": 7.356759483306141, + "learning_rate": 2.8200715205588807e-06, + "loss": 0.9227, + "step": 5330 + }, + { + "epoch": 0.3846181595180549, + "grad_norm": 2.387805447836982, + "learning_rate": 2.8196452335675275e-06, + "loss": 0.9137, + "step": 5331 + }, + { + "epoch": 0.3846903069874824, + "grad_norm": 2.168224862581835, + "learning_rate": 2.819218901818382e-06, + "loss": 0.9911, + "step": 5332 + }, + { + "epoch": 0.38476245445690993, + "grad_norm": 2.767060246872221, + "learning_rate": 2.818792525334725e-06, + "loss": 0.8088, + "step": 5333 + }, + { + "epoch": 0.38483460192633745, + "grad_norm": 2.6183583843053473, + "learning_rate": 2.818366104139839e-06, + "loss": 0.9671, + "step": 5334 + }, + { + "epoch": 0.38490674939576497, + "grad_norm": 2.108782988936389, + "learning_rate": 2.81793963825701e-06, + "loss": 1.0136, + "step": 5335 + }, + { + "epoch": 0.38497889686519243, + "grad_norm": 2.6762740300973635, + "learning_rate": 2.817513127709525e-06, + "loss": 1.0402, + "step": 5336 + }, + { + "epoch": 0.38505104433461995, + "grad_norm": 2.0045935807263175, + "learning_rate": 2.817086572520674e-06, + "loss": 0.8931, + "step": 5337 + }, + { + "epoch": 0.38512319180404747, + "grad_norm": 3.4680884477892793, + "learning_rate": 2.8166599727137508e-06, + "loss": 1.0195, + "step": 5338 + }, + { + "epoch": 0.385195339273475, + "grad_norm": 4.474074386257959, + "learning_rate": 2.816233328312049e-06, + "loss": 1.0183, + "step": 5339 + }, + { + "epoch": 0.3852674867429025, + "grad_norm": 2.5746745746490447, + "learning_rate": 2.815806639338867e-06, + "loss": 0.9949, + "step": 5340 + }, + { + "epoch": 0.38533963421233003, + "grad_norm": 2.7629516094331685, + "learning_rate": 2.8153799058175046e-06, + "loss": 0.9552, + "step": 5341 + }, + { + "epoch": 0.3854117816817575, + "grad_norm": 4.185791122423172, + "learning_rate": 2.814953127771264e-06, + "loss": 0.9638, + "step": 5342 + }, + { + "epoch": 0.385483929151185, + "grad_norm": 2.096148014397003, + "learning_rate": 2.8145263052234498e-06, + "loss": 0.9019, + "step": 5343 + }, + { + "epoch": 0.38555607662061253, + "grad_norm": 2.851365006806139, + "learning_rate": 2.8140994381973694e-06, + "loss": 1.0725, + "step": 5344 + }, + { + "epoch": 0.38562822409004005, + "grad_norm": 3.94955971110972, + "learning_rate": 2.8136725267163323e-06, + "loss": 0.9402, + "step": 5345 + }, + { + "epoch": 0.38570037155946757, + "grad_norm": 1.8898957792944162, + "learning_rate": 2.813245570803651e-06, + "loss": 1.0605, + "step": 5346 + }, + { + "epoch": 0.3857725190288951, + "grad_norm": 3.3823161476846284, + "learning_rate": 2.8128185704826387e-06, + "loss": 0.9649, + "step": 5347 + }, + { + "epoch": 0.38584466649832255, + "grad_norm": 1.7451498394661475, + "learning_rate": 2.812391525776614e-06, + "loss": 0.914, + "step": 5348 + }, + { + "epoch": 0.38591681396775007, + "grad_norm": 2.5669035872397084, + "learning_rate": 2.811964436708895e-06, + "loss": 1.0306, + "step": 5349 + }, + { + "epoch": 0.3859889614371776, + "grad_norm": 1.9502467806256152, + "learning_rate": 2.8115373033028045e-06, + "loss": 0.927, + "step": 5350 + }, + { + "epoch": 0.3860611089066051, + "grad_norm": 2.1712066939476347, + "learning_rate": 2.811110125581665e-06, + "loss": 0.951, + "step": 5351 + }, + { + "epoch": 0.38613325637603263, + "grad_norm": 38.95186676918948, + "learning_rate": 2.8106829035688055e-06, + "loss": 0.8933, + "step": 5352 + }, + { + "epoch": 0.3862054038454601, + "grad_norm": 2.6969702004330576, + "learning_rate": 2.810255637287553e-06, + "loss": 1.0123, + "step": 5353 + }, + { + "epoch": 0.3862775513148876, + "grad_norm": 2.6824936478993773, + "learning_rate": 2.8098283267612398e-06, + "loss": 0.9329, + "step": 5354 + }, + { + "epoch": 0.38634969878431513, + "grad_norm": 2.692634114190116, + "learning_rate": 2.8094009720131988e-06, + "loss": 0.854, + "step": 5355 + }, + { + "epoch": 0.38642184625374265, + "grad_norm": 2.0175635899539963, + "learning_rate": 2.808973573066768e-06, + "loss": 0.9798, + "step": 5356 + }, + { + "epoch": 0.38649399372317017, + "grad_norm": 2.159505625416061, + "learning_rate": 2.8085461299452848e-06, + "loss": 0.9575, + "step": 5357 + }, + { + "epoch": 0.3865661411925977, + "grad_norm": 3.0560668016692056, + "learning_rate": 2.8081186426720905e-06, + "loss": 1.0544, + "step": 5358 + }, + { + "epoch": 0.38663828866202515, + "grad_norm": 1.9351041193113314, + "learning_rate": 2.8076911112705286e-06, + "loss": 0.9, + "step": 5359 + }, + { + "epoch": 0.3867104361314527, + "grad_norm": 2.2274959280745326, + "learning_rate": 2.8072635357639455e-06, + "loss": 0.9675, + "step": 5360 + }, + { + "epoch": 0.3867825836008802, + "grad_norm": 2.1871196961197685, + "learning_rate": 2.8068359161756884e-06, + "loss": 1.0051, + "step": 5361 + }, + { + "epoch": 0.3868547310703077, + "grad_norm": 2.836399513631275, + "learning_rate": 2.806408252529109e-06, + "loss": 0.9914, + "step": 5362 + }, + { + "epoch": 0.38692687853973523, + "grad_norm": 0.8519554631386094, + "learning_rate": 2.8059805448475607e-06, + "loss": 0.8402, + "step": 5363 + }, + { + "epoch": 0.38699902600916275, + "grad_norm": 2.34312583241584, + "learning_rate": 2.805552793154398e-06, + "loss": 0.9744, + "step": 5364 + }, + { + "epoch": 0.3870711734785902, + "grad_norm": 1.976553935759917, + "learning_rate": 2.805124997472979e-06, + "loss": 0.8906, + "step": 5365 + }, + { + "epoch": 0.38714332094801773, + "grad_norm": 6.476026487544875, + "learning_rate": 2.804697157826665e-06, + "loss": 0.8148, + "step": 5366 + }, + { + "epoch": 0.38721546841744525, + "grad_norm": 2.480636855308785, + "learning_rate": 2.8042692742388173e-06, + "loss": 0.8545, + "step": 5367 + }, + { + "epoch": 0.38728761588687277, + "grad_norm": 2.0110535345857223, + "learning_rate": 2.8038413467328025e-06, + "loss": 0.9163, + "step": 5368 + }, + { + "epoch": 0.3873597633563003, + "grad_norm": 2.230051512700569, + "learning_rate": 2.8034133753319874e-06, + "loss": 1.0485, + "step": 5369 + }, + { + "epoch": 0.3874319108257278, + "grad_norm": 2.1758394353404316, + "learning_rate": 2.802985360059742e-06, + "loss": 0.8886, + "step": 5370 + }, + { + "epoch": 0.3875040582951553, + "grad_norm": 2.2767711200834064, + "learning_rate": 2.802557300939439e-06, + "loss": 0.9072, + "step": 5371 + }, + { + "epoch": 0.3875762057645828, + "grad_norm": 1.9152544942130143, + "learning_rate": 2.802129197994452e-06, + "loss": 1.0221, + "step": 5372 + }, + { + "epoch": 0.3876483532340103, + "grad_norm": 2.0483761007853403, + "learning_rate": 2.801701051248159e-06, + "loss": 0.9786, + "step": 5373 + }, + { + "epoch": 0.38772050070343783, + "grad_norm": 4.982637969698878, + "learning_rate": 2.80127286072394e-06, + "loss": 0.8971, + "step": 5374 + }, + { + "epoch": 0.38779264817286535, + "grad_norm": 2.1572580607209826, + "learning_rate": 2.8008446264451758e-06, + "loss": 0.8609, + "step": 5375 + }, + { + "epoch": 0.38786479564229287, + "grad_norm": 9.821364365404985, + "learning_rate": 2.8004163484352516e-06, + "loss": 0.8961, + "step": 5376 + }, + { + "epoch": 0.38793694311172033, + "grad_norm": 2.7995304735790176, + "learning_rate": 2.799988026717553e-06, + "loss": 0.8917, + "step": 5377 + }, + { + "epoch": 0.38800909058114785, + "grad_norm": 3.546323758616482, + "learning_rate": 2.7995596613154697e-06, + "loss": 0.9127, + "step": 5378 + }, + { + "epoch": 0.3880812380505754, + "grad_norm": 2.3842858934063638, + "learning_rate": 2.7991312522523937e-06, + "loss": 0.8994, + "step": 5379 + }, + { + "epoch": 0.3881533855200029, + "grad_norm": 2.0966468243450964, + "learning_rate": 2.798702799551718e-06, + "loss": 0.9013, + "step": 5380 + }, + { + "epoch": 0.3882255329894304, + "grad_norm": 2.381142853656677, + "learning_rate": 2.7982743032368393e-06, + "loss": 0.8857, + "step": 5381 + }, + { + "epoch": 0.38829768045885793, + "grad_norm": 3.669022381368823, + "learning_rate": 2.797845763331156e-06, + "loss": 0.8786, + "step": 5382 + }, + { + "epoch": 0.3883698279282854, + "grad_norm": 2.7046201242465626, + "learning_rate": 2.797417179858069e-06, + "loss": 0.8289, + "step": 5383 + }, + { + "epoch": 0.3884419753977129, + "grad_norm": 3.913471961633893, + "learning_rate": 2.7969885528409814e-06, + "loss": 1.0305, + "step": 5384 + }, + { + "epoch": 0.38851412286714043, + "grad_norm": 2.1806244835889865, + "learning_rate": 2.7965598823033e-06, + "loss": 1.0628, + "step": 5385 + }, + { + "epoch": 0.38858627033656795, + "grad_norm": 2.1516499863777554, + "learning_rate": 2.7961311682684317e-06, + "loss": 1.0008, + "step": 5386 + }, + { + "epoch": 0.38865841780599547, + "grad_norm": 2.890381508959325, + "learning_rate": 2.795702410759788e-06, + "loss": 0.8557, + "step": 5387 + }, + { + "epoch": 0.388730565275423, + "grad_norm": 2.5762505392770882, + "learning_rate": 2.7952736098007803e-06, + "loss": 0.9591, + "step": 5388 + }, + { + "epoch": 0.38880271274485045, + "grad_norm": 2.5745196477748165, + "learning_rate": 2.794844765414825e-06, + "loss": 0.8932, + "step": 5389 + }, + { + "epoch": 0.388874860214278, + "grad_norm": 4.002401346376396, + "learning_rate": 2.79441587762534e-06, + "loss": 0.7912, + "step": 5390 + }, + { + "epoch": 0.3889470076837055, + "grad_norm": 2.0540620845560373, + "learning_rate": 2.7939869464557445e-06, + "loss": 0.9718, + "step": 5391 + }, + { + "epoch": 0.389019155153133, + "grad_norm": 3.930923419238472, + "learning_rate": 2.793557971929461e-06, + "loss": 0.8826, + "step": 5392 + }, + { + "epoch": 0.38909130262256053, + "grad_norm": 2.9633231906552804, + "learning_rate": 2.7931289540699147e-06, + "loss": 0.9486, + "step": 5393 + }, + { + "epoch": 0.38916345009198805, + "grad_norm": 2.445326344996249, + "learning_rate": 2.792699892900532e-06, + "loss": 0.9322, + "step": 5394 + }, + { + "epoch": 0.3892355975614155, + "grad_norm": 2.065496175791539, + "learning_rate": 2.7922707884447425e-06, + "loss": 0.9724, + "step": 5395 + }, + { + "epoch": 0.38930774503084303, + "grad_norm": 2.7525809054563255, + "learning_rate": 2.7918416407259783e-06, + "loss": 0.9999, + "step": 5396 + }, + { + "epoch": 0.38937989250027055, + "grad_norm": 3.485419557292414, + "learning_rate": 2.791412449767673e-06, + "loss": 0.9615, + "step": 5397 + }, + { + "epoch": 0.38945203996969807, + "grad_norm": 5.613335106789738, + "learning_rate": 2.7909832155932647e-06, + "loss": 0.903, + "step": 5398 + }, + { + "epoch": 0.3895241874391256, + "grad_norm": 1.5590135969703183, + "learning_rate": 2.79055393822619e-06, + "loss": 0.9305, + "step": 5399 + }, + { + "epoch": 0.3895963349085531, + "grad_norm": 2.0630108749687333, + "learning_rate": 2.7901246176898914e-06, + "loss": 1.0152, + "step": 5400 + }, + { + "epoch": 0.3896684823779806, + "grad_norm": 2.5495968182036717, + "learning_rate": 2.7896952540078123e-06, + "loss": 0.9498, + "step": 5401 + }, + { + "epoch": 0.3897406298474081, + "grad_norm": 2.77502868568784, + "learning_rate": 2.7892658472033993e-06, + "loss": 0.954, + "step": 5402 + }, + { + "epoch": 0.3898127773168356, + "grad_norm": 2.2443950565284374, + "learning_rate": 2.7888363973000996e-06, + "loss": 0.8782, + "step": 5403 + }, + { + "epoch": 0.38988492478626313, + "grad_norm": 2.240977954597761, + "learning_rate": 2.7884069043213646e-06, + "loss": 0.946, + "step": 5404 + }, + { + "epoch": 0.38995707225569065, + "grad_norm": 0.7628634727790983, + "learning_rate": 2.7879773682906474e-06, + "loss": 0.8057, + "step": 5405 + }, + { + "epoch": 0.3900292197251181, + "grad_norm": 2.2761082643124824, + "learning_rate": 2.7875477892314022e-06, + "loss": 0.858, + "step": 5406 + }, + { + "epoch": 0.39010136719454563, + "grad_norm": 0.8473547825700891, + "learning_rate": 2.787118167167089e-06, + "loss": 0.831, + "step": 5407 + }, + { + "epoch": 0.39017351466397315, + "grad_norm": 2.808271917545079, + "learning_rate": 2.786688502121166e-06, + "loss": 0.9262, + "step": 5408 + }, + { + "epoch": 0.3902456621334007, + "grad_norm": 0.9059071056561023, + "learning_rate": 2.7862587941170957e-06, + "loss": 0.8982, + "step": 5409 + }, + { + "epoch": 0.3903178096028282, + "grad_norm": 2.0131809291040152, + "learning_rate": 2.7858290431783437e-06, + "loss": 0.9817, + "step": 5410 + }, + { + "epoch": 0.3903899570722557, + "grad_norm": 0.7028787181592407, + "learning_rate": 2.7853992493283766e-06, + "loss": 0.8039, + "step": 5411 + }, + { + "epoch": 0.3904621045416832, + "grad_norm": 0.7545456225743895, + "learning_rate": 2.784969412590663e-06, + "loss": 0.7786, + "step": 5412 + }, + { + "epoch": 0.3905342520111107, + "grad_norm": 3.0014169843391296, + "learning_rate": 2.784539532988677e-06, + "loss": 0.9843, + "step": 5413 + }, + { + "epoch": 0.3906063994805382, + "grad_norm": 2.3387277326640095, + "learning_rate": 2.784109610545891e-06, + "loss": 0.7925, + "step": 5414 + }, + { + "epoch": 0.39067854694996573, + "grad_norm": 3.0061085816427964, + "learning_rate": 2.783679645285781e-06, + "loss": 0.9605, + "step": 5415 + }, + { + "epoch": 0.39075069441939325, + "grad_norm": 1.9622275679704986, + "learning_rate": 2.7832496372318277e-06, + "loss": 0.9979, + "step": 5416 + }, + { + "epoch": 0.39082284188882077, + "grad_norm": 4.133465720691989, + "learning_rate": 2.782819586407511e-06, + "loss": 0.8409, + "step": 5417 + }, + { + "epoch": 0.39089498935824823, + "grad_norm": 2.803213853395319, + "learning_rate": 2.7823894928363144e-06, + "loss": 1.0178, + "step": 5418 + }, + { + "epoch": 0.39096713682767575, + "grad_norm": 2.076202191394369, + "learning_rate": 2.7819593565417235e-06, + "loss": 0.8684, + "step": 5419 + }, + { + "epoch": 0.3910392842971033, + "grad_norm": 2.97049541361695, + "learning_rate": 2.7815291775472276e-06, + "loss": 0.9186, + "step": 5420 + }, + { + "epoch": 0.3911114317665308, + "grad_norm": 3.252221155433261, + "learning_rate": 2.7810989558763158e-06, + "loss": 0.864, + "step": 5421 + }, + { + "epoch": 0.3911835792359583, + "grad_norm": 1.7448760358614825, + "learning_rate": 2.7806686915524823e-06, + "loss": 0.9132, + "step": 5422 + }, + { + "epoch": 0.39125572670538583, + "grad_norm": 4.158310458337607, + "learning_rate": 2.780238384599221e-06, + "loss": 0.9379, + "step": 5423 + }, + { + "epoch": 0.3913278741748133, + "grad_norm": 2.390983604512147, + "learning_rate": 2.7798080350400304e-06, + "loss": 0.9605, + "step": 5424 + }, + { + "epoch": 0.3914000216442408, + "grad_norm": 2.9256114222866074, + "learning_rate": 2.779377642898409e-06, + "loss": 0.8373, + "step": 5425 + }, + { + "epoch": 0.39147216911366833, + "grad_norm": 2.4785215890270047, + "learning_rate": 2.7789472081978596e-06, + "loss": 0.8901, + "step": 5426 + }, + { + "epoch": 0.39154431658309585, + "grad_norm": 2.875640134450298, + "learning_rate": 2.7785167309618873e-06, + "loss": 0.9179, + "step": 5427 + }, + { + "epoch": 0.39161646405252337, + "grad_norm": 2.6084998027910387, + "learning_rate": 2.778086211213998e-06, + "loss": 0.8494, + "step": 5428 + }, + { + "epoch": 0.3916886115219509, + "grad_norm": 1.9209916480546583, + "learning_rate": 2.7776556489777015e-06, + "loss": 1.0537, + "step": 5429 + }, + { + "epoch": 0.39176075899137836, + "grad_norm": 2.3948123138409345, + "learning_rate": 2.777225044276508e-06, + "loss": 0.8781, + "step": 5430 + }, + { + "epoch": 0.3918329064608059, + "grad_norm": 3.1677143639194862, + "learning_rate": 2.7767943971339326e-06, + "loss": 1.0377, + "step": 5431 + }, + { + "epoch": 0.3919050539302334, + "grad_norm": 2.0987002937469734, + "learning_rate": 2.7763637075734904e-06, + "loss": 0.8917, + "step": 5432 + }, + { + "epoch": 0.3919772013996609, + "grad_norm": 2.6500191705838034, + "learning_rate": 2.7759329756187002e-06, + "loss": 0.8943, + "step": 5433 + }, + { + "epoch": 0.39204934886908843, + "grad_norm": 4.6613487642790785, + "learning_rate": 2.7755022012930833e-06, + "loss": 0.9675, + "step": 5434 + }, + { + "epoch": 0.39212149633851595, + "grad_norm": 2.7421350718645963, + "learning_rate": 2.775071384620161e-06, + "loss": 0.8629, + "step": 5435 + }, + { + "epoch": 0.3921936438079434, + "grad_norm": 1.9088266187677594, + "learning_rate": 2.7746405256234607e-06, + "loss": 0.9698, + "step": 5436 + }, + { + "epoch": 0.39226579127737093, + "grad_norm": 2.802137145101227, + "learning_rate": 2.774209624326508e-06, + "loss": 0.8801, + "step": 5437 + }, + { + "epoch": 0.39233793874679845, + "grad_norm": 3.3750772820560826, + "learning_rate": 2.773778680752834e-06, + "loss": 1.0434, + "step": 5438 + }, + { + "epoch": 0.392410086216226, + "grad_norm": 2.8497373828021315, + "learning_rate": 2.7733476949259707e-06, + "loss": 0.9384, + "step": 5439 + }, + { + "epoch": 0.3924822336856535, + "grad_norm": 0.7739816158141918, + "learning_rate": 2.772916666869453e-06, + "loss": 0.8412, + "step": 5440 + }, + { + "epoch": 0.392554381155081, + "grad_norm": 2.1483160920951963, + "learning_rate": 2.772485596606817e-06, + "loss": 1.0254, + "step": 5441 + }, + { + "epoch": 0.3926265286245085, + "grad_norm": 2.0669045090969593, + "learning_rate": 2.772054484161602e-06, + "loss": 0.9174, + "step": 5442 + }, + { + "epoch": 0.392698676093936, + "grad_norm": 2.129604288156206, + "learning_rate": 2.7716233295573495e-06, + "loss": 0.9064, + "step": 5443 + }, + { + "epoch": 0.3927708235633635, + "grad_norm": 0.753702837922011, + "learning_rate": 2.771192132817604e-06, + "loss": 0.7833, + "step": 5444 + }, + { + "epoch": 0.39284297103279103, + "grad_norm": 2.5931417832161294, + "learning_rate": 2.7707608939659103e-06, + "loss": 0.9492, + "step": 5445 + }, + { + "epoch": 0.39291511850221855, + "grad_norm": 2.588846190031785, + "learning_rate": 2.770329613025818e-06, + "loss": 0.8637, + "step": 5446 + }, + { + "epoch": 0.39298726597164607, + "grad_norm": 4.099547496108755, + "learning_rate": 2.7698982900208775e-06, + "loss": 0.9408, + "step": 5447 + }, + { + "epoch": 0.39305941344107354, + "grad_norm": 3.1316418065709697, + "learning_rate": 2.769466924974641e-06, + "loss": 0.9972, + "step": 5448 + }, + { + "epoch": 0.39313156091050105, + "grad_norm": 2.340729763808491, + "learning_rate": 2.7690355179106642e-06, + "loss": 0.9238, + "step": 5449 + }, + { + "epoch": 0.3932037083799286, + "grad_norm": 1.910234071359372, + "learning_rate": 2.7686040688525048e-06, + "loss": 0.7844, + "step": 5450 + }, + { + "epoch": 0.3932758558493561, + "grad_norm": 14.597171749887139, + "learning_rate": 2.768172577823723e-06, + "loss": 0.877, + "step": 5451 + }, + { + "epoch": 0.3933480033187836, + "grad_norm": 2.7451517543292345, + "learning_rate": 2.767741044847879e-06, + "loss": 0.8592, + "step": 5452 + }, + { + "epoch": 0.3934201507882111, + "grad_norm": 1.999020336541928, + "learning_rate": 2.7673094699485394e-06, + "loss": 0.9424, + "step": 5453 + }, + { + "epoch": 0.3934922982576386, + "grad_norm": 2.2185588270757113, + "learning_rate": 2.7668778531492702e-06, + "loss": 0.9124, + "step": 5454 + }, + { + "epoch": 0.3935644457270661, + "grad_norm": 2.133688945734799, + "learning_rate": 2.76644619447364e-06, + "loss": 0.9394, + "step": 5455 + }, + { + "epoch": 0.39363659319649363, + "grad_norm": 2.2390392145195883, + "learning_rate": 2.766014493945221e-06, + "loss": 0.9035, + "step": 5456 + }, + { + "epoch": 0.39370874066592115, + "grad_norm": 2.1289651565101475, + "learning_rate": 2.7655827515875855e-06, + "loss": 0.9778, + "step": 5457 + }, + { + "epoch": 0.3937808881353487, + "grad_norm": 2.606803220585181, + "learning_rate": 2.7651509674243105e-06, + "loss": 0.9662, + "step": 5458 + }, + { + "epoch": 0.39385303560477614, + "grad_norm": 2.556909557212505, + "learning_rate": 2.7647191414789733e-06, + "loss": 0.9406, + "step": 5459 + }, + { + "epoch": 0.39392518307420366, + "grad_norm": 2.4875409568596942, + "learning_rate": 2.7642872737751553e-06, + "loss": 0.9366, + "step": 5460 + }, + { + "epoch": 0.3939973305436312, + "grad_norm": 1.7449015188983263, + "learning_rate": 2.763855364336438e-06, + "loss": 0.8857, + "step": 5461 + }, + { + "epoch": 0.3940694780130587, + "grad_norm": 2.7503037718409113, + "learning_rate": 2.763423413186407e-06, + "loss": 0.9664, + "step": 5462 + }, + { + "epoch": 0.3941416254824862, + "grad_norm": 1.894558306628639, + "learning_rate": 2.76299142034865e-06, + "loss": 0.9628, + "step": 5463 + }, + { + "epoch": 0.39421377295191373, + "grad_norm": 0.7443157000106532, + "learning_rate": 2.762559385846755e-06, + "loss": 0.7742, + "step": 5464 + }, + { + "epoch": 0.3942859204213412, + "grad_norm": 3.0458278323713968, + "learning_rate": 2.762127309704316e-06, + "loss": 0.9114, + "step": 5465 + }, + { + "epoch": 0.3943580678907687, + "grad_norm": 1.9944009489536896, + "learning_rate": 2.761695191944925e-06, + "loss": 1.0148, + "step": 5466 + }, + { + "epoch": 0.39443021536019623, + "grad_norm": 2.8092124374253373, + "learning_rate": 2.7612630325921797e-06, + "loss": 0.8843, + "step": 5467 + }, + { + "epoch": 0.39450236282962375, + "grad_norm": 2.9377618226589375, + "learning_rate": 2.7608308316696787e-06, + "loss": 0.9539, + "step": 5468 + }, + { + "epoch": 0.3945745102990513, + "grad_norm": 2.6816437367687245, + "learning_rate": 2.7603985892010214e-06, + "loss": 0.8724, + "step": 5469 + }, + { + "epoch": 0.3946466577684788, + "grad_norm": 2.2182700484277076, + "learning_rate": 2.759966305209813e-06, + "loss": 0.918, + "step": 5470 + }, + { + "epoch": 0.39471880523790626, + "grad_norm": 2.7541563529217274, + "learning_rate": 2.7595339797196575e-06, + "loss": 0.936, + "step": 5471 + }, + { + "epoch": 0.3947909527073338, + "grad_norm": 2.378128099878123, + "learning_rate": 2.7591016127541637e-06, + "loss": 1.0153, + "step": 5472 + }, + { + "epoch": 0.3948631001767613, + "grad_norm": 2.042037489035992, + "learning_rate": 2.75866920433694e-06, + "loss": 0.9572, + "step": 5473 + }, + { + "epoch": 0.3949352476461888, + "grad_norm": 2.7572896748184794, + "learning_rate": 2.7582367544916e-06, + "loss": 0.8569, + "step": 5474 + }, + { + "epoch": 0.39500739511561633, + "grad_norm": 3.2879100239269707, + "learning_rate": 2.757804263241758e-06, + "loss": 0.9205, + "step": 5475 + }, + { + "epoch": 0.39507954258504385, + "grad_norm": 1.876334287820179, + "learning_rate": 2.7573717306110303e-06, + "loss": 0.9654, + "step": 5476 + }, + { + "epoch": 0.3951516900544713, + "grad_norm": 2.655663010308821, + "learning_rate": 2.756939156623036e-06, + "loss": 0.9507, + "step": 5477 + }, + { + "epoch": 0.39522383752389884, + "grad_norm": 2.360504890357642, + "learning_rate": 2.756506541301396e-06, + "loss": 0.9443, + "step": 5478 + }, + { + "epoch": 0.39529598499332635, + "grad_norm": 2.7358102573889904, + "learning_rate": 2.756073884669735e-06, + "loss": 0.9011, + "step": 5479 + }, + { + "epoch": 0.3953681324627539, + "grad_norm": 2.7188939900470137, + "learning_rate": 2.755641186751678e-06, + "loss": 0.9275, + "step": 5480 + }, + { + "epoch": 0.3954402799321814, + "grad_norm": 2.4745967536489992, + "learning_rate": 2.7552084475708525e-06, + "loss": 0.8445, + "step": 5481 + }, + { + "epoch": 0.3955124274016089, + "grad_norm": 2.1786783268093948, + "learning_rate": 2.7547756671508894e-06, + "loss": 0.8365, + "step": 5482 + }, + { + "epoch": 0.3955845748710364, + "grad_norm": 2.5465833812929035, + "learning_rate": 2.7543428455154216e-06, + "loss": 1.023, + "step": 5483 + }, + { + "epoch": 0.3956567223404639, + "grad_norm": 3.795290518234125, + "learning_rate": 2.753909982688084e-06, + "loss": 0.8426, + "step": 5484 + }, + { + "epoch": 0.3957288698098914, + "grad_norm": 2.816310166665686, + "learning_rate": 2.753477078692512e-06, + "loss": 1.0058, + "step": 5485 + }, + { + "epoch": 0.39580101727931893, + "grad_norm": 2.6022212480372557, + "learning_rate": 2.7530441335523464e-06, + "loss": 0.9749, + "step": 5486 + }, + { + "epoch": 0.39587316474874645, + "grad_norm": 2.627489816517065, + "learning_rate": 2.752611147291229e-06, + "loss": 0.9436, + "step": 5487 + }, + { + "epoch": 0.395945312218174, + "grad_norm": 2.445255754196312, + "learning_rate": 2.752178119932802e-06, + "loss": 1.0201, + "step": 5488 + }, + { + "epoch": 0.39601745968760144, + "grad_norm": 2.7358038084784346, + "learning_rate": 2.7517450515007124e-06, + "loss": 0.9402, + "step": 5489 + }, + { + "epoch": 0.39608960715702896, + "grad_norm": 2.3504118781629058, + "learning_rate": 2.751311942018609e-06, + "loss": 0.896, + "step": 5490 + }, + { + "epoch": 0.3961617546264565, + "grad_norm": 4.624713991961697, + "learning_rate": 2.7508787915101413e-06, + "loss": 0.9083, + "step": 5491 + }, + { + "epoch": 0.396233902095884, + "grad_norm": 2.6897187056824814, + "learning_rate": 2.750445599998963e-06, + "loss": 0.8477, + "step": 5492 + }, + { + "epoch": 0.3963060495653115, + "grad_norm": 2.1089403623032372, + "learning_rate": 2.7500123675087276e-06, + "loss": 0.9319, + "step": 5493 + }, + { + "epoch": 0.39637819703473903, + "grad_norm": 0.7228982056414337, + "learning_rate": 2.749579094063094e-06, + "loss": 0.826, + "step": 5494 + }, + { + "epoch": 0.3964503445041665, + "grad_norm": 5.0389404282089, + "learning_rate": 2.749145779685721e-06, + "loss": 0.9037, + "step": 5495 + }, + { + "epoch": 0.396522491973594, + "grad_norm": 2.423388199147947, + "learning_rate": 2.74871242440027e-06, + "loss": 1.0, + "step": 5496 + }, + { + "epoch": 0.39659463944302153, + "grad_norm": 2.0254466564995472, + "learning_rate": 2.7482790282304046e-06, + "loss": 0.9396, + "step": 5497 + }, + { + "epoch": 0.39666678691244905, + "grad_norm": 1.7973103534892474, + "learning_rate": 2.747845591199792e-06, + "loss": 0.9193, + "step": 5498 + }, + { + "epoch": 0.3967389343818766, + "grad_norm": 2.1696996978544356, + "learning_rate": 2.7474121133321004e-06, + "loss": 0.9921, + "step": 5499 + }, + { + "epoch": 0.3968110818513041, + "grad_norm": 3.0946447446485768, + "learning_rate": 2.7469785946509992e-06, + "loss": 0.9283, + "step": 5500 + }, + { + "epoch": 0.39688322932073156, + "grad_norm": 2.7688500517599106, + "learning_rate": 2.746545035180163e-06, + "loss": 0.9092, + "step": 5501 + }, + { + "epoch": 0.3969553767901591, + "grad_norm": 2.6119615903486766, + "learning_rate": 2.746111434943266e-06, + "loss": 0.9624, + "step": 5502 + }, + { + "epoch": 0.3970275242595866, + "grad_norm": 2.0575914639321113, + "learning_rate": 2.7456777939639854e-06, + "loss": 0.9266, + "step": 5503 + }, + { + "epoch": 0.3970996717290141, + "grad_norm": 2.9592096526249163, + "learning_rate": 2.7452441122660005e-06, + "loss": 0.9882, + "step": 5504 + }, + { + "epoch": 0.39717181919844163, + "grad_norm": 2.3409938628561537, + "learning_rate": 2.7448103898729935e-06, + "loss": 0.9543, + "step": 5505 + }, + { + "epoch": 0.3972439666678691, + "grad_norm": 2.7941360275207265, + "learning_rate": 2.7443766268086493e-06, + "loss": 0.9805, + "step": 5506 + }, + { + "epoch": 0.3973161141372966, + "grad_norm": 2.189303826066261, + "learning_rate": 2.7439428230966517e-06, + "loss": 0.9406, + "step": 5507 + }, + { + "epoch": 0.39738826160672414, + "grad_norm": 3.7945496773699405, + "learning_rate": 2.7435089787606913e-06, + "loss": 0.9733, + "step": 5508 + }, + { + "epoch": 0.39746040907615166, + "grad_norm": 0.641747723657474, + "learning_rate": 2.7430750938244576e-06, + "loss": 0.7818, + "step": 5509 + }, + { + "epoch": 0.3975325565455792, + "grad_norm": 2.5817804488869465, + "learning_rate": 2.742641168311644e-06, + "loss": 0.932, + "step": 5510 + }, + { + "epoch": 0.3976047040150067, + "grad_norm": 2.312849791594297, + "learning_rate": 2.7422072022459456e-06, + "loss": 0.8719, + "step": 5511 + }, + { + "epoch": 0.39767685148443416, + "grad_norm": 2.7441947524410417, + "learning_rate": 2.741773195651059e-06, + "loss": 0.9006, + "step": 5512 + }, + { + "epoch": 0.3977489989538617, + "grad_norm": 2.283307336626774, + "learning_rate": 2.7413391485506845e-06, + "loss": 1.0048, + "step": 5513 + }, + { + "epoch": 0.3978211464232892, + "grad_norm": 2.674642100411831, + "learning_rate": 2.7409050609685233e-06, + "loss": 1.043, + "step": 5514 + }, + { + "epoch": 0.3978932938927167, + "grad_norm": 2.7381615279250844, + "learning_rate": 2.7404709329282798e-06, + "loss": 1.0043, + "step": 5515 + }, + { + "epoch": 0.39796544136214423, + "grad_norm": 2.7288508827059728, + "learning_rate": 2.740036764453659e-06, + "loss": 1.0209, + "step": 5516 + }, + { + "epoch": 0.39803758883157175, + "grad_norm": 2.3142058807812624, + "learning_rate": 2.7396025555683706e-06, + "loss": 1.0148, + "step": 5517 + }, + { + "epoch": 0.3981097363009992, + "grad_norm": 0.9951705244480742, + "learning_rate": 2.739168306296124e-06, + "loss": 0.8903, + "step": 5518 + }, + { + "epoch": 0.39818188377042674, + "grad_norm": 2.3302278174817945, + "learning_rate": 2.738734016660633e-06, + "loss": 0.9796, + "step": 5519 + }, + { + "epoch": 0.39825403123985426, + "grad_norm": 2.23149563588633, + "learning_rate": 2.738299686685612e-06, + "loss": 1.0337, + "step": 5520 + }, + { + "epoch": 0.3983261787092818, + "grad_norm": 2.080089357730821, + "learning_rate": 2.737865316394778e-06, + "loss": 0.812, + "step": 5521 + }, + { + "epoch": 0.3983983261787093, + "grad_norm": 2.035400141711069, + "learning_rate": 2.7374309058118506e-06, + "loss": 0.8513, + "step": 5522 + }, + { + "epoch": 0.3984704736481368, + "grad_norm": 2.634037717539028, + "learning_rate": 2.7369964549605517e-06, + "loss": 0.9288, + "step": 5523 + }, + { + "epoch": 0.3985426211175643, + "grad_norm": 2.12199469834078, + "learning_rate": 2.736561963864604e-06, + "loss": 0.9043, + "step": 5524 + }, + { + "epoch": 0.3986147685869918, + "grad_norm": 2.480705766542798, + "learning_rate": 2.7361274325477346e-06, + "loss": 0.929, + "step": 5525 + }, + { + "epoch": 0.3986869160564193, + "grad_norm": 2.1737568338910824, + "learning_rate": 2.735692861033671e-06, + "loss": 1.0201, + "step": 5526 + }, + { + "epoch": 0.39875906352584684, + "grad_norm": 4.1623029874166555, + "learning_rate": 2.735258249346144e-06, + "loss": 0.9245, + "step": 5527 + }, + { + "epoch": 0.39883121099527435, + "grad_norm": 2.8002276225892366, + "learning_rate": 2.734823597508886e-06, + "loss": 0.9182, + "step": 5528 + }, + { + "epoch": 0.3989033584647019, + "grad_norm": 0.6432244664894354, + "learning_rate": 2.734388905545631e-06, + "loss": 0.7441, + "step": 5529 + }, + { + "epoch": 0.39897550593412934, + "grad_norm": 3.241028360188008, + "learning_rate": 2.733954173480116e-06, + "loss": 0.9244, + "step": 5530 + }, + { + "epoch": 0.39904765340355686, + "grad_norm": 2.2951104269284417, + "learning_rate": 2.7335194013360812e-06, + "loss": 0.9473, + "step": 5531 + }, + { + "epoch": 0.3991198008729844, + "grad_norm": 2.203539870138399, + "learning_rate": 2.7330845891372677e-06, + "loss": 0.8799, + "step": 5532 + }, + { + "epoch": 0.3991919483424119, + "grad_norm": 2.1071204676142896, + "learning_rate": 2.732649736907418e-06, + "loss": 0.942, + "step": 5533 + }, + { + "epoch": 0.3992640958118394, + "grad_norm": 2.580884439292692, + "learning_rate": 2.7322148446702783e-06, + "loss": 0.8358, + "step": 5534 + }, + { + "epoch": 0.39933624328126693, + "grad_norm": 1.935338137059695, + "learning_rate": 2.731779912449597e-06, + "loss": 0.9957, + "step": 5535 + }, + { + "epoch": 0.3994083907506944, + "grad_norm": 2.2027268286648645, + "learning_rate": 2.7313449402691227e-06, + "loss": 0.8919, + "step": 5536 + }, + { + "epoch": 0.3994805382201219, + "grad_norm": 0.6368846794119117, + "learning_rate": 2.7309099281526095e-06, + "loss": 0.7464, + "step": 5537 + }, + { + "epoch": 0.39955268568954944, + "grad_norm": 1.8844928761522435, + "learning_rate": 2.7304748761238107e-06, + "loss": 0.9131, + "step": 5538 + }, + { + "epoch": 0.39962483315897696, + "grad_norm": 2.180034432095528, + "learning_rate": 2.730039784206483e-06, + "loss": 0.9905, + "step": 5539 + }, + { + "epoch": 0.3996969806284045, + "grad_norm": 2.2238552834952934, + "learning_rate": 2.7296046524243856e-06, + "loss": 0.9123, + "step": 5540 + }, + { + "epoch": 0.399769128097832, + "grad_norm": 2.192052083037106, + "learning_rate": 2.7291694808012782e-06, + "loss": 0.9827, + "step": 5541 + }, + { + "epoch": 0.39984127556725946, + "grad_norm": 2.732002755142766, + "learning_rate": 2.728734269360925e-06, + "loss": 1.0321, + "step": 5542 + }, + { + "epoch": 0.399913423036687, + "grad_norm": 2.6813313860868, + "learning_rate": 2.728299018127091e-06, + "loss": 0.9008, + "step": 5543 + }, + { + "epoch": 0.3999855705061145, + "grad_norm": 2.419308884194479, + "learning_rate": 2.7278637271235436e-06, + "loss": 0.9513, + "step": 5544 + }, + { + "epoch": 0.400057717975542, + "grad_norm": 3.7401990604974484, + "learning_rate": 2.727428396374053e-06, + "loss": 0.9263, + "step": 5545 + }, + { + "epoch": 0.40012986544496953, + "grad_norm": 0.7601853023046597, + "learning_rate": 2.7269930259023907e-06, + "loss": 0.8361, + "step": 5546 + }, + { + "epoch": 0.40020201291439705, + "grad_norm": 2.670822650175902, + "learning_rate": 2.7265576157323294e-06, + "loss": 0.9039, + "step": 5547 + }, + { + "epoch": 0.4002741603838245, + "grad_norm": 2.3216295197256542, + "learning_rate": 2.7261221658876468e-06, + "loss": 1.0019, + "step": 5548 + }, + { + "epoch": 0.40034630785325204, + "grad_norm": 2.5646570012791585, + "learning_rate": 2.7256866763921207e-06, + "loss": 0.9717, + "step": 5549 + }, + { + "epoch": 0.40041845532267956, + "grad_norm": 0.8332174379502741, + "learning_rate": 2.7252511472695326e-06, + "loss": 0.809, + "step": 5550 + }, + { + "epoch": 0.4004906027921071, + "grad_norm": 2.1809856965139494, + "learning_rate": 2.724815578543663e-06, + "loss": 0.8517, + "step": 5551 + }, + { + "epoch": 0.4005627502615346, + "grad_norm": 2.5191085105927593, + "learning_rate": 2.7243799702382984e-06, + "loss": 0.9372, + "step": 5552 + }, + { + "epoch": 0.4006348977309621, + "grad_norm": 2.7824077982334234, + "learning_rate": 2.723944322377225e-06, + "loss": 0.8788, + "step": 5553 + }, + { + "epoch": 0.4007070452003896, + "grad_norm": 3.1011776625327814, + "learning_rate": 2.723508634984232e-06, + "loss": 1.0583, + "step": 5554 + }, + { + "epoch": 0.4007791926698171, + "grad_norm": 2.5923530769467766, + "learning_rate": 2.723072908083111e-06, + "loss": 0.9294, + "step": 5555 + }, + { + "epoch": 0.4008513401392446, + "grad_norm": 2.0987694768132537, + "learning_rate": 2.722637141697655e-06, + "loss": 1.0052, + "step": 5556 + }, + { + "epoch": 0.40092348760867214, + "grad_norm": 2.646777147378166, + "learning_rate": 2.7222013358516597e-06, + "loss": 0.9906, + "step": 5557 + }, + { + "epoch": 0.40099563507809965, + "grad_norm": 0.8455403019229204, + "learning_rate": 2.721765490568923e-06, + "loss": 0.8516, + "step": 5558 + }, + { + "epoch": 0.4010677825475271, + "grad_norm": 3.1645985090449793, + "learning_rate": 2.7213296058732446e-06, + "loss": 0.9732, + "step": 5559 + }, + { + "epoch": 0.40113993001695464, + "grad_norm": 2.605006232666057, + "learning_rate": 2.720893681788427e-06, + "loss": 0.9996, + "step": 5560 + }, + { + "epoch": 0.40121207748638216, + "grad_norm": 2.152553319680463, + "learning_rate": 2.7204577183382747e-06, + "loss": 0.8801, + "step": 5561 + }, + { + "epoch": 0.4012842249558097, + "grad_norm": 2.345682390234128, + "learning_rate": 2.7200217155465924e-06, + "loss": 0.9367, + "step": 5562 + }, + { + "epoch": 0.4013563724252372, + "grad_norm": 2.078634787536294, + "learning_rate": 2.71958567343719e-06, + "loss": 0.8751, + "step": 5563 + }, + { + "epoch": 0.4014285198946647, + "grad_norm": 1.9785873836899017, + "learning_rate": 2.719149592033878e-06, + "loss": 1.03, + "step": 5564 + }, + { + "epoch": 0.4015006673640922, + "grad_norm": 2.3944194325760684, + "learning_rate": 2.718713471360469e-06, + "loss": 0.9415, + "step": 5565 + }, + { + "epoch": 0.4015728148335197, + "grad_norm": 2.519102358723358, + "learning_rate": 2.7182773114407776e-06, + "loss": 0.9577, + "step": 5566 + }, + { + "epoch": 0.4016449623029472, + "grad_norm": 1.8916396893945733, + "learning_rate": 2.717841112298621e-06, + "loss": 0.8651, + "step": 5567 + }, + { + "epoch": 0.40171710977237474, + "grad_norm": 2.461526176646645, + "learning_rate": 2.717404873957819e-06, + "loss": 0.9125, + "step": 5568 + }, + { + "epoch": 0.40178925724180226, + "grad_norm": 3.309260817909297, + "learning_rate": 2.7169685964421927e-06, + "loss": 0.9096, + "step": 5569 + }, + { + "epoch": 0.4018614047112298, + "grad_norm": 3.976233927824078, + "learning_rate": 2.716532279775565e-06, + "loss": 0.927, + "step": 5570 + }, + { + "epoch": 0.40193355218065724, + "grad_norm": 2.076306343209805, + "learning_rate": 2.7160959239817623e-06, + "loss": 0.9952, + "step": 5571 + }, + { + "epoch": 0.40200569965008476, + "grad_norm": 2.007024469360444, + "learning_rate": 2.7156595290846124e-06, + "loss": 0.8613, + "step": 5572 + }, + { + "epoch": 0.4020778471195123, + "grad_norm": 2.344884165686715, + "learning_rate": 2.7152230951079445e-06, + "loss": 0.9928, + "step": 5573 + }, + { + "epoch": 0.4021499945889398, + "grad_norm": 2.5929865805164285, + "learning_rate": 2.714786622075591e-06, + "loss": 0.8968, + "step": 5574 + }, + { + "epoch": 0.4022221420583673, + "grad_norm": 2.1468076870289687, + "learning_rate": 2.714350110011386e-06, + "loss": 0.9799, + "step": 5575 + }, + { + "epoch": 0.40229428952779483, + "grad_norm": 0.7255867178952524, + "learning_rate": 2.7139135589391666e-06, + "loss": 0.7391, + "step": 5576 + }, + { + "epoch": 0.4023664369972223, + "grad_norm": 2.003462773974211, + "learning_rate": 2.71347696888277e-06, + "loss": 0.898, + "step": 5577 + }, + { + "epoch": 0.4024385844666498, + "grad_norm": 3.38228555499998, + "learning_rate": 2.7130403398660374e-06, + "loss": 0.9738, + "step": 5578 + }, + { + "epoch": 0.40251073193607734, + "grad_norm": 2.7290541841656983, + "learning_rate": 2.7126036719128118e-06, + "loss": 0.93, + "step": 5579 + }, + { + "epoch": 0.40258287940550486, + "grad_norm": 2.0509715546002925, + "learning_rate": 2.712166965046937e-06, + "loss": 0.8193, + "step": 5580 + }, + { + "epoch": 0.4026550268749324, + "grad_norm": 2.5373773704926283, + "learning_rate": 2.711730219292261e-06, + "loss": 0.9001, + "step": 5581 + }, + { + "epoch": 0.4027271743443599, + "grad_norm": 4.284208737186387, + "learning_rate": 2.7112934346726326e-06, + "loss": 0.8602, + "step": 5582 + }, + { + "epoch": 0.40279932181378736, + "grad_norm": 0.9789901466068596, + "learning_rate": 2.7108566112119027e-06, + "loss": 0.8167, + "step": 5583 + }, + { + "epoch": 0.4028714692832149, + "grad_norm": 2.704855921463782, + "learning_rate": 2.7104197489339245e-06, + "loss": 0.9655, + "step": 5584 + }, + { + "epoch": 0.4029436167526424, + "grad_norm": 2.872754754438741, + "learning_rate": 2.7099828478625536e-06, + "loss": 0.8528, + "step": 5585 + }, + { + "epoch": 0.4030157642220699, + "grad_norm": 2.05632899170321, + "learning_rate": 2.7095459080216478e-06, + "loss": 0.8615, + "step": 5586 + }, + { + "epoch": 0.40308791169149744, + "grad_norm": 2.247991088917403, + "learning_rate": 2.7091089294350667e-06, + "loss": 0.8832, + "step": 5587 + }, + { + "epoch": 0.40316005916092496, + "grad_norm": 2.4778516043748913, + "learning_rate": 2.7086719121266727e-06, + "loss": 0.9343, + "step": 5588 + }, + { + "epoch": 0.4032322066303524, + "grad_norm": 3.4598148497066266, + "learning_rate": 2.708234856120328e-06, + "loss": 0.9806, + "step": 5589 + }, + { + "epoch": 0.40330435409977994, + "grad_norm": 2.5144760635702177, + "learning_rate": 2.7077977614398998e-06, + "loss": 0.875, + "step": 5590 + }, + { + "epoch": 0.40337650156920746, + "grad_norm": 1.8909622396030583, + "learning_rate": 2.7073606281092567e-06, + "loss": 0.9287, + "step": 5591 + }, + { + "epoch": 0.403448649038635, + "grad_norm": 2.1188280569456084, + "learning_rate": 2.7069234561522676e-06, + "loss": 0.8657, + "step": 5592 + }, + { + "epoch": 0.4035207965080625, + "grad_norm": 4.2560547726483975, + "learning_rate": 2.7064862455928056e-06, + "loss": 0.9604, + "step": 5593 + }, + { + "epoch": 0.40359294397749, + "grad_norm": 2.4454002486502353, + "learning_rate": 2.706048996454745e-06, + "loss": 0.8799, + "step": 5594 + }, + { + "epoch": 0.4036650914469175, + "grad_norm": 2.0865182818140457, + "learning_rate": 2.7056117087619626e-06, + "loss": 0.9579, + "step": 5595 + }, + { + "epoch": 0.403737238916345, + "grad_norm": 2.6594686417729174, + "learning_rate": 2.7051743825383364e-06, + "loss": 0.9014, + "step": 5596 + }, + { + "epoch": 0.4038093863857725, + "grad_norm": 2.809223640276809, + "learning_rate": 2.7047370178077476e-06, + "loss": 0.8945, + "step": 5597 + }, + { + "epoch": 0.40388153385520004, + "grad_norm": 0.7734165381471307, + "learning_rate": 2.70429961459408e-06, + "loss": 0.8234, + "step": 5598 + }, + { + "epoch": 0.40395368132462756, + "grad_norm": 2.287248120224489, + "learning_rate": 2.7038621729212166e-06, + "loss": 0.9952, + "step": 5599 + }, + { + "epoch": 0.4040258287940551, + "grad_norm": 2.378263740653038, + "learning_rate": 2.7034246928130466e-06, + "loss": 0.963, + "step": 5600 + }, + { + "epoch": 0.40409797626348254, + "grad_norm": 3.3129256352887873, + "learning_rate": 2.7029871742934576e-06, + "loss": 0.985, + "step": 5601 + }, + { + "epoch": 0.40417012373291006, + "grad_norm": 1.962875138406725, + "learning_rate": 2.7025496173863413e-06, + "loss": 0.9629, + "step": 5602 + }, + { + "epoch": 0.4042422712023376, + "grad_norm": 2.675151132610251, + "learning_rate": 2.7021120221155913e-06, + "loss": 0.9517, + "step": 5603 + }, + { + "epoch": 0.4043144186717651, + "grad_norm": 1.9031301514003043, + "learning_rate": 2.701674388505103e-06, + "loss": 0.9673, + "step": 5604 + }, + { + "epoch": 0.4043865661411926, + "grad_norm": 3.644093052728061, + "learning_rate": 2.7012367165787732e-06, + "loss": 0.9409, + "step": 5605 + }, + { + "epoch": 0.4044587136106201, + "grad_norm": 2.1196147898019624, + "learning_rate": 2.700799006360503e-06, + "loss": 0.9548, + "step": 5606 + }, + { + "epoch": 0.4045308610800476, + "grad_norm": 2.3250990282524353, + "learning_rate": 2.700361257874193e-06, + "loss": 0.8417, + "step": 5607 + }, + { + "epoch": 0.4046030085494751, + "grad_norm": 6.354674905764966, + "learning_rate": 2.6999234711437477e-06, + "loss": 0.9409, + "step": 5608 + }, + { + "epoch": 0.40467515601890264, + "grad_norm": 2.250001483492892, + "learning_rate": 2.699485646193072e-06, + "loss": 0.9663, + "step": 5609 + }, + { + "epoch": 0.40474730348833016, + "grad_norm": 2.382818703565183, + "learning_rate": 2.699047783046075e-06, + "loss": 0.949, + "step": 5610 + }, + { + "epoch": 0.4048194509577577, + "grad_norm": 2.0372250049577723, + "learning_rate": 2.6986098817266666e-06, + "loss": 0.9974, + "step": 5611 + }, + { + "epoch": 0.40489159842718514, + "grad_norm": 2.229114341715921, + "learning_rate": 2.6981719422587583e-06, + "loss": 0.8958, + "step": 5612 + }, + { + "epoch": 0.40496374589661266, + "grad_norm": 2.326413133282957, + "learning_rate": 2.6977339646662647e-06, + "loss": 0.889, + "step": 5613 + }, + { + "epoch": 0.4050358933660402, + "grad_norm": 2.153297614499135, + "learning_rate": 2.697295948973102e-06, + "loss": 0.8735, + "step": 5614 + }, + { + "epoch": 0.4051080408354677, + "grad_norm": 2.175896632972061, + "learning_rate": 2.69685789520319e-06, + "loss": 0.8524, + "step": 5615 + }, + { + "epoch": 0.4051801883048952, + "grad_norm": 2.2287087252352418, + "learning_rate": 2.6964198033804466e-06, + "loss": 0.9819, + "step": 5616 + }, + { + "epoch": 0.40525233577432274, + "grad_norm": 2.166081887582595, + "learning_rate": 2.6959816735287963e-06, + "loss": 0.8665, + "step": 5617 + }, + { + "epoch": 0.4053244832437502, + "grad_norm": 6.618536333160027, + "learning_rate": 2.695543505672164e-06, + "loss": 0.9744, + "step": 5618 + }, + { + "epoch": 0.4053966307131777, + "grad_norm": 0.9339652180352346, + "learning_rate": 2.695105299834475e-06, + "loss": 0.8219, + "step": 5619 + }, + { + "epoch": 0.40546877818260524, + "grad_norm": 3.3075139649342167, + "learning_rate": 2.6946670560396587e-06, + "loss": 0.8849, + "step": 5620 + }, + { + "epoch": 0.40554092565203276, + "grad_norm": 2.404301436518317, + "learning_rate": 2.694228774311646e-06, + "loss": 0.9375, + "step": 5621 + }, + { + "epoch": 0.4056130731214603, + "grad_norm": 2.4310038150206634, + "learning_rate": 2.69379045467437e-06, + "loss": 0.8862, + "step": 5622 + }, + { + "epoch": 0.4056852205908878, + "grad_norm": 1.8088971206950015, + "learning_rate": 2.6933520971517655e-06, + "loss": 0.9291, + "step": 5623 + }, + { + "epoch": 0.40575736806031526, + "grad_norm": 3.8212733396367056, + "learning_rate": 2.69291370176777e-06, + "loss": 0.8352, + "step": 5624 + }, + { + "epoch": 0.4058295155297428, + "grad_norm": 1.9628880135438163, + "learning_rate": 2.692475268546322e-06, + "loss": 1.0262, + "step": 5625 + }, + { + "epoch": 0.4059016629991703, + "grad_norm": 0.8004662242985824, + "learning_rate": 2.692036797511363e-06, + "loss": 0.8332, + "step": 5626 + }, + { + "epoch": 0.4059738104685978, + "grad_norm": 7.0921763552594745, + "learning_rate": 2.691598288686837e-06, + "loss": 0.911, + "step": 5627 + }, + { + "epoch": 0.40604595793802534, + "grad_norm": 3.3024414221328215, + "learning_rate": 2.691159742096688e-06, + "loss": 0.9439, + "step": 5628 + }, + { + "epoch": 0.40611810540745286, + "grad_norm": 5.417650671370116, + "learning_rate": 2.690721157764864e-06, + "loss": 0.7806, + "step": 5629 + }, + { + "epoch": 0.4061902528768803, + "grad_norm": 3.2419311422087227, + "learning_rate": 2.690282535715315e-06, + "loss": 0.9585, + "step": 5630 + }, + { + "epoch": 0.40626240034630784, + "grad_norm": 2.3980811037565206, + "learning_rate": 2.689843875971992e-06, + "loss": 0.9454, + "step": 5631 + }, + { + "epoch": 0.40633454781573536, + "grad_norm": 1.7137926916902588, + "learning_rate": 2.6894051785588486e-06, + "loss": 1.04, + "step": 5632 + }, + { + "epoch": 0.4064066952851629, + "grad_norm": 2.4545087859757455, + "learning_rate": 2.68896644349984e-06, + "loss": 1.0147, + "step": 5633 + }, + { + "epoch": 0.4064788427545904, + "grad_norm": 1.7667071646721013, + "learning_rate": 2.688527670818925e-06, + "loss": 0.9074, + "step": 5634 + }, + { + "epoch": 0.4065509902240179, + "grad_norm": 1.770671148914003, + "learning_rate": 2.688088860540062e-06, + "loss": 1.0017, + "step": 5635 + }, + { + "epoch": 0.4066231376934454, + "grad_norm": 2.4856505568247744, + "learning_rate": 2.6876500126872144e-06, + "loss": 0.8556, + "step": 5636 + }, + { + "epoch": 0.4066952851628729, + "grad_norm": 1.9980142391220581, + "learning_rate": 2.6872111272843453e-06, + "loss": 1.0127, + "step": 5637 + }, + { + "epoch": 0.4067674326323004, + "grad_norm": 4.183710005825188, + "learning_rate": 2.686772204355419e-06, + "loss": 0.9318, + "step": 5638 + }, + { + "epoch": 0.40683958010172794, + "grad_norm": 3.096371859453674, + "learning_rate": 2.6863332439244064e-06, + "loss": 0.8898, + "step": 5639 + }, + { + "epoch": 0.40691172757115546, + "grad_norm": 2.2685158012298894, + "learning_rate": 2.685894246015276e-06, + "loss": 1.0047, + "step": 5640 + }, + { + "epoch": 0.406983875040583, + "grad_norm": 1.8217798303051913, + "learning_rate": 2.685455210651999e-06, + "loss": 0.9433, + "step": 5641 + }, + { + "epoch": 0.40705602251001044, + "grad_norm": 1.5760156112033255, + "learning_rate": 2.685016137858551e-06, + "loss": 1.0115, + "step": 5642 + }, + { + "epoch": 0.40712816997943796, + "grad_norm": 2.6932654526940967, + "learning_rate": 2.684577027658908e-06, + "loss": 1.0012, + "step": 5643 + }, + { + "epoch": 0.4072003174488655, + "grad_norm": 2.5326421229709646, + "learning_rate": 2.6841378800770474e-06, + "loss": 0.911, + "step": 5644 + }, + { + "epoch": 0.407272464918293, + "grad_norm": 2.6031470070008287, + "learning_rate": 2.6836986951369494e-06, + "loss": 0.8296, + "step": 5645 + }, + { + "epoch": 0.4073446123877205, + "grad_norm": 1.9367538984463082, + "learning_rate": 2.683259472862597e-06, + "loss": 0.9608, + "step": 5646 + }, + { + "epoch": 0.40741675985714804, + "grad_norm": 1.8647775460083174, + "learning_rate": 2.682820213277974e-06, + "loss": 0.9842, + "step": 5647 + }, + { + "epoch": 0.4074889073265755, + "grad_norm": 2.6817907859507972, + "learning_rate": 2.682380916407067e-06, + "loss": 0.9098, + "step": 5648 + }, + { + "epoch": 0.407561054796003, + "grad_norm": 2.2649634085893164, + "learning_rate": 2.6819415822738643e-06, + "loss": 0.8239, + "step": 5649 + }, + { + "epoch": 0.40763320226543054, + "grad_norm": 4.011035478418367, + "learning_rate": 2.6815022109023555e-06, + "loss": 0.872, + "step": 5650 + }, + { + "epoch": 0.40770534973485806, + "grad_norm": 2.311681757325099, + "learning_rate": 2.6810628023165345e-06, + "loss": 0.8347, + "step": 5651 + }, + { + "epoch": 0.4077774972042856, + "grad_norm": 2.900978640151919, + "learning_rate": 2.680623356540395e-06, + "loss": 0.9063, + "step": 5652 + }, + { + "epoch": 0.4078496446737131, + "grad_norm": 4.8140278162166465, + "learning_rate": 2.6801838735979333e-06, + "loss": 0.946, + "step": 5653 + }, + { + "epoch": 0.40792179214314056, + "grad_norm": 2.288621879587594, + "learning_rate": 2.679744353513149e-06, + "loss": 0.9202, + "step": 5654 + }, + { + "epoch": 0.4079939396125681, + "grad_norm": 2.032605936402872, + "learning_rate": 2.679304796310041e-06, + "loss": 0.9868, + "step": 5655 + }, + { + "epoch": 0.4080660870819956, + "grad_norm": 2.178523583523319, + "learning_rate": 2.6788652020126132e-06, + "loss": 0.9352, + "step": 5656 + }, + { + "epoch": 0.4081382345514231, + "grad_norm": 2.572189063436816, + "learning_rate": 2.6784255706448693e-06, + "loss": 1.0315, + "step": 5657 + }, + { + "epoch": 0.40821038202085064, + "grad_norm": 2.6149044270309583, + "learning_rate": 2.6779859022308172e-06, + "loss": 0.8307, + "step": 5658 + }, + { + "epoch": 0.4082825294902781, + "grad_norm": 2.2348687753407726, + "learning_rate": 2.6775461967944645e-06, + "loss": 1.0175, + "step": 5659 + }, + { + "epoch": 0.4083546769597056, + "grad_norm": 2.1434560279473653, + "learning_rate": 2.677106454359822e-06, + "loss": 0.9182, + "step": 5660 + }, + { + "epoch": 0.40842682442913314, + "grad_norm": 2.5843336773189907, + "learning_rate": 2.676666674950903e-06, + "loss": 1.013, + "step": 5661 + }, + { + "epoch": 0.40849897189856066, + "grad_norm": 2.6970457834071317, + "learning_rate": 2.676226858591721e-06, + "loss": 1.0689, + "step": 5662 + }, + { + "epoch": 0.4085711193679882, + "grad_norm": 2.433833078338889, + "learning_rate": 2.675787005306294e-06, + "loss": 0.9442, + "step": 5663 + }, + { + "epoch": 0.4086432668374157, + "grad_norm": 3.470146379269135, + "learning_rate": 2.675347115118641e-06, + "loss": 0.923, + "step": 5664 + }, + { + "epoch": 0.40871541430684316, + "grad_norm": 2.4257023256396333, + "learning_rate": 2.674907188052781e-06, + "loss": 0.8808, + "step": 5665 + }, + { + "epoch": 0.4087875617762707, + "grad_norm": 0.8092704558145546, + "learning_rate": 2.674467224132738e-06, + "loss": 0.8462, + "step": 5666 + }, + { + "epoch": 0.4088597092456982, + "grad_norm": 2.156656973206297, + "learning_rate": 2.6740272233825373e-06, + "loss": 0.9168, + "step": 5667 + }, + { + "epoch": 0.4089318567151257, + "grad_norm": 2.345247731086423, + "learning_rate": 2.6735871858262054e-06, + "loss": 0.9036, + "step": 5668 + }, + { + "epoch": 0.40900400418455324, + "grad_norm": 2.159144904634185, + "learning_rate": 2.67314711148777e-06, + "loss": 0.9512, + "step": 5669 + }, + { + "epoch": 0.40907615165398076, + "grad_norm": 2.2891926907436178, + "learning_rate": 2.672707000391263e-06, + "loss": 0.808, + "step": 5670 + }, + { + "epoch": 0.4091482991234082, + "grad_norm": 2.0759684910744283, + "learning_rate": 2.672266852560717e-06, + "loss": 0.8985, + "step": 5671 + }, + { + "epoch": 0.40922044659283574, + "grad_norm": 2.530313954531353, + "learning_rate": 2.6718266680201663e-06, + "loss": 0.9738, + "step": 5672 + }, + { + "epoch": 0.40929259406226326, + "grad_norm": 1.9123308717830287, + "learning_rate": 2.6713864467936492e-06, + "loss": 0.8857, + "step": 5673 + }, + { + "epoch": 0.4093647415316908, + "grad_norm": 2.275384652453267, + "learning_rate": 2.6709461889052037e-06, + "loss": 0.9049, + "step": 5674 + }, + { + "epoch": 0.4094368890011183, + "grad_norm": 2.372064080824976, + "learning_rate": 2.6705058943788695e-06, + "loss": 0.9966, + "step": 5675 + }, + { + "epoch": 0.4095090364705458, + "grad_norm": 2.176332688094538, + "learning_rate": 2.670065563238691e-06, + "loss": 1.0116, + "step": 5676 + }, + { + "epoch": 0.4095811839399733, + "grad_norm": 2.800666072504274, + "learning_rate": 2.6696251955087126e-06, + "loss": 0.9651, + "step": 5677 + }, + { + "epoch": 0.4096533314094008, + "grad_norm": 2.078102398512844, + "learning_rate": 2.6691847912129816e-06, + "loss": 0.9211, + "step": 5678 + }, + { + "epoch": 0.4097254788788283, + "grad_norm": 1.741394272938918, + "learning_rate": 2.668744350375546e-06, + "loss": 0.9035, + "step": 5679 + }, + { + "epoch": 0.40979762634825584, + "grad_norm": 1.781013138398636, + "learning_rate": 2.668303873020457e-06, + "loss": 0.8841, + "step": 5680 + }, + { + "epoch": 0.40986977381768336, + "grad_norm": 3.8105075820469003, + "learning_rate": 2.6678633591717678e-06, + "loss": 0.9609, + "step": 5681 + }, + { + "epoch": 0.4099419212871109, + "grad_norm": 2.2544212347989347, + "learning_rate": 2.6674228088535325e-06, + "loss": 0.9146, + "step": 5682 + }, + { + "epoch": 0.41001406875653834, + "grad_norm": 1.9309422047093632, + "learning_rate": 2.6669822220898085e-06, + "loss": 0.9852, + "step": 5683 + }, + { + "epoch": 0.41008621622596586, + "grad_norm": 2.9607068779744403, + "learning_rate": 2.666541598904654e-06, + "loss": 0.8117, + "step": 5684 + }, + { + "epoch": 0.4101583636953934, + "grad_norm": 5.481415822425348, + "learning_rate": 2.666100939322131e-06, + "loss": 0.9808, + "step": 5685 + }, + { + "epoch": 0.4102305111648209, + "grad_norm": 2.0783596049802173, + "learning_rate": 2.6656602433663016e-06, + "loss": 0.9517, + "step": 5686 + }, + { + "epoch": 0.4103026586342484, + "grad_norm": 2.658796548296916, + "learning_rate": 2.6652195110612295e-06, + "loss": 0.9229, + "step": 5687 + }, + { + "epoch": 0.41037480610367594, + "grad_norm": 2.623142220815757, + "learning_rate": 2.664778742430983e-06, + "loss": 0.918, + "step": 5688 + }, + { + "epoch": 0.4104469535731034, + "grad_norm": 1.9640730755141145, + "learning_rate": 2.6643379374996304e-06, + "loss": 0.9853, + "step": 5689 + }, + { + "epoch": 0.4105191010425309, + "grad_norm": 4.672633693164184, + "learning_rate": 2.663897096291242e-06, + "loss": 0.9083, + "step": 5690 + }, + { + "epoch": 0.41059124851195844, + "grad_norm": 2.425175442199091, + "learning_rate": 2.6634562188298917e-06, + "loss": 0.9808, + "step": 5691 + }, + { + "epoch": 0.41066339598138596, + "grad_norm": 3.7588744378050953, + "learning_rate": 2.663015305139652e-06, + "loss": 0.8796, + "step": 5692 + }, + { + "epoch": 0.4107355434508135, + "grad_norm": 4.754390293827384, + "learning_rate": 2.6625743552446023e-06, + "loss": 0.8971, + "step": 5693 + }, + { + "epoch": 0.410807690920241, + "grad_norm": 2.207166387421092, + "learning_rate": 2.662133369168819e-06, + "loss": 0.966, + "step": 5694 + }, + { + "epoch": 0.41087983838966846, + "grad_norm": 2.2472509968846976, + "learning_rate": 2.6616923469363837e-06, + "loss": 0.8787, + "step": 5695 + }, + { + "epoch": 0.410951985859096, + "grad_norm": 2.487377631469911, + "learning_rate": 2.6612512885713794e-06, + "loss": 0.946, + "step": 5696 + }, + { + "epoch": 0.4110241333285235, + "grad_norm": 2.121246838182331, + "learning_rate": 2.660810194097889e-06, + "loss": 0.8521, + "step": 5697 + }, + { + "epoch": 0.411096280797951, + "grad_norm": 1.9360437150070906, + "learning_rate": 2.660369063540001e-06, + "loss": 0.9212, + "step": 5698 + }, + { + "epoch": 0.41116842826737854, + "grad_norm": 2.7431955250843445, + "learning_rate": 2.6599278969218024e-06, + "loss": 0.8972, + "step": 5699 + }, + { + "epoch": 0.41124057573680606, + "grad_norm": 2.1588888192396314, + "learning_rate": 2.6594866942673845e-06, + "loss": 0.951, + "step": 5700 + }, + { + "epoch": 0.4113127232062335, + "grad_norm": 4.101994954880281, + "learning_rate": 2.6590454556008395e-06, + "loss": 0.8769, + "step": 5701 + }, + { + "epoch": 0.41138487067566104, + "grad_norm": 1.9017798745406593, + "learning_rate": 2.658604180946262e-06, + "loss": 0.9222, + "step": 5702 + }, + { + "epoch": 0.41145701814508856, + "grad_norm": 2.354797368818251, + "learning_rate": 2.658162870327749e-06, + "loss": 0.8497, + "step": 5703 + }, + { + "epoch": 0.4115291656145161, + "grad_norm": 2.221983903450609, + "learning_rate": 2.6577215237693967e-06, + "loss": 0.9104, + "step": 5704 + }, + { + "epoch": 0.4116013130839436, + "grad_norm": 1.9263161309120989, + "learning_rate": 2.6572801412953077e-06, + "loss": 0.8982, + "step": 5705 + }, + { + "epoch": 0.4116734605533711, + "grad_norm": 3.73598314717676, + "learning_rate": 2.6568387229295833e-06, + "loss": 0.8482, + "step": 5706 + }, + { + "epoch": 0.4117456080227986, + "grad_norm": 5.043695446225165, + "learning_rate": 2.6563972686963275e-06, + "loss": 0.9409, + "step": 5707 + }, + { + "epoch": 0.4118177554922261, + "grad_norm": 2.4021124999852628, + "learning_rate": 2.655955778619647e-06, + "loss": 0.894, + "step": 5708 + }, + { + "epoch": 0.4118899029616536, + "grad_norm": 2.1685297609573437, + "learning_rate": 2.6555142527236493e-06, + "loss": 0.9902, + "step": 5709 + }, + { + "epoch": 0.41196205043108114, + "grad_norm": 4.0572038607549175, + "learning_rate": 2.6550726910324455e-06, + "loss": 0.9675, + "step": 5710 + }, + { + "epoch": 0.41203419790050866, + "grad_norm": 2.1388607133382975, + "learning_rate": 2.654631093570146e-06, + "loss": 0.963, + "step": 5711 + }, + { + "epoch": 0.4121063453699361, + "grad_norm": 2.3769969575675525, + "learning_rate": 2.6541894603608668e-06, + "loss": 0.8574, + "step": 5712 + }, + { + "epoch": 0.41217849283936364, + "grad_norm": 3.6650430957295286, + "learning_rate": 2.653747791428723e-06, + "loss": 0.8901, + "step": 5713 + }, + { + "epoch": 0.41225064030879116, + "grad_norm": 2.9388158570004723, + "learning_rate": 2.6533060867978317e-06, + "loss": 0.9178, + "step": 5714 + }, + { + "epoch": 0.4123227877782187, + "grad_norm": 3.066634831747037, + "learning_rate": 2.652864346492314e-06, + "loss": 0.8671, + "step": 5715 + }, + { + "epoch": 0.4123949352476462, + "grad_norm": 1.5755287191230423, + "learning_rate": 2.65242257053629e-06, + "loss": 1.0213, + "step": 5716 + }, + { + "epoch": 0.4124670827170737, + "grad_norm": 2.294849151108363, + "learning_rate": 2.651980758953886e-06, + "loss": 0.951, + "step": 5717 + }, + { + "epoch": 0.4125392301865012, + "grad_norm": 3.180780846077184, + "learning_rate": 2.6515389117692264e-06, + "loss": 0.881, + "step": 5718 + }, + { + "epoch": 0.4126113776559287, + "grad_norm": 2.4879939753346005, + "learning_rate": 2.651097029006438e-06, + "loss": 0.9495, + "step": 5719 + }, + { + "epoch": 0.4126835251253562, + "grad_norm": 2.944641036742189, + "learning_rate": 2.650655110689651e-06, + "loss": 0.9046, + "step": 5720 + }, + { + "epoch": 0.41275567259478374, + "grad_norm": 4.514313715318476, + "learning_rate": 2.6502131568429976e-06, + "loss": 0.8902, + "step": 5721 + }, + { + "epoch": 0.41282782006421126, + "grad_norm": 0.7215115941174208, + "learning_rate": 2.6497711674906105e-06, + "loss": 0.7851, + "step": 5722 + }, + { + "epoch": 0.4128999675336388, + "grad_norm": 2.1105714018852937, + "learning_rate": 2.649329142656625e-06, + "loss": 0.8823, + "step": 5723 + }, + { + "epoch": 0.41297211500306624, + "grad_norm": 2.543133286457713, + "learning_rate": 2.6488870823651793e-06, + "loss": 0.9225, + "step": 5724 + }, + { + "epoch": 0.41304426247249376, + "grad_norm": 0.7503613157967515, + "learning_rate": 2.648444986640412e-06, + "loss": 0.7893, + "step": 5725 + }, + { + "epoch": 0.4131164099419213, + "grad_norm": 2.0468175748966995, + "learning_rate": 2.6480028555064636e-06, + "loss": 0.9044, + "step": 5726 + }, + { + "epoch": 0.4131885574113488, + "grad_norm": 1.983214271668867, + "learning_rate": 2.6475606889874795e-06, + "loss": 0.9152, + "step": 5727 + }, + { + "epoch": 0.4132607048807763, + "grad_norm": 2.632479411712156, + "learning_rate": 2.647118487107602e-06, + "loss": 0.937, + "step": 5728 + }, + { + "epoch": 0.41333285235020384, + "grad_norm": 3.166133099493914, + "learning_rate": 2.6466762498909805e-06, + "loss": 0.9875, + "step": 5729 + }, + { + "epoch": 0.4134049998196313, + "grad_norm": 4.7107344499799115, + "learning_rate": 2.6462339773617628e-06, + "loss": 0.9109, + "step": 5730 + }, + { + "epoch": 0.4134771472890588, + "grad_norm": 2.256170078035011, + "learning_rate": 2.645791669544099e-06, + "loss": 0.9468, + "step": 5731 + }, + { + "epoch": 0.41354929475848634, + "grad_norm": 2.9512865913963124, + "learning_rate": 2.6453493264621436e-06, + "loss": 0.9323, + "step": 5732 + }, + { + "epoch": 0.41362144222791386, + "grad_norm": 2.680880089343102, + "learning_rate": 2.64490694814005e-06, + "loss": 0.9474, + "step": 5733 + }, + { + "epoch": 0.4136935896973414, + "grad_norm": 2.472070705995169, + "learning_rate": 2.644464534601976e-06, + "loss": 0.9761, + "step": 5734 + }, + { + "epoch": 0.4137657371667689, + "grad_norm": 2.7480013259984446, + "learning_rate": 2.644022085872079e-06, + "loss": 0.9388, + "step": 5735 + }, + { + "epoch": 0.41383788463619636, + "grad_norm": 5.349445899005953, + "learning_rate": 2.64357960197452e-06, + "loss": 1.0485, + "step": 5736 + }, + { + "epoch": 0.4139100321056239, + "grad_norm": 2.5491020903108987, + "learning_rate": 2.6431370829334614e-06, + "loss": 1.1021, + "step": 5737 + }, + { + "epoch": 0.4139821795750514, + "grad_norm": 2.0926940304799215, + "learning_rate": 2.6426945287730677e-06, + "loss": 1.0084, + "step": 5738 + }, + { + "epoch": 0.4140543270444789, + "grad_norm": 2.1673948335928688, + "learning_rate": 2.6422519395175044e-06, + "loss": 0.9956, + "step": 5739 + }, + { + "epoch": 0.41412647451390644, + "grad_norm": 3.9251442463604223, + "learning_rate": 2.641809315190941e-06, + "loss": 0.9493, + "step": 5740 + }, + { + "epoch": 0.41419862198333396, + "grad_norm": 2.268667664097871, + "learning_rate": 2.641366655817546e-06, + "loss": 1.0037, + "step": 5741 + }, + { + "epoch": 0.4142707694527614, + "grad_norm": 2.892651151962622, + "learning_rate": 2.6409239614214933e-06, + "loss": 0.9398, + "step": 5742 + }, + { + "epoch": 0.41434291692218894, + "grad_norm": 2.7239861763478768, + "learning_rate": 2.6404812320269548e-06, + "loss": 1.0203, + "step": 5743 + }, + { + "epoch": 0.41441506439161646, + "grad_norm": 2.554336290395204, + "learning_rate": 2.6400384676581073e-06, + "loss": 1.0012, + "step": 5744 + }, + { + "epoch": 0.414487211861044, + "grad_norm": 2.712688660982557, + "learning_rate": 2.6395956683391283e-06, + "loss": 0.8731, + "step": 5745 + }, + { + "epoch": 0.4145593593304715, + "grad_norm": 2.281140991114752, + "learning_rate": 2.6391528340941984e-06, + "loss": 1.0086, + "step": 5746 + }, + { + "epoch": 0.414631506799899, + "grad_norm": 2.6142010858969673, + "learning_rate": 2.6387099649474974e-06, + "loss": 0.9058, + "step": 5747 + }, + { + "epoch": 0.4147036542693265, + "grad_norm": 1.9918545313807756, + "learning_rate": 2.6382670609232093e-06, + "loss": 0.9723, + "step": 5748 + }, + { + "epoch": 0.414775801738754, + "grad_norm": 2.812073484505176, + "learning_rate": 2.637824122045521e-06, + "loss": 0.9807, + "step": 5749 + }, + { + "epoch": 0.4148479492081815, + "grad_norm": 2.0670101675033568, + "learning_rate": 2.6373811483386173e-06, + "loss": 0.9527, + "step": 5750 + }, + { + "epoch": 0.41492009667760904, + "grad_norm": 1.9979109582696803, + "learning_rate": 2.6369381398266894e-06, + "loss": 0.9137, + "step": 5751 + }, + { + "epoch": 0.41499224414703656, + "grad_norm": 2.660732302142963, + "learning_rate": 2.6364950965339277e-06, + "loss": 0.8631, + "step": 5752 + }, + { + "epoch": 0.4150643916164641, + "grad_norm": 1.8388651778442617, + "learning_rate": 2.6360520184845243e-06, + "loss": 1.0304, + "step": 5753 + }, + { + "epoch": 0.41513653908589154, + "grad_norm": 2.6537592261248797, + "learning_rate": 2.6356089057026755e-06, + "loss": 0.8448, + "step": 5754 + }, + { + "epoch": 0.41520868655531906, + "grad_norm": 0.7026457106682279, + "learning_rate": 2.6351657582125768e-06, + "loss": 0.785, + "step": 5755 + }, + { + "epoch": 0.4152808340247466, + "grad_norm": 4.294030708121454, + "learning_rate": 2.634722576038427e-06, + "loss": 0.9433, + "step": 5756 + }, + { + "epoch": 0.4153529814941741, + "grad_norm": 2.656494129406333, + "learning_rate": 2.634279359204428e-06, + "loss": 0.8975, + "step": 5757 + }, + { + "epoch": 0.4154251289636016, + "grad_norm": 4.749665700291587, + "learning_rate": 2.6338361077347806e-06, + "loss": 0.9316, + "step": 5758 + }, + { + "epoch": 0.41549727643302914, + "grad_norm": 2.355189570999379, + "learning_rate": 2.6333928216536894e-06, + "loss": 0.9985, + "step": 5759 + }, + { + "epoch": 0.4155694239024566, + "grad_norm": 2.4389576588201227, + "learning_rate": 2.6329495009853617e-06, + "loss": 0.9738, + "step": 5760 + }, + { + "epoch": 0.4156415713718841, + "grad_norm": 2.330295549402414, + "learning_rate": 2.6325061457540045e-06, + "loss": 0.8982, + "step": 5761 + }, + { + "epoch": 0.41571371884131164, + "grad_norm": 2.526159277206633, + "learning_rate": 2.632062755983828e-06, + "loss": 0.9242, + "step": 5762 + }, + { + "epoch": 0.41578586631073916, + "grad_norm": 1.6895561407357123, + "learning_rate": 2.6316193316990443e-06, + "loss": 0.9067, + "step": 5763 + }, + { + "epoch": 0.4158580137801667, + "grad_norm": 2.343824665151837, + "learning_rate": 2.6311758729238673e-06, + "loss": 0.9071, + "step": 5764 + }, + { + "epoch": 0.41593016124959414, + "grad_norm": 2.6316563815720597, + "learning_rate": 2.630732379682512e-06, + "loss": 0.8582, + "step": 5765 + }, + { + "epoch": 0.41600230871902166, + "grad_norm": 2.619430992421839, + "learning_rate": 2.6302888519991964e-06, + "loss": 0.9668, + "step": 5766 + }, + { + "epoch": 0.4160744561884492, + "grad_norm": 2.127532011479354, + "learning_rate": 2.6298452898981404e-06, + "loss": 0.9447, + "step": 5767 + }, + { + "epoch": 0.4161466036578767, + "grad_norm": 2.182964309295775, + "learning_rate": 2.6294016934035645e-06, + "loss": 0.9234, + "step": 5768 + }, + { + "epoch": 0.4162187511273042, + "grad_norm": 2.4397914581607103, + "learning_rate": 2.628958062539692e-06, + "loss": 0.9581, + "step": 5769 + }, + { + "epoch": 0.41629089859673174, + "grad_norm": 2.600949785175343, + "learning_rate": 2.6285143973307477e-06, + "loss": 0.8775, + "step": 5770 + }, + { + "epoch": 0.4163630460661592, + "grad_norm": 2.0041811867846855, + "learning_rate": 2.6280706978009592e-06, + "loss": 0.8995, + "step": 5771 + }, + { + "epoch": 0.4164351935355867, + "grad_norm": 3.246966339970717, + "learning_rate": 2.6276269639745543e-06, + "loss": 0.9369, + "step": 5772 + }, + { + "epoch": 0.41650734100501424, + "grad_norm": 1.894094829827824, + "learning_rate": 2.6271831958757653e-06, + "loss": 0.9677, + "step": 5773 + }, + { + "epoch": 0.41657948847444176, + "grad_norm": 2.2434110592682672, + "learning_rate": 2.6267393935288233e-06, + "loss": 0.8093, + "step": 5774 + }, + { + "epoch": 0.4166516359438693, + "grad_norm": 3.0254812504079123, + "learning_rate": 2.626295556957963e-06, + "loss": 0.9604, + "step": 5775 + }, + { + "epoch": 0.4167237834132968, + "grad_norm": 2.2528252347982303, + "learning_rate": 2.625851686187421e-06, + "loss": 0.9422, + "step": 5776 + }, + { + "epoch": 0.41679593088272426, + "grad_norm": 2.441349511059436, + "learning_rate": 2.6254077812414345e-06, + "loss": 0.8634, + "step": 5777 + }, + { + "epoch": 0.4168680783521518, + "grad_norm": 4.095956688100583, + "learning_rate": 2.6249638421442457e-06, + "loss": 0.9425, + "step": 5778 + }, + { + "epoch": 0.4169402258215793, + "grad_norm": 2.307075013838154, + "learning_rate": 2.6245198689200943e-06, + "loss": 0.911, + "step": 5779 + }, + { + "epoch": 0.4170123732910068, + "grad_norm": 2.4507360792876627, + "learning_rate": 2.6240758615932238e-06, + "loss": 0.8392, + "step": 5780 + }, + { + "epoch": 0.41708452076043434, + "grad_norm": 2.043806265426874, + "learning_rate": 2.623631820187882e-06, + "loss": 0.8545, + "step": 5781 + }, + { + "epoch": 0.41715666822986186, + "grad_norm": 2.0013647191693327, + "learning_rate": 2.6231877447283147e-06, + "loss": 0.8688, + "step": 5782 + }, + { + "epoch": 0.4172288156992893, + "grad_norm": 2.234555217170792, + "learning_rate": 2.6227436352387717e-06, + "loss": 1.0104, + "step": 5783 + }, + { + "epoch": 0.41730096316871684, + "grad_norm": 2.0904915349351447, + "learning_rate": 2.622299491743504e-06, + "loss": 0.9285, + "step": 5784 + }, + { + "epoch": 0.41737311063814436, + "grad_norm": 2.1027230954279603, + "learning_rate": 2.6218553142667647e-06, + "loss": 0.9818, + "step": 5785 + }, + { + "epoch": 0.4174452581075719, + "grad_norm": 2.205151538961424, + "learning_rate": 2.6214111028328086e-06, + "loss": 0.9437, + "step": 5786 + }, + { + "epoch": 0.4175174055769994, + "grad_norm": 2.175863760953725, + "learning_rate": 2.6209668574658926e-06, + "loss": 0.8144, + "step": 5787 + }, + { + "epoch": 0.4175895530464269, + "grad_norm": 2.4035887430725738, + "learning_rate": 2.620522578190276e-06, + "loss": 0.7724, + "step": 5788 + }, + { + "epoch": 0.4176617005158544, + "grad_norm": 2.3088894056067297, + "learning_rate": 2.620078265030217e-06, + "loss": 0.9596, + "step": 5789 + }, + { + "epoch": 0.4177338479852819, + "grad_norm": 2.0744369018550146, + "learning_rate": 2.619633918009981e-06, + "loss": 0.8407, + "step": 5790 + }, + { + "epoch": 0.4178059954547094, + "grad_norm": 2.8193671420002007, + "learning_rate": 2.6191895371538296e-06, + "loss": 1.0352, + "step": 5791 + }, + { + "epoch": 0.41787814292413694, + "grad_norm": 2.5628690686429128, + "learning_rate": 2.6187451224860296e-06, + "loss": 0.9858, + "step": 5792 + }, + { + "epoch": 0.41795029039356446, + "grad_norm": 2.050419076092436, + "learning_rate": 2.61830067403085e-06, + "loss": 0.927, + "step": 5793 + }, + { + "epoch": 0.418022437862992, + "grad_norm": 2.011010854668063, + "learning_rate": 2.6178561918125582e-06, + "loss": 0.8803, + "step": 5794 + }, + { + "epoch": 0.41809458533241944, + "grad_norm": 2.462856549625496, + "learning_rate": 2.6174116758554277e-06, + "loss": 0.8391, + "step": 5795 + }, + { + "epoch": 0.41816673280184696, + "grad_norm": 2.1687400751683814, + "learning_rate": 2.6169671261837313e-06, + "loss": 0.9142, + "step": 5796 + }, + { + "epoch": 0.4182388802712745, + "grad_norm": 2.252603084733001, + "learning_rate": 2.6165225428217436e-06, + "loss": 0.9089, + "step": 5797 + }, + { + "epoch": 0.418311027740702, + "grad_norm": 3.4197029640638026, + "learning_rate": 2.6160779257937424e-06, + "loss": 0.7987, + "step": 5798 + }, + { + "epoch": 0.4183831752101295, + "grad_norm": 1.9487160319168944, + "learning_rate": 2.615633275124006e-06, + "loss": 0.9209, + "step": 5799 + }, + { + "epoch": 0.41845532267955704, + "grad_norm": 2.644288991540537, + "learning_rate": 2.615188590836816e-06, + "loss": 0.93, + "step": 5800 + }, + { + "epoch": 0.4185274701489845, + "grad_norm": 2.3798494520576776, + "learning_rate": 2.614743872956454e-06, + "loss": 1.0042, + "step": 5801 + }, + { + "epoch": 0.418599617618412, + "grad_norm": 2.4454474366224277, + "learning_rate": 2.614299121507205e-06, + "loss": 0.9684, + "step": 5802 + }, + { + "epoch": 0.41867176508783954, + "grad_norm": 3.951155220784847, + "learning_rate": 2.6138543365133554e-06, + "loss": 0.9004, + "step": 5803 + }, + { + "epoch": 0.41874391255726706, + "grad_norm": 3.629534023421306, + "learning_rate": 2.6134095179991918e-06, + "loss": 0.9168, + "step": 5804 + }, + { + "epoch": 0.4188160600266946, + "grad_norm": 1.9667856619552198, + "learning_rate": 2.612964665989006e-06, + "loss": 0.8995, + "step": 5805 + }, + { + "epoch": 0.4188882074961221, + "grad_norm": 2.376956033822429, + "learning_rate": 2.6125197805070887e-06, + "loss": 0.8588, + "step": 5806 + }, + { + "epoch": 0.41896035496554956, + "grad_norm": 2.051529462654367, + "learning_rate": 2.6120748615777333e-06, + "loss": 0.9112, + "step": 5807 + }, + { + "epoch": 0.4190325024349771, + "grad_norm": 0.712009836879028, + "learning_rate": 2.611629909225236e-06, + "loss": 0.7923, + "step": 5808 + }, + { + "epoch": 0.4191046499044046, + "grad_norm": 1.9743461395695052, + "learning_rate": 2.6111849234738935e-06, + "loss": 0.9339, + "step": 5809 + }, + { + "epoch": 0.4191767973738321, + "grad_norm": 8.578980561273182, + "learning_rate": 2.610739904348005e-06, + "loss": 0.9194, + "step": 5810 + }, + { + "epoch": 0.41924894484325964, + "grad_norm": 2.2672512137506207, + "learning_rate": 2.610294851871871e-06, + "loss": 0.8553, + "step": 5811 + }, + { + "epoch": 0.4193210923126871, + "grad_norm": 2.752546778393136, + "learning_rate": 2.6098497660697945e-06, + "loss": 0.833, + "step": 5812 + }, + { + "epoch": 0.4193932397821146, + "grad_norm": 2.449944250289955, + "learning_rate": 2.6094046469660797e-06, + "loss": 0.8335, + "step": 5813 + }, + { + "epoch": 0.41946538725154214, + "grad_norm": 3.366464630208568, + "learning_rate": 2.6089594945850327e-06, + "loss": 1.0166, + "step": 5814 + }, + { + "epoch": 0.41953753472096966, + "grad_norm": 1.6975718969251647, + "learning_rate": 2.608514308950963e-06, + "loss": 1.0465, + "step": 5815 + }, + { + "epoch": 0.4196096821903972, + "grad_norm": 1.820968378792754, + "learning_rate": 2.608069090088178e-06, + "loss": 0.9766, + "step": 5816 + }, + { + "epoch": 0.4196818296598247, + "grad_norm": 0.8437299726016686, + "learning_rate": 2.6076238380209925e-06, + "loss": 0.869, + "step": 5817 + }, + { + "epoch": 0.41975397712925216, + "grad_norm": 1.981549511722386, + "learning_rate": 2.6071785527737183e-06, + "loss": 0.9644, + "step": 5818 + }, + { + "epoch": 0.4198261245986797, + "grad_norm": 1.9625068858809456, + "learning_rate": 2.6067332343706705e-06, + "loss": 0.9904, + "step": 5819 + }, + { + "epoch": 0.4198982720681072, + "grad_norm": 2.3338928460086574, + "learning_rate": 2.606287882836167e-06, + "loss": 0.9001, + "step": 5820 + }, + { + "epoch": 0.4199704195375347, + "grad_norm": 1.9605710724505767, + "learning_rate": 2.6058424981945265e-06, + "loss": 0.7523, + "step": 5821 + }, + { + "epoch": 0.42004256700696224, + "grad_norm": 2.296598145284471, + "learning_rate": 2.60539708047007e-06, + "loss": 0.888, + "step": 5822 + }, + { + "epoch": 0.42011471447638976, + "grad_norm": 0.8497385211875201, + "learning_rate": 2.60495162968712e-06, + "loss": 0.778, + "step": 5823 + }, + { + "epoch": 0.4201868619458172, + "grad_norm": 1.9978786781791276, + "learning_rate": 2.6045061458700015e-06, + "loss": 0.9384, + "step": 5824 + }, + { + "epoch": 0.42025900941524474, + "grad_norm": 3.492480101477503, + "learning_rate": 2.6040606290430394e-06, + "loss": 0.8361, + "step": 5825 + }, + { + "epoch": 0.42033115688467226, + "grad_norm": 2.5207488200835555, + "learning_rate": 2.603615079230563e-06, + "loss": 0.9922, + "step": 5826 + }, + { + "epoch": 0.4204033043540998, + "grad_norm": 3.1048892074205234, + "learning_rate": 2.6031694964569016e-06, + "loss": 0.9143, + "step": 5827 + }, + { + "epoch": 0.4204754518235273, + "grad_norm": 1.9719056164862963, + "learning_rate": 2.602723880746387e-06, + "loss": 0.9866, + "step": 5828 + }, + { + "epoch": 0.4205475992929548, + "grad_norm": 3.2472384531184546, + "learning_rate": 2.6022782321233516e-06, + "loss": 1.0455, + "step": 5829 + }, + { + "epoch": 0.4206197467623823, + "grad_norm": 2.516895802489809, + "learning_rate": 2.6018325506121324e-06, + "loss": 0.8762, + "step": 5830 + }, + { + "epoch": 0.4206918942318098, + "grad_norm": 2.5175292585201237, + "learning_rate": 2.601386836237065e-06, + "loss": 1.0597, + "step": 5831 + }, + { + "epoch": 0.4207640417012373, + "grad_norm": 5.332231924451765, + "learning_rate": 2.6009410890224894e-06, + "loss": 0.9071, + "step": 5832 + }, + { + "epoch": 0.42083618917066484, + "grad_norm": 0.9293736160360369, + "learning_rate": 2.6004953089927445e-06, + "loss": 0.9146, + "step": 5833 + }, + { + "epoch": 0.42090833664009236, + "grad_norm": 1.664425590912287, + "learning_rate": 2.6000494961721748e-06, + "loss": 0.8273, + "step": 5834 + }, + { + "epoch": 0.4209804841095199, + "grad_norm": 2.292943483177708, + "learning_rate": 2.599603650585123e-06, + "loss": 1.0375, + "step": 5835 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 3.238470755302073, + "learning_rate": 2.5991577722559354e-06, + "loss": 0.8596, + "step": 5836 + }, + { + "epoch": 0.42112477904837486, + "grad_norm": 2.2755362664628733, + "learning_rate": 2.59871186120896e-06, + "loss": 0.951, + "step": 5837 + }, + { + "epoch": 0.4211969265178024, + "grad_norm": 3.0270521848003207, + "learning_rate": 2.598265917468546e-06, + "loss": 1.038, + "step": 5838 + }, + { + "epoch": 0.4212690739872299, + "grad_norm": 2.2040770076791847, + "learning_rate": 2.597819941059045e-06, + "loss": 0.9879, + "step": 5839 + }, + { + "epoch": 0.4213412214566574, + "grad_norm": 1.9865445869601908, + "learning_rate": 2.59737393200481e-06, + "loss": 1.0107, + "step": 5840 + }, + { + "epoch": 0.42141336892608494, + "grad_norm": 2.191127058933624, + "learning_rate": 2.596927890330196e-06, + "loss": 0.7796, + "step": 5841 + }, + { + "epoch": 0.4214855163955124, + "grad_norm": 2.8620736150224397, + "learning_rate": 2.5964818160595605e-06, + "loss": 0.8551, + "step": 5842 + }, + { + "epoch": 0.4215576638649399, + "grad_norm": 2.417049892777057, + "learning_rate": 2.5960357092172598e-06, + "loss": 0.9394, + "step": 5843 + }, + { + "epoch": 0.42162981133436744, + "grad_norm": 2.8477972385579835, + "learning_rate": 2.5955895698276566e-06, + "loss": 0.8914, + "step": 5844 + }, + { + "epoch": 0.42170195880379496, + "grad_norm": 2.8363332761237827, + "learning_rate": 2.5951433979151117e-06, + "loss": 0.9258, + "step": 5845 + }, + { + "epoch": 0.4217741062732225, + "grad_norm": 2.490487598185361, + "learning_rate": 2.5946971935039878e-06, + "loss": 0.876, + "step": 5846 + }, + { + "epoch": 0.42184625374265, + "grad_norm": 0.7464833066444597, + "learning_rate": 2.594250956618653e-06, + "loss": 0.8882, + "step": 5847 + }, + { + "epoch": 0.42191840121207747, + "grad_norm": 1.8490014371190693, + "learning_rate": 2.593804687283472e-06, + "loss": 0.979, + "step": 5848 + }, + { + "epoch": 0.421990548681505, + "grad_norm": 2.4919602819678963, + "learning_rate": 2.593358385522816e-06, + "loss": 0.9349, + "step": 5849 + }, + { + "epoch": 0.4220626961509325, + "grad_norm": 4.387465456747811, + "learning_rate": 2.5929120513610554e-06, + "loss": 0.9207, + "step": 5850 + }, + { + "epoch": 0.42213484362036, + "grad_norm": 14.586435449454186, + "learning_rate": 2.592465684822562e-06, + "loss": 0.7793, + "step": 5851 + }, + { + "epoch": 0.42220699108978754, + "grad_norm": 2.714101829968566, + "learning_rate": 2.592019285931711e-06, + "loss": 0.8885, + "step": 5852 + }, + { + "epoch": 0.42227913855921506, + "grad_norm": 7.397920584420238, + "learning_rate": 2.5915728547128785e-06, + "loss": 1.0245, + "step": 5853 + }, + { + "epoch": 0.4223512860286425, + "grad_norm": 0.8664555596645604, + "learning_rate": 2.591126391190442e-06, + "loss": 0.7733, + "step": 5854 + }, + { + "epoch": 0.42242343349807004, + "grad_norm": 2.0751445306286613, + "learning_rate": 2.5906798953887814e-06, + "loss": 0.9166, + "step": 5855 + }, + { + "epoch": 0.42249558096749756, + "grad_norm": 2.1790397529284613, + "learning_rate": 2.590233367332279e-06, + "loss": 1.0158, + "step": 5856 + }, + { + "epoch": 0.4225677284369251, + "grad_norm": 2.797322498516316, + "learning_rate": 2.589786807045318e-06, + "loss": 0.9596, + "step": 5857 + }, + { + "epoch": 0.4226398759063526, + "grad_norm": 2.7185592584457137, + "learning_rate": 2.5893402145522806e-06, + "loss": 0.8806, + "step": 5858 + }, + { + "epoch": 0.4227120233757801, + "grad_norm": 2.7807972571670034, + "learning_rate": 2.5888935898775573e-06, + "loss": 0.9498, + "step": 5859 + }, + { + "epoch": 0.4227841708452076, + "grad_norm": 3.0409905365856558, + "learning_rate": 2.5884469330455348e-06, + "loss": 0.8866, + "step": 5860 + }, + { + "epoch": 0.4228563183146351, + "grad_norm": 2.7492792745628836, + "learning_rate": 2.5880002440806037e-06, + "loss": 0.925, + "step": 5861 + }, + { + "epoch": 0.4229284657840626, + "grad_norm": 1.9461864055810674, + "learning_rate": 2.587553523007156e-06, + "loss": 0.9893, + "step": 5862 + }, + { + "epoch": 0.42300061325349014, + "grad_norm": 2.749923444896113, + "learning_rate": 2.587106769849585e-06, + "loss": 0.965, + "step": 5863 + }, + { + "epoch": 0.42307276072291766, + "grad_norm": 3.0332908445743083, + "learning_rate": 2.586659984632287e-06, + "loss": 0.8705, + "step": 5864 + }, + { + "epoch": 0.4231449081923451, + "grad_norm": 6.255385253166055, + "learning_rate": 2.586213167379659e-06, + "loss": 0.8217, + "step": 5865 + }, + { + "epoch": 0.42321705566177265, + "grad_norm": 2.838577428936607, + "learning_rate": 2.5857663181161006e-06, + "loss": 0.9535, + "step": 5866 + }, + { + "epoch": 0.42328920313120016, + "grad_norm": 0.6815160511810603, + "learning_rate": 2.5853194368660114e-06, + "loss": 0.7848, + "step": 5867 + }, + { + "epoch": 0.4233613506006277, + "grad_norm": 4.022029296950402, + "learning_rate": 2.584872523653794e-06, + "loss": 0.8787, + "step": 5868 + }, + { + "epoch": 0.4234334980700552, + "grad_norm": 2.010540011349575, + "learning_rate": 2.584425578503854e-06, + "loss": 0.8619, + "step": 5869 + }, + { + "epoch": 0.4235056455394827, + "grad_norm": 2.2565429708631313, + "learning_rate": 2.583978601440596e-06, + "loss": 0.8816, + "step": 5870 + }, + { + "epoch": 0.4235777930089102, + "grad_norm": 2.3073823335513373, + "learning_rate": 2.583531592488429e-06, + "loss": 0.949, + "step": 5871 + }, + { + "epoch": 0.4236499404783377, + "grad_norm": 2.1253314040321647, + "learning_rate": 2.5830845516717618e-06, + "loss": 0.9406, + "step": 5872 + }, + { + "epoch": 0.4237220879477652, + "grad_norm": 2.1810218801052415, + "learning_rate": 2.5826374790150054e-06, + "loss": 0.9072, + "step": 5873 + }, + { + "epoch": 0.42379423541719274, + "grad_norm": 2.200381254626218, + "learning_rate": 2.5821903745425734e-06, + "loss": 0.9754, + "step": 5874 + }, + { + "epoch": 0.42386638288662026, + "grad_norm": 2.4394371942346615, + "learning_rate": 2.5817432382788802e-06, + "loss": 0.9658, + "step": 5875 + }, + { + "epoch": 0.4239385303560478, + "grad_norm": 2.2452628813101074, + "learning_rate": 2.581296070248342e-06, + "loss": 0.9448, + "step": 5876 + }, + { + "epoch": 0.42401067782547525, + "grad_norm": 2.0671441936498014, + "learning_rate": 2.5808488704753773e-06, + "loss": 1.0283, + "step": 5877 + }, + { + "epoch": 0.42408282529490277, + "grad_norm": 3.4246978960663506, + "learning_rate": 2.5804016389844066e-06, + "loss": 0.8463, + "step": 5878 + }, + { + "epoch": 0.4241549727643303, + "grad_norm": 2.869479476831389, + "learning_rate": 2.5799543757998516e-06, + "loss": 0.8873, + "step": 5879 + }, + { + "epoch": 0.4242271202337578, + "grad_norm": 2.489239231308151, + "learning_rate": 2.579507080946133e-06, + "loss": 0.9626, + "step": 5880 + }, + { + "epoch": 0.4242992677031853, + "grad_norm": 2.08477659460237, + "learning_rate": 2.5790597544476793e-06, + "loss": 0.9856, + "step": 5881 + }, + { + "epoch": 0.42437141517261284, + "grad_norm": 2.7737250380490925, + "learning_rate": 2.5786123963289154e-06, + "loss": 0.966, + "step": 5882 + }, + { + "epoch": 0.4244435626420403, + "grad_norm": 1.9417505093374083, + "learning_rate": 2.578165006614271e-06, + "loss": 0.9555, + "step": 5883 + }, + { + "epoch": 0.4245157101114678, + "grad_norm": 2.427429229144715, + "learning_rate": 2.577717585328176e-06, + "loss": 0.8676, + "step": 5884 + }, + { + "epoch": 0.42458785758089534, + "grad_norm": 1.9408177303125433, + "learning_rate": 2.577270132495062e-06, + "loss": 0.9283, + "step": 5885 + }, + { + "epoch": 0.42466000505032286, + "grad_norm": 1.8454301581797894, + "learning_rate": 2.5768226481393632e-06, + "loss": 0.9175, + "step": 5886 + }, + { + "epoch": 0.4247321525197504, + "grad_norm": 2.460367966400622, + "learning_rate": 2.5763751322855145e-06, + "loss": 0.9162, + "step": 5887 + }, + { + "epoch": 0.4248042999891779, + "grad_norm": 2.7972474090021406, + "learning_rate": 2.575927584957954e-06, + "loss": 0.9345, + "step": 5888 + }, + { + "epoch": 0.42487644745860537, + "grad_norm": 2.489161456979264, + "learning_rate": 2.57548000618112e-06, + "loss": 0.9523, + "step": 5889 + }, + { + "epoch": 0.4249485949280329, + "grad_norm": 2.0976149911925295, + "learning_rate": 2.575032395979453e-06, + "loss": 0.9448, + "step": 5890 + }, + { + "epoch": 0.4250207423974604, + "grad_norm": 2.868126323115263, + "learning_rate": 2.5745847543773962e-06, + "loss": 0.8901, + "step": 5891 + }, + { + "epoch": 0.4250928898668879, + "grad_norm": 2.0945224760356256, + "learning_rate": 2.574137081399392e-06, + "loss": 1.0184, + "step": 5892 + }, + { + "epoch": 0.42516503733631544, + "grad_norm": 4.604422077442111, + "learning_rate": 2.573689377069888e-06, + "loss": 0.954, + "step": 5893 + }, + { + "epoch": 0.42523718480574296, + "grad_norm": 2.33417702362621, + "learning_rate": 2.573241641413331e-06, + "loss": 0.8064, + "step": 5894 + }, + { + "epoch": 0.4253093322751704, + "grad_norm": 0.7975038777080721, + "learning_rate": 2.5727938744541682e-06, + "loss": 0.7723, + "step": 5895 + }, + { + "epoch": 0.42538147974459795, + "grad_norm": 2.1470577734115457, + "learning_rate": 2.5723460762168546e-06, + "loss": 0.7896, + "step": 5896 + }, + { + "epoch": 0.42545362721402546, + "grad_norm": 2.1450808295112607, + "learning_rate": 2.5718982467258387e-06, + "loss": 0.992, + "step": 5897 + }, + { + "epoch": 0.425525774683453, + "grad_norm": 2.192058826457703, + "learning_rate": 2.5714503860055782e-06, + "loss": 0.803, + "step": 5898 + }, + { + "epoch": 0.4255979221528805, + "grad_norm": 3.240324436474422, + "learning_rate": 2.5710024940805265e-06, + "loss": 0.8999, + "step": 5899 + }, + { + "epoch": 0.425670069622308, + "grad_norm": 2.875082761153822, + "learning_rate": 2.5705545709751426e-06, + "loss": 1.045, + "step": 5900 + }, + { + "epoch": 0.4257422170917355, + "grad_norm": 2.4329993928970475, + "learning_rate": 2.570106616713886e-06, + "loss": 0.9126, + "step": 5901 + }, + { + "epoch": 0.425814364561163, + "grad_norm": 2.8996538350251524, + "learning_rate": 2.5696586313212175e-06, + "loss": 0.9861, + "step": 5902 + }, + { + "epoch": 0.4258865120305905, + "grad_norm": 1.738488414292436, + "learning_rate": 2.5692106148215996e-06, + "loss": 0.93, + "step": 5903 + }, + { + "epoch": 0.42595865950001804, + "grad_norm": 0.781800686348638, + "learning_rate": 2.5687625672394974e-06, + "loss": 0.8409, + "step": 5904 + }, + { + "epoch": 0.42603080696944556, + "grad_norm": 2.436716785289465, + "learning_rate": 2.5683144885993767e-06, + "loss": 0.9433, + "step": 5905 + }, + { + "epoch": 0.4261029544388731, + "grad_norm": 2.4423783221045285, + "learning_rate": 2.567866378925706e-06, + "loss": 0.9352, + "step": 5906 + }, + { + "epoch": 0.42617510190830055, + "grad_norm": 2.998800355743676, + "learning_rate": 2.567418238242954e-06, + "loss": 0.9754, + "step": 5907 + }, + { + "epoch": 0.42624724937772807, + "grad_norm": 2.4044477971645697, + "learning_rate": 2.5669700665755934e-06, + "loss": 1.0614, + "step": 5908 + }, + { + "epoch": 0.4263193968471556, + "grad_norm": 2.9756848100060895, + "learning_rate": 2.566521863948095e-06, + "loss": 0.8098, + "step": 5909 + }, + { + "epoch": 0.4263915443165831, + "grad_norm": 2.2935605170958655, + "learning_rate": 2.566073630384936e-06, + "loss": 1.0246, + "step": 5910 + }, + { + "epoch": 0.4264636917860106, + "grad_norm": 2.2512521438546065, + "learning_rate": 2.5656253659105914e-06, + "loss": 0.9223, + "step": 5911 + }, + { + "epoch": 0.42653583925543814, + "grad_norm": 5.137941349869349, + "learning_rate": 2.565177070549539e-06, + "loss": 0.9018, + "step": 5912 + }, + { + "epoch": 0.4266079867248656, + "grad_norm": 2.3263160795391946, + "learning_rate": 2.5647287443262587e-06, + "loss": 1.0585, + "step": 5913 + }, + { + "epoch": 0.4266801341942931, + "grad_norm": 2.5186218986403155, + "learning_rate": 2.564280387265233e-06, + "loss": 0.9075, + "step": 5914 + }, + { + "epoch": 0.42675228166372065, + "grad_norm": 2.341556794472584, + "learning_rate": 2.563831999390944e-06, + "loss": 1.0143, + "step": 5915 + }, + { + "epoch": 0.42682442913314816, + "grad_norm": 3.346636515579071, + "learning_rate": 2.5633835807278766e-06, + "loss": 0.9416, + "step": 5916 + }, + { + "epoch": 0.4268965766025757, + "grad_norm": 2.2979862353623965, + "learning_rate": 2.5629351313005172e-06, + "loss": 1.0195, + "step": 5917 + }, + { + "epoch": 0.42696872407200315, + "grad_norm": 2.2479863162795124, + "learning_rate": 2.5624866511333547e-06, + "loss": 0.9323, + "step": 5918 + }, + { + "epoch": 0.42704087154143067, + "grad_norm": 8.342645807818228, + "learning_rate": 2.562038140250878e-06, + "loss": 0.8803, + "step": 5919 + }, + { + "epoch": 0.4271130190108582, + "grad_norm": 3.8307354181663524, + "learning_rate": 2.5615895986775797e-06, + "loss": 0.9272, + "step": 5920 + }, + { + "epoch": 0.4271851664802857, + "grad_norm": 2.666733283959211, + "learning_rate": 2.5611410264379515e-06, + "loss": 0.8649, + "step": 5921 + }, + { + "epoch": 0.4272573139497132, + "grad_norm": 2.2735307287652597, + "learning_rate": 2.5606924235564897e-06, + "loss": 1.0044, + "step": 5922 + }, + { + "epoch": 0.42732946141914074, + "grad_norm": 2.0024974250538783, + "learning_rate": 2.5602437900576905e-06, + "loss": 0.8837, + "step": 5923 + }, + { + "epoch": 0.4274016088885682, + "grad_norm": 2.1561254935322904, + "learning_rate": 2.5597951259660513e-06, + "loss": 0.912, + "step": 5924 + }, + { + "epoch": 0.4274737563579957, + "grad_norm": 0.7809248056711858, + "learning_rate": 2.5593464313060726e-06, + "loss": 0.84, + "step": 5925 + }, + { + "epoch": 0.42754590382742325, + "grad_norm": 1.8642014124768058, + "learning_rate": 2.558897706102256e-06, + "loss": 1.0375, + "step": 5926 + }, + { + "epoch": 0.42761805129685077, + "grad_norm": 2.695419530540223, + "learning_rate": 2.558448950379105e-06, + "loss": 0.9678, + "step": 5927 + }, + { + "epoch": 0.4276901987662783, + "grad_norm": 2.8681584099365476, + "learning_rate": 2.5580001641611235e-06, + "loss": 1.032, + "step": 5928 + }, + { + "epoch": 0.4277623462357058, + "grad_norm": 2.3573549224776875, + "learning_rate": 2.557551347472819e-06, + "loss": 1.0078, + "step": 5929 + }, + { + "epoch": 0.42783449370513327, + "grad_norm": 3.1068245860989743, + "learning_rate": 2.5571025003386995e-06, + "loss": 0.7585, + "step": 5930 + }, + { + "epoch": 0.4279066411745608, + "grad_norm": 1.8003811671082466, + "learning_rate": 2.5566536227832744e-06, + "loss": 0.8946, + "step": 5931 + }, + { + "epoch": 0.4279787886439883, + "grad_norm": 2.0969314317457686, + "learning_rate": 2.5562047148310564e-06, + "loss": 0.8647, + "step": 5932 + }, + { + "epoch": 0.4280509361134158, + "grad_norm": 2.435184454854005, + "learning_rate": 2.5557557765065577e-06, + "loss": 0.8427, + "step": 5933 + }, + { + "epoch": 0.42812308358284334, + "grad_norm": 3.619662334226545, + "learning_rate": 2.5553068078342924e-06, + "loss": 0.843, + "step": 5934 + }, + { + "epoch": 0.42819523105227086, + "grad_norm": 8.564080976449919, + "learning_rate": 2.554857808838779e-06, + "loss": 0.8371, + "step": 5935 + }, + { + "epoch": 0.4282673785216983, + "grad_norm": 1.6038556313458987, + "learning_rate": 2.554408779544534e-06, + "loss": 0.9756, + "step": 5936 + }, + { + "epoch": 0.42833952599112585, + "grad_norm": 2.410834762549963, + "learning_rate": 2.5539597199760785e-06, + "loss": 0.9815, + "step": 5937 + }, + { + "epoch": 0.42841167346055337, + "grad_norm": 2.1668164739415254, + "learning_rate": 2.5535106301579335e-06, + "loss": 0.9809, + "step": 5938 + }, + { + "epoch": 0.4284838209299809, + "grad_norm": 2.4159472370259882, + "learning_rate": 2.5530615101146214e-06, + "loss": 0.8582, + "step": 5939 + }, + { + "epoch": 0.4285559683994084, + "grad_norm": 2.1481030013040403, + "learning_rate": 2.552612359870668e-06, + "loss": 0.9565, + "step": 5940 + }, + { + "epoch": 0.4286281158688359, + "grad_norm": 1.966922396127281, + "learning_rate": 2.552163179450599e-06, + "loss": 0.9107, + "step": 5941 + }, + { + "epoch": 0.4287002633382634, + "grad_norm": 1.8449295925776148, + "learning_rate": 2.5517139688789436e-06, + "loss": 0.8771, + "step": 5942 + }, + { + "epoch": 0.4287724108076909, + "grad_norm": 2.4175331820367596, + "learning_rate": 2.5512647281802295e-06, + "loss": 1.0129, + "step": 5943 + }, + { + "epoch": 0.4288445582771184, + "grad_norm": 2.4939937921735873, + "learning_rate": 2.5508154573789903e-06, + "loss": 0.9296, + "step": 5944 + }, + { + "epoch": 0.42891670574654595, + "grad_norm": 8.797127809838658, + "learning_rate": 2.5503661564997577e-06, + "loss": 0.9684, + "step": 5945 + }, + { + "epoch": 0.42898885321597346, + "grad_norm": 2.638996863805274, + "learning_rate": 2.5499168255670664e-06, + "loss": 0.972, + "step": 5946 + }, + { + "epoch": 0.429061000685401, + "grad_norm": 2.8461995735777696, + "learning_rate": 2.5494674646054537e-06, + "loss": 1.0259, + "step": 5947 + }, + { + "epoch": 0.42913314815482845, + "grad_norm": 9.11985403780666, + "learning_rate": 2.5490180736394555e-06, + "loss": 1.0364, + "step": 5948 + }, + { + "epoch": 0.42920529562425597, + "grad_norm": 1.8143830219108545, + "learning_rate": 2.548568652693614e-06, + "loss": 1.0805, + "step": 5949 + }, + { + "epoch": 0.4292774430936835, + "grad_norm": 0.8392691487413816, + "learning_rate": 2.5481192017924687e-06, + "loss": 0.8447, + "step": 5950 + }, + { + "epoch": 0.429349590563111, + "grad_norm": 2.015307142641267, + "learning_rate": 2.547669720960563e-06, + "loss": 0.8169, + "step": 5951 + }, + { + "epoch": 0.4294217380325385, + "grad_norm": 1.8999864828733173, + "learning_rate": 2.5472202102224408e-06, + "loss": 0.9949, + "step": 5952 + }, + { + "epoch": 0.42949388550196604, + "grad_norm": 2.0744280520945164, + "learning_rate": 2.546770669602649e-06, + "loss": 1.0498, + "step": 5953 + }, + { + "epoch": 0.4295660329713935, + "grad_norm": 2.347811219341853, + "learning_rate": 2.546321099125735e-06, + "loss": 1.0054, + "step": 5954 + }, + { + "epoch": 0.429638180440821, + "grad_norm": 2.1728038311119744, + "learning_rate": 2.5458714988162477e-06, + "loss": 1.0391, + "step": 5955 + }, + { + "epoch": 0.42971032791024855, + "grad_norm": 2.0806274261872986, + "learning_rate": 2.545421868698739e-06, + "loss": 0.9494, + "step": 5956 + }, + { + "epoch": 0.42978247537967607, + "grad_norm": 2.631937487170022, + "learning_rate": 2.5449722087977616e-06, + "loss": 0.9708, + "step": 5957 + }, + { + "epoch": 0.4298546228491036, + "grad_norm": 2.459601145408464, + "learning_rate": 2.544522519137868e-06, + "loss": 0.9221, + "step": 5958 + }, + { + "epoch": 0.4299267703185311, + "grad_norm": 2.2588572477413944, + "learning_rate": 2.544072799743617e-06, + "loss": 1.0481, + "step": 5959 + }, + { + "epoch": 0.42999891778795857, + "grad_norm": 2.341641914897532, + "learning_rate": 2.5436230506395637e-06, + "loss": 0.7931, + "step": 5960 + }, + { + "epoch": 0.4300710652573861, + "grad_norm": 1.8173041636704812, + "learning_rate": 2.543173271850268e-06, + "loss": 1.0478, + "step": 5961 + }, + { + "epoch": 0.4301432127268136, + "grad_norm": 2.208518710243056, + "learning_rate": 2.542723463400291e-06, + "loss": 0.9598, + "step": 5962 + }, + { + "epoch": 0.4302153601962411, + "grad_norm": 2.4562728026595915, + "learning_rate": 2.542273625314194e-06, + "loss": 0.9134, + "step": 5963 + }, + { + "epoch": 0.43028750766566864, + "grad_norm": 2.641631346585789, + "learning_rate": 2.541823757616542e-06, + "loss": 0.9248, + "step": 5964 + }, + { + "epoch": 0.43035965513509616, + "grad_norm": 3.266178066017765, + "learning_rate": 2.541373860331901e-06, + "loss": 0.8484, + "step": 5965 + }, + { + "epoch": 0.43043180260452363, + "grad_norm": 3.571747005435492, + "learning_rate": 2.540923933484837e-06, + "loss": 1.0033, + "step": 5966 + }, + { + "epoch": 0.43050395007395115, + "grad_norm": 24.469680398179754, + "learning_rate": 2.5404739770999194e-06, + "loss": 0.8812, + "step": 5967 + }, + { + "epoch": 0.43057609754337867, + "grad_norm": 3.6993615707976075, + "learning_rate": 2.540023991201719e-06, + "loss": 0.8992, + "step": 5968 + }, + { + "epoch": 0.4306482450128062, + "grad_norm": 2.289212479088401, + "learning_rate": 2.539573975814808e-06, + "loss": 0.9418, + "step": 5969 + }, + { + "epoch": 0.4307203924822337, + "grad_norm": 1.8745990960359404, + "learning_rate": 2.5391239309637585e-06, + "loss": 0.9282, + "step": 5970 + }, + { + "epoch": 0.43079253995166117, + "grad_norm": 1.869365236186066, + "learning_rate": 2.5386738566731476e-06, + "loss": 0.9492, + "step": 5971 + }, + { + "epoch": 0.4308646874210887, + "grad_norm": 0.7002137845421248, + "learning_rate": 2.5382237529675516e-06, + "loss": 0.7439, + "step": 5972 + }, + { + "epoch": 0.4309368348905162, + "grad_norm": 2.4296024221753005, + "learning_rate": 2.5377736198715484e-06, + "loss": 0.8218, + "step": 5973 + }, + { + "epoch": 0.4310089823599437, + "grad_norm": 2.4185705468293115, + "learning_rate": 2.537323457409719e-06, + "loss": 0.9408, + "step": 5974 + }, + { + "epoch": 0.43108112982937125, + "grad_norm": 4.646798596655119, + "learning_rate": 2.536873265606644e-06, + "loss": 0.9802, + "step": 5975 + }, + { + "epoch": 0.43115327729879876, + "grad_norm": 0.7544634247466179, + "learning_rate": 2.5364230444869077e-06, + "loss": 0.7951, + "step": 5976 + }, + { + "epoch": 0.43122542476822623, + "grad_norm": 1.7552322099308708, + "learning_rate": 2.5359727940750955e-06, + "loss": 0.9824, + "step": 5977 + }, + { + "epoch": 0.43129757223765375, + "grad_norm": 2.2929639670015063, + "learning_rate": 2.5355225143957925e-06, + "loss": 0.952, + "step": 5978 + }, + { + "epoch": 0.43136971970708127, + "grad_norm": 2.7001929284856114, + "learning_rate": 2.535072205473587e-06, + "loss": 1.0101, + "step": 5979 + }, + { + "epoch": 0.4314418671765088, + "grad_norm": 2.671891954156279, + "learning_rate": 2.5346218673330695e-06, + "loss": 0.8758, + "step": 5980 + }, + { + "epoch": 0.4315140146459363, + "grad_norm": 2.59127855328416, + "learning_rate": 2.5341714999988313e-06, + "loss": 0.8975, + "step": 5981 + }, + { + "epoch": 0.4315861621153638, + "grad_norm": 0.8712626751014664, + "learning_rate": 2.5337211034954643e-06, + "loss": 0.8262, + "step": 5982 + }, + { + "epoch": 0.4316583095847913, + "grad_norm": 1.9005718399636766, + "learning_rate": 2.5332706778475642e-06, + "loss": 0.9433, + "step": 5983 + }, + { + "epoch": 0.4317304570542188, + "grad_norm": 3.607744649626575, + "learning_rate": 2.532820223079726e-06, + "loss": 0.913, + "step": 5984 + }, + { + "epoch": 0.4318026045236463, + "grad_norm": 3.7066211533896825, + "learning_rate": 2.532369739216548e-06, + "loss": 0.9266, + "step": 5985 + }, + { + "epoch": 0.43187475199307385, + "grad_norm": 1.7238721643523718, + "learning_rate": 2.53191922628263e-06, + "loss": 0.9651, + "step": 5986 + }, + { + "epoch": 0.43194689946250137, + "grad_norm": 2.084642672759435, + "learning_rate": 2.5314686843025713e-06, + "loss": 0.9424, + "step": 5987 + }, + { + "epoch": 0.4320190469319289, + "grad_norm": 2.089743809672247, + "learning_rate": 2.5310181133009753e-06, + "loss": 0.8944, + "step": 5988 + }, + { + "epoch": 0.43209119440135635, + "grad_norm": 2.1755202186467315, + "learning_rate": 2.5305675133024464e-06, + "loss": 0.9896, + "step": 5989 + }, + { + "epoch": 0.43216334187078387, + "grad_norm": 2.0456166366932993, + "learning_rate": 2.530116884331589e-06, + "loss": 0.9883, + "step": 5990 + }, + { + "epoch": 0.4322354893402114, + "grad_norm": 2.9097831333272253, + "learning_rate": 2.529666226413011e-06, + "loss": 1.0251, + "step": 5991 + }, + { + "epoch": 0.4323076368096389, + "grad_norm": 7.374229778867957, + "learning_rate": 2.5292155395713216e-06, + "loss": 0.992, + "step": 5992 + }, + { + "epoch": 0.4323797842790664, + "grad_norm": 2.897896687932167, + "learning_rate": 2.5287648238311306e-06, + "loss": 0.9814, + "step": 5993 + }, + { + "epoch": 0.43245193174849395, + "grad_norm": 3.563511085917995, + "learning_rate": 2.5283140792170496e-06, + "loss": 0.9151, + "step": 5994 + }, + { + "epoch": 0.4325240792179214, + "grad_norm": 2.3337225475852237, + "learning_rate": 2.5278633057536926e-06, + "loss": 0.9923, + "step": 5995 + }, + { + "epoch": 0.43259622668734893, + "grad_norm": 1.6641194508437895, + "learning_rate": 2.5274125034656753e-06, + "loss": 0.9954, + "step": 5996 + }, + { + "epoch": 0.43266837415677645, + "grad_norm": 2.102773664810053, + "learning_rate": 2.526961672377612e-06, + "loss": 0.9778, + "step": 5997 + }, + { + "epoch": 0.43274052162620397, + "grad_norm": 1.9929171553647111, + "learning_rate": 2.5265108125141237e-06, + "loss": 0.9751, + "step": 5998 + }, + { + "epoch": 0.4328126690956315, + "grad_norm": 1.6689733120091317, + "learning_rate": 2.5260599238998296e-06, + "loss": 0.9742, + "step": 5999 + }, + { + "epoch": 0.432884816565059, + "grad_norm": 2.7431810106220396, + "learning_rate": 2.5256090065593487e-06, + "loss": 0.9193, + "step": 6000 + }, + { + "epoch": 0.43295696403448647, + "grad_norm": 1.7539298346957928, + "learning_rate": 2.5251580605173075e-06, + "loss": 0.9652, + "step": 6001 + }, + { + "epoch": 0.433029111503914, + "grad_norm": 22.38730594075929, + "learning_rate": 2.5247070857983276e-06, + "loss": 0.8432, + "step": 6002 + }, + { + "epoch": 0.4331012589733415, + "grad_norm": 2.3535742788010197, + "learning_rate": 2.524256082427036e-06, + "loss": 0.9841, + "step": 6003 + }, + { + "epoch": 0.433173406442769, + "grad_norm": 2.6213893581559424, + "learning_rate": 2.5238050504280614e-06, + "loss": 0.9197, + "step": 6004 + }, + { + "epoch": 0.43324555391219655, + "grad_norm": 2.3595130324249336, + "learning_rate": 2.5233539898260313e-06, + "loss": 0.9674, + "step": 6005 + }, + { + "epoch": 0.43331770138162407, + "grad_norm": 2.2976155028091143, + "learning_rate": 2.522902900645577e-06, + "loss": 0.9687, + "step": 6006 + }, + { + "epoch": 0.43338984885105153, + "grad_norm": 2.036806107766762, + "learning_rate": 2.5224517829113314e-06, + "loss": 0.9832, + "step": 6007 + }, + { + "epoch": 0.43346199632047905, + "grad_norm": 2.3231634741617824, + "learning_rate": 2.522000636647929e-06, + "loss": 0.9563, + "step": 6008 + }, + { + "epoch": 0.43353414378990657, + "grad_norm": 2.164766823631543, + "learning_rate": 2.5215494618800025e-06, + "loss": 0.9344, + "step": 6009 + }, + { + "epoch": 0.4336062912593341, + "grad_norm": 2.8009004235018864, + "learning_rate": 2.5210982586321912e-06, + "loss": 1.0193, + "step": 6010 + }, + { + "epoch": 0.4336784387287616, + "grad_norm": 2.135871677458831, + "learning_rate": 2.520647026929134e-06, + "loss": 0.8832, + "step": 6011 + }, + { + "epoch": 0.4337505861981891, + "grad_norm": 3.8041108181973966, + "learning_rate": 2.5201957667954687e-06, + "loss": 0.92, + "step": 6012 + }, + { + "epoch": 0.4338227336676166, + "grad_norm": 2.4428031160121426, + "learning_rate": 2.5197444782558387e-06, + "loss": 0.971, + "step": 6013 + }, + { + "epoch": 0.4338948811370441, + "grad_norm": 2.5595080889517945, + "learning_rate": 2.519293161334887e-06, + "loss": 0.9551, + "step": 6014 + }, + { + "epoch": 0.4339670286064716, + "grad_norm": 2.451753366503228, + "learning_rate": 2.5188418160572577e-06, + "loss": 0.978, + "step": 6015 + }, + { + "epoch": 0.43403917607589915, + "grad_norm": 2.1785096845431267, + "learning_rate": 2.518390442447598e-06, + "loss": 0.9742, + "step": 6016 + }, + { + "epoch": 0.43411132354532667, + "grad_norm": 1.900504850775297, + "learning_rate": 2.517939040530555e-06, + "loss": 0.9677, + "step": 6017 + }, + { + "epoch": 0.43418347101475413, + "grad_norm": 3.3660066662709345, + "learning_rate": 2.5174876103307784e-06, + "loss": 0.9235, + "step": 6018 + }, + { + "epoch": 0.43425561848418165, + "grad_norm": 2.0579027901945177, + "learning_rate": 2.5170361518729184e-06, + "loss": 0.942, + "step": 6019 + }, + { + "epoch": 0.43432776595360917, + "grad_norm": 1.8756711394479988, + "learning_rate": 2.5165846651816296e-06, + "loss": 0.9813, + "step": 6020 + }, + { + "epoch": 0.4343999134230367, + "grad_norm": 2.4791421541770604, + "learning_rate": 2.5161331502815636e-06, + "loss": 0.9407, + "step": 6021 + }, + { + "epoch": 0.4344720608924642, + "grad_norm": 2.3429744200146567, + "learning_rate": 2.515681607197377e-06, + "loss": 0.8824, + "step": 6022 + }, + { + "epoch": 0.4345442083618917, + "grad_norm": 2.0592469657494914, + "learning_rate": 2.515230035953727e-06, + "loss": 0.8791, + "step": 6023 + }, + { + "epoch": 0.4346163558313192, + "grad_norm": 2.4063725626063595, + "learning_rate": 2.5147784365752715e-06, + "loss": 0.9115, + "step": 6024 + }, + { + "epoch": 0.4346885033007467, + "grad_norm": 5.183620426940115, + "learning_rate": 2.5143268090866718e-06, + "loss": 0.9153, + "step": 6025 + }, + { + "epoch": 0.43476065077017423, + "grad_norm": 2.2425651630983183, + "learning_rate": 2.5138751535125898e-06, + "loss": 0.9303, + "step": 6026 + }, + { + "epoch": 0.43483279823960175, + "grad_norm": 2.8602003563546403, + "learning_rate": 2.5134234698776865e-06, + "loss": 0.8961, + "step": 6027 + }, + { + "epoch": 0.43490494570902927, + "grad_norm": 0.8416634296757862, + "learning_rate": 2.5129717582066288e-06, + "loss": 0.862, + "step": 6028 + }, + { + "epoch": 0.4349770931784568, + "grad_norm": 3.074047505554566, + "learning_rate": 2.5125200185240822e-06, + "loss": 0.9302, + "step": 6029 + }, + { + "epoch": 0.43504924064788425, + "grad_norm": 2.565601611756787, + "learning_rate": 2.5120682508547146e-06, + "loss": 1.0367, + "step": 6030 + }, + { + "epoch": 0.43512138811731177, + "grad_norm": 2.770756933167638, + "learning_rate": 2.511616455223195e-06, + "loss": 0.8467, + "step": 6031 + }, + { + "epoch": 0.4351935355867393, + "grad_norm": 2.601950126844373, + "learning_rate": 2.511164631654196e-06, + "loss": 0.9111, + "step": 6032 + }, + { + "epoch": 0.4352656830561668, + "grad_norm": 2.3292703563377564, + "learning_rate": 2.510712780172388e-06, + "loss": 0.8198, + "step": 6033 + }, + { + "epoch": 0.4353378305255943, + "grad_norm": 2.932880584137886, + "learning_rate": 2.5102609008024455e-06, + "loss": 0.8021, + "step": 6034 + }, + { + "epoch": 0.43540997799502185, + "grad_norm": 2.4155117980567833, + "learning_rate": 2.5098089935690444e-06, + "loss": 0.9998, + "step": 6035 + }, + { + "epoch": 0.4354821254644493, + "grad_norm": 1.9453069234867673, + "learning_rate": 2.509357058496861e-06, + "loss": 0.9486, + "step": 6036 + }, + { + "epoch": 0.43555427293387683, + "grad_norm": 2.814301740187174, + "learning_rate": 2.508905095610575e-06, + "loss": 0.9471, + "step": 6037 + }, + { + "epoch": 0.43562642040330435, + "grad_norm": 2.7380113237368344, + "learning_rate": 2.5084531049348655e-06, + "loss": 0.9202, + "step": 6038 + }, + { + "epoch": 0.43569856787273187, + "grad_norm": 4.466007271295651, + "learning_rate": 2.508001086494413e-06, + "loss": 0.8826, + "step": 6039 + }, + { + "epoch": 0.4357707153421594, + "grad_norm": 2.744458945060583, + "learning_rate": 2.507549040313903e-06, + "loss": 0.8659, + "step": 6040 + }, + { + "epoch": 0.4358428628115869, + "grad_norm": 2.5811181466553226, + "learning_rate": 2.5070969664180183e-06, + "loss": 1.0152, + "step": 6041 + }, + { + "epoch": 0.43591501028101437, + "grad_norm": 3.028237330318638, + "learning_rate": 2.506644864831445e-06, + "loss": 0.9483, + "step": 6042 + }, + { + "epoch": 0.4359871577504419, + "grad_norm": 1.863742859159589, + "learning_rate": 2.506192735578872e-06, + "loss": 0.9853, + "step": 6043 + }, + { + "epoch": 0.4360593052198694, + "grad_norm": 2.685086519491873, + "learning_rate": 2.505740578684987e-06, + "loss": 0.9926, + "step": 6044 + }, + { + "epoch": 0.43613145268929693, + "grad_norm": 2.370156720448267, + "learning_rate": 2.5052883941744816e-06, + "loss": 0.9413, + "step": 6045 + }, + { + "epoch": 0.43620360015872445, + "grad_norm": 2.6771757849575795, + "learning_rate": 2.5048361820720472e-06, + "loss": 1.0131, + "step": 6046 + }, + { + "epoch": 0.43627574762815197, + "grad_norm": 1.988538264542619, + "learning_rate": 2.504383942402378e-06, + "loss": 0.8708, + "step": 6047 + }, + { + "epoch": 0.43634789509757943, + "grad_norm": 2.593703304008312, + "learning_rate": 2.5039316751901687e-06, + "loss": 0.9368, + "step": 6048 + }, + { + "epoch": 0.43642004256700695, + "grad_norm": 2.223760615425869, + "learning_rate": 2.5034793804601163e-06, + "loss": 1.0293, + "step": 6049 + }, + { + "epoch": 0.43649219003643447, + "grad_norm": 2.0746042356748853, + "learning_rate": 2.5030270582369186e-06, + "loss": 1.0012, + "step": 6050 + }, + { + "epoch": 0.436564337505862, + "grad_norm": 1.8475918587237061, + "learning_rate": 2.502574708545275e-06, + "loss": 0.9509, + "step": 6051 + }, + { + "epoch": 0.4366364849752895, + "grad_norm": 2.450782775392018, + "learning_rate": 2.5021223314098877e-06, + "loss": 0.9806, + "step": 6052 + }, + { + "epoch": 0.436708632444717, + "grad_norm": 4.37867974436484, + "learning_rate": 2.5016699268554585e-06, + "loss": 0.9681, + "step": 6053 + }, + { + "epoch": 0.4367807799141445, + "grad_norm": 1.7957146256489112, + "learning_rate": 2.5012174949066916e-06, + "loss": 0.9645, + "step": 6054 + }, + { + "epoch": 0.436852927383572, + "grad_norm": 2.134160548505115, + "learning_rate": 2.500765035588293e-06, + "loss": 0.8793, + "step": 6055 + }, + { + "epoch": 0.43692507485299953, + "grad_norm": 2.499659324322196, + "learning_rate": 2.500312548924969e-06, + "loss": 0.8361, + "step": 6056 + }, + { + "epoch": 0.43699722232242705, + "grad_norm": 2.558019116351866, + "learning_rate": 2.4998600349414293e-06, + "loss": 0.9105, + "step": 6057 + }, + { + "epoch": 0.43706936979185457, + "grad_norm": 2.2976889692595006, + "learning_rate": 2.499407493662383e-06, + "loss": 0.8605, + "step": 6058 + }, + { + "epoch": 0.4371415172612821, + "grad_norm": 2.7167396566808932, + "learning_rate": 2.4989549251125427e-06, + "loss": 0.9539, + "step": 6059 + }, + { + "epoch": 0.43721366473070955, + "grad_norm": 11.464055818324685, + "learning_rate": 2.498502329316621e-06, + "loss": 0.9568, + "step": 6060 + }, + { + "epoch": 0.43728581220013707, + "grad_norm": 2.531263563331717, + "learning_rate": 2.4980497062993323e-06, + "loss": 0.9852, + "step": 6061 + }, + { + "epoch": 0.4373579596695646, + "grad_norm": 3.196288614946119, + "learning_rate": 2.4975970560853936e-06, + "loss": 0.9144, + "step": 6062 + }, + { + "epoch": 0.4374301071389921, + "grad_norm": 3.673451835834758, + "learning_rate": 2.49714437869952e-06, + "loss": 0.8347, + "step": 6063 + }, + { + "epoch": 0.4375022546084196, + "grad_norm": 2.278620210049386, + "learning_rate": 2.4966916741664338e-06, + "loss": 0.892, + "step": 6064 + }, + { + "epoch": 0.43757440207784715, + "grad_norm": 2.966003110877558, + "learning_rate": 2.4962389425108536e-06, + "loss": 0.9232, + "step": 6065 + }, + { + "epoch": 0.4376465495472746, + "grad_norm": 2.0933375165780097, + "learning_rate": 2.495786183757501e-06, + "loss": 0.8215, + "step": 6066 + }, + { + "epoch": 0.43771869701670213, + "grad_norm": 2.10062986192308, + "learning_rate": 2.4953333979311014e-06, + "loss": 0.9066, + "step": 6067 + }, + { + "epoch": 0.43779084448612965, + "grad_norm": 4.059958029469, + "learning_rate": 2.4948805850563775e-06, + "loss": 0.9903, + "step": 6068 + }, + { + "epoch": 0.43786299195555717, + "grad_norm": 2.27282832527919, + "learning_rate": 2.4944277451580576e-06, + "loss": 0.929, + "step": 6069 + }, + { + "epoch": 0.4379351394249847, + "grad_norm": 3.0535101219148153, + "learning_rate": 2.4939748782608687e-06, + "loss": 0.8694, + "step": 6070 + }, + { + "epoch": 0.43800728689441215, + "grad_norm": 4.878821415770659, + "learning_rate": 2.4935219843895403e-06, + "loss": 0.8686, + "step": 6071 + }, + { + "epoch": 0.43807943436383967, + "grad_norm": 3.0063125635564516, + "learning_rate": 2.4930690635688034e-06, + "loss": 0.9327, + "step": 6072 + }, + { + "epoch": 0.4381515818332672, + "grad_norm": 3.3068390463945727, + "learning_rate": 2.49261611582339e-06, + "loss": 0.9853, + "step": 6073 + }, + { + "epoch": 0.4382237293026947, + "grad_norm": 1.677128532542989, + "learning_rate": 2.492163141178034e-06, + "loss": 0.9673, + "step": 6074 + }, + { + "epoch": 0.43829587677212223, + "grad_norm": 2.9768367450762123, + "learning_rate": 2.4917101396574706e-06, + "loss": 0.8784, + "step": 6075 + }, + { + "epoch": 0.43836802424154975, + "grad_norm": 2.0412750517317626, + "learning_rate": 2.4912571112864375e-06, + "loss": 1.0559, + "step": 6076 + }, + { + "epoch": 0.4384401717109772, + "grad_norm": 2.786073680722181, + "learning_rate": 2.4908040560896722e-06, + "loss": 0.9594, + "step": 6077 + }, + { + "epoch": 0.43851231918040473, + "grad_norm": 15.587904207056168, + "learning_rate": 2.490350974091913e-06, + "loss": 0.9743, + "step": 6078 + }, + { + "epoch": 0.43858446664983225, + "grad_norm": 3.1356821211838906, + "learning_rate": 2.4898978653179036e-06, + "loss": 0.8929, + "step": 6079 + }, + { + "epoch": 0.43865661411925977, + "grad_norm": 2.565329036763903, + "learning_rate": 2.4894447297923843e-06, + "loss": 1.0025, + "step": 6080 + }, + { + "epoch": 0.4387287615886873, + "grad_norm": 2.0399801593638465, + "learning_rate": 2.488991567540101e-06, + "loss": 0.933, + "step": 6081 + }, + { + "epoch": 0.4388009090581148, + "grad_norm": 3.126013934630944, + "learning_rate": 2.4885383785857983e-06, + "loss": 0.9361, + "step": 6082 + }, + { + "epoch": 0.43887305652754227, + "grad_norm": 1.9503591255686643, + "learning_rate": 2.488085162954223e-06, + "loss": 0.9051, + "step": 6083 + }, + { + "epoch": 0.4389452039969698, + "grad_norm": 2.770511857124956, + "learning_rate": 2.4876319206701236e-06, + "loss": 1.0308, + "step": 6084 + }, + { + "epoch": 0.4390173514663973, + "grad_norm": 4.847613362899532, + "learning_rate": 2.48717865175825e-06, + "loss": 0.9948, + "step": 6085 + }, + { + "epoch": 0.43908949893582483, + "grad_norm": 3.712568771804231, + "learning_rate": 2.4867253562433546e-06, + "loss": 0.8888, + "step": 6086 + }, + { + "epoch": 0.43916164640525235, + "grad_norm": 2.275855073474628, + "learning_rate": 2.4862720341501893e-06, + "loss": 0.8533, + "step": 6087 + }, + { + "epoch": 0.43923379387467987, + "grad_norm": 2.6077890659054193, + "learning_rate": 2.485818685503507e-06, + "loss": 0.898, + "step": 6088 + }, + { + "epoch": 0.43930594134410733, + "grad_norm": 2.102481796966025, + "learning_rate": 2.485365310328066e-06, + "loss": 0.8775, + "step": 6089 + }, + { + "epoch": 0.43937808881353485, + "grad_norm": 2.0399510577819635, + "learning_rate": 2.484911908648621e-06, + "loss": 0.9649, + "step": 6090 + }, + { + "epoch": 0.43945023628296237, + "grad_norm": 2.3302971864033197, + "learning_rate": 2.4844584804899326e-06, + "loss": 0.8833, + "step": 6091 + }, + { + "epoch": 0.4395223837523899, + "grad_norm": 2.2313707335048107, + "learning_rate": 2.48400502587676e-06, + "loss": 0.8821, + "step": 6092 + }, + { + "epoch": 0.4395945312218174, + "grad_norm": 0.7568125862703078, + "learning_rate": 2.4835515448338644e-06, + "loss": 0.8468, + "step": 6093 + }, + { + "epoch": 0.4396666786912449, + "grad_norm": 2.359171372272735, + "learning_rate": 2.4830980373860094e-06, + "loss": 0.882, + "step": 6094 + }, + { + "epoch": 0.4397388261606724, + "grad_norm": 2.658220794192975, + "learning_rate": 2.4826445035579583e-06, + "loss": 0.82, + "step": 6095 + }, + { + "epoch": 0.4398109736300999, + "grad_norm": 2.049927742777865, + "learning_rate": 2.482190943374478e-06, + "loss": 0.9765, + "step": 6096 + }, + { + "epoch": 0.43988312109952743, + "grad_norm": 2.3330820039087548, + "learning_rate": 2.4817373568603346e-06, + "loss": 0.8696, + "step": 6097 + }, + { + "epoch": 0.43995526856895495, + "grad_norm": 2.192953342211024, + "learning_rate": 2.481283744040299e-06, + "loss": 0.9968, + "step": 6098 + }, + { + "epoch": 0.44002741603838247, + "grad_norm": 2.784652942695191, + "learning_rate": 2.480830104939139e-06, + "loss": 1.0239, + "step": 6099 + }, + { + "epoch": 0.44009956350781, + "grad_norm": 3.224373063058968, + "learning_rate": 2.4803764395816264e-06, + "loss": 0.9452, + "step": 6100 + }, + { + "epoch": 0.44017171097723745, + "grad_norm": 6.10324374394453, + "learning_rate": 2.479922747992535e-06, + "loss": 1.0364, + "step": 6101 + }, + { + "epoch": 0.44024385844666497, + "grad_norm": 1.9936817623885679, + "learning_rate": 2.479469030196639e-06, + "loss": 0.8387, + "step": 6102 + }, + { + "epoch": 0.4403160059160925, + "grad_norm": 2.5116865710723006, + "learning_rate": 2.4790152862187137e-06, + "loss": 0.9491, + "step": 6103 + }, + { + "epoch": 0.44038815338552, + "grad_norm": 2.351323822176741, + "learning_rate": 2.4785615160835382e-06, + "loss": 0.969, + "step": 6104 + }, + { + "epoch": 0.44046030085494753, + "grad_norm": 2.470329744977313, + "learning_rate": 2.4781077198158886e-06, + "loss": 0.9878, + "step": 6105 + }, + { + "epoch": 0.44053244832437505, + "grad_norm": 0.7662783968145737, + "learning_rate": 2.4776538974405473e-06, + "loss": 0.7225, + "step": 6106 + }, + { + "epoch": 0.4406045957938025, + "grad_norm": 2.435028094339149, + "learning_rate": 2.477200048982294e-06, + "loss": 0.9434, + "step": 6107 + }, + { + "epoch": 0.44067674326323003, + "grad_norm": 3.186537952723223, + "learning_rate": 2.476746174465913e-06, + "loss": 0.9795, + "step": 6108 + }, + { + "epoch": 0.44074889073265755, + "grad_norm": 0.7229470157757196, + "learning_rate": 2.4762922739161883e-06, + "loss": 0.7965, + "step": 6109 + }, + { + "epoch": 0.44082103820208507, + "grad_norm": 2.9468732589117836, + "learning_rate": 2.4758383473579053e-06, + "loss": 0.9064, + "step": 6110 + }, + { + "epoch": 0.4408931856715126, + "grad_norm": 7.534904891073895, + "learning_rate": 2.475384394815852e-06, + "loss": 0.9497, + "step": 6111 + }, + { + "epoch": 0.4409653331409401, + "grad_norm": 3.1897501763365366, + "learning_rate": 2.4749304163148168e-06, + "loss": 0.9824, + "step": 6112 + }, + { + "epoch": 0.44103748061036757, + "grad_norm": 3.1922049112540645, + "learning_rate": 2.4744764118795894e-06, + "loss": 0.9684, + "step": 6113 + }, + { + "epoch": 0.4411096280797951, + "grad_norm": 2.2253048205795602, + "learning_rate": 2.4740223815349616e-06, + "loss": 0.9398, + "step": 6114 + }, + { + "epoch": 0.4411817755492226, + "grad_norm": 3.2064125123954335, + "learning_rate": 2.4735683253057265e-06, + "loss": 0.8787, + "step": 6115 + }, + { + "epoch": 0.44125392301865013, + "grad_norm": 2.1999161530902582, + "learning_rate": 2.4731142432166784e-06, + "loss": 0.8272, + "step": 6116 + }, + { + "epoch": 0.44132607048807765, + "grad_norm": 1.8624917202963902, + "learning_rate": 2.4726601352926126e-06, + "loss": 0.8777, + "step": 6117 + }, + { + "epoch": 0.44139821795750517, + "grad_norm": 2.3457835213051745, + "learning_rate": 2.4722060015583267e-06, + "loss": 0.9832, + "step": 6118 + }, + { + "epoch": 0.44147036542693263, + "grad_norm": 2.2983083599228498, + "learning_rate": 2.471751842038619e-06, + "loss": 0.9543, + "step": 6119 + }, + { + "epoch": 0.44154251289636015, + "grad_norm": 2.589216936071795, + "learning_rate": 2.4712976567582892e-06, + "loss": 0.926, + "step": 6120 + }, + { + "epoch": 0.44161466036578767, + "grad_norm": 2.277457756211813, + "learning_rate": 2.4708434457421403e-06, + "loss": 0.9049, + "step": 6121 + }, + { + "epoch": 0.4416868078352152, + "grad_norm": 2.7221885071394487, + "learning_rate": 2.470389209014973e-06, + "loss": 0.9435, + "step": 6122 + }, + { + "epoch": 0.4417589553046427, + "grad_norm": 2.501960271963592, + "learning_rate": 2.469934946601592e-06, + "loss": 0.8138, + "step": 6123 + }, + { + "epoch": 0.4418311027740702, + "grad_norm": 1.6072143569058204, + "learning_rate": 2.469480658526803e-06, + "loss": 0.9231, + "step": 6124 + }, + { + "epoch": 0.4419032502434977, + "grad_norm": 3.123082908055106, + "learning_rate": 2.4690263448154143e-06, + "loss": 0.8701, + "step": 6125 + }, + { + "epoch": 0.4419753977129252, + "grad_norm": 1.5490439265840128, + "learning_rate": 2.468572005492233e-06, + "loss": 0.8712, + "step": 6126 + }, + { + "epoch": 0.44204754518235273, + "grad_norm": 2.0993361468295766, + "learning_rate": 2.468117640582068e-06, + "loss": 0.9508, + "step": 6127 + }, + { + "epoch": 0.44211969265178025, + "grad_norm": 2.7426297518586114, + "learning_rate": 2.4676632501097327e-06, + "loss": 0.8274, + "step": 6128 + }, + { + "epoch": 0.44219184012120777, + "grad_norm": 2.7218501513602766, + "learning_rate": 2.467208834100037e-06, + "loss": 0.8595, + "step": 6129 + }, + { + "epoch": 0.44226398759063523, + "grad_norm": 5.041237910090903, + "learning_rate": 2.466754392577798e-06, + "loss": 0.9502, + "step": 6130 + }, + { + "epoch": 0.44233613506006275, + "grad_norm": 0.7570568131578329, + "learning_rate": 2.4662999255678298e-06, + "loss": 0.7958, + "step": 6131 + }, + { + "epoch": 0.44240828252949027, + "grad_norm": 2.498619747612261, + "learning_rate": 2.465845433094947e-06, + "loss": 0.95, + "step": 6132 + }, + { + "epoch": 0.4424804299989178, + "grad_norm": 2.852355010805285, + "learning_rate": 2.4653909151839712e-06, + "loss": 1.0556, + "step": 6133 + }, + { + "epoch": 0.4425525774683453, + "grad_norm": 2.5140487755874417, + "learning_rate": 2.4649363718597196e-06, + "loss": 0.88, + "step": 6134 + }, + { + "epoch": 0.44262472493777283, + "grad_norm": 5.131811128478329, + "learning_rate": 2.4644818031470136e-06, + "loss": 1.0086, + "step": 6135 + }, + { + "epoch": 0.4426968724072003, + "grad_norm": 2.2094140077678848, + "learning_rate": 2.4640272090706764e-06, + "loss": 0.9124, + "step": 6136 + }, + { + "epoch": 0.4427690198766278, + "grad_norm": 1.70805576054322, + "learning_rate": 2.463572589655531e-06, + "loss": 0.9797, + "step": 6137 + }, + { + "epoch": 0.44284116734605533, + "grad_norm": 2.7739977633378543, + "learning_rate": 2.463117944926403e-06, + "loss": 0.9235, + "step": 6138 + }, + { + "epoch": 0.44291331481548285, + "grad_norm": 3.6483585586012706, + "learning_rate": 2.4626632749081177e-06, + "loss": 0.839, + "step": 6139 + }, + { + "epoch": 0.44298546228491037, + "grad_norm": 2.073677753939859, + "learning_rate": 2.462208579625504e-06, + "loss": 0.9629, + "step": 6140 + }, + { + "epoch": 0.4430576097543379, + "grad_norm": 2.4585234897151587, + "learning_rate": 2.46175385910339e-06, + "loss": 0.9197, + "step": 6141 + }, + { + "epoch": 0.44312975722376535, + "grad_norm": 1.834980946787525, + "learning_rate": 2.461299113366608e-06, + "loss": 0.8337, + "step": 6142 + }, + { + "epoch": 0.44320190469319287, + "grad_norm": 2.621002877901514, + "learning_rate": 2.4608443424399895e-06, + "loss": 0.8844, + "step": 6143 + }, + { + "epoch": 0.4432740521626204, + "grad_norm": 0.9372759551318213, + "learning_rate": 2.460389546348366e-06, + "loss": 0.8584, + "step": 6144 + }, + { + "epoch": 0.4433461996320479, + "grad_norm": 2.4938614822747147, + "learning_rate": 2.4599347251165746e-06, + "loss": 0.9396, + "step": 6145 + }, + { + "epoch": 0.44341834710147543, + "grad_norm": 2.0394439925022376, + "learning_rate": 2.4594798787694502e-06, + "loss": 0.9332, + "step": 6146 + }, + { + "epoch": 0.44349049457090295, + "grad_norm": 2.451841565326439, + "learning_rate": 2.459025007331831e-06, + "loss": 1.0525, + "step": 6147 + }, + { + "epoch": 0.4435626420403304, + "grad_norm": 2.1466415389389066, + "learning_rate": 2.4585701108285556e-06, + "loss": 0.9736, + "step": 6148 + }, + { + "epoch": 0.44363478950975793, + "grad_norm": 2.3140576246005464, + "learning_rate": 2.4581151892844635e-06, + "loss": 1.1012, + "step": 6149 + }, + { + "epoch": 0.44370693697918545, + "grad_norm": 0.8462704348104266, + "learning_rate": 2.4576602427243967e-06, + "loss": 0.7683, + "step": 6150 + }, + { + "epoch": 0.44377908444861297, + "grad_norm": 2.000993005285663, + "learning_rate": 2.457205271173198e-06, + "loss": 0.9418, + "step": 6151 + }, + { + "epoch": 0.4438512319180405, + "grad_norm": 0.797384622712235, + "learning_rate": 2.4567502746557125e-06, + "loss": 0.8382, + "step": 6152 + }, + { + "epoch": 0.443923379387468, + "grad_norm": 2.61598492223145, + "learning_rate": 2.4562952531967854e-06, + "loss": 0.8776, + "step": 6153 + }, + { + "epoch": 0.4439955268568955, + "grad_norm": 1.9631343861003006, + "learning_rate": 2.4558402068212635e-06, + "loss": 0.9014, + "step": 6154 + }, + { + "epoch": 0.444067674326323, + "grad_norm": 2.0022842952074087, + "learning_rate": 2.4553851355539953e-06, + "loss": 0.8163, + "step": 6155 + }, + { + "epoch": 0.4441398217957505, + "grad_norm": 2.569534787030336, + "learning_rate": 2.45493003941983e-06, + "loss": 0.8816, + "step": 6156 + }, + { + "epoch": 0.44421196926517803, + "grad_norm": 18.698793194839055, + "learning_rate": 2.4544749184436206e-06, + "loss": 0.9376, + "step": 6157 + }, + { + "epoch": 0.44428411673460555, + "grad_norm": 0.7771658765778059, + "learning_rate": 2.4540197726502175e-06, + "loss": 0.8011, + "step": 6158 + }, + { + "epoch": 0.44435626420403307, + "grad_norm": 3.8170795139648552, + "learning_rate": 2.4535646020644755e-06, + "loss": 0.9422, + "step": 6159 + }, + { + "epoch": 0.44442841167346053, + "grad_norm": 2.3583719508735244, + "learning_rate": 2.45310940671125e-06, + "loss": 0.8823, + "step": 6160 + }, + { + "epoch": 0.44450055914288805, + "grad_norm": 0.880536208322966, + "learning_rate": 2.452654186615397e-06, + "loss": 0.8618, + "step": 6161 + }, + { + "epoch": 0.44457270661231557, + "grad_norm": 2.491222421940404, + "learning_rate": 2.452198941801774e-06, + "loss": 0.9669, + "step": 6162 + }, + { + "epoch": 0.4446448540817431, + "grad_norm": 2.194933223881721, + "learning_rate": 2.4517436722952407e-06, + "loss": 0.8649, + "step": 6163 + }, + { + "epoch": 0.4447170015511706, + "grad_norm": 2.4417247839077034, + "learning_rate": 2.4512883781206587e-06, + "loss": 0.9389, + "step": 6164 + }, + { + "epoch": 0.44478914902059813, + "grad_norm": 1.0104144905827217, + "learning_rate": 2.4508330593028885e-06, + "loss": 0.8201, + "step": 6165 + }, + { + "epoch": 0.4448612964900256, + "grad_norm": 2.819835929020598, + "learning_rate": 2.4503777158667934e-06, + "loss": 1.0036, + "step": 6166 + }, + { + "epoch": 0.4449334439594531, + "grad_norm": 6.378861099522544, + "learning_rate": 2.4499223478372394e-06, + "loss": 0.9107, + "step": 6167 + }, + { + "epoch": 0.44500559142888063, + "grad_norm": 0.64719181595362, + "learning_rate": 2.4494669552390904e-06, + "loss": 0.7411, + "step": 6168 + }, + { + "epoch": 0.44507773889830815, + "grad_norm": 2.7560208041509044, + "learning_rate": 2.4490115380972156e-06, + "loss": 0.9771, + "step": 6169 + }, + { + "epoch": 0.44514988636773567, + "grad_norm": 2.5596985739336953, + "learning_rate": 2.4485560964364834e-06, + "loss": 0.9498, + "step": 6170 + }, + { + "epoch": 0.4452220338371632, + "grad_norm": 3.5411137916015636, + "learning_rate": 2.448100630281762e-06, + "loss": 0.9415, + "step": 6171 + }, + { + "epoch": 0.44529418130659065, + "grad_norm": 3.265084253833482, + "learning_rate": 2.447645139657925e-06, + "loss": 0.8833, + "step": 6172 + }, + { + "epoch": 0.44536632877601817, + "grad_norm": 2.523993746742204, + "learning_rate": 2.4471896245898438e-06, + "loss": 0.8373, + "step": 6173 + }, + { + "epoch": 0.4454384762454457, + "grad_norm": 3.3106724718063205, + "learning_rate": 2.446734085102393e-06, + "loss": 1.0102, + "step": 6174 + }, + { + "epoch": 0.4455106237148732, + "grad_norm": 0.7129359400730385, + "learning_rate": 2.4462785212204468e-06, + "loss": 0.7922, + "step": 6175 + }, + { + "epoch": 0.44558277118430073, + "grad_norm": 2.988086888593466, + "learning_rate": 2.4458229329688826e-06, + "loss": 0.8516, + "step": 6176 + }, + { + "epoch": 0.4456549186537282, + "grad_norm": 2.684542869852233, + "learning_rate": 2.4453673203725793e-06, + "loss": 0.9109, + "step": 6177 + }, + { + "epoch": 0.4457270661231557, + "grad_norm": 2.8044991177594745, + "learning_rate": 2.4449116834564138e-06, + "loss": 0.8113, + "step": 6178 + }, + { + "epoch": 0.44579921359258323, + "grad_norm": 2.3380000472056546, + "learning_rate": 2.44445602224527e-06, + "loss": 0.8852, + "step": 6179 + }, + { + "epoch": 0.44587136106201075, + "grad_norm": 2.143547791489477, + "learning_rate": 2.4440003367640264e-06, + "loss": 0.9389, + "step": 6180 + }, + { + "epoch": 0.44594350853143827, + "grad_norm": 2.8997608875484984, + "learning_rate": 2.4435446270375695e-06, + "loss": 0.9285, + "step": 6181 + }, + { + "epoch": 0.4460156560008658, + "grad_norm": 2.3080862039238594, + "learning_rate": 2.4430888930907822e-06, + "loss": 1.0172, + "step": 6182 + }, + { + "epoch": 0.44608780347029325, + "grad_norm": 2.7923088094341666, + "learning_rate": 2.4426331349485497e-06, + "loss": 0.9528, + "step": 6183 + }, + { + "epoch": 0.4461599509397208, + "grad_norm": 3.4812225998188833, + "learning_rate": 2.4421773526357614e-06, + "loss": 0.9304, + "step": 6184 + }, + { + "epoch": 0.4462320984091483, + "grad_norm": 3.030323319511693, + "learning_rate": 2.4417215461773046e-06, + "loss": 1.0031, + "step": 6185 + }, + { + "epoch": 0.4463042458785758, + "grad_norm": 2.481605281425036, + "learning_rate": 2.4412657155980693e-06, + "loss": 0.8808, + "step": 6186 + }, + { + "epoch": 0.44637639334800333, + "grad_norm": 2.2712878784313064, + "learning_rate": 2.4408098609229468e-06, + "loss": 0.9646, + "step": 6187 + }, + { + "epoch": 0.44644854081743085, + "grad_norm": 3.647886050778633, + "learning_rate": 2.4403539821768292e-06, + "loss": 0.9814, + "step": 6188 + }, + { + "epoch": 0.4465206882868583, + "grad_norm": 2.2020264179401656, + "learning_rate": 2.439898079384611e-06, + "loss": 0.8285, + "step": 6189 + }, + { + "epoch": 0.44659283575628583, + "grad_norm": 1.9752117260786868, + "learning_rate": 2.439442152571187e-06, + "loss": 0.8868, + "step": 6190 + }, + { + "epoch": 0.44666498322571335, + "grad_norm": 2.506020829908016, + "learning_rate": 2.4389862017614543e-06, + "loss": 0.9042, + "step": 6191 + }, + { + "epoch": 0.44673713069514087, + "grad_norm": 2.5910780599091807, + "learning_rate": 2.43853022698031e-06, + "loss": 0.8811, + "step": 6192 + }, + { + "epoch": 0.4468092781645684, + "grad_norm": 2.8874558680741225, + "learning_rate": 2.438074228252653e-06, + "loss": 0.8437, + "step": 6193 + }, + { + "epoch": 0.4468814256339959, + "grad_norm": 1.9351345512273197, + "learning_rate": 2.4376182056033844e-06, + "loss": 0.9391, + "step": 6194 + }, + { + "epoch": 0.4469535731034234, + "grad_norm": 2.6295663988195432, + "learning_rate": 2.4371621590574053e-06, + "loss": 0.872, + "step": 6195 + }, + { + "epoch": 0.4470257205728509, + "grad_norm": 2.2967911270294414, + "learning_rate": 2.436706088639619e-06, + "loss": 0.8341, + "step": 6196 + }, + { + "epoch": 0.4470978680422784, + "grad_norm": 2.9709286375305815, + "learning_rate": 2.436249994374931e-06, + "loss": 0.9705, + "step": 6197 + }, + { + "epoch": 0.44717001551170593, + "grad_norm": 2.6227414542031595, + "learning_rate": 2.4357938762882443e-06, + "loss": 0.9067, + "step": 6198 + }, + { + "epoch": 0.44724216298113345, + "grad_norm": 2.6441097402551135, + "learning_rate": 2.435337734404468e-06, + "loss": 0.9082, + "step": 6199 + }, + { + "epoch": 0.44731431045056097, + "grad_norm": 2.2582235523112266, + "learning_rate": 2.4348815687485093e-06, + "loss": 0.9464, + "step": 6200 + }, + { + "epoch": 0.44738645791998843, + "grad_norm": 3.5769706096598846, + "learning_rate": 2.4344253793452784e-06, + "loss": 0.9276, + "step": 6201 + }, + { + "epoch": 0.44745860538941595, + "grad_norm": 2.137053022441778, + "learning_rate": 2.433969166219685e-06, + "loss": 0.8519, + "step": 6202 + }, + { + "epoch": 0.4475307528588435, + "grad_norm": 3.2737445550722546, + "learning_rate": 2.4335129293966425e-06, + "loss": 0.9188, + "step": 6203 + }, + { + "epoch": 0.447602900328271, + "grad_norm": 2.3710318090653297, + "learning_rate": 2.433056668901064e-06, + "loss": 0.9214, + "step": 6204 + }, + { + "epoch": 0.4476750477976985, + "grad_norm": 1.786042308652261, + "learning_rate": 2.4326003847578627e-06, + "loss": 1.0114, + "step": 6205 + }, + { + "epoch": 0.44774719526712603, + "grad_norm": 5.13002530721925, + "learning_rate": 2.432144076991957e-06, + "loss": 0.9161, + "step": 6206 + }, + { + "epoch": 0.4478193427365535, + "grad_norm": 0.7714040207362208, + "learning_rate": 2.4316877456282624e-06, + "loss": 0.7588, + "step": 6207 + }, + { + "epoch": 0.447891490205981, + "grad_norm": 2.2347021330262455, + "learning_rate": 2.4312313906916984e-06, + "loss": 0.995, + "step": 6208 + }, + { + "epoch": 0.44796363767540853, + "grad_norm": 1.9602663631838728, + "learning_rate": 2.4307750122071848e-06, + "loss": 0.9948, + "step": 6209 + }, + { + "epoch": 0.44803578514483605, + "grad_norm": 1.9967235310735552, + "learning_rate": 2.430318610199642e-06, + "loss": 0.9164, + "step": 6210 + }, + { + "epoch": 0.44810793261426357, + "grad_norm": 4.5776391927076645, + "learning_rate": 2.429862184693993e-06, + "loss": 0.8919, + "step": 6211 + }, + { + "epoch": 0.4481800800836911, + "grad_norm": 2.4766635813978377, + "learning_rate": 2.429405735715161e-06, + "loss": 0.9023, + "step": 6212 + }, + { + "epoch": 0.44825222755311855, + "grad_norm": 2.7546879950667362, + "learning_rate": 2.428949263288072e-06, + "loss": 0.9636, + "step": 6213 + }, + { + "epoch": 0.4483243750225461, + "grad_norm": 3.3212759976128656, + "learning_rate": 2.4284927674376516e-06, + "loss": 0.7857, + "step": 6214 + }, + { + "epoch": 0.4483965224919736, + "grad_norm": 2.392785087509136, + "learning_rate": 2.4280362481888266e-06, + "loss": 0.9203, + "step": 6215 + }, + { + "epoch": 0.4484686699614011, + "grad_norm": 2.970897018655979, + "learning_rate": 2.427579705566528e-06, + "loss": 0.8919, + "step": 6216 + }, + { + "epoch": 0.44854081743082863, + "grad_norm": 1.7641684001156677, + "learning_rate": 2.427123139595683e-06, + "loss": 0.907, + "step": 6217 + }, + { + "epoch": 0.44861296490025615, + "grad_norm": 5.103896070301152, + "learning_rate": 2.4266665503012252e-06, + "loss": 0.9197, + "step": 6218 + }, + { + "epoch": 0.4486851123696836, + "grad_norm": 2.2507901393790313, + "learning_rate": 2.426209937708087e-06, + "loss": 0.9784, + "step": 6219 + }, + { + "epoch": 0.44875725983911113, + "grad_norm": 2.9374735404912884, + "learning_rate": 2.425753301841201e-06, + "loss": 0.9402, + "step": 6220 + }, + { + "epoch": 0.44882940730853865, + "grad_norm": 1.8003832197212915, + "learning_rate": 2.4252966427255042e-06, + "loss": 1.0098, + "step": 6221 + }, + { + "epoch": 0.44890155477796617, + "grad_norm": 2.6596638900252136, + "learning_rate": 2.4248399603859317e-06, + "loss": 0.9803, + "step": 6222 + }, + { + "epoch": 0.4489737022473937, + "grad_norm": 0.9020523988090401, + "learning_rate": 2.4243832548474214e-06, + "loss": 0.8167, + "step": 6223 + }, + { + "epoch": 0.44904584971682115, + "grad_norm": 1.3893122531108628, + "learning_rate": 2.4239265261349125e-06, + "loss": 0.7895, + "step": 6224 + }, + { + "epoch": 0.4491179971862487, + "grad_norm": 6.233008825385537, + "learning_rate": 2.423469774273346e-06, + "loss": 0.9256, + "step": 6225 + }, + { + "epoch": 0.4491901446556762, + "grad_norm": 2.544114565207007, + "learning_rate": 2.4230129992876623e-06, + "loss": 0.8496, + "step": 6226 + }, + { + "epoch": 0.4492622921251037, + "grad_norm": 5.5273499613481, + "learning_rate": 2.422556201202805e-06, + "loss": 0.9296, + "step": 6227 + }, + { + "epoch": 0.44933443959453123, + "grad_norm": 3.2749049282106433, + "learning_rate": 2.4220993800437174e-06, + "loss": 0.8454, + "step": 6228 + }, + { + "epoch": 0.44940658706395875, + "grad_norm": 2.6022501085486396, + "learning_rate": 2.421642535835345e-06, + "loss": 1.0462, + "step": 6229 + }, + { + "epoch": 0.4494787345333862, + "grad_norm": 2.428069333257037, + "learning_rate": 2.4211856686026355e-06, + "loss": 0.9497, + "step": 6230 + }, + { + "epoch": 0.44955088200281373, + "grad_norm": 2.1012070365838773, + "learning_rate": 2.4207287783705354e-06, + "loss": 0.9668, + "step": 6231 + }, + { + "epoch": 0.44962302947224125, + "grad_norm": 3.036173646295848, + "learning_rate": 2.4202718651639935e-06, + "loss": 0.8554, + "step": 6232 + }, + { + "epoch": 0.4496951769416688, + "grad_norm": 2.307980012076582, + "learning_rate": 2.4198149290079616e-06, + "loss": 0.9128, + "step": 6233 + }, + { + "epoch": 0.4497673244110963, + "grad_norm": 1.9894150534626622, + "learning_rate": 2.41935796992739e-06, + "loss": 1.0148, + "step": 6234 + }, + { + "epoch": 0.4498394718805238, + "grad_norm": 2.6061460881364154, + "learning_rate": 2.418900987947232e-06, + "loss": 0.9641, + "step": 6235 + }, + { + "epoch": 0.4499116193499513, + "grad_norm": 2.4965899098221813, + "learning_rate": 2.4184439830924426e-06, + "loss": 1.0263, + "step": 6236 + }, + { + "epoch": 0.4499837668193788, + "grad_norm": 3.9250952885231736, + "learning_rate": 2.417986955387976e-06, + "loss": 0.9288, + "step": 6237 + }, + { + "epoch": 0.4500559142888063, + "grad_norm": 2.186437403224477, + "learning_rate": 2.4175299048587887e-06, + "loss": 0.9419, + "step": 6238 + }, + { + "epoch": 0.45012806175823383, + "grad_norm": 2.5211532221666166, + "learning_rate": 2.4170728315298394e-06, + "loss": 0.8225, + "step": 6239 + }, + { + "epoch": 0.45020020922766135, + "grad_norm": 2.2588865899827306, + "learning_rate": 2.4166157354260867e-06, + "loss": 0.9082, + "step": 6240 + }, + { + "epoch": 0.45027235669708887, + "grad_norm": 2.728994339799725, + "learning_rate": 2.4161586165724906e-06, + "loss": 0.974, + "step": 6241 + }, + { + "epoch": 0.45034450416651634, + "grad_norm": 2.7862110253984276, + "learning_rate": 2.415701474994013e-06, + "loss": 0.9616, + "step": 6242 + }, + { + "epoch": 0.45041665163594385, + "grad_norm": 1.8384895281292448, + "learning_rate": 2.4152443107156177e-06, + "loss": 0.916, + "step": 6243 + }, + { + "epoch": 0.4504887991053714, + "grad_norm": 4.846115617519782, + "learning_rate": 2.4147871237622663e-06, + "loss": 1.0107, + "step": 6244 + }, + { + "epoch": 0.4505609465747989, + "grad_norm": 2.170008673936235, + "learning_rate": 2.4143299141589265e-06, + "loss": 0.9444, + "step": 6245 + }, + { + "epoch": 0.4506330940442264, + "grad_norm": 4.918353464896973, + "learning_rate": 2.4138726819305633e-06, + "loss": 0.9396, + "step": 6246 + }, + { + "epoch": 0.45070524151365393, + "grad_norm": 3.1551706715894006, + "learning_rate": 2.413415427102145e-06, + "loss": 0.8775, + "step": 6247 + }, + { + "epoch": 0.4507773889830814, + "grad_norm": 2.6788106212084712, + "learning_rate": 2.4129581496986414e-06, + "loss": 0.907, + "step": 6248 + }, + { + "epoch": 0.4508495364525089, + "grad_norm": 3.3331191153038917, + "learning_rate": 2.412500849745022e-06, + "loss": 0.8341, + "step": 6249 + }, + { + "epoch": 0.45092168392193643, + "grad_norm": 2.37307570954061, + "learning_rate": 2.4120435272662576e-06, + "loss": 0.9636, + "step": 6250 + }, + { + "epoch": 0.45099383139136395, + "grad_norm": 0.8512654967724997, + "learning_rate": 2.411586182287321e-06, + "loss": 0.8816, + "step": 6251 + }, + { + "epoch": 0.45106597886079147, + "grad_norm": 7.481971179489593, + "learning_rate": 2.411128814833188e-06, + "loss": 0.8818, + "step": 6252 + }, + { + "epoch": 0.451138126330219, + "grad_norm": 0.7527949785384311, + "learning_rate": 2.410671424928832e-06, + "loss": 0.8058, + "step": 6253 + }, + { + "epoch": 0.45121027379964646, + "grad_norm": 2.1735751955141094, + "learning_rate": 2.410214012599229e-06, + "loss": 0.9339, + "step": 6254 + }, + { + "epoch": 0.451282421269074, + "grad_norm": 2.719678435250795, + "learning_rate": 2.4097565778693583e-06, + "loss": 0.8014, + "step": 6255 + }, + { + "epoch": 0.4513545687385015, + "grad_norm": 0.8740110257935069, + "learning_rate": 2.4092991207641967e-06, + "loss": 0.9472, + "step": 6256 + }, + { + "epoch": 0.451426716207929, + "grad_norm": 0.8898253950548612, + "learning_rate": 2.4088416413087256e-06, + "loss": 0.8419, + "step": 6257 + }, + { + "epoch": 0.45149886367735653, + "grad_norm": 2.1013152817315457, + "learning_rate": 2.4083841395279267e-06, + "loss": 0.9604, + "step": 6258 + }, + { + "epoch": 0.45157101114678405, + "grad_norm": 2.7728331001313586, + "learning_rate": 2.4079266154467806e-06, + "loss": 0.8154, + "step": 6259 + }, + { + "epoch": 0.4516431586162115, + "grad_norm": 0.7726694782848839, + "learning_rate": 2.407469069090273e-06, + "loss": 0.8052, + "step": 6260 + }, + { + "epoch": 0.45171530608563903, + "grad_norm": 2.586334013719744, + "learning_rate": 2.407011500483388e-06, + "loss": 0.79, + "step": 6261 + }, + { + "epoch": 0.45178745355506655, + "grad_norm": 2.360796165653994, + "learning_rate": 2.406553909651111e-06, + "loss": 0.9667, + "step": 6262 + }, + { + "epoch": 0.4518596010244941, + "grad_norm": 2.495610197746082, + "learning_rate": 2.4060962966184304e-06, + "loss": 0.9016, + "step": 6263 + }, + { + "epoch": 0.4519317484939216, + "grad_norm": 2.8156943089362048, + "learning_rate": 2.405638661410334e-06, + "loss": 0.8375, + "step": 6264 + }, + { + "epoch": 0.4520038959633491, + "grad_norm": 0.7901237451838792, + "learning_rate": 2.405181004051812e-06, + "loss": 0.8001, + "step": 6265 + }, + { + "epoch": 0.4520760434327766, + "grad_norm": 2.076863529840618, + "learning_rate": 2.404723324567856e-06, + "loss": 0.926, + "step": 6266 + }, + { + "epoch": 0.4521481909022041, + "grad_norm": 2.4962897425254336, + "learning_rate": 2.404265622983457e-06, + "loss": 0.8604, + "step": 6267 + }, + { + "epoch": 0.4522203383716316, + "grad_norm": 2.151323079935391, + "learning_rate": 2.4038078993236082e-06, + "loss": 0.8859, + "step": 6268 + }, + { + "epoch": 0.45229248584105913, + "grad_norm": 2.5379050096031084, + "learning_rate": 2.4033501536133057e-06, + "loss": 0.9013, + "step": 6269 + }, + { + "epoch": 0.45236463331048665, + "grad_norm": 2.38638068519216, + "learning_rate": 2.4028923858775445e-06, + "loss": 1.019, + "step": 6270 + }, + { + "epoch": 0.45243678077991417, + "grad_norm": 1.9908408007582181, + "learning_rate": 2.402434596141321e-06, + "loss": 0.9571, + "step": 6271 + }, + { + "epoch": 0.45250892824934164, + "grad_norm": 6.727106619525142, + "learning_rate": 2.4019767844296343e-06, + "loss": 0.9139, + "step": 6272 + }, + { + "epoch": 0.45258107571876915, + "grad_norm": 2.221404837909547, + "learning_rate": 2.4015189507674836e-06, + "loss": 0.9621, + "step": 6273 + }, + { + "epoch": 0.4526532231881967, + "grad_norm": 3.092035608478913, + "learning_rate": 2.401061095179869e-06, + "loss": 0.9658, + "step": 6274 + }, + { + "epoch": 0.4527253706576242, + "grad_norm": 3.177991109932816, + "learning_rate": 2.4006032176917935e-06, + "loss": 0.9243, + "step": 6275 + }, + { + "epoch": 0.4527975181270517, + "grad_norm": 2.3583459694280084, + "learning_rate": 2.4001453183282587e-06, + "loss": 1.0489, + "step": 6276 + }, + { + "epoch": 0.4528696655964792, + "grad_norm": 1.8400045595941854, + "learning_rate": 2.399687397114269e-06, + "loss": 0.8958, + "step": 6277 + }, + { + "epoch": 0.4529418130659067, + "grad_norm": 0.6961181131160274, + "learning_rate": 2.39922945407483e-06, + "loss": 0.7468, + "step": 6278 + }, + { + "epoch": 0.4530139605353342, + "grad_norm": 2.4947251939512216, + "learning_rate": 2.398771489234949e-06, + "loss": 0.8928, + "step": 6279 + }, + { + "epoch": 0.45308610800476173, + "grad_norm": 2.4768371429746048, + "learning_rate": 2.3983135026196334e-06, + "loss": 0.8037, + "step": 6280 + }, + { + "epoch": 0.45315825547418925, + "grad_norm": 5.83281905314141, + "learning_rate": 2.3978554942538914e-06, + "loss": 1.071, + "step": 6281 + }, + { + "epoch": 0.4532304029436168, + "grad_norm": 2.436989559622383, + "learning_rate": 2.3973974641627343e-06, + "loss": 1.039, + "step": 6282 + }, + { + "epoch": 0.45330255041304424, + "grad_norm": 3.0087011040271223, + "learning_rate": 2.3969394123711716e-06, + "loss": 0.8756, + "step": 6283 + }, + { + "epoch": 0.45337469788247176, + "grad_norm": 2.5430920361519753, + "learning_rate": 2.396481338904218e-06, + "loss": 0.9548, + "step": 6284 + }, + { + "epoch": 0.4534468453518993, + "grad_norm": 2.2934257922644945, + "learning_rate": 2.3960232437868866e-06, + "loss": 0.9002, + "step": 6285 + }, + { + "epoch": 0.4535189928213268, + "grad_norm": 3.1884150967946088, + "learning_rate": 2.3955651270441904e-06, + "loss": 0.9362, + "step": 6286 + }, + { + "epoch": 0.4535911402907543, + "grad_norm": 2.419335886255499, + "learning_rate": 2.395106988701148e-06, + "loss": 1.0704, + "step": 6287 + }, + { + "epoch": 0.45366328776018183, + "grad_norm": 2.311051817381971, + "learning_rate": 2.3946488287827753e-06, + "loss": 0.8338, + "step": 6288 + }, + { + "epoch": 0.4537354352296093, + "grad_norm": 2.6937434397748117, + "learning_rate": 2.394190647314091e-06, + "loss": 0.9869, + "step": 6289 + }, + { + "epoch": 0.4538075826990368, + "grad_norm": 0.7833876166574766, + "learning_rate": 2.393732444320115e-06, + "loss": 0.7495, + "step": 6290 + }, + { + "epoch": 0.45387973016846433, + "grad_norm": 2.526261582911918, + "learning_rate": 2.3932742198258675e-06, + "loss": 0.9353, + "step": 6291 + }, + { + "epoch": 0.45395187763789185, + "grad_norm": 0.909974763646068, + "learning_rate": 2.3928159738563707e-06, + "loss": 0.9072, + "step": 6292 + }, + { + "epoch": 0.4540240251073194, + "grad_norm": 2.2276641797185945, + "learning_rate": 2.392357706436648e-06, + "loss": 0.9371, + "step": 6293 + }, + { + "epoch": 0.4540961725767469, + "grad_norm": 1.9931758448520258, + "learning_rate": 2.3918994175917235e-06, + "loss": 0.8888, + "step": 6294 + }, + { + "epoch": 0.45416832004617436, + "grad_norm": 10.472757511865906, + "learning_rate": 2.3914411073466214e-06, + "loss": 0.9092, + "step": 6295 + }, + { + "epoch": 0.4542404675156019, + "grad_norm": 1.944476273442689, + "learning_rate": 2.390982775726371e-06, + "loss": 0.9423, + "step": 6296 + }, + { + "epoch": 0.4543126149850294, + "grad_norm": 0.8281933018569902, + "learning_rate": 2.3905244227559978e-06, + "loss": 0.8052, + "step": 6297 + }, + { + "epoch": 0.4543847624544569, + "grad_norm": 3.312506621732031, + "learning_rate": 2.3900660484605313e-06, + "loss": 0.9635, + "step": 6298 + }, + { + "epoch": 0.45445690992388443, + "grad_norm": 2.3766183860372583, + "learning_rate": 2.389607652865002e-06, + "loss": 1.0186, + "step": 6299 + }, + { + "epoch": 0.45452905739331195, + "grad_norm": 5.654573476610997, + "learning_rate": 2.3891492359944414e-06, + "loss": 1.0432, + "step": 6300 + }, + { + "epoch": 0.4546012048627394, + "grad_norm": 2.714916727745536, + "learning_rate": 2.3886907978738814e-06, + "loss": 0.8827, + "step": 6301 + }, + { + "epoch": 0.45467335233216694, + "grad_norm": 3.167744921433553, + "learning_rate": 2.3882323385283563e-06, + "loss": 0.9348, + "step": 6302 + }, + { + "epoch": 0.45474549980159445, + "grad_norm": 3.313770266556902, + "learning_rate": 2.3877738579828997e-06, + "loss": 0.7989, + "step": 6303 + }, + { + "epoch": 0.454817647271022, + "grad_norm": 2.544764480934873, + "learning_rate": 2.3873153562625482e-06, + "loss": 0.9851, + "step": 6304 + }, + { + "epoch": 0.4548897947404495, + "grad_norm": 2.500246798731163, + "learning_rate": 2.3868568333923393e-06, + "loss": 0.9655, + "step": 6305 + }, + { + "epoch": 0.454961942209877, + "grad_norm": 3.9073877737533986, + "learning_rate": 2.386398289397311e-06, + "loss": 0.8959, + "step": 6306 + }, + { + "epoch": 0.4550340896793045, + "grad_norm": 2.711684681934738, + "learning_rate": 2.3859397243025017e-06, + "loss": 1.0198, + "step": 6307 + }, + { + "epoch": 0.455106237148732, + "grad_norm": 3.3469083611600317, + "learning_rate": 2.3854811381329535e-06, + "loss": 0.7909, + "step": 6308 + }, + { + "epoch": 0.4551783846181595, + "grad_norm": 2.426638146257306, + "learning_rate": 2.3850225309137074e-06, + "loss": 0.8916, + "step": 6309 + }, + { + "epoch": 0.45525053208758703, + "grad_norm": 2.3766204927229473, + "learning_rate": 2.3845639026698054e-06, + "loss": 0.8908, + "step": 6310 + }, + { + "epoch": 0.45532267955701455, + "grad_norm": 2.095548853390978, + "learning_rate": 2.384105253426293e-06, + "loss": 0.9329, + "step": 6311 + }, + { + "epoch": 0.4553948270264421, + "grad_norm": 2.0006952269508216, + "learning_rate": 2.3836465832082147e-06, + "loss": 0.8571, + "step": 6312 + }, + { + "epoch": 0.45546697449586954, + "grad_norm": 2.6698165013267925, + "learning_rate": 2.3831878920406157e-06, + "loss": 0.9336, + "step": 6313 + }, + { + "epoch": 0.45553912196529706, + "grad_norm": 2.3213846831312592, + "learning_rate": 2.3827291799485458e-06, + "loss": 0.9492, + "step": 6314 + }, + { + "epoch": 0.4556112694347246, + "grad_norm": 7.644053932067553, + "learning_rate": 2.3822704469570513e-06, + "loss": 0.8013, + "step": 6315 + }, + { + "epoch": 0.4556834169041521, + "grad_norm": 3.820523683292594, + "learning_rate": 2.381811693091183e-06, + "loss": 0.9452, + "step": 6316 + }, + { + "epoch": 0.4557555643735796, + "grad_norm": 2.175735007433304, + "learning_rate": 2.3813529183759914e-06, + "loss": 0.9515, + "step": 6317 + }, + { + "epoch": 0.45582771184300713, + "grad_norm": 2.7090253874982166, + "learning_rate": 2.3808941228365293e-06, + "loss": 0.8749, + "step": 6318 + }, + { + "epoch": 0.4558998593124346, + "grad_norm": 2.5444324224744235, + "learning_rate": 2.3804353064978488e-06, + "loss": 0.8769, + "step": 6319 + }, + { + "epoch": 0.4559720067818621, + "grad_norm": 8.245082892654159, + "learning_rate": 2.379976469385004e-06, + "loss": 0.9529, + "step": 6320 + }, + { + "epoch": 0.45604415425128964, + "grad_norm": 2.9525688495409037, + "learning_rate": 2.3795176115230516e-06, + "loss": 0.9134, + "step": 6321 + }, + { + "epoch": 0.45611630172071715, + "grad_norm": 3.25017693844898, + "learning_rate": 2.3790587329370467e-06, + "loss": 0.8365, + "step": 6322 + }, + { + "epoch": 0.4561884491901447, + "grad_norm": 2.2505297566997453, + "learning_rate": 2.378599833652048e-06, + "loss": 0.9497, + "step": 6323 + }, + { + "epoch": 0.4562605966595722, + "grad_norm": 2.9484437037788305, + "learning_rate": 2.3781409136931146e-06, + "loss": 0.9196, + "step": 6324 + }, + { + "epoch": 0.45633274412899966, + "grad_norm": 3.1649711151974484, + "learning_rate": 2.377681973085304e-06, + "loss": 0.9627, + "step": 6325 + }, + { + "epoch": 0.4564048915984272, + "grad_norm": 2.4792511121542167, + "learning_rate": 2.3772230118536805e-06, + "loss": 0.8876, + "step": 6326 + }, + { + "epoch": 0.4564770390678547, + "grad_norm": 2.6871008465677733, + "learning_rate": 2.3767640300233044e-06, + "loss": 1.0832, + "step": 6327 + }, + { + "epoch": 0.4565491865372822, + "grad_norm": 2.4751491424629957, + "learning_rate": 2.376305027619239e-06, + "loss": 1.0058, + "step": 6328 + }, + { + "epoch": 0.45662133400670973, + "grad_norm": 2.2588301216612483, + "learning_rate": 2.3758460046665495e-06, + "loss": 0.8322, + "step": 6329 + }, + { + "epoch": 0.4566934814761372, + "grad_norm": 3.3052951984470242, + "learning_rate": 2.375386961190301e-06, + "loss": 0.9131, + "step": 6330 + }, + { + "epoch": 0.4567656289455647, + "grad_norm": 2.7324996184604444, + "learning_rate": 2.3749278972155595e-06, + "loss": 0.935, + "step": 6331 + }, + { + "epoch": 0.45683777641499224, + "grad_norm": 2.140154153316362, + "learning_rate": 2.374468812767394e-06, + "loss": 0.8945, + "step": 6332 + }, + { + "epoch": 0.45690992388441976, + "grad_norm": 2.382688025053101, + "learning_rate": 2.374009707870873e-06, + "loss": 0.9152, + "step": 6333 + }, + { + "epoch": 0.4569820713538473, + "grad_norm": 3.471714711716788, + "learning_rate": 2.3735505825510658e-06, + "loss": 0.9978, + "step": 6334 + }, + { + "epoch": 0.4570542188232748, + "grad_norm": 2.489198716166645, + "learning_rate": 2.3730914368330447e-06, + "loss": 0.8703, + "step": 6335 + }, + { + "epoch": 0.45712636629270226, + "grad_norm": 2.165034767735084, + "learning_rate": 2.3726322707418813e-06, + "loss": 0.7845, + "step": 6336 + }, + { + "epoch": 0.4571985137621298, + "grad_norm": 2.2857775764558372, + "learning_rate": 2.3721730843026484e-06, + "loss": 1.0377, + "step": 6337 + }, + { + "epoch": 0.4572706612315573, + "grad_norm": 2.786741257799357, + "learning_rate": 2.371713877540422e-06, + "loss": 0.9628, + "step": 6338 + }, + { + "epoch": 0.4573428087009848, + "grad_norm": 2.368563614006897, + "learning_rate": 2.3712546504802767e-06, + "loss": 0.9394, + "step": 6339 + }, + { + "epoch": 0.45741495617041233, + "grad_norm": 3.1177026614566543, + "learning_rate": 2.3707954031472886e-06, + "loss": 0.9327, + "step": 6340 + }, + { + "epoch": 0.45748710363983985, + "grad_norm": 2.931938913559878, + "learning_rate": 2.370336135566537e-06, + "loss": 0.9485, + "step": 6341 + }, + { + "epoch": 0.4575592511092673, + "grad_norm": 2.2580831292919057, + "learning_rate": 2.369876847763099e-06, + "loss": 0.9102, + "step": 6342 + }, + { + "epoch": 0.45763139857869484, + "grad_norm": 0.8444206610022688, + "learning_rate": 2.369417539762056e-06, + "loss": 0.8743, + "step": 6343 + }, + { + "epoch": 0.45770354604812236, + "grad_norm": 1.8343417760904372, + "learning_rate": 2.368958211588489e-06, + "loss": 0.9815, + "step": 6344 + }, + { + "epoch": 0.4577756935175499, + "grad_norm": 2.308233500868413, + "learning_rate": 2.3684988632674804e-06, + "loss": 0.9074, + "step": 6345 + }, + { + "epoch": 0.4578478409869774, + "grad_norm": 0.8835139230949058, + "learning_rate": 2.368039494824112e-06, + "loss": 0.8389, + "step": 6346 + }, + { + "epoch": 0.4579199884564049, + "grad_norm": 2.0709914911409397, + "learning_rate": 2.3675801062834694e-06, + "loss": 1.0079, + "step": 6347 + }, + { + "epoch": 0.4579921359258324, + "grad_norm": 10.47613829480011, + "learning_rate": 2.3671206976706385e-06, + "loss": 0.9786, + "step": 6348 + }, + { + "epoch": 0.4580642833952599, + "grad_norm": 2.002159859752439, + "learning_rate": 2.3666612690107045e-06, + "loss": 1.015, + "step": 6349 + }, + { + "epoch": 0.4581364308646874, + "grad_norm": 2.575825723640739, + "learning_rate": 2.3662018203287565e-06, + "loss": 0.9266, + "step": 6350 + }, + { + "epoch": 0.45820857833411494, + "grad_norm": 2.1550309425486875, + "learning_rate": 2.3657423516498827e-06, + "loss": 1.0571, + "step": 6351 + }, + { + "epoch": 0.45828072580354245, + "grad_norm": 2.4905772014324645, + "learning_rate": 2.3652828629991727e-06, + "loss": 0.9946, + "step": 6352 + }, + { + "epoch": 0.45835287327297, + "grad_norm": 2.384271393971048, + "learning_rate": 2.364823354401718e-06, + "loss": 0.9547, + "step": 6353 + }, + { + "epoch": 0.45842502074239744, + "grad_norm": 1.8716666791898342, + "learning_rate": 2.36436382588261e-06, + "loss": 0.9745, + "step": 6354 + }, + { + "epoch": 0.45849716821182496, + "grad_norm": 3.2141084258823063, + "learning_rate": 2.3639042774669427e-06, + "loss": 0.8876, + "step": 6355 + }, + { + "epoch": 0.4585693156812525, + "grad_norm": 2.2163419954212222, + "learning_rate": 2.3634447091798102e-06, + "loss": 0.8962, + "step": 6356 + }, + { + "epoch": 0.45864146315068, + "grad_norm": 3.0868349725908986, + "learning_rate": 2.3629851210463075e-06, + "loss": 0.9711, + "step": 6357 + }, + { + "epoch": 0.4587136106201075, + "grad_norm": 1.9428666760206748, + "learning_rate": 2.362525513091531e-06, + "loss": 1.0184, + "step": 6358 + }, + { + "epoch": 0.45878575808953503, + "grad_norm": 3.2043253812599644, + "learning_rate": 2.362065885340578e-06, + "loss": 0.8992, + "step": 6359 + }, + { + "epoch": 0.4588579055589625, + "grad_norm": 3.800172104200731, + "learning_rate": 2.3616062378185477e-06, + "loss": 0.8099, + "step": 6360 + }, + { + "epoch": 0.45893005302839, + "grad_norm": 1.6399191655032164, + "learning_rate": 2.3611465705505383e-06, + "loss": 0.8678, + "step": 6361 + }, + { + "epoch": 0.45900220049781754, + "grad_norm": 2.214677861643375, + "learning_rate": 2.360686883561653e-06, + "loss": 0.928, + "step": 6362 + }, + { + "epoch": 0.45907434796724506, + "grad_norm": 5.053930682873484, + "learning_rate": 2.360227176876992e-06, + "loss": 0.8022, + "step": 6363 + }, + { + "epoch": 0.4591464954366726, + "grad_norm": 14.500547201420538, + "learning_rate": 2.3597674505216572e-06, + "loss": 0.8411, + "step": 6364 + }, + { + "epoch": 0.4592186429061001, + "grad_norm": 1.728122265556614, + "learning_rate": 2.3593077045207546e-06, + "loss": 0.8516, + "step": 6365 + }, + { + "epoch": 0.45929079037552756, + "grad_norm": 2.6291771305639746, + "learning_rate": 2.3588479388993883e-06, + "loss": 0.9763, + "step": 6366 + }, + { + "epoch": 0.4593629378449551, + "grad_norm": 2.487043183519718, + "learning_rate": 2.3583881536826644e-06, + "loss": 0.8015, + "step": 6367 + }, + { + "epoch": 0.4594350853143826, + "grad_norm": 2.4726390868159274, + "learning_rate": 2.3579283488956906e-06, + "loss": 0.8183, + "step": 6368 + }, + { + "epoch": 0.4595072327838101, + "grad_norm": 3.0270885729148205, + "learning_rate": 2.3574685245635743e-06, + "loss": 0.9567, + "step": 6369 + }, + { + "epoch": 0.45957938025323763, + "grad_norm": 2.5702259547727797, + "learning_rate": 2.3570086807114253e-06, + "loss": 0.9318, + "step": 6370 + }, + { + "epoch": 0.45965152772266515, + "grad_norm": 2.781235255513059, + "learning_rate": 2.3565488173643535e-06, + "loss": 0.966, + "step": 6371 + }, + { + "epoch": 0.4597236751920926, + "grad_norm": 1.6997525483841833, + "learning_rate": 2.356088934547471e-06, + "loss": 0.8824, + "step": 6372 + }, + { + "epoch": 0.45979582266152014, + "grad_norm": 2.841483585877461, + "learning_rate": 2.3556290322858904e-06, + "loss": 0.8764, + "step": 6373 + }, + { + "epoch": 0.45986797013094766, + "grad_norm": 2.622188061107311, + "learning_rate": 2.3551691106047234e-06, + "loss": 0.9111, + "step": 6374 + }, + { + "epoch": 0.4599401176003752, + "grad_norm": 2.3683137637928704, + "learning_rate": 2.354709169529088e-06, + "loss": 0.836, + "step": 6375 + }, + { + "epoch": 0.4600122650698027, + "grad_norm": 1.985386387960054, + "learning_rate": 2.354249209084096e-06, + "loss": 0.9553, + "step": 6376 + }, + { + "epoch": 0.4600844125392302, + "grad_norm": 2.8237531391614374, + "learning_rate": 2.3537892292948674e-06, + "loss": 0.9984, + "step": 6377 + }, + { + "epoch": 0.4601565600086577, + "grad_norm": 2.18420478037512, + "learning_rate": 2.3533292301865183e-06, + "loss": 0.9083, + "step": 6378 + }, + { + "epoch": 0.4602287074780852, + "grad_norm": 2.662846351984076, + "learning_rate": 2.3528692117841676e-06, + "loss": 0.8966, + "step": 6379 + }, + { + "epoch": 0.4603008549475127, + "grad_norm": 0.6816913400053796, + "learning_rate": 2.3524091741129363e-06, + "loss": 0.7807, + "step": 6380 + }, + { + "epoch": 0.46037300241694024, + "grad_norm": 0.8940497022590848, + "learning_rate": 2.3519491171979436e-06, + "loss": 0.8343, + "step": 6381 + }, + { + "epoch": 0.46044514988636775, + "grad_norm": 2.8296015978643916, + "learning_rate": 2.3514890410643137e-06, + "loss": 0.8063, + "step": 6382 + }, + { + "epoch": 0.4605172973557952, + "grad_norm": 4.946721603897357, + "learning_rate": 2.351028945737167e-06, + "loss": 1.0329, + "step": 6383 + }, + { + "epoch": 0.46058944482522274, + "grad_norm": 2.4295587536227, + "learning_rate": 2.3505688312416302e-06, + "loss": 0.9058, + "step": 6384 + }, + { + "epoch": 0.46066159229465026, + "grad_norm": 2.8172266766473593, + "learning_rate": 2.3501086976028264e-06, + "loss": 0.9349, + "step": 6385 + }, + { + "epoch": 0.4607337397640778, + "grad_norm": 30.067804641785248, + "learning_rate": 2.3496485448458834e-06, + "loss": 1.0166, + "step": 6386 + }, + { + "epoch": 0.4608058872335053, + "grad_norm": 2.5436048049166304, + "learning_rate": 2.349188372995927e-06, + "loss": 0.9848, + "step": 6387 + }, + { + "epoch": 0.4608780347029328, + "grad_norm": 2.32141231070053, + "learning_rate": 2.3487281820780863e-06, + "loss": 0.9881, + "step": 6388 + }, + { + "epoch": 0.4609501821723603, + "grad_norm": 3.0010336049024686, + "learning_rate": 2.348267972117491e-06, + "loss": 0.8618, + "step": 6389 + }, + { + "epoch": 0.4610223296417878, + "grad_norm": 1.8542618530643438, + "learning_rate": 2.3478077431392713e-06, + "loss": 0.9776, + "step": 6390 + }, + { + "epoch": 0.4610944771112153, + "grad_norm": 2.1205363797064862, + "learning_rate": 2.347347495168558e-06, + "loss": 1.0233, + "step": 6391 + }, + { + "epoch": 0.46116662458064284, + "grad_norm": 1.965107530364686, + "learning_rate": 2.3468872282304834e-06, + "loss": 0.931, + "step": 6392 + }, + { + "epoch": 0.46123877205007036, + "grad_norm": 6.953183154602309, + "learning_rate": 2.3464269423501814e-06, + "loss": 0.9198, + "step": 6393 + }, + { + "epoch": 0.4613109195194979, + "grad_norm": 3.9888493565755434, + "learning_rate": 2.3459666375527873e-06, + "loss": 0.8902, + "step": 6394 + }, + { + "epoch": 0.46138306698892534, + "grad_norm": 2.9586630265894134, + "learning_rate": 2.345506313863435e-06, + "loss": 0.9201, + "step": 6395 + }, + { + "epoch": 0.46145521445835286, + "grad_norm": 2.790889024365835, + "learning_rate": 2.3450459713072625e-06, + "loss": 0.899, + "step": 6396 + }, + { + "epoch": 0.4615273619277804, + "grad_norm": 2.567054794327078, + "learning_rate": 2.344585609909407e-06, + "loss": 0.9274, + "step": 6397 + }, + { + "epoch": 0.4615995093972079, + "grad_norm": 2.5303893333285012, + "learning_rate": 2.3441252296950054e-06, + "loss": 0.9569, + "step": 6398 + }, + { + "epoch": 0.4616716568666354, + "grad_norm": 4.24188422423934, + "learning_rate": 2.3436648306892004e-06, + "loss": 0.9419, + "step": 6399 + }, + { + "epoch": 0.46174380433606294, + "grad_norm": 3.0402917410751242, + "learning_rate": 2.3432044129171306e-06, + "loss": 0.8243, + "step": 6400 + }, + { + "epoch": 0.4618159518054904, + "grad_norm": 0.7820398153013197, + "learning_rate": 2.3427439764039385e-06, + "loss": 0.8194, + "step": 6401 + }, + { + "epoch": 0.4618880992749179, + "grad_norm": 3.6418843853870317, + "learning_rate": 2.3422835211747673e-06, + "loss": 1.0206, + "step": 6402 + }, + { + "epoch": 0.46196024674434544, + "grad_norm": 3.841929927474856, + "learning_rate": 2.3418230472547587e-06, + "loss": 0.9008, + "step": 6403 + }, + { + "epoch": 0.46203239421377296, + "grad_norm": 0.7341382071023006, + "learning_rate": 2.34136255466906e-06, + "loss": 0.8023, + "step": 6404 + }, + { + "epoch": 0.4621045416832005, + "grad_norm": 2.2823364596989193, + "learning_rate": 2.3409020434428155e-06, + "loss": 0.95, + "step": 6405 + }, + { + "epoch": 0.462176689152628, + "grad_norm": 2.3216040513503455, + "learning_rate": 2.340441513601173e-06, + "loss": 0.9261, + "step": 6406 + }, + { + "epoch": 0.46224883662205546, + "grad_norm": 0.6624829748953087, + "learning_rate": 2.3399809651692786e-06, + "loss": 0.8303, + "step": 6407 + }, + { + "epoch": 0.462320984091483, + "grad_norm": 3.7911513401052117, + "learning_rate": 2.339520398172283e-06, + "loss": 0.8811, + "step": 6408 + }, + { + "epoch": 0.4623931315609105, + "grad_norm": 3.2682596922078346, + "learning_rate": 2.339059812635335e-06, + "loss": 0.9562, + "step": 6409 + }, + { + "epoch": 0.462465279030338, + "grad_norm": 2.6902546626655806, + "learning_rate": 2.3385992085835854e-06, + "loss": 0.8185, + "step": 6410 + }, + { + "epoch": 0.46253742649976554, + "grad_norm": 4.247002610513214, + "learning_rate": 2.3381385860421874e-06, + "loss": 0.9267, + "step": 6411 + }, + { + "epoch": 0.46260957396919306, + "grad_norm": 3.8716738792567917, + "learning_rate": 2.337677945036293e-06, + "loss": 0.9485, + "step": 6412 + }, + { + "epoch": 0.4626817214386205, + "grad_norm": 1.9868148462453938, + "learning_rate": 2.337217285591055e-06, + "loss": 0.8637, + "step": 6413 + }, + { + "epoch": 0.46275386890804804, + "grad_norm": 2.2729890256483363, + "learning_rate": 2.3367566077316303e-06, + "loss": 0.9454, + "step": 6414 + }, + { + "epoch": 0.46282601637747556, + "grad_norm": 1.8618508532579139, + "learning_rate": 2.3362959114831727e-06, + "loss": 0.9717, + "step": 6415 + }, + { + "epoch": 0.4628981638469031, + "grad_norm": 2.3249845893923395, + "learning_rate": 2.3358351968708417e-06, + "loss": 0.9378, + "step": 6416 + }, + { + "epoch": 0.4629703113163306, + "grad_norm": 3.255230143156104, + "learning_rate": 2.3353744639197934e-06, + "loss": 0.9396, + "step": 6417 + }, + { + "epoch": 0.4630424587857581, + "grad_norm": 0.7565028485145567, + "learning_rate": 2.334913712655187e-06, + "loss": 0.7748, + "step": 6418 + }, + { + "epoch": 0.4631146062551856, + "grad_norm": 0.8730530194571949, + "learning_rate": 2.334452943102182e-06, + "loss": 0.8542, + "step": 6419 + }, + { + "epoch": 0.4631867537246131, + "grad_norm": 3.3997973998798066, + "learning_rate": 2.3339921552859406e-06, + "loss": 0.8372, + "step": 6420 + }, + { + "epoch": 0.4632589011940406, + "grad_norm": 0.7984844858338069, + "learning_rate": 2.333531349231624e-06, + "loss": 0.8807, + "step": 6421 + }, + { + "epoch": 0.46333104866346814, + "grad_norm": 2.4295749454247733, + "learning_rate": 2.333070524964395e-06, + "loss": 0.9091, + "step": 6422 + }, + { + "epoch": 0.46340319613289566, + "grad_norm": 2.6920480613442623, + "learning_rate": 2.3326096825094176e-06, + "loss": 0.9701, + "step": 6423 + }, + { + "epoch": 0.4634753436023232, + "grad_norm": 3.228559785645876, + "learning_rate": 2.3321488218918567e-06, + "loss": 0.9078, + "step": 6424 + }, + { + "epoch": 0.46354749107175064, + "grad_norm": 0.7687670015765588, + "learning_rate": 2.331687943136878e-06, + "loss": 0.8361, + "step": 6425 + }, + { + "epoch": 0.46361963854117816, + "grad_norm": 2.034930723731037, + "learning_rate": 2.3312270462696486e-06, + "loss": 0.9514, + "step": 6426 + }, + { + "epoch": 0.4636917860106057, + "grad_norm": 2.552270895365074, + "learning_rate": 2.3307661313153362e-06, + "loss": 0.9164, + "step": 6427 + }, + { + "epoch": 0.4637639334800332, + "grad_norm": 3.2111580209117885, + "learning_rate": 2.3303051982991106e-06, + "loss": 0.9711, + "step": 6428 + }, + { + "epoch": 0.4638360809494607, + "grad_norm": 2.5599128875216084, + "learning_rate": 2.329844247246141e-06, + "loss": 0.9547, + "step": 6429 + }, + { + "epoch": 0.4639082284188882, + "grad_norm": 2.4960282246999, + "learning_rate": 2.329383278181597e-06, + "loss": 0.8842, + "step": 6430 + }, + { + "epoch": 0.4639803758883157, + "grad_norm": 4.708829847954511, + "learning_rate": 2.3289222911306518e-06, + "loss": 0.9229, + "step": 6431 + }, + { + "epoch": 0.4640525233577432, + "grad_norm": 3.7723944507796303, + "learning_rate": 2.328461286118478e-06, + "loss": 0.9444, + "step": 6432 + }, + { + "epoch": 0.46412467082717074, + "grad_norm": 1.8881672418762798, + "learning_rate": 2.3280002631702496e-06, + "loss": 0.9942, + "step": 6433 + }, + { + "epoch": 0.46419681829659826, + "grad_norm": 2.5474097503758233, + "learning_rate": 2.3275392223111414e-06, + "loss": 0.9368, + "step": 6434 + }, + { + "epoch": 0.4642689657660258, + "grad_norm": 2.245044443247069, + "learning_rate": 2.3270781635663277e-06, + "loss": 0.917, + "step": 6435 + }, + { + "epoch": 0.46434111323545324, + "grad_norm": 5.7280850955886375, + "learning_rate": 2.3266170869609875e-06, + "loss": 0.8363, + "step": 6436 + }, + { + "epoch": 0.46441326070488076, + "grad_norm": 4.561172501252462, + "learning_rate": 2.3261559925202962e-06, + "loss": 0.8769, + "step": 6437 + }, + { + "epoch": 0.4644854081743083, + "grad_norm": 1.8707853632542166, + "learning_rate": 2.325694880269434e-06, + "loss": 0.9371, + "step": 6438 + }, + { + "epoch": 0.4645575556437358, + "grad_norm": 6.377013542975655, + "learning_rate": 2.3252337502335804e-06, + "loss": 1.0144, + "step": 6439 + }, + { + "epoch": 0.4646297031131633, + "grad_norm": 0.8107256221158695, + "learning_rate": 2.324772602437915e-06, + "loss": 0.8068, + "step": 6440 + }, + { + "epoch": 0.46470185058259084, + "grad_norm": 2.2136468361198625, + "learning_rate": 2.3243114369076203e-06, + "loss": 0.9461, + "step": 6441 + }, + { + "epoch": 0.4647739980520183, + "grad_norm": 2.9608035095012637, + "learning_rate": 2.323850253667878e-06, + "loss": 0.8808, + "step": 6442 + }, + { + "epoch": 0.4648461455214458, + "grad_norm": 3.0623299687732026, + "learning_rate": 2.3233890527438734e-06, + "loss": 0.9068, + "step": 6443 + }, + { + "epoch": 0.46491829299087334, + "grad_norm": 3.8083253110527058, + "learning_rate": 2.322927834160789e-06, + "loss": 0.9783, + "step": 6444 + }, + { + "epoch": 0.46499044046030086, + "grad_norm": 2.12018589662241, + "learning_rate": 2.3224665979438116e-06, + "loss": 0.9357, + "step": 6445 + }, + { + "epoch": 0.4650625879297284, + "grad_norm": 4.140344689435756, + "learning_rate": 2.3220053441181267e-06, + "loss": 0.9243, + "step": 6446 + }, + { + "epoch": 0.4651347353991559, + "grad_norm": 2.905556708879244, + "learning_rate": 2.3215440727089216e-06, + "loss": 0.8882, + "step": 6447 + }, + { + "epoch": 0.46520688286858336, + "grad_norm": 6.642035408584968, + "learning_rate": 2.3210827837413855e-06, + "loss": 0.9302, + "step": 6448 + }, + { + "epoch": 0.4652790303380109, + "grad_norm": 3.269415886414924, + "learning_rate": 2.3206214772407066e-06, + "loss": 0.9715, + "step": 6449 + }, + { + "epoch": 0.4653511778074384, + "grad_norm": 5.1601318543613335, + "learning_rate": 2.3201601532320765e-06, + "loss": 0.8489, + "step": 6450 + }, + { + "epoch": 0.4654233252768659, + "grad_norm": 2.9908732822400683, + "learning_rate": 2.3196988117406854e-06, + "loss": 0.9406, + "step": 6451 + }, + { + "epoch": 0.46549547274629344, + "grad_norm": 3.158755827922147, + "learning_rate": 2.319237452791725e-06, + "loss": 1.0016, + "step": 6452 + }, + { + "epoch": 0.46556762021572096, + "grad_norm": 3.62403396362322, + "learning_rate": 2.31877607641039e-06, + "loss": 0.9566, + "step": 6453 + }, + { + "epoch": 0.4656397676851484, + "grad_norm": 2.1729066443158827, + "learning_rate": 2.3183146826218725e-06, + "loss": 1.0333, + "step": 6454 + }, + { + "epoch": 0.46571191515457594, + "grad_norm": 1.9687409173665118, + "learning_rate": 2.3178532714513697e-06, + "loss": 1.0386, + "step": 6455 + }, + { + "epoch": 0.46578406262400346, + "grad_norm": 2.9217472150470605, + "learning_rate": 2.3173918429240764e-06, + "loss": 0.8802, + "step": 6456 + }, + { + "epoch": 0.465856210093431, + "grad_norm": 3.0687222232843427, + "learning_rate": 2.3169303970651888e-06, + "loss": 0.8671, + "step": 6457 + }, + { + "epoch": 0.4659283575628585, + "grad_norm": 2.3357365244268773, + "learning_rate": 2.316468933899905e-06, + "loss": 0.9652, + "step": 6458 + }, + { + "epoch": 0.466000505032286, + "grad_norm": 2.4813020522953955, + "learning_rate": 2.3160074534534254e-06, + "loss": 0.9055, + "step": 6459 + }, + { + "epoch": 0.4660726525017135, + "grad_norm": 2.2516398282243415, + "learning_rate": 2.315545955750948e-06, + "loss": 0.8653, + "step": 6460 + }, + { + "epoch": 0.466144799971141, + "grad_norm": 2.9855393939315698, + "learning_rate": 2.3150844408176743e-06, + "loss": 1.0062, + "step": 6461 + }, + { + "epoch": 0.4662169474405685, + "grad_norm": 2.6483381134559827, + "learning_rate": 2.3146229086788053e-06, + "loss": 0.8378, + "step": 6462 + }, + { + "epoch": 0.46628909490999604, + "grad_norm": 2.4884581211054377, + "learning_rate": 2.3141613593595447e-06, + "loss": 0.8745, + "step": 6463 + }, + { + "epoch": 0.46636124237942356, + "grad_norm": 0.7228695941151007, + "learning_rate": 2.313699792885094e-06, + "loss": 0.8075, + "step": 6464 + }, + { + "epoch": 0.4664333898488511, + "grad_norm": 2.8600671482873268, + "learning_rate": 2.31323820928066e-06, + "loss": 0.9192, + "step": 6465 + }, + { + "epoch": 0.46650553731827854, + "grad_norm": 2.2503573875575005, + "learning_rate": 2.312776608571446e-06, + "loss": 1.0058, + "step": 6466 + }, + { + "epoch": 0.46657768478770606, + "grad_norm": 2.3034830340453487, + "learning_rate": 2.3123149907826604e-06, + "loss": 1.059, + "step": 6467 + }, + { + "epoch": 0.4666498322571336, + "grad_norm": 3.0630894307809418, + "learning_rate": 2.311853355939509e-06, + "loss": 0.8742, + "step": 6468 + }, + { + "epoch": 0.4667219797265611, + "grad_norm": 4.750494880997125, + "learning_rate": 2.3113917040671998e-06, + "loss": 0.8503, + "step": 6469 + }, + { + "epoch": 0.4667941271959886, + "grad_norm": 2.332050368136406, + "learning_rate": 2.3109300351909424e-06, + "loss": 0.8068, + "step": 6470 + }, + { + "epoch": 0.46686627466541614, + "grad_norm": 2.0877282389056115, + "learning_rate": 2.310468349335947e-06, + "loss": 1.0063, + "step": 6471 + }, + { + "epoch": 0.4669384221348436, + "grad_norm": 2.245681751446112, + "learning_rate": 2.3100066465274246e-06, + "loss": 0.9349, + "step": 6472 + }, + { + "epoch": 0.4670105696042711, + "grad_norm": 3.4012848950621484, + "learning_rate": 2.3095449267905865e-06, + "loss": 0.8871, + "step": 6473 + }, + { + "epoch": 0.46708271707369864, + "grad_norm": 3.1036388451978265, + "learning_rate": 2.309083190150646e-06, + "loss": 0.8257, + "step": 6474 + }, + { + "epoch": 0.46715486454312616, + "grad_norm": 2.087076169234785, + "learning_rate": 2.308621436632817e-06, + "loss": 0.9499, + "step": 6475 + }, + { + "epoch": 0.4672270120125537, + "grad_norm": 2.976208282780739, + "learning_rate": 2.308159666262313e-06, + "loss": 0.9236, + "step": 6476 + }, + { + "epoch": 0.4672991594819812, + "grad_norm": 5.77299016682937, + "learning_rate": 2.307697879064351e-06, + "loss": 0.9031, + "step": 6477 + }, + { + "epoch": 0.46737130695140866, + "grad_norm": 3.4283690818612866, + "learning_rate": 2.3072360750641473e-06, + "loss": 0.8678, + "step": 6478 + }, + { + "epoch": 0.4674434544208362, + "grad_norm": 2.9692149551095146, + "learning_rate": 2.3067742542869177e-06, + "loss": 0.9081, + "step": 6479 + }, + { + "epoch": 0.4675156018902637, + "grad_norm": 2.10548301791026, + "learning_rate": 2.3063124167578823e-06, + "loss": 0.9813, + "step": 6480 + }, + { + "epoch": 0.4675877493596912, + "grad_norm": 4.054366200088964, + "learning_rate": 2.3058505625022603e-06, + "loss": 0.9794, + "step": 6481 + }, + { + "epoch": 0.46765989682911874, + "grad_norm": 3.574715668368629, + "learning_rate": 2.3053886915452704e-06, + "loss": 0.9505, + "step": 6482 + }, + { + "epoch": 0.4677320442985462, + "grad_norm": 2.555504344574503, + "learning_rate": 2.304926803912135e-06, + "loss": 1.0144, + "step": 6483 + }, + { + "epoch": 0.4678041917679737, + "grad_norm": 0.7528140997325325, + "learning_rate": 2.3044648996280757e-06, + "loss": 0.8037, + "step": 6484 + }, + { + "epoch": 0.46787633923740124, + "grad_norm": 2.075717420533442, + "learning_rate": 2.3040029787183147e-06, + "loss": 1.0126, + "step": 6485 + }, + { + "epoch": 0.46794848670682876, + "grad_norm": 4.597902873124712, + "learning_rate": 2.3035410412080768e-06, + "loss": 0.8657, + "step": 6486 + }, + { + "epoch": 0.4680206341762563, + "grad_norm": 9.614089925606908, + "learning_rate": 2.3030790871225873e-06, + "loss": 0.9372, + "step": 6487 + }, + { + "epoch": 0.4680927816456838, + "grad_norm": 3.800843476494724, + "learning_rate": 2.3026171164870694e-06, + "loss": 0.8874, + "step": 6488 + }, + { + "epoch": 0.46816492911511126, + "grad_norm": 2.432144639814975, + "learning_rate": 2.3021551293267518e-06, + "loss": 0.9708, + "step": 6489 + }, + { + "epoch": 0.4682370765845388, + "grad_norm": 3.3819825737953715, + "learning_rate": 2.301693125666861e-06, + "loss": 0.9133, + "step": 6490 + }, + { + "epoch": 0.4683092240539663, + "grad_norm": 2.4632680363274972, + "learning_rate": 2.3012311055326246e-06, + "loss": 0.9462, + "step": 6491 + }, + { + "epoch": 0.4683813715233938, + "grad_norm": 3.05437106861902, + "learning_rate": 2.300769068949274e-06, + "loss": 0.9434, + "step": 6492 + }, + { + "epoch": 0.46845351899282134, + "grad_norm": 0.7549605829168287, + "learning_rate": 2.300307015942037e-06, + "loss": 0.8221, + "step": 6493 + }, + { + "epoch": 0.46852566646224886, + "grad_norm": 0.6981056986969207, + "learning_rate": 2.299844946536146e-06, + "loss": 0.8033, + "step": 6494 + }, + { + "epoch": 0.4685978139316763, + "grad_norm": 3.025511668414599, + "learning_rate": 2.2993828607568324e-06, + "loss": 1.0049, + "step": 6495 + }, + { + "epoch": 0.46866996140110384, + "grad_norm": 3.9078991879957186, + "learning_rate": 2.298920758629329e-06, + "loss": 0.9587, + "step": 6496 + }, + { + "epoch": 0.46874210887053136, + "grad_norm": 3.3233556826141952, + "learning_rate": 2.2984586401788696e-06, + "loss": 1.0281, + "step": 6497 + }, + { + "epoch": 0.4688142563399589, + "grad_norm": 4.514738530459221, + "learning_rate": 2.297996505430688e-06, + "loss": 0.964, + "step": 6498 + }, + { + "epoch": 0.4688864038093864, + "grad_norm": 4.098612685537012, + "learning_rate": 2.297534354410022e-06, + "loss": 0.9848, + "step": 6499 + }, + { + "epoch": 0.4689585512788139, + "grad_norm": 10.93606854472135, + "learning_rate": 2.2970721871421055e-06, + "loss": 0.9147, + "step": 6500 + }, + { + "epoch": 0.4690306987482414, + "grad_norm": 2.392876356572361, + "learning_rate": 2.296610003652176e-06, + "loss": 0.9907, + "step": 6501 + }, + { + "epoch": 0.4691028462176689, + "grad_norm": 2.7659065405005445, + "learning_rate": 2.2961478039654725e-06, + "loss": 0.9271, + "step": 6502 + }, + { + "epoch": 0.4691749936870964, + "grad_norm": 3.8181957785446126, + "learning_rate": 2.2956855881072337e-06, + "loss": 1.0091, + "step": 6503 + }, + { + "epoch": 0.46924714115652394, + "grad_norm": 2.9455070937250465, + "learning_rate": 2.2952233561027003e-06, + "loss": 0.952, + "step": 6504 + }, + { + "epoch": 0.46931928862595146, + "grad_norm": 3.3347118547172405, + "learning_rate": 2.294761107977112e-06, + "loss": 0.9971, + "step": 6505 + }, + { + "epoch": 0.469391436095379, + "grad_norm": 2.9732405580777903, + "learning_rate": 2.2942988437557096e-06, + "loss": 0.9281, + "step": 6506 + }, + { + "epoch": 0.46946358356480644, + "grad_norm": 2.7671794940783134, + "learning_rate": 2.293836563463738e-06, + "loss": 0.89, + "step": 6507 + }, + { + "epoch": 0.46953573103423396, + "grad_norm": 2.7667265169341975, + "learning_rate": 2.293374267126439e-06, + "loss": 0.9201, + "step": 6508 + }, + { + "epoch": 0.4696078785036615, + "grad_norm": 2.215312497761442, + "learning_rate": 2.2929119547690577e-06, + "loss": 0.98, + "step": 6509 + }, + { + "epoch": 0.469680025973089, + "grad_norm": 3.7290258682920934, + "learning_rate": 2.292449626416838e-06, + "loss": 0.8563, + "step": 6510 + }, + { + "epoch": 0.4697521734425165, + "grad_norm": 2.87723437848956, + "learning_rate": 2.291987282095028e-06, + "loss": 0.8771, + "step": 6511 + }, + { + "epoch": 0.46982432091194404, + "grad_norm": 2.4831321054467854, + "learning_rate": 2.2915249218288726e-06, + "loss": 0.9735, + "step": 6512 + }, + { + "epoch": 0.4698964683813715, + "grad_norm": 0.8394404662869932, + "learning_rate": 2.2910625456436205e-06, + "loss": 0.8707, + "step": 6513 + }, + { + "epoch": 0.469968615850799, + "grad_norm": 2.3784256372893977, + "learning_rate": 2.2906001535645205e-06, + "loss": 0.9348, + "step": 6514 + }, + { + "epoch": 0.47004076332022654, + "grad_norm": 0.7656657538953142, + "learning_rate": 2.290137745616821e-06, + "loss": 0.7724, + "step": 6515 + }, + { + "epoch": 0.47011291078965406, + "grad_norm": 2.4175293358302, + "learning_rate": 2.2896753218257746e-06, + "loss": 0.9634, + "step": 6516 + }, + { + "epoch": 0.4701850582590816, + "grad_norm": 3.349947168873074, + "learning_rate": 2.289212882216631e-06, + "loss": 0.9761, + "step": 6517 + }, + { + "epoch": 0.4702572057285091, + "grad_norm": 3.153100434614959, + "learning_rate": 2.2887504268146415e-06, + "loss": 0.9302, + "step": 6518 + }, + { + "epoch": 0.47032935319793656, + "grad_norm": 2.1588608788055024, + "learning_rate": 2.288287955645061e-06, + "loss": 0.8449, + "step": 6519 + }, + { + "epoch": 0.4704015006673641, + "grad_norm": 2.443909166230788, + "learning_rate": 2.2878254687331426e-06, + "loss": 0.9397, + "step": 6520 + }, + { + "epoch": 0.4704736481367916, + "grad_norm": 3.463540091072049, + "learning_rate": 2.28736296610414e-06, + "loss": 0.8888, + "step": 6521 + }, + { + "epoch": 0.4705457956062191, + "grad_norm": 2.9290852245514376, + "learning_rate": 2.2869004477833103e-06, + "loss": 0.8555, + "step": 6522 + }, + { + "epoch": 0.47061794307564664, + "grad_norm": 3.03060654565153, + "learning_rate": 2.2864379137959095e-06, + "loss": 0.9612, + "step": 6523 + }, + { + "epoch": 0.47069009054507416, + "grad_norm": 2.7859110832566527, + "learning_rate": 2.285975364167194e-06, + "loss": 0.7796, + "step": 6524 + }, + { + "epoch": 0.4707622380145016, + "grad_norm": 3.1038765142299685, + "learning_rate": 2.2855127989224223e-06, + "loss": 0.9076, + "step": 6525 + }, + { + "epoch": 0.47083438548392914, + "grad_norm": 3.799351641907438, + "learning_rate": 2.285050218086854e-06, + "loss": 0.9273, + "step": 6526 + }, + { + "epoch": 0.47090653295335666, + "grad_norm": 2.1799033000042423, + "learning_rate": 2.284587621685748e-06, + "loss": 0.8467, + "step": 6527 + }, + { + "epoch": 0.4709786804227842, + "grad_norm": 2.3738310095295745, + "learning_rate": 2.284125009744366e-06, + "loss": 0.9395, + "step": 6528 + }, + { + "epoch": 0.4710508278922117, + "grad_norm": 2.0812154177063658, + "learning_rate": 2.283662382287969e-06, + "loss": 0.8944, + "step": 6529 + }, + { + "epoch": 0.4711229753616392, + "grad_norm": 4.08821535529377, + "learning_rate": 2.283199739341819e-06, + "loss": 0.9709, + "step": 6530 + }, + { + "epoch": 0.4711951228310667, + "grad_norm": 2.485980972303151, + "learning_rate": 2.2827370809311798e-06, + "loss": 0.8528, + "step": 6531 + }, + { + "epoch": 0.4712672703004942, + "grad_norm": 3.3336727128512016, + "learning_rate": 2.2822744070813153e-06, + "loss": 0.8864, + "step": 6532 + }, + { + "epoch": 0.4713394177699217, + "grad_norm": 4.689370151025623, + "learning_rate": 2.2818117178174905e-06, + "loss": 1.0394, + "step": 6533 + }, + { + "epoch": 0.47141156523934924, + "grad_norm": 6.526982277450007, + "learning_rate": 2.2813490131649714e-06, + "loss": 0.9533, + "step": 6534 + }, + { + "epoch": 0.47148371270877676, + "grad_norm": 11.210500590354034, + "learning_rate": 2.2808862931490234e-06, + "loss": 1.0189, + "step": 6535 + }, + { + "epoch": 0.4715558601782042, + "grad_norm": 3.2024672414398117, + "learning_rate": 2.280423557794915e-06, + "loss": 0.9707, + "step": 6536 + }, + { + "epoch": 0.47162800764763174, + "grad_norm": 3.345259708877089, + "learning_rate": 2.2799608071279143e-06, + "loss": 0.905, + "step": 6537 + }, + { + "epoch": 0.47170015511705926, + "grad_norm": 2.2670974682020995, + "learning_rate": 2.2794980411732906e-06, + "loss": 0.9014, + "step": 6538 + }, + { + "epoch": 0.4717723025864868, + "grad_norm": 3.1856583903783515, + "learning_rate": 2.279035259956313e-06, + "loss": 0.9929, + "step": 6539 + }, + { + "epoch": 0.4718444500559143, + "grad_norm": 2.100732121685799, + "learning_rate": 2.278572463502253e-06, + "loss": 0.8966, + "step": 6540 + }, + { + "epoch": 0.4719165975253418, + "grad_norm": 3.0373370217135403, + "learning_rate": 2.278109651836382e-06, + "loss": 1.008, + "step": 6541 + }, + { + "epoch": 0.4719887449947693, + "grad_norm": 2.2119188051825533, + "learning_rate": 2.2776468249839724e-06, + "loss": 0.964, + "step": 6542 + }, + { + "epoch": 0.4720608924641968, + "grad_norm": 4.212482459147804, + "learning_rate": 2.277183982970298e-06, + "loss": 0.8351, + "step": 6543 + }, + { + "epoch": 0.4721330399336243, + "grad_norm": 2.8691830044415214, + "learning_rate": 2.276721125820632e-06, + "loss": 0.9312, + "step": 6544 + }, + { + "epoch": 0.47220518740305184, + "grad_norm": 2.5748686599568202, + "learning_rate": 2.276258253560249e-06, + "loss": 0.7782, + "step": 6545 + }, + { + "epoch": 0.47227733487247936, + "grad_norm": 2.663567550189292, + "learning_rate": 2.2757953662144266e-06, + "loss": 0.9506, + "step": 6546 + }, + { + "epoch": 0.4723494823419069, + "grad_norm": 3.3466417874276497, + "learning_rate": 2.2753324638084398e-06, + "loss": 0.8925, + "step": 6547 + }, + { + "epoch": 0.47242162981133434, + "grad_norm": 3.887837587276699, + "learning_rate": 2.274869546367566e-06, + "loss": 0.9664, + "step": 6548 + }, + { + "epoch": 0.47249377728076186, + "grad_norm": 3.4535288855025503, + "learning_rate": 2.2744066139170846e-06, + "loss": 0.942, + "step": 6549 + }, + { + "epoch": 0.4725659247501894, + "grad_norm": 2.606336549195464, + "learning_rate": 2.2739436664822728e-06, + "loss": 0.9303, + "step": 6550 + }, + { + "epoch": 0.4726380722196169, + "grad_norm": 0.7337321044325249, + "learning_rate": 2.2734807040884123e-06, + "loss": 0.7602, + "step": 6551 + }, + { + "epoch": 0.4727102196890444, + "grad_norm": 9.48870941615757, + "learning_rate": 2.2730177267607824e-06, + "loss": 0.828, + "step": 6552 + }, + { + "epoch": 0.47278236715847194, + "grad_norm": 8.508640161474743, + "learning_rate": 2.2725547345246657e-06, + "loss": 0.9735, + "step": 6553 + }, + { + "epoch": 0.4728545146278994, + "grad_norm": 2.842091506419062, + "learning_rate": 2.272091727405343e-06, + "loss": 0.8934, + "step": 6554 + }, + { + "epoch": 0.4729266620973269, + "grad_norm": 3.1123664306414796, + "learning_rate": 2.2716287054280996e-06, + "loss": 0.9271, + "step": 6555 + }, + { + "epoch": 0.47299880956675444, + "grad_norm": 2.8864181010979166, + "learning_rate": 2.2711656686182175e-06, + "loss": 0.9279, + "step": 6556 + }, + { + "epoch": 0.47307095703618196, + "grad_norm": 3.457250831656315, + "learning_rate": 2.2707026170009815e-06, + "loss": 0.867, + "step": 6557 + }, + { + "epoch": 0.4731431045056095, + "grad_norm": 2.725959077098306, + "learning_rate": 2.2702395506016784e-06, + "loss": 0.9786, + "step": 6558 + }, + { + "epoch": 0.473215251975037, + "grad_norm": 2.9962882285927885, + "learning_rate": 2.2697764694455935e-06, + "loss": 0.887, + "step": 6559 + }, + { + "epoch": 0.47328739944446446, + "grad_norm": 4.725820536741043, + "learning_rate": 2.2693133735580144e-06, + "loss": 0.9911, + "step": 6560 + }, + { + "epoch": 0.473359546913892, + "grad_norm": 7.434953269580759, + "learning_rate": 2.2688502629642295e-06, + "loss": 1.038, + "step": 6561 + }, + { + "epoch": 0.4734316943833195, + "grad_norm": 3.490352139783661, + "learning_rate": 2.268387137689526e-06, + "loss": 0.9287, + "step": 6562 + }, + { + "epoch": 0.473503841852747, + "grad_norm": 4.356674615861976, + "learning_rate": 2.267923997759195e-06, + "loss": 0.919, + "step": 6563 + }, + { + "epoch": 0.47357598932217454, + "grad_norm": 1.997009067482341, + "learning_rate": 2.2674608431985257e-06, + "loss": 0.9826, + "step": 6564 + }, + { + "epoch": 0.47364813679160206, + "grad_norm": 2.5044700713803594, + "learning_rate": 2.266997674032811e-06, + "loss": 0.8558, + "step": 6565 + }, + { + "epoch": 0.4737202842610295, + "grad_norm": 3.3062358141993196, + "learning_rate": 2.266534490287341e-06, + "loss": 0.9584, + "step": 6566 + }, + { + "epoch": 0.47379243173045704, + "grad_norm": 2.016324062512556, + "learning_rate": 2.266071291987409e-06, + "loss": 0.9029, + "step": 6567 + }, + { + "epoch": 0.47386457919988456, + "grad_norm": 2.150936601365099, + "learning_rate": 2.2656080791583093e-06, + "loss": 0.8206, + "step": 6568 + }, + { + "epoch": 0.4739367266693121, + "grad_norm": 2.723567246516077, + "learning_rate": 2.265144851825335e-06, + "loss": 1.0762, + "step": 6569 + }, + { + "epoch": 0.4740088741387396, + "grad_norm": 2.641814466206634, + "learning_rate": 2.264681610013782e-06, + "loss": 0.9735, + "step": 6570 + }, + { + "epoch": 0.4740810216081671, + "grad_norm": 2.3358297163678663, + "learning_rate": 2.2642183537489464e-06, + "loss": 0.947, + "step": 6571 + }, + { + "epoch": 0.4741531690775946, + "grad_norm": 2.2812680805482657, + "learning_rate": 2.2637550830561244e-06, + "loss": 0.9618, + "step": 6572 + }, + { + "epoch": 0.4742253165470221, + "grad_norm": 3.6693380479181665, + "learning_rate": 2.263291797960614e-06, + "loss": 0.7653, + "step": 6573 + }, + { + "epoch": 0.4742974640164496, + "grad_norm": 2.9115578367714483, + "learning_rate": 2.2628284984877127e-06, + "loss": 0.8877, + "step": 6574 + }, + { + "epoch": 0.47436961148587714, + "grad_norm": 3.5309271200648524, + "learning_rate": 2.2623651846627203e-06, + "loss": 1.0162, + "step": 6575 + }, + { + "epoch": 0.47444175895530466, + "grad_norm": 3.9735062101590897, + "learning_rate": 2.261901856510936e-06, + "loss": 1.0296, + "step": 6576 + }, + { + "epoch": 0.4745139064247322, + "grad_norm": 3.053401900886118, + "learning_rate": 2.2614385140576616e-06, + "loss": 0.9271, + "step": 6577 + }, + { + "epoch": 0.47458605389415964, + "grad_norm": 2.617470706021579, + "learning_rate": 2.2609751573281978e-06, + "loss": 0.9768, + "step": 6578 + }, + { + "epoch": 0.47465820136358716, + "grad_norm": 3.7817245059565927, + "learning_rate": 2.260511786347846e-06, + "loss": 0.9448, + "step": 6579 + }, + { + "epoch": 0.4747303488330147, + "grad_norm": 2.7465317269698883, + "learning_rate": 2.2600484011419107e-06, + "loss": 0.8253, + "step": 6580 + }, + { + "epoch": 0.4748024963024422, + "grad_norm": 2.7248876286006687, + "learning_rate": 2.2595850017356935e-06, + "loss": 0.8654, + "step": 6581 + }, + { + "epoch": 0.4748746437718697, + "grad_norm": 2.2275726703816097, + "learning_rate": 2.259121588154502e-06, + "loss": 0.8613, + "step": 6582 + }, + { + "epoch": 0.47494679124129724, + "grad_norm": 3.0417286766385256, + "learning_rate": 2.2586581604236392e-06, + "loss": 0.9616, + "step": 6583 + }, + { + "epoch": 0.4750189387107247, + "grad_norm": 3.07216520442406, + "learning_rate": 2.2581947185684116e-06, + "loss": 0.9771, + "step": 6584 + }, + { + "epoch": 0.4750910861801522, + "grad_norm": 2.9398093378433736, + "learning_rate": 2.257731262614126e-06, + "loss": 0.9108, + "step": 6585 + }, + { + "epoch": 0.47516323364957974, + "grad_norm": 2.885255352335587, + "learning_rate": 2.2572677925860905e-06, + "loss": 0.8725, + "step": 6586 + }, + { + "epoch": 0.47523538111900726, + "grad_norm": 4.512667099667002, + "learning_rate": 2.2568043085096132e-06, + "loss": 0.8604, + "step": 6587 + }, + { + "epoch": 0.4753075285884348, + "grad_norm": 3.2138625880683436, + "learning_rate": 2.2563408104100037e-06, + "loss": 0.9692, + "step": 6588 + }, + { + "epoch": 0.47537967605786224, + "grad_norm": 5.536471386305266, + "learning_rate": 2.2558772983125707e-06, + "loss": 0.965, + "step": 6589 + }, + { + "epoch": 0.47545182352728976, + "grad_norm": 3.793541469216082, + "learning_rate": 2.2554137722426263e-06, + "loss": 1.0229, + "step": 6590 + }, + { + "epoch": 0.4755239709967173, + "grad_norm": 2.4030389539694177, + "learning_rate": 2.254950232225481e-06, + "loss": 0.8775, + "step": 6591 + }, + { + "epoch": 0.4755961184661448, + "grad_norm": 4.109306305865953, + "learning_rate": 2.254486678286448e-06, + "loss": 1.0629, + "step": 6592 + }, + { + "epoch": 0.4756682659355723, + "grad_norm": 2.864402811546741, + "learning_rate": 2.2540231104508376e-06, + "loss": 1.0229, + "step": 6593 + }, + { + "epoch": 0.47574041340499984, + "grad_norm": 6.81874987692269, + "learning_rate": 2.2535595287439677e-06, + "loss": 1.0198, + "step": 6594 + }, + { + "epoch": 0.4758125608744273, + "grad_norm": 5.577812121936965, + "learning_rate": 2.25309593319115e-06, + "loss": 0.964, + "step": 6595 + }, + { + "epoch": 0.4758847083438548, + "grad_norm": 0.7004869563952928, + "learning_rate": 2.2526323238176992e-06, + "loss": 0.8043, + "step": 6596 + }, + { + "epoch": 0.47595685581328234, + "grad_norm": 2.143614526143713, + "learning_rate": 2.2521687006489337e-06, + "loss": 1.0319, + "step": 6597 + }, + { + "epoch": 0.47602900328270986, + "grad_norm": 2.0727498228574324, + "learning_rate": 2.2517050637101683e-06, + "loss": 0.9773, + "step": 6598 + }, + { + "epoch": 0.4761011507521374, + "grad_norm": 4.111120745215139, + "learning_rate": 2.2512414130267213e-06, + "loss": 0.9198, + "step": 6599 + }, + { + "epoch": 0.4761732982215649, + "grad_norm": 2.292682688348814, + "learning_rate": 2.2507777486239116e-06, + "loss": 0.9503, + "step": 6600 + }, + { + "epoch": 0.47624544569099236, + "grad_norm": 3.5226259502259216, + "learning_rate": 2.2503140705270564e-06, + "loss": 0.9508, + "step": 6601 + }, + { + "epoch": 0.4763175931604199, + "grad_norm": 2.524826092915905, + "learning_rate": 2.2498503787614767e-06, + "loss": 0.9506, + "step": 6602 + }, + { + "epoch": 0.4763897406298474, + "grad_norm": 4.452144046286769, + "learning_rate": 2.249386673352492e-06, + "loss": 1.0014, + "step": 6603 + }, + { + "epoch": 0.4764618880992749, + "grad_norm": 2.472711306376342, + "learning_rate": 2.2489229543254256e-06, + "loss": 0.9047, + "step": 6604 + }, + { + "epoch": 0.47653403556870244, + "grad_norm": 3.181608402470732, + "learning_rate": 2.2484592217055983e-06, + "loss": 0.8784, + "step": 6605 + }, + { + "epoch": 0.47660618303812996, + "grad_norm": 13.720551739600799, + "learning_rate": 2.2479954755183313e-06, + "loss": 0.9724, + "step": 6606 + }, + { + "epoch": 0.4766783305075574, + "grad_norm": 2.5220570287042237, + "learning_rate": 2.2475317157889502e-06, + "loss": 0.8786, + "step": 6607 + }, + { + "epoch": 0.47675047797698494, + "grad_norm": 3.135806358343705, + "learning_rate": 2.2470679425427775e-06, + "loss": 0.9021, + "step": 6608 + }, + { + "epoch": 0.47682262544641246, + "grad_norm": 2.317951169668913, + "learning_rate": 2.24660415580514e-06, + "loss": 0.9132, + "step": 6609 + }, + { + "epoch": 0.47689477291584, + "grad_norm": 2.0649516244537613, + "learning_rate": 2.246140355601363e-06, + "loss": 0.93, + "step": 6610 + }, + { + "epoch": 0.4769669203852675, + "grad_norm": 2.562842090381441, + "learning_rate": 2.245676541956771e-06, + "loss": 0.8944, + "step": 6611 + }, + { + "epoch": 0.477039067854695, + "grad_norm": 2.9739065260419273, + "learning_rate": 2.245212714896694e-06, + "loss": 0.9816, + "step": 6612 + }, + { + "epoch": 0.4771112153241225, + "grad_norm": 7.818828003613946, + "learning_rate": 2.244748874446457e-06, + "loss": 0.8068, + "step": 6613 + }, + { + "epoch": 0.47718336279355, + "grad_norm": 7.570693440492877, + "learning_rate": 2.2442850206313904e-06, + "loss": 0.9233, + "step": 6614 + }, + { + "epoch": 0.4772555102629775, + "grad_norm": 3.245665373922953, + "learning_rate": 2.243821153476823e-06, + "loss": 0.8396, + "step": 6615 + }, + { + "epoch": 0.47732765773240504, + "grad_norm": 5.0544642969580496, + "learning_rate": 2.2433572730080857e-06, + "loss": 0.9582, + "step": 6616 + }, + { + "epoch": 0.47739980520183256, + "grad_norm": 2.8978733223133295, + "learning_rate": 2.2428933792505087e-06, + "loss": 0.9367, + "step": 6617 + }, + { + "epoch": 0.4774719526712601, + "grad_norm": 36.606445612099, + "learning_rate": 2.242429472229422e-06, + "loss": 0.9203, + "step": 6618 + }, + { + "epoch": 0.47754410014068754, + "grad_norm": 2.1791131688316887, + "learning_rate": 2.2419655519701606e-06, + "loss": 0.9502, + "step": 6619 + }, + { + "epoch": 0.47761624761011506, + "grad_norm": 3.3476841679480573, + "learning_rate": 2.241501618498055e-06, + "loss": 0.9378, + "step": 6620 + }, + { + "epoch": 0.4776883950795426, + "grad_norm": 2.3125601064757717, + "learning_rate": 2.2410376718384413e-06, + "loss": 0.906, + "step": 6621 + }, + { + "epoch": 0.4777605425489701, + "grad_norm": 3.785678130745789, + "learning_rate": 2.2405737120166524e-06, + "loss": 0.9732, + "step": 6622 + }, + { + "epoch": 0.4778326900183976, + "grad_norm": 2.2630386849340547, + "learning_rate": 2.2401097390580224e-06, + "loss": 0.8874, + "step": 6623 + }, + { + "epoch": 0.47790483748782514, + "grad_norm": 3.086883477189511, + "learning_rate": 2.23964575298789e-06, + "loss": 0.9293, + "step": 6624 + }, + { + "epoch": 0.4779769849572526, + "grad_norm": 3.593041391586018, + "learning_rate": 2.2391817538315893e-06, + "loss": 1.0432, + "step": 6625 + }, + { + "epoch": 0.4780491324266801, + "grad_norm": 5.9881025612405425, + "learning_rate": 2.2387177416144586e-06, + "loss": 0.8694, + "step": 6626 + }, + { + "epoch": 0.47812127989610764, + "grad_norm": 2.3978002242101337, + "learning_rate": 2.2382537163618364e-06, + "loss": 0.9457, + "step": 6627 + }, + { + "epoch": 0.47819342736553516, + "grad_norm": 4.893464552932875, + "learning_rate": 2.2377896780990597e-06, + "loss": 1.0096, + "step": 6628 + }, + { + "epoch": 0.4782655748349627, + "grad_norm": 2.954228109376289, + "learning_rate": 2.2373256268514694e-06, + "loss": 0.9241, + "step": 6629 + }, + { + "epoch": 0.4783377223043902, + "grad_norm": 2.94212561495176, + "learning_rate": 2.236861562644405e-06, + "loss": 0.8913, + "step": 6630 + }, + { + "epoch": 0.47840986977381766, + "grad_norm": 2.4182930329862615, + "learning_rate": 2.2363974855032074e-06, + "loss": 0.9127, + "step": 6631 + }, + { + "epoch": 0.4784820172432452, + "grad_norm": 0.8689400430743766, + "learning_rate": 2.235933395453219e-06, + "loss": 0.9063, + "step": 6632 + }, + { + "epoch": 0.4785541647126727, + "grad_norm": 2.7436337040126975, + "learning_rate": 2.23546929251978e-06, + "loss": 0.8561, + "step": 6633 + }, + { + "epoch": 0.4786263121821002, + "grad_norm": 2.5119307029605396, + "learning_rate": 2.2350051767282356e-06, + "loss": 0.8782, + "step": 6634 + }, + { + "epoch": 0.47869845965152774, + "grad_norm": 3.201139700112863, + "learning_rate": 2.2345410481039278e-06, + "loss": 0.9775, + "step": 6635 + }, + { + "epoch": 0.4787706071209552, + "grad_norm": 3.493924317080135, + "learning_rate": 2.2340769066722026e-06, + "loss": 0.9086, + "step": 6636 + }, + { + "epoch": 0.4788427545903827, + "grad_norm": 2.1580724340025315, + "learning_rate": 2.233612752458403e-06, + "loss": 0.9704, + "step": 6637 + }, + { + "epoch": 0.47891490205981024, + "grad_norm": 2.514946698027793, + "learning_rate": 2.2331485854878764e-06, + "loss": 0.9013, + "step": 6638 + }, + { + "epoch": 0.47898704952923776, + "grad_norm": 2.73720382412057, + "learning_rate": 2.2326844057859685e-06, + "loss": 1.0037, + "step": 6639 + }, + { + "epoch": 0.4790591969986653, + "grad_norm": 7.9411555005297405, + "learning_rate": 2.2322202133780265e-06, + "loss": 0.9706, + "step": 6640 + }, + { + "epoch": 0.4791313444680928, + "grad_norm": 3.671083405572053, + "learning_rate": 2.2317560082893986e-06, + "loss": 0.9058, + "step": 6641 + }, + { + "epoch": 0.47920349193752027, + "grad_norm": 2.302832112005559, + "learning_rate": 2.2312917905454323e-06, + "loss": 0.9268, + "step": 6642 + }, + { + "epoch": 0.4792756394069478, + "grad_norm": 2.625590575859038, + "learning_rate": 2.230827560171478e-06, + "loss": 0.8727, + "step": 6643 + }, + { + "epoch": 0.4793477868763753, + "grad_norm": 2.4816093165438584, + "learning_rate": 2.2303633171928856e-06, + "loss": 0.8996, + "step": 6644 + }, + { + "epoch": 0.4794199343458028, + "grad_norm": 3.233078406466636, + "learning_rate": 2.229899061635004e-06, + "loss": 0.9354, + "step": 6645 + }, + { + "epoch": 0.47949208181523034, + "grad_norm": 2.836597040160888, + "learning_rate": 2.229434793523187e-06, + "loss": 0.7814, + "step": 6646 + }, + { + "epoch": 0.47956422928465786, + "grad_norm": 3.728958607131644, + "learning_rate": 2.228970512882784e-06, + "loss": 1.0116, + "step": 6647 + }, + { + "epoch": 0.4796363767540853, + "grad_norm": 3.86487770844472, + "learning_rate": 2.2285062197391503e-06, + "loss": 0.9895, + "step": 6648 + }, + { + "epoch": 0.47970852422351284, + "grad_norm": 4.784898749449195, + "learning_rate": 2.228041914117637e-06, + "loss": 0.9297, + "step": 6649 + }, + { + "epoch": 0.47978067169294036, + "grad_norm": 3.8961122110410207, + "learning_rate": 2.227577596043599e-06, + "loss": 1.0413, + "step": 6650 + }, + { + "epoch": 0.4798528191623679, + "grad_norm": 2.7525815983872355, + "learning_rate": 2.227113265542391e-06, + "loss": 0.9701, + "step": 6651 + }, + { + "epoch": 0.4799249666317954, + "grad_norm": 2.4565429202227436, + "learning_rate": 2.2266489226393682e-06, + "loss": 1.0, + "step": 6652 + }, + { + "epoch": 0.4799971141012229, + "grad_norm": 11.215220286586446, + "learning_rate": 2.2261845673598873e-06, + "loss": 0.9347, + "step": 6653 + }, + { + "epoch": 0.4800692615706504, + "grad_norm": 4.030331529180623, + "learning_rate": 2.225720199729304e-06, + "loss": 0.8655, + "step": 6654 + }, + { + "epoch": 0.4801414090400779, + "grad_norm": 4.024639535246837, + "learning_rate": 2.2252558197729765e-06, + "loss": 0.9904, + "step": 6655 + }, + { + "epoch": 0.4802135565095054, + "grad_norm": 2.901911789387512, + "learning_rate": 2.2247914275162626e-06, + "loss": 0.9617, + "step": 6656 + }, + { + "epoch": 0.48028570397893294, + "grad_norm": 3.8908665593544374, + "learning_rate": 2.22432702298452e-06, + "loss": 0.9351, + "step": 6657 + }, + { + "epoch": 0.48035785144836046, + "grad_norm": 30.42665060263011, + "learning_rate": 2.22386260620311e-06, + "loss": 0.9358, + "step": 6658 + }, + { + "epoch": 0.480429998917788, + "grad_norm": 8.866362761067236, + "learning_rate": 2.2233981771973915e-06, + "loss": 0.8617, + "step": 6659 + }, + { + "epoch": 0.48050214638721545, + "grad_norm": 2.8726501813362266, + "learning_rate": 2.2229337359927256e-06, + "loss": 0.8826, + "step": 6660 + }, + { + "epoch": 0.48057429385664296, + "grad_norm": 2.3360557908946884, + "learning_rate": 2.222469282614474e-06, + "loss": 0.9517, + "step": 6661 + }, + { + "epoch": 0.4806464413260705, + "grad_norm": 3.131205386759589, + "learning_rate": 2.222004817087998e-06, + "loss": 0.9973, + "step": 6662 + }, + { + "epoch": 0.480718588795498, + "grad_norm": 4.218107725631741, + "learning_rate": 2.2215403394386607e-06, + "loss": 0.9929, + "step": 6663 + }, + { + "epoch": 0.4807907362649255, + "grad_norm": 3.3498661754695047, + "learning_rate": 2.221075849691826e-06, + "loss": 0.9251, + "step": 6664 + }, + { + "epoch": 0.48086288373435304, + "grad_norm": 2.755179642429972, + "learning_rate": 2.2206113478728574e-06, + "loss": 0.8789, + "step": 6665 + }, + { + "epoch": 0.4809350312037805, + "grad_norm": 3.2006966666995607, + "learning_rate": 2.2201468340071197e-06, + "loss": 0.9536, + "step": 6666 + }, + { + "epoch": 0.481007178673208, + "grad_norm": 2.899330680164706, + "learning_rate": 2.219682308119978e-06, + "loss": 0.9467, + "step": 6667 + }, + { + "epoch": 0.48107932614263554, + "grad_norm": 3.3565294145886067, + "learning_rate": 2.2192177702367997e-06, + "loss": 0.8667, + "step": 6668 + }, + { + "epoch": 0.48115147361206306, + "grad_norm": 2.848129254108372, + "learning_rate": 2.218753220382949e-06, + "loss": 1.0005, + "step": 6669 + }, + { + "epoch": 0.4812236210814906, + "grad_norm": 3.6671907454981376, + "learning_rate": 2.218288658583796e-06, + "loss": 1.0085, + "step": 6670 + }, + { + "epoch": 0.4812957685509181, + "grad_norm": 2.631200823471576, + "learning_rate": 2.2178240848647074e-06, + "loss": 0.9508, + "step": 6671 + }, + { + "epoch": 0.48136791602034557, + "grad_norm": 4.076288620977272, + "learning_rate": 2.2173594992510507e-06, + "loss": 0.9044, + "step": 6672 + }, + { + "epoch": 0.4814400634897731, + "grad_norm": 2.6656754161364686, + "learning_rate": 2.216894901768198e-06, + "loss": 0.9026, + "step": 6673 + }, + { + "epoch": 0.4815122109592006, + "grad_norm": 7.633806739797397, + "learning_rate": 2.2164302924415158e-06, + "loss": 0.9117, + "step": 6674 + }, + { + "epoch": 0.4815843584286281, + "grad_norm": 2.0975741862293784, + "learning_rate": 2.215965671296378e-06, + "loss": 0.839, + "step": 6675 + }, + { + "epoch": 0.48165650589805564, + "grad_norm": 2.4294144946225478, + "learning_rate": 2.215501038358154e-06, + "loss": 0.9474, + "step": 6676 + }, + { + "epoch": 0.48172865336748316, + "grad_norm": 0.8197405319714128, + "learning_rate": 2.215036393652216e-06, + "loss": 0.8817, + "step": 6677 + }, + { + "epoch": 0.4818008008369106, + "grad_norm": 1.9835702060115024, + "learning_rate": 2.214571737203936e-06, + "loss": 0.9154, + "step": 6678 + }, + { + "epoch": 0.48187294830633814, + "grad_norm": 3.9186744548810637, + "learning_rate": 2.2141070690386885e-06, + "loss": 0.9586, + "step": 6679 + }, + { + "epoch": 0.48194509577576566, + "grad_norm": 2.469241419302312, + "learning_rate": 2.2136423891818463e-06, + "loss": 0.9778, + "step": 6680 + }, + { + "epoch": 0.4820172432451932, + "grad_norm": 41.35063186771606, + "learning_rate": 2.213177697658784e-06, + "loss": 0.9217, + "step": 6681 + }, + { + "epoch": 0.4820893907146207, + "grad_norm": 3.616619156087384, + "learning_rate": 2.212712994494877e-06, + "loss": 0.907, + "step": 6682 + }, + { + "epoch": 0.4821615381840482, + "grad_norm": 2.0645060176082612, + "learning_rate": 2.212248279715501e-06, + "loss": 0.9418, + "step": 6683 + }, + { + "epoch": 0.4822336856534757, + "grad_norm": 3.5497157614377395, + "learning_rate": 2.2117835533460313e-06, + "loss": 0.8922, + "step": 6684 + }, + { + "epoch": 0.4823058331229032, + "grad_norm": 6.709545270858467, + "learning_rate": 2.2113188154118466e-06, + "loss": 0.9772, + "step": 6685 + }, + { + "epoch": 0.4823779805923307, + "grad_norm": 4.410337892074215, + "learning_rate": 2.210854065938323e-06, + "loss": 0.9819, + "step": 6686 + }, + { + "epoch": 0.48245012806175824, + "grad_norm": 2.684758673394854, + "learning_rate": 2.21038930495084e-06, + "loss": 0.9028, + "step": 6687 + }, + { + "epoch": 0.48252227553118576, + "grad_norm": 2.2214792148563025, + "learning_rate": 2.2099245324747762e-06, + "loss": 0.8831, + "step": 6688 + }, + { + "epoch": 0.4825944230006132, + "grad_norm": 2.2344140536223143, + "learning_rate": 2.20945974853551e-06, + "loss": 0.8651, + "step": 6689 + }, + { + "epoch": 0.48266657047004075, + "grad_norm": 3.503302514702589, + "learning_rate": 2.2089949531584224e-06, + "loss": 0.8909, + "step": 6690 + }, + { + "epoch": 0.48273871793946826, + "grad_norm": 2.82881215214773, + "learning_rate": 2.2085301463688943e-06, + "loss": 0.9086, + "step": 6691 + }, + { + "epoch": 0.4828108654088958, + "grad_norm": 7.996954815172959, + "learning_rate": 2.208065328192307e-06, + "loss": 0.8624, + "step": 6692 + }, + { + "epoch": 0.4828830128783233, + "grad_norm": 2.7875485197899437, + "learning_rate": 2.207600498654042e-06, + "loss": 0.8936, + "step": 6693 + }, + { + "epoch": 0.4829551603477508, + "grad_norm": 11.96804432992324, + "learning_rate": 2.2071356577794826e-06, + "loss": 0.9183, + "step": 6694 + }, + { + "epoch": 0.4830273078171783, + "grad_norm": 0.6780173730481099, + "learning_rate": 2.2066708055940118e-06, + "loss": 0.7889, + "step": 6695 + }, + { + "epoch": 0.4830994552866058, + "grad_norm": 0.7333837271483685, + "learning_rate": 2.2062059421230124e-06, + "loss": 0.7928, + "step": 6696 + }, + { + "epoch": 0.4831716027560333, + "grad_norm": 2.8145625710633375, + "learning_rate": 2.2057410673918712e-06, + "loss": 0.8949, + "step": 6697 + }, + { + "epoch": 0.48324375022546084, + "grad_norm": 0.8070136216962713, + "learning_rate": 2.2052761814259718e-06, + "loss": 0.7924, + "step": 6698 + }, + { + "epoch": 0.48331589769488836, + "grad_norm": 4.5359325549512315, + "learning_rate": 2.204811284250699e-06, + "loss": 0.9064, + "step": 6699 + }, + { + "epoch": 0.4833880451643159, + "grad_norm": 3.4609770417376446, + "learning_rate": 2.204346375891441e-06, + "loss": 0.9787, + "step": 6700 + }, + { + "epoch": 0.48346019263374335, + "grad_norm": 5.150763904988428, + "learning_rate": 2.2038814563735837e-06, + "loss": 0.9637, + "step": 6701 + }, + { + "epoch": 0.48353234010317087, + "grad_norm": 6.246368268572616, + "learning_rate": 2.2034165257225145e-06, + "loss": 1.0615, + "step": 6702 + }, + { + "epoch": 0.4836044875725984, + "grad_norm": 4.433215154540248, + "learning_rate": 2.202951583963622e-06, + "loss": 0.8688, + "step": 6703 + }, + { + "epoch": 0.4836766350420259, + "grad_norm": 2.277517531292938, + "learning_rate": 2.202486631122295e-06, + "loss": 0.9072, + "step": 6704 + }, + { + "epoch": 0.4837487825114534, + "grad_norm": 2.9047918455887216, + "learning_rate": 2.202021667223923e-06, + "loss": 0.9317, + "step": 6705 + }, + { + "epoch": 0.48382092998088094, + "grad_norm": 2.933064947710465, + "learning_rate": 2.2015566922938944e-06, + "loss": 0.9447, + "step": 6706 + }, + { + "epoch": 0.4838930774503084, + "grad_norm": 4.037628568604077, + "learning_rate": 2.2010917063576022e-06, + "loss": 0.9687, + "step": 6707 + }, + { + "epoch": 0.4839652249197359, + "grad_norm": 4.867938086373794, + "learning_rate": 2.2006267094404355e-06, + "loss": 0.8003, + "step": 6708 + }, + { + "epoch": 0.48403737238916344, + "grad_norm": 0.7250427430805878, + "learning_rate": 2.200161701567788e-06, + "loss": 0.7955, + "step": 6709 + }, + { + "epoch": 0.48410951985859096, + "grad_norm": 3.0526280009357785, + "learning_rate": 2.1996966827650502e-06, + "loss": 0.8361, + "step": 6710 + }, + { + "epoch": 0.4841816673280185, + "grad_norm": 2.507448639424506, + "learning_rate": 2.199231653057615e-06, + "loss": 0.8868, + "step": 6711 + }, + { + "epoch": 0.484253814797446, + "grad_norm": 4.297202135984502, + "learning_rate": 2.1987666124708784e-06, + "loss": 0.8515, + "step": 6712 + }, + { + "epoch": 0.48432596226687347, + "grad_norm": 2.2976496421721544, + "learning_rate": 2.1983015610302317e-06, + "loss": 0.8378, + "step": 6713 + }, + { + "epoch": 0.484398109736301, + "grad_norm": 2.8118307376997542, + "learning_rate": 2.197836498761071e-06, + "loss": 0.9919, + "step": 6714 + }, + { + "epoch": 0.4844702572057285, + "grad_norm": 3.1769296744454496, + "learning_rate": 2.1973714256887923e-06, + "loss": 0.8844, + "step": 6715 + }, + { + "epoch": 0.484542404675156, + "grad_norm": 2.5624482684612464, + "learning_rate": 2.1969063418387897e-06, + "loss": 0.9813, + "step": 6716 + }, + { + "epoch": 0.48461455214458354, + "grad_norm": 3.90978672899627, + "learning_rate": 2.196441247236461e-06, + "loss": 0.8888, + "step": 6717 + }, + { + "epoch": 0.48468669961401106, + "grad_norm": 3.1638088548507066, + "learning_rate": 2.195976141907203e-06, + "loss": 0.9805, + "step": 6718 + }, + { + "epoch": 0.4847588470834385, + "grad_norm": 3.24718837896252, + "learning_rate": 2.1955110258764136e-06, + "loss": 0.9498, + "step": 6719 + }, + { + "epoch": 0.48483099455286605, + "grad_norm": 0.6457848581969672, + "learning_rate": 2.1950458991694908e-06, + "loss": 0.779, + "step": 6720 + }, + { + "epoch": 0.48490314202229357, + "grad_norm": 9.202821580842917, + "learning_rate": 2.1945807618118333e-06, + "loss": 0.9597, + "step": 6721 + }, + { + "epoch": 0.4849752894917211, + "grad_norm": 3.2529098981967106, + "learning_rate": 2.194115613828841e-06, + "loss": 0.8702, + "step": 6722 + }, + { + "epoch": 0.4850474369611486, + "grad_norm": 8.656007618729769, + "learning_rate": 2.193650455245913e-06, + "loss": 0.9286, + "step": 6723 + }, + { + "epoch": 0.4851195844305761, + "grad_norm": 4.396756598011007, + "learning_rate": 2.193185286088451e-06, + "loss": 0.9613, + "step": 6724 + }, + { + "epoch": 0.4851917319000036, + "grad_norm": 2.202945025512645, + "learning_rate": 2.1927201063818552e-06, + "loss": 1.0193, + "step": 6725 + }, + { + "epoch": 0.4852638793694311, + "grad_norm": 3.21723724667669, + "learning_rate": 2.192254916151528e-06, + "loss": 0.8037, + "step": 6726 + }, + { + "epoch": 0.4853360268388586, + "grad_norm": 4.078816439303469, + "learning_rate": 2.1917897154228723e-06, + "loss": 0.9942, + "step": 6727 + }, + { + "epoch": 0.48540817430828614, + "grad_norm": 6.588360215252938, + "learning_rate": 2.1913245042212895e-06, + "loss": 0.885, + "step": 6728 + }, + { + "epoch": 0.48548032177771366, + "grad_norm": 2.687737210472972, + "learning_rate": 2.190859282572184e-06, + "loss": 1.0828, + "step": 6729 + }, + { + "epoch": 0.4855524692471412, + "grad_norm": 3.294152025736601, + "learning_rate": 2.1903940505009595e-06, + "loss": 0.8579, + "step": 6730 + }, + { + "epoch": 0.48562461671656865, + "grad_norm": 4.083585744309519, + "learning_rate": 2.1899288080330206e-06, + "loss": 0.8949, + "step": 6731 + }, + { + "epoch": 0.48569676418599617, + "grad_norm": 1.989245887087628, + "learning_rate": 2.1894635551937728e-06, + "loss": 1.0735, + "step": 6732 + }, + { + "epoch": 0.4857689116554237, + "grad_norm": 15.835130502532227, + "learning_rate": 2.1889982920086214e-06, + "loss": 0.9084, + "step": 6733 + }, + { + "epoch": 0.4858410591248512, + "grad_norm": 2.593169181780177, + "learning_rate": 2.1885330185029733e-06, + "loss": 0.9905, + "step": 6734 + }, + { + "epoch": 0.4859132065942787, + "grad_norm": 2.472927566757111, + "learning_rate": 2.1880677347022343e-06, + "loss": 0.9228, + "step": 6735 + }, + { + "epoch": 0.48598535406370624, + "grad_norm": 3.717421735235993, + "learning_rate": 2.1876024406318133e-06, + "loss": 0.9815, + "step": 6736 + }, + { + "epoch": 0.4860575015331337, + "grad_norm": 2.438580493598886, + "learning_rate": 2.1871371363171175e-06, + "loss": 1.0154, + "step": 6737 + }, + { + "epoch": 0.4861296490025612, + "grad_norm": 3.371110723813003, + "learning_rate": 2.1866718217835545e-06, + "loss": 0.9003, + "step": 6738 + }, + { + "epoch": 0.48620179647198875, + "grad_norm": 2.4290282897621016, + "learning_rate": 2.1862064970565353e-06, + "loss": 0.9492, + "step": 6739 + }, + { + "epoch": 0.48627394394141626, + "grad_norm": 2.5484749852644812, + "learning_rate": 2.185741162161468e-06, + "loss": 0.8764, + "step": 6740 + }, + { + "epoch": 0.4863460914108438, + "grad_norm": 3.8732907617125862, + "learning_rate": 2.1852758171237637e-06, + "loss": 0.9462, + "step": 6741 + }, + { + "epoch": 0.48641823888027125, + "grad_norm": 6.935305686544776, + "learning_rate": 2.184810461968833e-06, + "loss": 0.8635, + "step": 6742 + }, + { + "epoch": 0.48649038634969877, + "grad_norm": 5.41912895382312, + "learning_rate": 2.1843450967220872e-06, + "loss": 0.9602, + "step": 6743 + }, + { + "epoch": 0.4865625338191263, + "grad_norm": 3.162186582210621, + "learning_rate": 2.1838797214089373e-06, + "loss": 1.0378, + "step": 6744 + }, + { + "epoch": 0.4866346812885538, + "grad_norm": 2.0316486847696913, + "learning_rate": 2.183414336054797e-06, + "loss": 0.88, + "step": 6745 + }, + { + "epoch": 0.4867068287579813, + "grad_norm": 2.486101426354931, + "learning_rate": 2.1829489406850798e-06, + "loss": 0.9466, + "step": 6746 + }, + { + "epoch": 0.48677897622740884, + "grad_norm": 4.496062781577752, + "learning_rate": 2.1824835353251967e-06, + "loss": 0.8199, + "step": 6747 + }, + { + "epoch": 0.4868511236968363, + "grad_norm": 2.342174560022523, + "learning_rate": 2.182018120000564e-06, + "loss": 0.9312, + "step": 6748 + }, + { + "epoch": 0.4869232711662638, + "grad_norm": 4.239600754060812, + "learning_rate": 2.1815526947365957e-06, + "loss": 0.8923, + "step": 6749 + }, + { + "epoch": 0.48699541863569135, + "grad_norm": 2.6681175358002824, + "learning_rate": 2.1810872595587057e-06, + "loss": 0.8286, + "step": 6750 + }, + { + "epoch": 0.48706756610511887, + "grad_norm": 3.2512971783551223, + "learning_rate": 2.180621814492312e-06, + "loss": 0.9315, + "step": 6751 + }, + { + "epoch": 0.4871397135745464, + "grad_norm": 2.813009936410419, + "learning_rate": 2.1801563595628293e-06, + "loss": 0.9234, + "step": 6752 + }, + { + "epoch": 0.4872118610439739, + "grad_norm": 1.9118901595642908, + "learning_rate": 2.1796908947956747e-06, + "loss": 0.9873, + "step": 6753 + }, + { + "epoch": 0.48728400851340137, + "grad_norm": 2.563606023516686, + "learning_rate": 2.179225420216266e-06, + "loss": 0.928, + "step": 6754 + }, + { + "epoch": 0.4873561559828289, + "grad_norm": 2.191785592549991, + "learning_rate": 2.17875993585002e-06, + "loss": 1.024, + "step": 6755 + }, + { + "epoch": 0.4874283034522564, + "grad_norm": 4.055520032846146, + "learning_rate": 2.1782944417223556e-06, + "loss": 0.9459, + "step": 6756 + }, + { + "epoch": 0.4875004509216839, + "grad_norm": 3.17789838164855, + "learning_rate": 2.177828937858692e-06, + "loss": 0.9858, + "step": 6757 + }, + { + "epoch": 0.48757259839111144, + "grad_norm": 10.245082885347328, + "learning_rate": 2.1773634242844485e-06, + "loss": 0.9128, + "step": 6758 + }, + { + "epoch": 0.48764474586053896, + "grad_norm": 5.173882501070902, + "learning_rate": 2.176897901025045e-06, + "loss": 1.033, + "step": 6759 + }, + { + "epoch": 0.4877168933299664, + "grad_norm": 12.045729922881385, + "learning_rate": 2.1764323681059013e-06, + "loss": 0.943, + "step": 6760 + }, + { + "epoch": 0.48778904079939395, + "grad_norm": 1.757123006593358, + "learning_rate": 2.1759668255524402e-06, + "loss": 0.9512, + "step": 6761 + }, + { + "epoch": 0.48786118826882147, + "grad_norm": 2.6962290214532425, + "learning_rate": 2.1755012733900807e-06, + "loss": 0.9047, + "step": 6762 + }, + { + "epoch": 0.487933335738249, + "grad_norm": 1.8017689939736041, + "learning_rate": 2.1750357116442477e-06, + "loss": 0.9166, + "step": 6763 + }, + { + "epoch": 0.4880054832076765, + "grad_norm": 3.982042534961692, + "learning_rate": 2.1745701403403624e-06, + "loss": 0.9336, + "step": 6764 + }, + { + "epoch": 0.488077630677104, + "grad_norm": 3.4127342730581676, + "learning_rate": 2.174104559503847e-06, + "loss": 0.9641, + "step": 6765 + }, + { + "epoch": 0.4881497781465315, + "grad_norm": 5.263967009641005, + "learning_rate": 2.173638969160127e-06, + "loss": 0.9113, + "step": 6766 + }, + { + "epoch": 0.488221925615959, + "grad_norm": 2.797526363626979, + "learning_rate": 2.173173369334625e-06, + "loss": 0.9137, + "step": 6767 + }, + { + "epoch": 0.4882940730853865, + "grad_norm": 3.336111738670926, + "learning_rate": 2.172707760052767e-06, + "loss": 0.7988, + "step": 6768 + }, + { + "epoch": 0.48836622055481405, + "grad_norm": 5.143790247448005, + "learning_rate": 2.172242141339977e-06, + "loss": 0.8892, + "step": 6769 + }, + { + "epoch": 0.48843836802424156, + "grad_norm": 2.9591234432563303, + "learning_rate": 2.171776513221682e-06, + "loss": 0.9361, + "step": 6770 + }, + { + "epoch": 0.4885105154936691, + "grad_norm": 6.0104584144275135, + "learning_rate": 2.171310875723307e-06, + "loss": 0.891, + "step": 6771 + }, + { + "epoch": 0.48858266296309655, + "grad_norm": 2.422846543884615, + "learning_rate": 2.170845228870279e-06, + "loss": 0.9468, + "step": 6772 + }, + { + "epoch": 0.48865481043252407, + "grad_norm": 3.0673742625139515, + "learning_rate": 2.1703795726880262e-06, + "loss": 0.9257, + "step": 6773 + }, + { + "epoch": 0.4887269579019516, + "grad_norm": 3.4531708451071528, + "learning_rate": 2.1699139072019745e-06, + "loss": 1.0186, + "step": 6774 + }, + { + "epoch": 0.4887991053713791, + "grad_norm": 2.6074540611309347, + "learning_rate": 2.1694482324375543e-06, + "loss": 1.0261, + "step": 6775 + }, + { + "epoch": 0.4888712528408066, + "grad_norm": 4.005737719462425, + "learning_rate": 2.1689825484201937e-06, + "loss": 0.8767, + "step": 6776 + }, + { + "epoch": 0.48894340031023414, + "grad_norm": 3.3817561308443604, + "learning_rate": 2.168516855175321e-06, + "loss": 1.0465, + "step": 6777 + }, + { + "epoch": 0.4890155477796616, + "grad_norm": 2.5992147947280735, + "learning_rate": 2.168051152728367e-06, + "loss": 0.9374, + "step": 6778 + }, + { + "epoch": 0.4890876952490891, + "grad_norm": 3.801539691397231, + "learning_rate": 2.1675854411047613e-06, + "loss": 0.8835, + "step": 6779 + }, + { + "epoch": 0.48915984271851665, + "grad_norm": 3.0451627174836076, + "learning_rate": 2.1671197203299354e-06, + "loss": 0.9135, + "step": 6780 + }, + { + "epoch": 0.48923199018794417, + "grad_norm": 2.1606385173821674, + "learning_rate": 2.1666539904293205e-06, + "loss": 1.0045, + "step": 6781 + }, + { + "epoch": 0.4893041376573717, + "grad_norm": 3.592451574202258, + "learning_rate": 2.1661882514283475e-06, + "loss": 0.9602, + "step": 6782 + }, + { + "epoch": 0.4893762851267992, + "grad_norm": 2.806180253893197, + "learning_rate": 2.165722503352449e-06, + "loss": 0.9048, + "step": 6783 + }, + { + "epoch": 0.48944843259622667, + "grad_norm": 2.9979800735635944, + "learning_rate": 2.1652567462270585e-06, + "loss": 0.9437, + "step": 6784 + }, + { + "epoch": 0.4895205800656542, + "grad_norm": 2.3252973345885004, + "learning_rate": 2.1647909800776093e-06, + "loss": 0.8307, + "step": 6785 + }, + { + "epoch": 0.4895927275350817, + "grad_norm": 2.29349824942669, + "learning_rate": 2.1643252049295345e-06, + "loss": 0.8988, + "step": 6786 + }, + { + "epoch": 0.4896648750045092, + "grad_norm": 3.146907766071192, + "learning_rate": 2.163859420808268e-06, + "loss": 0.803, + "step": 6787 + }, + { + "epoch": 0.48973702247393675, + "grad_norm": 1.9575038194680008, + "learning_rate": 2.163393627739246e-06, + "loss": 0.8393, + "step": 6788 + }, + { + "epoch": 0.4898091699433642, + "grad_norm": 3.1344347627671976, + "learning_rate": 2.162927825747902e-06, + "loss": 0.8999, + "step": 6789 + }, + { + "epoch": 0.48988131741279173, + "grad_norm": 3.973345041343115, + "learning_rate": 2.162462014859673e-06, + "loss": 0.944, + "step": 6790 + }, + { + "epoch": 0.48995346488221925, + "grad_norm": 4.510435085244341, + "learning_rate": 2.1619961950999944e-06, + "loss": 0.9544, + "step": 6791 + }, + { + "epoch": 0.49002561235164677, + "grad_norm": 0.9997482280882434, + "learning_rate": 2.1615303664943037e-06, + "loss": 0.8625, + "step": 6792 + }, + { + "epoch": 0.4900977598210743, + "grad_norm": 4.320204335077934, + "learning_rate": 2.161064529068038e-06, + "loss": 0.9608, + "step": 6793 + }, + { + "epoch": 0.4901699072905018, + "grad_norm": 2.3002105160444204, + "learning_rate": 2.160598682846634e-06, + "loss": 0.9092, + "step": 6794 + }, + { + "epoch": 0.49024205475992927, + "grad_norm": 5.99903385013035, + "learning_rate": 2.16013282785553e-06, + "loss": 0.9398, + "step": 6795 + }, + { + "epoch": 0.4903142022293568, + "grad_norm": 5.9995679699805065, + "learning_rate": 2.159666964120166e-06, + "loss": 0.9373, + "step": 6796 + }, + { + "epoch": 0.4903863496987843, + "grad_norm": 4.018217088834057, + "learning_rate": 2.15920109166598e-06, + "loss": 1.04, + "step": 6797 + }, + { + "epoch": 0.4904584971682118, + "grad_norm": 2.9972978184260084, + "learning_rate": 2.1587352105184115e-06, + "loss": 0.8885, + "step": 6798 + }, + { + "epoch": 0.49053064463763935, + "grad_norm": 3.1987128410849386, + "learning_rate": 2.158269320702901e-06, + "loss": 0.9346, + "step": 6799 + }, + { + "epoch": 0.49060279210706687, + "grad_norm": 5.46938281076471, + "learning_rate": 2.1578034222448885e-06, + "loss": 0.8404, + "step": 6800 + }, + { + "epoch": 0.49067493957649433, + "grad_norm": 4.807114200753599, + "learning_rate": 2.1573375151698148e-06, + "loss": 1.008, + "step": 6801 + }, + { + "epoch": 0.49074708704592185, + "grad_norm": 3.685545969337793, + "learning_rate": 2.156871599503123e-06, + "loss": 0.9365, + "step": 6802 + }, + { + "epoch": 0.49081923451534937, + "grad_norm": 4.328657537536379, + "learning_rate": 2.1564056752702532e-06, + "loss": 0.8047, + "step": 6803 + }, + { + "epoch": 0.4908913819847769, + "grad_norm": 2.567948109004454, + "learning_rate": 2.155939742496648e-06, + "loss": 1.0031, + "step": 6804 + }, + { + "epoch": 0.4909635294542044, + "grad_norm": 2.472033767327595, + "learning_rate": 2.155473801207752e-06, + "loss": 0.9305, + "step": 6805 + }, + { + "epoch": 0.4910356769236319, + "grad_norm": 2.465808903129545, + "learning_rate": 2.155007851429006e-06, + "loss": 0.8484, + "step": 6806 + }, + { + "epoch": 0.4911078243930594, + "grad_norm": 5.954620734115499, + "learning_rate": 2.154541893185856e-06, + "loss": 0.8829, + "step": 6807 + }, + { + "epoch": 0.4911799718624869, + "grad_norm": 3.18724298375954, + "learning_rate": 2.154075926503745e-06, + "loss": 0.8402, + "step": 6808 + }, + { + "epoch": 0.4912521193319144, + "grad_norm": 2.9855980089743173, + "learning_rate": 2.153609951408118e-06, + "loss": 0.8395, + "step": 6809 + }, + { + "epoch": 0.49132426680134195, + "grad_norm": 3.2113862478204687, + "learning_rate": 2.15314396792442e-06, + "loss": 1.0279, + "step": 6810 + }, + { + "epoch": 0.49139641427076947, + "grad_norm": 2.3496515867987116, + "learning_rate": 2.1526779760780967e-06, + "loss": 1.0048, + "step": 6811 + }, + { + "epoch": 0.491468561740197, + "grad_norm": 2.4948997925413754, + "learning_rate": 2.152211975894595e-06, + "loss": 0.8962, + "step": 6812 + }, + { + "epoch": 0.49154070920962445, + "grad_norm": 2.8080313043778493, + "learning_rate": 2.1517459673993596e-06, + "loss": 0.9855, + "step": 6813 + }, + { + "epoch": 0.49161285667905197, + "grad_norm": 2.2792477116189915, + "learning_rate": 2.15127995061784e-06, + "loss": 0.8697, + "step": 6814 + }, + { + "epoch": 0.4916850041484795, + "grad_norm": 2.3620598277009566, + "learning_rate": 2.150813925575482e-06, + "loss": 0.9548, + "step": 6815 + }, + { + "epoch": 0.491757151617907, + "grad_norm": 28.912420160733365, + "learning_rate": 2.150347892297733e-06, + "loss": 0.9561, + "step": 6816 + }, + { + "epoch": 0.4918292990873345, + "grad_norm": 2.810312734557424, + "learning_rate": 2.149881850810043e-06, + "loss": 0.9735, + "step": 6817 + }, + { + "epoch": 0.49190144655676205, + "grad_norm": 2.9810171360264275, + "learning_rate": 2.14941580113786e-06, + "loss": 0.8384, + "step": 6818 + }, + { + "epoch": 0.4919735940261895, + "grad_norm": 2.199114564965302, + "learning_rate": 2.1489497433066327e-06, + "loss": 0.9152, + "step": 6819 + }, + { + "epoch": 0.49204574149561703, + "grad_norm": 2.097001355339489, + "learning_rate": 2.1484836773418115e-06, + "loss": 0.8751, + "step": 6820 + }, + { + "epoch": 0.49211788896504455, + "grad_norm": 2.402110316402635, + "learning_rate": 2.148017603268846e-06, + "loss": 0.897, + "step": 6821 + }, + { + "epoch": 0.49219003643447207, + "grad_norm": 2.6226649116325356, + "learning_rate": 2.1475515211131875e-06, + "loss": 0.9212, + "step": 6822 + }, + { + "epoch": 0.4922621839038996, + "grad_norm": 4.242252243981079, + "learning_rate": 2.147085430900287e-06, + "loss": 0.9405, + "step": 6823 + }, + { + "epoch": 0.4923343313733271, + "grad_norm": 3.3703074858888584, + "learning_rate": 2.146619332655595e-06, + "loss": 1.032, + "step": 6824 + }, + { + "epoch": 0.49240647884275457, + "grad_norm": 4.820467410079208, + "learning_rate": 2.1461532264045648e-06, + "loss": 0.9279, + "step": 6825 + }, + { + "epoch": 0.4924786263121821, + "grad_norm": 5.264147270971673, + "learning_rate": 2.1456871121726472e-06, + "loss": 0.8764, + "step": 6826 + }, + { + "epoch": 0.4925507737816096, + "grad_norm": 4.702117913484779, + "learning_rate": 2.145220989985296e-06, + "loss": 1.0068, + "step": 6827 + }, + { + "epoch": 0.4926229212510371, + "grad_norm": 2.285805738697196, + "learning_rate": 2.1447548598679636e-06, + "loss": 0.8988, + "step": 6828 + }, + { + "epoch": 0.49269506872046465, + "grad_norm": 3.6521024736482057, + "learning_rate": 2.144288721846105e-06, + "loss": 1.0487, + "step": 6829 + }, + { + "epoch": 0.49276721618989217, + "grad_norm": 3.317695430009903, + "learning_rate": 2.1438225759451735e-06, + "loss": 0.9774, + "step": 6830 + }, + { + "epoch": 0.49283936365931963, + "grad_norm": 3.12777586673959, + "learning_rate": 2.143356422190622e-06, + "loss": 0.9509, + "step": 6831 + }, + { + "epoch": 0.49291151112874715, + "grad_norm": 2.9398531315248597, + "learning_rate": 2.142890260607909e-06, + "loss": 1.0145, + "step": 6832 + }, + { + "epoch": 0.49298365859817467, + "grad_norm": 2.549516676931082, + "learning_rate": 2.1424240912224866e-06, + "loss": 0.8668, + "step": 6833 + }, + { + "epoch": 0.4930558060676022, + "grad_norm": 3.018974854997945, + "learning_rate": 2.141957914059812e-06, + "loss": 1.017, + "step": 6834 + }, + { + "epoch": 0.4931279535370297, + "grad_norm": 0.7381733810499554, + "learning_rate": 2.1414917291453412e-06, + "loss": 0.7908, + "step": 6835 + }, + { + "epoch": 0.4932001010064572, + "grad_norm": 3.839718076370433, + "learning_rate": 2.141025536504531e-06, + "loss": 0.8776, + "step": 6836 + }, + { + "epoch": 0.4932722484758847, + "grad_norm": 4.1456165097277795, + "learning_rate": 2.140559336162839e-06, + "loss": 0.7723, + "step": 6837 + }, + { + "epoch": 0.4933443959453122, + "grad_norm": 2.503674286617906, + "learning_rate": 2.1400931281457207e-06, + "loss": 0.8732, + "step": 6838 + }, + { + "epoch": 0.49341654341473973, + "grad_norm": 3.3544845114540998, + "learning_rate": 2.1396269124786363e-06, + "loss": 0.9771, + "step": 6839 + }, + { + "epoch": 0.49348869088416725, + "grad_norm": 2.6546754546001647, + "learning_rate": 2.139160689187042e-06, + "loss": 0.8795, + "step": 6840 + }, + { + "epoch": 0.49356083835359477, + "grad_norm": 3.567433337823149, + "learning_rate": 2.138694458296399e-06, + "loss": 1.0013, + "step": 6841 + }, + { + "epoch": 0.49363298582302223, + "grad_norm": 4.041980744313652, + "learning_rate": 2.1382282198321642e-06, + "loss": 1.0197, + "step": 6842 + }, + { + "epoch": 0.49370513329244975, + "grad_norm": 5.95060969202208, + "learning_rate": 2.137761973819798e-06, + "loss": 0.9743, + "step": 6843 + }, + { + "epoch": 0.49377728076187727, + "grad_norm": 3.3364589024611857, + "learning_rate": 2.1372957202847606e-06, + "loss": 0.9731, + "step": 6844 + }, + { + "epoch": 0.4938494282313048, + "grad_norm": 2.8716241670970217, + "learning_rate": 2.1368294592525122e-06, + "loss": 0.785, + "step": 6845 + }, + { + "epoch": 0.4939215757007323, + "grad_norm": 3.5165648157879437, + "learning_rate": 2.136363190748513e-06, + "loss": 0.9284, + "step": 6846 + }, + { + "epoch": 0.4939937231701598, + "grad_norm": 2.4844796980383235, + "learning_rate": 2.135896914798226e-06, + "loss": 0.9233, + "step": 6847 + }, + { + "epoch": 0.4940658706395873, + "grad_norm": 3.066711021876567, + "learning_rate": 2.1354306314271106e-06, + "loss": 0.8773, + "step": 6848 + }, + { + "epoch": 0.4941380181090148, + "grad_norm": 2.490957692133494, + "learning_rate": 2.13496434066063e-06, + "loss": 0.9662, + "step": 6849 + }, + { + "epoch": 0.49421016557844233, + "grad_norm": 2.849456260076364, + "learning_rate": 2.134498042524246e-06, + "loss": 0.9195, + "step": 6850 + }, + { + "epoch": 0.49428231304786985, + "grad_norm": 2.42919014013693, + "learning_rate": 2.1340317370434224e-06, + "loss": 0.9554, + "step": 6851 + }, + { + "epoch": 0.49435446051729737, + "grad_norm": 2.3918984206712395, + "learning_rate": 2.1335654242436216e-06, + "loss": 0.91, + "step": 6852 + }, + { + "epoch": 0.4944266079867249, + "grad_norm": 3.5882311032821623, + "learning_rate": 2.133099104150307e-06, + "loss": 0.8337, + "step": 6853 + }, + { + "epoch": 0.49449875545615235, + "grad_norm": 3.9919194618617038, + "learning_rate": 2.132632776788944e-06, + "loss": 0.8501, + "step": 6854 + }, + { + "epoch": 0.49457090292557987, + "grad_norm": 2.892417722964415, + "learning_rate": 2.1321664421849947e-06, + "loss": 1.0337, + "step": 6855 + }, + { + "epoch": 0.4946430503950074, + "grad_norm": 5.70512493026846, + "learning_rate": 2.1317001003639262e-06, + "loss": 0.9353, + "step": 6856 + }, + { + "epoch": 0.4947151978644349, + "grad_norm": 5.296346722481103, + "learning_rate": 2.1312337513512025e-06, + "loss": 1.0232, + "step": 6857 + }, + { + "epoch": 0.4947873453338624, + "grad_norm": 2.0727384353258023, + "learning_rate": 2.130767395172289e-06, + "loss": 0.9088, + "step": 6858 + }, + { + "epoch": 0.49485949280328995, + "grad_norm": 4.838702124928148, + "learning_rate": 2.1303010318526532e-06, + "loss": 1.0308, + "step": 6859 + }, + { + "epoch": 0.4949316402727174, + "grad_norm": 3.0957397843846746, + "learning_rate": 2.12983466141776e-06, + "loss": 1.0061, + "step": 6860 + }, + { + "epoch": 0.49500378774214493, + "grad_norm": 2.158423392676521, + "learning_rate": 2.129368283893076e-06, + "loss": 0.9986, + "step": 6861 + }, + { + "epoch": 0.49507593521157245, + "grad_norm": 1.8937760883208772, + "learning_rate": 2.1289018993040694e-06, + "loss": 0.8577, + "step": 6862 + }, + { + "epoch": 0.49514808268099997, + "grad_norm": 3.8165149497538113, + "learning_rate": 2.128435507676207e-06, + "loss": 0.9866, + "step": 6863 + }, + { + "epoch": 0.4952202301504275, + "grad_norm": 11.059040110428324, + "learning_rate": 2.127969109034958e-06, + "loss": 0.8239, + "step": 6864 + }, + { + "epoch": 0.495292377619855, + "grad_norm": 2.635298735099642, + "learning_rate": 2.127502703405788e-06, + "loss": 0.8974, + "step": 6865 + }, + { + "epoch": 0.49536452508928247, + "grad_norm": 3.242472367238791, + "learning_rate": 2.1270362908141688e-06, + "loss": 0.9582, + "step": 6866 + }, + { + "epoch": 0.49543667255871, + "grad_norm": 9.016643393927794, + "learning_rate": 2.126569871285567e-06, + "loss": 0.9228, + "step": 6867 + }, + { + "epoch": 0.4955088200281375, + "grad_norm": 3.5144443818403666, + "learning_rate": 2.1261034448454537e-06, + "loss": 0.9266, + "step": 6868 + }, + { + "epoch": 0.49558096749756503, + "grad_norm": 4.224072524979781, + "learning_rate": 2.1256370115192982e-06, + "loss": 0.9789, + "step": 6869 + }, + { + "epoch": 0.49565311496699255, + "grad_norm": 3.0212522194364553, + "learning_rate": 2.1251705713325698e-06, + "loss": 0.9165, + "step": 6870 + }, + { + "epoch": 0.49572526243642007, + "grad_norm": 2.5064871545017686, + "learning_rate": 2.1247041243107407e-06, + "loss": 0.9797, + "step": 6871 + }, + { + "epoch": 0.49579740990584753, + "grad_norm": 2.899625056791085, + "learning_rate": 2.124237670479281e-06, + "loss": 0.8608, + "step": 6872 + }, + { + "epoch": 0.49586955737527505, + "grad_norm": 2.6445893995464784, + "learning_rate": 2.123771209863661e-06, + "loss": 1.0366, + "step": 6873 + }, + { + "epoch": 0.49594170484470257, + "grad_norm": 2.203445627676062, + "learning_rate": 2.1233047424893543e-06, + "loss": 0.8944, + "step": 6874 + }, + { + "epoch": 0.4960138523141301, + "grad_norm": 6.523945351208626, + "learning_rate": 2.122838268381832e-06, + "loss": 0.9086, + "step": 6875 + }, + { + "epoch": 0.4960859997835576, + "grad_norm": 3.7029133023943226, + "learning_rate": 2.122371787566567e-06, + "loss": 0.9959, + "step": 6876 + }, + { + "epoch": 0.4961581472529851, + "grad_norm": 1.714431400296148, + "learning_rate": 2.1219053000690308e-06, + "loss": 1.0192, + "step": 6877 + }, + { + "epoch": 0.4962302947224126, + "grad_norm": 3.722229683965033, + "learning_rate": 2.121438805914698e-06, + "loss": 0.8751, + "step": 6878 + }, + { + "epoch": 0.4963024421918401, + "grad_norm": 2.878511026445811, + "learning_rate": 2.1209723051290407e-06, + "loss": 0.987, + "step": 6879 + }, + { + "epoch": 0.49637458966126763, + "grad_norm": 39.556225126888215, + "learning_rate": 2.1205057977375345e-06, + "loss": 0.8856, + "step": 6880 + }, + { + "epoch": 0.49644673713069515, + "grad_norm": 5.769271731108966, + "learning_rate": 2.1200392837656533e-06, + "loss": 0.9773, + "step": 6881 + }, + { + "epoch": 0.49651888460012267, + "grad_norm": 6.286122420300754, + "learning_rate": 2.1195727632388696e-06, + "loss": 0.8853, + "step": 6882 + }, + { + "epoch": 0.4965910320695502, + "grad_norm": 13.543773749330239, + "learning_rate": 2.1191062361826613e-06, + "loss": 0.8438, + "step": 6883 + }, + { + "epoch": 0.49666317953897765, + "grad_norm": 2.060613405138924, + "learning_rate": 2.1186397026225016e-06, + "loss": 0.974, + "step": 6884 + }, + { + "epoch": 0.49673532700840517, + "grad_norm": 3.2870835537273115, + "learning_rate": 2.118173162583867e-06, + "loss": 0.9257, + "step": 6885 + }, + { + "epoch": 0.4968074744778327, + "grad_norm": 14.515534596731401, + "learning_rate": 2.117706616092234e-06, + "loss": 0.924, + "step": 6886 + }, + { + "epoch": 0.4968796219472602, + "grad_norm": 2.9791327165567645, + "learning_rate": 2.1172400631730785e-06, + "loss": 0.8619, + "step": 6887 + }, + { + "epoch": 0.4969517694166877, + "grad_norm": 0.7730133357779132, + "learning_rate": 2.1167735038518766e-06, + "loss": 0.7996, + "step": 6888 + }, + { + "epoch": 0.49702391688611525, + "grad_norm": 5.210012465332114, + "learning_rate": 2.1163069381541057e-06, + "loss": 1.0032, + "step": 6889 + }, + { + "epoch": 0.4970960643555427, + "grad_norm": 5.164586581646799, + "learning_rate": 2.115840366105244e-06, + "loss": 0.9623, + "step": 6890 + }, + { + "epoch": 0.49716821182497023, + "grad_norm": 3.4158973913304456, + "learning_rate": 2.115373787730769e-06, + "loss": 0.9374, + "step": 6891 + }, + { + "epoch": 0.49724035929439775, + "grad_norm": 3.1352858067238065, + "learning_rate": 2.1149072030561577e-06, + "loss": 0.8235, + "step": 6892 + }, + { + "epoch": 0.49731250676382527, + "grad_norm": 0.7455705335985144, + "learning_rate": 2.1144406121068904e-06, + "loss": 0.7898, + "step": 6893 + }, + { + "epoch": 0.4973846542332528, + "grad_norm": 3.046589686411052, + "learning_rate": 2.113974014908444e-06, + "loss": 0.9666, + "step": 6894 + }, + { + "epoch": 0.49745680170268025, + "grad_norm": 2.583494940184136, + "learning_rate": 2.113507411486299e-06, + "loss": 0.9286, + "step": 6895 + }, + { + "epoch": 0.49752894917210777, + "grad_norm": 2.592033829597773, + "learning_rate": 2.1130408018659345e-06, + "loss": 0.9326, + "step": 6896 + }, + { + "epoch": 0.4976010966415353, + "grad_norm": 2.6053977396151624, + "learning_rate": 2.11257418607283e-06, + "loss": 0.9417, + "step": 6897 + }, + { + "epoch": 0.4976732441109628, + "grad_norm": 6.8283124326212725, + "learning_rate": 2.112107564132466e-06, + "loss": 1.0662, + "step": 6898 + }, + { + "epoch": 0.49774539158039033, + "grad_norm": 7.3757310925202475, + "learning_rate": 2.1116409360703233e-06, + "loss": 1.0288, + "step": 6899 + }, + { + "epoch": 0.49781753904981785, + "grad_norm": 7.988429523761446, + "learning_rate": 2.1111743019118823e-06, + "loss": 0.8664, + "step": 6900 + }, + { + "epoch": 0.4978896865192453, + "grad_norm": 0.9193558622088993, + "learning_rate": 2.1107076616826236e-06, + "loss": 0.8726, + "step": 6901 + }, + { + "epoch": 0.49796183398867283, + "grad_norm": 2.6426126797849334, + "learning_rate": 2.1102410154080297e-06, + "loss": 0.8796, + "step": 6902 + }, + { + "epoch": 0.49803398145810035, + "grad_norm": 4.014155374999053, + "learning_rate": 2.1097743631135824e-06, + "loss": 0.9615, + "step": 6903 + }, + { + "epoch": 0.49810612892752787, + "grad_norm": 3.5176017120773593, + "learning_rate": 2.1093077048247626e-06, + "loss": 0.9271, + "step": 6904 + }, + { + "epoch": 0.4981782763969554, + "grad_norm": 0.9732309504990778, + "learning_rate": 2.108841040567055e-06, + "loss": 0.7917, + "step": 6905 + }, + { + "epoch": 0.4982504238663829, + "grad_norm": 0.8064432710939098, + "learning_rate": 2.1083743703659394e-06, + "loss": 0.859, + "step": 6906 + }, + { + "epoch": 0.49832257133581037, + "grad_norm": 2.6693711176696424, + "learning_rate": 2.107907694246902e-06, + "loss": 0.9251, + "step": 6907 + }, + { + "epoch": 0.4983947188052379, + "grad_norm": 3.641832798035244, + "learning_rate": 2.107441012235425e-06, + "loss": 0.8983, + "step": 6908 + }, + { + "epoch": 0.4984668662746654, + "grad_norm": 7.910198688157764, + "learning_rate": 2.1069743243569904e-06, + "loss": 0.9329, + "step": 6909 + }, + { + "epoch": 0.49853901374409293, + "grad_norm": 2.6107585231543617, + "learning_rate": 2.1065076306370857e-06, + "loss": 1.0078, + "step": 6910 + }, + { + "epoch": 0.49861116121352045, + "grad_norm": 3.1411478546815816, + "learning_rate": 2.106040931101193e-06, + "loss": 1.0025, + "step": 6911 + }, + { + "epoch": 0.49868330868294797, + "grad_norm": 6.155520323290262, + "learning_rate": 2.1055742257747978e-06, + "loss": 0.9024, + "step": 6912 + }, + { + "epoch": 0.49875545615237543, + "grad_norm": 2.566964609845064, + "learning_rate": 2.1051075146833844e-06, + "loss": 0.9686, + "step": 6913 + }, + { + "epoch": 0.49882760362180295, + "grad_norm": 3.487557364918345, + "learning_rate": 2.104640797852439e-06, + "loss": 0.9578, + "step": 6914 + }, + { + "epoch": 0.49889975109123047, + "grad_norm": 3.574485560396681, + "learning_rate": 2.1041740753074473e-06, + "loss": 0.8575, + "step": 6915 + }, + { + "epoch": 0.498971898560658, + "grad_norm": 13.819860370112792, + "learning_rate": 2.1037073470738942e-06, + "loss": 0.8756, + "step": 6916 + }, + { + "epoch": 0.4990440460300855, + "grad_norm": 3.6087431912016052, + "learning_rate": 2.1032406131772674e-06, + "loss": 0.8543, + "step": 6917 + }, + { + "epoch": 0.49911619349951303, + "grad_norm": 4.145106240672241, + "learning_rate": 2.1027738736430534e-06, + "loss": 0.9111, + "step": 6918 + }, + { + "epoch": 0.4991883409689405, + "grad_norm": 3.232627359641267, + "learning_rate": 2.102307128496737e-06, + "loss": 0.8516, + "step": 6919 + }, + { + "epoch": 0.499260488438368, + "grad_norm": 3.1565154313378425, + "learning_rate": 2.1018403777638088e-06, + "loss": 1.0288, + "step": 6920 + }, + { + "epoch": 0.49933263590779553, + "grad_norm": 5.655149911365704, + "learning_rate": 2.101373621469753e-06, + "loss": 0.8718, + "step": 6921 + }, + { + "epoch": 0.49940478337722305, + "grad_norm": 3.2359991004197575, + "learning_rate": 2.10090685964006e-06, + "loss": 0.9292, + "step": 6922 + }, + { + "epoch": 0.49947693084665057, + "grad_norm": 2.3867885747956703, + "learning_rate": 2.100440092300217e-06, + "loss": 0.8976, + "step": 6923 + }, + { + "epoch": 0.4995490783160781, + "grad_norm": 3.0444927586096444, + "learning_rate": 2.0999733194757124e-06, + "loss": 0.9865, + "step": 6924 + }, + { + "epoch": 0.49962122578550555, + "grad_norm": 11.120953456153199, + "learning_rate": 2.0995065411920345e-06, + "loss": 0.9208, + "step": 6925 + }, + { + "epoch": 0.49969337325493307, + "grad_norm": 2.87649696188411, + "learning_rate": 2.0990397574746724e-06, + "loss": 0.9422, + "step": 6926 + }, + { + "epoch": 0.4997655207243606, + "grad_norm": 7.346075980134442, + "learning_rate": 2.0985729683491172e-06, + "loss": 0.9419, + "step": 6927 + }, + { + "epoch": 0.4998376681937881, + "grad_norm": 8.557639865829959, + "learning_rate": 2.0981061738408552e-06, + "loss": 0.874, + "step": 6928 + }, + { + "epoch": 0.49990981566321563, + "grad_norm": 1.021068714538331, + "learning_rate": 2.0976393739753797e-06, + "loss": 0.7621, + "step": 6929 + }, + { + "epoch": 0.49998196313264315, + "grad_norm": 2.561788320571015, + "learning_rate": 2.0971725687781794e-06, + "loss": 0.9433, + "step": 6930 + }, + { + "epoch": 0.5000541106020706, + "grad_norm": 3.5556619167976224, + "learning_rate": 2.0967057582747433e-06, + "loss": 0.9116, + "step": 6931 + }, + { + "epoch": 0.5001262580714981, + "grad_norm": 7.27208817665013, + "learning_rate": 2.0962389424905654e-06, + "loss": 0.8187, + "step": 6932 + }, + { + "epoch": 0.5001984055409257, + "grad_norm": 10.805817584271482, + "learning_rate": 2.0957721214511343e-06, + "loss": 1.0052, + "step": 6933 + }, + { + "epoch": 0.5002705530103532, + "grad_norm": 2.7876155743780626, + "learning_rate": 2.0953052951819426e-06, + "loss": 0.8679, + "step": 6934 + }, + { + "epoch": 0.5003427004797807, + "grad_norm": 34.461375854443915, + "learning_rate": 2.0948384637084814e-06, + "loss": 0.8669, + "step": 6935 + }, + { + "epoch": 0.5004148479492082, + "grad_norm": 3.4368098172979953, + "learning_rate": 2.0943716270562425e-06, + "loss": 0.9424, + "step": 6936 + }, + { + "epoch": 0.5004869954186357, + "grad_norm": 7.544086075244195, + "learning_rate": 2.0939047852507187e-06, + "loss": 0.9546, + "step": 6937 + }, + { + "epoch": 0.5005591428880632, + "grad_norm": 0.9058554217872047, + "learning_rate": 2.093437938317402e-06, + "loss": 0.7982, + "step": 6938 + }, + { + "epoch": 0.5006312903574908, + "grad_norm": 2.3867006691405455, + "learning_rate": 2.0929710862817855e-06, + "loss": 0.8999, + "step": 6939 + }, + { + "epoch": 0.5007034378269182, + "grad_norm": 4.374028125171913, + "learning_rate": 2.092504229169362e-06, + "loss": 0.9646, + "step": 6940 + }, + { + "epoch": 0.5007755852963457, + "grad_norm": 4.364481461215407, + "learning_rate": 2.0920373670056253e-06, + "loss": 1.0621, + "step": 6941 + }, + { + "epoch": 0.5008477327657732, + "grad_norm": 4.057233712888743, + "learning_rate": 2.0915704998160686e-06, + "loss": 0.8895, + "step": 6942 + }, + { + "epoch": 0.5009198802352007, + "grad_norm": 2.2046635443280147, + "learning_rate": 2.0911036276261855e-06, + "loss": 0.9786, + "step": 6943 + }, + { + "epoch": 0.5009920277046283, + "grad_norm": 4.298431891240434, + "learning_rate": 2.0906367504614712e-06, + "loss": 0.9338, + "step": 6944 + }, + { + "epoch": 0.5010641751740558, + "grad_norm": 6.157241973736661, + "learning_rate": 2.090169868347418e-06, + "loss": 0.8236, + "step": 6945 + }, + { + "epoch": 0.5011363226434833, + "grad_norm": 3.833371424831266, + "learning_rate": 2.0897029813095243e-06, + "loss": 0.9632, + "step": 6946 + }, + { + "epoch": 0.5012084701129108, + "grad_norm": 4.529612067582398, + "learning_rate": 2.0892360893732817e-06, + "loss": 0.9691, + "step": 6947 + }, + { + "epoch": 0.5012806175823383, + "grad_norm": 2.2332706523278976, + "learning_rate": 2.088769192564187e-06, + "loss": 0.9046, + "step": 6948 + }, + { + "epoch": 0.5013527650517658, + "grad_norm": 3.1356036530730473, + "learning_rate": 2.0883022909077343e-06, + "loss": 1.0126, + "step": 6949 + }, + { + "epoch": 0.5014249125211934, + "grad_norm": 2.9860279239235474, + "learning_rate": 2.0878353844294213e-06, + "loss": 0.9774, + "step": 6950 + }, + { + "epoch": 0.5014970599906208, + "grad_norm": 3.7629818325815587, + "learning_rate": 2.0873684731547428e-06, + "loss": 1.0295, + "step": 6951 + }, + { + "epoch": 0.5015692074600483, + "grad_norm": 2.8080914171504414, + "learning_rate": 2.0869015571091956e-06, + "loss": 0.8797, + "step": 6952 + }, + { + "epoch": 0.5016413549294758, + "grad_norm": 2.280121615663155, + "learning_rate": 2.086434636318276e-06, + "loss": 1.0065, + "step": 6953 + }, + { + "epoch": 0.5017135023989033, + "grad_norm": 2.303599576049384, + "learning_rate": 2.0859677108074807e-06, + "loss": 0.9578, + "step": 6954 + }, + { + "epoch": 0.5017856498683309, + "grad_norm": 3.3139299868898577, + "learning_rate": 2.085500780602307e-06, + "loss": 0.8805, + "step": 6955 + }, + { + "epoch": 0.5018577973377584, + "grad_norm": 4.65491010601631, + "learning_rate": 2.0850338457282525e-06, + "loss": 1.0489, + "step": 6956 + }, + { + "epoch": 0.5019299448071859, + "grad_norm": 2.9319761568377625, + "learning_rate": 2.0845669062108146e-06, + "loss": 0.8521, + "step": 6957 + }, + { + "epoch": 0.5020020922766134, + "grad_norm": 2.4837672611859705, + "learning_rate": 2.08409996207549e-06, + "loss": 0.9031, + "step": 6958 + }, + { + "epoch": 0.5020742397460409, + "grad_norm": 23.48612923837307, + "learning_rate": 2.0836330133477787e-06, + "loss": 0.8982, + "step": 6959 + }, + { + "epoch": 0.5021463872154684, + "grad_norm": 2.777964590996319, + "learning_rate": 2.083166060053178e-06, + "loss": 1.0167, + "step": 6960 + }, + { + "epoch": 0.502218534684896, + "grad_norm": 2.154907472044619, + "learning_rate": 2.082699102217186e-06, + "loss": 0.9672, + "step": 6961 + }, + { + "epoch": 0.5022906821543235, + "grad_norm": 2.6453855140555325, + "learning_rate": 2.082232139865303e-06, + "loss": 0.8831, + "step": 6962 + }, + { + "epoch": 0.5023628296237509, + "grad_norm": 2.9166586921219575, + "learning_rate": 2.0817651730230273e-06, + "loss": 0.8439, + "step": 6963 + }, + { + "epoch": 0.5024349770931784, + "grad_norm": 4.533297582560172, + "learning_rate": 2.0812982017158576e-06, + "loss": 0.9089, + "step": 6964 + }, + { + "epoch": 0.5025071245626059, + "grad_norm": 2.4783587274573695, + "learning_rate": 2.0808312259692942e-06, + "loss": 0.971, + "step": 6965 + }, + { + "epoch": 0.5025792720320335, + "grad_norm": 2.203098486341734, + "learning_rate": 2.080364245808837e-06, + "loss": 0.8786, + "step": 6966 + }, + { + "epoch": 0.502651419501461, + "grad_norm": 3.0301925545119133, + "learning_rate": 2.079897261259985e-06, + "loss": 1.122, + "step": 6967 + }, + { + "epoch": 0.5027235669708885, + "grad_norm": 2.534459845725461, + "learning_rate": 2.0794302723482407e-06, + "loss": 0.9134, + "step": 6968 + }, + { + "epoch": 0.502795714440316, + "grad_norm": 2.536479492934767, + "learning_rate": 2.0789632790991027e-06, + "loss": 0.9488, + "step": 6969 + }, + { + "epoch": 0.5028678619097435, + "grad_norm": 2.7620876125276266, + "learning_rate": 2.078496281538072e-06, + "loss": 0.8615, + "step": 6970 + }, + { + "epoch": 0.502940009379171, + "grad_norm": 3.098593879019903, + "learning_rate": 2.07802927969065e-06, + "loss": 0.8417, + "step": 6971 + }, + { + "epoch": 0.5030121568485986, + "grad_norm": 1.891033160075757, + "learning_rate": 2.077562273582338e-06, + "loss": 0.9779, + "step": 6972 + }, + { + "epoch": 0.5030843043180261, + "grad_norm": 3.5509860375231104, + "learning_rate": 2.0770952632386374e-06, + "loss": 1.021, + "step": 6973 + }, + { + "epoch": 0.5031564517874536, + "grad_norm": 20.739265330884884, + "learning_rate": 2.0766282486850503e-06, + "loss": 0.9206, + "step": 6974 + }, + { + "epoch": 0.503228599256881, + "grad_norm": 3.661664295870037, + "learning_rate": 2.0761612299470777e-06, + "loss": 0.9572, + "step": 6975 + }, + { + "epoch": 0.5033007467263085, + "grad_norm": 4.178997916975778, + "learning_rate": 2.075694207050222e-06, + "loss": 0.8703, + "step": 6976 + }, + { + "epoch": 0.503372894195736, + "grad_norm": 6.762778725230168, + "learning_rate": 2.075227180019986e-06, + "loss": 0.8986, + "step": 6977 + }, + { + "epoch": 0.5034450416651636, + "grad_norm": 2.350871849659553, + "learning_rate": 2.0747601488818734e-06, + "loss": 0.9622, + "step": 6978 + }, + { + "epoch": 0.5035171891345911, + "grad_norm": 10.753127242035148, + "learning_rate": 2.0742931136613846e-06, + "loss": 0.9036, + "step": 6979 + }, + { + "epoch": 0.5035893366040186, + "grad_norm": 2.8657645454666563, + "learning_rate": 2.0738260743840243e-06, + "loss": 0.9078, + "step": 6980 + }, + { + "epoch": 0.5036614840734461, + "grad_norm": 4.430479486836201, + "learning_rate": 2.073359031075295e-06, + "loss": 0.9271, + "step": 6981 + }, + { + "epoch": 0.5037336315428736, + "grad_norm": 3.0753018362881623, + "learning_rate": 2.0728919837607007e-06, + "loss": 0.9425, + "step": 6982 + }, + { + "epoch": 0.5038057790123012, + "grad_norm": 3.0141373359561574, + "learning_rate": 2.0724249324657454e-06, + "loss": 0.9295, + "step": 6983 + }, + { + "epoch": 0.5038779264817287, + "grad_norm": 4.786810938397394, + "learning_rate": 2.0719578772159326e-06, + "loss": 0.9519, + "step": 6984 + }, + { + "epoch": 0.5039500739511562, + "grad_norm": 5.2818036212460795, + "learning_rate": 2.071490818036766e-06, + "loss": 0.8846, + "step": 6985 + }, + { + "epoch": 0.5040222214205837, + "grad_norm": 2.9438536089180363, + "learning_rate": 2.0710237549537508e-06, + "loss": 0.796, + "step": 6986 + }, + { + "epoch": 0.5040943688900111, + "grad_norm": 4.048486337253073, + "learning_rate": 2.070556687992391e-06, + "loss": 0.9887, + "step": 6987 + }, + { + "epoch": 0.5041665163594387, + "grad_norm": 2.2398979380039976, + "learning_rate": 2.0700896171781914e-06, + "loss": 0.9159, + "step": 6988 + }, + { + "epoch": 0.5042386638288662, + "grad_norm": 6.933474910953404, + "learning_rate": 2.069622542536658e-06, + "loss": 0.9692, + "step": 6989 + }, + { + "epoch": 0.5043108112982937, + "grad_norm": 3.830559528604804, + "learning_rate": 2.069155464093295e-06, + "loss": 0.9456, + "step": 6990 + }, + { + "epoch": 0.5043829587677212, + "grad_norm": 2.686975472296073, + "learning_rate": 2.068688381873608e-06, + "loss": 0.9362, + "step": 6991 + }, + { + "epoch": 0.5044551062371487, + "grad_norm": 4.3904972804488684, + "learning_rate": 2.068221295903103e-06, + "loss": 0.8444, + "step": 6992 + }, + { + "epoch": 0.5045272537065763, + "grad_norm": 1.1869198235137668, + "learning_rate": 2.0677542062072855e-06, + "loss": 0.7878, + "step": 6993 + }, + { + "epoch": 0.5045994011760038, + "grad_norm": 4.369707666740813, + "learning_rate": 2.067287112811661e-06, + "loss": 0.8974, + "step": 6994 + }, + { + "epoch": 0.5046715486454313, + "grad_norm": 5.454938963942841, + "learning_rate": 2.0668200157417376e-06, + "loss": 0.9231, + "step": 6995 + }, + { + "epoch": 0.5047436961148588, + "grad_norm": 2.8757731186001245, + "learning_rate": 2.06635291502302e-06, + "loss": 0.968, + "step": 6996 + }, + { + "epoch": 0.5048158435842863, + "grad_norm": 2.7685034476353994, + "learning_rate": 2.0658858106810145e-06, + "loss": 0.9373, + "step": 6997 + }, + { + "epoch": 0.5048879910537137, + "grad_norm": 3.046285092883866, + "learning_rate": 2.0654187027412297e-06, + "loss": 0.9859, + "step": 6998 + }, + { + "epoch": 0.5049601385231413, + "grad_norm": 2.2889664665622313, + "learning_rate": 2.0649515912291715e-06, + "loss": 0.8388, + "step": 6999 + }, + { + "epoch": 0.5050322859925688, + "grad_norm": 2.864161919316196, + "learning_rate": 2.064484476170347e-06, + "loss": 0.994, + "step": 7000 + }, + { + "epoch": 0.5051044334619963, + "grad_norm": 2.443407285420314, + "learning_rate": 2.0640173575902645e-06, + "loss": 0.9404, + "step": 7001 + }, + { + "epoch": 0.5051765809314238, + "grad_norm": 2.2870378621532184, + "learning_rate": 2.0635502355144307e-06, + "loss": 0.842, + "step": 7002 + }, + { + "epoch": 0.5052487284008513, + "grad_norm": 2.4494500301985043, + "learning_rate": 2.063083109968354e-06, + "loss": 0.8834, + "step": 7003 + }, + { + "epoch": 0.5053208758702789, + "grad_norm": 3.9612049862683585, + "learning_rate": 2.062615980977542e-06, + "loss": 1.029, + "step": 7004 + }, + { + "epoch": 0.5053930233397064, + "grad_norm": 2.463076385583582, + "learning_rate": 2.0621488485675035e-06, + "loss": 0.9428, + "step": 7005 + }, + { + "epoch": 0.5054651708091339, + "grad_norm": 1.8863337975071315, + "learning_rate": 2.061681712763746e-06, + "loss": 0.811, + "step": 7006 + }, + { + "epoch": 0.5055373182785614, + "grad_norm": 4.3277883019983845, + "learning_rate": 2.0612145735917784e-06, + "loss": 0.9538, + "step": 7007 + }, + { + "epoch": 0.5056094657479889, + "grad_norm": 14.35872358934576, + "learning_rate": 2.0607474310771103e-06, + "loss": 0.878, + "step": 7008 + }, + { + "epoch": 0.5056816132174164, + "grad_norm": 2.3669705433142765, + "learning_rate": 2.0602802852452486e-06, + "loss": 0.8202, + "step": 7009 + }, + { + "epoch": 0.5057537606868439, + "grad_norm": 2.9829273807230026, + "learning_rate": 2.059813136121705e-06, + "loss": 0.8397, + "step": 7010 + }, + { + "epoch": 0.5058259081562714, + "grad_norm": 2.575584685971202, + "learning_rate": 2.0593459837319868e-06, + "loss": 0.8051, + "step": 7011 + }, + { + "epoch": 0.5058980556256989, + "grad_norm": 2.823218795167488, + "learning_rate": 2.058878828101604e-06, + "loss": 0.8709, + "step": 7012 + }, + { + "epoch": 0.5059702030951264, + "grad_norm": 3.715253324022161, + "learning_rate": 2.058411669256067e-06, + "loss": 0.8841, + "step": 7013 + }, + { + "epoch": 0.5060423505645539, + "grad_norm": 3.0165593572537173, + "learning_rate": 2.057944507220884e-06, + "loss": 0.8776, + "step": 7014 + }, + { + "epoch": 0.5061144980339815, + "grad_norm": 4.093421515178682, + "learning_rate": 2.057477342021567e-06, + "loss": 0.9069, + "step": 7015 + }, + { + "epoch": 0.506186645503409, + "grad_norm": 3.5365940942488137, + "learning_rate": 2.057010173683624e-06, + "loss": 0.9452, + "step": 7016 + }, + { + "epoch": 0.5062587929728365, + "grad_norm": 5.3443494315545665, + "learning_rate": 2.0565430022325678e-06, + "loss": 1.0026, + "step": 7017 + }, + { + "epoch": 0.506330940442264, + "grad_norm": 2.5777197317004332, + "learning_rate": 2.056075827693907e-06, + "loss": 0.9146, + "step": 7018 + }, + { + "epoch": 0.5064030879116915, + "grad_norm": 54.45845337990467, + "learning_rate": 2.0556086500931525e-06, + "loss": 1.0161, + "step": 7019 + }, + { + "epoch": 0.506475235381119, + "grad_norm": 2.489627587684573, + "learning_rate": 2.055141469455816e-06, + "loss": 0.9411, + "step": 7020 + }, + { + "epoch": 0.5065473828505466, + "grad_norm": 2.750686646481225, + "learning_rate": 2.054674285807407e-06, + "loss": 1.0223, + "step": 7021 + }, + { + "epoch": 0.506619530319974, + "grad_norm": 11.138470670142688, + "learning_rate": 2.0542070991734387e-06, + "loss": 0.8837, + "step": 7022 + }, + { + "epoch": 0.5066916777894015, + "grad_norm": 2.7091574857664376, + "learning_rate": 2.053739909579421e-06, + "loss": 1.0519, + "step": 7023 + }, + { + "epoch": 0.506763825258829, + "grad_norm": 3.3038378044754633, + "learning_rate": 2.0532727170508655e-06, + "loss": 1.0466, + "step": 7024 + }, + { + "epoch": 0.5068359727282565, + "grad_norm": 3.4932847315438242, + "learning_rate": 2.0528055216132845e-06, + "loss": 0.9627, + "step": 7025 + }, + { + "epoch": 0.506908120197684, + "grad_norm": 2.1935553526858245, + "learning_rate": 2.052338323292189e-06, + "loss": 0.9162, + "step": 7026 + }, + { + "epoch": 0.5069802676671116, + "grad_norm": 3.5747953022861423, + "learning_rate": 2.051871122113092e-06, + "loss": 0.7501, + "step": 7027 + }, + { + "epoch": 0.5070524151365391, + "grad_norm": 2.777986304615336, + "learning_rate": 2.051403918101505e-06, + "loss": 1.0168, + "step": 7028 + }, + { + "epoch": 0.5071245626059666, + "grad_norm": 6.624497016670545, + "learning_rate": 2.05093671128294e-06, + "loss": 0.8874, + "step": 7029 + }, + { + "epoch": 0.5071967100753941, + "grad_norm": 6.257592739099256, + "learning_rate": 2.05046950168291e-06, + "loss": 1.0123, + "step": 7030 + }, + { + "epoch": 0.5072688575448216, + "grad_norm": 3.9917659648949426, + "learning_rate": 2.0500022893269268e-06, + "loss": 0.935, + "step": 7031 + }, + { + "epoch": 0.5073410050142492, + "grad_norm": 3.1416307044226555, + "learning_rate": 2.0495350742405044e-06, + "loss": 0.9846, + "step": 7032 + }, + { + "epoch": 0.5074131524836767, + "grad_norm": 2.53944304915634, + "learning_rate": 2.0490678564491543e-06, + "loss": 0.8784, + "step": 7033 + }, + { + "epoch": 0.5074852999531041, + "grad_norm": 2.986448835648992, + "learning_rate": 2.048600635978391e-06, + "loss": 0.9497, + "step": 7034 + }, + { + "epoch": 0.5075574474225316, + "grad_norm": 3.732255278202433, + "learning_rate": 2.0481334128537266e-06, + "loss": 0.9272, + "step": 7035 + }, + { + "epoch": 0.5076295948919591, + "grad_norm": 2.0058643671584666, + "learning_rate": 2.047666187100674e-06, + "loss": 0.8669, + "step": 7036 + }, + { + "epoch": 0.5077017423613867, + "grad_norm": 2.61627700682369, + "learning_rate": 2.047198958744748e-06, + "loss": 0.9179, + "step": 7037 + }, + { + "epoch": 0.5077738898308142, + "grad_norm": 2.6940567408408223, + "learning_rate": 2.0467317278114616e-06, + "loss": 1.0113, + "step": 7038 + }, + { + "epoch": 0.5078460373002417, + "grad_norm": 2.9130387949797596, + "learning_rate": 2.0462644943263287e-06, + "loss": 0.992, + "step": 7039 + }, + { + "epoch": 0.5079181847696692, + "grad_norm": 2.470657094896073, + "learning_rate": 2.045797258314863e-06, + "loss": 0.8922, + "step": 7040 + }, + { + "epoch": 0.5079903322390967, + "grad_norm": 3.38155632394273, + "learning_rate": 2.045330019802578e-06, + "loss": 0.9394, + "step": 7041 + }, + { + "epoch": 0.5080624797085243, + "grad_norm": 3.5596100981572856, + "learning_rate": 2.0448627788149884e-06, + "loss": 0.941, + "step": 7042 + }, + { + "epoch": 0.5081346271779518, + "grad_norm": 7.087337872839089, + "learning_rate": 2.044395535377609e-06, + "loss": 0.7963, + "step": 7043 + }, + { + "epoch": 0.5082067746473793, + "grad_norm": 3.190836487877529, + "learning_rate": 2.043928289515954e-06, + "loss": 0.947, + "step": 7044 + }, + { + "epoch": 0.5082789221168068, + "grad_norm": 2.309320687157332, + "learning_rate": 2.043461041255537e-06, + "loss": 0.9967, + "step": 7045 + }, + { + "epoch": 0.5083510695862342, + "grad_norm": 3.356426985920654, + "learning_rate": 2.042993790621874e-06, + "loss": 0.8229, + "step": 7046 + }, + { + "epoch": 0.5084232170556617, + "grad_norm": 2.1465012581797356, + "learning_rate": 2.0425265376404797e-06, + "loss": 1.0671, + "step": 7047 + }, + { + "epoch": 0.5084953645250893, + "grad_norm": 2.6831167094140107, + "learning_rate": 2.0420592823368673e-06, + "loss": 0.9051, + "step": 7048 + }, + { + "epoch": 0.5085675119945168, + "grad_norm": 3.0755567341301826, + "learning_rate": 2.041592024736554e-06, + "loss": 1.0057, + "step": 7049 + }, + { + "epoch": 0.5086396594639443, + "grad_norm": 3.16110701811186, + "learning_rate": 2.0411247648650547e-06, + "loss": 0.8691, + "step": 7050 + }, + { + "epoch": 0.5087118069333718, + "grad_norm": 3.519365821193675, + "learning_rate": 2.0406575027478827e-06, + "loss": 0.9433, + "step": 7051 + }, + { + "epoch": 0.5087839544027993, + "grad_norm": 2.656054051969508, + "learning_rate": 2.0401902384105564e-06, + "loss": 0.9328, + "step": 7052 + }, + { + "epoch": 0.5088561018722269, + "grad_norm": 3.0124029626690167, + "learning_rate": 2.0397229718785893e-06, + "loss": 0.888, + "step": 7053 + }, + { + "epoch": 0.5089282493416544, + "grad_norm": 5.867016176447553, + "learning_rate": 2.039255703177498e-06, + "loss": 0.9415, + "step": 7054 + }, + { + "epoch": 0.5090003968110819, + "grad_norm": 4.8659286218089, + "learning_rate": 2.038788432332798e-06, + "loss": 1.0912, + "step": 7055 + }, + { + "epoch": 0.5090725442805094, + "grad_norm": 5.268204145109677, + "learning_rate": 2.0383211593700054e-06, + "loss": 0.7988, + "step": 7056 + }, + { + "epoch": 0.5091446917499368, + "grad_norm": 15.525791982208338, + "learning_rate": 2.037853884314636e-06, + "loss": 0.9232, + "step": 7057 + }, + { + "epoch": 0.5092168392193643, + "grad_norm": 5.058365817439659, + "learning_rate": 2.0373866071922053e-06, + "loss": 0.8702, + "step": 7058 + }, + { + "epoch": 0.5092889866887919, + "grad_norm": 2.46012017134871, + "learning_rate": 2.0369193280282318e-06, + "loss": 0.858, + "step": 7059 + }, + { + "epoch": 0.5093611341582194, + "grad_norm": 2.597086862673586, + "learning_rate": 2.036452046848229e-06, + "loss": 0.9015, + "step": 7060 + }, + { + "epoch": 0.5094332816276469, + "grad_norm": 2.697474223695196, + "learning_rate": 2.0359847636777165e-06, + "loss": 0.9769, + "step": 7061 + }, + { + "epoch": 0.5095054290970744, + "grad_norm": 3.753158764842133, + "learning_rate": 2.035517478542209e-06, + "loss": 1.027, + "step": 7062 + }, + { + "epoch": 0.5095775765665019, + "grad_norm": 3.855264923056926, + "learning_rate": 2.035050191467222e-06, + "loss": 0.8905, + "step": 7063 + }, + { + "epoch": 0.5096497240359295, + "grad_norm": 6.935494072771403, + "learning_rate": 2.0345829024782754e-06, + "loss": 0.9477, + "step": 7064 + }, + { + "epoch": 0.509721871505357, + "grad_norm": 6.049362577797454, + "learning_rate": 2.0341156116008836e-06, + "loss": 0.8775, + "step": 7065 + }, + { + "epoch": 0.5097940189747845, + "grad_norm": 29.051171767326142, + "learning_rate": 2.0336483188605648e-06, + "loss": 1.0052, + "step": 7066 + }, + { + "epoch": 0.509866166444212, + "grad_norm": 0.7799682020161534, + "learning_rate": 2.033181024282836e-06, + "loss": 0.8293, + "step": 7067 + }, + { + "epoch": 0.5099383139136395, + "grad_norm": 12.838297556735217, + "learning_rate": 2.0327137278932144e-06, + "loss": 0.9141, + "step": 7068 + }, + { + "epoch": 0.5100104613830669, + "grad_norm": 10.863275631515986, + "learning_rate": 2.0322464297172166e-06, + "loss": 0.9638, + "step": 7069 + }, + { + "epoch": 0.5100826088524945, + "grad_norm": 2.97455150525414, + "learning_rate": 2.0317791297803615e-06, + "loss": 0.8268, + "step": 7070 + }, + { + "epoch": 0.510154756321922, + "grad_norm": 3.318523185148944, + "learning_rate": 2.0313118281081653e-06, + "loss": 0.9664, + "step": 7071 + }, + { + "epoch": 0.5102269037913495, + "grad_norm": 3.9356889949912723, + "learning_rate": 2.0308445247261455e-06, + "loss": 0.9695, + "step": 7072 + }, + { + "epoch": 0.510299051260777, + "grad_norm": 4.092744893161318, + "learning_rate": 2.0303772196598214e-06, + "loss": 0.7727, + "step": 7073 + }, + { + "epoch": 0.5103711987302045, + "grad_norm": 6.344386477557253, + "learning_rate": 2.0299099129347095e-06, + "loss": 0.9158, + "step": 7074 + }, + { + "epoch": 0.510443346199632, + "grad_norm": 2.982636269768209, + "learning_rate": 2.0294426045763275e-06, + "loss": 0.901, + "step": 7075 + }, + { + "epoch": 0.5105154936690596, + "grad_norm": 5.608343087915758, + "learning_rate": 2.0289752946101944e-06, + "loss": 0.9127, + "step": 7076 + }, + { + "epoch": 0.5105876411384871, + "grad_norm": 4.495701326135707, + "learning_rate": 2.028507983061827e-06, + "loss": 0.9288, + "step": 7077 + }, + { + "epoch": 0.5106597886079146, + "grad_norm": 4.60956430854631, + "learning_rate": 2.028040669956745e-06, + "loss": 0.8883, + "step": 7078 + }, + { + "epoch": 0.5107319360773421, + "grad_norm": 4.302803084023578, + "learning_rate": 2.027573355320466e-06, + "loss": 0.9932, + "step": 7079 + }, + { + "epoch": 0.5108040835467696, + "grad_norm": 3.3029269676068718, + "learning_rate": 2.027106039178507e-06, + "loss": 0.9284, + "step": 7080 + }, + { + "epoch": 0.5108762310161971, + "grad_norm": 2.529701041549997, + "learning_rate": 2.0266387215563884e-06, + "loss": 0.9526, + "step": 7081 + }, + { + "epoch": 0.5109483784856246, + "grad_norm": 2.81365248061612, + "learning_rate": 2.0261714024796273e-06, + "loss": 0.9823, + "step": 7082 + }, + { + "epoch": 0.5110205259550521, + "grad_norm": 5.522998754619656, + "learning_rate": 2.025704081973744e-06, + "loss": 1.0413, + "step": 7083 + }, + { + "epoch": 0.5110926734244796, + "grad_norm": 18.03053829699205, + "learning_rate": 2.025236760064255e-06, + "loss": 0.927, + "step": 7084 + }, + { + "epoch": 0.5111648208939071, + "grad_norm": 2.7488295925494786, + "learning_rate": 2.02476943677668e-06, + "loss": 1.0089, + "step": 7085 + }, + { + "epoch": 0.5112369683633347, + "grad_norm": 2.5785071812193685, + "learning_rate": 2.0243021121365378e-06, + "loss": 0.9629, + "step": 7086 + }, + { + "epoch": 0.5113091158327622, + "grad_norm": 3.012237543726661, + "learning_rate": 2.0238347861693466e-06, + "loss": 0.9631, + "step": 7087 + }, + { + "epoch": 0.5113812633021897, + "grad_norm": 3.026062922093934, + "learning_rate": 2.0233674589006267e-06, + "loss": 0.917, + "step": 7088 + }, + { + "epoch": 0.5114534107716172, + "grad_norm": 3.2119082725985364, + "learning_rate": 2.0229001303558965e-06, + "loss": 0.9184, + "step": 7089 + }, + { + "epoch": 0.5115255582410447, + "grad_norm": 2.8599634450963873, + "learning_rate": 2.0224328005606743e-06, + "loss": 0.8394, + "step": 7090 + }, + { + "epoch": 0.5115977057104723, + "grad_norm": 2.930151655939781, + "learning_rate": 2.021965469540481e-06, + "loss": 0.9798, + "step": 7091 + }, + { + "epoch": 0.5116698531798998, + "grad_norm": 7.550784533835939, + "learning_rate": 2.0214981373208336e-06, + "loss": 0.903, + "step": 7092 + }, + { + "epoch": 0.5117420006493272, + "grad_norm": 2.2172035953295066, + "learning_rate": 2.0210308039272527e-06, + "loss": 0.9164, + "step": 7093 + }, + { + "epoch": 0.5118141481187547, + "grad_norm": 3.8914573422539314, + "learning_rate": 2.020563469385258e-06, + "loss": 0.9319, + "step": 7094 + }, + { + "epoch": 0.5118862955881822, + "grad_norm": 2.4714057255074087, + "learning_rate": 2.0200961337203683e-06, + "loss": 0.8993, + "step": 7095 + }, + { + "epoch": 0.5119584430576097, + "grad_norm": 4.146206760681302, + "learning_rate": 2.019628796958103e-06, + "loss": 1.0396, + "step": 7096 + }, + { + "epoch": 0.5120305905270373, + "grad_norm": 3.459671925773364, + "learning_rate": 2.0191614591239808e-06, + "loss": 0.8501, + "step": 7097 + }, + { + "epoch": 0.5121027379964648, + "grad_norm": 23.608845026315084, + "learning_rate": 2.0186941202435233e-06, + "loss": 0.8807, + "step": 7098 + }, + { + "epoch": 0.5121748854658923, + "grad_norm": 3.8486282282784803, + "learning_rate": 2.0182267803422485e-06, + "loss": 0.8633, + "step": 7099 + }, + { + "epoch": 0.5122470329353198, + "grad_norm": 2.6893172219373342, + "learning_rate": 2.0177594394456773e-06, + "loss": 0.7107, + "step": 7100 + }, + { + "epoch": 0.5123191804047473, + "grad_norm": 4.847003459396372, + "learning_rate": 2.0172920975793288e-06, + "loss": 0.8615, + "step": 7101 + }, + { + "epoch": 0.5123913278741749, + "grad_norm": 2.1267624446423636, + "learning_rate": 2.0168247547687223e-06, + "loss": 0.8575, + "step": 7102 + }, + { + "epoch": 0.5124634753436024, + "grad_norm": 3.4993312741801224, + "learning_rate": 2.016357411039379e-06, + "loss": 0.9629, + "step": 7103 + }, + { + "epoch": 0.5125356228130298, + "grad_norm": 5.73760213657133, + "learning_rate": 2.0158900664168175e-06, + "loss": 0.8677, + "step": 7104 + }, + { + "epoch": 0.5126077702824573, + "grad_norm": 4.390439935768378, + "learning_rate": 2.0154227209265584e-06, + "loss": 0.9368, + "step": 7105 + }, + { + "epoch": 0.5126799177518848, + "grad_norm": 2.948998853557127, + "learning_rate": 2.014955374594122e-06, + "loss": 0.8478, + "step": 7106 + }, + { + "epoch": 0.5127520652213123, + "grad_norm": 0.78115039190446, + "learning_rate": 2.0144880274450275e-06, + "loss": 0.8452, + "step": 7107 + }, + { + "epoch": 0.5128242126907399, + "grad_norm": 2.344137236076693, + "learning_rate": 2.014020679504796e-06, + "loss": 0.8793, + "step": 7108 + }, + { + "epoch": 0.5128963601601674, + "grad_norm": 3.3356686200096513, + "learning_rate": 2.013553330798946e-06, + "loss": 0.8317, + "step": 7109 + }, + { + "epoch": 0.5129685076295949, + "grad_norm": 6.4976878822348345, + "learning_rate": 2.0130859813529994e-06, + "loss": 0.9566, + "step": 7110 + }, + { + "epoch": 0.5130406550990224, + "grad_norm": 4.986520717919385, + "learning_rate": 2.0126186311924763e-06, + "loss": 0.9552, + "step": 7111 + }, + { + "epoch": 0.5131128025684499, + "grad_norm": 0.8633683795026174, + "learning_rate": 2.0121512803428953e-06, + "loss": 0.7936, + "step": 7112 + }, + { + "epoch": 0.5131849500378775, + "grad_norm": 0.7692167927315913, + "learning_rate": 2.0116839288297788e-06, + "loss": 0.7691, + "step": 7113 + }, + { + "epoch": 0.513257097507305, + "grad_norm": 2.7387003666303387, + "learning_rate": 2.0112165766786457e-06, + "loss": 0.8157, + "step": 7114 + }, + { + "epoch": 0.5133292449767325, + "grad_norm": 6.594811810519093, + "learning_rate": 2.010749223915017e-06, + "loss": 0.8248, + "step": 7115 + }, + { + "epoch": 0.5134013924461599, + "grad_norm": 3.3816365583372656, + "learning_rate": 2.0102818705644133e-06, + "loss": 0.8364, + "step": 7116 + }, + { + "epoch": 0.5134735399155874, + "grad_norm": 2.5708808618373817, + "learning_rate": 2.0098145166523546e-06, + "loss": 0.9252, + "step": 7117 + }, + { + "epoch": 0.5135456873850149, + "grad_norm": 2.4815209267245217, + "learning_rate": 2.0093471622043613e-06, + "loss": 0.8157, + "step": 7118 + }, + { + "epoch": 0.5136178348544425, + "grad_norm": 2.184782465531041, + "learning_rate": 2.0088798072459543e-06, + "loss": 0.8304, + "step": 7119 + }, + { + "epoch": 0.51368998232387, + "grad_norm": 2.3633001153366338, + "learning_rate": 2.008412451802654e-06, + "loss": 0.8461, + "step": 7120 + }, + { + "epoch": 0.5137621297932975, + "grad_norm": 2.7228347960589967, + "learning_rate": 2.00794509589998e-06, + "loss": 0.8128, + "step": 7121 + }, + { + "epoch": 0.513834277262725, + "grad_norm": 3.944909768275313, + "learning_rate": 2.007477739563455e-06, + "loss": 1.0079, + "step": 7122 + }, + { + "epoch": 0.5139064247321525, + "grad_norm": 3.0252845504221497, + "learning_rate": 2.007010382818598e-06, + "loss": 0.7903, + "step": 7123 + }, + { + "epoch": 0.51397857220158, + "grad_norm": 3.340374963465241, + "learning_rate": 2.006543025690929e-06, + "loss": 0.9899, + "step": 7124 + }, + { + "epoch": 0.5140507196710076, + "grad_norm": 3.789167202161644, + "learning_rate": 2.0060756682059704e-06, + "loss": 0.9146, + "step": 7125 + }, + { + "epoch": 0.5141228671404351, + "grad_norm": 3.4431033155555113, + "learning_rate": 2.0056083103892415e-06, + "loss": 0.8424, + "step": 7126 + }, + { + "epoch": 0.5141950146098626, + "grad_norm": 6.29241133908179, + "learning_rate": 2.005140952266264e-06, + "loss": 0.9471, + "step": 7127 + }, + { + "epoch": 0.51426716207929, + "grad_norm": 2.9377304960682316, + "learning_rate": 2.004673593862558e-06, + "loss": 0.9106, + "step": 7128 + }, + { + "epoch": 0.5143393095487175, + "grad_norm": 5.377375055809733, + "learning_rate": 2.004206235203644e-06, + "loss": 0.8234, + "step": 7129 + }, + { + "epoch": 0.5144114570181451, + "grad_norm": 2.206568198451275, + "learning_rate": 2.0037388763150437e-06, + "loss": 0.8877, + "step": 7130 + }, + { + "epoch": 0.5144836044875726, + "grad_norm": 3.1505257848893162, + "learning_rate": 2.0032715172222765e-06, + "loss": 0.9609, + "step": 7131 + }, + { + "epoch": 0.5145557519570001, + "grad_norm": 6.678176512673488, + "learning_rate": 2.002804157950865e-06, + "loss": 0.973, + "step": 7132 + }, + { + "epoch": 0.5146278994264276, + "grad_norm": 6.276785255374748, + "learning_rate": 2.002336798526328e-06, + "loss": 0.995, + "step": 7133 + }, + { + "epoch": 0.5147000468958551, + "grad_norm": 4.0970415467047445, + "learning_rate": 2.0018694389741867e-06, + "loss": 0.9433, + "step": 7134 + }, + { + "epoch": 0.5147721943652827, + "grad_norm": 3.8159969114215118, + "learning_rate": 2.0014020793199633e-06, + "loss": 0.8563, + "step": 7135 + }, + { + "epoch": 0.5148443418347102, + "grad_norm": 2.5506846948549207, + "learning_rate": 2.0009347195891767e-06, + "loss": 0.9598, + "step": 7136 + }, + { + "epoch": 0.5149164893041377, + "grad_norm": 2.5845127387283604, + "learning_rate": 2.0004673598073486e-06, + "loss": 0.8201, + "step": 7137 + }, + { + "epoch": 0.5149886367735652, + "grad_norm": 3.074985975528153, + "learning_rate": 1.9999999999999995e-06, + "loss": 0.9514, + "step": 7138 + }, + { + "epoch": 0.5150607842429927, + "grad_norm": 2.866751240066762, + "learning_rate": 1.999532640192651e-06, + "loss": 0.7931, + "step": 7139 + }, + { + "epoch": 0.5151329317124201, + "grad_norm": 0.745446088802663, + "learning_rate": 1.9990652804108236e-06, + "loss": 0.703, + "step": 7140 + }, + { + "epoch": 0.5152050791818477, + "grad_norm": 2.5407307470257128, + "learning_rate": 1.998597920680037e-06, + "loss": 0.9312, + "step": 7141 + }, + { + "epoch": 0.5152772266512752, + "grad_norm": 3.4713235192211136, + "learning_rate": 1.998130561025813e-06, + "loss": 0.896, + "step": 7142 + }, + { + "epoch": 0.5153493741207027, + "grad_norm": 4.975912629577853, + "learning_rate": 1.997663201473672e-06, + "loss": 0.9666, + "step": 7143 + }, + { + "epoch": 0.5154215215901302, + "grad_norm": 3.461007352171426, + "learning_rate": 1.9971958420491356e-06, + "loss": 0.9563, + "step": 7144 + }, + { + "epoch": 0.5154936690595577, + "grad_norm": 4.864044595275531, + "learning_rate": 1.9967284827777233e-06, + "loss": 0.9546, + "step": 7145 + }, + { + "epoch": 0.5155658165289853, + "grad_norm": 2.6563031359575016, + "learning_rate": 1.996261123684956e-06, + "loss": 0.9525, + "step": 7146 + }, + { + "epoch": 0.5156379639984128, + "grad_norm": 0.8147098859221602, + "learning_rate": 1.995793764796356e-06, + "loss": 0.8265, + "step": 7147 + }, + { + "epoch": 0.5157101114678403, + "grad_norm": 2.1556708622381318, + "learning_rate": 1.9953264061374417e-06, + "loss": 1.0076, + "step": 7148 + }, + { + "epoch": 0.5157822589372678, + "grad_norm": 2.900460167253419, + "learning_rate": 1.9948590477337363e-06, + "loss": 0.9492, + "step": 7149 + }, + { + "epoch": 0.5158544064066953, + "grad_norm": 1.9957873562777853, + "learning_rate": 1.9943916896107588e-06, + "loss": 0.8873, + "step": 7150 + }, + { + "epoch": 0.5159265538761227, + "grad_norm": 4.5081205213221205, + "learning_rate": 1.99392433179403e-06, + "loss": 0.8525, + "step": 7151 + }, + { + "epoch": 0.5159987013455503, + "grad_norm": 2.234553296638135, + "learning_rate": 1.9934569743090714e-06, + "loss": 0.9407, + "step": 7152 + }, + { + "epoch": 0.5160708488149778, + "grad_norm": 2.3088167086114963, + "learning_rate": 1.9929896171814024e-06, + "loss": 1.0241, + "step": 7153 + }, + { + "epoch": 0.5161429962844053, + "grad_norm": 3.1180528859271255, + "learning_rate": 1.9925222604365454e-06, + "loss": 1.0018, + "step": 7154 + }, + { + "epoch": 0.5162151437538328, + "grad_norm": 3.8576059340976814, + "learning_rate": 1.99205490410002e-06, + "loss": 0.9756, + "step": 7155 + }, + { + "epoch": 0.5162872912232603, + "grad_norm": 3.1216602026252467, + "learning_rate": 1.9915875481973462e-06, + "loss": 0.9389, + "step": 7156 + }, + { + "epoch": 0.5163594386926879, + "grad_norm": 22.985896554837435, + "learning_rate": 1.991120192754046e-06, + "loss": 1.0104, + "step": 7157 + }, + { + "epoch": 0.5164315861621154, + "grad_norm": 3.1758336512147465, + "learning_rate": 1.9906528377956386e-06, + "loss": 0.8669, + "step": 7158 + }, + { + "epoch": 0.5165037336315429, + "grad_norm": 4.157908402762932, + "learning_rate": 1.9901854833476456e-06, + "loss": 0.9761, + "step": 7159 + }, + { + "epoch": 0.5165758811009704, + "grad_norm": 8.515861521944652, + "learning_rate": 1.989718129435586e-06, + "loss": 0.9866, + "step": 7160 + }, + { + "epoch": 0.5166480285703979, + "grad_norm": 2.5537970154589407, + "learning_rate": 1.9892507760849827e-06, + "loss": 1.059, + "step": 7161 + }, + { + "epoch": 0.5167201760398255, + "grad_norm": 3.30842728629535, + "learning_rate": 1.9887834233213546e-06, + "loss": 0.8684, + "step": 7162 + }, + { + "epoch": 0.5167923235092529, + "grad_norm": 3.3978877181911957, + "learning_rate": 1.988316071170221e-06, + "loss": 0.9478, + "step": 7163 + }, + { + "epoch": 0.5168644709786804, + "grad_norm": 3.392247387582753, + "learning_rate": 1.9878487196571045e-06, + "loss": 0.9237, + "step": 7164 + }, + { + "epoch": 0.5169366184481079, + "grad_norm": 3.0816243220945045, + "learning_rate": 1.9873813688075235e-06, + "loss": 0.932, + "step": 7165 + }, + { + "epoch": 0.5170087659175354, + "grad_norm": 3.5284876188536938, + "learning_rate": 1.9869140186470004e-06, + "loss": 0.7959, + "step": 7166 + }, + { + "epoch": 0.5170809133869629, + "grad_norm": 3.023270318980898, + "learning_rate": 1.9864466692010542e-06, + "loss": 0.9677, + "step": 7167 + }, + { + "epoch": 0.5171530608563905, + "grad_norm": 2.4196441220052467, + "learning_rate": 1.9859793204952043e-06, + "loss": 0.9701, + "step": 7168 + }, + { + "epoch": 0.517225208325818, + "grad_norm": 4.347099714140753, + "learning_rate": 1.9855119725549727e-06, + "loss": 0.9616, + "step": 7169 + }, + { + "epoch": 0.5172973557952455, + "grad_norm": 2.6216481016277355, + "learning_rate": 1.9850446254058783e-06, + "loss": 0.8371, + "step": 7170 + }, + { + "epoch": 0.517369503264673, + "grad_norm": 2.1184830308909373, + "learning_rate": 1.9845772790734414e-06, + "loss": 0.9441, + "step": 7171 + }, + { + "epoch": 0.5174416507341005, + "grad_norm": 2.380407601883496, + "learning_rate": 1.9841099335831827e-06, + "loss": 0.896, + "step": 7172 + }, + { + "epoch": 0.517513798203528, + "grad_norm": 2.7699129301633354, + "learning_rate": 1.983642588960621e-06, + "loss": 0.9719, + "step": 7173 + }, + { + "epoch": 0.5175859456729556, + "grad_norm": 4.630813656327461, + "learning_rate": 1.983175245231278e-06, + "loss": 0.9265, + "step": 7174 + }, + { + "epoch": 0.517658093142383, + "grad_norm": 2.7066147168659636, + "learning_rate": 1.982707902420671e-06, + "loss": 1.0151, + "step": 7175 + }, + { + "epoch": 0.5177302406118105, + "grad_norm": 1.8871059612732073, + "learning_rate": 1.9822405605543225e-06, + "loss": 0.917, + "step": 7176 + }, + { + "epoch": 0.517802388081238, + "grad_norm": 3.3711698485942962, + "learning_rate": 1.9817732196577518e-06, + "loss": 0.9008, + "step": 7177 + }, + { + "epoch": 0.5178745355506655, + "grad_norm": 3.485938846001393, + "learning_rate": 1.9813058797564765e-06, + "loss": 0.8945, + "step": 7178 + }, + { + "epoch": 0.5179466830200931, + "grad_norm": 2.5985691388173353, + "learning_rate": 1.980838540876019e-06, + "loss": 0.8493, + "step": 7179 + }, + { + "epoch": 0.5180188304895206, + "grad_norm": 0.8028036491408043, + "learning_rate": 1.980371203041897e-06, + "loss": 0.8808, + "step": 7180 + }, + { + "epoch": 0.5180909779589481, + "grad_norm": 2.390264945248515, + "learning_rate": 1.979903866279632e-06, + "loss": 0.9267, + "step": 7181 + }, + { + "epoch": 0.5181631254283756, + "grad_norm": 1.7942566744204667, + "learning_rate": 1.979436530614742e-06, + "loss": 1.0057, + "step": 7182 + }, + { + "epoch": 0.5182352728978031, + "grad_norm": 1.5768065279784376, + "learning_rate": 1.978969196072747e-06, + "loss": 0.8994, + "step": 7183 + }, + { + "epoch": 0.5183074203672307, + "grad_norm": 4.160012284774248, + "learning_rate": 1.9785018626791667e-06, + "loss": 1.0734, + "step": 7184 + }, + { + "epoch": 0.5183795678366582, + "grad_norm": 3.1231369568582714, + "learning_rate": 1.9780345304595192e-06, + "loss": 0.8882, + "step": 7185 + }, + { + "epoch": 0.5184517153060857, + "grad_norm": 4.322072117434606, + "learning_rate": 1.9775671994393255e-06, + "loss": 0.8856, + "step": 7186 + }, + { + "epoch": 0.5185238627755131, + "grad_norm": 4.342756198842838, + "learning_rate": 1.9770998696441033e-06, + "loss": 0.807, + "step": 7187 + }, + { + "epoch": 0.5185960102449406, + "grad_norm": 3.237286275635802, + "learning_rate": 1.976632541099373e-06, + "loss": 0.9322, + "step": 7188 + }, + { + "epoch": 0.5186681577143681, + "grad_norm": 1.906119232695347, + "learning_rate": 1.9761652138306536e-06, + "loss": 0.9327, + "step": 7189 + }, + { + "epoch": 0.5187403051837957, + "grad_norm": 2.6478753407572704, + "learning_rate": 1.975697887863462e-06, + "loss": 0.8443, + "step": 7190 + }, + { + "epoch": 0.5188124526532232, + "grad_norm": 2.3627323742779116, + "learning_rate": 1.9752305632233204e-06, + "loss": 0.9792, + "step": 7191 + }, + { + "epoch": 0.5188846001226507, + "grad_norm": 2.49878262443675, + "learning_rate": 1.974763239935745e-06, + "loss": 0.9787, + "step": 7192 + }, + { + "epoch": 0.5189567475920782, + "grad_norm": 3.7658710320219115, + "learning_rate": 1.9742959180262564e-06, + "loss": 0.8592, + "step": 7193 + }, + { + "epoch": 0.5190288950615057, + "grad_norm": 2.5754465697368554, + "learning_rate": 1.9738285975203725e-06, + "loss": 0.9309, + "step": 7194 + }, + { + "epoch": 0.5191010425309333, + "grad_norm": 3.536675934798896, + "learning_rate": 1.9733612784436114e-06, + "loss": 0.9839, + "step": 7195 + }, + { + "epoch": 0.5191731900003608, + "grad_norm": 2.7110176184520403, + "learning_rate": 1.9728939608214927e-06, + "loss": 0.9044, + "step": 7196 + }, + { + "epoch": 0.5192453374697883, + "grad_norm": 1.9934829389551527, + "learning_rate": 1.972426644679534e-06, + "loss": 1.006, + "step": 7197 + }, + { + "epoch": 0.5193174849392158, + "grad_norm": 3.723757537905877, + "learning_rate": 1.971959330043255e-06, + "loss": 0.8271, + "step": 7198 + }, + { + "epoch": 0.5193896324086432, + "grad_norm": 2.8820028848097565, + "learning_rate": 1.9714920169381728e-06, + "loss": 1.0181, + "step": 7199 + }, + { + "epoch": 0.5194617798780707, + "grad_norm": 2.406539701386741, + "learning_rate": 1.971024705389806e-06, + "loss": 0.9613, + "step": 7200 + }, + { + "epoch": 0.5195339273474983, + "grad_norm": 8.080042952621659, + "learning_rate": 1.9705573954236727e-06, + "loss": 1.0329, + "step": 7201 + }, + { + "epoch": 0.5196060748169258, + "grad_norm": 3.446228320828762, + "learning_rate": 1.9700900870652907e-06, + "loss": 0.8966, + "step": 7202 + }, + { + "epoch": 0.5196782222863533, + "grad_norm": 2.420575390028378, + "learning_rate": 1.969622780340179e-06, + "loss": 0.8785, + "step": 7203 + }, + { + "epoch": 0.5197503697557808, + "grad_norm": 3.109783356655185, + "learning_rate": 1.969155475273854e-06, + "loss": 0.8622, + "step": 7204 + }, + { + "epoch": 0.5198225172252083, + "grad_norm": 3.793636117951004, + "learning_rate": 1.968688171891835e-06, + "loss": 0.9398, + "step": 7205 + }, + { + "epoch": 0.5198946646946359, + "grad_norm": 3.601861440701557, + "learning_rate": 1.968220870219639e-06, + "loss": 0.9936, + "step": 7206 + }, + { + "epoch": 0.5199668121640634, + "grad_norm": 0.7333320353958896, + "learning_rate": 1.9677535702827836e-06, + "loss": 0.8044, + "step": 7207 + }, + { + "epoch": 0.5200389596334909, + "grad_norm": 2.4692239427140787, + "learning_rate": 1.967286272106786e-06, + "loss": 0.9657, + "step": 7208 + }, + { + "epoch": 0.5201111071029184, + "grad_norm": 2.815366513393297, + "learning_rate": 1.966818975717164e-06, + "loss": 0.9758, + "step": 7209 + }, + { + "epoch": 0.5201832545723458, + "grad_norm": 4.317577303319722, + "learning_rate": 1.9663516811394355e-06, + "loss": 0.9688, + "step": 7210 + }, + { + "epoch": 0.5202554020417733, + "grad_norm": 2.854991116217639, + "learning_rate": 1.9658843883991166e-06, + "loss": 0.859, + "step": 7211 + }, + { + "epoch": 0.5203275495112009, + "grad_norm": 2.3421535904295663, + "learning_rate": 1.965417097521725e-06, + "loss": 0.9056, + "step": 7212 + }, + { + "epoch": 0.5203996969806284, + "grad_norm": 2.569921771094579, + "learning_rate": 1.9649498085327777e-06, + "loss": 0.9192, + "step": 7213 + }, + { + "epoch": 0.5204718444500559, + "grad_norm": 3.697894043472648, + "learning_rate": 1.9644825214577913e-06, + "loss": 0.8688, + "step": 7214 + }, + { + "epoch": 0.5205439919194834, + "grad_norm": 3.028520279192783, + "learning_rate": 1.9640152363222837e-06, + "loss": 0.9835, + "step": 7215 + }, + { + "epoch": 0.5206161393889109, + "grad_norm": 3.56561785336981, + "learning_rate": 1.9635479531517706e-06, + "loss": 0.8767, + "step": 7216 + }, + { + "epoch": 0.5206882868583385, + "grad_norm": 2.4135029462184274, + "learning_rate": 1.963080671971768e-06, + "loss": 0.9227, + "step": 7217 + }, + { + "epoch": 0.520760434327766, + "grad_norm": 4.727893383457046, + "learning_rate": 1.9626133928077945e-06, + "loss": 0.8413, + "step": 7218 + }, + { + "epoch": 0.5208325817971935, + "grad_norm": 2.387603745528223, + "learning_rate": 1.9621461156853643e-06, + "loss": 0.994, + "step": 7219 + }, + { + "epoch": 0.520904729266621, + "grad_norm": 3.33333870569432, + "learning_rate": 1.961678840629995e-06, + "loss": 0.9417, + "step": 7220 + }, + { + "epoch": 0.5209768767360485, + "grad_norm": 3.0697845703981876, + "learning_rate": 1.9612115676672024e-06, + "loss": 0.936, + "step": 7221 + }, + { + "epoch": 0.5210490242054759, + "grad_norm": 3.1320566992274093, + "learning_rate": 1.9607442968225023e-06, + "loss": 0.8612, + "step": 7222 + }, + { + "epoch": 0.5211211716749035, + "grad_norm": 2.297936226792377, + "learning_rate": 1.9602770281214105e-06, + "loss": 1.0353, + "step": 7223 + }, + { + "epoch": 0.521193319144331, + "grad_norm": 3.469065196520169, + "learning_rate": 1.959809761589444e-06, + "loss": 0.9105, + "step": 7224 + }, + { + "epoch": 0.5212654666137585, + "grad_norm": 2.3287203238823664, + "learning_rate": 1.959342497252117e-06, + "loss": 0.9025, + "step": 7225 + }, + { + "epoch": 0.521337614083186, + "grad_norm": 2.568962322286973, + "learning_rate": 1.9588752351349455e-06, + "loss": 0.9601, + "step": 7226 + }, + { + "epoch": 0.5214097615526135, + "grad_norm": 5.3423481491136355, + "learning_rate": 1.958407975263446e-06, + "loss": 0.9288, + "step": 7227 + }, + { + "epoch": 0.5214819090220411, + "grad_norm": 2.102818223818187, + "learning_rate": 1.957940717663133e-06, + "loss": 1.0005, + "step": 7228 + }, + { + "epoch": 0.5215540564914686, + "grad_norm": 3.177032486972522, + "learning_rate": 1.9574734623595206e-06, + "loss": 0.8956, + "step": 7229 + }, + { + "epoch": 0.5216262039608961, + "grad_norm": 2.7368588746286844, + "learning_rate": 1.9570062093781257e-06, + "loss": 1.0395, + "step": 7230 + }, + { + "epoch": 0.5216983514303236, + "grad_norm": 2.4065208778006744, + "learning_rate": 1.9565389587444627e-06, + "loss": 0.9684, + "step": 7231 + }, + { + "epoch": 0.5217704988997511, + "grad_norm": 7.868045703252909, + "learning_rate": 1.9560717104840457e-06, + "loss": 0.8336, + "step": 7232 + }, + { + "epoch": 0.5218426463691787, + "grad_norm": 0.8173747546529161, + "learning_rate": 1.955604464622391e-06, + "loss": 0.8495, + "step": 7233 + }, + { + "epoch": 0.5219147938386061, + "grad_norm": 2.383243609342307, + "learning_rate": 1.9551372211850114e-06, + "loss": 0.9964, + "step": 7234 + }, + { + "epoch": 0.5219869413080336, + "grad_norm": 4.444486678770514, + "learning_rate": 1.954669980197422e-06, + "loss": 0.9314, + "step": 7235 + }, + { + "epoch": 0.5220590887774611, + "grad_norm": 2.3305680942109697, + "learning_rate": 1.9542027416851374e-06, + "loss": 0.9115, + "step": 7236 + }, + { + "epoch": 0.5221312362468886, + "grad_norm": 2.3591067938537944, + "learning_rate": 1.9537355056736715e-06, + "loss": 0.9011, + "step": 7237 + }, + { + "epoch": 0.5222033837163161, + "grad_norm": 2.519444190598596, + "learning_rate": 1.953268272188538e-06, + "loss": 0.9253, + "step": 7238 + }, + { + "epoch": 0.5222755311857437, + "grad_norm": 2.626633363002708, + "learning_rate": 1.9528010412552517e-06, + "loss": 0.8971, + "step": 7239 + }, + { + "epoch": 0.5223476786551712, + "grad_norm": 3.648032449787105, + "learning_rate": 1.952333812899326e-06, + "loss": 0.92, + "step": 7240 + }, + { + "epoch": 0.5224198261245987, + "grad_norm": 3.307021162530017, + "learning_rate": 1.951866587146273e-06, + "loss": 0.9005, + "step": 7241 + }, + { + "epoch": 0.5224919735940262, + "grad_norm": 0.7084202900038391, + "learning_rate": 1.951399364021609e-06, + "loss": 0.7567, + "step": 7242 + }, + { + "epoch": 0.5225641210634537, + "grad_norm": 2.434489224732645, + "learning_rate": 1.9509321435508455e-06, + "loss": 0.933, + "step": 7243 + }, + { + "epoch": 0.5226362685328813, + "grad_norm": 2.812999002906034, + "learning_rate": 1.9504649257594954e-06, + "loss": 0.967, + "step": 7244 + }, + { + "epoch": 0.5227084160023088, + "grad_norm": 2.888470811017247, + "learning_rate": 1.949997710673073e-06, + "loss": 0.9123, + "step": 7245 + }, + { + "epoch": 0.5227805634717362, + "grad_norm": 3.5239188463910045, + "learning_rate": 1.94953049831709e-06, + "loss": 0.9102, + "step": 7246 + }, + { + "epoch": 0.5228527109411637, + "grad_norm": 3.4458184595182946, + "learning_rate": 1.94906328871706e-06, + "loss": 0.9724, + "step": 7247 + }, + { + "epoch": 0.5229248584105912, + "grad_norm": 3.8555047401673153, + "learning_rate": 1.948596081898495e-06, + "loss": 0.847, + "step": 7248 + }, + { + "epoch": 0.5229970058800187, + "grad_norm": 4.061784651937597, + "learning_rate": 1.948128877886908e-06, + "loss": 1.0065, + "step": 7249 + }, + { + "epoch": 0.5230691533494463, + "grad_norm": 2.7387619140867288, + "learning_rate": 1.9476616767078107e-06, + "loss": 0.9137, + "step": 7250 + }, + { + "epoch": 0.5231413008188738, + "grad_norm": 2.472545169305874, + "learning_rate": 1.9471944783867153e-06, + "loss": 0.8706, + "step": 7251 + }, + { + "epoch": 0.5232134482883013, + "grad_norm": 6.7208413351271075, + "learning_rate": 1.9467272829491347e-06, + "loss": 0.9115, + "step": 7252 + }, + { + "epoch": 0.5232855957577288, + "grad_norm": 4.918476590576182, + "learning_rate": 1.9462600904205787e-06, + "loss": 0.97, + "step": 7253 + }, + { + "epoch": 0.5233577432271563, + "grad_norm": 4.069044505065545, + "learning_rate": 1.9457929008265616e-06, + "loss": 0.9883, + "step": 7254 + }, + { + "epoch": 0.5234298906965839, + "grad_norm": 3.2920604100113446, + "learning_rate": 1.945325714192593e-06, + "loss": 0.9138, + "step": 7255 + }, + { + "epoch": 0.5235020381660114, + "grad_norm": 3.1023504760023726, + "learning_rate": 1.9448585305441843e-06, + "loss": 0.8593, + "step": 7256 + }, + { + "epoch": 0.5235741856354388, + "grad_norm": 2.6996772255289696, + "learning_rate": 1.9443913499068477e-06, + "loss": 0.927, + "step": 7257 + }, + { + "epoch": 0.5236463331048663, + "grad_norm": 4.151430706075754, + "learning_rate": 1.9439241723060933e-06, + "loss": 0.9938, + "step": 7258 + }, + { + "epoch": 0.5237184805742938, + "grad_norm": 2.230951741263016, + "learning_rate": 1.943456997767432e-06, + "loss": 0.8829, + "step": 7259 + }, + { + "epoch": 0.5237906280437213, + "grad_norm": 4.73546605395593, + "learning_rate": 1.9429898263163757e-06, + "loss": 0.9557, + "step": 7260 + }, + { + "epoch": 0.5238627755131489, + "grad_norm": 2.6452004784683263, + "learning_rate": 1.942522657978433e-06, + "loss": 1.0285, + "step": 7261 + }, + { + "epoch": 0.5239349229825764, + "grad_norm": 2.8529276885112287, + "learning_rate": 1.9420554927791156e-06, + "loss": 0.9138, + "step": 7262 + }, + { + "epoch": 0.5240070704520039, + "grad_norm": 4.260176200267334, + "learning_rate": 1.941588330743933e-06, + "loss": 1.0029, + "step": 7263 + }, + { + "epoch": 0.5240792179214314, + "grad_norm": 3.5839391388493977, + "learning_rate": 1.941121171898396e-06, + "loss": 0.9045, + "step": 7264 + }, + { + "epoch": 0.5241513653908589, + "grad_norm": 2.766891017304774, + "learning_rate": 1.940654016268013e-06, + "loss": 0.9248, + "step": 7265 + }, + { + "epoch": 0.5242235128602865, + "grad_norm": 2.5601532587727127, + "learning_rate": 1.940186863878295e-06, + "loss": 0.9068, + "step": 7266 + }, + { + "epoch": 0.524295660329714, + "grad_norm": 2.3416300022970047, + "learning_rate": 1.9397197147547512e-06, + "loss": 0.9581, + "step": 7267 + }, + { + "epoch": 0.5243678077991415, + "grad_norm": 0.7273637189080441, + "learning_rate": 1.93925256892289e-06, + "loss": 0.7838, + "step": 7268 + }, + { + "epoch": 0.5244399552685689, + "grad_norm": 2.84933376221333, + "learning_rate": 1.9387854264082214e-06, + "loss": 0.917, + "step": 7269 + }, + { + "epoch": 0.5245121027379964, + "grad_norm": 2.6675035732999715, + "learning_rate": 1.938318287236254e-06, + "loss": 0.9216, + "step": 7270 + }, + { + "epoch": 0.5245842502074239, + "grad_norm": 3.2878966813758526, + "learning_rate": 1.9378511514324963e-06, + "loss": 0.9534, + "step": 7271 + }, + { + "epoch": 0.5246563976768515, + "grad_norm": 0.7009332830036064, + "learning_rate": 1.937384019022458e-06, + "loss": 0.7813, + "step": 7272 + }, + { + "epoch": 0.524728545146279, + "grad_norm": 3.463849016286564, + "learning_rate": 1.936916890031646e-06, + "loss": 0.7991, + "step": 7273 + }, + { + "epoch": 0.5248006926157065, + "grad_norm": 3.406314114308443, + "learning_rate": 1.936449764485569e-06, + "loss": 0.8079, + "step": 7274 + }, + { + "epoch": 0.524872840085134, + "grad_norm": 4.278801879389987, + "learning_rate": 1.9359826424097354e-06, + "loss": 0.8244, + "step": 7275 + }, + { + "epoch": 0.5249449875545615, + "grad_norm": 4.468638812195757, + "learning_rate": 1.935515523829653e-06, + "loss": 0.8923, + "step": 7276 + }, + { + "epoch": 0.5250171350239891, + "grad_norm": 5.911574774000051, + "learning_rate": 1.9350484087708287e-06, + "loss": 0.9999, + "step": 7277 + }, + { + "epoch": 0.5250892824934166, + "grad_norm": 2.8106727598415007, + "learning_rate": 1.93458129725877e-06, + "loss": 0.9364, + "step": 7278 + }, + { + "epoch": 0.5251614299628441, + "grad_norm": 3.5229246861693464, + "learning_rate": 1.9341141893189858e-06, + "loss": 0.9558, + "step": 7279 + }, + { + "epoch": 0.5252335774322716, + "grad_norm": 3.1382508675033063, + "learning_rate": 1.93364708497698e-06, + "loss": 0.9677, + "step": 7280 + }, + { + "epoch": 0.525305724901699, + "grad_norm": 3.1246151496422763, + "learning_rate": 1.9331799842582627e-06, + "loss": 0.8717, + "step": 7281 + }, + { + "epoch": 0.5253778723711265, + "grad_norm": 2.5432932188171113, + "learning_rate": 1.932712887188339e-06, + "loss": 0.9255, + "step": 7282 + }, + { + "epoch": 0.5254500198405541, + "grad_norm": 2.4474137026230585, + "learning_rate": 1.9322457937927143e-06, + "loss": 1.0099, + "step": 7283 + }, + { + "epoch": 0.5255221673099816, + "grad_norm": 2.4252140776535027, + "learning_rate": 1.9317787040968973e-06, + "loss": 1.0279, + "step": 7284 + }, + { + "epoch": 0.5255943147794091, + "grad_norm": 8.770057577468313, + "learning_rate": 1.931311618126392e-06, + "loss": 0.9739, + "step": 7285 + }, + { + "epoch": 0.5256664622488366, + "grad_norm": 20.940782887453206, + "learning_rate": 1.930844535906705e-06, + "loss": 0.9602, + "step": 7286 + }, + { + "epoch": 0.5257386097182641, + "grad_norm": 3.6542360676895687, + "learning_rate": 1.9303774574633423e-06, + "loss": 1.002, + "step": 7287 + }, + { + "epoch": 0.5258107571876917, + "grad_norm": 2.3562805902331894, + "learning_rate": 1.9299103828218084e-06, + "loss": 0.9384, + "step": 7288 + }, + { + "epoch": 0.5258829046571192, + "grad_norm": 5.785329668020292, + "learning_rate": 1.9294433120076094e-06, + "loss": 0.9735, + "step": 7289 + }, + { + "epoch": 0.5259550521265467, + "grad_norm": 2.740072711898664, + "learning_rate": 1.9289762450462495e-06, + "loss": 0.9219, + "step": 7290 + }, + { + "epoch": 0.5260271995959742, + "grad_norm": 3.2461889370069166, + "learning_rate": 1.9285091819632344e-06, + "loss": 0.8645, + "step": 7291 + }, + { + "epoch": 0.5260993470654017, + "grad_norm": 4.974711937927306, + "learning_rate": 1.9280421227840676e-06, + "loss": 0.8564, + "step": 7292 + }, + { + "epoch": 0.5261714945348291, + "grad_norm": 0.7717572466031523, + "learning_rate": 1.927575067534255e-06, + "loss": 0.7689, + "step": 7293 + }, + { + "epoch": 0.5262436420042567, + "grad_norm": 2.1104200247022065, + "learning_rate": 1.9271080162392995e-06, + "loss": 0.9778, + "step": 7294 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 2.9378251240766122, + "learning_rate": 1.926640968924705e-06, + "loss": 1.0607, + "step": 7295 + }, + { + "epoch": 0.5263879369431117, + "grad_norm": 3.6548876710426343, + "learning_rate": 1.926173925615976e-06, + "loss": 0.9329, + "step": 7296 + }, + { + "epoch": 0.5264600844125392, + "grad_norm": 3.5876333178693622, + "learning_rate": 1.9257068863386156e-06, + "loss": 0.8474, + "step": 7297 + }, + { + "epoch": 0.5265322318819667, + "grad_norm": 2.8232369516878983, + "learning_rate": 1.925239851118127e-06, + "loss": 0.9555, + "step": 7298 + }, + { + "epoch": 0.5266043793513943, + "grad_norm": 4.1753639747810505, + "learning_rate": 1.9247728199800137e-06, + "loss": 1.0172, + "step": 7299 + }, + { + "epoch": 0.5266765268208218, + "grad_norm": 2.898777533067735, + "learning_rate": 1.924305792949778e-06, + "loss": 0.9738, + "step": 7300 + }, + { + "epoch": 0.5267486742902493, + "grad_norm": 2.658298734561338, + "learning_rate": 1.9238387700529226e-06, + "loss": 0.9445, + "step": 7301 + }, + { + "epoch": 0.5268208217596768, + "grad_norm": 3.0813844724951127, + "learning_rate": 1.9233717513149495e-06, + "loss": 1.0505, + "step": 7302 + }, + { + "epoch": 0.5268929692291043, + "grad_norm": 2.6807522896455653, + "learning_rate": 1.9229047367613624e-06, + "loss": 0.9485, + "step": 7303 + }, + { + "epoch": 0.5269651166985317, + "grad_norm": 3.385490925292345, + "learning_rate": 1.922437726417662e-06, + "loss": 0.9501, + "step": 7304 + }, + { + "epoch": 0.5270372641679593, + "grad_norm": 2.932502553797242, + "learning_rate": 1.9219707203093496e-06, + "loss": 0.9274, + "step": 7305 + }, + { + "epoch": 0.5271094116373868, + "grad_norm": 2.6133579761777015, + "learning_rate": 1.9215037184619284e-06, + "loss": 0.8389, + "step": 7306 + }, + { + "epoch": 0.5271815591068143, + "grad_norm": 4.162879876109211, + "learning_rate": 1.921036720900897e-06, + "loss": 0.9408, + "step": 7307 + }, + { + "epoch": 0.5272537065762418, + "grad_norm": 7.945965195274988, + "learning_rate": 1.9205697276517595e-06, + "loss": 0.9179, + "step": 7308 + }, + { + "epoch": 0.5273258540456693, + "grad_norm": 0.7027150336461746, + "learning_rate": 1.920102738740015e-06, + "loss": 0.7821, + "step": 7309 + }, + { + "epoch": 0.5273980015150969, + "grad_norm": 3.20834134253621, + "learning_rate": 1.919635754191163e-06, + "loss": 0.9598, + "step": 7310 + }, + { + "epoch": 0.5274701489845244, + "grad_norm": 2.8984014709372055, + "learning_rate": 1.919168774030706e-06, + "loss": 0.8981, + "step": 7311 + }, + { + "epoch": 0.5275422964539519, + "grad_norm": 3.16852215732161, + "learning_rate": 1.9187017982841426e-06, + "loss": 0.924, + "step": 7312 + }, + { + "epoch": 0.5276144439233794, + "grad_norm": 3.1305963378374027, + "learning_rate": 1.918234826976973e-06, + "loss": 0.9487, + "step": 7313 + }, + { + "epoch": 0.5276865913928069, + "grad_norm": 2.9649010050878095, + "learning_rate": 1.917767860134697e-06, + "loss": 0.9693, + "step": 7314 + }, + { + "epoch": 0.5277587388622345, + "grad_norm": 2.585142353351371, + "learning_rate": 1.917300897782814e-06, + "loss": 0.9134, + "step": 7315 + }, + { + "epoch": 0.5278308863316619, + "grad_norm": 3.009106800966696, + "learning_rate": 1.9168339399468223e-06, + "loss": 0.8575, + "step": 7316 + }, + { + "epoch": 0.5279030338010894, + "grad_norm": 0.8842015773725472, + "learning_rate": 1.9163669866522216e-06, + "loss": 0.8411, + "step": 7317 + }, + { + "epoch": 0.5279751812705169, + "grad_norm": 3.509305300746525, + "learning_rate": 1.91590003792451e-06, + "loss": 0.9348, + "step": 7318 + }, + { + "epoch": 0.5280473287399444, + "grad_norm": 2.527598160404406, + "learning_rate": 1.9154330937891857e-06, + "loss": 0.9174, + "step": 7319 + }, + { + "epoch": 0.5281194762093719, + "grad_norm": 0.631135815028161, + "learning_rate": 1.9149661542717478e-06, + "loss": 0.7234, + "step": 7320 + }, + { + "epoch": 0.5281916236787995, + "grad_norm": 2.755824596420787, + "learning_rate": 1.9144992193976933e-06, + "loss": 0.9197, + "step": 7321 + }, + { + "epoch": 0.528263771148227, + "grad_norm": 5.845113177088919, + "learning_rate": 1.914032289192519e-06, + "loss": 0.9274, + "step": 7322 + }, + { + "epoch": 0.5283359186176545, + "grad_norm": 3.7004446638815205, + "learning_rate": 1.9135653636817243e-06, + "loss": 1.0451, + "step": 7323 + }, + { + "epoch": 0.528408066087082, + "grad_norm": 3.69031791992203, + "learning_rate": 1.9130984428908042e-06, + "loss": 0.9734, + "step": 7324 + }, + { + "epoch": 0.5284802135565095, + "grad_norm": 2.894170452905916, + "learning_rate": 1.912631526845257e-06, + "loss": 0.8962, + "step": 7325 + }, + { + "epoch": 0.5285523610259371, + "grad_norm": 4.484416044359748, + "learning_rate": 1.912164615570579e-06, + "loss": 0.8478, + "step": 7326 + }, + { + "epoch": 0.5286245084953646, + "grad_norm": 7.097372466120935, + "learning_rate": 1.9116977090922655e-06, + "loss": 0.822, + "step": 7327 + }, + { + "epoch": 0.528696655964792, + "grad_norm": 3.037467793109459, + "learning_rate": 1.9112308074358133e-06, + "loss": 0.8997, + "step": 7328 + }, + { + "epoch": 0.5287688034342195, + "grad_norm": 7.606051555417691, + "learning_rate": 1.9107639106267186e-06, + "loss": 0.9422, + "step": 7329 + }, + { + "epoch": 0.528840950903647, + "grad_norm": 3.5757619726000196, + "learning_rate": 1.910297018690476e-06, + "loss": 0.9497, + "step": 7330 + }, + { + "epoch": 0.5289130983730745, + "grad_norm": 4.431530012529885, + "learning_rate": 1.9098301316525818e-06, + "loss": 0.9012, + "step": 7331 + }, + { + "epoch": 0.5289852458425021, + "grad_norm": 2.101649513003201, + "learning_rate": 1.909363249538529e-06, + "loss": 0.9814, + "step": 7332 + }, + { + "epoch": 0.5290573933119296, + "grad_norm": 3.6214821945610143, + "learning_rate": 1.9088963723738147e-06, + "loss": 0.8511, + "step": 7333 + }, + { + "epoch": 0.5291295407813571, + "grad_norm": 7.365382761283683, + "learning_rate": 1.9084295001839316e-06, + "loss": 0.9219, + "step": 7334 + }, + { + "epoch": 0.5292016882507846, + "grad_norm": 2.4345502366738945, + "learning_rate": 1.907962632994375e-06, + "loss": 0.9935, + "step": 7335 + }, + { + "epoch": 0.5292738357202121, + "grad_norm": 6.465137892194403, + "learning_rate": 1.907495770830638e-06, + "loss": 0.8798, + "step": 7336 + }, + { + "epoch": 0.5293459831896397, + "grad_norm": 3.414505779199462, + "learning_rate": 1.9070289137182146e-06, + "loss": 0.9271, + "step": 7337 + }, + { + "epoch": 0.5294181306590672, + "grad_norm": 0.7675734658136266, + "learning_rate": 1.9065620616825982e-06, + "loss": 0.7734, + "step": 7338 + }, + { + "epoch": 0.5294902781284947, + "grad_norm": 3.716303814223457, + "learning_rate": 1.9060952147492814e-06, + "loss": 0.9447, + "step": 7339 + }, + { + "epoch": 0.5295624255979221, + "grad_norm": 3.2452733808427063, + "learning_rate": 1.9056283729437574e-06, + "loss": 0.9254, + "step": 7340 + }, + { + "epoch": 0.5296345730673496, + "grad_norm": 3.4369183134883072, + "learning_rate": 1.9051615362915187e-06, + "loss": 0.9, + "step": 7341 + }, + { + "epoch": 0.5297067205367771, + "grad_norm": 2.0279128611727293, + "learning_rate": 1.9046947048180577e-06, + "loss": 0.872, + "step": 7342 + }, + { + "epoch": 0.5297788680062047, + "grad_norm": 3.0802967136687145, + "learning_rate": 1.9042278785488662e-06, + "loss": 0.9421, + "step": 7343 + }, + { + "epoch": 0.5298510154756322, + "grad_norm": 0.6987234950967185, + "learning_rate": 1.9037610575094348e-06, + "loss": 0.8297, + "step": 7344 + }, + { + "epoch": 0.5299231629450597, + "grad_norm": 2.8390062611058373, + "learning_rate": 1.9032942417252565e-06, + "loss": 0.8763, + "step": 7345 + }, + { + "epoch": 0.5299953104144872, + "grad_norm": 3.1309900476668506, + "learning_rate": 1.9028274312218209e-06, + "loss": 0.9546, + "step": 7346 + }, + { + "epoch": 0.5300674578839147, + "grad_norm": 0.7870138408529562, + "learning_rate": 1.9023606260246204e-06, + "loss": 0.8259, + "step": 7347 + }, + { + "epoch": 0.5301396053533423, + "grad_norm": 3.797477015111198, + "learning_rate": 1.9018938261591448e-06, + "loss": 0.8503, + "step": 7348 + }, + { + "epoch": 0.5302117528227698, + "grad_norm": 3.9918399069721606, + "learning_rate": 1.9014270316508832e-06, + "loss": 0.9064, + "step": 7349 + }, + { + "epoch": 0.5302839002921973, + "grad_norm": 15.056985059176542, + "learning_rate": 1.9009602425253274e-06, + "loss": 0.7727, + "step": 7350 + }, + { + "epoch": 0.5303560477616248, + "grad_norm": 4.976640685708427, + "learning_rate": 1.9004934588079655e-06, + "loss": 0.9606, + "step": 7351 + }, + { + "epoch": 0.5304281952310522, + "grad_norm": 2.88944166551736, + "learning_rate": 1.9000266805242878e-06, + "loss": 0.9596, + "step": 7352 + }, + { + "epoch": 0.5305003427004797, + "grad_norm": 8.312968993759746, + "learning_rate": 1.8995599076997832e-06, + "loss": 0.9486, + "step": 7353 + }, + { + "epoch": 0.5305724901699073, + "grad_norm": 4.891017910222791, + "learning_rate": 1.8990931403599397e-06, + "loss": 0.8845, + "step": 7354 + }, + { + "epoch": 0.5306446376393348, + "grad_norm": 3.6244926097540358, + "learning_rate": 1.898626378530247e-06, + "loss": 0.8912, + "step": 7355 + }, + { + "epoch": 0.5307167851087623, + "grad_norm": 3.72219368621617, + "learning_rate": 1.8981596222361913e-06, + "loss": 0.8478, + "step": 7356 + }, + { + "epoch": 0.5307889325781898, + "grad_norm": 4.128402693690325, + "learning_rate": 1.8976928715032626e-06, + "loss": 0.9182, + "step": 7357 + }, + { + "epoch": 0.5308610800476173, + "grad_norm": 2.991941277554993, + "learning_rate": 1.8972261263569466e-06, + "loss": 0.8982, + "step": 7358 + }, + { + "epoch": 0.5309332275170449, + "grad_norm": 3.394823559992331, + "learning_rate": 1.8967593868227325e-06, + "loss": 0.9738, + "step": 7359 + }, + { + "epoch": 0.5310053749864724, + "grad_norm": 3.3871399860584814, + "learning_rate": 1.8962926529261058e-06, + "loss": 1.0298, + "step": 7360 + }, + { + "epoch": 0.5310775224558999, + "grad_norm": 3.281373230572882, + "learning_rate": 1.8958259246925527e-06, + "loss": 0.819, + "step": 7361 + }, + { + "epoch": 0.5311496699253274, + "grad_norm": 2.8259792723600854, + "learning_rate": 1.895359202147561e-06, + "loss": 0.9236, + "step": 7362 + }, + { + "epoch": 0.5312218173947548, + "grad_norm": 2.6176068781298345, + "learning_rate": 1.8948924853166154e-06, + "loss": 0.9283, + "step": 7363 + }, + { + "epoch": 0.5312939648641823, + "grad_norm": 4.291883419945756, + "learning_rate": 1.8944257742252023e-06, + "loss": 0.8792, + "step": 7364 + }, + { + "epoch": 0.5313661123336099, + "grad_norm": 5.641060851949781, + "learning_rate": 1.8939590688988072e-06, + "loss": 1.0411, + "step": 7365 + }, + { + "epoch": 0.5314382598030374, + "grad_norm": 2.6354894414410515, + "learning_rate": 1.8934923693629141e-06, + "loss": 1.0073, + "step": 7366 + }, + { + "epoch": 0.5315104072724649, + "grad_norm": 3.178470913076896, + "learning_rate": 1.8930256756430092e-06, + "loss": 0.963, + "step": 7367 + }, + { + "epoch": 0.5315825547418924, + "grad_norm": 6.515915386214933, + "learning_rate": 1.8925589877645752e-06, + "loss": 0.8733, + "step": 7368 + }, + { + "epoch": 0.5316547022113199, + "grad_norm": 3.7606117148637805, + "learning_rate": 1.892092305753098e-06, + "loss": 0.9917, + "step": 7369 + }, + { + "epoch": 0.5317268496807475, + "grad_norm": 4.443828100277333, + "learning_rate": 1.8916256296340604e-06, + "loss": 0.9517, + "step": 7370 + }, + { + "epoch": 0.531798997150175, + "grad_norm": 3.1973443694487624, + "learning_rate": 1.8911589594329453e-06, + "loss": 0.8885, + "step": 7371 + }, + { + "epoch": 0.5318711446196025, + "grad_norm": 2.8345506708282997, + "learning_rate": 1.8906922951752374e-06, + "loss": 0.8724, + "step": 7372 + }, + { + "epoch": 0.53194329208903, + "grad_norm": 3.972011997792543, + "learning_rate": 1.8902256368864176e-06, + "loss": 0.9559, + "step": 7373 + }, + { + "epoch": 0.5320154395584575, + "grad_norm": 3.37706523861392, + "learning_rate": 1.8897589845919701e-06, + "loss": 1.1113, + "step": 7374 + }, + { + "epoch": 0.532087587027885, + "grad_norm": 0.8232901164410197, + "learning_rate": 1.8892923383173767e-06, + "loss": 0.7992, + "step": 7375 + }, + { + "epoch": 0.5321597344973125, + "grad_norm": 4.0688900503030405, + "learning_rate": 1.888825698088118e-06, + "loss": 0.9808, + "step": 7376 + }, + { + "epoch": 0.53223188196674, + "grad_norm": 2.518261682773175, + "learning_rate": 1.888359063929677e-06, + "loss": 0.9543, + "step": 7377 + }, + { + "epoch": 0.5323040294361675, + "grad_norm": 13.929494157905275, + "learning_rate": 1.8878924358675338e-06, + "loss": 0.9746, + "step": 7378 + }, + { + "epoch": 0.532376176905595, + "grad_norm": 5.335073465343775, + "learning_rate": 1.8874258139271702e-06, + "loss": 0.9091, + "step": 7379 + }, + { + "epoch": 0.5324483243750225, + "grad_norm": 2.6962273413458115, + "learning_rate": 1.8869591981340653e-06, + "loss": 0.976, + "step": 7380 + }, + { + "epoch": 0.5325204718444501, + "grad_norm": 2.8130517206519228, + "learning_rate": 1.886492588513701e-06, + "loss": 0.7522, + "step": 7381 + }, + { + "epoch": 0.5325926193138776, + "grad_norm": 3.354375339122092, + "learning_rate": 1.8860259850915564e-06, + "loss": 0.9799, + "step": 7382 + }, + { + "epoch": 0.5326647667833051, + "grad_norm": 4.861800100757595, + "learning_rate": 1.8855593878931098e-06, + "loss": 0.9154, + "step": 7383 + }, + { + "epoch": 0.5327369142527326, + "grad_norm": 15.776444290714172, + "learning_rate": 1.8850927969438421e-06, + "loss": 0.9794, + "step": 7384 + }, + { + "epoch": 0.5328090617221601, + "grad_norm": 5.8890265362725165, + "learning_rate": 1.8846262122692308e-06, + "loss": 0.9706, + "step": 7385 + }, + { + "epoch": 0.5328812091915877, + "grad_norm": 4.290165438007447, + "learning_rate": 1.8841596338947557e-06, + "loss": 0.9091, + "step": 7386 + }, + { + "epoch": 0.5329533566610151, + "grad_norm": 3.519798954510865, + "learning_rate": 1.883693061845894e-06, + "loss": 0.8432, + "step": 7387 + }, + { + "epoch": 0.5330255041304426, + "grad_norm": 10.235706318466285, + "learning_rate": 1.8832264961481232e-06, + "loss": 0.8684, + "step": 7388 + }, + { + "epoch": 0.5330976515998701, + "grad_norm": 0.733311593390938, + "learning_rate": 1.8827599368269218e-06, + "loss": 0.8043, + "step": 7389 + }, + { + "epoch": 0.5331697990692976, + "grad_norm": 2.4368789077183477, + "learning_rate": 1.8822933839077656e-06, + "loss": 0.9681, + "step": 7390 + }, + { + "epoch": 0.5332419465387251, + "grad_norm": 2.2301629119005955, + "learning_rate": 1.8818268374161329e-06, + "loss": 0.862, + "step": 7391 + }, + { + "epoch": 0.5333140940081527, + "grad_norm": 3.7873153591158117, + "learning_rate": 1.8813602973774982e-06, + "loss": 0.8556, + "step": 7392 + }, + { + "epoch": 0.5333862414775802, + "grad_norm": 2.688770526149686, + "learning_rate": 1.8808937638173387e-06, + "loss": 1.003, + "step": 7393 + }, + { + "epoch": 0.5334583889470077, + "grad_norm": 3.163088500259459, + "learning_rate": 1.8804272367611302e-06, + "loss": 1.0117, + "step": 7394 + }, + { + "epoch": 0.5335305364164352, + "grad_norm": 3.199212042832942, + "learning_rate": 1.879960716234347e-06, + "loss": 0.8958, + "step": 7395 + }, + { + "epoch": 0.5336026838858627, + "grad_norm": 3.808526266429381, + "learning_rate": 1.8794942022624653e-06, + "loss": 0.8697, + "step": 7396 + }, + { + "epoch": 0.5336748313552903, + "grad_norm": 3.2059020866820136, + "learning_rate": 1.8790276948709593e-06, + "loss": 0.9023, + "step": 7397 + }, + { + "epoch": 0.5337469788247178, + "grad_norm": 4.2883371319972055, + "learning_rate": 1.878561194085302e-06, + "loss": 0.9209, + "step": 7398 + }, + { + "epoch": 0.5338191262941452, + "grad_norm": 0.7684017353957684, + "learning_rate": 1.8780946999309695e-06, + "loss": 0.8103, + "step": 7399 + }, + { + "epoch": 0.5338912737635727, + "grad_norm": 2.944847171360663, + "learning_rate": 1.877628212433433e-06, + "loss": 0.9981, + "step": 7400 + }, + { + "epoch": 0.5339634212330002, + "grad_norm": 2.4995691881915163, + "learning_rate": 1.8771617316181678e-06, + "loss": 0.9521, + "step": 7401 + }, + { + "epoch": 0.5340355687024277, + "grad_norm": 4.285762218933899, + "learning_rate": 1.8766952575106455e-06, + "loss": 0.8826, + "step": 7402 + }, + { + "epoch": 0.5341077161718553, + "grad_norm": 3.711597006569054, + "learning_rate": 1.876228790136339e-06, + "loss": 0.991, + "step": 7403 + }, + { + "epoch": 0.5341798636412828, + "grad_norm": 4.946342373077717, + "learning_rate": 1.8757623295207194e-06, + "loss": 1.032, + "step": 7404 + }, + { + "epoch": 0.5342520111107103, + "grad_norm": 3.192445546091319, + "learning_rate": 1.8752958756892593e-06, + "loss": 0.9682, + "step": 7405 + }, + { + "epoch": 0.5343241585801378, + "grad_norm": 2.2921137864859533, + "learning_rate": 1.8748294286674303e-06, + "loss": 0.8593, + "step": 7406 + }, + { + "epoch": 0.5343963060495653, + "grad_norm": 8.515254077357476, + "learning_rate": 1.8743629884807018e-06, + "loss": 0.9055, + "step": 7407 + }, + { + "epoch": 0.5344684535189929, + "grad_norm": 2.8092367950832497, + "learning_rate": 1.8738965551545463e-06, + "loss": 0.9775, + "step": 7408 + }, + { + "epoch": 0.5345406009884204, + "grad_norm": 0.8727428410394099, + "learning_rate": 1.8734301287144333e-06, + "loss": 0.82, + "step": 7409 + }, + { + "epoch": 0.5346127484578478, + "grad_norm": 2.8981826546803666, + "learning_rate": 1.8729637091858313e-06, + "loss": 0.8623, + "step": 7410 + }, + { + "epoch": 0.5346848959272753, + "grad_norm": 3.9055934506840786, + "learning_rate": 1.8724972965942118e-06, + "loss": 0.8628, + "step": 7411 + }, + { + "epoch": 0.5347570433967028, + "grad_norm": 0.7393455705243396, + "learning_rate": 1.8720308909650423e-06, + "loss": 0.7786, + "step": 7412 + }, + { + "epoch": 0.5348291908661303, + "grad_norm": 1.1694809428208024, + "learning_rate": 1.8715644923237925e-06, + "loss": 0.8228, + "step": 7413 + }, + { + "epoch": 0.5349013383355579, + "grad_norm": 0.7604539200179539, + "learning_rate": 1.8710981006959308e-06, + "loss": 0.8001, + "step": 7414 + }, + { + "epoch": 0.5349734858049854, + "grad_norm": 3.7486580036878907, + "learning_rate": 1.8706317161069241e-06, + "loss": 0.9922, + "step": 7415 + }, + { + "epoch": 0.5350456332744129, + "grad_norm": 2.6215143221635455, + "learning_rate": 1.8701653385822402e-06, + "loss": 0.9833, + "step": 7416 + }, + { + "epoch": 0.5351177807438404, + "grad_norm": 2.326512335037269, + "learning_rate": 1.8696989681473468e-06, + "loss": 0.937, + "step": 7417 + }, + { + "epoch": 0.5351899282132679, + "grad_norm": 3.0242686940363, + "learning_rate": 1.8692326048277108e-06, + "loss": 0.8856, + "step": 7418 + }, + { + "epoch": 0.5352620756826955, + "grad_norm": 2.5142855662030015, + "learning_rate": 1.8687662486487976e-06, + "loss": 0.8715, + "step": 7419 + }, + { + "epoch": 0.535334223152123, + "grad_norm": 3.1338806614456627, + "learning_rate": 1.868299899636074e-06, + "loss": 0.9143, + "step": 7420 + }, + { + "epoch": 0.5354063706215505, + "grad_norm": 0.9398094342214662, + "learning_rate": 1.8678335578150056e-06, + "loss": 0.8484, + "step": 7421 + }, + { + "epoch": 0.5354785180909779, + "grad_norm": 4.734773625377388, + "learning_rate": 1.8673672232110564e-06, + "loss": 0.9246, + "step": 7422 + }, + { + "epoch": 0.5355506655604054, + "grad_norm": 4.186394161135295, + "learning_rate": 1.866900895849693e-06, + "loss": 0.8634, + "step": 7423 + }, + { + "epoch": 0.535622813029833, + "grad_norm": 2.77936764571373, + "learning_rate": 1.8664345757563784e-06, + "loss": 0.9346, + "step": 7424 + }, + { + "epoch": 0.5356949604992605, + "grad_norm": 2.6386978972625075, + "learning_rate": 1.8659682629565779e-06, + "loss": 0.8991, + "step": 7425 + }, + { + "epoch": 0.535767107968688, + "grad_norm": 5.2416709952445215, + "learning_rate": 1.8655019574757542e-06, + "loss": 0.8929, + "step": 7426 + }, + { + "epoch": 0.5358392554381155, + "grad_norm": 2.72030262077067, + "learning_rate": 1.8650356593393703e-06, + "loss": 0.9567, + "step": 7427 + }, + { + "epoch": 0.535911402907543, + "grad_norm": 3.6838495321454543, + "learning_rate": 1.8645693685728896e-06, + "loss": 0.8399, + "step": 7428 + }, + { + "epoch": 0.5359835503769705, + "grad_norm": 3.744881124405058, + "learning_rate": 1.8641030852017742e-06, + "loss": 0.9074, + "step": 7429 + }, + { + "epoch": 0.5360556978463981, + "grad_norm": 14.612688147881162, + "learning_rate": 1.8636368092514869e-06, + "loss": 0.8831, + "step": 7430 + }, + { + "epoch": 0.5361278453158256, + "grad_norm": 4.821528499314522, + "learning_rate": 1.863170540747488e-06, + "loss": 0.9166, + "step": 7431 + }, + { + "epoch": 0.5361999927852531, + "grad_norm": 2.8719813214069227, + "learning_rate": 1.8627042797152394e-06, + "loss": 0.8542, + "step": 7432 + }, + { + "epoch": 0.5362721402546806, + "grad_norm": 6.101314603138786, + "learning_rate": 1.8622380261802025e-06, + "loss": 0.9745, + "step": 7433 + }, + { + "epoch": 0.536344287724108, + "grad_norm": 2.4082789598638588, + "learning_rate": 1.8617717801678356e-06, + "loss": 0.9454, + "step": 7434 + }, + { + "epoch": 0.5364164351935355, + "grad_norm": 2.721639654099859, + "learning_rate": 1.8613055417036014e-06, + "loss": 0.935, + "step": 7435 + }, + { + "epoch": 0.5364885826629631, + "grad_norm": 2.0735116101655797, + "learning_rate": 1.860839310812958e-06, + "loss": 0.9191, + "step": 7436 + }, + { + "epoch": 0.5365607301323906, + "grad_norm": 2.8821997676233826, + "learning_rate": 1.860373087521364e-06, + "loss": 0.8934, + "step": 7437 + }, + { + "epoch": 0.5366328776018181, + "grad_norm": 2.3662679667730306, + "learning_rate": 1.8599068718542793e-06, + "loss": 0.8584, + "step": 7438 + }, + { + "epoch": 0.5367050250712456, + "grad_norm": 7.984885242380405, + "learning_rate": 1.8594406638371616e-06, + "loss": 0.9347, + "step": 7439 + }, + { + "epoch": 0.5367771725406731, + "grad_norm": 2.646870647548443, + "learning_rate": 1.8589744634954687e-06, + "loss": 0.9522, + "step": 7440 + }, + { + "epoch": 0.5368493200101007, + "grad_norm": 2.767611980567668, + "learning_rate": 1.8585082708546588e-06, + "loss": 0.942, + "step": 7441 + }, + { + "epoch": 0.5369214674795282, + "grad_norm": 2.576419613070341, + "learning_rate": 1.858042085940188e-06, + "loss": 1.0053, + "step": 7442 + }, + { + "epoch": 0.5369936149489557, + "grad_norm": 3.60054064505687, + "learning_rate": 1.8575759087775134e-06, + "loss": 0.7281, + "step": 7443 + }, + { + "epoch": 0.5370657624183832, + "grad_norm": 2.5735631174396714, + "learning_rate": 1.8571097393920914e-06, + "loss": 0.9375, + "step": 7444 + }, + { + "epoch": 0.5371379098878107, + "grad_norm": 5.458700657877702, + "learning_rate": 1.8566435778093776e-06, + "loss": 1.0192, + "step": 7445 + }, + { + "epoch": 0.5372100573572381, + "grad_norm": 3.786118077133113, + "learning_rate": 1.8561774240548267e-06, + "loss": 0.9184, + "step": 7446 + }, + { + "epoch": 0.5372822048266657, + "grad_norm": 3.1835429743362664, + "learning_rate": 1.855711278153895e-06, + "loss": 0.8717, + "step": 7447 + }, + { + "epoch": 0.5373543522960932, + "grad_norm": 2.7127957090577297, + "learning_rate": 1.8552451401320367e-06, + "loss": 1.0096, + "step": 7448 + }, + { + "epoch": 0.5374264997655207, + "grad_norm": 4.505438802774042, + "learning_rate": 1.854779010014704e-06, + "loss": 0.9444, + "step": 7449 + }, + { + "epoch": 0.5374986472349482, + "grad_norm": 8.083773676358458, + "learning_rate": 1.854312887827353e-06, + "loss": 0.9097, + "step": 7450 + }, + { + "epoch": 0.5375707947043757, + "grad_norm": 3.2170743562016217, + "learning_rate": 1.8538467735954355e-06, + "loss": 0.9475, + "step": 7451 + }, + { + "epoch": 0.5376429421738033, + "grad_norm": 5.016243869712158, + "learning_rate": 1.8533806673444048e-06, + "loss": 0.9549, + "step": 7452 + }, + { + "epoch": 0.5377150896432308, + "grad_norm": 2.860603777236008, + "learning_rate": 1.8529145690997132e-06, + "loss": 0.8954, + "step": 7453 + }, + { + "epoch": 0.5377872371126583, + "grad_norm": 3.138756343177332, + "learning_rate": 1.8524484788868123e-06, + "loss": 0.8753, + "step": 7454 + }, + { + "epoch": 0.5378593845820858, + "grad_norm": 25.46930717373671, + "learning_rate": 1.8519823967311537e-06, + "loss": 0.7981, + "step": 7455 + }, + { + "epoch": 0.5379315320515133, + "grad_norm": 3.4376180455139664, + "learning_rate": 1.8515163226581885e-06, + "loss": 0.736, + "step": 7456 + }, + { + "epoch": 0.5380036795209409, + "grad_norm": 4.703672459571733, + "learning_rate": 1.8510502566933677e-06, + "loss": 0.8919, + "step": 7457 + }, + { + "epoch": 0.5380758269903683, + "grad_norm": 26.246500417764032, + "learning_rate": 1.8505841988621404e-06, + "loss": 0.982, + "step": 7458 + }, + { + "epoch": 0.5381479744597958, + "grad_norm": 0.8767764586847226, + "learning_rate": 1.8501181491899568e-06, + "loss": 0.8023, + "step": 7459 + }, + { + "epoch": 0.5382201219292233, + "grad_norm": 3.112574632355019, + "learning_rate": 1.849652107702267e-06, + "loss": 0.8837, + "step": 7460 + }, + { + "epoch": 0.5382922693986508, + "grad_norm": 8.144228205543165, + "learning_rate": 1.849186074424518e-06, + "loss": 0.9144, + "step": 7461 + }, + { + "epoch": 0.5383644168680783, + "grad_norm": 2.6630997246503214, + "learning_rate": 1.8487200493821603e-06, + "loss": 0.9075, + "step": 7462 + }, + { + "epoch": 0.5384365643375059, + "grad_norm": 3.0201566968774727, + "learning_rate": 1.8482540326006402e-06, + "loss": 0.817, + "step": 7463 + }, + { + "epoch": 0.5385087118069334, + "grad_norm": 2.6236784196733445, + "learning_rate": 1.847788024105405e-06, + "loss": 0.897, + "step": 7464 + }, + { + "epoch": 0.5385808592763609, + "grad_norm": 0.8629353723627554, + "learning_rate": 1.847322023921903e-06, + "loss": 0.7862, + "step": 7465 + }, + { + "epoch": 0.5386530067457884, + "grad_norm": 5.098613266248925, + "learning_rate": 1.8468560320755801e-06, + "loss": 0.76, + "step": 7466 + }, + { + "epoch": 0.5387251542152159, + "grad_norm": 3.5090904711553703, + "learning_rate": 1.8463900485918822e-06, + "loss": 0.8613, + "step": 7467 + }, + { + "epoch": 0.5387973016846435, + "grad_norm": 5.323303188715561, + "learning_rate": 1.845924073496255e-06, + "loss": 1.0474, + "step": 7468 + }, + { + "epoch": 0.5388694491540709, + "grad_norm": 2.4700877753966686, + "learning_rate": 1.8454581068141442e-06, + "loss": 0.8265, + "step": 7469 + }, + { + "epoch": 0.5389415966234984, + "grad_norm": 3.391221938785214, + "learning_rate": 1.8449921485709936e-06, + "loss": 0.9937, + "step": 7470 + }, + { + "epoch": 0.5390137440929259, + "grad_norm": 3.362456146294609, + "learning_rate": 1.844526198792248e-06, + "loss": 0.9568, + "step": 7471 + }, + { + "epoch": 0.5390858915623534, + "grad_norm": 2.97889165787183, + "learning_rate": 1.8440602575033516e-06, + "loss": 0.9249, + "step": 7472 + }, + { + "epoch": 0.539158039031781, + "grad_norm": 2.8859393108963256, + "learning_rate": 1.8435943247297466e-06, + "loss": 0.9297, + "step": 7473 + }, + { + "epoch": 0.5392301865012085, + "grad_norm": 0.8385995564509392, + "learning_rate": 1.8431284004968772e-06, + "loss": 0.7924, + "step": 7474 + }, + { + "epoch": 0.539302333970636, + "grad_norm": 2.7430422067652738, + "learning_rate": 1.842662484830185e-06, + "loss": 0.979, + "step": 7475 + }, + { + "epoch": 0.5393744814400635, + "grad_norm": 2.59510850374502, + "learning_rate": 1.8421965777551113e-06, + "loss": 0.8323, + "step": 7476 + }, + { + "epoch": 0.539446628909491, + "grad_norm": 3.216841048288842, + "learning_rate": 1.8417306792970994e-06, + "loss": 0.9116, + "step": 7477 + }, + { + "epoch": 0.5395187763789185, + "grad_norm": 2.5946123459113153, + "learning_rate": 1.8412647894815888e-06, + "loss": 0.9727, + "step": 7478 + }, + { + "epoch": 0.5395909238483461, + "grad_norm": 3.0614047524841785, + "learning_rate": 1.8407989083340201e-06, + "loss": 1.013, + "step": 7479 + }, + { + "epoch": 0.5396630713177736, + "grad_norm": 2.303637042119737, + "learning_rate": 1.8403330358798342e-06, + "loss": 0.9686, + "step": 7480 + }, + { + "epoch": 0.539735218787201, + "grad_norm": 3.11185222457271, + "learning_rate": 1.8398671721444696e-06, + "loss": 0.9192, + "step": 7481 + }, + { + "epoch": 0.5398073662566285, + "grad_norm": 2.804009400747869, + "learning_rate": 1.8394013171533663e-06, + "loss": 0.9477, + "step": 7482 + }, + { + "epoch": 0.539879513726056, + "grad_norm": 6.479032590732013, + "learning_rate": 1.8389354709319622e-06, + "loss": 0.8675, + "step": 7483 + }, + { + "epoch": 0.5399516611954835, + "grad_norm": 2.8022284393365946, + "learning_rate": 1.8384696335056964e-06, + "loss": 0.9762, + "step": 7484 + }, + { + "epoch": 0.5400238086649111, + "grad_norm": 3.2660057904483555, + "learning_rate": 1.8380038049000056e-06, + "loss": 0.9588, + "step": 7485 + }, + { + "epoch": 0.5400959561343386, + "grad_norm": 2.402284103521476, + "learning_rate": 1.8375379851403271e-06, + "loss": 0.9653, + "step": 7486 + }, + { + "epoch": 0.5401681036037661, + "grad_norm": 2.291837917055964, + "learning_rate": 1.8370721742520984e-06, + "loss": 0.8525, + "step": 7487 + }, + { + "epoch": 0.5402402510731936, + "grad_norm": 40.91241478565828, + "learning_rate": 1.8366063722607542e-06, + "loss": 0.8987, + "step": 7488 + }, + { + "epoch": 0.5403123985426211, + "grad_norm": 4.545676791219804, + "learning_rate": 1.8361405791917318e-06, + "loss": 0.9726, + "step": 7489 + }, + { + "epoch": 0.5403845460120487, + "grad_norm": 4.048374678765893, + "learning_rate": 1.835674795070466e-06, + "loss": 0.9509, + "step": 7490 + }, + { + "epoch": 0.5404566934814762, + "grad_norm": 3.9071688372469784, + "learning_rate": 1.8352090199223907e-06, + "loss": 0.8876, + "step": 7491 + }, + { + "epoch": 0.5405288409509037, + "grad_norm": 7.4023331855960235, + "learning_rate": 1.8347432537729416e-06, + "loss": 0.874, + "step": 7492 + }, + { + "epoch": 0.5406009884203311, + "grad_norm": 3.3479117760012276, + "learning_rate": 1.834277496647551e-06, + "loss": 0.8834, + "step": 7493 + }, + { + "epoch": 0.5406731358897586, + "grad_norm": 2.4347848686731997, + "learning_rate": 1.833811748571653e-06, + "loss": 0.8786, + "step": 7494 + }, + { + "epoch": 0.5407452833591861, + "grad_norm": 1.889073263158182, + "learning_rate": 1.8333460095706798e-06, + "loss": 0.8355, + "step": 7495 + }, + { + "epoch": 0.5408174308286137, + "grad_norm": 3.280401937699673, + "learning_rate": 1.8328802796700649e-06, + "loss": 0.9295, + "step": 7496 + }, + { + "epoch": 0.5408895782980412, + "grad_norm": 16.596479001112705, + "learning_rate": 1.8324145588952388e-06, + "loss": 1.0191, + "step": 7497 + }, + { + "epoch": 0.5409617257674687, + "grad_norm": 5.48770761987778, + "learning_rate": 1.8319488472716331e-06, + "loss": 0.9118, + "step": 7498 + }, + { + "epoch": 0.5410338732368962, + "grad_norm": 3.1393141406883416, + "learning_rate": 1.8314831448246793e-06, + "loss": 0.9137, + "step": 7499 + }, + { + "epoch": 0.5411060207063237, + "grad_norm": 3.311025183229063, + "learning_rate": 1.8310174515798063e-06, + "loss": 0.9399, + "step": 7500 + }, + { + "epoch": 0.5411781681757513, + "grad_norm": 3.203016325805103, + "learning_rate": 1.8305517675624453e-06, + "loss": 0.8948, + "step": 7501 + }, + { + "epoch": 0.5412503156451788, + "grad_norm": 3.5373327478560985, + "learning_rate": 1.8300860927980253e-06, + "loss": 0.9075, + "step": 7502 + }, + { + "epoch": 0.5413224631146063, + "grad_norm": 3.8482384255789737, + "learning_rate": 1.829620427311974e-06, + "loss": 1.0559, + "step": 7503 + }, + { + "epoch": 0.5413946105840338, + "grad_norm": 2.40328578931929, + "learning_rate": 1.8291547711297211e-06, + "loss": 0.9831, + "step": 7504 + }, + { + "epoch": 0.5414667580534612, + "grad_norm": 3.169908336673955, + "learning_rate": 1.8286891242766934e-06, + "loss": 0.8096, + "step": 7505 + }, + { + "epoch": 0.5415389055228887, + "grad_norm": 11.380806247998962, + "learning_rate": 1.8282234867783182e-06, + "loss": 0.9482, + "step": 7506 + }, + { + "epoch": 0.5416110529923163, + "grad_norm": 2.266124486124933, + "learning_rate": 1.827757858660023e-06, + "loss": 0.9475, + "step": 7507 + }, + { + "epoch": 0.5416832004617438, + "grad_norm": 3.16096537149222, + "learning_rate": 1.8272922399472333e-06, + "loss": 0.7869, + "step": 7508 + }, + { + "epoch": 0.5417553479311713, + "grad_norm": 4.440517420475665, + "learning_rate": 1.8268266306653747e-06, + "loss": 0.9716, + "step": 7509 + }, + { + "epoch": 0.5418274954005988, + "grad_norm": 4.661892947034431, + "learning_rate": 1.826361030839873e-06, + "loss": 0.919, + "step": 7510 + }, + { + "epoch": 0.5418996428700263, + "grad_norm": 2.3499303076941, + "learning_rate": 1.825895440496153e-06, + "loss": 0.986, + "step": 7511 + }, + { + "epoch": 0.5419717903394539, + "grad_norm": 5.446010200101481, + "learning_rate": 1.8254298596596376e-06, + "loss": 0.8956, + "step": 7512 + }, + { + "epoch": 0.5420439378088814, + "grad_norm": 2.9980837742028945, + "learning_rate": 1.824964288355752e-06, + "loss": 0.8804, + "step": 7513 + }, + { + "epoch": 0.5421160852783089, + "grad_norm": 2.1782844430369077, + "learning_rate": 1.824498726609919e-06, + "loss": 0.9537, + "step": 7514 + }, + { + "epoch": 0.5421882327477364, + "grad_norm": 0.8737398337453571, + "learning_rate": 1.8240331744475598e-06, + "loss": 0.7589, + "step": 7515 + }, + { + "epoch": 0.5422603802171638, + "grad_norm": 2.9148059540902547, + "learning_rate": 1.8235676318940985e-06, + "loss": 0.8932, + "step": 7516 + }, + { + "epoch": 0.5423325276865913, + "grad_norm": 2.703694366291863, + "learning_rate": 1.8231020989749552e-06, + "loss": 0.8266, + "step": 7517 + }, + { + "epoch": 0.5424046751560189, + "grad_norm": 8.549908589549274, + "learning_rate": 1.8226365757155516e-06, + "loss": 0.9392, + "step": 7518 + }, + { + "epoch": 0.5424768226254464, + "grad_norm": 2.964840211727961, + "learning_rate": 1.8221710621413081e-06, + "loss": 0.9688, + "step": 7519 + }, + { + "epoch": 0.5425489700948739, + "grad_norm": 3.437331871776206, + "learning_rate": 1.8217055582776444e-06, + "loss": 0.9552, + "step": 7520 + }, + { + "epoch": 0.5426211175643014, + "grad_norm": 5.505904236414877, + "learning_rate": 1.8212400641499802e-06, + "loss": 0.9283, + "step": 7521 + }, + { + "epoch": 0.5426932650337289, + "grad_norm": 2.5207889227199716, + "learning_rate": 1.8207745797837342e-06, + "loss": 0.8897, + "step": 7522 + }, + { + "epoch": 0.5427654125031565, + "grad_norm": 2.3403141523251714, + "learning_rate": 1.8203091052043254e-06, + "loss": 0.885, + "step": 7523 + }, + { + "epoch": 0.542837559972584, + "grad_norm": 3.4637856916160166, + "learning_rate": 1.8198436404371707e-06, + "loss": 0.944, + "step": 7524 + }, + { + "epoch": 0.5429097074420115, + "grad_norm": 2.8223187609887637, + "learning_rate": 1.8193781855076877e-06, + "loss": 0.8725, + "step": 7525 + }, + { + "epoch": 0.542981854911439, + "grad_norm": 5.416224456733921, + "learning_rate": 1.8189127404412941e-06, + "loss": 0.9255, + "step": 7526 + }, + { + "epoch": 0.5430540023808665, + "grad_norm": 0.8646460054655062, + "learning_rate": 1.8184473052634044e-06, + "loss": 0.8495, + "step": 7527 + }, + { + "epoch": 0.543126149850294, + "grad_norm": 3.548038241501799, + "learning_rate": 1.8179818799994362e-06, + "loss": 0.8865, + "step": 7528 + }, + { + "epoch": 0.5431982973197215, + "grad_norm": 2.3915698608939135, + "learning_rate": 1.8175164646748036e-06, + "loss": 0.8281, + "step": 7529 + }, + { + "epoch": 0.543270444789149, + "grad_norm": 4.375979940476547, + "learning_rate": 1.8170510593149207e-06, + "loss": 1.0897, + "step": 7530 + }, + { + "epoch": 0.5433425922585765, + "grad_norm": 0.9258196858986226, + "learning_rate": 1.8165856639452029e-06, + "loss": 0.8488, + "step": 7531 + }, + { + "epoch": 0.543414739728004, + "grad_norm": 0.6827645858958227, + "learning_rate": 1.8161202785910625e-06, + "loss": 0.8146, + "step": 7532 + }, + { + "epoch": 0.5434868871974315, + "grad_norm": 0.6628096037014095, + "learning_rate": 1.8156549032779132e-06, + "loss": 0.8247, + "step": 7533 + }, + { + "epoch": 0.5435590346668591, + "grad_norm": 2.655086486902016, + "learning_rate": 1.8151895380311667e-06, + "loss": 0.9173, + "step": 7534 + }, + { + "epoch": 0.5436311821362866, + "grad_norm": 2.4780092075229176, + "learning_rate": 1.8147241828762365e-06, + "loss": 0.844, + "step": 7535 + }, + { + "epoch": 0.5437033296057141, + "grad_norm": 2.8329214656890365, + "learning_rate": 1.814258837838532e-06, + "loss": 0.9448, + "step": 7536 + }, + { + "epoch": 0.5437754770751416, + "grad_norm": 2.502780226681445, + "learning_rate": 1.8137935029434647e-06, + "loss": 0.9902, + "step": 7537 + }, + { + "epoch": 0.5438476245445691, + "grad_norm": 0.9215701940919652, + "learning_rate": 1.8133281782164455e-06, + "loss": 0.789, + "step": 7538 + }, + { + "epoch": 0.5439197720139967, + "grad_norm": 2.479068294678502, + "learning_rate": 1.8128628636828827e-06, + "loss": 0.8813, + "step": 7539 + }, + { + "epoch": 0.5439919194834241, + "grad_norm": 1.89394805426496, + "learning_rate": 1.8123975593681867e-06, + "loss": 0.8199, + "step": 7540 + }, + { + "epoch": 0.5440640669528516, + "grad_norm": 2.6754420155911216, + "learning_rate": 1.8119322652977658e-06, + "loss": 0.8192, + "step": 7541 + }, + { + "epoch": 0.5441362144222791, + "grad_norm": 0.8061258368290364, + "learning_rate": 1.8114669814970268e-06, + "loss": 0.9257, + "step": 7542 + }, + { + "epoch": 0.5442083618917066, + "grad_norm": 2.1388630542048115, + "learning_rate": 1.8110017079913789e-06, + "loss": 0.9084, + "step": 7543 + }, + { + "epoch": 0.5442805093611341, + "grad_norm": 2.619760279280995, + "learning_rate": 1.8105364448062273e-06, + "loss": 0.918, + "step": 7544 + }, + { + "epoch": 0.5443526568305617, + "grad_norm": 3.306306338776696, + "learning_rate": 1.8100711919669794e-06, + "loss": 0.9626, + "step": 7545 + }, + { + "epoch": 0.5444248042999892, + "grad_norm": 2.1814493694843193, + "learning_rate": 1.809605949499041e-06, + "loss": 0.9058, + "step": 7546 + }, + { + "epoch": 0.5444969517694167, + "grad_norm": 0.6889018375344534, + "learning_rate": 1.8091407174278164e-06, + "loss": 0.7387, + "step": 7547 + }, + { + "epoch": 0.5445690992388442, + "grad_norm": 3.7553854101213426, + "learning_rate": 1.8086754957787107e-06, + "loss": 0.9551, + "step": 7548 + }, + { + "epoch": 0.5446412467082717, + "grad_norm": 3.5395057518539543, + "learning_rate": 1.808210284577128e-06, + "loss": 0.8888, + "step": 7549 + }, + { + "epoch": 0.5447133941776993, + "grad_norm": 2.384035390764005, + "learning_rate": 1.8077450838484719e-06, + "loss": 0.9523, + "step": 7550 + }, + { + "epoch": 0.5447855416471268, + "grad_norm": 3.407610420346708, + "learning_rate": 1.8072798936181452e-06, + "loss": 0.9016, + "step": 7551 + }, + { + "epoch": 0.5448576891165542, + "grad_norm": 6.390082972887803, + "learning_rate": 1.806814713911549e-06, + "loss": 0.9182, + "step": 7552 + }, + { + "epoch": 0.5449298365859817, + "grad_norm": 4.884461440323971, + "learning_rate": 1.8063495447540873e-06, + "loss": 0.9533, + "step": 7553 + }, + { + "epoch": 0.5450019840554092, + "grad_norm": 3.7968132108205332, + "learning_rate": 1.8058843861711592e-06, + "loss": 0.799, + "step": 7554 + }, + { + "epoch": 0.5450741315248367, + "grad_norm": 3.609798257954974, + "learning_rate": 1.805419238188167e-06, + "loss": 0.9867, + "step": 7555 + }, + { + "epoch": 0.5451462789942643, + "grad_norm": 0.8710388371560697, + "learning_rate": 1.8049541008305093e-06, + "loss": 0.7456, + "step": 7556 + }, + { + "epoch": 0.5452184264636918, + "grad_norm": 4.591832597691755, + "learning_rate": 1.8044889741235862e-06, + "loss": 0.8802, + "step": 7557 + }, + { + "epoch": 0.5452905739331193, + "grad_norm": 2.0334198146418574, + "learning_rate": 1.804023858092797e-06, + "loss": 0.9608, + "step": 7558 + }, + { + "epoch": 0.5453627214025468, + "grad_norm": 1.8660167391436124, + "learning_rate": 1.803558752763539e-06, + "loss": 0.9468, + "step": 7559 + }, + { + "epoch": 0.5454348688719743, + "grad_norm": 2.796967744621508, + "learning_rate": 1.80309365816121e-06, + "loss": 0.7914, + "step": 7560 + }, + { + "epoch": 0.5455070163414019, + "grad_norm": 3.455296312756368, + "learning_rate": 1.802628574311208e-06, + "loss": 0.9306, + "step": 7561 + }, + { + "epoch": 0.5455791638108294, + "grad_norm": 2.2933216245855057, + "learning_rate": 1.8021635012389289e-06, + "loss": 0.8651, + "step": 7562 + }, + { + "epoch": 0.5456513112802568, + "grad_norm": 2.4764049010568265, + "learning_rate": 1.8016984389697688e-06, + "loss": 0.8702, + "step": 7563 + }, + { + "epoch": 0.5457234587496843, + "grad_norm": 4.644563692459233, + "learning_rate": 1.8012333875291219e-06, + "loss": 0.907, + "step": 7564 + }, + { + "epoch": 0.5457956062191118, + "grad_norm": 2.547275815913233, + "learning_rate": 1.8007683469423849e-06, + "loss": 0.9093, + "step": 7565 + }, + { + "epoch": 0.5458677536885393, + "grad_norm": 2.7266271195501157, + "learning_rate": 1.8003033172349498e-06, + "loss": 0.9256, + "step": 7566 + }, + { + "epoch": 0.5459399011579669, + "grad_norm": 2.194079830230003, + "learning_rate": 1.7998382984322125e-06, + "loss": 0.9612, + "step": 7567 + }, + { + "epoch": 0.5460120486273944, + "grad_norm": 0.813600015603312, + "learning_rate": 1.7993732905595643e-06, + "loss": 0.801, + "step": 7568 + }, + { + "epoch": 0.5460841960968219, + "grad_norm": 1.9991914187520028, + "learning_rate": 1.7989082936423976e-06, + "loss": 0.8552, + "step": 7569 + }, + { + "epoch": 0.5461563435662494, + "grad_norm": 2.5247943643362003, + "learning_rate": 1.7984433077061052e-06, + "loss": 0.9071, + "step": 7570 + }, + { + "epoch": 0.5462284910356769, + "grad_norm": 3.4383909978082183, + "learning_rate": 1.7979783327760774e-06, + "loss": 0.9321, + "step": 7571 + }, + { + "epoch": 0.5463006385051045, + "grad_norm": 2.875776932269324, + "learning_rate": 1.7975133688777046e-06, + "loss": 0.9338, + "step": 7572 + }, + { + "epoch": 0.546372785974532, + "grad_norm": 3.181312989096597, + "learning_rate": 1.797048416036378e-06, + "loss": 0.9534, + "step": 7573 + }, + { + "epoch": 0.5464449334439595, + "grad_norm": 2.3683097369821424, + "learning_rate": 1.7965834742774855e-06, + "loss": 0.9854, + "step": 7574 + }, + { + "epoch": 0.5465170809133869, + "grad_norm": 3.6192605193814553, + "learning_rate": 1.7961185436264167e-06, + "loss": 0.8478, + "step": 7575 + }, + { + "epoch": 0.5465892283828144, + "grad_norm": 0.9087144294294192, + "learning_rate": 1.7956536241085588e-06, + "loss": 0.8339, + "step": 7576 + }, + { + "epoch": 0.546661375852242, + "grad_norm": 2.3907883532195267, + "learning_rate": 1.795188715749301e-06, + "loss": 0.8333, + "step": 7577 + }, + { + "epoch": 0.5467335233216695, + "grad_norm": 2.667718610305583, + "learning_rate": 1.7947238185740282e-06, + "loss": 0.9239, + "step": 7578 + }, + { + "epoch": 0.546805670791097, + "grad_norm": 2.5388099896975045, + "learning_rate": 1.7942589326081288e-06, + "loss": 0.9475, + "step": 7579 + }, + { + "epoch": 0.5468778182605245, + "grad_norm": 2.1730647498998, + "learning_rate": 1.7937940578769872e-06, + "loss": 0.8547, + "step": 7580 + }, + { + "epoch": 0.546949965729952, + "grad_norm": 2.3129024670888363, + "learning_rate": 1.793329194405988e-06, + "loss": 0.908, + "step": 7581 + }, + { + "epoch": 0.5470221131993795, + "grad_norm": 2.479131960158415, + "learning_rate": 1.7928643422205175e-06, + "loss": 0.87, + "step": 7582 + }, + { + "epoch": 0.5470942606688071, + "grad_norm": 3.4823909293878836, + "learning_rate": 1.7923995013459579e-06, + "loss": 0.9001, + "step": 7583 + }, + { + "epoch": 0.5471664081382346, + "grad_norm": 2.2497460963781992, + "learning_rate": 1.791934671807693e-06, + "loss": 0.9558, + "step": 7584 + }, + { + "epoch": 0.5472385556076621, + "grad_norm": 3.070357538334584, + "learning_rate": 1.7914698536311058e-06, + "loss": 0.9488, + "step": 7585 + }, + { + "epoch": 0.5473107030770896, + "grad_norm": 2.912897526539866, + "learning_rate": 1.7910050468415776e-06, + "loss": 0.8943, + "step": 7586 + }, + { + "epoch": 0.547382850546517, + "grad_norm": 2.634842630424685, + "learning_rate": 1.7905402514644905e-06, + "loss": 1.0044, + "step": 7587 + }, + { + "epoch": 0.5474549980159445, + "grad_norm": 2.3473126609077584, + "learning_rate": 1.790075467525224e-06, + "loss": 0.9563, + "step": 7588 + }, + { + "epoch": 0.5475271454853721, + "grad_norm": 2.852677636729513, + "learning_rate": 1.78961069504916e-06, + "loss": 1.0194, + "step": 7589 + }, + { + "epoch": 0.5475992929547996, + "grad_norm": 5.43017628617169, + "learning_rate": 1.7891459340616773e-06, + "loss": 0.8513, + "step": 7590 + }, + { + "epoch": 0.5476714404242271, + "grad_norm": 2.481548789072584, + "learning_rate": 1.7886811845881533e-06, + "loss": 0.9271, + "step": 7591 + }, + { + "epoch": 0.5477435878936546, + "grad_norm": 2.8648152945264953, + "learning_rate": 1.7882164466539685e-06, + "loss": 0.9167, + "step": 7592 + }, + { + "epoch": 0.5478157353630821, + "grad_norm": 3.1880784351363665, + "learning_rate": 1.7877517202844991e-06, + "loss": 0.9331, + "step": 7593 + }, + { + "epoch": 0.5478878828325097, + "grad_norm": 3.3349849582773285, + "learning_rate": 1.787287005505123e-06, + "loss": 0.9234, + "step": 7594 + }, + { + "epoch": 0.5479600303019372, + "grad_norm": 3.258482646854372, + "learning_rate": 1.786822302341216e-06, + "loss": 0.8329, + "step": 7595 + }, + { + "epoch": 0.5480321777713647, + "grad_norm": 0.71114182941909, + "learning_rate": 1.7863576108181535e-06, + "loss": 0.8237, + "step": 7596 + }, + { + "epoch": 0.5481043252407922, + "grad_norm": 3.324286167702816, + "learning_rate": 1.7858929309613118e-06, + "loss": 0.999, + "step": 7597 + }, + { + "epoch": 0.5481764727102197, + "grad_norm": 2.574765415061426, + "learning_rate": 1.785428262796064e-06, + "loss": 0.9442, + "step": 7598 + }, + { + "epoch": 0.5482486201796472, + "grad_norm": 2.3833563512299714, + "learning_rate": 1.7849636063477843e-06, + "loss": 0.9391, + "step": 7599 + }, + { + "epoch": 0.5483207676490747, + "grad_norm": 2.3585874746314333, + "learning_rate": 1.784498961641846e-06, + "loss": 0.9259, + "step": 7600 + }, + { + "epoch": 0.5483929151185022, + "grad_norm": 2.335119914502508, + "learning_rate": 1.7840343287036222e-06, + "loss": 0.8793, + "step": 7601 + }, + { + "epoch": 0.5484650625879297, + "grad_norm": 2.684595801051017, + "learning_rate": 1.7835697075584843e-06, + "loss": 0.9141, + "step": 7602 + }, + { + "epoch": 0.5485372100573572, + "grad_norm": 2.2310022895148074, + "learning_rate": 1.7831050982318026e-06, + "loss": 1.0154, + "step": 7603 + }, + { + "epoch": 0.5486093575267847, + "grad_norm": 3.266198650880586, + "learning_rate": 1.7826405007489493e-06, + "loss": 0.8097, + "step": 7604 + }, + { + "epoch": 0.5486815049962123, + "grad_norm": 2.529860503682134, + "learning_rate": 1.7821759151352928e-06, + "loss": 0.9924, + "step": 7605 + }, + { + "epoch": 0.5487536524656398, + "grad_norm": 2.065989230501775, + "learning_rate": 1.7817113414162042e-06, + "loss": 1.0631, + "step": 7606 + }, + { + "epoch": 0.5488257999350673, + "grad_norm": 7.254935327684646, + "learning_rate": 1.781246779617051e-06, + "loss": 0.8431, + "step": 7607 + }, + { + "epoch": 0.5488979474044948, + "grad_norm": 3.7980887274479755, + "learning_rate": 1.7807822297632004e-06, + "loss": 0.9284, + "step": 7608 + }, + { + "epoch": 0.5489700948739223, + "grad_norm": 3.25904965409046, + "learning_rate": 1.7803176918800217e-06, + "loss": 0.939, + "step": 7609 + }, + { + "epoch": 0.5490422423433499, + "grad_norm": 2.079889566534308, + "learning_rate": 1.7798531659928803e-06, + "loss": 0.8745, + "step": 7610 + }, + { + "epoch": 0.5491143898127773, + "grad_norm": 2.711524218299697, + "learning_rate": 1.7793886521271426e-06, + "loss": 1.0001, + "step": 7611 + }, + { + "epoch": 0.5491865372822048, + "grad_norm": 11.371061544526706, + "learning_rate": 1.7789241503081743e-06, + "loss": 0.9983, + "step": 7612 + }, + { + "epoch": 0.5492586847516323, + "grad_norm": 2.4281310956527413, + "learning_rate": 1.778459660561339e-06, + "loss": 0.9113, + "step": 7613 + }, + { + "epoch": 0.5493308322210598, + "grad_norm": 2.7711665782494452, + "learning_rate": 1.7779951829120023e-06, + "loss": 0.8621, + "step": 7614 + }, + { + "epoch": 0.5494029796904873, + "grad_norm": 1.0450669802869732, + "learning_rate": 1.777530717385526e-06, + "loss": 0.8136, + "step": 7615 + }, + { + "epoch": 0.5494751271599149, + "grad_norm": 5.1242379343028785, + "learning_rate": 1.7770662640072744e-06, + "loss": 0.9378, + "step": 7616 + }, + { + "epoch": 0.5495472746293424, + "grad_norm": 34.82514918309755, + "learning_rate": 1.776601822802608e-06, + "loss": 0.868, + "step": 7617 + }, + { + "epoch": 0.5496194220987699, + "grad_norm": 3.5539936436481985, + "learning_rate": 1.77613739379689e-06, + "loss": 0.8188, + "step": 7618 + }, + { + "epoch": 0.5496915695681974, + "grad_norm": 2.3107962647069606, + "learning_rate": 1.77567297701548e-06, + "loss": 1.0299, + "step": 7619 + }, + { + "epoch": 0.5497637170376249, + "grad_norm": 2.687076357785461, + "learning_rate": 1.7752085724837375e-06, + "loss": 0.9209, + "step": 7620 + }, + { + "epoch": 0.5498358645070525, + "grad_norm": 3.1905042664703074, + "learning_rate": 1.7747441802270238e-06, + "loss": 0.9752, + "step": 7621 + }, + { + "epoch": 0.5499080119764799, + "grad_norm": 4.562871003416954, + "learning_rate": 1.774279800270696e-06, + "loss": 0.9053, + "step": 7622 + }, + { + "epoch": 0.5499801594459074, + "grad_norm": 4.674426144553339, + "learning_rate": 1.7738154326401127e-06, + "loss": 0.9369, + "step": 7623 + }, + { + "epoch": 0.5500523069153349, + "grad_norm": 2.5161597589796916, + "learning_rate": 1.7733510773606318e-06, + "loss": 0.9615, + "step": 7624 + }, + { + "epoch": 0.5501244543847624, + "grad_norm": 2.793548822520149, + "learning_rate": 1.772886734457609e-06, + "loss": 0.9121, + "step": 7625 + }, + { + "epoch": 0.55019660185419, + "grad_norm": 3.2666215608038818, + "learning_rate": 1.7724224039564012e-06, + "loss": 0.9185, + "step": 7626 + }, + { + "epoch": 0.5502687493236175, + "grad_norm": 3.0551554523881164, + "learning_rate": 1.771958085882363e-06, + "loss": 0.9233, + "step": 7627 + }, + { + "epoch": 0.550340896793045, + "grad_norm": 3.120192224498719, + "learning_rate": 1.7714937802608501e-06, + "loss": 0.831, + "step": 7628 + }, + { + "epoch": 0.5504130442624725, + "grad_norm": 5.516580558327646, + "learning_rate": 1.771029487117216e-06, + "loss": 0.8776, + "step": 7629 + }, + { + "epoch": 0.5504851917319, + "grad_norm": 2.6007898231426925, + "learning_rate": 1.770565206476813e-06, + "loss": 0.8445, + "step": 7630 + }, + { + "epoch": 0.5505573392013275, + "grad_norm": 3.0277163959156876, + "learning_rate": 1.7701009383649958e-06, + "loss": 0.8502, + "step": 7631 + }, + { + "epoch": 0.5506294866707551, + "grad_norm": 5.860660666240984, + "learning_rate": 1.7696366828071144e-06, + "loss": 0.9787, + "step": 7632 + }, + { + "epoch": 0.5507016341401826, + "grad_norm": 2.4818255700996374, + "learning_rate": 1.769172439828522e-06, + "loss": 0.9932, + "step": 7633 + }, + { + "epoch": 0.55077378160961, + "grad_norm": 4.048234983537096, + "learning_rate": 1.7687082094545678e-06, + "loss": 0.7876, + "step": 7634 + }, + { + "epoch": 0.5508459290790375, + "grad_norm": 2.9228003357522288, + "learning_rate": 1.7682439917106014e-06, + "loss": 0.9498, + "step": 7635 + }, + { + "epoch": 0.550918076548465, + "grad_norm": 0.8091486625423197, + "learning_rate": 1.7677797866219737e-06, + "loss": 0.8077, + "step": 7636 + }, + { + "epoch": 0.5509902240178925, + "grad_norm": 3.5701698591924624, + "learning_rate": 1.7673155942140315e-06, + "loss": 0.9519, + "step": 7637 + }, + { + "epoch": 0.5510623714873201, + "grad_norm": 1.8783136333367847, + "learning_rate": 1.7668514145121239e-06, + "loss": 0.9175, + "step": 7638 + }, + { + "epoch": 0.5511345189567476, + "grad_norm": 3.534687226094541, + "learning_rate": 1.7663872475415966e-06, + "loss": 0.9273, + "step": 7639 + }, + { + "epoch": 0.5512066664261751, + "grad_norm": 5.996314188232146, + "learning_rate": 1.7659230933277979e-06, + "loss": 0.8734, + "step": 7640 + }, + { + "epoch": 0.5512788138956026, + "grad_norm": 4.694340305314434, + "learning_rate": 1.7654589518960723e-06, + "loss": 0.9567, + "step": 7641 + }, + { + "epoch": 0.5513509613650301, + "grad_norm": 7.0830274029656355, + "learning_rate": 1.7649948232717642e-06, + "loss": 0.893, + "step": 7642 + }, + { + "epoch": 0.5514231088344577, + "grad_norm": 3.9879346317684674, + "learning_rate": 1.7645307074802197e-06, + "loss": 1.0102, + "step": 7643 + }, + { + "epoch": 0.5514952563038852, + "grad_norm": 1.8483139061257974, + "learning_rate": 1.7640666045467811e-06, + "loss": 0.9373, + "step": 7644 + }, + { + "epoch": 0.5515674037733127, + "grad_norm": 3.209012459221675, + "learning_rate": 1.7636025144967924e-06, + "loss": 0.9702, + "step": 7645 + }, + { + "epoch": 0.5516395512427401, + "grad_norm": 3.499822203343634, + "learning_rate": 1.7631384373555953e-06, + "loss": 0.893, + "step": 7646 + }, + { + "epoch": 0.5517116987121676, + "grad_norm": 3.5867400424452396, + "learning_rate": 1.7626743731485306e-06, + "loss": 0.881, + "step": 7647 + }, + { + "epoch": 0.5517838461815952, + "grad_norm": 2.8435791299648376, + "learning_rate": 1.7622103219009406e-06, + "loss": 0.9617, + "step": 7648 + }, + { + "epoch": 0.5518559936510227, + "grad_norm": 4.482378897533618, + "learning_rate": 1.761746283638164e-06, + "loss": 0.9306, + "step": 7649 + }, + { + "epoch": 0.5519281411204502, + "grad_norm": 3.834480335184674, + "learning_rate": 1.7612822583855416e-06, + "loss": 0.9459, + "step": 7650 + }, + { + "epoch": 0.5520002885898777, + "grad_norm": 3.172077191302344, + "learning_rate": 1.7608182461684107e-06, + "loss": 0.9212, + "step": 7651 + }, + { + "epoch": 0.5520724360593052, + "grad_norm": 5.158323298841504, + "learning_rate": 1.7603542470121103e-06, + "loss": 0.9676, + "step": 7652 + }, + { + "epoch": 0.5521445835287327, + "grad_norm": 1.9979682974032869, + "learning_rate": 1.7598902609419772e-06, + "loss": 0.943, + "step": 7653 + }, + { + "epoch": 0.5522167309981603, + "grad_norm": 5.014060088264358, + "learning_rate": 1.7594262879833477e-06, + "loss": 0.9577, + "step": 7654 + }, + { + "epoch": 0.5522888784675878, + "grad_norm": 4.002733726944419, + "learning_rate": 1.7589623281615586e-06, + "loss": 0.787, + "step": 7655 + }, + { + "epoch": 0.5523610259370153, + "grad_norm": 3.0970167387429006, + "learning_rate": 1.7584983815019447e-06, + "loss": 0.8671, + "step": 7656 + }, + { + "epoch": 0.5524331734064428, + "grad_norm": 2.768419997825979, + "learning_rate": 1.7580344480298392e-06, + "loss": 0.9312, + "step": 7657 + }, + { + "epoch": 0.5525053208758702, + "grad_norm": 29.539427191282204, + "learning_rate": 1.7575705277705779e-06, + "loss": 0.8262, + "step": 7658 + }, + { + "epoch": 0.5525774683452978, + "grad_norm": 2.5544002266448578, + "learning_rate": 1.7571066207494916e-06, + "loss": 0.9037, + "step": 7659 + }, + { + "epoch": 0.5526496158147253, + "grad_norm": 4.267874658962498, + "learning_rate": 1.7566427269919144e-06, + "loss": 0.8543, + "step": 7660 + }, + { + "epoch": 0.5527217632841528, + "grad_norm": 0.8024749336606066, + "learning_rate": 1.7561788465231764e-06, + "loss": 0.7692, + "step": 7661 + }, + { + "epoch": 0.5527939107535803, + "grad_norm": 4.7076607057729545, + "learning_rate": 1.7557149793686097e-06, + "loss": 0.9462, + "step": 7662 + }, + { + "epoch": 0.5528660582230078, + "grad_norm": 4.335155959618278, + "learning_rate": 1.7552511255535432e-06, + "loss": 0.9119, + "step": 7663 + }, + { + "epoch": 0.5529382056924353, + "grad_norm": 3.840217641980824, + "learning_rate": 1.7547872851033065e-06, + "loss": 0.8368, + "step": 7664 + }, + { + "epoch": 0.5530103531618629, + "grad_norm": 5.895355349015198, + "learning_rate": 1.7543234580432288e-06, + "loss": 1.0287, + "step": 7665 + }, + { + "epoch": 0.5530825006312904, + "grad_norm": 4.235935725905083, + "learning_rate": 1.7538596443986372e-06, + "loss": 1.0285, + "step": 7666 + }, + { + "epoch": 0.5531546481007179, + "grad_norm": 2.8338664245246883, + "learning_rate": 1.7533958441948597e-06, + "loss": 0.9508, + "step": 7667 + }, + { + "epoch": 0.5532267955701454, + "grad_norm": 2.827528495476343, + "learning_rate": 1.7529320574572224e-06, + "loss": 0.9131, + "step": 7668 + }, + { + "epoch": 0.5532989430395728, + "grad_norm": 3.904395067393303, + "learning_rate": 1.7524682842110498e-06, + "loss": 0.8803, + "step": 7669 + }, + { + "epoch": 0.5533710905090004, + "grad_norm": 2.2612201885664343, + "learning_rate": 1.7520045244816691e-06, + "loss": 0.9664, + "step": 7670 + }, + { + "epoch": 0.5534432379784279, + "grad_norm": 3.5009991718237865, + "learning_rate": 1.751540778294402e-06, + "loss": 0.8231, + "step": 7671 + }, + { + "epoch": 0.5535153854478554, + "grad_norm": 2.406699894053141, + "learning_rate": 1.7510770456745742e-06, + "loss": 1.0066, + "step": 7672 + }, + { + "epoch": 0.5535875329172829, + "grad_norm": 3.105707814353631, + "learning_rate": 1.7506133266475076e-06, + "loss": 0.8375, + "step": 7673 + }, + { + "epoch": 0.5536596803867104, + "grad_norm": 3.097187329044901, + "learning_rate": 1.7501496212385235e-06, + "loss": 0.817, + "step": 7674 + }, + { + "epoch": 0.553731827856138, + "grad_norm": 3.855998056055109, + "learning_rate": 1.7496859294729439e-06, + "loss": 0.9264, + "step": 7675 + }, + { + "epoch": 0.5538039753255655, + "grad_norm": 2.626821249448922, + "learning_rate": 1.7492222513760888e-06, + "loss": 0.9425, + "step": 7676 + }, + { + "epoch": 0.553876122794993, + "grad_norm": 2.461296321806912, + "learning_rate": 1.7487585869732786e-06, + "loss": 0.9869, + "step": 7677 + }, + { + "epoch": 0.5539482702644205, + "grad_norm": 2.9517000181473976, + "learning_rate": 1.7482949362898317e-06, + "loss": 0.8789, + "step": 7678 + }, + { + "epoch": 0.554020417733848, + "grad_norm": 2.4027158867638354, + "learning_rate": 1.7478312993510663e-06, + "loss": 0.9775, + "step": 7679 + }, + { + "epoch": 0.5540925652032755, + "grad_norm": 5.023969607255205, + "learning_rate": 1.7473676761823006e-06, + "loss": 0.8795, + "step": 7680 + }, + { + "epoch": 0.554164712672703, + "grad_norm": 3.217715495257909, + "learning_rate": 1.74690406680885e-06, + "loss": 0.9474, + "step": 7681 + }, + { + "epoch": 0.5542368601421305, + "grad_norm": 3.0556361293765604, + "learning_rate": 1.7464404712560325e-06, + "loss": 0.9635, + "step": 7682 + }, + { + "epoch": 0.554309007611558, + "grad_norm": 2.50104167217449, + "learning_rate": 1.7459768895491611e-06, + "loss": 0.9212, + "step": 7683 + }, + { + "epoch": 0.5543811550809855, + "grad_norm": 2.269925580590354, + "learning_rate": 1.7455133217135524e-06, + "loss": 1.0026, + "step": 7684 + }, + { + "epoch": 0.554453302550413, + "grad_norm": 0.7315363706332692, + "learning_rate": 1.7450497677745193e-06, + "loss": 0.7933, + "step": 7685 + }, + { + "epoch": 0.5545254500198405, + "grad_norm": 2.6928534305716045, + "learning_rate": 1.7445862277573738e-06, + "loss": 0.9403, + "step": 7686 + }, + { + "epoch": 0.5545975974892681, + "grad_norm": 2.3110125113610347, + "learning_rate": 1.744122701687429e-06, + "loss": 0.9251, + "step": 7687 + }, + { + "epoch": 0.5546697449586956, + "grad_norm": 2.7782842704164836, + "learning_rate": 1.7436591895899964e-06, + "loss": 0.9068, + "step": 7688 + }, + { + "epoch": 0.5547418924281231, + "grad_norm": 3.656845305144093, + "learning_rate": 1.7431956914903868e-06, + "loss": 0.9603, + "step": 7689 + }, + { + "epoch": 0.5548140398975506, + "grad_norm": 3.2084507280810666, + "learning_rate": 1.7427322074139095e-06, + "loss": 0.9323, + "step": 7690 + }, + { + "epoch": 0.5548861873669781, + "grad_norm": 2.875883049731832, + "learning_rate": 1.7422687373858737e-06, + "loss": 0.9054, + "step": 7691 + }, + { + "epoch": 0.5549583348364057, + "grad_norm": 2.2357251549634576, + "learning_rate": 1.741805281431589e-06, + "loss": 0.8725, + "step": 7692 + }, + { + "epoch": 0.5550304823058331, + "grad_norm": 2.1731722682917063, + "learning_rate": 1.7413418395763606e-06, + "loss": 0.9494, + "step": 7693 + }, + { + "epoch": 0.5551026297752606, + "grad_norm": 2.1960592391356872, + "learning_rate": 1.7408784118454982e-06, + "loss": 1.0047, + "step": 7694 + }, + { + "epoch": 0.5551747772446881, + "grad_norm": 2.869603607237574, + "learning_rate": 1.7404149982643061e-06, + "loss": 0.906, + "step": 7695 + }, + { + "epoch": 0.5552469247141156, + "grad_norm": 2.849718140166419, + "learning_rate": 1.7399515988580895e-06, + "loss": 0.863, + "step": 7696 + }, + { + "epoch": 0.5553190721835431, + "grad_norm": 2.7259035379886773, + "learning_rate": 1.7394882136521541e-06, + "loss": 0.9239, + "step": 7697 + }, + { + "epoch": 0.5553912196529707, + "grad_norm": 2.4076640940485254, + "learning_rate": 1.7390248426718027e-06, + "loss": 0.8562, + "step": 7698 + }, + { + "epoch": 0.5554633671223982, + "grad_norm": 3.665082517061868, + "learning_rate": 1.738561485942338e-06, + "loss": 0.8531, + "step": 7699 + }, + { + "epoch": 0.5555355145918257, + "grad_norm": 4.888287555433843, + "learning_rate": 1.7380981434890638e-06, + "loss": 0.9279, + "step": 7700 + }, + { + "epoch": 0.5556076620612532, + "grad_norm": 2.756392937584768, + "learning_rate": 1.7376348153372797e-06, + "loss": 0.9681, + "step": 7701 + }, + { + "epoch": 0.5556798095306807, + "grad_norm": 2.343676756668055, + "learning_rate": 1.7371715015122871e-06, + "loss": 0.95, + "step": 7702 + }, + { + "epoch": 0.5557519570001083, + "grad_norm": 3.359667104397125, + "learning_rate": 1.7367082020393861e-06, + "loss": 0.8886, + "step": 7703 + }, + { + "epoch": 0.5558241044695358, + "grad_norm": 7.14816133559787, + "learning_rate": 1.7362449169438758e-06, + "loss": 0.9018, + "step": 7704 + }, + { + "epoch": 0.5558962519389632, + "grad_norm": 4.164929498302472, + "learning_rate": 1.7357816462510532e-06, + "loss": 1.017, + "step": 7705 + }, + { + "epoch": 0.5559683994083907, + "grad_norm": 4.60050871980013, + "learning_rate": 1.7353183899862179e-06, + "loss": 0.9043, + "step": 7706 + }, + { + "epoch": 0.5560405468778182, + "grad_norm": 3.2272951342675484, + "learning_rate": 1.7348551481746654e-06, + "loss": 0.8142, + "step": 7707 + }, + { + "epoch": 0.5561126943472458, + "grad_norm": 2.9974804470515806, + "learning_rate": 1.734391920841691e-06, + "loss": 1.0413, + "step": 7708 + }, + { + "epoch": 0.5561848418166733, + "grad_norm": 3.58465194097891, + "learning_rate": 1.7339287080125911e-06, + "loss": 0.9571, + "step": 7709 + }, + { + "epoch": 0.5562569892861008, + "grad_norm": 2.2486057200009513, + "learning_rate": 1.7334655097126591e-06, + "loss": 0.8971, + "step": 7710 + }, + { + "epoch": 0.5563291367555283, + "grad_norm": 2.342958443807568, + "learning_rate": 1.733002325967189e-06, + "loss": 0.9467, + "step": 7711 + }, + { + "epoch": 0.5564012842249558, + "grad_norm": 0.6827478897637602, + "learning_rate": 1.732539156801474e-06, + "loss": 0.8284, + "step": 7712 + }, + { + "epoch": 0.5564734316943833, + "grad_norm": 5.5408497067157585, + "learning_rate": 1.7320760022408052e-06, + "loss": 0.9671, + "step": 7713 + }, + { + "epoch": 0.5565455791638109, + "grad_norm": 3.0127236437586022, + "learning_rate": 1.7316128623104737e-06, + "loss": 0.9138, + "step": 7714 + }, + { + "epoch": 0.5566177266332384, + "grad_norm": 3.70471093062992, + "learning_rate": 1.7311497370357708e-06, + "loss": 0.8361, + "step": 7715 + }, + { + "epoch": 0.5566898741026658, + "grad_norm": 2.7333724445553145, + "learning_rate": 1.7306866264419858e-06, + "loss": 0.9106, + "step": 7716 + }, + { + "epoch": 0.5567620215720933, + "grad_norm": 2.3913152358972, + "learning_rate": 1.7302235305544063e-06, + "loss": 0.9407, + "step": 7717 + }, + { + "epoch": 0.5568341690415208, + "grad_norm": 3.482045168781428, + "learning_rate": 1.7297604493983216e-06, + "loss": 0.8995, + "step": 7718 + }, + { + "epoch": 0.5569063165109484, + "grad_norm": 3.468018463252813, + "learning_rate": 1.7292973829990185e-06, + "loss": 0.9364, + "step": 7719 + }, + { + "epoch": 0.5569784639803759, + "grad_norm": 2.138035007914594, + "learning_rate": 1.7288343313817825e-06, + "loss": 0.9733, + "step": 7720 + }, + { + "epoch": 0.5570506114498034, + "grad_norm": 2.0931139307193565, + "learning_rate": 1.7283712945719007e-06, + "loss": 0.9098, + "step": 7721 + }, + { + "epoch": 0.5571227589192309, + "grad_norm": 2.732964288019707, + "learning_rate": 1.727908272594657e-06, + "loss": 0.9967, + "step": 7722 + }, + { + "epoch": 0.5571949063886584, + "grad_norm": 2.7377426771481055, + "learning_rate": 1.7274452654753345e-06, + "loss": 0.9178, + "step": 7723 + }, + { + "epoch": 0.557267053858086, + "grad_norm": 2.1251044528197403, + "learning_rate": 1.7269822732392177e-06, + "loss": 0.9808, + "step": 7724 + }, + { + "epoch": 0.5573392013275135, + "grad_norm": 2.3404838692627328, + "learning_rate": 1.726519295911588e-06, + "loss": 0.9354, + "step": 7725 + }, + { + "epoch": 0.557411348796941, + "grad_norm": 1.8134567760893854, + "learning_rate": 1.726056333517727e-06, + "loss": 0.8623, + "step": 7726 + }, + { + "epoch": 0.5574834962663685, + "grad_norm": 4.803048699492061, + "learning_rate": 1.7255933860829156e-06, + "loss": 0.879, + "step": 7727 + }, + { + "epoch": 0.5575556437357959, + "grad_norm": 4.919432600224775, + "learning_rate": 1.725130453632434e-06, + "loss": 0.8426, + "step": 7728 + }, + { + "epoch": 0.5576277912052234, + "grad_norm": 2.4040078950901473, + "learning_rate": 1.7246675361915605e-06, + "loss": 0.836, + "step": 7729 + }, + { + "epoch": 0.557699938674651, + "grad_norm": 4.800172349696623, + "learning_rate": 1.7242046337855736e-06, + "loss": 0.903, + "step": 7730 + }, + { + "epoch": 0.5577720861440785, + "grad_norm": 2.8516670103386903, + "learning_rate": 1.723741746439751e-06, + "loss": 0.9151, + "step": 7731 + }, + { + "epoch": 0.557844233613506, + "grad_norm": 3.158189838407426, + "learning_rate": 1.723278874179368e-06, + "loss": 0.8978, + "step": 7732 + }, + { + "epoch": 0.5579163810829335, + "grad_norm": 3.121961871552758, + "learning_rate": 1.7228160170297023e-06, + "loss": 0.9186, + "step": 7733 + }, + { + "epoch": 0.557988528552361, + "grad_norm": 0.8130716733577017, + "learning_rate": 1.7223531750160279e-06, + "loss": 0.8095, + "step": 7734 + }, + { + "epoch": 0.5580606760217885, + "grad_norm": 3.248854361865397, + "learning_rate": 1.7218903481636179e-06, + "loss": 0.8216, + "step": 7735 + }, + { + "epoch": 0.5581328234912161, + "grad_norm": 2.98244138060501, + "learning_rate": 1.7214275364977471e-06, + "loss": 0.8887, + "step": 7736 + }, + { + "epoch": 0.5582049709606436, + "grad_norm": 2.5649922044721083, + "learning_rate": 1.7209647400436871e-06, + "loss": 0.9712, + "step": 7737 + }, + { + "epoch": 0.5582771184300711, + "grad_norm": 6.49317162648548, + "learning_rate": 1.7205019588267097e-06, + "loss": 0.8016, + "step": 7738 + }, + { + "epoch": 0.5583492658994986, + "grad_norm": 2.495615738780595, + "learning_rate": 1.720039192872086e-06, + "loss": 0.9646, + "step": 7739 + }, + { + "epoch": 0.558421413368926, + "grad_norm": 3.527504749362487, + "learning_rate": 1.719576442205085e-06, + "loss": 0.8779, + "step": 7740 + }, + { + "epoch": 0.5584935608383536, + "grad_norm": 2.6652536721658238, + "learning_rate": 1.7191137068509765e-06, + "loss": 0.8983, + "step": 7741 + }, + { + "epoch": 0.5585657083077811, + "grad_norm": 2.1190352031309327, + "learning_rate": 1.7186509868350288e-06, + "loss": 0.8801, + "step": 7742 + }, + { + "epoch": 0.5586378557772086, + "grad_norm": 3.8961975145032732, + "learning_rate": 1.7181882821825095e-06, + "loss": 0.9275, + "step": 7743 + }, + { + "epoch": 0.5587100032466361, + "grad_norm": 0.775682754641015, + "learning_rate": 1.7177255929186845e-06, + "loss": 0.8343, + "step": 7744 + }, + { + "epoch": 0.5587821507160636, + "grad_norm": 5.030456480392515, + "learning_rate": 1.7172629190688199e-06, + "loss": 0.8815, + "step": 7745 + }, + { + "epoch": 0.5588542981854911, + "grad_norm": 3.1050635115413607, + "learning_rate": 1.7168002606581811e-06, + "loss": 0.9127, + "step": 7746 + }, + { + "epoch": 0.5589264456549187, + "grad_norm": 11.947992157746109, + "learning_rate": 1.7163376177120307e-06, + "loss": 0.9127, + "step": 7747 + }, + { + "epoch": 0.5589985931243462, + "grad_norm": 2.2136454359676527, + "learning_rate": 1.715874990255634e-06, + "loss": 0.9676, + "step": 7748 + }, + { + "epoch": 0.5590707405937737, + "grad_norm": 3.1139860228982505, + "learning_rate": 1.7154123783142517e-06, + "loss": 0.946, + "step": 7749 + }, + { + "epoch": 0.5591428880632012, + "grad_norm": 2.862230553085071, + "learning_rate": 1.714949781913146e-06, + "loss": 0.9525, + "step": 7750 + }, + { + "epoch": 0.5592150355326287, + "grad_norm": 4.325370710945985, + "learning_rate": 1.714487201077578e-06, + "loss": 0.8543, + "step": 7751 + }, + { + "epoch": 0.5592871830020562, + "grad_norm": 2.6680307674321067, + "learning_rate": 1.7140246358328065e-06, + "loss": 1.0192, + "step": 7752 + }, + { + "epoch": 0.5593593304714837, + "grad_norm": 4.603471292080762, + "learning_rate": 1.713562086204091e-06, + "loss": 0.9136, + "step": 7753 + }, + { + "epoch": 0.5594314779409112, + "grad_norm": 11.287333615450745, + "learning_rate": 1.7130995522166893e-06, + "loss": 0.9148, + "step": 7754 + }, + { + "epoch": 0.5595036254103387, + "grad_norm": 0.7150623841847014, + "learning_rate": 1.7126370338958598e-06, + "loss": 0.8297, + "step": 7755 + }, + { + "epoch": 0.5595757728797662, + "grad_norm": 3.259364647999725, + "learning_rate": 1.7121745312668577e-06, + "loss": 1.0044, + "step": 7756 + }, + { + "epoch": 0.5596479203491938, + "grad_norm": 2.3767430785505588, + "learning_rate": 1.7117120443549387e-06, + "loss": 0.9428, + "step": 7757 + }, + { + "epoch": 0.5597200678186213, + "grad_norm": 0.6778739763435402, + "learning_rate": 1.7112495731853583e-06, + "loss": 0.8, + "step": 7758 + }, + { + "epoch": 0.5597922152880488, + "grad_norm": 2.5062475342393573, + "learning_rate": 1.710787117783369e-06, + "loss": 0.8864, + "step": 7759 + }, + { + "epoch": 0.5598643627574763, + "grad_norm": 2.6685778703054615, + "learning_rate": 1.7103246781742253e-06, + "loss": 0.9446, + "step": 7760 + }, + { + "epoch": 0.5599365102269038, + "grad_norm": 3.8193506695573713, + "learning_rate": 1.7098622543831787e-06, + "loss": 0.8849, + "step": 7761 + }, + { + "epoch": 0.5600086576963313, + "grad_norm": 2.3671562771894057, + "learning_rate": 1.7093998464354793e-06, + "loss": 0.7551, + "step": 7762 + }, + { + "epoch": 0.5600808051657589, + "grad_norm": 2.37366889995864, + "learning_rate": 1.7089374543563798e-06, + "loss": 0.8022, + "step": 7763 + }, + { + "epoch": 0.5601529526351863, + "grad_norm": 2.601424938150587, + "learning_rate": 1.7084750781711274e-06, + "loss": 0.9489, + "step": 7764 + }, + { + "epoch": 0.5602251001046138, + "grad_norm": 7.60143428118033, + "learning_rate": 1.7080127179049722e-06, + "loss": 0.9224, + "step": 7765 + }, + { + "epoch": 0.5602972475740413, + "grad_norm": 2.6551132462482334, + "learning_rate": 1.7075503735831621e-06, + "loss": 0.9569, + "step": 7766 + }, + { + "epoch": 0.5603693950434688, + "grad_norm": 2.8085465513462835, + "learning_rate": 1.7070880452309426e-06, + "loss": 0.9494, + "step": 7767 + }, + { + "epoch": 0.5604415425128964, + "grad_norm": 3.3615443123026383, + "learning_rate": 1.706625732873561e-06, + "loss": 0.9482, + "step": 7768 + }, + { + "epoch": 0.5605136899823239, + "grad_norm": 0.6940937075754676, + "learning_rate": 1.7061634365362618e-06, + "loss": 0.8408, + "step": 7769 + }, + { + "epoch": 0.5605858374517514, + "grad_norm": 3.1434747473040843, + "learning_rate": 1.7057011562442902e-06, + "loss": 0.8658, + "step": 7770 + }, + { + "epoch": 0.5606579849211789, + "grad_norm": 2.4008136125165316, + "learning_rate": 1.7052388920228884e-06, + "loss": 0.9075, + "step": 7771 + }, + { + "epoch": 0.5607301323906064, + "grad_norm": 3.5895132549852553, + "learning_rate": 1.7047766438973002e-06, + "loss": 0.8896, + "step": 7772 + }, + { + "epoch": 0.560802279860034, + "grad_norm": 0.7688184816205151, + "learning_rate": 1.7043144118927664e-06, + "loss": 0.7741, + "step": 7773 + }, + { + "epoch": 0.5608744273294615, + "grad_norm": 3.271615698157139, + "learning_rate": 1.703852196034527e-06, + "loss": 0.8753, + "step": 7774 + }, + { + "epoch": 0.5609465747988889, + "grad_norm": 3.5369192880141833, + "learning_rate": 1.7033899963478241e-06, + "loss": 1.0496, + "step": 7775 + }, + { + "epoch": 0.5610187222683164, + "grad_norm": 3.130328708731246, + "learning_rate": 1.702927812857895e-06, + "loss": 0.9101, + "step": 7776 + }, + { + "epoch": 0.5610908697377439, + "grad_norm": 2.9278267202643744, + "learning_rate": 1.7024656455899784e-06, + "loss": 1.067, + "step": 7777 + }, + { + "epoch": 0.5611630172071714, + "grad_norm": 2.550571403809624, + "learning_rate": 1.7020034945693115e-06, + "loss": 0.9185, + "step": 7778 + }, + { + "epoch": 0.561235164676599, + "grad_norm": 3.790776759645348, + "learning_rate": 1.7015413598211306e-06, + "loss": 1.0064, + "step": 7779 + }, + { + "epoch": 0.5613073121460265, + "grad_norm": 2.6497736348331595, + "learning_rate": 1.701079241370671e-06, + "loss": 0.8443, + "step": 7780 + }, + { + "epoch": 0.561379459615454, + "grad_norm": 2.9206857147861123, + "learning_rate": 1.7006171392431674e-06, + "loss": 0.951, + "step": 7781 + }, + { + "epoch": 0.5614516070848815, + "grad_norm": 3.4702461384040757, + "learning_rate": 1.7001550534638542e-06, + "loss": 0.9344, + "step": 7782 + }, + { + "epoch": 0.561523754554309, + "grad_norm": 3.642638472397224, + "learning_rate": 1.6996929840579629e-06, + "loss": 1.0419, + "step": 7783 + }, + { + "epoch": 0.5615959020237365, + "grad_norm": 2.866406409699505, + "learning_rate": 1.6992309310507263e-06, + "loss": 1.0444, + "step": 7784 + }, + { + "epoch": 0.5616680494931641, + "grad_norm": 4.08520688003503, + "learning_rate": 1.6987688944673752e-06, + "loss": 0.8918, + "step": 7785 + }, + { + "epoch": 0.5617401969625916, + "grad_norm": 3.197497676940981, + "learning_rate": 1.698306874333139e-06, + "loss": 0.9394, + "step": 7786 + }, + { + "epoch": 0.561812344432019, + "grad_norm": 5.198815746437382, + "learning_rate": 1.6978448706732485e-06, + "loss": 0.9338, + "step": 7787 + }, + { + "epoch": 0.5618844919014465, + "grad_norm": 3.5066065425093997, + "learning_rate": 1.6973828835129309e-06, + "loss": 0.9411, + "step": 7788 + }, + { + "epoch": 0.561956639370874, + "grad_norm": 2.2463526302915846, + "learning_rate": 1.696920912877413e-06, + "loss": 0.9721, + "step": 7789 + }, + { + "epoch": 0.5620287868403016, + "grad_norm": 4.6708489251309, + "learning_rate": 1.6964589587919228e-06, + "loss": 0.9582, + "step": 7790 + }, + { + "epoch": 0.5621009343097291, + "grad_norm": 7.123019294486637, + "learning_rate": 1.695997021281685e-06, + "loss": 0.9662, + "step": 7791 + }, + { + "epoch": 0.5621730817791566, + "grad_norm": 4.764015201022273, + "learning_rate": 1.6955351003719245e-06, + "loss": 0.9048, + "step": 7792 + }, + { + "epoch": 0.5622452292485841, + "grad_norm": 2.5655608156378933, + "learning_rate": 1.695073196087865e-06, + "loss": 0.8859, + "step": 7793 + }, + { + "epoch": 0.5623173767180116, + "grad_norm": 2.5613948951199106, + "learning_rate": 1.6946113084547298e-06, + "loss": 0.9697, + "step": 7794 + }, + { + "epoch": 0.5623895241874391, + "grad_norm": 2.420435225230147, + "learning_rate": 1.6941494374977402e-06, + "loss": 0.8905, + "step": 7795 + }, + { + "epoch": 0.5624616716568667, + "grad_norm": 3.897569945896327, + "learning_rate": 1.6936875832421173e-06, + "loss": 0.8454, + "step": 7796 + }, + { + "epoch": 0.5625338191262942, + "grad_norm": 6.940230923199457, + "learning_rate": 1.6932257457130826e-06, + "loss": 0.9605, + "step": 7797 + }, + { + "epoch": 0.5626059665957217, + "grad_norm": 3.0207558106746415, + "learning_rate": 1.692763924935853e-06, + "loss": 0.801, + "step": 7798 + }, + { + "epoch": 0.5626781140651491, + "grad_norm": 10.113945848914044, + "learning_rate": 1.692302120935649e-06, + "loss": 0.8111, + "step": 7799 + }, + { + "epoch": 0.5627502615345766, + "grad_norm": 2.602900988407058, + "learning_rate": 1.6918403337376873e-06, + "loss": 0.9356, + "step": 7800 + }, + { + "epoch": 0.5628224090040042, + "grad_norm": 2.9358359960334073, + "learning_rate": 1.6913785633671831e-06, + "loss": 0.849, + "step": 7801 + }, + { + "epoch": 0.5628945564734317, + "grad_norm": 3.3185254841812135, + "learning_rate": 1.690916809849354e-06, + "loss": 0.9591, + "step": 7802 + }, + { + "epoch": 0.5629667039428592, + "grad_norm": 2.2536966897474606, + "learning_rate": 1.6904550732094134e-06, + "loss": 0.8777, + "step": 7803 + }, + { + "epoch": 0.5630388514122867, + "grad_norm": 5.225549163432322, + "learning_rate": 1.6899933534725752e-06, + "loss": 1.001, + "step": 7804 + }, + { + "epoch": 0.5631109988817142, + "grad_norm": 3.0059647073812235, + "learning_rate": 1.689531650664053e-06, + "loss": 0.8497, + "step": 7805 + }, + { + "epoch": 0.5631831463511418, + "grad_norm": 3.0533756649075796, + "learning_rate": 1.6890699648090574e-06, + "loss": 0.9408, + "step": 7806 + }, + { + "epoch": 0.5632552938205693, + "grad_norm": 3.8693291792931617, + "learning_rate": 1.6886082959328e-06, + "loss": 0.9231, + "step": 7807 + }, + { + "epoch": 0.5633274412899968, + "grad_norm": 4.9738507270823735, + "learning_rate": 1.688146644060491e-06, + "loss": 0.8179, + "step": 7808 + }, + { + "epoch": 0.5633995887594243, + "grad_norm": 2.34755296527558, + "learning_rate": 1.6876850092173399e-06, + "loss": 0.8954, + "step": 7809 + }, + { + "epoch": 0.5634717362288518, + "grad_norm": 3.1563492561067887, + "learning_rate": 1.687223391428554e-06, + "loss": 0.858, + "step": 7810 + }, + { + "epoch": 0.5635438836982792, + "grad_norm": 3.6221647026576127, + "learning_rate": 1.68676179071934e-06, + "loss": 0.8891, + "step": 7811 + }, + { + "epoch": 0.5636160311677068, + "grad_norm": 3.654230848130789, + "learning_rate": 1.686300207114906e-06, + "loss": 0.9719, + "step": 7812 + }, + { + "epoch": 0.5636881786371343, + "grad_norm": 3.4774364958723125, + "learning_rate": 1.6858386406404554e-06, + "loss": 0.9217, + "step": 7813 + }, + { + "epoch": 0.5637603261065618, + "grad_norm": 3.6159084357340676, + "learning_rate": 1.6853770913211947e-06, + "loss": 0.9459, + "step": 7814 + }, + { + "epoch": 0.5638324735759893, + "grad_norm": 3.112256579091628, + "learning_rate": 1.684915559182326e-06, + "loss": 0.9138, + "step": 7815 + }, + { + "epoch": 0.5639046210454168, + "grad_norm": 9.08673391353197, + "learning_rate": 1.6844540442490517e-06, + "loss": 0.9291, + "step": 7816 + }, + { + "epoch": 0.5639767685148444, + "grad_norm": 3.0042737083747117, + "learning_rate": 1.6839925465465746e-06, + "loss": 0.8156, + "step": 7817 + }, + { + "epoch": 0.5640489159842719, + "grad_norm": 2.811172002284496, + "learning_rate": 1.6835310661000946e-06, + "loss": 0.8677, + "step": 7818 + }, + { + "epoch": 0.5641210634536994, + "grad_norm": 5.197060106092471, + "learning_rate": 1.6830696029348113e-06, + "loss": 0.8994, + "step": 7819 + }, + { + "epoch": 0.5641932109231269, + "grad_norm": 7.159208394197153, + "learning_rate": 1.6826081570759238e-06, + "loss": 0.798, + "step": 7820 + }, + { + "epoch": 0.5642653583925544, + "grad_norm": 2.445022127016261, + "learning_rate": 1.6821467285486303e-06, + "loss": 0.9388, + "step": 7821 + }, + { + "epoch": 0.5643375058619818, + "grad_norm": 3.243130392826204, + "learning_rate": 1.6816853173781273e-06, + "loss": 0.9375, + "step": 7822 + }, + { + "epoch": 0.5644096533314094, + "grad_norm": 2.206770133716964, + "learning_rate": 1.68122392358961e-06, + "loss": 0.8482, + "step": 7823 + }, + { + "epoch": 0.5644818008008369, + "grad_norm": 2.5146126931622192, + "learning_rate": 1.6807625472082749e-06, + "loss": 1.026, + "step": 7824 + }, + { + "epoch": 0.5645539482702644, + "grad_norm": 0.7169363941087326, + "learning_rate": 1.6803011882593146e-06, + "loss": 0.7379, + "step": 7825 + }, + { + "epoch": 0.5646260957396919, + "grad_norm": 3.450827131771683, + "learning_rate": 1.6798398467679236e-06, + "loss": 0.9049, + "step": 7826 + }, + { + "epoch": 0.5646982432091194, + "grad_norm": 2.3408436366237337, + "learning_rate": 1.6793785227592934e-06, + "loss": 0.9494, + "step": 7827 + }, + { + "epoch": 0.564770390678547, + "grad_norm": 3.2233292680285386, + "learning_rate": 1.6789172162586145e-06, + "loss": 0.8739, + "step": 7828 + }, + { + "epoch": 0.5648425381479745, + "grad_norm": 3.796318484364688, + "learning_rate": 1.6784559272910786e-06, + "loss": 0.9028, + "step": 7829 + }, + { + "epoch": 0.564914685617402, + "grad_norm": 2.762738461796298, + "learning_rate": 1.6779946558818736e-06, + "loss": 1.0123, + "step": 7830 + }, + { + "epoch": 0.5649868330868295, + "grad_norm": 2.231696597585415, + "learning_rate": 1.6775334020561886e-06, + "loss": 1.0213, + "step": 7831 + }, + { + "epoch": 0.565058980556257, + "grad_norm": 2.1537778753158254, + "learning_rate": 1.6770721658392112e-06, + "loss": 0.9749, + "step": 7832 + }, + { + "epoch": 0.5651311280256845, + "grad_norm": 4.8609985386968475, + "learning_rate": 1.6766109472561266e-06, + "loss": 0.7942, + "step": 7833 + }, + { + "epoch": 0.565203275495112, + "grad_norm": 2.653350324464513, + "learning_rate": 1.676149746332122e-06, + "loss": 0.8984, + "step": 7834 + }, + { + "epoch": 0.5652754229645395, + "grad_norm": 3.2843482512774047, + "learning_rate": 1.6756885630923798e-06, + "loss": 0.9127, + "step": 7835 + }, + { + "epoch": 0.565347570433967, + "grad_norm": 2.8719969282532314, + "learning_rate": 1.6752273975620854e-06, + "loss": 0.9134, + "step": 7836 + }, + { + "epoch": 0.5654197179033945, + "grad_norm": 2.221046656036522, + "learning_rate": 1.6747662497664199e-06, + "loss": 0.8949, + "step": 7837 + }, + { + "epoch": 0.565491865372822, + "grad_norm": 4.422976515022594, + "learning_rate": 1.6743051197305662e-06, + "loss": 0.9215, + "step": 7838 + }, + { + "epoch": 0.5655640128422496, + "grad_norm": 1.8298872580480876, + "learning_rate": 1.6738440074797042e-06, + "loss": 0.9329, + "step": 7839 + }, + { + "epoch": 0.5656361603116771, + "grad_norm": 3.7524294295209812, + "learning_rate": 1.6733829130390127e-06, + "loss": 0.8774, + "step": 7840 + }, + { + "epoch": 0.5657083077811046, + "grad_norm": 3.117922741601404, + "learning_rate": 1.672921836433672e-06, + "loss": 0.8972, + "step": 7841 + }, + { + "epoch": 0.5657804552505321, + "grad_norm": 2.4742896785030997, + "learning_rate": 1.672460777688859e-06, + "loss": 1.0026, + "step": 7842 + }, + { + "epoch": 0.5658526027199596, + "grad_norm": 4.871635449718714, + "learning_rate": 1.67199973682975e-06, + "loss": 0.9535, + "step": 7843 + }, + { + "epoch": 0.5659247501893871, + "grad_norm": 2.249850480091905, + "learning_rate": 1.6715387138815217e-06, + "loss": 1.0499, + "step": 7844 + }, + { + "epoch": 0.5659968976588147, + "grad_norm": 6.277135307872521, + "learning_rate": 1.6710777088693482e-06, + "loss": 0.9831, + "step": 7845 + }, + { + "epoch": 0.5660690451282421, + "grad_norm": 0.75151306751131, + "learning_rate": 1.6706167218184036e-06, + "loss": 0.8724, + "step": 7846 + }, + { + "epoch": 0.5661411925976696, + "grad_norm": 0.82656883280587, + "learning_rate": 1.6701557527538594e-06, + "loss": 0.7608, + "step": 7847 + }, + { + "epoch": 0.5662133400670971, + "grad_norm": 3.0569510500317207, + "learning_rate": 1.6696948017008897e-06, + "loss": 0.8818, + "step": 7848 + }, + { + "epoch": 0.5662854875365246, + "grad_norm": 4.170375368709918, + "learning_rate": 1.6692338686846638e-06, + "loss": 0.9285, + "step": 7849 + }, + { + "epoch": 0.5663576350059522, + "grad_norm": 3.0690464972003175, + "learning_rate": 1.6687729537303512e-06, + "loss": 0.8518, + "step": 7850 + }, + { + "epoch": 0.5664297824753797, + "grad_norm": 3.4311302122458467, + "learning_rate": 1.6683120568631224e-06, + "loss": 0.8856, + "step": 7851 + }, + { + "epoch": 0.5665019299448072, + "grad_norm": 2.633644583773622, + "learning_rate": 1.6678511781081433e-06, + "loss": 1.0551, + "step": 7852 + }, + { + "epoch": 0.5665740774142347, + "grad_norm": 2.464591471340731, + "learning_rate": 1.6673903174905825e-06, + "loss": 0.8832, + "step": 7853 + }, + { + "epoch": 0.5666462248836622, + "grad_norm": 2.670961101013079, + "learning_rate": 1.6669294750356056e-06, + "loss": 0.9088, + "step": 7854 + }, + { + "epoch": 0.5667183723530897, + "grad_norm": 3.7965824972127353, + "learning_rate": 1.6664686507683758e-06, + "loss": 0.9254, + "step": 7855 + }, + { + "epoch": 0.5667905198225173, + "grad_norm": 4.332134276558925, + "learning_rate": 1.6660078447140595e-06, + "loss": 0.8754, + "step": 7856 + }, + { + "epoch": 0.5668626672919448, + "grad_norm": 2.754479920717607, + "learning_rate": 1.6655470568978177e-06, + "loss": 0.9807, + "step": 7857 + }, + { + "epoch": 0.5669348147613722, + "grad_norm": 2.6831829972733297, + "learning_rate": 1.6650862873448136e-06, + "loss": 1.022, + "step": 7858 + }, + { + "epoch": 0.5670069622307997, + "grad_norm": 3.9711749260482, + "learning_rate": 1.6646255360802066e-06, + "loss": 1.0124, + "step": 7859 + }, + { + "epoch": 0.5670791097002272, + "grad_norm": 2.8354252964834883, + "learning_rate": 1.6641648031291585e-06, + "loss": 0.9012, + "step": 7860 + }, + { + "epoch": 0.5671512571696548, + "grad_norm": 3.560570713116169, + "learning_rate": 1.6637040885168272e-06, + "loss": 0.9992, + "step": 7861 + }, + { + "epoch": 0.5672234046390823, + "grad_norm": 3.609172864926381, + "learning_rate": 1.66324339226837e-06, + "loss": 0.9371, + "step": 7862 + }, + { + "epoch": 0.5672955521085098, + "grad_norm": 3.741827005690554, + "learning_rate": 1.6627827144089451e-06, + "loss": 0.927, + "step": 7863 + }, + { + "epoch": 0.5673676995779373, + "grad_norm": 2.8130323965768103, + "learning_rate": 1.6623220549637074e-06, + "loss": 0.9175, + "step": 7864 + }, + { + "epoch": 0.5674398470473648, + "grad_norm": 4.798941201417567, + "learning_rate": 1.6618614139578126e-06, + "loss": 0.9659, + "step": 7865 + }, + { + "epoch": 0.5675119945167924, + "grad_norm": 2.460187719052364, + "learning_rate": 1.6614007914164146e-06, + "loss": 0.7992, + "step": 7866 + }, + { + "epoch": 0.5675841419862199, + "grad_norm": 2.9981931967550657, + "learning_rate": 1.6609401873646649e-06, + "loss": 0.9891, + "step": 7867 + }, + { + "epoch": 0.5676562894556474, + "grad_norm": 5.028471564564601, + "learning_rate": 1.6604796018277175e-06, + "loss": 0.9258, + "step": 7868 + }, + { + "epoch": 0.5677284369250748, + "grad_norm": 4.555628788884074, + "learning_rate": 1.6600190348307214e-06, + "loss": 0.9494, + "step": 7869 + }, + { + "epoch": 0.5678005843945023, + "grad_norm": 9.503682276183302, + "learning_rate": 1.659558486398828e-06, + "loss": 0.8451, + "step": 7870 + }, + { + "epoch": 0.5678727318639298, + "grad_norm": 1.9728517341066094, + "learning_rate": 1.6590979565571845e-06, + "loss": 0.8778, + "step": 7871 + }, + { + "epoch": 0.5679448793333574, + "grad_norm": 2.7579758031114223, + "learning_rate": 1.6586374453309399e-06, + "loss": 1.013, + "step": 7872 + }, + { + "epoch": 0.5680170268027849, + "grad_norm": 0.7865989677210983, + "learning_rate": 1.6581769527452413e-06, + "loss": 0.7708, + "step": 7873 + }, + { + "epoch": 0.5680891742722124, + "grad_norm": 52.63364453569005, + "learning_rate": 1.657716478825233e-06, + "loss": 0.9036, + "step": 7874 + }, + { + "epoch": 0.5681613217416399, + "grad_norm": 3.119169976771728, + "learning_rate": 1.6572560235960614e-06, + "loss": 0.8834, + "step": 7875 + }, + { + "epoch": 0.5682334692110674, + "grad_norm": 2.880681105448262, + "learning_rate": 1.6567955870828696e-06, + "loss": 0.8542, + "step": 7876 + }, + { + "epoch": 0.568305616680495, + "grad_norm": 4.467466797024857, + "learning_rate": 1.6563351693107994e-06, + "loss": 0.8793, + "step": 7877 + }, + { + "epoch": 0.5683777641499225, + "grad_norm": 2.6624946343452938, + "learning_rate": 1.6558747703049944e-06, + "loss": 0.9978, + "step": 7878 + }, + { + "epoch": 0.56844991161935, + "grad_norm": 3.297482547554759, + "learning_rate": 1.6554143900905935e-06, + "loss": 1.0387, + "step": 7879 + }, + { + "epoch": 0.5685220590887775, + "grad_norm": 3.5178556020399983, + "learning_rate": 1.654954028692738e-06, + "loss": 0.9088, + "step": 7880 + }, + { + "epoch": 0.5685942065582049, + "grad_norm": 2.7543084466388428, + "learning_rate": 1.6544936861365648e-06, + "loss": 0.8434, + "step": 7881 + }, + { + "epoch": 0.5686663540276324, + "grad_norm": 3.1813222820711426, + "learning_rate": 1.6540333624472131e-06, + "loss": 0.9441, + "step": 7882 + }, + { + "epoch": 0.56873850149706, + "grad_norm": 3.72639154645879, + "learning_rate": 1.6535730576498184e-06, + "loss": 0.8288, + "step": 7883 + }, + { + "epoch": 0.5688106489664875, + "grad_norm": 10.413158426155013, + "learning_rate": 1.6531127717695167e-06, + "loss": 0.885, + "step": 7884 + }, + { + "epoch": 0.568882796435915, + "grad_norm": 3.3902086978014254, + "learning_rate": 1.6526525048314426e-06, + "loss": 0.9971, + "step": 7885 + }, + { + "epoch": 0.5689549439053425, + "grad_norm": 5.01878927365111, + "learning_rate": 1.6521922568607285e-06, + "loss": 0.9235, + "step": 7886 + }, + { + "epoch": 0.56902709137477, + "grad_norm": 56.80574853310402, + "learning_rate": 1.6517320278825086e-06, + "loss": 0.8618, + "step": 7887 + }, + { + "epoch": 0.5690992388441976, + "grad_norm": 4.8342976539899825, + "learning_rate": 1.6512718179219135e-06, + "loss": 0.9869, + "step": 7888 + }, + { + "epoch": 0.5691713863136251, + "grad_norm": 6.438296370644179, + "learning_rate": 1.6508116270040726e-06, + "loss": 0.8411, + "step": 7889 + }, + { + "epoch": 0.5692435337830526, + "grad_norm": 3.479122428079798, + "learning_rate": 1.650351455154117e-06, + "loss": 0.9325, + "step": 7890 + }, + { + "epoch": 0.5693156812524801, + "grad_norm": 2.976681524184833, + "learning_rate": 1.6498913023971732e-06, + "loss": 0.9695, + "step": 7891 + }, + { + "epoch": 0.5693878287219076, + "grad_norm": 0.7865721428651008, + "learning_rate": 1.6494311687583702e-06, + "loss": 0.8607, + "step": 7892 + }, + { + "epoch": 0.569459976191335, + "grad_norm": 0.8961857649530106, + "learning_rate": 1.6489710542628333e-06, + "loss": 0.789, + "step": 7893 + }, + { + "epoch": 0.5695321236607626, + "grad_norm": 8.520612519503485, + "learning_rate": 1.6485109589356871e-06, + "loss": 0.8644, + "step": 7894 + }, + { + "epoch": 0.5696042711301901, + "grad_norm": 3.0314931870273374, + "learning_rate": 1.6480508828020562e-06, + "loss": 0.9802, + "step": 7895 + }, + { + "epoch": 0.5696764185996176, + "grad_norm": 2.7020497030039365, + "learning_rate": 1.6475908258870641e-06, + "loss": 0.973, + "step": 7896 + }, + { + "epoch": 0.5697485660690451, + "grad_norm": 3.7609544336043146, + "learning_rate": 1.6471307882158324e-06, + "loss": 1.0725, + "step": 7897 + }, + { + "epoch": 0.5698207135384726, + "grad_norm": 2.4706003522041518, + "learning_rate": 1.646670769813482e-06, + "loss": 0.9267, + "step": 7898 + }, + { + "epoch": 0.5698928610079002, + "grad_norm": 2.8189039437829204, + "learning_rate": 1.646210770705133e-06, + "loss": 0.9889, + "step": 7899 + }, + { + "epoch": 0.5699650084773277, + "grad_norm": 3.47516032885078, + "learning_rate": 1.6457507909159041e-06, + "loss": 0.8121, + "step": 7900 + }, + { + "epoch": 0.5700371559467552, + "grad_norm": 3.651007912879295, + "learning_rate": 1.6452908304709126e-06, + "loss": 0.9107, + "step": 7901 + }, + { + "epoch": 0.5701093034161827, + "grad_norm": 6.406174673242355, + "learning_rate": 1.6448308893952764e-06, + "loss": 0.9441, + "step": 7902 + }, + { + "epoch": 0.5701814508856102, + "grad_norm": 3.343937378383716, + "learning_rate": 1.6443709677141097e-06, + "loss": 0.9431, + "step": 7903 + }, + { + "epoch": 0.5702535983550377, + "grad_norm": 3.5295107448947705, + "learning_rate": 1.643911065452529e-06, + "loss": 0.9446, + "step": 7904 + }, + { + "epoch": 0.5703257458244652, + "grad_norm": 2.520926439651848, + "learning_rate": 1.6434511826356467e-06, + "loss": 0.9221, + "step": 7905 + }, + { + "epoch": 0.5703978932938927, + "grad_norm": 3.0733790335357036, + "learning_rate": 1.642991319288575e-06, + "loss": 0.8881, + "step": 7906 + }, + { + "epoch": 0.5704700407633202, + "grad_norm": 3.131918915284836, + "learning_rate": 1.6425314754364258e-06, + "loss": 0.8973, + "step": 7907 + }, + { + "epoch": 0.5705421882327477, + "grad_norm": 3.5178722743648776, + "learning_rate": 1.6420716511043092e-06, + "loss": 0.9378, + "step": 7908 + }, + { + "epoch": 0.5706143357021752, + "grad_norm": 3.4209252001700783, + "learning_rate": 1.6416118463173355e-06, + "loss": 0.8833, + "step": 7909 + }, + { + "epoch": 0.5706864831716028, + "grad_norm": 7.084772150848845, + "learning_rate": 1.6411520611006115e-06, + "loss": 0.995, + "step": 7910 + }, + { + "epoch": 0.5707586306410303, + "grad_norm": 2.0821087735112846, + "learning_rate": 1.640692295479245e-06, + "loss": 0.8886, + "step": 7911 + }, + { + "epoch": 0.5708307781104578, + "grad_norm": 6.030067762813374, + "learning_rate": 1.6402325494783428e-06, + "loss": 0.8377, + "step": 7912 + }, + { + "epoch": 0.5709029255798853, + "grad_norm": 3.680247270320722, + "learning_rate": 1.6397728231230084e-06, + "loss": 0.993, + "step": 7913 + }, + { + "epoch": 0.5709750730493128, + "grad_norm": 3.1947553750534805, + "learning_rate": 1.6393131164383474e-06, + "loss": 0.8912, + "step": 7914 + }, + { + "epoch": 0.5710472205187404, + "grad_norm": 4.8985786478492965, + "learning_rate": 1.6388534294494618e-06, + "loss": 0.9338, + "step": 7915 + }, + { + "epoch": 0.5711193679881679, + "grad_norm": 8.114748076960804, + "learning_rate": 1.6383937621814525e-06, + "loss": 0.8019, + "step": 7916 + }, + { + "epoch": 0.5711915154575953, + "grad_norm": 5.744624071163765, + "learning_rate": 1.6379341146594222e-06, + "loss": 0.9553, + "step": 7917 + }, + { + "epoch": 0.5712636629270228, + "grad_norm": 4.192503487773757, + "learning_rate": 1.6374744869084689e-06, + "loss": 0.8671, + "step": 7918 + }, + { + "epoch": 0.5713358103964503, + "grad_norm": 4.26672416191304, + "learning_rate": 1.6370148789536925e-06, + "loss": 0.9625, + "step": 7919 + }, + { + "epoch": 0.5714079578658778, + "grad_norm": 2.954769423472199, + "learning_rate": 1.63655529082019e-06, + "loss": 0.9633, + "step": 7920 + }, + { + "epoch": 0.5714801053353054, + "grad_norm": 2.793553601899303, + "learning_rate": 1.6360957225330571e-06, + "loss": 1.0423, + "step": 7921 + }, + { + "epoch": 0.5715522528047329, + "grad_norm": 3.217058348321321, + "learning_rate": 1.6356361741173897e-06, + "loss": 1.0122, + "step": 7922 + }, + { + "epoch": 0.5716244002741604, + "grad_norm": 2.9668945939305194, + "learning_rate": 1.6351766455982821e-06, + "loss": 0.944, + "step": 7923 + }, + { + "epoch": 0.5716965477435879, + "grad_norm": 3.321059917025458, + "learning_rate": 1.6347171370008278e-06, + "loss": 1.0095, + "step": 7924 + }, + { + "epoch": 0.5717686952130154, + "grad_norm": 7.8864669872984345, + "learning_rate": 1.6342576483501173e-06, + "loss": 0.9987, + "step": 7925 + }, + { + "epoch": 0.571840842682443, + "grad_norm": 3.503439711659446, + "learning_rate": 1.6337981796712437e-06, + "loss": 0.8821, + "step": 7926 + }, + { + "epoch": 0.5719129901518705, + "grad_norm": 5.30267232784832, + "learning_rate": 1.633338730989296e-06, + "loss": 0.946, + "step": 7927 + }, + { + "epoch": 0.5719851376212979, + "grad_norm": 3.3869944177564104, + "learning_rate": 1.6328793023293617e-06, + "loss": 0.9448, + "step": 7928 + }, + { + "epoch": 0.5720572850907254, + "grad_norm": 0.848784157763705, + "learning_rate": 1.6324198937165307e-06, + "loss": 0.8154, + "step": 7929 + }, + { + "epoch": 0.5721294325601529, + "grad_norm": 5.902416135393333, + "learning_rate": 1.6319605051758879e-06, + "loss": 0.9599, + "step": 7930 + }, + { + "epoch": 0.5722015800295804, + "grad_norm": 2.9782525818787238, + "learning_rate": 1.63150113673252e-06, + "loss": 0.8366, + "step": 7931 + }, + { + "epoch": 0.572273727499008, + "grad_norm": 2.7201296933893873, + "learning_rate": 1.6310417884115112e-06, + "loss": 0.9349, + "step": 7932 + }, + { + "epoch": 0.5723458749684355, + "grad_norm": 3.803221747265667, + "learning_rate": 1.6305824602379436e-06, + "loss": 0.8724, + "step": 7933 + }, + { + "epoch": 0.572418022437863, + "grad_norm": 4.525133095148539, + "learning_rate": 1.6301231522369007e-06, + "loss": 0.9218, + "step": 7934 + }, + { + "epoch": 0.5724901699072905, + "grad_norm": 6.694021307454021, + "learning_rate": 1.6296638644334632e-06, + "loss": 0.859, + "step": 7935 + }, + { + "epoch": 0.572562317376718, + "grad_norm": 4.1927332275157045, + "learning_rate": 1.6292045968527114e-06, + "loss": 0.8146, + "step": 7936 + }, + { + "epoch": 0.5726344648461456, + "grad_norm": 2.7973634943243724, + "learning_rate": 1.6287453495197236e-06, + "loss": 0.9007, + "step": 7937 + }, + { + "epoch": 0.5727066123155731, + "grad_norm": 4.729696751041395, + "learning_rate": 1.6282861224595778e-06, + "loss": 0.8862, + "step": 7938 + }, + { + "epoch": 0.5727787597850006, + "grad_norm": 3.06234927682355, + "learning_rate": 1.6278269156973514e-06, + "loss": 0.8781, + "step": 7939 + }, + { + "epoch": 0.572850907254428, + "grad_norm": 4.121740729724082, + "learning_rate": 1.6273677292581185e-06, + "loss": 0.8347, + "step": 7940 + }, + { + "epoch": 0.5729230547238555, + "grad_norm": 4.128833261239922, + "learning_rate": 1.6269085631669554e-06, + "loss": 0.8366, + "step": 7941 + }, + { + "epoch": 0.572995202193283, + "grad_norm": 2.5851456735025127, + "learning_rate": 1.6264494174489345e-06, + "loss": 0.8785, + "step": 7942 + }, + { + "epoch": 0.5730673496627106, + "grad_norm": 2.362457283077871, + "learning_rate": 1.6259902921291271e-06, + "loss": 0.8172, + "step": 7943 + }, + { + "epoch": 0.5731394971321381, + "grad_norm": 2.8123016287464266, + "learning_rate": 1.6255311872326064e-06, + "loss": 0.9519, + "step": 7944 + }, + { + "epoch": 0.5732116446015656, + "grad_norm": 2.8432558070440215, + "learning_rate": 1.6250721027844405e-06, + "loss": 0.9375, + "step": 7945 + }, + { + "epoch": 0.5732837920709931, + "grad_norm": 16.75543215903241, + "learning_rate": 1.6246130388096995e-06, + "loss": 0.8853, + "step": 7946 + }, + { + "epoch": 0.5733559395404206, + "grad_norm": 7.75445182668151, + "learning_rate": 1.6241539953334508e-06, + "loss": 1.0541, + "step": 7947 + }, + { + "epoch": 0.5734280870098482, + "grad_norm": 3.114621590641962, + "learning_rate": 1.6236949723807612e-06, + "loss": 1.0071, + "step": 7948 + }, + { + "epoch": 0.5735002344792757, + "grad_norm": 3.950402207962538, + "learning_rate": 1.623235969976696e-06, + "loss": 0.8755, + "step": 7949 + }, + { + "epoch": 0.5735723819487032, + "grad_norm": 5.925311940669308, + "learning_rate": 1.6227769881463196e-06, + "loss": 0.8892, + "step": 7950 + }, + { + "epoch": 0.5736445294181307, + "grad_norm": 4.751298827231802, + "learning_rate": 1.622318026914696e-06, + "loss": 0.9891, + "step": 7951 + }, + { + "epoch": 0.5737166768875581, + "grad_norm": 3.6684043552491374, + "learning_rate": 1.6218590863068857e-06, + "loss": 0.9878, + "step": 7952 + }, + { + "epoch": 0.5737888243569856, + "grad_norm": 4.36328496565699, + "learning_rate": 1.621400166347952e-06, + "loss": 0.8016, + "step": 7953 + }, + { + "epoch": 0.5738609718264132, + "grad_norm": 5.968513524029533, + "learning_rate": 1.6209412670629535e-06, + "loss": 0.8723, + "step": 7954 + }, + { + "epoch": 0.5739331192958407, + "grad_norm": 0.6712645263449778, + "learning_rate": 1.6204823884769482e-06, + "loss": 0.7639, + "step": 7955 + }, + { + "epoch": 0.5740052667652682, + "grad_norm": 2.2313144236434774, + "learning_rate": 1.620023530614996e-06, + "loss": 0.8705, + "step": 7956 + }, + { + "epoch": 0.5740774142346957, + "grad_norm": 2.6369421744170807, + "learning_rate": 1.6195646935021517e-06, + "loss": 0.9599, + "step": 7957 + }, + { + "epoch": 0.5741495617041232, + "grad_norm": 0.6378666646070039, + "learning_rate": 1.619105877163471e-06, + "loss": 0.7742, + "step": 7958 + }, + { + "epoch": 0.5742217091735508, + "grad_norm": 2.35924433668483, + "learning_rate": 1.6186470816240086e-06, + "loss": 0.9708, + "step": 7959 + }, + { + "epoch": 0.5742938566429783, + "grad_norm": 5.377334087938981, + "learning_rate": 1.6181883069088172e-06, + "loss": 0.9837, + "step": 7960 + }, + { + "epoch": 0.5743660041124058, + "grad_norm": 2.7710265950276156, + "learning_rate": 1.6177295530429487e-06, + "loss": 0.8768, + "step": 7961 + }, + { + "epoch": 0.5744381515818333, + "grad_norm": 2.328718685772986, + "learning_rate": 1.6172708200514543e-06, + "loss": 0.9154, + "step": 7962 + }, + { + "epoch": 0.5745102990512608, + "grad_norm": 8.528251819778923, + "learning_rate": 1.616812107959384e-06, + "loss": 0.8336, + "step": 7963 + }, + { + "epoch": 0.5745824465206882, + "grad_norm": 2.94466273580261, + "learning_rate": 1.6163534167917855e-06, + "loss": 0.9349, + "step": 7964 + }, + { + "epoch": 0.5746545939901158, + "grad_norm": 2.7754395042372852, + "learning_rate": 1.615894746573707e-06, + "loss": 0.8928, + "step": 7965 + }, + { + "epoch": 0.5747267414595433, + "grad_norm": 2.814402805490055, + "learning_rate": 1.6154360973301946e-06, + "loss": 0.866, + "step": 7966 + }, + { + "epoch": 0.5747988889289708, + "grad_norm": 4.181934358845133, + "learning_rate": 1.6149774690862924e-06, + "loss": 0.863, + "step": 7967 + }, + { + "epoch": 0.5748710363983983, + "grad_norm": 3.137518109881048, + "learning_rate": 1.6145188618670465e-06, + "loss": 1.012, + "step": 7968 + }, + { + "epoch": 0.5749431838678258, + "grad_norm": 0.7056422209348813, + "learning_rate": 1.6140602756974979e-06, + "loss": 0.7512, + "step": 7969 + }, + { + "epoch": 0.5750153313372534, + "grad_norm": 3.7495024033063244, + "learning_rate": 1.613601710602689e-06, + "loss": 0.8542, + "step": 7970 + }, + { + "epoch": 0.5750874788066809, + "grad_norm": 2.052614740284618, + "learning_rate": 1.6131431666076607e-06, + "loss": 0.9189, + "step": 7971 + }, + { + "epoch": 0.5751596262761084, + "grad_norm": 6.101781316098783, + "learning_rate": 1.6126846437374516e-06, + "loss": 0.8231, + "step": 7972 + }, + { + "epoch": 0.5752317737455359, + "grad_norm": 2.1607668463789427, + "learning_rate": 1.6122261420171e-06, + "loss": 0.888, + "step": 7973 + }, + { + "epoch": 0.5753039212149634, + "grad_norm": 3.3193731437768417, + "learning_rate": 1.6117676614716437e-06, + "loss": 0.8651, + "step": 7974 + }, + { + "epoch": 0.5753760686843908, + "grad_norm": 2.646362301872095, + "learning_rate": 1.6113092021261186e-06, + "loss": 0.976, + "step": 7975 + }, + { + "epoch": 0.5754482161538184, + "grad_norm": 2.718647220466566, + "learning_rate": 1.6108507640055587e-06, + "loss": 0.9329, + "step": 7976 + }, + { + "epoch": 0.5755203636232459, + "grad_norm": 3.1770431432631305, + "learning_rate": 1.6103923471349975e-06, + "loss": 0.9085, + "step": 7977 + }, + { + "epoch": 0.5755925110926734, + "grad_norm": 4.741157130712961, + "learning_rate": 1.609933951539469e-06, + "loss": 0.8675, + "step": 7978 + }, + { + "epoch": 0.5756646585621009, + "grad_norm": 3.1252853263296902, + "learning_rate": 1.6094755772440023e-06, + "loss": 0.9939, + "step": 7979 + }, + { + "epoch": 0.5757368060315284, + "grad_norm": 3.4345891939522226, + "learning_rate": 1.6090172242736295e-06, + "loss": 0.8787, + "step": 7980 + }, + { + "epoch": 0.575808953500956, + "grad_norm": 3.213871045082765, + "learning_rate": 1.6085588926533787e-06, + "loss": 0.9615, + "step": 7981 + }, + { + "epoch": 0.5758811009703835, + "grad_norm": 0.7955330039010751, + "learning_rate": 1.608100582408277e-06, + "loss": 0.8629, + "step": 7982 + }, + { + "epoch": 0.575953248439811, + "grad_norm": 1.5671545510142024, + "learning_rate": 1.6076422935633524e-06, + "loss": 0.9889, + "step": 7983 + }, + { + "epoch": 0.5760253959092385, + "grad_norm": 3.312607961370815, + "learning_rate": 1.6071840261436294e-06, + "loss": 0.9344, + "step": 7984 + }, + { + "epoch": 0.576097543378666, + "grad_norm": 2.9629464694288155, + "learning_rate": 1.6067257801741325e-06, + "loss": 0.9105, + "step": 7985 + }, + { + "epoch": 0.5761696908480936, + "grad_norm": 2.7807700782496165, + "learning_rate": 1.6062675556798853e-06, + "loss": 0.959, + "step": 7986 + }, + { + "epoch": 0.576241838317521, + "grad_norm": 5.265345534880362, + "learning_rate": 1.6058093526859088e-06, + "loss": 0.9197, + "step": 7987 + }, + { + "epoch": 0.5763139857869485, + "grad_norm": 2.3578300217570867, + "learning_rate": 1.6053511712172245e-06, + "loss": 0.9522, + "step": 7988 + }, + { + "epoch": 0.576386133256376, + "grad_norm": 2.931033056889433, + "learning_rate": 1.604893011298852e-06, + "loss": 0.8966, + "step": 7989 + }, + { + "epoch": 0.5764582807258035, + "grad_norm": 3.285377304496083, + "learning_rate": 1.6044348729558095e-06, + "loss": 0.9547, + "step": 7990 + }, + { + "epoch": 0.576530428195231, + "grad_norm": 3.0177195494266362, + "learning_rate": 1.6039767562131137e-06, + "loss": 0.9737, + "step": 7991 + }, + { + "epoch": 0.5766025756646586, + "grad_norm": 1.0318781355793456, + "learning_rate": 1.6035186610957819e-06, + "loss": 0.717, + "step": 7992 + }, + { + "epoch": 0.5766747231340861, + "grad_norm": 2.8951029965752184, + "learning_rate": 1.6030605876288284e-06, + "loss": 0.9308, + "step": 7993 + }, + { + "epoch": 0.5767468706035136, + "grad_norm": 0.7854161757476918, + "learning_rate": 1.602602535837266e-06, + "loss": 0.776, + "step": 7994 + }, + { + "epoch": 0.5768190180729411, + "grad_norm": 7.38285771108842, + "learning_rate": 1.6021445057461084e-06, + "loss": 0.9399, + "step": 7995 + }, + { + "epoch": 0.5768911655423686, + "grad_norm": 0.7156642519917978, + "learning_rate": 1.6016864973803666e-06, + "loss": 0.8015, + "step": 7996 + }, + { + "epoch": 0.5769633130117962, + "grad_norm": 1.974275857018703, + "learning_rate": 1.6012285107650504e-06, + "loss": 0.9769, + "step": 7997 + }, + { + "epoch": 0.5770354604812237, + "grad_norm": 2.65656754334719, + "learning_rate": 1.6007705459251696e-06, + "loss": 0.9388, + "step": 7998 + }, + { + "epoch": 0.5771076079506511, + "grad_norm": 3.1680703481993793, + "learning_rate": 1.6003126028857313e-06, + "loss": 0.8313, + "step": 7999 + }, + { + "epoch": 0.5771797554200786, + "grad_norm": 3.2965547668267967, + "learning_rate": 1.5998546816717416e-06, + "loss": 1.0069, + "step": 8000 + }, + { + "epoch": 0.5772519028895061, + "grad_norm": 1.9843300641596477, + "learning_rate": 1.5993967823082066e-06, + "loss": 0.9559, + "step": 8001 + }, + { + "epoch": 0.5773240503589336, + "grad_norm": 6.5094576767658205, + "learning_rate": 1.598938904820131e-06, + "loss": 0.9036, + "step": 8002 + }, + { + "epoch": 0.5773961978283612, + "grad_norm": 9.113661737261838, + "learning_rate": 1.5984810492325165e-06, + "loss": 0.84, + "step": 8003 + }, + { + "epoch": 0.5774683452977887, + "grad_norm": 3.7838006362576566, + "learning_rate": 1.5980232155703655e-06, + "loss": 0.8605, + "step": 8004 + }, + { + "epoch": 0.5775404927672162, + "grad_norm": 2.4166238276751524, + "learning_rate": 1.597565403858679e-06, + "loss": 0.9048, + "step": 8005 + }, + { + "epoch": 0.5776126402366437, + "grad_norm": 3.0580503875093994, + "learning_rate": 1.5971076141224554e-06, + "loss": 0.9585, + "step": 8006 + }, + { + "epoch": 0.5776847877060712, + "grad_norm": 3.8188672286550123, + "learning_rate": 1.5966498463866941e-06, + "loss": 0.8804, + "step": 8007 + }, + { + "epoch": 0.5777569351754988, + "grad_norm": 2.7144925336273094, + "learning_rate": 1.5961921006763918e-06, + "loss": 0.9057, + "step": 8008 + }, + { + "epoch": 0.5778290826449263, + "grad_norm": 2.5784186918418532, + "learning_rate": 1.595734377016543e-06, + "loss": 0.9448, + "step": 8009 + }, + { + "epoch": 0.5779012301143538, + "grad_norm": 2.5102872907507003, + "learning_rate": 1.5952766754321445e-06, + "loss": 0.9004, + "step": 8010 + }, + { + "epoch": 0.5779733775837812, + "grad_norm": 3.1528749455636964, + "learning_rate": 1.5948189959481879e-06, + "loss": 0.8547, + "step": 8011 + }, + { + "epoch": 0.5780455250532087, + "grad_norm": 5.472304404341412, + "learning_rate": 1.5943613385896659e-06, + "loss": 0.9856, + "step": 8012 + }, + { + "epoch": 0.5781176725226362, + "grad_norm": 0.743528900621193, + "learning_rate": 1.5939037033815696e-06, + "loss": 0.8009, + "step": 8013 + }, + { + "epoch": 0.5781898199920638, + "grad_norm": 2.706421622457343, + "learning_rate": 1.5934460903488892e-06, + "loss": 0.9409, + "step": 8014 + }, + { + "epoch": 0.5782619674614913, + "grad_norm": 2.6192710667613412, + "learning_rate": 1.5929884995166125e-06, + "loss": 0.974, + "step": 8015 + }, + { + "epoch": 0.5783341149309188, + "grad_norm": 3.953451150700684, + "learning_rate": 1.5925309309097271e-06, + "loss": 0.959, + "step": 8016 + }, + { + "epoch": 0.5784062624003463, + "grad_norm": 3.209479750546035, + "learning_rate": 1.5920733845532194e-06, + "loss": 0.9375, + "step": 8017 + }, + { + "epoch": 0.5784784098697738, + "grad_norm": 11.441464263466782, + "learning_rate": 1.5916158604720733e-06, + "loss": 0.8431, + "step": 8018 + }, + { + "epoch": 0.5785505573392014, + "grad_norm": 2.6375479743535664, + "learning_rate": 1.5911583586912742e-06, + "loss": 1.0952, + "step": 8019 + }, + { + "epoch": 0.5786227048086289, + "grad_norm": 2.803101497623917, + "learning_rate": 1.5907008792358036e-06, + "loss": 0.9046, + "step": 8020 + }, + { + "epoch": 0.5786948522780564, + "grad_norm": 2.3982775507473484, + "learning_rate": 1.590243422130642e-06, + "loss": 0.8931, + "step": 8021 + }, + { + "epoch": 0.5787669997474839, + "grad_norm": 2.238424512928534, + "learning_rate": 1.589785987400771e-06, + "loss": 0.938, + "step": 8022 + }, + { + "epoch": 0.5788391472169113, + "grad_norm": 4.869122018177946, + "learning_rate": 1.5893285750711685e-06, + "loss": 0.8456, + "step": 8023 + }, + { + "epoch": 0.5789112946863388, + "grad_norm": 3.1367929338207787, + "learning_rate": 1.588871185166812e-06, + "loss": 0.927, + "step": 8024 + }, + { + "epoch": 0.5789834421557664, + "grad_norm": 3.3430641397843988, + "learning_rate": 1.5884138177126786e-06, + "loss": 0.977, + "step": 8025 + }, + { + "epoch": 0.5790555896251939, + "grad_norm": 0.7024982626136663, + "learning_rate": 1.5879564727337427e-06, + "loss": 0.7991, + "step": 8026 + }, + { + "epoch": 0.5791277370946214, + "grad_norm": 3.108294888856086, + "learning_rate": 1.5874991502549784e-06, + "loss": 0.7971, + "step": 8027 + }, + { + "epoch": 0.5791998845640489, + "grad_norm": 2.1543955914315926, + "learning_rate": 1.5870418503013582e-06, + "loss": 0.8573, + "step": 8028 + }, + { + "epoch": 0.5792720320334764, + "grad_norm": 1.9776115196988155, + "learning_rate": 1.5865845728978546e-06, + "loss": 0.8718, + "step": 8029 + }, + { + "epoch": 0.579344179502904, + "grad_norm": 2.6696791521093917, + "learning_rate": 1.5861273180694367e-06, + "loss": 0.928, + "step": 8030 + }, + { + "epoch": 0.5794163269723315, + "grad_norm": 4.521646991878628, + "learning_rate": 1.5856700858410736e-06, + "loss": 0.9528, + "step": 8031 + }, + { + "epoch": 0.579488474441759, + "grad_norm": 6.031022932294594, + "learning_rate": 1.585212876237734e-06, + "loss": 0.8031, + "step": 8032 + }, + { + "epoch": 0.5795606219111865, + "grad_norm": 0.738661814297852, + "learning_rate": 1.5847556892843828e-06, + "loss": 0.8074, + "step": 8033 + }, + { + "epoch": 0.5796327693806139, + "grad_norm": 2.8763391444298247, + "learning_rate": 1.5842985250059871e-06, + "loss": 0.93, + "step": 8034 + }, + { + "epoch": 0.5797049168500414, + "grad_norm": 3.826451274334798, + "learning_rate": 1.5838413834275092e-06, + "loss": 0.8852, + "step": 8035 + }, + { + "epoch": 0.579777064319469, + "grad_norm": 5.330598765910355, + "learning_rate": 1.5833842645739134e-06, + "loss": 0.8349, + "step": 8036 + }, + { + "epoch": 0.5798492117888965, + "grad_norm": 0.8078552042964483, + "learning_rate": 1.5829271684701609e-06, + "loss": 0.777, + "step": 8037 + }, + { + "epoch": 0.579921359258324, + "grad_norm": 2.44639753455243, + "learning_rate": 1.5824700951412111e-06, + "loss": 0.8691, + "step": 8038 + }, + { + "epoch": 0.5799935067277515, + "grad_norm": 0.7775862809049272, + "learning_rate": 1.582013044612024e-06, + "loss": 0.8044, + "step": 8039 + }, + { + "epoch": 0.580065654197179, + "grad_norm": 5.340923253230607, + "learning_rate": 1.5815560169075572e-06, + "loss": 0.8479, + "step": 8040 + }, + { + "epoch": 0.5801378016666066, + "grad_norm": 0.9385071430920146, + "learning_rate": 1.5810990120527676e-06, + "loss": 0.7953, + "step": 8041 + }, + { + "epoch": 0.5802099491360341, + "grad_norm": 3.9038624295166073, + "learning_rate": 1.58064203007261e-06, + "loss": 1.0325, + "step": 8042 + }, + { + "epoch": 0.5802820966054616, + "grad_norm": 0.7734895168499033, + "learning_rate": 1.5801850709920386e-06, + "loss": 0.7917, + "step": 8043 + }, + { + "epoch": 0.5803542440748891, + "grad_norm": 2.606037403830727, + "learning_rate": 1.5797281348360068e-06, + "loss": 1.0194, + "step": 8044 + }, + { + "epoch": 0.5804263915443166, + "grad_norm": 2.4592191964303036, + "learning_rate": 1.5792712216294649e-06, + "loss": 1.0089, + "step": 8045 + }, + { + "epoch": 0.580498539013744, + "grad_norm": 6.535194775364627, + "learning_rate": 1.578814331397365e-06, + "loss": 0.8561, + "step": 8046 + }, + { + "epoch": 0.5805706864831716, + "grad_norm": 0.9400860722175058, + "learning_rate": 1.578357464164655e-06, + "loss": 0.9145, + "step": 8047 + }, + { + "epoch": 0.5806428339525991, + "grad_norm": 2.224948445269338, + "learning_rate": 1.5779006199562826e-06, + "loss": 0.8862, + "step": 8048 + }, + { + "epoch": 0.5807149814220266, + "grad_norm": 2.2620393990239904, + "learning_rate": 1.5774437987971954e-06, + "loss": 0.9592, + "step": 8049 + }, + { + "epoch": 0.5807871288914541, + "grad_norm": 3.582358841387786, + "learning_rate": 1.5769870007123377e-06, + "loss": 0.9447, + "step": 8050 + }, + { + "epoch": 0.5808592763608816, + "grad_norm": 2.161023371183945, + "learning_rate": 1.576530225726654e-06, + "loss": 0.9479, + "step": 8051 + }, + { + "epoch": 0.5809314238303092, + "grad_norm": 2.8424355959075527, + "learning_rate": 1.5760734738650873e-06, + "loss": 0.8326, + "step": 8052 + }, + { + "epoch": 0.5810035712997367, + "grad_norm": 18.113905043038155, + "learning_rate": 1.5756167451525789e-06, + "loss": 0.9077, + "step": 8053 + }, + { + "epoch": 0.5810757187691642, + "grad_norm": 2.8035237652768212, + "learning_rate": 1.5751600396140686e-06, + "loss": 0.9239, + "step": 8054 + }, + { + "epoch": 0.5811478662385917, + "grad_norm": 2.875245125313605, + "learning_rate": 1.5747033572744958e-06, + "loss": 0.7847, + "step": 8055 + }, + { + "epoch": 0.5812200137080192, + "grad_norm": 2.1598958691469043, + "learning_rate": 1.5742466981587989e-06, + "loss": 0.9417, + "step": 8056 + }, + { + "epoch": 0.5812921611774468, + "grad_norm": 3.3284032284486598, + "learning_rate": 1.573790062291913e-06, + "loss": 1.0241, + "step": 8057 + }, + { + "epoch": 0.5813643086468742, + "grad_norm": 5.31052263148429, + "learning_rate": 1.5733334496987744e-06, + "loss": 0.9206, + "step": 8058 + }, + { + "epoch": 0.5814364561163017, + "grad_norm": 3.037347854131876, + "learning_rate": 1.572876860404317e-06, + "loss": 0.9054, + "step": 8059 + }, + { + "epoch": 0.5815086035857292, + "grad_norm": 2.9110178241714775, + "learning_rate": 1.5724202944334724e-06, + "loss": 0.8375, + "step": 8060 + }, + { + "epoch": 0.5815807510551567, + "grad_norm": 2.095132058472149, + "learning_rate": 1.5719637518111732e-06, + "loss": 0.905, + "step": 8061 + }, + { + "epoch": 0.5816528985245842, + "grad_norm": 0.9413356833701026, + "learning_rate": 1.5715072325623487e-06, + "loss": 0.8268, + "step": 8062 + }, + { + "epoch": 0.5817250459940118, + "grad_norm": 2.7663187999417884, + "learning_rate": 1.571050736711928e-06, + "loss": 0.9093, + "step": 8063 + }, + { + "epoch": 0.5817971934634393, + "grad_norm": 3.585032496391505, + "learning_rate": 1.5705942642848392e-06, + "loss": 0.9031, + "step": 8064 + }, + { + "epoch": 0.5818693409328668, + "grad_norm": 3.188040294856836, + "learning_rate": 1.5701378153060072e-06, + "loss": 0.8469, + "step": 8065 + }, + { + "epoch": 0.5819414884022943, + "grad_norm": 1.7735324111613922, + "learning_rate": 1.569681389800358e-06, + "loss": 1.0424, + "step": 8066 + }, + { + "epoch": 0.5820136358717218, + "grad_norm": 3.3280934793475065, + "learning_rate": 1.5692249877928153e-06, + "loss": 0.9062, + "step": 8067 + }, + { + "epoch": 0.5820857833411494, + "grad_norm": 3.6259889240295617, + "learning_rate": 1.5687686093083016e-06, + "loss": 0.9015, + "step": 8068 + }, + { + "epoch": 0.5821579308105769, + "grad_norm": 0.8387433317007493, + "learning_rate": 1.568312254371738e-06, + "loss": 0.8616, + "step": 8069 + }, + { + "epoch": 0.5822300782800043, + "grad_norm": 13.578319412086335, + "learning_rate": 1.567855923008043e-06, + "loss": 0.9711, + "step": 8070 + }, + { + "epoch": 0.5823022257494318, + "grad_norm": 4.74117664202967, + "learning_rate": 1.567399615242137e-06, + "loss": 0.8545, + "step": 8071 + }, + { + "epoch": 0.5823743732188593, + "grad_norm": 2.8765461951792966, + "learning_rate": 1.5669433310989361e-06, + "loss": 0.8883, + "step": 8072 + }, + { + "epoch": 0.5824465206882868, + "grad_norm": 15.781387298288886, + "learning_rate": 1.5664870706033577e-06, + "loss": 0.8978, + "step": 8073 + }, + { + "epoch": 0.5825186681577144, + "grad_norm": 2.5772847973943205, + "learning_rate": 1.5660308337803152e-06, + "loss": 0.9453, + "step": 8074 + }, + { + "epoch": 0.5825908156271419, + "grad_norm": 7.9497746753550205, + "learning_rate": 1.5655746206547219e-06, + "loss": 0.9002, + "step": 8075 + }, + { + "epoch": 0.5826629630965694, + "grad_norm": 2.806654896231984, + "learning_rate": 1.565118431251491e-06, + "loss": 0.9613, + "step": 8076 + }, + { + "epoch": 0.5827351105659969, + "grad_norm": 2.6812245934623156, + "learning_rate": 1.5646622655955322e-06, + "loss": 0.9504, + "step": 8077 + }, + { + "epoch": 0.5828072580354244, + "grad_norm": 2.5280887973678277, + "learning_rate": 1.5642061237117555e-06, + "loss": 0.897, + "step": 8078 + }, + { + "epoch": 0.582879405504852, + "grad_norm": 2.6937413155754277, + "learning_rate": 1.5637500056250692e-06, + "loss": 0.9064, + "step": 8079 + }, + { + "epoch": 0.5829515529742795, + "grad_norm": 3.0718249606988293, + "learning_rate": 1.5632939113603807e-06, + "loss": 0.9245, + "step": 8080 + }, + { + "epoch": 0.5830237004437069, + "grad_norm": 2.7338394948563596, + "learning_rate": 1.5628378409425951e-06, + "loss": 0.9757, + "step": 8081 + }, + { + "epoch": 0.5830958479131344, + "grad_norm": 3.4309553787678717, + "learning_rate": 1.5623817943966156e-06, + "loss": 0.8691, + "step": 8082 + }, + { + "epoch": 0.5831679953825619, + "grad_norm": 3.6494311790891745, + "learning_rate": 1.5619257717473474e-06, + "loss": 0.8943, + "step": 8083 + }, + { + "epoch": 0.5832401428519894, + "grad_norm": 2.8163924514495977, + "learning_rate": 1.56146977301969e-06, + "loss": 0.9405, + "step": 8084 + }, + { + "epoch": 0.583312290321417, + "grad_norm": 5.768873669258604, + "learning_rate": 1.561013798238546e-06, + "loss": 0.9493, + "step": 8085 + }, + { + "epoch": 0.5833844377908445, + "grad_norm": 7.977474209114065, + "learning_rate": 1.560557847428813e-06, + "loss": 1.0241, + "step": 8086 + }, + { + "epoch": 0.583456585260272, + "grad_norm": 5.763022892660424, + "learning_rate": 1.5601019206153888e-06, + "loss": 0.9126, + "step": 8087 + }, + { + "epoch": 0.5835287327296995, + "grad_norm": 2.4590840459595467, + "learning_rate": 1.559646017823171e-06, + "loss": 0.8861, + "step": 8088 + }, + { + "epoch": 0.583600880199127, + "grad_norm": 8.24800316891392, + "learning_rate": 1.5591901390770535e-06, + "loss": 0.9957, + "step": 8089 + }, + { + "epoch": 0.5836730276685546, + "grad_norm": 5.1399799162476025, + "learning_rate": 1.5587342844019308e-06, + "loss": 0.8305, + "step": 8090 + }, + { + "epoch": 0.5837451751379821, + "grad_norm": 3.1730346063313073, + "learning_rate": 1.5582784538226956e-06, + "loss": 0.9432, + "step": 8091 + }, + { + "epoch": 0.5838173226074096, + "grad_norm": 4.173705196043027, + "learning_rate": 1.5578226473642384e-06, + "loss": 0.914, + "step": 8092 + }, + { + "epoch": 0.583889470076837, + "grad_norm": 2.488690640533404, + "learning_rate": 1.5573668650514501e-06, + "loss": 0.9216, + "step": 8093 + }, + { + "epoch": 0.5839616175462645, + "grad_norm": 2.991173953513607, + "learning_rate": 1.5569111069092178e-06, + "loss": 0.913, + "step": 8094 + }, + { + "epoch": 0.584033765015692, + "grad_norm": 99.93263844491699, + "learning_rate": 1.5564553729624308e-06, + "loss": 0.8783, + "step": 8095 + }, + { + "epoch": 0.5841059124851196, + "grad_norm": 7.327361044683849, + "learning_rate": 1.5559996632359736e-06, + "loss": 0.8515, + "step": 8096 + }, + { + "epoch": 0.5841780599545471, + "grad_norm": 2.307533704980619, + "learning_rate": 1.5555439777547306e-06, + "loss": 0.8816, + "step": 8097 + }, + { + "epoch": 0.5842502074239746, + "grad_norm": 0.8247278819545414, + "learning_rate": 1.5550883165435863e-06, + "loss": 0.8592, + "step": 8098 + }, + { + "epoch": 0.5843223548934021, + "grad_norm": 2.1657125744307826, + "learning_rate": 1.5546326796274212e-06, + "loss": 1.0392, + "step": 8099 + }, + { + "epoch": 0.5843945023628296, + "grad_norm": 4.521383342891203, + "learning_rate": 1.5541770670311177e-06, + "loss": 0.9859, + "step": 8100 + }, + { + "epoch": 0.5844666498322572, + "grad_norm": 3.84828067882846, + "learning_rate": 1.5537214787795537e-06, + "loss": 0.8436, + "step": 8101 + }, + { + "epoch": 0.5845387973016847, + "grad_norm": 2.870455093455127, + "learning_rate": 1.5532659148976074e-06, + "loss": 0.9727, + "step": 8102 + }, + { + "epoch": 0.5846109447711122, + "grad_norm": 3.5920183073057315, + "learning_rate": 1.5528103754101565e-06, + "loss": 0.9563, + "step": 8103 + }, + { + "epoch": 0.5846830922405397, + "grad_norm": 5.1607808869194365, + "learning_rate": 1.552354860342075e-06, + "loss": 0.8272, + "step": 8104 + }, + { + "epoch": 0.5847552397099671, + "grad_norm": 3.7562748862154627, + "learning_rate": 1.551899369718238e-06, + "loss": 0.8695, + "step": 8105 + }, + { + "epoch": 0.5848273871793946, + "grad_norm": 3.2857865716552888, + "learning_rate": 1.5514439035635168e-06, + "loss": 0.9392, + "step": 8106 + }, + { + "epoch": 0.5848995346488222, + "grad_norm": 3.3741434211209733, + "learning_rate": 1.5509884619027843e-06, + "loss": 0.997, + "step": 8107 + }, + { + "epoch": 0.5849716821182497, + "grad_norm": 2.432937656091445, + "learning_rate": 1.5505330447609096e-06, + "loss": 0.9348, + "step": 8108 + }, + { + "epoch": 0.5850438295876772, + "grad_norm": 9.47865386805482, + "learning_rate": 1.5500776521627608e-06, + "loss": 0.8291, + "step": 8109 + }, + { + "epoch": 0.5851159770571047, + "grad_norm": 0.8132721826229269, + "learning_rate": 1.5496222841332064e-06, + "loss": 0.8818, + "step": 8110 + }, + { + "epoch": 0.5851881245265322, + "grad_norm": 3.17665574161643, + "learning_rate": 1.5491669406971114e-06, + "loss": 1.0678, + "step": 8111 + }, + { + "epoch": 0.5852602719959598, + "grad_norm": 2.89290549559105, + "learning_rate": 1.5487116218793415e-06, + "loss": 0.8655, + "step": 8112 + }, + { + "epoch": 0.5853324194653873, + "grad_norm": 2.987400777916808, + "learning_rate": 1.548256327704759e-06, + "loss": 0.96, + "step": 8113 + }, + { + "epoch": 0.5854045669348148, + "grad_norm": 4.3678565425580524, + "learning_rate": 1.5478010581982259e-06, + "loss": 0.9184, + "step": 8114 + }, + { + "epoch": 0.5854767144042423, + "grad_norm": 3.872645862804748, + "learning_rate": 1.5473458133846036e-06, + "loss": 0.8964, + "step": 8115 + }, + { + "epoch": 0.5855488618736698, + "grad_norm": 2.6205088933638394, + "learning_rate": 1.5468905932887502e-06, + "loss": 0.9116, + "step": 8116 + }, + { + "epoch": 0.5856210093430972, + "grad_norm": 3.441114848483215, + "learning_rate": 1.5464353979355247e-06, + "loss": 0.9753, + "step": 8117 + }, + { + "epoch": 0.5856931568125248, + "grad_norm": 3.6730932933613105, + "learning_rate": 1.545980227349782e-06, + "loss": 0.974, + "step": 8118 + }, + { + "epoch": 0.5857653042819523, + "grad_norm": 2.5930871691073856, + "learning_rate": 1.5455250815563796e-06, + "loss": 1.0567, + "step": 8119 + }, + { + "epoch": 0.5858374517513798, + "grad_norm": 3.3338868317416464, + "learning_rate": 1.5450699605801698e-06, + "loss": 0.8411, + "step": 8120 + }, + { + "epoch": 0.5859095992208073, + "grad_norm": 4.835479172293434, + "learning_rate": 1.5446148644460047e-06, + "loss": 0.9545, + "step": 8121 + }, + { + "epoch": 0.5859817466902348, + "grad_norm": 4.4307612441471855, + "learning_rate": 1.544159793178737e-06, + "loss": 1.0709, + "step": 8122 + }, + { + "epoch": 0.5860538941596624, + "grad_norm": 3.7624780952094796, + "learning_rate": 1.5437047468032142e-06, + "loss": 0.9357, + "step": 8123 + }, + { + "epoch": 0.5861260416290899, + "grad_norm": 2.4956760206197117, + "learning_rate": 1.5432497253442873e-06, + "loss": 0.9078, + "step": 8124 + }, + { + "epoch": 0.5861981890985174, + "grad_norm": 0.7011495815122697, + "learning_rate": 1.542794728826802e-06, + "loss": 0.7981, + "step": 8125 + }, + { + "epoch": 0.5862703365679449, + "grad_norm": 2.5888619381489124, + "learning_rate": 1.542339757275603e-06, + "loss": 0.9514, + "step": 8126 + }, + { + "epoch": 0.5863424840373724, + "grad_norm": 3.1183270735264115, + "learning_rate": 1.541884810715537e-06, + "loss": 0.9376, + "step": 8127 + }, + { + "epoch": 0.5864146315067998, + "grad_norm": 2.0142211756855497, + "learning_rate": 1.5414298891714448e-06, + "loss": 0.9546, + "step": 8128 + }, + { + "epoch": 0.5864867789762274, + "grad_norm": 3.046817720314951, + "learning_rate": 1.5409749926681693e-06, + "loss": 0.9886, + "step": 8129 + }, + { + "epoch": 0.5865589264456549, + "grad_norm": 7.2344765048393285, + "learning_rate": 1.5405201212305496e-06, + "loss": 0.9461, + "step": 8130 + }, + { + "epoch": 0.5866310739150824, + "grad_norm": 2.848319270808143, + "learning_rate": 1.5400652748834254e-06, + "loss": 0.9579, + "step": 8131 + }, + { + "epoch": 0.5867032213845099, + "grad_norm": 4.7619911222347096, + "learning_rate": 1.539610453651634e-06, + "loss": 0.9444, + "step": 8132 + }, + { + "epoch": 0.5867753688539374, + "grad_norm": 4.091818783503401, + "learning_rate": 1.539155657560011e-06, + "loss": 0.7985, + "step": 8133 + }, + { + "epoch": 0.586847516323365, + "grad_norm": 3.4383419046250454, + "learning_rate": 1.538700886633392e-06, + "loss": 0.9329, + "step": 8134 + }, + { + "epoch": 0.5869196637927925, + "grad_norm": 2.6224653631928563, + "learning_rate": 1.53824614089661e-06, + "loss": 0.8667, + "step": 8135 + }, + { + "epoch": 0.58699181126222, + "grad_norm": 12.994317206497843, + "learning_rate": 1.5377914203744965e-06, + "loss": 0.8179, + "step": 8136 + }, + { + "epoch": 0.5870639587316475, + "grad_norm": 3.7721009345602425, + "learning_rate": 1.5373367250918828e-06, + "loss": 0.946, + "step": 8137 + }, + { + "epoch": 0.587136106201075, + "grad_norm": 4.362442960612176, + "learning_rate": 1.536882055073597e-06, + "loss": 0.8922, + "step": 8138 + }, + { + "epoch": 0.5872082536705026, + "grad_norm": 2.669176847462488, + "learning_rate": 1.5364274103444687e-06, + "loss": 0.9272, + "step": 8139 + }, + { + "epoch": 0.58728040113993, + "grad_norm": 2.743378383348464, + "learning_rate": 1.5359727909293232e-06, + "loss": 0.8821, + "step": 8140 + }, + { + "epoch": 0.5873525486093575, + "grad_norm": 2.592596509970879, + "learning_rate": 1.5355181968529862e-06, + "loss": 0.8728, + "step": 8141 + }, + { + "epoch": 0.587424696078785, + "grad_norm": 4.381786858811118, + "learning_rate": 1.5350636281402804e-06, + "loss": 0.8915, + "step": 8142 + }, + { + "epoch": 0.5874968435482125, + "grad_norm": 2.5582039339556766, + "learning_rate": 1.534609084816029e-06, + "loss": 0.8948, + "step": 8143 + }, + { + "epoch": 0.58756899101764, + "grad_norm": 2.556716439891215, + "learning_rate": 1.5341545669050528e-06, + "loss": 0.9497, + "step": 8144 + }, + { + "epoch": 0.5876411384870676, + "grad_norm": 2.6840763019114084, + "learning_rate": 1.5337000744321707e-06, + "loss": 0.9757, + "step": 8145 + }, + { + "epoch": 0.5877132859564951, + "grad_norm": 2.3431872900342343, + "learning_rate": 1.5332456074222018e-06, + "loss": 0.9425, + "step": 8146 + }, + { + "epoch": 0.5877854334259226, + "grad_norm": 2.746664538691292, + "learning_rate": 1.5327911658999625e-06, + "loss": 0.9, + "step": 8147 + }, + { + "epoch": 0.5878575808953501, + "grad_norm": 3.1782119664059163, + "learning_rate": 1.5323367498902676e-06, + "loss": 0.9582, + "step": 8148 + }, + { + "epoch": 0.5879297283647776, + "grad_norm": 2.7471859145157205, + "learning_rate": 1.531882359417932e-06, + "loss": 0.8838, + "step": 8149 + }, + { + "epoch": 0.5880018758342052, + "grad_norm": 2.471247218839077, + "learning_rate": 1.531427994507767e-06, + "loss": 0.7834, + "step": 8150 + }, + { + "epoch": 0.5880740233036327, + "grad_norm": 0.849114109249685, + "learning_rate": 1.5309736551845857e-06, + "loss": 0.7711, + "step": 8151 + }, + { + "epoch": 0.5881461707730601, + "grad_norm": 5.523407457153975, + "learning_rate": 1.5305193414731968e-06, + "loss": 0.9333, + "step": 8152 + }, + { + "epoch": 0.5882183182424876, + "grad_norm": 3.6540463319376872, + "learning_rate": 1.5300650533984082e-06, + "loss": 0.8704, + "step": 8153 + }, + { + "epoch": 0.5882904657119151, + "grad_norm": 0.7649546432762118, + "learning_rate": 1.5296107909850272e-06, + "loss": 0.8907, + "step": 8154 + }, + { + "epoch": 0.5883626131813426, + "grad_norm": 4.908555964983689, + "learning_rate": 1.52915655425786e-06, + "loss": 0.9279, + "step": 8155 + }, + { + "epoch": 0.5884347606507702, + "grad_norm": 0.7245094662292075, + "learning_rate": 1.5287023432417106e-06, + "loss": 0.7198, + "step": 8156 + }, + { + "epoch": 0.5885069081201977, + "grad_norm": 3.468302517457221, + "learning_rate": 1.528248157961381e-06, + "loss": 0.943, + "step": 8157 + }, + { + "epoch": 0.5885790555896252, + "grad_norm": 2.1915216806511215, + "learning_rate": 1.5277939984416733e-06, + "loss": 0.9042, + "step": 8158 + }, + { + "epoch": 0.5886512030590527, + "grad_norm": 2.418502822587261, + "learning_rate": 1.5273398647073877e-06, + "loss": 0.962, + "step": 8159 + }, + { + "epoch": 0.5887233505284802, + "grad_norm": 3.8129529840235405, + "learning_rate": 1.5268857567833214e-06, + "loss": 0.9424, + "step": 8160 + }, + { + "epoch": 0.5887954979979078, + "grad_norm": 3.8460634294299205, + "learning_rate": 1.5264316746942733e-06, + "loss": 0.8359, + "step": 8161 + }, + { + "epoch": 0.5888676454673353, + "grad_norm": 4.216355385932086, + "learning_rate": 1.5259776184650376e-06, + "loss": 0.9966, + "step": 8162 + }, + { + "epoch": 0.5889397929367628, + "grad_norm": 2.5608699545293647, + "learning_rate": 1.5255235881204102e-06, + "loss": 0.9008, + "step": 8163 + }, + { + "epoch": 0.5890119404061902, + "grad_norm": 3.88939981434572, + "learning_rate": 1.5250695836851835e-06, + "loss": 0.9527, + "step": 8164 + }, + { + "epoch": 0.5890840878756177, + "grad_norm": 3.2305676446281937, + "learning_rate": 1.5246156051841479e-06, + "loss": 0.8792, + "step": 8165 + }, + { + "epoch": 0.5891562353450452, + "grad_norm": 3.7986324308929573, + "learning_rate": 1.5241616526420944e-06, + "loss": 0.9367, + "step": 8166 + }, + { + "epoch": 0.5892283828144728, + "grad_norm": 3.102434396103954, + "learning_rate": 1.5237077260838116e-06, + "loss": 0.9562, + "step": 8167 + }, + { + "epoch": 0.5893005302839003, + "grad_norm": 3.6297622180330835, + "learning_rate": 1.5232538255340873e-06, + "loss": 0.835, + "step": 8168 + }, + { + "epoch": 0.5893726777533278, + "grad_norm": 6.634756138221801, + "learning_rate": 1.5227999510177061e-06, + "loss": 0.8431, + "step": 8169 + }, + { + "epoch": 0.5894448252227553, + "grad_norm": 4.191669949984671, + "learning_rate": 1.5223461025594532e-06, + "loss": 0.9653, + "step": 8170 + }, + { + "epoch": 0.5895169726921828, + "grad_norm": 3.474702418916534, + "learning_rate": 1.5218922801841117e-06, + "loss": 0.9865, + "step": 8171 + }, + { + "epoch": 0.5895891201616104, + "grad_norm": 2.2807828869140505, + "learning_rate": 1.5214384839164618e-06, + "loss": 0.9669, + "step": 8172 + }, + { + "epoch": 0.5896612676310379, + "grad_norm": 3.1372770617791668, + "learning_rate": 1.5209847137812859e-06, + "loss": 0.9578, + "step": 8173 + }, + { + "epoch": 0.5897334151004654, + "grad_norm": 2.004684921584103, + "learning_rate": 1.5205309698033615e-06, + "loss": 0.9116, + "step": 8174 + }, + { + "epoch": 0.5898055625698929, + "grad_norm": 3.1332451956777647, + "learning_rate": 1.520077252007465e-06, + "loss": 1.0051, + "step": 8175 + }, + { + "epoch": 0.5898777100393203, + "grad_norm": 2.6667606416673935, + "learning_rate": 1.519623560418374e-06, + "loss": 1.0018, + "step": 8176 + }, + { + "epoch": 0.5899498575087478, + "grad_norm": 18.062906333286406, + "learning_rate": 1.5191698950608616e-06, + "loss": 0.8641, + "step": 8177 + }, + { + "epoch": 0.5900220049781754, + "grad_norm": 2.764214969826707, + "learning_rate": 1.5187162559597014e-06, + "loss": 0.8648, + "step": 8178 + }, + { + "epoch": 0.5900941524476029, + "grad_norm": 3.7898933679481646, + "learning_rate": 1.518262643139665e-06, + "loss": 0.8918, + "step": 8179 + }, + { + "epoch": 0.5901662999170304, + "grad_norm": 0.6540511167932708, + "learning_rate": 1.5178090566255221e-06, + "loss": 0.7837, + "step": 8180 + }, + { + "epoch": 0.5902384473864579, + "grad_norm": 0.6935067678326202, + "learning_rate": 1.5173554964420417e-06, + "loss": 0.7708, + "step": 8181 + }, + { + "epoch": 0.5903105948558854, + "grad_norm": 2.0601846819562293, + "learning_rate": 1.5169019626139906e-06, + "loss": 0.891, + "step": 8182 + }, + { + "epoch": 0.590382742325313, + "grad_norm": 2.995505303757833, + "learning_rate": 1.5164484551661357e-06, + "loss": 0.8129, + "step": 8183 + }, + { + "epoch": 0.5904548897947405, + "grad_norm": 2.7728266513428523, + "learning_rate": 1.5159949741232396e-06, + "loss": 0.9608, + "step": 8184 + }, + { + "epoch": 0.590527037264168, + "grad_norm": 2.6711375685626466, + "learning_rate": 1.5155415195100672e-06, + "loss": 0.9638, + "step": 8185 + }, + { + "epoch": 0.5905991847335955, + "grad_norm": 5.40861327275113, + "learning_rate": 1.515088091351379e-06, + "loss": 0.9832, + "step": 8186 + }, + { + "epoch": 0.5906713322030229, + "grad_norm": 2.2658261966160342, + "learning_rate": 1.514634689671934e-06, + "loss": 0.8821, + "step": 8187 + }, + { + "epoch": 0.5907434796724504, + "grad_norm": 3.492719707855413, + "learning_rate": 1.514181314496493e-06, + "loss": 0.9848, + "step": 8188 + }, + { + "epoch": 0.590815627141878, + "grad_norm": 3.1702730991402497, + "learning_rate": 1.5137279658498113e-06, + "loss": 0.9345, + "step": 8189 + }, + { + "epoch": 0.5908877746113055, + "grad_norm": 3.518838456500494, + "learning_rate": 1.5132746437566454e-06, + "loss": 0.9575, + "step": 8190 + }, + { + "epoch": 0.590959922080733, + "grad_norm": 3.0470053620449318, + "learning_rate": 1.5128213482417499e-06, + "loss": 0.9353, + "step": 8191 + }, + { + "epoch": 0.5910320695501605, + "grad_norm": 7.848645145586459, + "learning_rate": 1.5123680793298766e-06, + "loss": 0.9284, + "step": 8192 + }, + { + "epoch": 0.591104217019588, + "grad_norm": 3.68531799414332, + "learning_rate": 1.5119148370457773e-06, + "loss": 0.9219, + "step": 8193 + }, + { + "epoch": 0.5911763644890156, + "grad_norm": 4.169103723529068, + "learning_rate": 1.5114616214142017e-06, + "loss": 0.8955, + "step": 8194 + }, + { + "epoch": 0.5912485119584431, + "grad_norm": 2.4685750005963634, + "learning_rate": 1.5110084324598992e-06, + "loss": 0.7799, + "step": 8195 + }, + { + "epoch": 0.5913206594278706, + "grad_norm": 2.8167743839079638, + "learning_rate": 1.5105552702076153e-06, + "loss": 1.0179, + "step": 8196 + }, + { + "epoch": 0.5913928068972981, + "grad_norm": 4.3873411231333845, + "learning_rate": 1.5101021346820966e-06, + "loss": 0.9679, + "step": 8197 + }, + { + "epoch": 0.5914649543667256, + "grad_norm": 3.1168754249329216, + "learning_rate": 1.5096490259080873e-06, + "loss": 0.977, + "step": 8198 + }, + { + "epoch": 0.591537101836153, + "grad_norm": 2.5863613922187416, + "learning_rate": 1.5091959439103282e-06, + "loss": 1.0216, + "step": 8199 + }, + { + "epoch": 0.5916092493055806, + "grad_norm": 10.128845991432788, + "learning_rate": 1.5087428887135628e-06, + "loss": 0.9906, + "step": 8200 + }, + { + "epoch": 0.5916813967750081, + "grad_norm": 4.731105771001769, + "learning_rate": 1.5082898603425294e-06, + "loss": 0.9608, + "step": 8201 + }, + { + "epoch": 0.5917535442444356, + "grad_norm": 3.1936818931170694, + "learning_rate": 1.507836858821966e-06, + "loss": 0.9323, + "step": 8202 + }, + { + "epoch": 0.5918256917138631, + "grad_norm": 3.516965347791809, + "learning_rate": 1.5073838841766105e-06, + "loss": 0.9172, + "step": 8203 + }, + { + "epoch": 0.5918978391832906, + "grad_norm": 4.409453830244137, + "learning_rate": 1.5069309364311969e-06, + "loss": 0.9615, + "step": 8204 + }, + { + "epoch": 0.5919699866527182, + "grad_norm": 4.157640038274864, + "learning_rate": 1.5064780156104597e-06, + "loss": 1.0182, + "step": 8205 + }, + { + "epoch": 0.5920421341221457, + "grad_norm": 6.018625596009558, + "learning_rate": 1.5060251217391311e-06, + "loss": 0.8926, + "step": 8206 + }, + { + "epoch": 0.5921142815915732, + "grad_norm": 2.948116354138117, + "learning_rate": 1.5055722548419425e-06, + "loss": 0.8918, + "step": 8207 + }, + { + "epoch": 0.5921864290610007, + "grad_norm": 3.010907371857244, + "learning_rate": 1.5051194149436221e-06, + "loss": 0.8927, + "step": 8208 + }, + { + "epoch": 0.5922585765304282, + "grad_norm": 4.313885673482996, + "learning_rate": 1.5046666020688987e-06, + "loss": 0.9953, + "step": 8209 + }, + { + "epoch": 0.5923307239998558, + "grad_norm": 3.747300384274379, + "learning_rate": 1.5042138162424988e-06, + "loss": 0.9706, + "step": 8210 + }, + { + "epoch": 0.5924028714692832, + "grad_norm": 4.157174143909895, + "learning_rate": 1.5037610574891465e-06, + "loss": 0.8808, + "step": 8211 + }, + { + "epoch": 0.5924750189387107, + "grad_norm": 4.23301691890285, + "learning_rate": 1.5033083258335663e-06, + "loss": 0.9344, + "step": 8212 + }, + { + "epoch": 0.5925471664081382, + "grad_norm": 3.9118844861331374, + "learning_rate": 1.50285562130048e-06, + "loss": 0.9777, + "step": 8213 + }, + { + "epoch": 0.5926193138775657, + "grad_norm": 3.1621336532518405, + "learning_rate": 1.502402943914607e-06, + "loss": 0.7862, + "step": 8214 + }, + { + "epoch": 0.5926914613469932, + "grad_norm": 8.923389312982746, + "learning_rate": 1.5019502937006677e-06, + "loss": 0.8628, + "step": 8215 + }, + { + "epoch": 0.5927636088164208, + "grad_norm": 4.3790650147539925, + "learning_rate": 1.5014976706833792e-06, + "loss": 0.9868, + "step": 8216 + }, + { + "epoch": 0.5928357562858483, + "grad_norm": 3.88596969364146, + "learning_rate": 1.501045074887457e-06, + "loss": 1.0094, + "step": 8217 + }, + { + "epoch": 0.5929079037552758, + "grad_norm": 16.717537010611697, + "learning_rate": 1.500592506337617e-06, + "loss": 0.9131, + "step": 8218 + }, + { + "epoch": 0.5929800512247033, + "grad_norm": 7.586902686650791, + "learning_rate": 1.500139965058571e-06, + "loss": 0.9766, + "step": 8219 + }, + { + "epoch": 0.5930521986941308, + "grad_norm": 24.790544688591076, + "learning_rate": 1.4996874510750307e-06, + "loss": 0.9615, + "step": 8220 + }, + { + "epoch": 0.5931243461635584, + "grad_norm": 2.8621489197102643, + "learning_rate": 1.4992349644117073e-06, + "loss": 0.9193, + "step": 8221 + }, + { + "epoch": 0.5931964936329859, + "grad_norm": 19.897615437475324, + "learning_rate": 1.4987825050933086e-06, + "loss": 0.9713, + "step": 8222 + }, + { + "epoch": 0.5932686411024133, + "grad_norm": 2.3392649143172024, + "learning_rate": 1.4983300731445416e-06, + "loss": 0.9519, + "step": 8223 + }, + { + "epoch": 0.5933407885718408, + "grad_norm": 11.467941047764416, + "learning_rate": 1.4978776685901122e-06, + "loss": 0.8906, + "step": 8224 + }, + { + "epoch": 0.5934129360412683, + "grad_norm": 0.8264124454663467, + "learning_rate": 1.497425291454725e-06, + "loss": 0.8502, + "step": 8225 + }, + { + "epoch": 0.5934850835106958, + "grad_norm": 2.7993442993797184, + "learning_rate": 1.4969729417630812e-06, + "loss": 0.9856, + "step": 8226 + }, + { + "epoch": 0.5935572309801234, + "grad_norm": 0.7081207816662439, + "learning_rate": 1.496520619539884e-06, + "loss": 0.8049, + "step": 8227 + }, + { + "epoch": 0.5936293784495509, + "grad_norm": 3.0765820039098952, + "learning_rate": 1.496068324809831e-06, + "loss": 0.8536, + "step": 8228 + }, + { + "epoch": 0.5937015259189784, + "grad_norm": 2.372529702678977, + "learning_rate": 1.4956160575976218e-06, + "loss": 0.9445, + "step": 8229 + }, + { + "epoch": 0.5937736733884059, + "grad_norm": 6.33413794910954, + "learning_rate": 1.4951638179279528e-06, + "loss": 0.9595, + "step": 8230 + }, + { + "epoch": 0.5938458208578334, + "grad_norm": 3.1125455247851175, + "learning_rate": 1.4947116058255184e-06, + "loss": 0.8959, + "step": 8231 + }, + { + "epoch": 0.593917968327261, + "grad_norm": 4.672615528423545, + "learning_rate": 1.4942594213150127e-06, + "loss": 0.856, + "step": 8232 + }, + { + "epoch": 0.5939901157966885, + "grad_norm": 3.7559683035588662, + "learning_rate": 1.493807264421128e-06, + "loss": 0.9459, + "step": 8233 + }, + { + "epoch": 0.5940622632661159, + "grad_norm": 4.134220107126015, + "learning_rate": 1.4933551351685549e-06, + "loss": 0.9619, + "step": 8234 + }, + { + "epoch": 0.5941344107355434, + "grad_norm": 2.755879359507403, + "learning_rate": 1.4929030335819822e-06, + "loss": 0.9101, + "step": 8235 + }, + { + "epoch": 0.5942065582049709, + "grad_norm": 3.5487852624079808, + "learning_rate": 1.4924509596860973e-06, + "loss": 0.9239, + "step": 8236 + }, + { + "epoch": 0.5942787056743984, + "grad_norm": 0.7242690364164851, + "learning_rate": 1.4919989135055872e-06, + "loss": 0.7707, + "step": 8237 + }, + { + "epoch": 0.594350853143826, + "grad_norm": 3.980216694975896, + "learning_rate": 1.491546895065135e-06, + "loss": 0.9788, + "step": 8238 + }, + { + "epoch": 0.5944230006132535, + "grad_norm": 5.596619621171518, + "learning_rate": 1.4910949043894255e-06, + "loss": 0.8499, + "step": 8239 + }, + { + "epoch": 0.594495148082681, + "grad_norm": 3.052303388732463, + "learning_rate": 1.4906429415031393e-06, + "loss": 1.0126, + "step": 8240 + }, + { + "epoch": 0.5945672955521085, + "grad_norm": 4.096938660430383, + "learning_rate": 1.4901910064309555e-06, + "loss": 0.8223, + "step": 8241 + }, + { + "epoch": 0.594639443021536, + "grad_norm": 3.0079729309893155, + "learning_rate": 1.4897390991975547e-06, + "loss": 0.8994, + "step": 8242 + }, + { + "epoch": 0.5947115904909636, + "grad_norm": 2.2687830520618313, + "learning_rate": 1.4892872198276124e-06, + "loss": 0.9166, + "step": 8243 + }, + { + "epoch": 0.5947837379603911, + "grad_norm": 4.352508058666501, + "learning_rate": 1.4888353683458042e-06, + "loss": 1.0082, + "step": 8244 + }, + { + "epoch": 0.5948558854298186, + "grad_norm": 5.552081176681052, + "learning_rate": 1.4883835447768046e-06, + "loss": 0.8969, + "step": 8245 + }, + { + "epoch": 0.594928032899246, + "grad_norm": 3.9490284557887025, + "learning_rate": 1.4879317491452857e-06, + "loss": 0.909, + "step": 8246 + }, + { + "epoch": 0.5950001803686735, + "grad_norm": 3.7991840892028836, + "learning_rate": 1.487479981475918e-06, + "loss": 0.9566, + "step": 8247 + }, + { + "epoch": 0.595072327838101, + "grad_norm": 4.095506596977799, + "learning_rate": 1.4870282417933715e-06, + "loss": 0.7669, + "step": 8248 + }, + { + "epoch": 0.5951444753075286, + "grad_norm": 3.913257386631173, + "learning_rate": 1.486576530122314e-06, + "loss": 0.8443, + "step": 8249 + }, + { + "epoch": 0.5952166227769561, + "grad_norm": 2.741787177532181, + "learning_rate": 1.4861248464874105e-06, + "loss": 0.9679, + "step": 8250 + }, + { + "epoch": 0.5952887702463836, + "grad_norm": 3.700531771825113, + "learning_rate": 1.485673190913328e-06, + "loss": 0.8374, + "step": 8251 + }, + { + "epoch": 0.5953609177158111, + "grad_norm": 5.561011297342411, + "learning_rate": 1.4852215634247285e-06, + "loss": 0.9851, + "step": 8252 + }, + { + "epoch": 0.5954330651852386, + "grad_norm": 3.672684470698984, + "learning_rate": 1.484769964046273e-06, + "loss": 0.8309, + "step": 8253 + }, + { + "epoch": 0.5955052126546662, + "grad_norm": 4.784635056118623, + "learning_rate": 1.4843183928026234e-06, + "loss": 1.0328, + "step": 8254 + }, + { + "epoch": 0.5955773601240937, + "grad_norm": 4.79251725760364, + "learning_rate": 1.4838668497184364e-06, + "loss": 0.8971, + "step": 8255 + }, + { + "epoch": 0.5956495075935212, + "grad_norm": 3.2198246023138477, + "learning_rate": 1.4834153348183706e-06, + "loss": 0.8052, + "step": 8256 + }, + { + "epoch": 0.5957216550629487, + "grad_norm": 3.655539549573843, + "learning_rate": 1.4829638481270812e-06, + "loss": 0.8833, + "step": 8257 + }, + { + "epoch": 0.5957938025323761, + "grad_norm": 4.500763934247306, + "learning_rate": 1.4825123896692218e-06, + "loss": 0.82, + "step": 8258 + }, + { + "epoch": 0.5958659500018036, + "grad_norm": 5.324795664417354, + "learning_rate": 1.482060959469445e-06, + "loss": 0.9766, + "step": 8259 + }, + { + "epoch": 0.5959380974712312, + "grad_norm": 4.595749400465595, + "learning_rate": 1.4816095575524018e-06, + "loss": 0.9844, + "step": 8260 + }, + { + "epoch": 0.5960102449406587, + "grad_norm": 1.9085717367432233, + "learning_rate": 1.4811581839427425e-06, + "loss": 0.8366, + "step": 8261 + }, + { + "epoch": 0.5960823924100862, + "grad_norm": 4.291471878124952, + "learning_rate": 1.4807068386651135e-06, + "loss": 0.89, + "step": 8262 + }, + { + "epoch": 0.5961545398795137, + "grad_norm": 3.190780447555418, + "learning_rate": 1.4802555217441612e-06, + "loss": 0.9755, + "step": 8263 + }, + { + "epoch": 0.5962266873489412, + "grad_norm": 3.395536600889968, + "learning_rate": 1.4798042332045318e-06, + "loss": 0.9265, + "step": 8264 + }, + { + "epoch": 0.5962988348183688, + "grad_norm": 4.4307136759881995, + "learning_rate": 1.4793529730708663e-06, + "loss": 0.9293, + "step": 8265 + }, + { + "epoch": 0.5963709822877963, + "grad_norm": 6.437307410461096, + "learning_rate": 1.4789017413678084e-06, + "loss": 1.0028, + "step": 8266 + }, + { + "epoch": 0.5964431297572238, + "grad_norm": 3.261586882158463, + "learning_rate": 1.478450538119998e-06, + "loss": 0.8956, + "step": 8267 + }, + { + "epoch": 0.5965152772266513, + "grad_norm": 9.81244868368349, + "learning_rate": 1.4779993633520716e-06, + "loss": 0.7952, + "step": 8268 + }, + { + "epoch": 0.5965874246960788, + "grad_norm": 10.416438067153086, + "learning_rate": 1.4775482170886684e-06, + "loss": 0.9297, + "step": 8269 + }, + { + "epoch": 0.5966595721655062, + "grad_norm": 3.5608122994012734, + "learning_rate": 1.4770970993544227e-06, + "loss": 0.9617, + "step": 8270 + }, + { + "epoch": 0.5967317196349338, + "grad_norm": 2.5777263911193415, + "learning_rate": 1.476646010173969e-06, + "loss": 0.9694, + "step": 8271 + }, + { + "epoch": 0.5968038671043613, + "grad_norm": 4.708700632718072, + "learning_rate": 1.4761949495719388e-06, + "loss": 1.0611, + "step": 8272 + }, + { + "epoch": 0.5968760145737888, + "grad_norm": 2.3200678327114854, + "learning_rate": 1.4757439175729639e-06, + "loss": 0.9309, + "step": 8273 + }, + { + "epoch": 0.5969481620432163, + "grad_norm": 2.7593073697296813, + "learning_rate": 1.4752929142016726e-06, + "loss": 0.9895, + "step": 8274 + }, + { + "epoch": 0.5970203095126438, + "grad_norm": 3.427306719671878, + "learning_rate": 1.474841939482693e-06, + "loss": 0.8621, + "step": 8275 + }, + { + "epoch": 0.5970924569820714, + "grad_norm": 3.312520009106357, + "learning_rate": 1.4743909934406512e-06, + "loss": 0.8529, + "step": 8276 + }, + { + "epoch": 0.5971646044514989, + "grad_norm": 3.6090621482379257, + "learning_rate": 1.4739400761001708e-06, + "loss": 0.9058, + "step": 8277 + }, + { + "epoch": 0.5972367519209264, + "grad_norm": 6.8223344005980815, + "learning_rate": 1.4734891874858761e-06, + "loss": 1.0162, + "step": 8278 + }, + { + "epoch": 0.5973088993903539, + "grad_norm": 2.9137913472937567, + "learning_rate": 1.4730383276223879e-06, + "loss": 0.9592, + "step": 8279 + }, + { + "epoch": 0.5973810468597814, + "grad_norm": 2.890755810226438, + "learning_rate": 1.472587496534325e-06, + "loss": 1.0124, + "step": 8280 + }, + { + "epoch": 0.5974531943292088, + "grad_norm": 4.322184648543882, + "learning_rate": 1.4721366942463074e-06, + "loss": 0.9431, + "step": 8281 + }, + { + "epoch": 0.5975253417986364, + "grad_norm": 3.224563533611922, + "learning_rate": 1.4716859207829507e-06, + "loss": 0.9905, + "step": 8282 + }, + { + "epoch": 0.5975974892680639, + "grad_norm": 24.644314235155615, + "learning_rate": 1.4712351761688697e-06, + "loss": 0.9492, + "step": 8283 + }, + { + "epoch": 0.5976696367374914, + "grad_norm": 5.9354041968290865, + "learning_rate": 1.4707844604286788e-06, + "loss": 0.9681, + "step": 8284 + }, + { + "epoch": 0.5977417842069189, + "grad_norm": 14.122336347472142, + "learning_rate": 1.4703337735869891e-06, + "loss": 0.9201, + "step": 8285 + }, + { + "epoch": 0.5978139316763464, + "grad_norm": 9.08678638956856, + "learning_rate": 1.4698831156684112e-06, + "loss": 0.8476, + "step": 8286 + }, + { + "epoch": 0.597886079145774, + "grad_norm": 3.459089970067345, + "learning_rate": 1.469432486697554e-06, + "loss": 0.7924, + "step": 8287 + }, + { + "epoch": 0.5979582266152015, + "grad_norm": 8.725605993349223, + "learning_rate": 1.4689818866990251e-06, + "loss": 0.9208, + "step": 8288 + }, + { + "epoch": 0.598030374084629, + "grad_norm": 7.565659548693576, + "learning_rate": 1.4685313156974296e-06, + "loss": 1.0109, + "step": 8289 + }, + { + "epoch": 0.5981025215540565, + "grad_norm": 2.413779232849708, + "learning_rate": 1.4680807737173704e-06, + "loss": 0.9136, + "step": 8290 + }, + { + "epoch": 0.598174669023484, + "grad_norm": 3.5624985276604004, + "learning_rate": 1.4676302607834521e-06, + "loss": 0.8453, + "step": 8291 + }, + { + "epoch": 0.5982468164929116, + "grad_norm": 3.1376419703974214, + "learning_rate": 1.4671797769202736e-06, + "loss": 0.9193, + "step": 8292 + }, + { + "epoch": 0.598318963962339, + "grad_norm": 2.8190422261875687, + "learning_rate": 1.466729322152436e-06, + "loss": 0.8829, + "step": 8293 + }, + { + "epoch": 0.5983911114317665, + "grad_norm": 5.786298866947396, + "learning_rate": 1.4662788965045353e-06, + "loss": 0.9142, + "step": 8294 + }, + { + "epoch": 0.598463258901194, + "grad_norm": 7.824453745534932, + "learning_rate": 1.4658285000011685e-06, + "loss": 0.94, + "step": 8295 + }, + { + "epoch": 0.5985354063706215, + "grad_norm": 3.5698317990896298, + "learning_rate": 1.4653781326669305e-06, + "loss": 0.8915, + "step": 8296 + }, + { + "epoch": 0.598607553840049, + "grad_norm": 7.6558456625534115, + "learning_rate": 1.464927794526413e-06, + "loss": 0.9797, + "step": 8297 + }, + { + "epoch": 0.5986797013094766, + "grad_norm": 3.3850545523232842, + "learning_rate": 1.464477485604208e-06, + "loss": 0.9567, + "step": 8298 + }, + { + "epoch": 0.5987518487789041, + "grad_norm": 3.5257410609674245, + "learning_rate": 1.464027205924905e-06, + "loss": 0.9642, + "step": 8299 + }, + { + "epoch": 0.5988239962483316, + "grad_norm": 0.779995865321816, + "learning_rate": 1.4635769555130923e-06, + "loss": 0.8143, + "step": 8300 + }, + { + "epoch": 0.5988961437177591, + "grad_norm": 6.555638677353838, + "learning_rate": 1.4631267343933564e-06, + "loss": 0.8378, + "step": 8301 + }, + { + "epoch": 0.5989682911871866, + "grad_norm": 5.148260596887255, + "learning_rate": 1.4626765425902814e-06, + "loss": 0.821, + "step": 8302 + }, + { + "epoch": 0.5990404386566142, + "grad_norm": 5.330171164220956, + "learning_rate": 1.462226380128452e-06, + "loss": 0.9447, + "step": 8303 + }, + { + "epoch": 0.5991125861260417, + "grad_norm": 2.9209472488921375, + "learning_rate": 1.4617762470324486e-06, + "loss": 0.9742, + "step": 8304 + }, + { + "epoch": 0.5991847335954691, + "grad_norm": 3.8685846444383354, + "learning_rate": 1.4613261433268526e-06, + "loss": 0.973, + "step": 8305 + }, + { + "epoch": 0.5992568810648966, + "grad_norm": 3.803636597676952, + "learning_rate": 1.4608760690362418e-06, + "loss": 0.9038, + "step": 8306 + }, + { + "epoch": 0.5993290285343241, + "grad_norm": 4.940791131736525, + "learning_rate": 1.4604260241851925e-06, + "loss": 0.8986, + "step": 8307 + }, + { + "epoch": 0.5994011760037516, + "grad_norm": 5.581380796875576, + "learning_rate": 1.459976008798281e-06, + "loss": 0.8553, + "step": 8308 + }, + { + "epoch": 0.5994733234731792, + "grad_norm": 3.3379904797362827, + "learning_rate": 1.4595260229000804e-06, + "loss": 0.8726, + "step": 8309 + }, + { + "epoch": 0.5995454709426067, + "grad_norm": 2.7312497835137393, + "learning_rate": 1.4590760665151628e-06, + "loss": 0.8719, + "step": 8310 + }, + { + "epoch": 0.5996176184120342, + "grad_norm": 7.026958597091927, + "learning_rate": 1.4586261396680995e-06, + "loss": 0.9482, + "step": 8311 + }, + { + "epoch": 0.5996897658814617, + "grad_norm": 3.6526856614275927, + "learning_rate": 1.4581762423834578e-06, + "loss": 0.9319, + "step": 8312 + }, + { + "epoch": 0.5997619133508892, + "grad_norm": 3.990483287902947, + "learning_rate": 1.4577263746858064e-06, + "loss": 0.8977, + "step": 8313 + }, + { + "epoch": 0.5998340608203168, + "grad_norm": 0.9010974153828112, + "learning_rate": 1.4572765365997093e-06, + "loss": 0.8578, + "step": 8314 + }, + { + "epoch": 0.5999062082897443, + "grad_norm": 8.429855387432655, + "learning_rate": 1.4568267281497325e-06, + "loss": 0.8787, + "step": 8315 + }, + { + "epoch": 0.5999783557591718, + "grad_norm": 0.6969593133941356, + "learning_rate": 1.456376949360436e-06, + "loss": 0.8357, + "step": 8316 + }, + { + "epoch": 0.6000505032285992, + "grad_norm": 2.7013201559083657, + "learning_rate": 1.4559272002563832e-06, + "loss": 0.9397, + "step": 8317 + }, + { + "epoch": 0.6001226506980267, + "grad_norm": 3.14891453766261, + "learning_rate": 1.4554774808621317e-06, + "loss": 0.7915, + "step": 8318 + }, + { + "epoch": 0.6001947981674542, + "grad_norm": 0.7805965360975611, + "learning_rate": 1.4550277912022384e-06, + "loss": 0.7786, + "step": 8319 + }, + { + "epoch": 0.6002669456368818, + "grad_norm": 2.87386747527472, + "learning_rate": 1.4545781313012608e-06, + "loss": 0.9214, + "step": 8320 + }, + { + "epoch": 0.6003390931063093, + "grad_norm": 3.655743164406718, + "learning_rate": 1.454128501183752e-06, + "loss": 0.8607, + "step": 8321 + }, + { + "epoch": 0.6004112405757368, + "grad_norm": 3.759600366700876, + "learning_rate": 1.453678900874265e-06, + "loss": 0.8883, + "step": 8322 + }, + { + "epoch": 0.6004833880451643, + "grad_norm": 3.4987024899466914, + "learning_rate": 1.4532293303973512e-06, + "loss": 0.8631, + "step": 8323 + }, + { + "epoch": 0.6005555355145918, + "grad_norm": 2.410137157890092, + "learning_rate": 1.4527797897775592e-06, + "loss": 0.8903, + "step": 8324 + }, + { + "epoch": 0.6006276829840194, + "grad_norm": 2.500737176928049, + "learning_rate": 1.4523302790394375e-06, + "loss": 0.9607, + "step": 8325 + }, + { + "epoch": 0.6006998304534469, + "grad_norm": 5.810017045171688, + "learning_rate": 1.4518807982075312e-06, + "loss": 0.9666, + "step": 8326 + }, + { + "epoch": 0.6007719779228744, + "grad_norm": 4.33292714685028, + "learning_rate": 1.451431347306386e-06, + "loss": 0.9291, + "step": 8327 + }, + { + "epoch": 0.6008441253923019, + "grad_norm": 3.2053106526996817, + "learning_rate": 1.4509819263605445e-06, + "loss": 0.9228, + "step": 8328 + }, + { + "epoch": 0.6009162728617293, + "grad_norm": 2.523181914149931, + "learning_rate": 1.4505325353945468e-06, + "loss": 0.8294, + "step": 8329 + }, + { + "epoch": 0.6009884203311568, + "grad_norm": 3.5520871219260557, + "learning_rate": 1.4500831744329338e-06, + "loss": 0.934, + "step": 8330 + }, + { + "epoch": 0.6010605678005844, + "grad_norm": 4.778985977193786, + "learning_rate": 1.449633843500242e-06, + "loss": 1.0635, + "step": 8331 + }, + { + "epoch": 0.6011327152700119, + "grad_norm": 2.7743761650163785, + "learning_rate": 1.44918454262101e-06, + "loss": 0.861, + "step": 8332 + }, + { + "epoch": 0.6012048627394394, + "grad_norm": 5.612639006186798, + "learning_rate": 1.4487352718197707e-06, + "loss": 0.8145, + "step": 8333 + }, + { + "epoch": 0.6012770102088669, + "grad_norm": 2.6519005563885916, + "learning_rate": 1.4482860311210567e-06, + "loss": 0.9324, + "step": 8334 + }, + { + "epoch": 0.6013491576782944, + "grad_norm": 3.694052483425916, + "learning_rate": 1.447836820549401e-06, + "loss": 0.9776, + "step": 8335 + }, + { + "epoch": 0.601421305147722, + "grad_norm": 2.150968745906811, + "learning_rate": 1.4473876401293321e-06, + "loss": 0.8836, + "step": 8336 + }, + { + "epoch": 0.6014934526171495, + "grad_norm": 4.86387401476627, + "learning_rate": 1.4469384898853784e-06, + "loss": 0.9154, + "step": 8337 + }, + { + "epoch": 0.601565600086577, + "grad_norm": 5.221555218587355, + "learning_rate": 1.4464893698420663e-06, + "loss": 0.8829, + "step": 8338 + }, + { + "epoch": 0.6016377475560045, + "grad_norm": 5.117188431834362, + "learning_rate": 1.4460402800239215e-06, + "loss": 0.9216, + "step": 8339 + }, + { + "epoch": 0.6017098950254319, + "grad_norm": 9.793889176305495, + "learning_rate": 1.445591220455466e-06, + "loss": 1.0096, + "step": 8340 + }, + { + "epoch": 0.6017820424948594, + "grad_norm": 3.910286610628216, + "learning_rate": 1.4451421911612209e-06, + "loss": 0.8471, + "step": 8341 + }, + { + "epoch": 0.601854189964287, + "grad_norm": 3.5406222579958904, + "learning_rate": 1.4446931921657078e-06, + "loss": 0.966, + "step": 8342 + }, + { + "epoch": 0.6019263374337145, + "grad_norm": 3.3937893354127953, + "learning_rate": 1.4442442234934425e-06, + "loss": 0.9677, + "step": 8343 + }, + { + "epoch": 0.601998484903142, + "grad_norm": 6.925415511352573, + "learning_rate": 1.4437952851689437e-06, + "loss": 0.949, + "step": 8344 + }, + { + "epoch": 0.6020706323725695, + "grad_norm": 7.980130076865989, + "learning_rate": 1.4433463772167257e-06, + "loss": 0.9651, + "step": 8345 + }, + { + "epoch": 0.602142779841997, + "grad_norm": 4.807735908897467, + "learning_rate": 1.4428974996613001e-06, + "loss": 0.8282, + "step": 8346 + }, + { + "epoch": 0.6022149273114246, + "grad_norm": 3.833252878312277, + "learning_rate": 1.4424486525271808e-06, + "loss": 0.9306, + "step": 8347 + }, + { + "epoch": 0.6022870747808521, + "grad_norm": 0.735658173894783, + "learning_rate": 1.4419998358388763e-06, + "loss": 0.8082, + "step": 8348 + }, + { + "epoch": 0.6023592222502796, + "grad_norm": 10.632730342146772, + "learning_rate": 1.4415510496208951e-06, + "loss": 0.9433, + "step": 8349 + }, + { + "epoch": 0.6024313697197071, + "grad_norm": 17.37697584595148, + "learning_rate": 1.4411022938977442e-06, + "loss": 0.8526, + "step": 8350 + }, + { + "epoch": 0.6025035171891346, + "grad_norm": 5.688117025252623, + "learning_rate": 1.4406535686939277e-06, + "loss": 0.9697, + "step": 8351 + }, + { + "epoch": 0.602575664658562, + "grad_norm": 0.7645963258878025, + "learning_rate": 1.4402048740339492e-06, + "loss": 0.8006, + "step": 8352 + }, + { + "epoch": 0.6026478121279896, + "grad_norm": 3.964248387027212, + "learning_rate": 1.4397562099423098e-06, + "loss": 0.9486, + "step": 8353 + }, + { + "epoch": 0.6027199595974171, + "grad_norm": 7.011879513738491, + "learning_rate": 1.4393075764435106e-06, + "loss": 0.95, + "step": 8354 + }, + { + "epoch": 0.6027921070668446, + "grad_norm": 5.628383212438504, + "learning_rate": 1.438858973562049e-06, + "loss": 0.8496, + "step": 8355 + }, + { + "epoch": 0.6028642545362721, + "grad_norm": 5.919066072461602, + "learning_rate": 1.4384104013224203e-06, + "loss": 0.9374, + "step": 8356 + }, + { + "epoch": 0.6029364020056996, + "grad_norm": 2.6345124232603245, + "learning_rate": 1.4379618597491221e-06, + "loss": 0.8198, + "step": 8357 + }, + { + "epoch": 0.6030085494751272, + "grad_norm": 3.6939421163256805, + "learning_rate": 1.4375133488666449e-06, + "loss": 0.9298, + "step": 8358 + }, + { + "epoch": 0.6030806969445547, + "grad_norm": 4.1227225750302505, + "learning_rate": 1.4370648686994826e-06, + "loss": 0.8508, + "step": 8359 + }, + { + "epoch": 0.6031528444139822, + "grad_norm": 3.3493840177926777, + "learning_rate": 1.4366164192721236e-06, + "loss": 0.8627, + "step": 8360 + }, + { + "epoch": 0.6032249918834097, + "grad_norm": 3.704234154297058, + "learning_rate": 1.436168000609056e-06, + "loss": 0.8557, + "step": 8361 + }, + { + "epoch": 0.6032971393528372, + "grad_norm": 3.3922364233592757, + "learning_rate": 1.435719612734767e-06, + "loss": 0.9035, + "step": 8362 + }, + { + "epoch": 0.6033692868222648, + "grad_norm": 3.221907317094756, + "learning_rate": 1.4352712556737409e-06, + "loss": 0.9167, + "step": 8363 + }, + { + "epoch": 0.6034414342916922, + "grad_norm": 3.3947947655556647, + "learning_rate": 1.4348229294504615e-06, + "loss": 0.9281, + "step": 8364 + }, + { + "epoch": 0.6035135817611197, + "grad_norm": 3.9111582932612543, + "learning_rate": 1.4343746340894086e-06, + "loss": 0.9194, + "step": 8365 + }, + { + "epoch": 0.6035857292305472, + "grad_norm": 4.8281125559229805, + "learning_rate": 1.433926369615064e-06, + "loss": 0.943, + "step": 8366 + }, + { + "epoch": 0.6036578766999747, + "grad_norm": 2.9118129853151964, + "learning_rate": 1.433478136051905e-06, + "loss": 0.9773, + "step": 8367 + }, + { + "epoch": 0.6037300241694022, + "grad_norm": 4.8046922637171505, + "learning_rate": 1.4330299334244068e-06, + "loss": 0.8025, + "step": 8368 + }, + { + "epoch": 0.6038021716388298, + "grad_norm": 9.828961408045096, + "learning_rate": 1.432581761757046e-06, + "loss": 0.883, + "step": 8369 + }, + { + "epoch": 0.6038743191082573, + "grad_norm": 3.04312748427651, + "learning_rate": 1.4321336210742936e-06, + "loss": 0.9252, + "step": 8370 + }, + { + "epoch": 0.6039464665776848, + "grad_norm": 3.5756538220053935, + "learning_rate": 1.431685511400623e-06, + "loss": 0.7946, + "step": 8371 + }, + { + "epoch": 0.6040186140471123, + "grad_norm": 6.611303891203739, + "learning_rate": 1.4312374327605028e-06, + "loss": 1.0145, + "step": 8372 + }, + { + "epoch": 0.6040907615165398, + "grad_norm": 5.336937918090554, + "learning_rate": 1.4307893851784e-06, + "loss": 1.0625, + "step": 8373 + }, + { + "epoch": 0.6041629089859674, + "grad_norm": 4.3223218884018815, + "learning_rate": 1.4303413686787826e-06, + "loss": 0.9316, + "step": 8374 + }, + { + "epoch": 0.6042350564553949, + "grad_norm": 3.741441497031326, + "learning_rate": 1.429893383286114e-06, + "loss": 0.7948, + "step": 8375 + }, + { + "epoch": 0.6043072039248223, + "grad_norm": 3.12722546489318, + "learning_rate": 1.4294454290248576e-06, + "loss": 0.9739, + "step": 8376 + }, + { + "epoch": 0.6043793513942498, + "grad_norm": 7.667398362557308, + "learning_rate": 1.4289975059194737e-06, + "loss": 0.9694, + "step": 8377 + }, + { + "epoch": 0.6044514988636773, + "grad_norm": 4.378813253430986, + "learning_rate": 1.4285496139944222e-06, + "loss": 0.9508, + "step": 8378 + }, + { + "epoch": 0.6045236463331048, + "grad_norm": 0.7335719323697113, + "learning_rate": 1.4281017532741613e-06, + "loss": 0.7437, + "step": 8379 + }, + { + "epoch": 0.6045957938025324, + "grad_norm": 6.653733615084677, + "learning_rate": 1.4276539237831458e-06, + "loss": 0.9904, + "step": 8380 + }, + { + "epoch": 0.6046679412719599, + "grad_norm": 4.64234825531308, + "learning_rate": 1.4272061255458314e-06, + "loss": 0.9833, + "step": 8381 + }, + { + "epoch": 0.6047400887413874, + "grad_norm": 3.965218361882441, + "learning_rate": 1.4267583585866696e-06, + "loss": 0.9557, + "step": 8382 + }, + { + "epoch": 0.6048122362108149, + "grad_norm": 5.077393088781184, + "learning_rate": 1.4263106229301123e-06, + "loss": 0.8943, + "step": 8383 + }, + { + "epoch": 0.6048843836802424, + "grad_norm": 3.0380167203723087, + "learning_rate": 1.4258629186006081e-06, + "loss": 0.964, + "step": 8384 + }, + { + "epoch": 0.60495653114967, + "grad_norm": 4.879579984694007, + "learning_rate": 1.425415245622604e-06, + "loss": 0.8507, + "step": 8385 + }, + { + "epoch": 0.6050286786190975, + "grad_norm": 3.4884661959737633, + "learning_rate": 1.424967604020547e-06, + "loss": 0.9351, + "step": 8386 + }, + { + "epoch": 0.6051008260885249, + "grad_norm": 4.640099081428502, + "learning_rate": 1.42451999381888e-06, + "loss": 0.939, + "step": 8387 + }, + { + "epoch": 0.6051729735579524, + "grad_norm": 3.681213580794953, + "learning_rate": 1.4240724150420461e-06, + "loss": 0.8903, + "step": 8388 + }, + { + "epoch": 0.6052451210273799, + "grad_norm": 4.033608862763428, + "learning_rate": 1.4236248677144853e-06, + "loss": 0.9174, + "step": 8389 + }, + { + "epoch": 0.6053172684968074, + "grad_norm": 3.3202615173016006, + "learning_rate": 1.423177351860637e-06, + "loss": 0.7929, + "step": 8390 + }, + { + "epoch": 0.605389415966235, + "grad_norm": 4.12831583556583, + "learning_rate": 1.4227298675049381e-06, + "loss": 0.9811, + "step": 8391 + }, + { + "epoch": 0.6054615634356625, + "grad_norm": 3.923024408066641, + "learning_rate": 1.4222824146718236e-06, + "loss": 0.8115, + "step": 8392 + }, + { + "epoch": 0.60553371090509, + "grad_norm": 2.6188182703555802, + "learning_rate": 1.4218349933857287e-06, + "loss": 0.9204, + "step": 8393 + }, + { + "epoch": 0.6056058583745175, + "grad_norm": 3.6156466602939408, + "learning_rate": 1.4213876036710846e-06, + "loss": 0.9626, + "step": 8394 + }, + { + "epoch": 0.605678005843945, + "grad_norm": 3.529069480473607, + "learning_rate": 1.4209402455523205e-06, + "loss": 0.9524, + "step": 8395 + }, + { + "epoch": 0.6057501533133726, + "grad_norm": 3.073953813432977, + "learning_rate": 1.4204929190538669e-06, + "loss": 0.9622, + "step": 8396 + }, + { + "epoch": 0.6058223007828001, + "grad_norm": 2.91268110899147, + "learning_rate": 1.4200456242001489e-06, + "loss": 0.9048, + "step": 8397 + }, + { + "epoch": 0.6058944482522276, + "grad_norm": 4.326959225756284, + "learning_rate": 1.419598361015593e-06, + "loss": 0.8826, + "step": 8398 + }, + { + "epoch": 0.605966595721655, + "grad_norm": 2.878594349252994, + "learning_rate": 1.4191511295246223e-06, + "loss": 0.8931, + "step": 8399 + }, + { + "epoch": 0.6060387431910825, + "grad_norm": 3.1764482874492628, + "learning_rate": 1.418703929751658e-06, + "loss": 0.9589, + "step": 8400 + }, + { + "epoch": 0.60611089066051, + "grad_norm": 3.1469550417103034, + "learning_rate": 1.4182567617211198e-06, + "loss": 0.8436, + "step": 8401 + }, + { + "epoch": 0.6061830381299376, + "grad_norm": 5.249195037165985, + "learning_rate": 1.4178096254574263e-06, + "loss": 0.8883, + "step": 8402 + }, + { + "epoch": 0.6062551855993651, + "grad_norm": 7.052275692125495, + "learning_rate": 1.4173625209849949e-06, + "loss": 0.8484, + "step": 8403 + }, + { + "epoch": 0.6063273330687926, + "grad_norm": 3.3721140072375397, + "learning_rate": 1.416915448328238e-06, + "loss": 0.897, + "step": 8404 + }, + { + "epoch": 0.6063994805382201, + "grad_norm": 4.312352716999219, + "learning_rate": 1.416468407511571e-06, + "loss": 0.9614, + "step": 8405 + }, + { + "epoch": 0.6064716280076476, + "grad_norm": 3.8673509967805875, + "learning_rate": 1.416021398559404e-06, + "loss": 0.8529, + "step": 8406 + }, + { + "epoch": 0.6065437754770752, + "grad_norm": 3.0451281112505306, + "learning_rate": 1.4155744214961459e-06, + "loss": 0.9273, + "step": 8407 + }, + { + "epoch": 0.6066159229465027, + "grad_norm": 3.417464248481385, + "learning_rate": 1.415127476346206e-06, + "loss": 0.8399, + "step": 8408 + }, + { + "epoch": 0.6066880704159302, + "grad_norm": 2.598615655623793, + "learning_rate": 1.4146805631339886e-06, + "loss": 0.9283, + "step": 8409 + }, + { + "epoch": 0.6067602178853577, + "grad_norm": 6.349770681490752, + "learning_rate": 1.4142336818838999e-06, + "loss": 0.9146, + "step": 8410 + }, + { + "epoch": 0.6068323653547851, + "grad_norm": 0.7391440380837344, + "learning_rate": 1.413786832620341e-06, + "loss": 0.7814, + "step": 8411 + }, + { + "epoch": 0.6069045128242126, + "grad_norm": 5.1853732609825585, + "learning_rate": 1.4133400153677132e-06, + "loss": 0.8425, + "step": 8412 + }, + { + "epoch": 0.6069766602936402, + "grad_norm": 0.607754410858342, + "learning_rate": 1.412893230150415e-06, + "loss": 0.7581, + "step": 8413 + }, + { + "epoch": 0.6070488077630677, + "grad_norm": 3.8534004205805767, + "learning_rate": 1.4124464769928442e-06, + "loss": 0.8284, + "step": 8414 + }, + { + "epoch": 0.6071209552324952, + "grad_norm": 2.470227246197739, + "learning_rate": 1.4119997559193965e-06, + "loss": 0.9265, + "step": 8415 + }, + { + "epoch": 0.6071931027019227, + "grad_norm": 3.4765293076891615, + "learning_rate": 1.4115530669544653e-06, + "loss": 0.8844, + "step": 8416 + }, + { + "epoch": 0.6072652501713502, + "grad_norm": 4.999947547637473, + "learning_rate": 1.4111064101224425e-06, + "loss": 0.875, + "step": 8417 + }, + { + "epoch": 0.6073373976407778, + "grad_norm": 2.448378131197801, + "learning_rate": 1.4106597854477194e-06, + "loss": 1.0076, + "step": 8418 + }, + { + "epoch": 0.6074095451102053, + "grad_norm": 2.6311574199273444, + "learning_rate": 1.4102131929546826e-06, + "loss": 0.8916, + "step": 8419 + }, + { + "epoch": 0.6074816925796328, + "grad_norm": 3.233202883130579, + "learning_rate": 1.409766632667721e-06, + "loss": 1.0168, + "step": 8420 + }, + { + "epoch": 0.6075538400490603, + "grad_norm": 5.603850382940276, + "learning_rate": 1.4093201046112184e-06, + "loss": 0.9611, + "step": 8421 + }, + { + "epoch": 0.6076259875184878, + "grad_norm": 3.253407232847081, + "learning_rate": 1.4088736088095576e-06, + "loss": 0.8845, + "step": 8422 + }, + { + "epoch": 0.6076981349879153, + "grad_norm": 3.8851640361782276, + "learning_rate": 1.408427145287122e-06, + "loss": 0.9499, + "step": 8423 + }, + { + "epoch": 0.6077702824573428, + "grad_norm": 3.1157429725404273, + "learning_rate": 1.407980714068289e-06, + "loss": 0.8604, + "step": 8424 + }, + { + "epoch": 0.6078424299267703, + "grad_norm": 3.8763265492576204, + "learning_rate": 1.4075343151774379e-06, + "loss": 1.0127, + "step": 8425 + }, + { + "epoch": 0.6079145773961978, + "grad_norm": 11.750981066714107, + "learning_rate": 1.4070879486389446e-06, + "loss": 0.9398, + "step": 8426 + }, + { + "epoch": 0.6079867248656253, + "grad_norm": 52.941532922679194, + "learning_rate": 1.406641614477184e-06, + "loss": 0.9842, + "step": 8427 + }, + { + "epoch": 0.6080588723350528, + "grad_norm": 3.70666026117151, + "learning_rate": 1.4061953127165277e-06, + "loss": 0.8594, + "step": 8428 + }, + { + "epoch": 0.6081310198044804, + "grad_norm": 21.64731606284274, + "learning_rate": 1.4057490433813474e-06, + "loss": 0.9064, + "step": 8429 + }, + { + "epoch": 0.6082031672739079, + "grad_norm": 4.566839166617857, + "learning_rate": 1.4053028064960125e-06, + "loss": 0.9362, + "step": 8430 + }, + { + "epoch": 0.6082753147433354, + "grad_norm": 0.7104699410369198, + "learning_rate": 1.4048566020848886e-06, + "loss": 0.811, + "step": 8431 + }, + { + "epoch": 0.6083474622127629, + "grad_norm": 4.448890787819292, + "learning_rate": 1.4044104301723437e-06, + "loss": 0.9851, + "step": 8432 + }, + { + "epoch": 0.6084196096821904, + "grad_norm": 0.8307995701596356, + "learning_rate": 1.4039642907827403e-06, + "loss": 0.8088, + "step": 8433 + }, + { + "epoch": 0.608491757151618, + "grad_norm": 4.226254041207793, + "learning_rate": 1.4035181839404397e-06, + "loss": 0.8735, + "step": 8434 + }, + { + "epoch": 0.6085639046210454, + "grad_norm": 7.776269015552773, + "learning_rate": 1.4030721096698037e-06, + "loss": 1.0041, + "step": 8435 + }, + { + "epoch": 0.6086360520904729, + "grad_norm": 4.030637236162666, + "learning_rate": 1.4026260679951897e-06, + "loss": 0.934, + "step": 8436 + }, + { + "epoch": 0.6087081995599004, + "grad_norm": 3.303006369085405, + "learning_rate": 1.4021800589409547e-06, + "loss": 0.8725, + "step": 8437 + }, + { + "epoch": 0.6087803470293279, + "grad_norm": 2.744808237989711, + "learning_rate": 1.4017340825314543e-06, + "loss": 1.0133, + "step": 8438 + }, + { + "epoch": 0.6088524944987554, + "grad_norm": 3.316311640563786, + "learning_rate": 1.4012881387910402e-06, + "loss": 0.9802, + "step": 8439 + }, + { + "epoch": 0.608924641968183, + "grad_norm": 0.7498351154279366, + "learning_rate": 1.4008422277440648e-06, + "loss": 0.7905, + "step": 8440 + }, + { + "epoch": 0.6089967894376105, + "grad_norm": 4.468916029613482, + "learning_rate": 1.4003963494148771e-06, + "loss": 0.8615, + "step": 8441 + }, + { + "epoch": 0.609068936907038, + "grad_norm": 2.504670834262776, + "learning_rate": 1.3999505038278255e-06, + "loss": 0.8949, + "step": 8442 + }, + { + "epoch": 0.6091410843764655, + "grad_norm": 3.8819914473567656, + "learning_rate": 1.3995046910072551e-06, + "loss": 0.7056, + "step": 8443 + }, + { + "epoch": 0.609213231845893, + "grad_norm": 8.411132755808806, + "learning_rate": 1.3990589109775109e-06, + "loss": 0.7476, + "step": 8444 + }, + { + "epoch": 0.6092853793153206, + "grad_norm": 5.63965376420203, + "learning_rate": 1.3986131637629351e-06, + "loss": 1.0041, + "step": 8445 + }, + { + "epoch": 0.609357526784748, + "grad_norm": 3.4112587596183968, + "learning_rate": 1.3981674493878672e-06, + "loss": 0.954, + "step": 8446 + }, + { + "epoch": 0.6094296742541755, + "grad_norm": 3.6319735306855003, + "learning_rate": 1.3977217678766482e-06, + "loss": 0.9216, + "step": 8447 + }, + { + "epoch": 0.609501821723603, + "grad_norm": 5.579434452178137, + "learning_rate": 1.3972761192536133e-06, + "loss": 1.0535, + "step": 8448 + }, + { + "epoch": 0.6095739691930305, + "grad_norm": 3.5625567515353573, + "learning_rate": 1.3968305035430984e-06, + "loss": 0.8706, + "step": 8449 + }, + { + "epoch": 0.609646116662458, + "grad_norm": 3.1221310988764994, + "learning_rate": 1.3963849207694371e-06, + "loss": 0.9622, + "step": 8450 + }, + { + "epoch": 0.6097182641318856, + "grad_norm": 2.9229254644745946, + "learning_rate": 1.3959393709569604e-06, + "loss": 0.8802, + "step": 8451 + }, + { + "epoch": 0.6097904116013131, + "grad_norm": 5.452343903921584, + "learning_rate": 1.3954938541299988e-06, + "loss": 0.9764, + "step": 8452 + }, + { + "epoch": 0.6098625590707406, + "grad_norm": 12.10438090062043, + "learning_rate": 1.3950483703128795e-06, + "loss": 0.8933, + "step": 8453 + }, + { + "epoch": 0.6099347065401681, + "grad_norm": 3.885374271819473, + "learning_rate": 1.3946029195299301e-06, + "loss": 0.922, + "step": 8454 + }, + { + "epoch": 0.6100068540095956, + "grad_norm": 3.2072734485755947, + "learning_rate": 1.3941575018054735e-06, + "loss": 0.8907, + "step": 8455 + }, + { + "epoch": 0.6100790014790232, + "grad_norm": 2.901495048301793, + "learning_rate": 1.393712117163833e-06, + "loss": 0.9341, + "step": 8456 + }, + { + "epoch": 0.6101511489484507, + "grad_norm": 3.1335780095551753, + "learning_rate": 1.3932667656293297e-06, + "loss": 0.9423, + "step": 8457 + }, + { + "epoch": 0.6102232964178781, + "grad_norm": 3.5928208186516235, + "learning_rate": 1.392821447226282e-06, + "loss": 0.9031, + "step": 8458 + }, + { + "epoch": 0.6102954438873056, + "grad_norm": 2.3868341246344054, + "learning_rate": 1.3923761619790075e-06, + "loss": 0.9332, + "step": 8459 + }, + { + "epoch": 0.6103675913567331, + "grad_norm": 3.318705522199674, + "learning_rate": 1.3919309099118216e-06, + "loss": 0.8056, + "step": 8460 + }, + { + "epoch": 0.6104397388261606, + "grad_norm": 5.375074874001575, + "learning_rate": 1.391485691049037e-06, + "loss": 0.9575, + "step": 8461 + }, + { + "epoch": 0.6105118862955882, + "grad_norm": 2.443835997958581, + "learning_rate": 1.391040505414967e-06, + "loss": 0.8868, + "step": 8462 + }, + { + "epoch": 0.6105840337650157, + "grad_norm": 3.090991553313242, + "learning_rate": 1.3905953530339205e-06, + "loss": 0.9331, + "step": 8463 + }, + { + "epoch": 0.6106561812344432, + "grad_norm": 4.179517738959058, + "learning_rate": 1.3901502339302055e-06, + "loss": 0.9354, + "step": 8464 + }, + { + "epoch": 0.6107283287038707, + "grad_norm": 2.302317185988117, + "learning_rate": 1.3897051481281295e-06, + "loss": 0.8965, + "step": 8465 + }, + { + "epoch": 0.6108004761732982, + "grad_norm": 3.3577541965767694, + "learning_rate": 1.3892600956519953e-06, + "loss": 0.9241, + "step": 8466 + }, + { + "epoch": 0.6108726236427258, + "grad_norm": 29.948001811473166, + "learning_rate": 1.3888150765261063e-06, + "loss": 0.8855, + "step": 8467 + }, + { + "epoch": 0.6109447711121533, + "grad_norm": 4.096210002686434, + "learning_rate": 1.3883700907747637e-06, + "loss": 0.8875, + "step": 8468 + }, + { + "epoch": 0.6110169185815808, + "grad_norm": 5.923282989362429, + "learning_rate": 1.3879251384222665e-06, + "loss": 1.0401, + "step": 8469 + }, + { + "epoch": 0.6110890660510082, + "grad_norm": 5.900848017380108, + "learning_rate": 1.387480219492911e-06, + "loss": 0.9831, + "step": 8470 + }, + { + "epoch": 0.6111612135204357, + "grad_norm": 2.48452067397942, + "learning_rate": 1.3870353340109941e-06, + "loss": 1.0112, + "step": 8471 + }, + { + "epoch": 0.6112333609898633, + "grad_norm": 4.532999897830406, + "learning_rate": 1.3865904820008085e-06, + "loss": 0.886, + "step": 8472 + }, + { + "epoch": 0.6113055084592908, + "grad_norm": 4.61412233184042, + "learning_rate": 1.386145663486645e-06, + "loss": 0.9799, + "step": 8473 + }, + { + "epoch": 0.6113776559287183, + "grad_norm": 3.3093564934433704, + "learning_rate": 1.3857008784927952e-06, + "loss": 0.8405, + "step": 8474 + }, + { + "epoch": 0.6114498033981458, + "grad_norm": 5.76600773892659, + "learning_rate": 1.3852561270435458e-06, + "loss": 0.9309, + "step": 8475 + }, + { + "epoch": 0.6115219508675733, + "grad_norm": 3.4295401226521407, + "learning_rate": 1.384811409163184e-06, + "loss": 0.8707, + "step": 8476 + }, + { + "epoch": 0.6115940983370008, + "grad_norm": 3.0441289019667925, + "learning_rate": 1.384366724875994e-06, + "loss": 0.9641, + "step": 8477 + }, + { + "epoch": 0.6116662458064284, + "grad_norm": 6.608861303544824, + "learning_rate": 1.3839220742062576e-06, + "loss": 0.9034, + "step": 8478 + }, + { + "epoch": 0.6117383932758559, + "grad_norm": 5.927505916108228, + "learning_rate": 1.3834774571782564e-06, + "loss": 1.0151, + "step": 8479 + }, + { + "epoch": 0.6118105407452834, + "grad_norm": 9.317720524435382, + "learning_rate": 1.3830328738162686e-06, + "loss": 0.7825, + "step": 8480 + }, + { + "epoch": 0.6118826882147109, + "grad_norm": 4.338386572561108, + "learning_rate": 1.3825883241445725e-06, + "loss": 0.9336, + "step": 8481 + }, + { + "epoch": 0.6119548356841383, + "grad_norm": 2.893733317448622, + "learning_rate": 1.3821438081874418e-06, + "loss": 0.9434, + "step": 8482 + }, + { + "epoch": 0.6120269831535659, + "grad_norm": 2.8233487595005755, + "learning_rate": 1.3816993259691506e-06, + "loss": 1.0413, + "step": 8483 + }, + { + "epoch": 0.6120991306229934, + "grad_norm": 11.440341283869387, + "learning_rate": 1.3812548775139706e-06, + "loss": 0.8781, + "step": 8484 + }, + { + "epoch": 0.6121712780924209, + "grad_norm": 4.164381061556987, + "learning_rate": 1.3808104628461703e-06, + "loss": 0.9025, + "step": 8485 + }, + { + "epoch": 0.6122434255618484, + "grad_norm": 4.2489274859526365, + "learning_rate": 1.3803660819900194e-06, + "loss": 0.9304, + "step": 8486 + }, + { + "epoch": 0.6123155730312759, + "grad_norm": 5.0894986062708005, + "learning_rate": 1.3799217349697832e-06, + "loss": 0.9238, + "step": 8487 + }, + { + "epoch": 0.6123877205007034, + "grad_norm": 3.8859138613826656, + "learning_rate": 1.3794774218097246e-06, + "loss": 0.8227, + "step": 8488 + }, + { + "epoch": 0.612459867970131, + "grad_norm": 0.8046776113550662, + "learning_rate": 1.3790331425341072e-06, + "loss": 0.7862, + "step": 8489 + }, + { + "epoch": 0.6125320154395585, + "grad_norm": 7.323054855439189, + "learning_rate": 1.3785888971671913e-06, + "loss": 0.8532, + "step": 8490 + }, + { + "epoch": 0.612604162908986, + "grad_norm": 4.114223854038885, + "learning_rate": 1.3781446857332353e-06, + "loss": 0.9426, + "step": 8491 + }, + { + "epoch": 0.6126763103784135, + "grad_norm": 0.8041352163763152, + "learning_rate": 1.3777005082564959e-06, + "loss": 0.7714, + "step": 8492 + }, + { + "epoch": 0.6127484578478409, + "grad_norm": 3.717538844754128, + "learning_rate": 1.3772563647612288e-06, + "loss": 0.9151, + "step": 8493 + }, + { + "epoch": 0.6128206053172685, + "grad_norm": 7.054733191729801, + "learning_rate": 1.3768122552716854e-06, + "loss": 0.9548, + "step": 8494 + }, + { + "epoch": 0.612892752786696, + "grad_norm": 4.083589480923132, + "learning_rate": 1.376368179812118e-06, + "loss": 0.8991, + "step": 8495 + }, + { + "epoch": 0.6129649002561235, + "grad_norm": 5.067131090629492, + "learning_rate": 1.375924138406776e-06, + "loss": 0.9044, + "step": 8496 + }, + { + "epoch": 0.613037047725551, + "grad_norm": 4.422886816934179, + "learning_rate": 1.375480131079906e-06, + "loss": 0.9089, + "step": 8497 + }, + { + "epoch": 0.6131091951949785, + "grad_norm": 2.9588096844060714, + "learning_rate": 1.3750361578557548e-06, + "loss": 0.9188, + "step": 8498 + }, + { + "epoch": 0.613181342664406, + "grad_norm": 2.2685088646900295, + "learning_rate": 1.3745922187585655e-06, + "loss": 1.0164, + "step": 8499 + }, + { + "epoch": 0.6132534901338336, + "grad_norm": 14.134588987125198, + "learning_rate": 1.374148313812579e-06, + "loss": 0.7443, + "step": 8500 + }, + { + "epoch": 0.6133256376032611, + "grad_norm": 0.7144199636865245, + "learning_rate": 1.3737044430420372e-06, + "loss": 0.8042, + "step": 8501 + }, + { + "epoch": 0.6133977850726886, + "grad_norm": 3.1653256002662165, + "learning_rate": 1.373260606471177e-06, + "loss": 0.7944, + "step": 8502 + }, + { + "epoch": 0.6134699325421161, + "grad_norm": 3.0276048904506454, + "learning_rate": 1.3728168041242347e-06, + "loss": 0.9412, + "step": 8503 + }, + { + "epoch": 0.6135420800115436, + "grad_norm": 5.043085618441515, + "learning_rate": 1.3723730360254455e-06, + "loss": 0.9283, + "step": 8504 + }, + { + "epoch": 0.613614227480971, + "grad_norm": 3.15935537330274, + "learning_rate": 1.371929302199041e-06, + "loss": 0.9904, + "step": 8505 + }, + { + "epoch": 0.6136863749503986, + "grad_norm": 4.469961148974322, + "learning_rate": 1.3714856026692526e-06, + "loss": 0.9666, + "step": 8506 + }, + { + "epoch": 0.6137585224198261, + "grad_norm": 3.371024439376556, + "learning_rate": 1.3710419374603083e-06, + "loss": 0.8878, + "step": 8507 + }, + { + "epoch": 0.6138306698892536, + "grad_norm": 2.947855451148217, + "learning_rate": 1.370598306596436e-06, + "loss": 0.9878, + "step": 8508 + }, + { + "epoch": 0.6139028173586811, + "grad_norm": 3.23203845559272, + "learning_rate": 1.3701547101018598e-06, + "loss": 0.9718, + "step": 8509 + }, + { + "epoch": 0.6139749648281086, + "grad_norm": 4.414496314412431, + "learning_rate": 1.3697111480008032e-06, + "loss": 0.9386, + "step": 8510 + }, + { + "epoch": 0.6140471122975362, + "grad_norm": 4.456411761628919, + "learning_rate": 1.3692676203174883e-06, + "loss": 0.8638, + "step": 8511 + }, + { + "epoch": 0.6141192597669637, + "grad_norm": 3.984301757139306, + "learning_rate": 1.3688241270761327e-06, + "loss": 0.9097, + "step": 8512 + }, + { + "epoch": 0.6141914072363912, + "grad_norm": 4.907185070564637, + "learning_rate": 1.3683806683009555e-06, + "loss": 0.9764, + "step": 8513 + }, + { + "epoch": 0.6142635547058187, + "grad_norm": 0.7547186787800994, + "learning_rate": 1.367937244016172e-06, + "loss": 0.8052, + "step": 8514 + }, + { + "epoch": 0.6143357021752462, + "grad_norm": 3.240766466634053, + "learning_rate": 1.3674938542459955e-06, + "loss": 0.921, + "step": 8515 + }, + { + "epoch": 0.6144078496446738, + "grad_norm": 4.357186373062278, + "learning_rate": 1.3670504990146388e-06, + "loss": 0.9134, + "step": 8516 + }, + { + "epoch": 0.6144799971141012, + "grad_norm": 3.420320061770707, + "learning_rate": 1.3666071783463104e-06, + "loss": 1.0416, + "step": 8517 + }, + { + "epoch": 0.6145521445835287, + "grad_norm": 2.4617972673480772, + "learning_rate": 1.3661638922652196e-06, + "loss": 0.8976, + "step": 8518 + }, + { + "epoch": 0.6146242920529562, + "grad_norm": 4.593256086928154, + "learning_rate": 1.3657206407955721e-06, + "loss": 0.7738, + "step": 8519 + }, + { + "epoch": 0.6146964395223837, + "grad_norm": 3.8901963821304095, + "learning_rate": 1.3652774239615728e-06, + "loss": 0.9791, + "step": 8520 + }, + { + "epoch": 0.6147685869918113, + "grad_norm": 3.0860612675437427, + "learning_rate": 1.3648342417874235e-06, + "loss": 0.9027, + "step": 8521 + }, + { + "epoch": 0.6148407344612388, + "grad_norm": 5.149755468396275, + "learning_rate": 1.3643910942973248e-06, + "loss": 0.9361, + "step": 8522 + }, + { + "epoch": 0.6149128819306663, + "grad_norm": 3.102289916863265, + "learning_rate": 1.363947981515476e-06, + "loss": 1.0057, + "step": 8523 + }, + { + "epoch": 0.6149850294000938, + "grad_norm": 5.0083552169110135, + "learning_rate": 1.3635049034660723e-06, + "loss": 0.8704, + "step": 8524 + }, + { + "epoch": 0.6150571768695213, + "grad_norm": 2.200513333339814, + "learning_rate": 1.3630618601733106e-06, + "loss": 0.9187, + "step": 8525 + }, + { + "epoch": 0.6151293243389488, + "grad_norm": 2.3260715312438554, + "learning_rate": 1.3626188516613828e-06, + "loss": 0.8929, + "step": 8526 + }, + { + "epoch": 0.6152014718083764, + "grad_norm": 5.802283712199488, + "learning_rate": 1.3621758779544792e-06, + "loss": 0.9707, + "step": 8527 + }, + { + "epoch": 0.6152736192778039, + "grad_norm": 3.308729941347294, + "learning_rate": 1.3617329390767905e-06, + "loss": 0.8847, + "step": 8528 + }, + { + "epoch": 0.6153457667472313, + "grad_norm": 4.009133401966768, + "learning_rate": 1.3612900350525029e-06, + "loss": 0.8284, + "step": 8529 + }, + { + "epoch": 0.6154179142166588, + "grad_norm": 3.329846577131428, + "learning_rate": 1.360847165905802e-06, + "loss": 0.9375, + "step": 8530 + }, + { + "epoch": 0.6154900616860863, + "grad_norm": 3.799689235026449, + "learning_rate": 1.360404331660872e-06, + "loss": 1.0151, + "step": 8531 + }, + { + "epoch": 0.6155622091555139, + "grad_norm": 3.627099711589145, + "learning_rate": 1.3599615323418929e-06, + "loss": 0.9447, + "step": 8532 + }, + { + "epoch": 0.6156343566249414, + "grad_norm": 5.0351458328174825, + "learning_rate": 1.3595187679730455e-06, + "loss": 0.8964, + "step": 8533 + }, + { + "epoch": 0.6157065040943689, + "grad_norm": 6.3462838059569915, + "learning_rate": 1.359076038578507e-06, + "loss": 1.0351, + "step": 8534 + }, + { + "epoch": 0.6157786515637964, + "grad_norm": 5.721027858656299, + "learning_rate": 1.3586333441824537e-06, + "loss": 0.9506, + "step": 8535 + }, + { + "epoch": 0.6158507990332239, + "grad_norm": 4.7930612718377406, + "learning_rate": 1.3581906848090588e-06, + "loss": 0.8979, + "step": 8536 + }, + { + "epoch": 0.6159229465026514, + "grad_norm": 4.5537353357900425, + "learning_rate": 1.3577480604824954e-06, + "loss": 0.9615, + "step": 8537 + }, + { + "epoch": 0.615995093972079, + "grad_norm": 23.57886922077623, + "learning_rate": 1.3573054712269328e-06, + "loss": 0.8977, + "step": 8538 + }, + { + "epoch": 0.6160672414415065, + "grad_norm": 3.148394790246138, + "learning_rate": 1.3568629170665382e-06, + "loss": 0.9971, + "step": 8539 + }, + { + "epoch": 0.6161393889109339, + "grad_norm": 5.102747360431403, + "learning_rate": 1.35642039802548e-06, + "loss": 0.8909, + "step": 8540 + }, + { + "epoch": 0.6162115363803614, + "grad_norm": 2.824019175715627, + "learning_rate": 1.355977914127921e-06, + "loss": 0.8703, + "step": 8541 + }, + { + "epoch": 0.6162836838497889, + "grad_norm": 7.718763343706836, + "learning_rate": 1.355535465398024e-06, + "loss": 0.9178, + "step": 8542 + }, + { + "epoch": 0.6163558313192165, + "grad_norm": 6.199860220533234, + "learning_rate": 1.35509305185995e-06, + "loss": 0.9541, + "step": 8543 + }, + { + "epoch": 0.616427978788644, + "grad_norm": 2.844252992518285, + "learning_rate": 1.3546506735378564e-06, + "loss": 0.8748, + "step": 8544 + }, + { + "epoch": 0.6165001262580715, + "grad_norm": 2.353764311128565, + "learning_rate": 1.3542083304559007e-06, + "loss": 0.9169, + "step": 8545 + }, + { + "epoch": 0.616572273727499, + "grad_norm": 4.518087909993204, + "learning_rate": 1.3537660226382373e-06, + "loss": 0.9421, + "step": 8546 + }, + { + "epoch": 0.6166444211969265, + "grad_norm": 3.193503317665633, + "learning_rate": 1.3533237501090197e-06, + "loss": 0.8951, + "step": 8547 + }, + { + "epoch": 0.616716568666354, + "grad_norm": 3.892509034917461, + "learning_rate": 1.3528815128923982e-06, + "loss": 0.9093, + "step": 8548 + }, + { + "epoch": 0.6167887161357816, + "grad_norm": 4.774286722165347, + "learning_rate": 1.3524393110125208e-06, + "loss": 0.8599, + "step": 8549 + }, + { + "epoch": 0.6168608636052091, + "grad_norm": 3.6064049857700007, + "learning_rate": 1.3519971444935364e-06, + "loss": 0.8513, + "step": 8550 + }, + { + "epoch": 0.6169330110746366, + "grad_norm": 2.0628883689748125, + "learning_rate": 1.351555013359588e-06, + "loss": 0.9249, + "step": 8551 + }, + { + "epoch": 0.617005158544064, + "grad_norm": 8.215031816231708, + "learning_rate": 1.3511129176348212e-06, + "loss": 0.9495, + "step": 8552 + }, + { + "epoch": 0.6170773060134915, + "grad_norm": 4.382121584147007, + "learning_rate": 1.3506708573433754e-06, + "loss": 0.8075, + "step": 8553 + }, + { + "epoch": 0.617149453482919, + "grad_norm": 2.8047084966952136, + "learning_rate": 1.3502288325093895e-06, + "loss": 0.9188, + "step": 8554 + }, + { + "epoch": 0.6172216009523466, + "grad_norm": 2.820835139068822, + "learning_rate": 1.3497868431570029e-06, + "loss": 0.9771, + "step": 8555 + }, + { + "epoch": 0.6172937484217741, + "grad_norm": 6.547842459562092, + "learning_rate": 1.3493448893103489e-06, + "loss": 0.8844, + "step": 8556 + }, + { + "epoch": 0.6173658958912016, + "grad_norm": 2.9589584299089435, + "learning_rate": 1.3489029709935621e-06, + "loss": 0.9322, + "step": 8557 + }, + { + "epoch": 0.6174380433606291, + "grad_norm": 2.692231648282955, + "learning_rate": 1.3484610882307739e-06, + "loss": 0.9103, + "step": 8558 + }, + { + "epoch": 0.6175101908300566, + "grad_norm": 10.51864240640067, + "learning_rate": 1.348019241046114e-06, + "loss": 0.8212, + "step": 8559 + }, + { + "epoch": 0.6175823382994842, + "grad_norm": 7.379496609853802, + "learning_rate": 1.3475774294637097e-06, + "loss": 0.8102, + "step": 8560 + }, + { + "epoch": 0.6176544857689117, + "grad_norm": 5.079285662130494, + "learning_rate": 1.3471356535076864e-06, + "loss": 1.0099, + "step": 8561 + }, + { + "epoch": 0.6177266332383392, + "grad_norm": 3.637861914179897, + "learning_rate": 1.3466939132021686e-06, + "loss": 0.8602, + "step": 8562 + }, + { + "epoch": 0.6177987807077667, + "grad_norm": 0.7766507764413202, + "learning_rate": 1.3462522085712771e-06, + "loss": 0.8535, + "step": 8563 + }, + { + "epoch": 0.6178709281771941, + "grad_norm": 9.12851887616817, + "learning_rate": 1.345810539639133e-06, + "loss": 0.793, + "step": 8564 + }, + { + "epoch": 0.6179430756466217, + "grad_norm": 2.3941860230147265, + "learning_rate": 1.345368906429854e-06, + "loss": 0.8368, + "step": 8565 + }, + { + "epoch": 0.6180152231160492, + "grad_norm": 3.9598877247393647, + "learning_rate": 1.3449273089675547e-06, + "loss": 0.8953, + "step": 8566 + }, + { + "epoch": 0.6180873705854767, + "grad_norm": 2.6261472465680096, + "learning_rate": 1.3444857472763507e-06, + "loss": 0.9203, + "step": 8567 + }, + { + "epoch": 0.6181595180549042, + "grad_norm": 2.129719710018999, + "learning_rate": 1.3440442213803533e-06, + "loss": 0.8894, + "step": 8568 + }, + { + "epoch": 0.6182316655243317, + "grad_norm": 4.842186854000239, + "learning_rate": 1.3436027313036723e-06, + "loss": 0.8947, + "step": 8569 + }, + { + "epoch": 0.6183038129937592, + "grad_norm": 2.687742710243085, + "learning_rate": 1.343161277070417e-06, + "loss": 0.9859, + "step": 8570 + }, + { + "epoch": 0.6183759604631868, + "grad_norm": 3.2919519196217117, + "learning_rate": 1.3427198587046924e-06, + "loss": 1.029, + "step": 8571 + }, + { + "epoch": 0.6184481079326143, + "grad_norm": 6.5252342445157305, + "learning_rate": 1.3422784762306036e-06, + "loss": 0.9528, + "step": 8572 + }, + { + "epoch": 0.6185202554020418, + "grad_norm": 3.245824038321261, + "learning_rate": 1.3418371296722514e-06, + "loss": 0.8474, + "step": 8573 + }, + { + "epoch": 0.6185924028714693, + "grad_norm": 4.724312844340041, + "learning_rate": 1.341395819053738e-06, + "loss": 0.8915, + "step": 8574 + }, + { + "epoch": 0.6186645503408968, + "grad_norm": 2.9567800034651888, + "learning_rate": 1.3409545443991607e-06, + "loss": 0.9601, + "step": 8575 + }, + { + "epoch": 0.6187366978103243, + "grad_norm": 2.7478182547769423, + "learning_rate": 1.3405133057326155e-06, + "loss": 0.9352, + "step": 8576 + }, + { + "epoch": 0.6188088452797518, + "grad_norm": 2.831689432496837, + "learning_rate": 1.340072103078198e-06, + "loss": 0.9214, + "step": 8577 + }, + { + "epoch": 0.6188809927491793, + "grad_norm": 6.755011499537711, + "learning_rate": 1.339630936459999e-06, + "loss": 0.8605, + "step": 8578 + }, + { + "epoch": 0.6189531402186068, + "grad_norm": 3.2381927514954083, + "learning_rate": 1.339189805902111e-06, + "loss": 1.0212, + "step": 8579 + }, + { + "epoch": 0.6190252876880343, + "grad_norm": 9.294374581688974, + "learning_rate": 1.338748711428621e-06, + "loss": 0.9024, + "step": 8580 + }, + { + "epoch": 0.6190974351574619, + "grad_norm": 3.7156663178503946, + "learning_rate": 1.3383076530636159e-06, + "loss": 0.9034, + "step": 8581 + }, + { + "epoch": 0.6191695826268894, + "grad_norm": 2.2668976465045314, + "learning_rate": 1.337866630831181e-06, + "loss": 0.8789, + "step": 8582 + }, + { + "epoch": 0.6192417300963169, + "grad_norm": 0.723323246150138, + "learning_rate": 1.3374256447553977e-06, + "loss": 0.8119, + "step": 8583 + }, + { + "epoch": 0.6193138775657444, + "grad_norm": 3.3823147378895744, + "learning_rate": 1.3369846948603475e-06, + "loss": 0.9556, + "step": 8584 + }, + { + "epoch": 0.6193860250351719, + "grad_norm": 3.6258955539866853, + "learning_rate": 1.3365437811701083e-06, + "loss": 0.9487, + "step": 8585 + }, + { + "epoch": 0.6194581725045994, + "grad_norm": 3.1306507138631603, + "learning_rate": 1.3361029037087574e-06, + "loss": 0.8701, + "step": 8586 + }, + { + "epoch": 0.619530319974027, + "grad_norm": 5.489015534175853, + "learning_rate": 1.3356620625003699e-06, + "loss": 0.9636, + "step": 8587 + }, + { + "epoch": 0.6196024674434544, + "grad_norm": 3.6842501268911643, + "learning_rate": 1.3352212575690165e-06, + "loss": 0.8602, + "step": 8588 + }, + { + "epoch": 0.6196746149128819, + "grad_norm": 3.663875355445277, + "learning_rate": 1.3347804889387705e-06, + "loss": 0.9209, + "step": 8589 + }, + { + "epoch": 0.6197467623823094, + "grad_norm": 3.212284297603556, + "learning_rate": 1.3343397566336983e-06, + "loss": 0.8952, + "step": 8590 + }, + { + "epoch": 0.6198189098517369, + "grad_norm": 4.087180654200796, + "learning_rate": 1.3338990606778689e-06, + "loss": 0.9578, + "step": 8591 + }, + { + "epoch": 0.6198910573211645, + "grad_norm": 2.6217038486347555, + "learning_rate": 1.3334584010953458e-06, + "loss": 0.9745, + "step": 8592 + }, + { + "epoch": 0.619963204790592, + "grad_norm": 3.9350488693130403, + "learning_rate": 1.3330177779101913e-06, + "loss": 0.943, + "step": 8593 + }, + { + "epoch": 0.6200353522600195, + "grad_norm": 6.110221940675039, + "learning_rate": 1.3325771911464676e-06, + "loss": 0.894, + "step": 8594 + }, + { + "epoch": 0.620107499729447, + "grad_norm": 3.9404589721650343, + "learning_rate": 1.3321366408282325e-06, + "loss": 0.8972, + "step": 8595 + }, + { + "epoch": 0.6201796471988745, + "grad_norm": 3.1887183759280853, + "learning_rate": 1.3316961269795432e-06, + "loss": 0.9447, + "step": 8596 + }, + { + "epoch": 0.620251794668302, + "grad_norm": 9.883714524670154, + "learning_rate": 1.3312556496244543e-06, + "loss": 0.9702, + "step": 8597 + }, + { + "epoch": 0.6203239421377296, + "grad_norm": 3.565867388524697, + "learning_rate": 1.3308152087870187e-06, + "loss": 0.8276, + "step": 8598 + }, + { + "epoch": 0.620396089607157, + "grad_norm": 10.569188996440571, + "learning_rate": 1.3303748044912875e-06, + "loss": 0.9434, + "step": 8599 + }, + { + "epoch": 0.6204682370765845, + "grad_norm": 3.7074591788618854, + "learning_rate": 1.3299344367613085e-06, + "loss": 0.96, + "step": 8600 + }, + { + "epoch": 0.620540384546012, + "grad_norm": 3.882571913328897, + "learning_rate": 1.3294941056211306e-06, + "loss": 0.9278, + "step": 8601 + }, + { + "epoch": 0.6206125320154395, + "grad_norm": 4.080625972972631, + "learning_rate": 1.3290538110947965e-06, + "loss": 0.9171, + "step": 8602 + }, + { + "epoch": 0.620684679484867, + "grad_norm": 5.63576223990866, + "learning_rate": 1.3286135532063508e-06, + "loss": 0.8742, + "step": 8603 + }, + { + "epoch": 0.6207568269542946, + "grad_norm": 8.284110097200488, + "learning_rate": 1.3281733319798337e-06, + "loss": 0.9025, + "step": 8604 + }, + { + "epoch": 0.6208289744237221, + "grad_norm": 3.282024909660101, + "learning_rate": 1.3277331474392828e-06, + "loss": 0.8524, + "step": 8605 + }, + { + "epoch": 0.6209011218931496, + "grad_norm": 2.9305233182736004, + "learning_rate": 1.3272929996087373e-06, + "loss": 0.9103, + "step": 8606 + }, + { + "epoch": 0.6209732693625771, + "grad_norm": 2.9951838935859154, + "learning_rate": 1.3268528885122302e-06, + "loss": 0.842, + "step": 8607 + }, + { + "epoch": 0.6210454168320046, + "grad_norm": 5.854297278995949, + "learning_rate": 1.3264128141737953e-06, + "loss": 1.011, + "step": 8608 + }, + { + "epoch": 0.6211175643014322, + "grad_norm": 2.6758997535916724, + "learning_rate": 1.3259727766174627e-06, + "loss": 0.9691, + "step": 8609 + }, + { + "epoch": 0.6211897117708597, + "grad_norm": 3.5261506928229958, + "learning_rate": 1.3255327758672614e-06, + "loss": 0.9204, + "step": 8610 + }, + { + "epoch": 0.6212618592402871, + "grad_norm": 6.080280147422697, + "learning_rate": 1.3250928119472193e-06, + "loss": 0.8611, + "step": 8611 + }, + { + "epoch": 0.6213340067097146, + "grad_norm": 2.9158697583660875, + "learning_rate": 1.3246528848813591e-06, + "loss": 0.9776, + "step": 8612 + }, + { + "epoch": 0.6214061541791421, + "grad_norm": 3.174164946069816, + "learning_rate": 1.3242129946937057e-06, + "loss": 0.8711, + "step": 8613 + }, + { + "epoch": 0.6214783016485697, + "grad_norm": 3.127236290904931, + "learning_rate": 1.3237731414082794e-06, + "loss": 0.876, + "step": 8614 + }, + { + "epoch": 0.6215504491179972, + "grad_norm": 3.288056063310976, + "learning_rate": 1.3233333250490972e-06, + "loss": 0.9884, + "step": 8615 + }, + { + "epoch": 0.6216225965874247, + "grad_norm": 3.749702441807831, + "learning_rate": 1.3228935456401785e-06, + "loss": 0.8947, + "step": 8616 + }, + { + "epoch": 0.6216947440568522, + "grad_norm": 3.10392721050491, + "learning_rate": 1.3224538032055352e-06, + "loss": 0.8332, + "step": 8617 + }, + { + "epoch": 0.6217668915262797, + "grad_norm": 6.298136771651882, + "learning_rate": 1.3220140977691828e-06, + "loss": 0.9324, + "step": 8618 + }, + { + "epoch": 0.6218390389957072, + "grad_norm": 8.12189177701831, + "learning_rate": 1.3215744293551303e-06, + "loss": 0.9633, + "step": 8619 + }, + { + "epoch": 0.6219111864651348, + "grad_norm": 3.9130826473579257, + "learning_rate": 1.321134797987387e-06, + "loss": 0.8975, + "step": 8620 + }, + { + "epoch": 0.6219833339345623, + "grad_norm": 2.9903498252941443, + "learning_rate": 1.320695203689959e-06, + "loss": 0.9893, + "step": 8621 + }, + { + "epoch": 0.6220554814039898, + "grad_norm": 3.3452682613350175, + "learning_rate": 1.3202556464868514e-06, + "loss": 0.9072, + "step": 8622 + }, + { + "epoch": 0.6221276288734172, + "grad_norm": 2.729493847131651, + "learning_rate": 1.319816126402067e-06, + "loss": 0.8645, + "step": 8623 + }, + { + "epoch": 0.6221997763428447, + "grad_norm": 3.0381535835545947, + "learning_rate": 1.3193766434596048e-06, + "loss": 0.8988, + "step": 8624 + }, + { + "epoch": 0.6222719238122723, + "grad_norm": 5.394746216752375, + "learning_rate": 1.3189371976834655e-06, + "loss": 0.7783, + "step": 8625 + }, + { + "epoch": 0.6223440712816998, + "grad_norm": 6.619837638961467, + "learning_rate": 1.3184977890976448e-06, + "loss": 0.8476, + "step": 8626 + }, + { + "epoch": 0.6224162187511273, + "grad_norm": 3.5893430806925557, + "learning_rate": 1.3180584177261361e-06, + "loss": 0.9902, + "step": 8627 + }, + { + "epoch": 0.6224883662205548, + "grad_norm": 2.842880451438195, + "learning_rate": 1.3176190835929334e-06, + "loss": 0.9426, + "step": 8628 + }, + { + "epoch": 0.6225605136899823, + "grad_norm": 0.8369216829573934, + "learning_rate": 1.3171797867220255e-06, + "loss": 0.8352, + "step": 8629 + }, + { + "epoch": 0.6226326611594099, + "grad_norm": 3.2669803416884093, + "learning_rate": 1.3167405271374031e-06, + "loss": 0.9568, + "step": 8630 + }, + { + "epoch": 0.6227048086288374, + "grad_norm": 3.164532963190946, + "learning_rate": 1.3163013048630508e-06, + "loss": 0.8974, + "step": 8631 + }, + { + "epoch": 0.6227769560982649, + "grad_norm": 3.10408835770999, + "learning_rate": 1.315862119922953e-06, + "loss": 0.878, + "step": 8632 + }, + { + "epoch": 0.6228491035676924, + "grad_norm": 3.2171705500270615, + "learning_rate": 1.3154229723410922e-06, + "loss": 0.9145, + "step": 8633 + }, + { + "epoch": 0.6229212510371199, + "grad_norm": 2.979197219656396, + "learning_rate": 1.3149838621414486e-06, + "loss": 0.8836, + "step": 8634 + }, + { + "epoch": 0.6229933985065473, + "grad_norm": 2.381336492935055, + "learning_rate": 1.3145447893480008e-06, + "loss": 0.9702, + "step": 8635 + }, + { + "epoch": 0.6230655459759749, + "grad_norm": 3.726469090728551, + "learning_rate": 1.3141057539847244e-06, + "loss": 0.817, + "step": 8636 + }, + { + "epoch": 0.6231376934454024, + "grad_norm": 2.757378822354793, + "learning_rate": 1.3136667560755936e-06, + "loss": 0.814, + "step": 8637 + }, + { + "epoch": 0.6232098409148299, + "grad_norm": 4.1988892040020955, + "learning_rate": 1.3132277956445808e-06, + "loss": 0.8517, + "step": 8638 + }, + { + "epoch": 0.6232819883842574, + "grad_norm": 2.470522666643422, + "learning_rate": 1.312788872715655e-06, + "loss": 0.9074, + "step": 8639 + }, + { + "epoch": 0.6233541358536849, + "grad_norm": 2.5703534989725005, + "learning_rate": 1.3123499873127858e-06, + "loss": 0.9861, + "step": 8640 + }, + { + "epoch": 0.6234262833231125, + "grad_norm": 2.764884720342725, + "learning_rate": 1.3119111394599374e-06, + "loss": 0.9176, + "step": 8641 + }, + { + "epoch": 0.62349843079254, + "grad_norm": 2.860472421642199, + "learning_rate": 1.3114723291810753e-06, + "loss": 0.9384, + "step": 8642 + }, + { + "epoch": 0.6235705782619675, + "grad_norm": 3.3777813224048865, + "learning_rate": 1.3110335565001602e-06, + "loss": 0.9637, + "step": 8643 + }, + { + "epoch": 0.623642725731395, + "grad_norm": 4.58837035859219, + "learning_rate": 1.3105948214411519e-06, + "loss": 0.8261, + "step": 8644 + }, + { + "epoch": 0.6237148732008225, + "grad_norm": 3.0545031401667564, + "learning_rate": 1.3101561240280083e-06, + "loss": 1.0159, + "step": 8645 + }, + { + "epoch": 0.6237870206702499, + "grad_norm": 3.183862742662021, + "learning_rate": 1.309717464284685e-06, + "loss": 1.0238, + "step": 8646 + }, + { + "epoch": 0.6238591681396775, + "grad_norm": 7.50363198394214, + "learning_rate": 1.3092788422351362e-06, + "loss": 0.8203, + "step": 8647 + }, + { + "epoch": 0.623931315609105, + "grad_norm": 6.187278666535335, + "learning_rate": 1.3088402579033122e-06, + "loss": 0.8111, + "step": 8648 + }, + { + "epoch": 0.6240034630785325, + "grad_norm": 0.812442227290334, + "learning_rate": 1.3084017113131633e-06, + "loss": 0.8502, + "step": 8649 + }, + { + "epoch": 0.62407561054796, + "grad_norm": 12.812651842659697, + "learning_rate": 1.307963202488637e-06, + "loss": 0.8657, + "step": 8650 + }, + { + "epoch": 0.6241477580173875, + "grad_norm": 3.0419829391412385, + "learning_rate": 1.3075247314536776e-06, + "loss": 0.8211, + "step": 8651 + }, + { + "epoch": 0.624219905486815, + "grad_norm": 4.860147985081283, + "learning_rate": 1.3070862982322302e-06, + "loss": 0.9162, + "step": 8652 + }, + { + "epoch": 0.6242920529562426, + "grad_norm": 4.942871069632582, + "learning_rate": 1.3066479028482347e-06, + "loss": 0.8977, + "step": 8653 + }, + { + "epoch": 0.6243642004256701, + "grad_norm": 4.545088269475009, + "learning_rate": 1.3062095453256298e-06, + "loss": 0.8713, + "step": 8654 + }, + { + "epoch": 0.6244363478950976, + "grad_norm": 4.282723055061747, + "learning_rate": 1.3057712256883542e-06, + "loss": 0.8885, + "step": 8655 + }, + { + "epoch": 0.6245084953645251, + "grad_norm": 0.8087381795995149, + "learning_rate": 1.3053329439603412e-06, + "loss": 0.8219, + "step": 8656 + }, + { + "epoch": 0.6245806428339526, + "grad_norm": 3.0546141320513036, + "learning_rate": 1.3048947001655253e-06, + "loss": 0.9051, + "step": 8657 + }, + { + "epoch": 0.6246527903033801, + "grad_norm": 3.1480646030917563, + "learning_rate": 1.3044564943278364e-06, + "loss": 0.913, + "step": 8658 + }, + { + "epoch": 0.6247249377728076, + "grad_norm": 2.7151861400268484, + "learning_rate": 1.3040183264712035e-06, + "loss": 0.9339, + "step": 8659 + }, + { + "epoch": 0.6247970852422351, + "grad_norm": 5.012042040369246, + "learning_rate": 1.3035801966195532e-06, + "loss": 0.9416, + "step": 8660 + }, + { + "epoch": 0.6248692327116626, + "grad_norm": 3.3844170762367414, + "learning_rate": 1.3031421047968106e-06, + "loss": 0.9358, + "step": 8661 + }, + { + "epoch": 0.6249413801810901, + "grad_norm": 3.1155786019163823, + "learning_rate": 1.302704051026898e-06, + "loss": 0.9235, + "step": 8662 + }, + { + "epoch": 0.6250135276505177, + "grad_norm": 6.375268369523535, + "learning_rate": 1.3022660353337352e-06, + "loss": 1.0013, + "step": 8663 + }, + { + "epoch": 0.6250856751199452, + "grad_norm": 2.893248486161073, + "learning_rate": 1.301828057741242e-06, + "loss": 0.8887, + "step": 8664 + }, + { + "epoch": 0.6251578225893727, + "grad_norm": 4.166303593393575, + "learning_rate": 1.3013901182733342e-06, + "loss": 0.9368, + "step": 8665 + }, + { + "epoch": 0.6252299700588002, + "grad_norm": 2.713012253168025, + "learning_rate": 1.300952216953925e-06, + "loss": 0.8669, + "step": 8666 + }, + { + "epoch": 0.6253021175282277, + "grad_norm": 4.046166317427662, + "learning_rate": 1.300514353806928e-06, + "loss": 0.8946, + "step": 8667 + }, + { + "epoch": 0.6253742649976552, + "grad_norm": 3.2810735745910793, + "learning_rate": 1.3000765288562523e-06, + "loss": 0.8191, + "step": 8668 + }, + { + "epoch": 0.6254464124670828, + "grad_norm": 7.376362303005141, + "learning_rate": 1.2996387421258068e-06, + "loss": 0.9718, + "step": 8669 + }, + { + "epoch": 0.6255185599365102, + "grad_norm": 2.0693645106274143, + "learning_rate": 1.2992009936394973e-06, + "loss": 0.977, + "step": 8670 + }, + { + "epoch": 0.6255907074059377, + "grad_norm": 3.0960459811125363, + "learning_rate": 1.2987632834212266e-06, + "loss": 0.8976, + "step": 8671 + }, + { + "epoch": 0.6256628548753652, + "grad_norm": 4.996911048401278, + "learning_rate": 1.2983256114948972e-06, + "loss": 0.9387, + "step": 8672 + }, + { + "epoch": 0.6257350023447927, + "grad_norm": 4.416111077445838, + "learning_rate": 1.2978879778844087e-06, + "loss": 1.0019, + "step": 8673 + }, + { + "epoch": 0.6258071498142203, + "grad_norm": 0.8302284014935561, + "learning_rate": 1.297450382613659e-06, + "loss": 0.782, + "step": 8674 + }, + { + "epoch": 0.6258792972836478, + "grad_norm": 3.133508619180661, + "learning_rate": 1.2970128257065427e-06, + "loss": 1.0017, + "step": 8675 + }, + { + "epoch": 0.6259514447530753, + "grad_norm": 2.3355560501615122, + "learning_rate": 1.2965753071869534e-06, + "loss": 0.9817, + "step": 8676 + }, + { + "epoch": 0.6260235922225028, + "grad_norm": 1.9964565238584848, + "learning_rate": 1.2961378270787832e-06, + "loss": 0.844, + "step": 8677 + }, + { + "epoch": 0.6260957396919303, + "grad_norm": 2.301396389707739, + "learning_rate": 1.2957003854059202e-06, + "loss": 0.9298, + "step": 8678 + }, + { + "epoch": 0.6261678871613579, + "grad_norm": 3.2097180503540423, + "learning_rate": 1.295262982192252e-06, + "loss": 0.9088, + "step": 8679 + }, + { + "epoch": 0.6262400346307854, + "grad_norm": 4.199194903638687, + "learning_rate": 1.294825617461664e-06, + "loss": 0.9697, + "step": 8680 + }, + { + "epoch": 0.6263121821002129, + "grad_norm": 3.710441861432122, + "learning_rate": 1.2943882912380377e-06, + "loss": 0.9697, + "step": 8681 + }, + { + "epoch": 0.6263843295696403, + "grad_norm": 2.6845319460017616, + "learning_rate": 1.2939510035452555e-06, + "loss": 0.862, + "step": 8682 + }, + { + "epoch": 0.6264564770390678, + "grad_norm": 3.3616426133477497, + "learning_rate": 1.293513754407195e-06, + "loss": 0.8643, + "step": 8683 + }, + { + "epoch": 0.6265286245084953, + "grad_norm": 0.7899941336136309, + "learning_rate": 1.2930765438477327e-06, + "loss": 0.7917, + "step": 8684 + }, + { + "epoch": 0.6266007719779229, + "grad_norm": 3.4293335056314778, + "learning_rate": 1.2926393718907436e-06, + "loss": 0.8598, + "step": 8685 + }, + { + "epoch": 0.6266729194473504, + "grad_norm": 2.39035441544027, + "learning_rate": 1.2922022385601e-06, + "loss": 0.988, + "step": 8686 + }, + { + "epoch": 0.6267450669167779, + "grad_norm": 3.7979759849202903, + "learning_rate": 1.291765143879672e-06, + "loss": 0.8183, + "step": 8687 + }, + { + "epoch": 0.6268172143862054, + "grad_norm": 3.676920180262814, + "learning_rate": 1.2913280878733278e-06, + "loss": 0.9412, + "step": 8688 + }, + { + "epoch": 0.6268893618556329, + "grad_norm": 2.9072952851824563, + "learning_rate": 1.2908910705649333e-06, + "loss": 0.9165, + "step": 8689 + }, + { + "epoch": 0.6269615093250605, + "grad_norm": 2.5942688388557684, + "learning_rate": 1.2904540919783516e-06, + "loss": 0.8715, + "step": 8690 + }, + { + "epoch": 0.627033656794488, + "grad_norm": 2.3486927048626436, + "learning_rate": 1.2900171521374462e-06, + "loss": 0.8455, + "step": 8691 + }, + { + "epoch": 0.6271058042639155, + "grad_norm": 4.117433486336397, + "learning_rate": 1.2895802510660761e-06, + "loss": 0.8818, + "step": 8692 + }, + { + "epoch": 0.6271779517333429, + "grad_norm": 2.363373961088247, + "learning_rate": 1.2891433887880974e-06, + "loss": 0.8536, + "step": 8693 + }, + { + "epoch": 0.6272500992027704, + "grad_norm": 3.1506666900897367, + "learning_rate": 1.288706565327368e-06, + "loss": 0.9365, + "step": 8694 + }, + { + "epoch": 0.6273222466721979, + "grad_norm": 2.8833242236160697, + "learning_rate": 1.2882697807077389e-06, + "loss": 0.9605, + "step": 8695 + }, + { + "epoch": 0.6273943941416255, + "grad_norm": 0.6927071741041843, + "learning_rate": 1.2878330349530628e-06, + "loss": 0.7952, + "step": 8696 + }, + { + "epoch": 0.627466541611053, + "grad_norm": 3.380578552254248, + "learning_rate": 1.2873963280871887e-06, + "loss": 0.9897, + "step": 8697 + }, + { + "epoch": 0.6275386890804805, + "grad_norm": 3.6702201847891884, + "learning_rate": 1.2869596601339628e-06, + "loss": 1.0074, + "step": 8698 + }, + { + "epoch": 0.627610836549908, + "grad_norm": 2.777152053505496, + "learning_rate": 1.2865230311172299e-06, + "loss": 0.9376, + "step": 8699 + }, + { + "epoch": 0.6276829840193355, + "grad_norm": 0.6504384845159137, + "learning_rate": 1.2860864410608334e-06, + "loss": 0.7706, + "step": 8700 + }, + { + "epoch": 0.627755131488763, + "grad_norm": 0.753391821673941, + "learning_rate": 1.285649889988614e-06, + "loss": 0.7964, + "step": 8701 + }, + { + "epoch": 0.6278272789581906, + "grad_norm": 3.236076165603177, + "learning_rate": 1.2852133779244091e-06, + "loss": 0.8828, + "step": 8702 + }, + { + "epoch": 0.6278994264276181, + "grad_norm": 5.003097147152229, + "learning_rate": 1.2847769048920555e-06, + "loss": 0.9347, + "step": 8703 + }, + { + "epoch": 0.6279715738970456, + "grad_norm": 3.254669942455943, + "learning_rate": 1.284340470915388e-06, + "loss": 0.9497, + "step": 8704 + }, + { + "epoch": 0.628043721366473, + "grad_norm": 3.8033926325016423, + "learning_rate": 1.2839040760182371e-06, + "loss": 0.9168, + "step": 8705 + }, + { + "epoch": 0.6281158688359005, + "grad_norm": 2.933359190075516, + "learning_rate": 1.283467720224435e-06, + "loss": 0.8598, + "step": 8706 + }, + { + "epoch": 0.6281880163053281, + "grad_norm": 5.083919470508673, + "learning_rate": 1.2830314035578073e-06, + "loss": 0.8504, + "step": 8707 + }, + { + "epoch": 0.6282601637747556, + "grad_norm": 4.983749876613151, + "learning_rate": 1.2825951260421809e-06, + "loss": 0.8197, + "step": 8708 + }, + { + "epoch": 0.6283323112441831, + "grad_norm": 3.118744654832437, + "learning_rate": 1.2821588877013789e-06, + "loss": 0.9969, + "step": 8709 + }, + { + "epoch": 0.6284044587136106, + "grad_norm": 4.535350338695143, + "learning_rate": 1.2817226885592227e-06, + "loss": 0.9316, + "step": 8710 + }, + { + "epoch": 0.6284766061830381, + "grad_norm": 2.7644300735027865, + "learning_rate": 1.2812865286395314e-06, + "loss": 0.9254, + "step": 8711 + }, + { + "epoch": 0.6285487536524657, + "grad_norm": 2.5026466665056466, + "learning_rate": 1.2808504079661218e-06, + "loss": 0.9106, + "step": 8712 + }, + { + "epoch": 0.6286209011218932, + "grad_norm": 3.658378177330318, + "learning_rate": 1.28041432656281e-06, + "loss": 0.8982, + "step": 8713 + }, + { + "epoch": 0.6286930485913207, + "grad_norm": 3.0625878146290684, + "learning_rate": 1.2799782844534077e-06, + "loss": 0.8466, + "step": 8714 + }, + { + "epoch": 0.6287651960607482, + "grad_norm": 3.942645994203374, + "learning_rate": 1.2795422816617256e-06, + "loss": 0.8955, + "step": 8715 + }, + { + "epoch": 0.6288373435301757, + "grad_norm": 4.855530815924715, + "learning_rate": 1.279106318211573e-06, + "loss": 0.9133, + "step": 8716 + }, + { + "epoch": 0.6289094909996031, + "grad_norm": 3.0987811553484503, + "learning_rate": 1.2786703941267548e-06, + "loss": 0.9325, + "step": 8717 + }, + { + "epoch": 0.6289816384690307, + "grad_norm": 5.007867726513926, + "learning_rate": 1.278234509431077e-06, + "loss": 0.8316, + "step": 8718 + }, + { + "epoch": 0.6290537859384582, + "grad_norm": 3.8922107329837132, + "learning_rate": 1.2777986641483406e-06, + "loss": 0.9047, + "step": 8719 + }, + { + "epoch": 0.6291259334078857, + "grad_norm": 3.012698952844235, + "learning_rate": 1.277362858302345e-06, + "loss": 0.9809, + "step": 8720 + }, + { + "epoch": 0.6291980808773132, + "grad_norm": 2.816773622126149, + "learning_rate": 1.2769270919168894e-06, + "loss": 0.9511, + "step": 8721 + }, + { + "epoch": 0.6292702283467407, + "grad_norm": 3.0347375372280285, + "learning_rate": 1.276491365015768e-06, + "loss": 0.7933, + "step": 8722 + }, + { + "epoch": 0.6293423758161683, + "grad_norm": 2.418027615177298, + "learning_rate": 1.276055677622775e-06, + "loss": 0.8837, + "step": 8723 + }, + { + "epoch": 0.6294145232855958, + "grad_norm": 17.613746892408887, + "learning_rate": 1.2756200297617018e-06, + "loss": 1.0519, + "step": 8724 + }, + { + "epoch": 0.6294866707550233, + "grad_norm": 2.5281667887436825, + "learning_rate": 1.2751844214563369e-06, + "loss": 0.8444, + "step": 8725 + }, + { + "epoch": 0.6295588182244508, + "grad_norm": 3.788325350129666, + "learning_rate": 1.2747488527304677e-06, + "loss": 0.9461, + "step": 8726 + }, + { + "epoch": 0.6296309656938783, + "grad_norm": 3.564634520213161, + "learning_rate": 1.2743133236078785e-06, + "loss": 0.9042, + "step": 8727 + }, + { + "epoch": 0.6297031131633058, + "grad_norm": 3.4806961687415057, + "learning_rate": 1.273877834112353e-06, + "loss": 0.949, + "step": 8728 + }, + { + "epoch": 0.6297752606327333, + "grad_norm": 5.945913357141589, + "learning_rate": 1.2734423842676702e-06, + "loss": 0.8498, + "step": 8729 + }, + { + "epoch": 0.6298474081021608, + "grad_norm": 2.5310246873583786, + "learning_rate": 1.2730069740976098e-06, + "loss": 0.8957, + "step": 8730 + }, + { + "epoch": 0.6299195555715883, + "grad_norm": 3.572817268138177, + "learning_rate": 1.2725716036259473e-06, + "loss": 0.8308, + "step": 8731 + }, + { + "epoch": 0.6299917030410158, + "grad_norm": 5.585100188556239, + "learning_rate": 1.2721362728764558e-06, + "loss": 0.8409, + "step": 8732 + }, + { + "epoch": 0.6300638505104433, + "grad_norm": 2.61082436517936, + "learning_rate": 1.2717009818729093e-06, + "loss": 0.9756, + "step": 8733 + }, + { + "epoch": 0.6301359979798709, + "grad_norm": 4.0805268796417105, + "learning_rate": 1.2712657306390751e-06, + "loss": 1.0265, + "step": 8734 + }, + { + "epoch": 0.6302081454492984, + "grad_norm": 2.4350502223606605, + "learning_rate": 1.2708305191987218e-06, + "loss": 0.9983, + "step": 8735 + }, + { + "epoch": 0.6302802929187259, + "grad_norm": 4.6509514686350615, + "learning_rate": 1.2703953475756151e-06, + "loss": 0.882, + "step": 8736 + }, + { + "epoch": 0.6303524403881534, + "grad_norm": 3.096698473413532, + "learning_rate": 1.2699602157935169e-06, + "loss": 0.9411, + "step": 8737 + }, + { + "epoch": 0.6304245878575809, + "grad_norm": 5.922235402435911, + "learning_rate": 1.269525123876189e-06, + "loss": 0.8555, + "step": 8738 + }, + { + "epoch": 0.6304967353270085, + "grad_norm": 2.752012036940399, + "learning_rate": 1.2690900718473901e-06, + "loss": 0.9717, + "step": 8739 + }, + { + "epoch": 0.630568882796436, + "grad_norm": 3.7489754866772644, + "learning_rate": 1.268655059730877e-06, + "loss": 0.9045, + "step": 8740 + }, + { + "epoch": 0.6306410302658634, + "grad_norm": 2.221052882046304, + "learning_rate": 1.2682200875504032e-06, + "loss": 0.89, + "step": 8741 + }, + { + "epoch": 0.6307131777352909, + "grad_norm": 3.4984590680512584, + "learning_rate": 1.2677851553297213e-06, + "loss": 0.9802, + "step": 8742 + }, + { + "epoch": 0.6307853252047184, + "grad_norm": 4.040238878722835, + "learning_rate": 1.2673502630925822e-06, + "loss": 0.9589, + "step": 8743 + }, + { + "epoch": 0.6308574726741459, + "grad_norm": 4.146872129920823, + "learning_rate": 1.2669154108627322e-06, + "loss": 0.9648, + "step": 8744 + }, + { + "epoch": 0.6309296201435735, + "grad_norm": 2.929069596296949, + "learning_rate": 1.2664805986639186e-06, + "loss": 1.0022, + "step": 8745 + }, + { + "epoch": 0.631001767613001, + "grad_norm": 3.0408961395721885, + "learning_rate": 1.266045826519884e-06, + "loss": 0.9119, + "step": 8746 + }, + { + "epoch": 0.6310739150824285, + "grad_norm": 7.996942889695704, + "learning_rate": 1.2656110944543695e-06, + "loss": 0.9198, + "step": 8747 + }, + { + "epoch": 0.631146062551856, + "grad_norm": 4.6798676470438405, + "learning_rate": 1.2651764024911147e-06, + "loss": 0.9376, + "step": 8748 + }, + { + "epoch": 0.6312182100212835, + "grad_norm": 4.793070225469193, + "learning_rate": 1.2647417506538563e-06, + "loss": 0.8609, + "step": 8749 + }, + { + "epoch": 0.631290357490711, + "grad_norm": 4.005887467143087, + "learning_rate": 1.2643071389663286e-06, + "loss": 0.8861, + "step": 8750 + }, + { + "epoch": 0.6313625049601386, + "grad_norm": 4.242950876584984, + "learning_rate": 1.2638725674522652e-06, + "loss": 0.9231, + "step": 8751 + }, + { + "epoch": 0.631434652429566, + "grad_norm": 2.572645618406327, + "learning_rate": 1.2634380361353958e-06, + "loss": 0.8622, + "step": 8752 + }, + { + "epoch": 0.6315067998989935, + "grad_norm": 4.041328547121757, + "learning_rate": 1.2630035450394485e-06, + "loss": 0.8825, + "step": 8753 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 5.331169086660674, + "learning_rate": 1.2625690941881492e-06, + "loss": 0.915, + "step": 8754 + }, + { + "epoch": 0.6316510948378485, + "grad_norm": 2.764301565300157, + "learning_rate": 1.2621346836052223e-06, + "loss": 0.8452, + "step": 8755 + }, + { + "epoch": 0.6317232423072761, + "grad_norm": 2.942365148107033, + "learning_rate": 1.2617003133143878e-06, + "loss": 0.8835, + "step": 8756 + }, + { + "epoch": 0.6317953897767036, + "grad_norm": 2.537769070345179, + "learning_rate": 1.261265983339367e-06, + "loss": 0.9929, + "step": 8757 + }, + { + "epoch": 0.6318675372461311, + "grad_norm": 3.876567646532129, + "learning_rate": 1.260831693703876e-06, + "loss": 0.9502, + "step": 8758 + }, + { + "epoch": 0.6319396847155586, + "grad_norm": 3.1687748232740103, + "learning_rate": 1.2603974444316294e-06, + "loss": 0.8627, + "step": 8759 + }, + { + "epoch": 0.6320118321849861, + "grad_norm": 2.652651065332182, + "learning_rate": 1.2599632355463413e-06, + "loss": 0.9261, + "step": 8760 + }, + { + "epoch": 0.6320839796544137, + "grad_norm": 2.6565687998028467, + "learning_rate": 1.2595290670717207e-06, + "loss": 0.9123, + "step": 8761 + }, + { + "epoch": 0.6321561271238412, + "grad_norm": 2.7213137585622653, + "learning_rate": 1.2590949390314767e-06, + "loss": 0.91, + "step": 8762 + }, + { + "epoch": 0.6322282745932687, + "grad_norm": 3.1318456817941644, + "learning_rate": 1.2586608514493157e-06, + "loss": 0.9354, + "step": 8763 + }, + { + "epoch": 0.6323004220626961, + "grad_norm": 6.743304287214165, + "learning_rate": 1.258226804348941e-06, + "loss": 0.8975, + "step": 8764 + }, + { + "epoch": 0.6323725695321236, + "grad_norm": 3.343411865250197, + "learning_rate": 1.2577927977540546e-06, + "loss": 1.033, + "step": 8765 + }, + { + "epoch": 0.6324447170015511, + "grad_norm": 2.794223060922423, + "learning_rate": 1.2573588316883556e-06, + "loss": 1.0071, + "step": 8766 + }, + { + "epoch": 0.6325168644709787, + "grad_norm": 0.8380352686809739, + "learning_rate": 1.2569249061755422e-06, + "loss": 0.8214, + "step": 8767 + }, + { + "epoch": 0.6325890119404062, + "grad_norm": 5.850300916447655, + "learning_rate": 1.256491021239309e-06, + "loss": 1.0025, + "step": 8768 + }, + { + "epoch": 0.6326611594098337, + "grad_norm": 4.16640568233753, + "learning_rate": 1.2560571769033482e-06, + "loss": 0.8623, + "step": 8769 + }, + { + "epoch": 0.6327333068792612, + "grad_norm": 0.7142631382439838, + "learning_rate": 1.2556233731913514e-06, + "loss": 0.799, + "step": 8770 + }, + { + "epoch": 0.6328054543486887, + "grad_norm": 5.899191539137349, + "learning_rate": 1.2551896101270059e-06, + "loss": 0.8479, + "step": 8771 + }, + { + "epoch": 0.6328776018181163, + "grad_norm": 4.17216164965004, + "learning_rate": 1.2547558877339994e-06, + "loss": 0.9004, + "step": 8772 + }, + { + "epoch": 0.6329497492875438, + "grad_norm": 2.1652436596352893, + "learning_rate": 1.2543222060360149e-06, + "loss": 0.9352, + "step": 8773 + }, + { + "epoch": 0.6330218967569713, + "grad_norm": 4.187594967804299, + "learning_rate": 1.2538885650567342e-06, + "loss": 1.0737, + "step": 8774 + }, + { + "epoch": 0.6330940442263988, + "grad_norm": 5.497522836649025, + "learning_rate": 1.253454964819837e-06, + "loss": 0.99, + "step": 8775 + }, + { + "epoch": 0.6331661916958262, + "grad_norm": 3.7021579686965147, + "learning_rate": 1.2530214053490004e-06, + "loss": 0.8882, + "step": 8776 + }, + { + "epoch": 0.6332383391652537, + "grad_norm": 3.5735289528190117, + "learning_rate": 1.2525878866678999e-06, + "loss": 0.8589, + "step": 8777 + }, + { + "epoch": 0.6333104866346813, + "grad_norm": 5.8664813677105725, + "learning_rate": 1.2521544088002077e-06, + "loss": 0.8356, + "step": 8778 + }, + { + "epoch": 0.6333826341041088, + "grad_norm": 2.7944970277861567, + "learning_rate": 1.2517209717695955e-06, + "loss": 0.835, + "step": 8779 + }, + { + "epoch": 0.6334547815735363, + "grad_norm": 7.451886280054157, + "learning_rate": 1.2512875755997306e-06, + "loss": 0.9384, + "step": 8780 + }, + { + "epoch": 0.6335269290429638, + "grad_norm": 4.187643475707868, + "learning_rate": 1.2508542203142792e-06, + "loss": 0.9152, + "step": 8781 + }, + { + "epoch": 0.6335990765123913, + "grad_norm": 4.7244925009443595, + "learning_rate": 1.2504209059369062e-06, + "loss": 0.9122, + "step": 8782 + }, + { + "epoch": 0.6336712239818189, + "grad_norm": 4.024330528927102, + "learning_rate": 1.2499876324912716e-06, + "loss": 0.9769, + "step": 8783 + }, + { + "epoch": 0.6337433714512464, + "grad_norm": 4.76065294387507, + "learning_rate": 1.2495544000010374e-06, + "loss": 0.8523, + "step": 8784 + }, + { + "epoch": 0.6338155189206739, + "grad_norm": 3.0261823629126376, + "learning_rate": 1.2491212084898587e-06, + "loss": 0.9899, + "step": 8785 + }, + { + "epoch": 0.6338876663901014, + "grad_norm": 2.6712610978363234, + "learning_rate": 1.2486880579813908e-06, + "loss": 0.8345, + "step": 8786 + }, + { + "epoch": 0.6339598138595289, + "grad_norm": 2.9259810538627, + "learning_rate": 1.2482549484992872e-06, + "loss": 0.9959, + "step": 8787 + }, + { + "epoch": 0.6340319613289563, + "grad_norm": 3.311480095235751, + "learning_rate": 1.247821880067198e-06, + "loss": 0.9705, + "step": 8788 + }, + { + "epoch": 0.6341041087983839, + "grad_norm": 3.9502417864617843, + "learning_rate": 1.2473888527087712e-06, + "loss": 0.809, + "step": 8789 + }, + { + "epoch": 0.6341762562678114, + "grad_norm": 3.1096401389096564, + "learning_rate": 1.2469558664476536e-06, + "loss": 1.001, + "step": 8790 + }, + { + "epoch": 0.6342484037372389, + "grad_norm": 2.844886301245904, + "learning_rate": 1.246522921307488e-06, + "loss": 0.9884, + "step": 8791 + }, + { + "epoch": 0.6343205512066664, + "grad_norm": 2.6943276195390813, + "learning_rate": 1.2460900173119164e-06, + "loss": 0.9397, + "step": 8792 + }, + { + "epoch": 0.6343926986760939, + "grad_norm": 7.53029769005059, + "learning_rate": 1.2456571544845782e-06, + "loss": 0.9421, + "step": 8793 + }, + { + "epoch": 0.6344648461455215, + "grad_norm": 3.2258400466154016, + "learning_rate": 1.2452243328491106e-06, + "loss": 0.994, + "step": 8794 + }, + { + "epoch": 0.634536993614949, + "grad_norm": 3.474054353331615, + "learning_rate": 1.2447915524291473e-06, + "loss": 0.9175, + "step": 8795 + }, + { + "epoch": 0.6346091410843765, + "grad_norm": 3.60282164303257, + "learning_rate": 1.2443588132483224e-06, + "loss": 0.8927, + "step": 8796 + }, + { + "epoch": 0.634681288553804, + "grad_norm": 4.597621817123611, + "learning_rate": 1.2439261153302654e-06, + "loss": 0.9623, + "step": 8797 + }, + { + "epoch": 0.6347534360232315, + "grad_norm": 0.713916114142857, + "learning_rate": 1.2434934586986036e-06, + "loss": 0.7997, + "step": 8798 + }, + { + "epoch": 0.6348255834926589, + "grad_norm": 11.809482734069645, + "learning_rate": 1.2430608433769646e-06, + "loss": 0.8543, + "step": 8799 + }, + { + "epoch": 0.6348977309620865, + "grad_norm": 2.9545941608922557, + "learning_rate": 1.24262826938897e-06, + "loss": 0.8754, + "step": 8800 + }, + { + "epoch": 0.634969878431514, + "grad_norm": 2.8271491121544963, + "learning_rate": 1.242195736758242e-06, + "loss": 0.9446, + "step": 8801 + }, + { + "epoch": 0.6350420259009415, + "grad_norm": 5.7031504173234975, + "learning_rate": 1.2417632455083999e-06, + "loss": 0.8646, + "step": 8802 + }, + { + "epoch": 0.635114173370369, + "grad_norm": 4.030995915412016, + "learning_rate": 1.2413307956630598e-06, + "loss": 1.0859, + "step": 8803 + }, + { + "epoch": 0.6351863208397965, + "grad_norm": 2.7907461859863156, + "learning_rate": 1.2408983872458367e-06, + "loss": 0.8494, + "step": 8804 + }, + { + "epoch": 0.6352584683092241, + "grad_norm": 4.880561785958418, + "learning_rate": 1.2404660202803423e-06, + "loss": 1.1032, + "step": 8805 + }, + { + "epoch": 0.6353306157786516, + "grad_norm": 0.6919338131712399, + "learning_rate": 1.2400336947901871e-06, + "loss": 0.7365, + "step": 8806 + }, + { + "epoch": 0.6354027632480791, + "grad_norm": 6.599565156425554, + "learning_rate": 1.2396014107989786e-06, + "loss": 0.9216, + "step": 8807 + }, + { + "epoch": 0.6354749107175066, + "grad_norm": 4.978192839366797, + "learning_rate": 1.2391691683303216e-06, + "loss": 1.1148, + "step": 8808 + }, + { + "epoch": 0.6355470581869341, + "grad_norm": 2.458526399003262, + "learning_rate": 1.2387369674078204e-06, + "loss": 1.0038, + "step": 8809 + }, + { + "epoch": 0.6356192056563617, + "grad_norm": 2.83764479347539, + "learning_rate": 1.2383048080550747e-06, + "loss": 0.9933, + "step": 8810 + }, + { + "epoch": 0.6356913531257891, + "grad_norm": 3.746924601553461, + "learning_rate": 1.2378726902956842e-06, + "loss": 1.0424, + "step": 8811 + }, + { + "epoch": 0.6357635005952166, + "grad_norm": 15.18525032367192, + "learning_rate": 1.237440614153245e-06, + "loss": 0.9185, + "step": 8812 + }, + { + "epoch": 0.6358356480646441, + "grad_norm": 2.372877879463494, + "learning_rate": 1.23700857965135e-06, + "loss": 1.001, + "step": 8813 + }, + { + "epoch": 0.6359077955340716, + "grad_norm": 18.173282270315468, + "learning_rate": 1.236576586813593e-06, + "loss": 0.961, + "step": 8814 + }, + { + "epoch": 0.6359799430034991, + "grad_norm": 2.956779197119734, + "learning_rate": 1.2361446356635622e-06, + "loss": 0.9173, + "step": 8815 + }, + { + "epoch": 0.6360520904729267, + "grad_norm": 3.5482750368742115, + "learning_rate": 1.2357127262248448e-06, + "loss": 0.8746, + "step": 8816 + }, + { + "epoch": 0.6361242379423542, + "grad_norm": 4.563556705540921, + "learning_rate": 1.2352808585210261e-06, + "loss": 0.886, + "step": 8817 + }, + { + "epoch": 0.6361963854117817, + "grad_norm": 3.4922344708110726, + "learning_rate": 1.2348490325756898e-06, + "loss": 0.7596, + "step": 8818 + }, + { + "epoch": 0.6362685328812092, + "grad_norm": 3.2896611989432905, + "learning_rate": 1.2344172484124147e-06, + "loss": 0.9098, + "step": 8819 + }, + { + "epoch": 0.6363406803506367, + "grad_norm": 0.8941030018608337, + "learning_rate": 1.233985506054779e-06, + "loss": 0.8537, + "step": 8820 + }, + { + "epoch": 0.6364128278200643, + "grad_norm": 2.474360693620186, + "learning_rate": 1.23355380552636e-06, + "loss": 0.8893, + "step": 8821 + }, + { + "epoch": 0.6364849752894918, + "grad_norm": 4.866372519361047, + "learning_rate": 1.2331221468507296e-06, + "loss": 0.937, + "step": 8822 + }, + { + "epoch": 0.6365571227589192, + "grad_norm": 2.357203918343211, + "learning_rate": 1.2326905300514605e-06, + "loss": 0.8155, + "step": 8823 + }, + { + "epoch": 0.6366292702283467, + "grad_norm": 4.656735798116212, + "learning_rate": 1.232258955152121e-06, + "loss": 0.8862, + "step": 8824 + }, + { + "epoch": 0.6367014176977742, + "grad_norm": 2.6161343861280786, + "learning_rate": 1.2318274221762775e-06, + "loss": 0.9198, + "step": 8825 + }, + { + "epoch": 0.6367735651672017, + "grad_norm": 4.604299459944553, + "learning_rate": 1.2313959311474953e-06, + "loss": 0.9142, + "step": 8826 + }, + { + "epoch": 0.6368457126366293, + "grad_norm": 3.883567567781953, + "learning_rate": 1.2309644820893356e-06, + "loss": 0.9861, + "step": 8827 + }, + { + "epoch": 0.6369178601060568, + "grad_norm": 4.998155635173636, + "learning_rate": 1.2305330750253588e-06, + "loss": 1.0007, + "step": 8828 + }, + { + "epoch": 0.6369900075754843, + "grad_norm": 8.924433620230168, + "learning_rate": 1.230101709979123e-06, + "loss": 0.9795, + "step": 8829 + }, + { + "epoch": 0.6370621550449118, + "grad_norm": 2.9564335772108867, + "learning_rate": 1.2296703869741815e-06, + "loss": 0.8529, + "step": 8830 + }, + { + "epoch": 0.6371343025143393, + "grad_norm": 5.374735803542881, + "learning_rate": 1.2292391060340897e-06, + "loss": 0.8203, + "step": 8831 + }, + { + "epoch": 0.6372064499837669, + "grad_norm": 3.212998668396119, + "learning_rate": 1.2288078671823959e-06, + "loss": 0.9141, + "step": 8832 + }, + { + "epoch": 0.6372785974531944, + "grad_norm": 2.7696366175742626, + "learning_rate": 1.2283766704426506e-06, + "loss": 0.8054, + "step": 8833 + }, + { + "epoch": 0.6373507449226219, + "grad_norm": 30.902904425848472, + "learning_rate": 1.2279455158383987e-06, + "loss": 0.8487, + "step": 8834 + }, + { + "epoch": 0.6374228923920493, + "grad_norm": 3.0863013716941814, + "learning_rate": 1.2275144033931835e-06, + "loss": 0.9984, + "step": 8835 + }, + { + "epoch": 0.6374950398614768, + "grad_norm": 4.441581675638991, + "learning_rate": 1.2270833331305475e-06, + "loss": 0.9806, + "step": 8836 + }, + { + "epoch": 0.6375671873309043, + "grad_norm": 3.8199286706865747, + "learning_rate": 1.2266523050740291e-06, + "loss": 0.9258, + "step": 8837 + }, + { + "epoch": 0.6376393348003319, + "grad_norm": 5.394366660290104, + "learning_rate": 1.226221319247166e-06, + "loss": 0.9002, + "step": 8838 + }, + { + "epoch": 0.6377114822697594, + "grad_norm": 0.802288813507097, + "learning_rate": 1.2257903756734919e-06, + "loss": 0.8084, + "step": 8839 + }, + { + "epoch": 0.6377836297391869, + "grad_norm": 3.559107587379522, + "learning_rate": 1.2253594743765395e-06, + "loss": 1.0401, + "step": 8840 + }, + { + "epoch": 0.6378557772086144, + "grad_norm": 10.211417177428624, + "learning_rate": 1.2249286153798389e-06, + "loss": 0.8847, + "step": 8841 + }, + { + "epoch": 0.6379279246780419, + "grad_norm": 10.706820330374189, + "learning_rate": 1.224497798706917e-06, + "loss": 0.912, + "step": 8842 + }, + { + "epoch": 0.6380000721474695, + "grad_norm": 3.3784255551317663, + "learning_rate": 1.2240670243812998e-06, + "loss": 0.9024, + "step": 8843 + }, + { + "epoch": 0.638072219616897, + "grad_norm": 2.891511688330172, + "learning_rate": 1.2236362924265092e-06, + "loss": 0.8539, + "step": 8844 + }, + { + "epoch": 0.6381443670863245, + "grad_norm": 3.1627403982111857, + "learning_rate": 1.2232056028660675e-06, + "loss": 0.8821, + "step": 8845 + }, + { + "epoch": 0.638216514555752, + "grad_norm": 1.738829588395205, + "learning_rate": 1.2227749557234924e-06, + "loss": 0.8569, + "step": 8846 + }, + { + "epoch": 0.6382886620251794, + "grad_norm": 3.4135420553148776, + "learning_rate": 1.2223443510222988e-06, + "loss": 0.8994, + "step": 8847 + }, + { + "epoch": 0.6383608094946069, + "grad_norm": 5.4541535814719415, + "learning_rate": 1.221913788786002e-06, + "loss": 0.9145, + "step": 8848 + }, + { + "epoch": 0.6384329569640345, + "grad_norm": 2.613464805189438, + "learning_rate": 1.2214832690381125e-06, + "loss": 0.845, + "step": 8849 + }, + { + "epoch": 0.638505104433462, + "grad_norm": 3.865813286651146, + "learning_rate": 1.2210527918021402e-06, + "loss": 0.9004, + "step": 8850 + }, + { + "epoch": 0.6385772519028895, + "grad_norm": 3.6890666672433845, + "learning_rate": 1.2206223571015916e-06, + "loss": 0.9914, + "step": 8851 + }, + { + "epoch": 0.638649399372317, + "grad_norm": 14.703861295160014, + "learning_rate": 1.2201919649599698e-06, + "loss": 0.9193, + "step": 8852 + }, + { + "epoch": 0.6387215468417445, + "grad_norm": 3.7735652546550904, + "learning_rate": 1.219761615400779e-06, + "loss": 0.9035, + "step": 8853 + }, + { + "epoch": 0.6387936943111721, + "grad_norm": 12.741144433472583, + "learning_rate": 1.2193313084475176e-06, + "loss": 0.9434, + "step": 8854 + }, + { + "epoch": 0.6388658417805996, + "grad_norm": 3.89476509884369, + "learning_rate": 1.218901044123684e-06, + "loss": 0.8367, + "step": 8855 + }, + { + "epoch": 0.6389379892500271, + "grad_norm": 2.9685852005039988, + "learning_rate": 1.2184708224527725e-06, + "loss": 0.9438, + "step": 8856 + }, + { + "epoch": 0.6390101367194546, + "grad_norm": 4.968564971741636, + "learning_rate": 1.2180406434582761e-06, + "loss": 0.8935, + "step": 8857 + }, + { + "epoch": 0.639082284188882, + "grad_norm": 4.54913609446152, + "learning_rate": 1.2176105071636863e-06, + "loss": 0.854, + "step": 8858 + }, + { + "epoch": 0.6391544316583095, + "grad_norm": 3.015908499718504, + "learning_rate": 1.217180413592489e-06, + "loss": 0.9137, + "step": 8859 + }, + { + "epoch": 0.6392265791277371, + "grad_norm": 3.802943901320312, + "learning_rate": 1.2167503627681726e-06, + "loss": 0.8745, + "step": 8860 + }, + { + "epoch": 0.6392987265971646, + "grad_norm": 2.341505985471483, + "learning_rate": 1.2163203547142182e-06, + "loss": 0.955, + "step": 8861 + }, + { + "epoch": 0.6393708740665921, + "grad_norm": 2.331075450099983, + "learning_rate": 1.2158903894541092e-06, + "loss": 0.9336, + "step": 8862 + }, + { + "epoch": 0.6394430215360196, + "grad_norm": 3.275220872259604, + "learning_rate": 1.2154604670113232e-06, + "loss": 0.9016, + "step": 8863 + }, + { + "epoch": 0.6395151690054471, + "grad_norm": 4.654857657742369, + "learning_rate": 1.2150305874093362e-06, + "loss": 0.8811, + "step": 8864 + }, + { + "epoch": 0.6395873164748747, + "grad_norm": 2.693960187814637, + "learning_rate": 1.2146007506716238e-06, + "loss": 0.9091, + "step": 8865 + }, + { + "epoch": 0.6396594639443022, + "grad_norm": 3.5765337341671675, + "learning_rate": 1.2141709568216567e-06, + "loss": 0.8967, + "step": 8866 + }, + { + "epoch": 0.6397316114137297, + "grad_norm": 5.6888942476750675, + "learning_rate": 1.2137412058829046e-06, + "loss": 0.7927, + "step": 8867 + }, + { + "epoch": 0.6398037588831572, + "grad_norm": 3.7206645492996984, + "learning_rate": 1.2133114978788346e-06, + "loss": 0.8234, + "step": 8868 + }, + { + "epoch": 0.6398759063525847, + "grad_norm": 3.4947373470778937, + "learning_rate": 1.212881832832911e-06, + "loss": 0.9568, + "step": 8869 + }, + { + "epoch": 0.6399480538220121, + "grad_norm": 3.409618566888015, + "learning_rate": 1.2124522107685976e-06, + "loss": 0.9576, + "step": 8870 + }, + { + "epoch": 0.6400202012914397, + "grad_norm": 2.8044595864863355, + "learning_rate": 1.2120226317093524e-06, + "loss": 0.9622, + "step": 8871 + }, + { + "epoch": 0.6400923487608672, + "grad_norm": 7.39414458717338, + "learning_rate": 1.2115930956786354e-06, + "loss": 1.0277, + "step": 8872 + }, + { + "epoch": 0.6401644962302947, + "grad_norm": 3.3575957088505755, + "learning_rate": 1.2111636026999006e-06, + "loss": 0.8665, + "step": 8873 + }, + { + "epoch": 0.6402366436997222, + "grad_norm": 0.7611677083979389, + "learning_rate": 1.2107341527966007e-06, + "loss": 0.8125, + "step": 8874 + }, + { + "epoch": 0.6403087911691497, + "grad_norm": 2.7113554794455097, + "learning_rate": 1.2103047459921878e-06, + "loss": 0.8967, + "step": 8875 + }, + { + "epoch": 0.6403809386385773, + "grad_norm": 3.2078179940512497, + "learning_rate": 1.2098753823101082e-06, + "loss": 0.8501, + "step": 8876 + }, + { + "epoch": 0.6404530861080048, + "grad_norm": 3.04351966190019, + "learning_rate": 1.2094460617738103e-06, + "loss": 1.0103, + "step": 8877 + }, + { + "epoch": 0.6405252335774323, + "grad_norm": 3.396019647709924, + "learning_rate": 1.209016784406736e-06, + "loss": 0.8204, + "step": 8878 + }, + { + "epoch": 0.6405973810468598, + "grad_norm": 3.3347583267973904, + "learning_rate": 1.2085875502323268e-06, + "loss": 0.9153, + "step": 8879 + }, + { + "epoch": 0.6406695285162873, + "grad_norm": 3.4287355531283152, + "learning_rate": 1.2081583592740217e-06, + "loss": 0.8455, + "step": 8880 + }, + { + "epoch": 0.6407416759857149, + "grad_norm": 3.296483455024534, + "learning_rate": 1.2077292115552573e-06, + "loss": 0.9512, + "step": 8881 + }, + { + "epoch": 0.6408138234551423, + "grad_norm": 3.94432396921154, + "learning_rate": 1.2073001070994685e-06, + "loss": 0.9814, + "step": 8882 + }, + { + "epoch": 0.6408859709245698, + "grad_norm": 197.00163585932478, + "learning_rate": 1.206871045930085e-06, + "loss": 0.9598, + "step": 8883 + }, + { + "epoch": 0.6409581183939973, + "grad_norm": 2.8079049614747578, + "learning_rate": 1.2064420280705388e-06, + "loss": 0.8637, + "step": 8884 + }, + { + "epoch": 0.6410302658634248, + "grad_norm": 6.29668992293536, + "learning_rate": 1.2060130535442557e-06, + "loss": 0.8054, + "step": 8885 + }, + { + "epoch": 0.6411024133328523, + "grad_norm": 2.505228773471193, + "learning_rate": 1.2055841223746598e-06, + "loss": 1.008, + "step": 8886 + }, + { + "epoch": 0.6411745608022799, + "grad_norm": 5.800279137043919, + "learning_rate": 1.2051552345851748e-06, + "loss": 0.8807, + "step": 8887 + }, + { + "epoch": 0.6412467082717074, + "grad_norm": 9.746729766947595, + "learning_rate": 1.2047263901992193e-06, + "loss": 0.9721, + "step": 8888 + }, + { + "epoch": 0.6413188557411349, + "grad_norm": 2.2805877142231616, + "learning_rate": 1.2042975892402125e-06, + "loss": 0.8855, + "step": 8889 + }, + { + "epoch": 0.6413910032105624, + "grad_norm": 3.841207268002709, + "learning_rate": 1.2038688317315685e-06, + "loss": 0.9359, + "step": 8890 + }, + { + "epoch": 0.6414631506799899, + "grad_norm": 4.942152511275037, + "learning_rate": 1.2034401176967004e-06, + "loss": 0.9533, + "step": 8891 + }, + { + "epoch": 0.6415352981494175, + "grad_norm": 2.378992539517994, + "learning_rate": 1.2030114471590185e-06, + "loss": 1.0132, + "step": 8892 + }, + { + "epoch": 0.641607445618845, + "grad_norm": 4.467386744706824, + "learning_rate": 1.202582820141931e-06, + "loss": 0.9297, + "step": 8893 + }, + { + "epoch": 0.6416795930882724, + "grad_norm": 4.199113370770779, + "learning_rate": 1.2021542366688444e-06, + "loss": 0.998, + "step": 8894 + }, + { + "epoch": 0.6417517405576999, + "grad_norm": 0.5916770083459388, + "learning_rate": 1.201725696763161e-06, + "loss": 0.7592, + "step": 8895 + }, + { + "epoch": 0.6418238880271274, + "grad_norm": 4.185882939363649, + "learning_rate": 1.201297200448282e-06, + "loss": 0.8566, + "step": 8896 + }, + { + "epoch": 0.6418960354965549, + "grad_norm": 3.4344706278183788, + "learning_rate": 1.2008687477476068e-06, + "loss": 0.8659, + "step": 8897 + }, + { + "epoch": 0.6419681829659825, + "grad_norm": 3.026771146704674, + "learning_rate": 1.2004403386845299e-06, + "loss": 0.9399, + "step": 8898 + }, + { + "epoch": 0.64204033043541, + "grad_norm": 3.8375153879857398, + "learning_rate": 1.2000119732824472e-06, + "loss": 0.898, + "step": 8899 + }, + { + "epoch": 0.6421124779048375, + "grad_norm": 0.6984827440870562, + "learning_rate": 1.1995836515647493e-06, + "loss": 0.8219, + "step": 8900 + }, + { + "epoch": 0.642184625374265, + "grad_norm": 0.8758758861308068, + "learning_rate": 1.199155373554824e-06, + "loss": 0.7929, + "step": 8901 + }, + { + "epoch": 0.6422567728436925, + "grad_norm": 2.2046843076557106, + "learning_rate": 1.1987271392760603e-06, + "loss": 0.8988, + "step": 8902 + }, + { + "epoch": 0.64232892031312, + "grad_norm": 0.8236842659755719, + "learning_rate": 1.1982989487518409e-06, + "loss": 0.9033, + "step": 8903 + }, + { + "epoch": 0.6424010677825476, + "grad_norm": 2.245756598337097, + "learning_rate": 1.1978708020055478e-06, + "loss": 0.8481, + "step": 8904 + }, + { + "epoch": 0.642473215251975, + "grad_norm": 4.529759865768639, + "learning_rate": 1.1974426990605611e-06, + "loss": 0.9932, + "step": 8905 + }, + { + "epoch": 0.6425453627214025, + "grad_norm": 3.0121789721627357, + "learning_rate": 1.197014639940258e-06, + "loss": 0.8762, + "step": 8906 + }, + { + "epoch": 0.64261751019083, + "grad_norm": 0.7065902303984497, + "learning_rate": 1.1965866246680126e-06, + "loss": 0.8316, + "step": 8907 + }, + { + "epoch": 0.6426896576602575, + "grad_norm": 3.505447644042003, + "learning_rate": 1.1961586532671973e-06, + "loss": 0.8837, + "step": 8908 + }, + { + "epoch": 0.6427618051296851, + "grad_norm": 3.4026402936569093, + "learning_rate": 1.1957307257611824e-06, + "loss": 0.9408, + "step": 8909 + }, + { + "epoch": 0.6428339525991126, + "grad_norm": 3.6339785437530474, + "learning_rate": 1.1953028421733349e-06, + "loss": 0.961, + "step": 8910 + }, + { + "epoch": 0.6429061000685401, + "grad_norm": 4.425687299035897, + "learning_rate": 1.1948750025270209e-06, + "loss": 0.9912, + "step": 8911 + }, + { + "epoch": 0.6429782475379676, + "grad_norm": 0.8242174844596512, + "learning_rate": 1.1944472068456026e-06, + "loss": 0.8481, + "step": 8912 + }, + { + "epoch": 0.6430503950073951, + "grad_norm": 3.101457646901286, + "learning_rate": 1.1940194551524391e-06, + "loss": 0.8982, + "step": 8913 + }, + { + "epoch": 0.6431225424768227, + "grad_norm": 8.392625623483811, + "learning_rate": 1.193591747470891e-06, + "loss": 0.8564, + "step": 8914 + }, + { + "epoch": 0.6431946899462502, + "grad_norm": 2.9245560529231533, + "learning_rate": 1.1931640838243114e-06, + "loss": 0.9928, + "step": 8915 + }, + { + "epoch": 0.6432668374156777, + "grad_norm": 2.795637826768602, + "learning_rate": 1.1927364642360547e-06, + "loss": 0.976, + "step": 8916 + }, + { + "epoch": 0.6433389848851051, + "grad_norm": 3.9615387770557917, + "learning_rate": 1.1923088887294714e-06, + "loss": 0.9664, + "step": 8917 + }, + { + "epoch": 0.6434111323545326, + "grad_norm": 3.282636658263564, + "learning_rate": 1.1918813573279095e-06, + "loss": 0.8439, + "step": 8918 + }, + { + "epoch": 0.6434832798239601, + "grad_norm": 4.913750230725617, + "learning_rate": 1.1914538700547153e-06, + "loss": 0.8621, + "step": 8919 + }, + { + "epoch": 0.6435554272933877, + "grad_norm": 0.6847741535045192, + "learning_rate": 1.1910264269332317e-06, + "loss": 0.8172, + "step": 8920 + }, + { + "epoch": 0.6436275747628152, + "grad_norm": 1.867006556470297, + "learning_rate": 1.190599027986801e-06, + "loss": 0.9336, + "step": 8921 + }, + { + "epoch": 0.6436997222322427, + "grad_norm": 2.958523291994657, + "learning_rate": 1.1901716732387605e-06, + "loss": 0.8627, + "step": 8922 + }, + { + "epoch": 0.6437718697016702, + "grad_norm": 3.0741335943078103, + "learning_rate": 1.189744362712447e-06, + "loss": 1.0069, + "step": 8923 + }, + { + "epoch": 0.6438440171710977, + "grad_norm": 2.772295477626978, + "learning_rate": 1.189317096431195e-06, + "loss": 0.8394, + "step": 8924 + }, + { + "epoch": 0.6439161646405253, + "grad_norm": 4.396534048822556, + "learning_rate": 1.1888898744183342e-06, + "loss": 0.9146, + "step": 8925 + }, + { + "epoch": 0.6439883121099528, + "grad_norm": 2.2962328992943015, + "learning_rate": 1.1884626966971957e-06, + "loss": 0.9347, + "step": 8926 + }, + { + "epoch": 0.6440604595793803, + "grad_norm": 4.269626180111733, + "learning_rate": 1.188035563291105e-06, + "loss": 0.9986, + "step": 8927 + }, + { + "epoch": 0.6441326070488078, + "grad_norm": 3.2132625283424643, + "learning_rate": 1.1876084742233858e-06, + "loss": 0.8129, + "step": 8928 + }, + { + "epoch": 0.6442047545182352, + "grad_norm": 5.416622200807822, + "learning_rate": 1.1871814295173615e-06, + "loss": 0.893, + "step": 8929 + }, + { + "epoch": 0.6442769019876627, + "grad_norm": 2.3812579978347994, + "learning_rate": 1.1867544291963496e-06, + "loss": 0.9569, + "step": 8930 + }, + { + "epoch": 0.6443490494570903, + "grad_norm": 2.641386024016303, + "learning_rate": 1.1863274732836677e-06, + "loss": 0.9414, + "step": 8931 + }, + { + "epoch": 0.6444211969265178, + "grad_norm": 19.22153260470736, + "learning_rate": 1.1859005618026308e-06, + "loss": 0.8894, + "step": 8932 + }, + { + "epoch": 0.6444933443959453, + "grad_norm": 3.0818316608972007, + "learning_rate": 1.1854736947765505e-06, + "loss": 0.8678, + "step": 8933 + }, + { + "epoch": 0.6445654918653728, + "grad_norm": 2.702533458746087, + "learning_rate": 1.185046872228736e-06, + "loss": 0.8646, + "step": 8934 + }, + { + "epoch": 0.6446376393348003, + "grad_norm": 3.463446884961739, + "learning_rate": 1.1846200941824953e-06, + "loss": 0.964, + "step": 8935 + }, + { + "epoch": 0.6447097868042279, + "grad_norm": 2.410072065518463, + "learning_rate": 1.1841933606611332e-06, + "loss": 0.867, + "step": 8936 + }, + { + "epoch": 0.6447819342736554, + "grad_norm": 2.9243571300413085, + "learning_rate": 1.1837666716879505e-06, + "loss": 0.9068, + "step": 8937 + }, + { + "epoch": 0.6448540817430829, + "grad_norm": 3.4109008954854203, + "learning_rate": 1.1833400272862495e-06, + "loss": 0.8593, + "step": 8938 + }, + { + "epoch": 0.6449262292125104, + "grad_norm": 5.080378105930576, + "learning_rate": 1.1829134274793262e-06, + "loss": 0.8983, + "step": 8939 + }, + { + "epoch": 0.6449983766819379, + "grad_norm": 2.5671395889984394, + "learning_rate": 1.1824868722904753e-06, + "loss": 0.9534, + "step": 8940 + }, + { + "epoch": 0.6450705241513653, + "grad_norm": 4.363461564999013, + "learning_rate": 1.1820603617429906e-06, + "loss": 0.9309, + "step": 8941 + }, + { + "epoch": 0.6451426716207929, + "grad_norm": 4.461617065617573, + "learning_rate": 1.1816338958601612e-06, + "loss": 1.0455, + "step": 8942 + }, + { + "epoch": 0.6452148190902204, + "grad_norm": 2.6927910108927366, + "learning_rate": 1.1812074746652752e-06, + "loss": 0.9961, + "step": 8943 + }, + { + "epoch": 0.6452869665596479, + "grad_norm": 2.4416244043158004, + "learning_rate": 1.1807810981816185e-06, + "loss": 0.9616, + "step": 8944 + }, + { + "epoch": 0.6453591140290754, + "grad_norm": 3.020684302150636, + "learning_rate": 1.180354766432473e-06, + "loss": 0.8914, + "step": 8945 + }, + { + "epoch": 0.6454312614985029, + "grad_norm": 0.7057530351006664, + "learning_rate": 1.1799284794411193e-06, + "loss": 0.8031, + "step": 8946 + }, + { + "epoch": 0.6455034089679305, + "grad_norm": 4.48660042334557, + "learning_rate": 1.1795022372308356e-06, + "loss": 1.019, + "step": 8947 + }, + { + "epoch": 0.645575556437358, + "grad_norm": 2.4553282738511863, + "learning_rate": 1.1790760398248977e-06, + "loss": 0.9966, + "step": 8948 + }, + { + "epoch": 0.6456477039067855, + "grad_norm": 4.811803395771243, + "learning_rate": 1.1786498872465775e-06, + "loss": 0.9531, + "step": 8949 + }, + { + "epoch": 0.645719851376213, + "grad_norm": 9.634904001704964, + "learning_rate": 1.1782237795191475e-06, + "loss": 0.9962, + "step": 8950 + }, + { + "epoch": 0.6457919988456405, + "grad_norm": 2.4897861690703667, + "learning_rate": 1.1777977166658746e-06, + "loss": 0.8931, + "step": 8951 + }, + { + "epoch": 0.6458641463150679, + "grad_norm": 2.945454642106385, + "learning_rate": 1.1773716987100239e-06, + "loss": 1.0103, + "step": 8952 + }, + { + "epoch": 0.6459362937844955, + "grad_norm": 2.444310187014782, + "learning_rate": 1.1769457256748602e-06, + "loss": 0.942, + "step": 8953 + }, + { + "epoch": 0.646008441253923, + "grad_norm": 3.4456810440551284, + "learning_rate": 1.1765197975836434e-06, + "loss": 0.8893, + "step": 8954 + }, + { + "epoch": 0.6460805887233505, + "grad_norm": 3.2097034913969154, + "learning_rate": 1.1760939144596318e-06, + "loss": 1.0582, + "step": 8955 + }, + { + "epoch": 0.646152736192778, + "grad_norm": 4.280408344932202, + "learning_rate": 1.1756680763260823e-06, + "loss": 0.7457, + "step": 8956 + }, + { + "epoch": 0.6462248836622055, + "grad_norm": 3.8555302175397115, + "learning_rate": 1.1752422832062468e-06, + "loss": 1.0078, + "step": 8957 + }, + { + "epoch": 0.6462970311316331, + "grad_norm": 4.363338951610409, + "learning_rate": 1.174816535123377e-06, + "loss": 0.8965, + "step": 8958 + }, + { + "epoch": 0.6463691786010606, + "grad_norm": 3.047594503888573, + "learning_rate": 1.1743908321007219e-06, + "loss": 0.9532, + "step": 8959 + }, + { + "epoch": 0.6464413260704881, + "grad_norm": 2.426858905244715, + "learning_rate": 1.1739651741615273e-06, + "loss": 1.0404, + "step": 8960 + }, + { + "epoch": 0.6465134735399156, + "grad_norm": 2.315094497597538, + "learning_rate": 1.1735395613290362e-06, + "loss": 0.9158, + "step": 8961 + }, + { + "epoch": 0.6465856210093431, + "grad_norm": 2.7235675091333134, + "learning_rate": 1.1731139936264902e-06, + "loss": 1.0421, + "step": 8962 + }, + { + "epoch": 0.6466577684787707, + "grad_norm": 2.506648093401635, + "learning_rate": 1.1726884710771285e-06, + "loss": 0.9544, + "step": 8963 + }, + { + "epoch": 0.6467299159481981, + "grad_norm": 2.178432964954463, + "learning_rate": 1.1722629937041858e-06, + "loss": 0.8169, + "step": 8964 + }, + { + "epoch": 0.6468020634176256, + "grad_norm": 2.2062847664230785, + "learning_rate": 1.171837561530898e-06, + "loss": 0.8586, + "step": 8965 + }, + { + "epoch": 0.6468742108870531, + "grad_norm": 2.698987760472963, + "learning_rate": 1.171412174580495e-06, + "loss": 0.912, + "step": 8966 + }, + { + "epoch": 0.6469463583564806, + "grad_norm": 2.1526465782739606, + "learning_rate": 1.170986832876205e-06, + "loss": 1.0028, + "step": 8967 + }, + { + "epoch": 0.6470185058259081, + "grad_norm": 4.1028785556017375, + "learning_rate": 1.170561536441256e-06, + "loss": 0.9793, + "step": 8968 + }, + { + "epoch": 0.6470906532953357, + "grad_norm": 0.7831332586999213, + "learning_rate": 1.1701362852988705e-06, + "loss": 0.7985, + "step": 8969 + }, + { + "epoch": 0.6471628007647632, + "grad_norm": 5.8438583608770145, + "learning_rate": 1.1697110794722707e-06, + "loss": 1.01, + "step": 8970 + }, + { + "epoch": 0.6472349482341907, + "grad_norm": 2.8375263229695196, + "learning_rate": 1.1692859189846749e-06, + "loss": 0.9135, + "step": 8971 + }, + { + "epoch": 0.6473070957036182, + "grad_norm": 6.364556228738146, + "learning_rate": 1.1688608038593005e-06, + "loss": 0.8323, + "step": 8972 + }, + { + "epoch": 0.6473792431730457, + "grad_norm": 7.579692371738798, + "learning_rate": 1.1684357341193604e-06, + "loss": 0.9321, + "step": 8973 + }, + { + "epoch": 0.6474513906424733, + "grad_norm": 3.23097470437068, + "learning_rate": 1.1680107097880662e-06, + "loss": 1.006, + "step": 8974 + }, + { + "epoch": 0.6475235381119008, + "grad_norm": 2.316412425555638, + "learning_rate": 1.1675857308886282e-06, + "loss": 0.9676, + "step": 8975 + }, + { + "epoch": 0.6475956855813282, + "grad_norm": 3.7706792951000994, + "learning_rate": 1.167160797444251e-06, + "loss": 0.8682, + "step": 8976 + }, + { + "epoch": 0.6476678330507557, + "grad_norm": 2.542558440640503, + "learning_rate": 1.1667359094781398e-06, + "loss": 0.876, + "step": 8977 + }, + { + "epoch": 0.6477399805201832, + "grad_norm": 5.579889269588086, + "learning_rate": 1.166311067013496e-06, + "loss": 0.9594, + "step": 8978 + }, + { + "epoch": 0.6478121279896107, + "grad_norm": 3.942926210189461, + "learning_rate": 1.1658862700735185e-06, + "loss": 0.8785, + "step": 8979 + }, + { + "epoch": 0.6478842754590383, + "grad_norm": 2.4770453430633435, + "learning_rate": 1.165461518681405e-06, + "loss": 0.9661, + "step": 8980 + }, + { + "epoch": 0.6479564229284658, + "grad_norm": 3.1824343951610996, + "learning_rate": 1.1650368128603468e-06, + "loss": 0.9405, + "step": 8981 + }, + { + "epoch": 0.6480285703978933, + "grad_norm": 5.606142605674702, + "learning_rate": 1.164612152633539e-06, + "loss": 0.8117, + "step": 8982 + }, + { + "epoch": 0.6481007178673208, + "grad_norm": 1.7551960779624385, + "learning_rate": 1.1641875380241683e-06, + "loss": 0.9422, + "step": 8983 + }, + { + "epoch": 0.6481728653367483, + "grad_norm": 2.3982555805015315, + "learning_rate": 1.163762969055422e-06, + "loss": 0.9472, + "step": 8984 + }, + { + "epoch": 0.6482450128061759, + "grad_norm": 5.657332005992346, + "learning_rate": 1.1633384457504845e-06, + "loss": 0.8377, + "step": 8985 + }, + { + "epoch": 0.6483171602756034, + "grad_norm": 2.0545706480036015, + "learning_rate": 1.162913968132537e-06, + "loss": 0.9802, + "step": 8986 + }, + { + "epoch": 0.6483893077450309, + "grad_norm": 2.0993120701572017, + "learning_rate": 1.1624895362247592e-06, + "loss": 0.9997, + "step": 8987 + }, + { + "epoch": 0.6484614552144583, + "grad_norm": 3.125079802447848, + "learning_rate": 1.1620651500503278e-06, + "loss": 0.9078, + "step": 8988 + }, + { + "epoch": 0.6485336026838858, + "grad_norm": 5.404614504354018, + "learning_rate": 1.1616408096324161e-06, + "loss": 0.9127, + "step": 8989 + }, + { + "epoch": 0.6486057501533133, + "grad_norm": 3.508935421514394, + "learning_rate": 1.1612165149941964e-06, + "loss": 0.8732, + "step": 8990 + }, + { + "epoch": 0.6486778976227409, + "grad_norm": 2.129286313909899, + "learning_rate": 1.1607922661588376e-06, + "loss": 0.9301, + "step": 8991 + }, + { + "epoch": 0.6487500450921684, + "grad_norm": 2.9656259261392126, + "learning_rate": 1.1603680631495066e-06, + "loss": 0.8431, + "step": 8992 + }, + { + "epoch": 0.6488221925615959, + "grad_norm": 4.818840902735223, + "learning_rate": 1.1599439059893673e-06, + "loss": 0.715, + "step": 8993 + }, + { + "epoch": 0.6488943400310234, + "grad_norm": 2.474086546670807, + "learning_rate": 1.1595197947015818e-06, + "loss": 0.935, + "step": 8994 + }, + { + "epoch": 0.6489664875004509, + "grad_norm": 4.064477057984559, + "learning_rate": 1.1590957293093094e-06, + "loss": 0.9504, + "step": 8995 + }, + { + "epoch": 0.6490386349698785, + "grad_norm": 2.651914401691066, + "learning_rate": 1.1586717098357051e-06, + "loss": 0.9133, + "step": 8996 + }, + { + "epoch": 0.649110782439306, + "grad_norm": 2.118202219681382, + "learning_rate": 1.1582477363039255e-06, + "loss": 0.8869, + "step": 8997 + }, + { + "epoch": 0.6491829299087335, + "grad_norm": 2.4027376177695663, + "learning_rate": 1.1578238087371195e-06, + "loss": 0.8754, + "step": 8998 + }, + { + "epoch": 0.649255077378161, + "grad_norm": 0.8194668361779813, + "learning_rate": 1.1573999271584392e-06, + "loss": 0.8374, + "step": 8999 + }, + { + "epoch": 0.6493272248475884, + "grad_norm": 2.6895973538112843, + "learning_rate": 1.1569760915910292e-06, + "loss": 0.9509, + "step": 9000 + }, + { + "epoch": 0.6493993723170159, + "grad_norm": 2.6258139483689575, + "learning_rate": 1.1565523020580337e-06, + "loss": 0.9438, + "step": 9001 + }, + { + "epoch": 0.6494715197864435, + "grad_norm": 2.77733370622195, + "learning_rate": 1.1561285585825946e-06, + "loss": 0.8469, + "step": 9002 + }, + { + "epoch": 0.649543667255871, + "grad_norm": 3.652972194846717, + "learning_rate": 1.1557048611878513e-06, + "loss": 0.9045, + "step": 9003 + }, + { + "epoch": 0.6496158147252985, + "grad_norm": 3.779615443009208, + "learning_rate": 1.1552812098969407e-06, + "loss": 0.8404, + "step": 9004 + }, + { + "epoch": 0.649687962194726, + "grad_norm": 2.4058459054080847, + "learning_rate": 1.1548576047329955e-06, + "loss": 0.9409, + "step": 9005 + }, + { + "epoch": 0.6497601096641535, + "grad_norm": 2.5660195719804286, + "learning_rate": 1.1544340457191478e-06, + "loss": 0.938, + "step": 9006 + }, + { + "epoch": 0.6498322571335811, + "grad_norm": 3.11315029352279, + "learning_rate": 1.1540105328785267e-06, + "loss": 0.9513, + "step": 9007 + }, + { + "epoch": 0.6499044046030086, + "grad_norm": 2.6721186833492356, + "learning_rate": 1.1535870662342583e-06, + "loss": 0.9879, + "step": 9008 + }, + { + "epoch": 0.6499765520724361, + "grad_norm": 3.938433218674216, + "learning_rate": 1.1531636458094674e-06, + "loss": 0.9776, + "step": 9009 + }, + { + "epoch": 0.6500486995418636, + "grad_norm": 2.8541233217238373, + "learning_rate": 1.1527402716272756e-06, + "loss": 0.8496, + "step": 9010 + }, + { + "epoch": 0.650120847011291, + "grad_norm": 2.229258300704193, + "learning_rate": 1.1523169437107997e-06, + "loss": 1.0316, + "step": 9011 + }, + { + "epoch": 0.6501929944807185, + "grad_norm": 2.494149419400118, + "learning_rate": 1.1518936620831588e-06, + "loss": 0.8572, + "step": 9012 + }, + { + "epoch": 0.6502651419501461, + "grad_norm": 5.846817598616716, + "learning_rate": 1.1514704267674646e-06, + "loss": 0.9839, + "step": 9013 + }, + { + "epoch": 0.6503372894195736, + "grad_norm": 3.8545865766644747, + "learning_rate": 1.1510472377868295e-06, + "loss": 0.8197, + "step": 9014 + }, + { + "epoch": 0.6504094368890011, + "grad_norm": 3.9338290343998437, + "learning_rate": 1.1506240951643618e-06, + "loss": 0.8365, + "step": 9015 + }, + { + "epoch": 0.6504815843584286, + "grad_norm": 3.4585833325088053, + "learning_rate": 1.1502009989231681e-06, + "loss": 0.8685, + "step": 9016 + }, + { + "epoch": 0.6505537318278561, + "grad_norm": 2.868297559510271, + "learning_rate": 1.149777949086352e-06, + "loss": 0.9402, + "step": 9017 + }, + { + "epoch": 0.6506258792972837, + "grad_norm": 4.011169812050893, + "learning_rate": 1.1493549456770145e-06, + "loss": 0.9702, + "step": 9018 + }, + { + "epoch": 0.6506980267667112, + "grad_norm": 2.00945740051752, + "learning_rate": 1.1489319887182553e-06, + "loss": 0.8945, + "step": 9019 + }, + { + "epoch": 0.6507701742361387, + "grad_norm": 2.983306373886395, + "learning_rate": 1.148509078233168e-06, + "loss": 0.9532, + "step": 9020 + }, + { + "epoch": 0.6508423217055662, + "grad_norm": 2.6174656962102487, + "learning_rate": 1.1480862142448497e-06, + "loss": 0.8173, + "step": 9021 + }, + { + "epoch": 0.6509144691749937, + "grad_norm": 2.9287698758240923, + "learning_rate": 1.1476633967763887e-06, + "loss": 0.9002, + "step": 9022 + }, + { + "epoch": 0.6509866166444211, + "grad_norm": 2.563412945639193, + "learning_rate": 1.1472406258508741e-06, + "loss": 0.8714, + "step": 9023 + }, + { + "epoch": 0.6510587641138487, + "grad_norm": 18.84634427104664, + "learning_rate": 1.1468179014913926e-06, + "loss": 0.917, + "step": 9024 + }, + { + "epoch": 0.6511309115832762, + "grad_norm": 2.835454221799551, + "learning_rate": 1.1463952237210265e-06, + "loss": 0.9437, + "step": 9025 + }, + { + "epoch": 0.6512030590527037, + "grad_norm": 3.732639563902959, + "learning_rate": 1.1459725925628579e-06, + "loss": 0.8976, + "step": 9026 + }, + { + "epoch": 0.6512752065221312, + "grad_norm": 3.515715602131145, + "learning_rate": 1.145550008039965e-06, + "loss": 0.9393, + "step": 9027 + }, + { + "epoch": 0.6513473539915587, + "grad_norm": 2.224419348741688, + "learning_rate": 1.1451274701754222e-06, + "loss": 0.8564, + "step": 9028 + }, + { + "epoch": 0.6514195014609863, + "grad_norm": 3.679265542751981, + "learning_rate": 1.1447049789923038e-06, + "loss": 0.8887, + "step": 9029 + }, + { + "epoch": 0.6514916489304138, + "grad_norm": 3.505245501400073, + "learning_rate": 1.1442825345136803e-06, + "loss": 0.8119, + "step": 9030 + }, + { + "epoch": 0.6515637963998413, + "grad_norm": 6.254308817461742, + "learning_rate": 1.1438601367626198e-06, + "loss": 1.0013, + "step": 9031 + }, + { + "epoch": 0.6516359438692688, + "grad_norm": 6.218279490091655, + "learning_rate": 1.1434377857621877e-06, + "loss": 0.8619, + "step": 9032 + }, + { + "epoch": 0.6517080913386963, + "grad_norm": 2.5983603075481656, + "learning_rate": 1.1430154815354475e-06, + "loss": 0.8947, + "step": 9033 + }, + { + "epoch": 0.6517802388081239, + "grad_norm": 2.8940956518978402, + "learning_rate": 1.1425932241054601e-06, + "loss": 0.8744, + "step": 9034 + }, + { + "epoch": 0.6518523862775513, + "grad_norm": 3.317846050827583, + "learning_rate": 1.1421710134952814e-06, + "loss": 0.8682, + "step": 9035 + }, + { + "epoch": 0.6519245337469788, + "grad_norm": 2.857745597477554, + "learning_rate": 1.1417488497279694e-06, + "loss": 0.9132, + "step": 9036 + }, + { + "epoch": 0.6519966812164063, + "grad_norm": 3.6068010937170727, + "learning_rate": 1.1413267328265739e-06, + "loss": 0.9396, + "step": 9037 + }, + { + "epoch": 0.6520688286858338, + "grad_norm": 2.562149023815605, + "learning_rate": 1.1409046628141485e-06, + "loss": 0.9513, + "step": 9038 + }, + { + "epoch": 0.6521409761552613, + "grad_norm": 2.484057712087217, + "learning_rate": 1.1404826397137385e-06, + "loss": 0.9128, + "step": 9039 + }, + { + "epoch": 0.6522131236246889, + "grad_norm": 4.023589670210981, + "learning_rate": 1.1400606635483895e-06, + "loss": 1.0024, + "step": 9040 + }, + { + "epoch": 0.6522852710941164, + "grad_norm": 3.4522028593659377, + "learning_rate": 1.1396387343411446e-06, + "loss": 0.8023, + "step": 9041 + }, + { + "epoch": 0.6523574185635439, + "grad_norm": 2.3454315257001315, + "learning_rate": 1.1392168521150433e-06, + "loss": 0.8577, + "step": 9042 + }, + { + "epoch": 0.6524295660329714, + "grad_norm": 2.3264562783673863, + "learning_rate": 1.1387950168931242e-06, + "loss": 0.9685, + "step": 9043 + }, + { + "epoch": 0.6525017135023989, + "grad_norm": 2.975470074612916, + "learning_rate": 1.13837322869842e-06, + "loss": 0.9691, + "step": 9044 + }, + { + "epoch": 0.6525738609718265, + "grad_norm": 8.700410977883863, + "learning_rate": 1.1379514875539645e-06, + "loss": 0.9399, + "step": 9045 + }, + { + "epoch": 0.652646008441254, + "grad_norm": 5.129214345099416, + "learning_rate": 1.1375297934827868e-06, + "loss": 0.8414, + "step": 9046 + }, + { + "epoch": 0.6527181559106814, + "grad_norm": 4.194894894180041, + "learning_rate": 1.1371081465079142e-06, + "loss": 0.9118, + "step": 9047 + }, + { + "epoch": 0.6527903033801089, + "grad_norm": 2.789432789560556, + "learning_rate": 1.1366865466523718e-06, + "loss": 0.8857, + "step": 9048 + }, + { + "epoch": 0.6528624508495364, + "grad_norm": 4.105423452488569, + "learning_rate": 1.1362649939391817e-06, + "loss": 0.973, + "step": 9049 + }, + { + "epoch": 0.6529345983189639, + "grad_norm": 2.5761449435014288, + "learning_rate": 1.1358434883913614e-06, + "loss": 0.8853, + "step": 9050 + }, + { + "epoch": 0.6530067457883915, + "grad_norm": 3.1691200055373527, + "learning_rate": 1.1354220300319309e-06, + "loss": 0.9046, + "step": 9051 + }, + { + "epoch": 0.653078893257819, + "grad_norm": 2.7876107848181166, + "learning_rate": 1.1350006188839016e-06, + "loss": 1.0842, + "step": 9052 + }, + { + "epoch": 0.6531510407272465, + "grad_norm": 2.3898410883150123, + "learning_rate": 1.1345792549702866e-06, + "loss": 0.9464, + "step": 9053 + }, + { + "epoch": 0.653223188196674, + "grad_norm": 7.09480607158218, + "learning_rate": 1.1341579383140948e-06, + "loss": 0.9695, + "step": 9054 + }, + { + "epoch": 0.6532953356661015, + "grad_norm": 2.3135836356276793, + "learning_rate": 1.133736668938333e-06, + "loss": 0.9044, + "step": 9055 + }, + { + "epoch": 0.6533674831355291, + "grad_norm": 2.1832976166033835, + "learning_rate": 1.1333154468660047e-06, + "loss": 0.9503, + "step": 9056 + }, + { + "epoch": 0.6534396306049566, + "grad_norm": 2.3543629747310986, + "learning_rate": 1.1328942721201116e-06, + "loss": 0.9805, + "step": 9057 + }, + { + "epoch": 0.653511778074384, + "grad_norm": 2.0250098993506827, + "learning_rate": 1.132473144723653e-06, + "loss": 1.0104, + "step": 9058 + }, + { + "epoch": 0.6535839255438115, + "grad_norm": 3.142414551076199, + "learning_rate": 1.1320520646996228e-06, + "loss": 0.9352, + "step": 9059 + }, + { + "epoch": 0.653656073013239, + "grad_norm": 3.9861559907791753, + "learning_rate": 1.1316310320710182e-06, + "loss": 0.7953, + "step": 9060 + }, + { + "epoch": 0.6537282204826665, + "grad_norm": 2.0192527357590384, + "learning_rate": 1.1312100468608274e-06, + "loss": 0.8546, + "step": 9061 + }, + { + "epoch": 0.6538003679520941, + "grad_norm": 2.5530633925962336, + "learning_rate": 1.13078910909204e-06, + "loss": 0.9474, + "step": 9062 + }, + { + "epoch": 0.6538725154215216, + "grad_norm": 2.6805212208641307, + "learning_rate": 1.130368218787641e-06, + "loss": 0.9035, + "step": 9063 + }, + { + "epoch": 0.6539446628909491, + "grad_norm": 4.0568651294250255, + "learning_rate": 1.129947375970615e-06, + "loss": 0.9829, + "step": 9064 + }, + { + "epoch": 0.6540168103603766, + "grad_norm": 0.7547727359002022, + "learning_rate": 1.1295265806639417e-06, + "loss": 0.7964, + "step": 9065 + }, + { + "epoch": 0.6540889578298041, + "grad_norm": 4.464005946521148, + "learning_rate": 1.1291058328906004e-06, + "loss": 0.9472, + "step": 9066 + }, + { + "epoch": 0.6541611052992317, + "grad_norm": 0.7941390923646172, + "learning_rate": 1.1286851326735648e-06, + "loss": 0.8489, + "step": 9067 + }, + { + "epoch": 0.6542332527686592, + "grad_norm": 2.838086199601271, + "learning_rate": 1.1282644800358085e-06, + "loss": 0.8398, + "step": 9068 + }, + { + "epoch": 0.6543054002380867, + "grad_norm": 3.1623008816121096, + "learning_rate": 1.1278438750003018e-06, + "loss": 0.94, + "step": 9069 + }, + { + "epoch": 0.6543775477075141, + "grad_norm": 2.6928079218989875, + "learning_rate": 1.1274233175900129e-06, + "loss": 0.9091, + "step": 9070 + }, + { + "epoch": 0.6544496951769416, + "grad_norm": 3.9015522288720805, + "learning_rate": 1.127002807827906e-06, + "loss": 0.9217, + "step": 9071 + }, + { + "epoch": 0.6545218426463691, + "grad_norm": 2.5746229025257197, + "learning_rate": 1.1265823457369438e-06, + "loss": 0.928, + "step": 9072 + }, + { + "epoch": 0.6545939901157967, + "grad_norm": 2.612702127330716, + "learning_rate": 1.126161931340088e-06, + "loss": 0.9104, + "step": 9073 + }, + { + "epoch": 0.6546661375852242, + "grad_norm": 4.604808342638869, + "learning_rate": 1.1257415646602921e-06, + "loss": 0.9138, + "step": 9074 + }, + { + "epoch": 0.6547382850546517, + "grad_norm": 4.4414163419932615, + "learning_rate": 1.1253212457205147e-06, + "loss": 0.8981, + "step": 9075 + }, + { + "epoch": 0.6548104325240792, + "grad_norm": 2.7589993179005203, + "learning_rate": 1.1249009745437055e-06, + "loss": 0.9169, + "step": 9076 + }, + { + "epoch": 0.6548825799935067, + "grad_norm": 2.5350051153007227, + "learning_rate": 1.1244807511528142e-06, + "loss": 0.959, + "step": 9077 + }, + { + "epoch": 0.6549547274629343, + "grad_norm": 3.9519143644390704, + "learning_rate": 1.1240605755707884e-06, + "loss": 0.8225, + "step": 9078 + }, + { + "epoch": 0.6550268749323618, + "grad_norm": 8.194641260344603, + "learning_rate": 1.1236404478205719e-06, + "loss": 0.949, + "step": 9079 + }, + { + "epoch": 0.6550990224017893, + "grad_norm": 2.619672273275124, + "learning_rate": 1.1232203679251063e-06, + "loss": 0.9571, + "step": 9080 + }, + { + "epoch": 0.6551711698712168, + "grad_norm": 2.8182873838962483, + "learning_rate": 1.1228003359073306e-06, + "loss": 0.9314, + "step": 9081 + }, + { + "epoch": 0.6552433173406442, + "grad_norm": 6.069427622120248, + "learning_rate": 1.1223803517901822e-06, + "loss": 0.8698, + "step": 9082 + }, + { + "epoch": 0.6553154648100717, + "grad_norm": 2.1819321754327006, + "learning_rate": 1.1219604155965931e-06, + "loss": 0.9022, + "step": 9083 + }, + { + "epoch": 0.6553876122794993, + "grad_norm": 2.9269542914530406, + "learning_rate": 1.1215405273494952e-06, + "loss": 0.8928, + "step": 9084 + }, + { + "epoch": 0.6554597597489268, + "grad_norm": 2.96368699256598, + "learning_rate": 1.1211206870718175e-06, + "loss": 0.8374, + "step": 9085 + }, + { + "epoch": 0.6555319072183543, + "grad_norm": 4.530035448083014, + "learning_rate": 1.1207008947864851e-06, + "loss": 0.9319, + "step": 9086 + }, + { + "epoch": 0.6556040546877818, + "grad_norm": 4.940394844615471, + "learning_rate": 1.1202811505164223e-06, + "loss": 0.956, + "step": 9087 + }, + { + "epoch": 0.6556762021572093, + "grad_norm": 3.1668229482802466, + "learning_rate": 1.1198614542845497e-06, + "loss": 0.897, + "step": 9088 + }, + { + "epoch": 0.6557483496266369, + "grad_norm": 2.2941489098865495, + "learning_rate": 1.1194418061137833e-06, + "loss": 0.8883, + "step": 9089 + }, + { + "epoch": 0.6558204970960644, + "grad_norm": 3.8857214486833476, + "learning_rate": 1.1190222060270418e-06, + "loss": 1.0663, + "step": 9090 + }, + { + "epoch": 0.6558926445654919, + "grad_norm": 3.423241608036837, + "learning_rate": 1.1186026540472357e-06, + "loss": 0.8981, + "step": 9091 + }, + { + "epoch": 0.6559647920349194, + "grad_norm": 3.6039838946678024, + "learning_rate": 1.118183150197276e-06, + "loss": 0.8821, + "step": 9092 + }, + { + "epoch": 0.6560369395043469, + "grad_norm": 3.5936096163987044, + "learning_rate": 1.1177636945000698e-06, + "loss": 0.8431, + "step": 9093 + }, + { + "epoch": 0.6561090869737743, + "grad_norm": 3.1234917624999867, + "learning_rate": 1.1173442869785223e-06, + "loss": 0.9437, + "step": 9094 + }, + { + "epoch": 0.6561812344432019, + "grad_norm": 2.320734981002954, + "learning_rate": 1.116924927655536e-06, + "loss": 0.8482, + "step": 9095 + }, + { + "epoch": 0.6562533819126294, + "grad_norm": 3.0954407974913565, + "learning_rate": 1.11650561655401e-06, + "loss": 0.8274, + "step": 9096 + }, + { + "epoch": 0.6563255293820569, + "grad_norm": 2.846395079844245, + "learning_rate": 1.1160863536968427e-06, + "loss": 0.8498, + "step": 9097 + }, + { + "epoch": 0.6563976768514844, + "grad_norm": 2.2192972072588044, + "learning_rate": 1.1156671391069268e-06, + "loss": 0.8795, + "step": 9098 + }, + { + "epoch": 0.6564698243209119, + "grad_norm": 2.6224951827856864, + "learning_rate": 1.1152479728071547e-06, + "loss": 1.0047, + "step": 9099 + }, + { + "epoch": 0.6565419717903395, + "grad_norm": 3.6147093883744437, + "learning_rate": 1.1148288548204155e-06, + "loss": 0.8642, + "step": 9100 + }, + { + "epoch": 0.656614119259767, + "grad_norm": 4.489808090968431, + "learning_rate": 1.1144097851695958e-06, + "loss": 0.9583, + "step": 9101 + }, + { + "epoch": 0.6566862667291945, + "grad_norm": 4.28916655867622, + "learning_rate": 1.1139907638775794e-06, + "loss": 0.8683, + "step": 9102 + }, + { + "epoch": 0.656758414198622, + "grad_norm": 2.3538539532493385, + "learning_rate": 1.1135717909672474e-06, + "loss": 0.9005, + "step": 9103 + }, + { + "epoch": 0.6568305616680495, + "grad_norm": 2.86325676921876, + "learning_rate": 1.1131528664614787e-06, + "loss": 0.9874, + "step": 9104 + }, + { + "epoch": 0.656902709137477, + "grad_norm": 2.3832936285842012, + "learning_rate": 1.1127339903831496e-06, + "loss": 0.9686, + "step": 9105 + }, + { + "epoch": 0.6569748566069045, + "grad_norm": 2.5362105565131996, + "learning_rate": 1.1123151627551316e-06, + "loss": 0.9947, + "step": 9106 + }, + { + "epoch": 0.657047004076332, + "grad_norm": 2.525636453014933, + "learning_rate": 1.1118963836002971e-06, + "loss": 0.9457, + "step": 9107 + }, + { + "epoch": 0.6571191515457595, + "grad_norm": 2.4811322623888468, + "learning_rate": 1.111477652941513e-06, + "loss": 0.9818, + "step": 9108 + }, + { + "epoch": 0.657191299015187, + "grad_norm": 2.546732892711532, + "learning_rate": 1.1110589708016454e-06, + "loss": 0.8387, + "step": 9109 + }, + { + "epoch": 0.6572634464846145, + "grad_norm": 3.727840180560883, + "learning_rate": 1.1106403372035563e-06, + "loss": 0.939, + "step": 9110 + }, + { + "epoch": 0.6573355939540421, + "grad_norm": 2.4633331748263765, + "learning_rate": 1.1102217521701063e-06, + "loss": 0.9727, + "step": 9111 + }, + { + "epoch": 0.6574077414234696, + "grad_norm": 2.7831723716776815, + "learning_rate": 1.1098032157241535e-06, + "loss": 0.9153, + "step": 9112 + }, + { + "epoch": 0.6574798888928971, + "grad_norm": 2.8743857681261833, + "learning_rate": 1.1093847278885497e-06, + "loss": 0.9584, + "step": 9113 + }, + { + "epoch": 0.6575520363623246, + "grad_norm": 3.1005067349628197, + "learning_rate": 1.1089662886861509e-06, + "loss": 0.899, + "step": 9114 + }, + { + "epoch": 0.6576241838317521, + "grad_norm": 2.4093020021139187, + "learning_rate": 1.1085478981398036e-06, + "loss": 0.9237, + "step": 9115 + }, + { + "epoch": 0.6576963313011797, + "grad_norm": 2.9231704859103353, + "learning_rate": 1.1081295562723557e-06, + "loss": 0.8752, + "step": 9116 + }, + { + "epoch": 0.6577684787706071, + "grad_norm": 3.948838514668484, + "learning_rate": 1.107711263106651e-06, + "loss": 1.0069, + "step": 9117 + }, + { + "epoch": 0.6578406262400346, + "grad_norm": 2.9665615648947092, + "learning_rate": 1.1072930186655312e-06, + "loss": 0.8689, + "step": 9118 + }, + { + "epoch": 0.6579127737094621, + "grad_norm": 5.274283967076142, + "learning_rate": 1.1068748229718349e-06, + "loss": 0.8924, + "step": 9119 + }, + { + "epoch": 0.6579849211788896, + "grad_norm": 2.2974784214139623, + "learning_rate": 1.1064566760483983e-06, + "loss": 0.9875, + "step": 9120 + }, + { + "epoch": 0.6580570686483171, + "grad_norm": 0.7465638681800796, + "learning_rate": 1.1060385779180553e-06, + "loss": 0.773, + "step": 9121 + }, + { + "epoch": 0.6581292161177447, + "grad_norm": 3.395012192626992, + "learning_rate": 1.1056205286036359e-06, + "loss": 0.9247, + "step": 9122 + }, + { + "epoch": 0.6582013635871722, + "grad_norm": 2.5447447123016786, + "learning_rate": 1.1052025281279683e-06, + "loss": 0.8624, + "step": 9123 + }, + { + "epoch": 0.6582735110565997, + "grad_norm": 2.738019247757862, + "learning_rate": 1.1047845765138783e-06, + "loss": 0.9799, + "step": 9124 + }, + { + "epoch": 0.6583456585260272, + "grad_norm": 2.5436091166105395, + "learning_rate": 1.1043666737841887e-06, + "loss": 0.9646, + "step": 9125 + }, + { + "epoch": 0.6584178059954547, + "grad_norm": 8.353229524049086, + "learning_rate": 1.1039488199617196e-06, + "loss": 1.0278, + "step": 9126 + }, + { + "epoch": 0.6584899534648823, + "grad_norm": 2.585325508652103, + "learning_rate": 1.1035310150692888e-06, + "loss": 0.8895, + "step": 9127 + }, + { + "epoch": 0.6585621009343098, + "grad_norm": 2.701855929376, + "learning_rate": 1.1031132591297091e-06, + "loss": 0.9763, + "step": 9128 + }, + { + "epoch": 0.6586342484037372, + "grad_norm": 7.570298642676301, + "learning_rate": 1.1026955521657961e-06, + "loss": 0.9943, + "step": 9129 + }, + { + "epoch": 0.6587063958731647, + "grad_norm": 2.848353422245487, + "learning_rate": 1.1022778942003564e-06, + "loss": 0.8842, + "step": 9130 + }, + { + "epoch": 0.6587785433425922, + "grad_norm": 11.037397449401498, + "learning_rate": 1.1018602852561976e-06, + "loss": 0.9364, + "step": 9131 + }, + { + "epoch": 0.6588506908120197, + "grad_norm": 3.649232830787301, + "learning_rate": 1.1014427253561236e-06, + "loss": 0.7824, + "step": 9132 + }, + { + "epoch": 0.6589228382814473, + "grad_norm": 2.193490463510882, + "learning_rate": 1.1010252145229362e-06, + "loss": 0.8865, + "step": 9133 + }, + { + "epoch": 0.6589949857508748, + "grad_norm": 3.243928372985831, + "learning_rate": 1.100607752779434e-06, + "loss": 1.0056, + "step": 9134 + }, + { + "epoch": 0.6590671332203023, + "grad_norm": 3.2358764996344083, + "learning_rate": 1.1001903401484128e-06, + "loss": 0.9123, + "step": 9135 + }, + { + "epoch": 0.6591392806897298, + "grad_norm": 2.5754618443495603, + "learning_rate": 1.0997729766526673e-06, + "loss": 0.949, + "step": 9136 + }, + { + "epoch": 0.6592114281591573, + "grad_norm": 0.7046221582268456, + "learning_rate": 1.099355662314986e-06, + "loss": 0.8136, + "step": 9137 + }, + { + "epoch": 0.6592835756285849, + "grad_norm": 2.2342571947746626, + "learning_rate": 1.0989383971581578e-06, + "loss": 0.9616, + "step": 9138 + }, + { + "epoch": 0.6593557230980124, + "grad_norm": 2.2967472170622636, + "learning_rate": 1.0985211812049682e-06, + "loss": 0.8435, + "step": 9139 + }, + { + "epoch": 0.6594278705674399, + "grad_norm": 12.713665820053018, + "learning_rate": 1.0981040144781997e-06, + "loss": 0.816, + "step": 9140 + }, + { + "epoch": 0.6595000180368673, + "grad_norm": 2.915355242222548, + "learning_rate": 1.0976868970006324e-06, + "loss": 0.8655, + "step": 9141 + }, + { + "epoch": 0.6595721655062948, + "grad_norm": 2.65883116131481, + "learning_rate": 1.0972698287950432e-06, + "loss": 0.9459, + "step": 9142 + }, + { + "epoch": 0.6596443129757223, + "grad_norm": 1.8848157806838493, + "learning_rate": 1.096852809884207e-06, + "loss": 0.8979, + "step": 9143 + }, + { + "epoch": 0.6597164604451499, + "grad_norm": 2.800823216675868, + "learning_rate": 1.0964358402908965e-06, + "loss": 0.9774, + "step": 9144 + }, + { + "epoch": 0.6597886079145774, + "grad_norm": 2.5595216888385153, + "learning_rate": 1.0960189200378787e-06, + "loss": 0.9744, + "step": 9145 + }, + { + "epoch": 0.6598607553840049, + "grad_norm": 1.8536932551624121, + "learning_rate": 1.0956020491479215e-06, + "loss": 0.9811, + "step": 9146 + }, + { + "epoch": 0.6599329028534324, + "grad_norm": 2.989344907462008, + "learning_rate": 1.0951852276437882e-06, + "loss": 0.9271, + "step": 9147 + }, + { + "epoch": 0.6600050503228599, + "grad_norm": 0.7830850412292573, + "learning_rate": 1.09476845554824e-06, + "loss": 0.7906, + "step": 9148 + }, + { + "epoch": 0.6600771977922875, + "grad_norm": 2.5917976109173786, + "learning_rate": 1.0943517328840359e-06, + "loss": 0.8766, + "step": 9149 + }, + { + "epoch": 0.660149345261715, + "grad_norm": 0.8646276684772675, + "learning_rate": 1.0939350596739308e-06, + "loss": 0.9, + "step": 9150 + }, + { + "epoch": 0.6602214927311425, + "grad_norm": 2.923613005874354, + "learning_rate": 1.0935184359406788e-06, + "loss": 0.8948, + "step": 9151 + }, + { + "epoch": 0.66029364020057, + "grad_norm": 3.6908464925713282, + "learning_rate": 1.0931018617070278e-06, + "loss": 0.8786, + "step": 9152 + }, + { + "epoch": 0.6603657876699974, + "grad_norm": 7.133018933554383, + "learning_rate": 1.0926853369957284e-06, + "loss": 0.8813, + "step": 9153 + }, + { + "epoch": 0.660437935139425, + "grad_norm": 3.8613304713178884, + "learning_rate": 1.092268861829523e-06, + "loss": 0.8996, + "step": 9154 + }, + { + "epoch": 0.6605100826088525, + "grad_norm": 2.6275665814849902, + "learning_rate": 1.091852436231155e-06, + "loss": 0.8683, + "step": 9155 + }, + { + "epoch": 0.66058223007828, + "grad_norm": 2.0702603315582278, + "learning_rate": 1.0914360602233636e-06, + "loss": 0.8773, + "step": 9156 + }, + { + "epoch": 0.6606543775477075, + "grad_norm": 3.478743034353078, + "learning_rate": 1.0910197338288856e-06, + "loss": 0.892, + "step": 9157 + }, + { + "epoch": 0.660726525017135, + "grad_norm": 3.618501165132861, + "learning_rate": 1.090603457070455e-06, + "loss": 0.9319, + "step": 9158 + }, + { + "epoch": 0.6607986724865625, + "grad_norm": 2.606257420810591, + "learning_rate": 1.0901872299708038e-06, + "loss": 0.8304, + "step": 9159 + }, + { + "epoch": 0.6608708199559901, + "grad_norm": 4.365961165377334, + "learning_rate": 1.0897710525526593e-06, + "loss": 0.9872, + "step": 9160 + }, + { + "epoch": 0.6609429674254176, + "grad_norm": 3.2123453065905623, + "learning_rate": 1.089354924838748e-06, + "loss": 0.9841, + "step": 9161 + }, + { + "epoch": 0.6610151148948451, + "grad_norm": 3.036551018811268, + "learning_rate": 1.0889388468517932e-06, + "loss": 1.0014, + "step": 9162 + }, + { + "epoch": 0.6610872623642726, + "grad_norm": 0.70212500775843, + "learning_rate": 1.0885228186145152e-06, + "loss": 0.8059, + "step": 9163 + }, + { + "epoch": 0.6611594098337, + "grad_norm": 2.431141605265373, + "learning_rate": 1.088106840149632e-06, + "loss": 0.8563, + "step": 9164 + }, + { + "epoch": 0.6612315573031275, + "grad_norm": 2.585662366169677, + "learning_rate": 1.0876909114798585e-06, + "loss": 0.9326, + "step": 9165 + }, + { + "epoch": 0.6613037047725551, + "grad_norm": 3.4281251168403153, + "learning_rate": 1.0872750326279079e-06, + "loss": 0.8486, + "step": 9166 + }, + { + "epoch": 0.6613758522419826, + "grad_norm": 3.146805333083267, + "learning_rate": 1.0868592036164876e-06, + "loss": 0.9727, + "step": 9167 + }, + { + "epoch": 0.6614479997114101, + "grad_norm": 2.5226635764763734, + "learning_rate": 1.0864434244683072e-06, + "loss": 0.942, + "step": 9168 + }, + { + "epoch": 0.6615201471808376, + "grad_norm": 2.384767324586055, + "learning_rate": 1.0860276952060688e-06, + "loss": 0.8711, + "step": 9169 + }, + { + "epoch": 0.6615922946502651, + "grad_norm": 2.7154894163922516, + "learning_rate": 1.0856120158524744e-06, + "loss": 0.8981, + "step": 9170 + }, + { + "epoch": 0.6616644421196927, + "grad_norm": 5.2250299188222735, + "learning_rate": 1.085196386430223e-06, + "loss": 0.9356, + "step": 9171 + }, + { + "epoch": 0.6617365895891202, + "grad_norm": 2.7062640183153994, + "learning_rate": 1.0847808069620103e-06, + "loss": 0.8944, + "step": 9172 + }, + { + "epoch": 0.6618087370585477, + "grad_norm": 2.9208746028783477, + "learning_rate": 1.08436527747053e-06, + "loss": 0.9038, + "step": 9173 + }, + { + "epoch": 0.6618808845279752, + "grad_norm": 3.0660725211508053, + "learning_rate": 1.0839497979784722e-06, + "loss": 0.9484, + "step": 9174 + }, + { + "epoch": 0.6619530319974027, + "grad_norm": 2.6104161361202625, + "learning_rate": 1.0835343685085256e-06, + "loss": 0.8942, + "step": 9175 + }, + { + "epoch": 0.6620251794668301, + "grad_norm": 3.529528442959921, + "learning_rate": 1.0831189890833737e-06, + "loss": 0.9049, + "step": 9176 + }, + { + "epoch": 0.6620973269362577, + "grad_norm": 0.8247962483311493, + "learning_rate": 1.0827036597256998e-06, + "loss": 0.8646, + "step": 9177 + }, + { + "epoch": 0.6621694744056852, + "grad_norm": 4.780966756571458, + "learning_rate": 1.082288380458183e-06, + "loss": 0.9823, + "step": 9178 + }, + { + "epoch": 0.6622416218751127, + "grad_norm": 2.7759127894122697, + "learning_rate": 1.0818731513035007e-06, + "loss": 0.9834, + "step": 9179 + }, + { + "epoch": 0.6623137693445402, + "grad_norm": 10.586214969841455, + "learning_rate": 1.0814579722843267e-06, + "loss": 0.9421, + "step": 9180 + }, + { + "epoch": 0.6623859168139677, + "grad_norm": 0.8637062165995344, + "learning_rate": 1.0810428434233335e-06, + "loss": 0.7913, + "step": 9181 + }, + { + "epoch": 0.6624580642833953, + "grad_norm": 4.3470545212665845, + "learning_rate": 1.080627764743187e-06, + "loss": 0.7595, + "step": 9182 + }, + { + "epoch": 0.6625302117528228, + "grad_norm": 3.666700897634996, + "learning_rate": 1.0802127362665565e-06, + "loss": 0.8745, + "step": 9183 + }, + { + "epoch": 0.6626023592222503, + "grad_norm": 4.117059173610901, + "learning_rate": 1.0797977580161029e-06, + "loss": 0.8984, + "step": 9184 + }, + { + "epoch": 0.6626745066916778, + "grad_norm": 2.56413933024579, + "learning_rate": 1.079382830014487e-06, + "loss": 0.926, + "step": 9185 + }, + { + "epoch": 0.6627466541611053, + "grad_norm": 1.985740312080327, + "learning_rate": 1.078967952284367e-06, + "loss": 0.9808, + "step": 9186 + }, + { + "epoch": 0.6628188016305329, + "grad_norm": 2.396615296062206, + "learning_rate": 1.0785531248483972e-06, + "loss": 0.8958, + "step": 9187 + }, + { + "epoch": 0.6628909490999603, + "grad_norm": 6.618879838197699, + "learning_rate": 1.0781383477292312e-06, + "loss": 0.8486, + "step": 9188 + }, + { + "epoch": 0.6629630965693878, + "grad_norm": 2.7167992444442848, + "learning_rate": 1.0777236209495159e-06, + "loss": 0.864, + "step": 9189 + }, + { + "epoch": 0.6630352440388153, + "grad_norm": 4.308988606680226, + "learning_rate": 1.077308944531901e-06, + "loss": 0.8761, + "step": 9190 + }, + { + "epoch": 0.6631073915082428, + "grad_norm": 2.294151300151819, + "learning_rate": 1.0768943184990273e-06, + "loss": 0.9153, + "step": 9191 + }, + { + "epoch": 0.6631795389776703, + "grad_norm": 3.587070925902304, + "learning_rate": 1.0764797428735396e-06, + "loss": 0.8793, + "step": 9192 + }, + { + "epoch": 0.6632516864470979, + "grad_norm": 2.801988583538097, + "learning_rate": 1.076065217678073e-06, + "loss": 0.9168, + "step": 9193 + }, + { + "epoch": 0.6633238339165254, + "grad_norm": 2.788981887977415, + "learning_rate": 1.075650742935265e-06, + "loss": 0.9906, + "step": 9194 + }, + { + "epoch": 0.6633959813859529, + "grad_norm": 0.7775428172032594, + "learning_rate": 1.075236318667748e-06, + "loss": 0.7899, + "step": 9195 + }, + { + "epoch": 0.6634681288553804, + "grad_norm": 3.2274208682026977, + "learning_rate": 1.0748219448981523e-06, + "loss": 0.9378, + "step": 9196 + }, + { + "epoch": 0.6635402763248079, + "grad_norm": 2.279342167133813, + "learning_rate": 1.0744076216491053e-06, + "loss": 0.8241, + "step": 9197 + }, + { + "epoch": 0.6636124237942355, + "grad_norm": 0.8214125024515118, + "learning_rate": 1.0739933489432324e-06, + "loss": 0.8222, + "step": 9198 + }, + { + "epoch": 0.663684571263663, + "grad_norm": 2.532302355119333, + "learning_rate": 1.073579126803154e-06, + "loss": 0.9873, + "step": 9199 + }, + { + "epoch": 0.6637567187330904, + "grad_norm": 3.1897966675246505, + "learning_rate": 1.07316495525149e-06, + "loss": 0.9091, + "step": 9200 + }, + { + "epoch": 0.6638288662025179, + "grad_norm": 0.6959970083775298, + "learning_rate": 1.072750834310857e-06, + "loss": 0.801, + "step": 9201 + }, + { + "epoch": 0.6639010136719454, + "grad_norm": 2.2778375263699795, + "learning_rate": 1.0723367640038678e-06, + "loss": 0.8916, + "step": 9202 + }, + { + "epoch": 0.663973161141373, + "grad_norm": 3.1002406611394577, + "learning_rate": 1.071922744353135e-06, + "loss": 0.9117, + "step": 9203 + }, + { + "epoch": 0.6640453086108005, + "grad_norm": 2.876737649984082, + "learning_rate": 1.0715087753812637e-06, + "loss": 0.8178, + "step": 9204 + }, + { + "epoch": 0.664117456080228, + "grad_norm": 2.242492442378787, + "learning_rate": 1.071094857110863e-06, + "loss": 0.8682, + "step": 9205 + }, + { + "epoch": 0.6641896035496555, + "grad_norm": 2.9206166541194385, + "learning_rate": 1.0706809895645315e-06, + "loss": 0.8902, + "step": 9206 + }, + { + "epoch": 0.664261751019083, + "grad_norm": 3.5213503185278086, + "learning_rate": 1.070267172764873e-06, + "loss": 0.9641, + "step": 9207 + }, + { + "epoch": 0.6643338984885105, + "grad_norm": 3.0950544285340733, + "learning_rate": 1.0698534067344812e-06, + "loss": 0.8756, + "step": 9208 + }, + { + "epoch": 0.6644060459579381, + "grad_norm": 2.3470377940518476, + "learning_rate": 1.0694396914959515e-06, + "loss": 0.9957, + "step": 9209 + }, + { + "epoch": 0.6644781934273656, + "grad_norm": 2.1394810648109437, + "learning_rate": 1.0690260270718757e-06, + "loss": 0.9653, + "step": 9210 + }, + { + "epoch": 0.664550340896793, + "grad_norm": 4.197650960964703, + "learning_rate": 1.0686124134848421e-06, + "loss": 0.9528, + "step": 9211 + }, + { + "epoch": 0.6646224883662205, + "grad_norm": 3.2578739736781013, + "learning_rate": 1.0681988507574375e-06, + "loss": 0.8797, + "step": 9212 + }, + { + "epoch": 0.664694635835648, + "grad_norm": 2.489829068578999, + "learning_rate": 1.0677853389122428e-06, + "loss": 0.8479, + "step": 9213 + }, + { + "epoch": 0.6647667833050755, + "grad_norm": 2.6553676261625285, + "learning_rate": 1.0673718779718414e-06, + "loss": 0.8268, + "step": 9214 + }, + { + "epoch": 0.6648389307745031, + "grad_norm": 1.7340025630786788, + "learning_rate": 1.0669584679588082e-06, + "loss": 0.9489, + "step": 9215 + }, + { + "epoch": 0.6649110782439306, + "grad_norm": 2.0994677686622416, + "learning_rate": 1.0665451088957194e-06, + "loss": 0.8524, + "step": 9216 + }, + { + "epoch": 0.6649832257133581, + "grad_norm": 2.067069107782828, + "learning_rate": 1.0661318008051465e-06, + "loss": 0.9402, + "step": 9217 + }, + { + "epoch": 0.6650553731827856, + "grad_norm": 2.1644296713348163, + "learning_rate": 1.0657185437096589e-06, + "loss": 0.816, + "step": 9218 + }, + { + "epoch": 0.6651275206522131, + "grad_norm": 4.2896661385758605, + "learning_rate": 1.065305337631823e-06, + "loss": 0.9646, + "step": 9219 + }, + { + "epoch": 0.6651996681216407, + "grad_norm": 1.954884034544309, + "learning_rate": 1.0648921825942035e-06, + "loss": 0.9238, + "step": 9220 + }, + { + "epoch": 0.6652718155910682, + "grad_norm": 5.985383668360008, + "learning_rate": 1.0644790786193584e-06, + "loss": 0.9592, + "step": 9221 + }, + { + "epoch": 0.6653439630604957, + "grad_norm": 2.7964279414223037, + "learning_rate": 1.0640660257298493e-06, + "loss": 0.978, + "step": 9222 + }, + { + "epoch": 0.6654161105299231, + "grad_norm": 2.491586451265221, + "learning_rate": 1.063653023948229e-06, + "loss": 0.896, + "step": 9223 + }, + { + "epoch": 0.6654882579993506, + "grad_norm": 2.9399270927056684, + "learning_rate": 1.063240073297051e-06, + "loss": 1.0324, + "step": 9224 + }, + { + "epoch": 0.6655604054687781, + "grad_norm": 2.3309878981778125, + "learning_rate": 1.0628271737988647e-06, + "loss": 0.885, + "step": 9225 + }, + { + "epoch": 0.6656325529382057, + "grad_norm": 10.542436493720087, + "learning_rate": 1.0624143254762171e-06, + "loss": 0.9372, + "step": 9226 + }, + { + "epoch": 0.6657047004076332, + "grad_norm": 2.877197918185782, + "learning_rate": 1.0620015283516536e-06, + "loss": 0.9388, + "step": 9227 + }, + { + "epoch": 0.6657768478770607, + "grad_norm": 2.1493740190712836, + "learning_rate": 1.0615887824477119e-06, + "loss": 1.0525, + "step": 9228 + }, + { + "epoch": 0.6658489953464882, + "grad_norm": 2.7782747449305822, + "learning_rate": 1.0611760877869352e-06, + "loss": 0.959, + "step": 9229 + }, + { + "epoch": 0.6659211428159157, + "grad_norm": 3.303259430516382, + "learning_rate": 1.060763444391855e-06, + "loss": 0.9541, + "step": 9230 + }, + { + "epoch": 0.6659932902853433, + "grad_norm": 2.4652969741670887, + "learning_rate": 1.0603508522850078e-06, + "loss": 1.0001, + "step": 9231 + }, + { + "epoch": 0.6660654377547708, + "grad_norm": 2.699012494522239, + "learning_rate": 1.0599383114889214e-06, + "loss": 0.9048, + "step": 9232 + }, + { + "epoch": 0.6661375852241983, + "grad_norm": 2.5031276212338023, + "learning_rate": 1.0595258220261236e-06, + "loss": 0.8478, + "step": 9233 + }, + { + "epoch": 0.6662097326936258, + "grad_norm": 4.259114757488469, + "learning_rate": 1.0591133839191393e-06, + "loss": 0.898, + "step": 9234 + }, + { + "epoch": 0.6662818801630532, + "grad_norm": 7.543513400358398, + "learning_rate": 1.0587009971904901e-06, + "loss": 1.0044, + "step": 9235 + }, + { + "epoch": 0.6663540276324807, + "grad_norm": 3.6543597690505614, + "learning_rate": 1.0582886618626956e-06, + "loss": 0.9241, + "step": 9236 + }, + { + "epoch": 0.6664261751019083, + "grad_norm": 2.242849112800892, + "learning_rate": 1.0578763779582702e-06, + "loss": 0.9696, + "step": 9237 + }, + { + "epoch": 0.6664983225713358, + "grad_norm": 2.9545636583314314, + "learning_rate": 1.0574641454997286e-06, + "loss": 0.9448, + "step": 9238 + }, + { + "epoch": 0.6665704700407633, + "grad_norm": 3.1767234389951744, + "learning_rate": 1.0570519645095806e-06, + "loss": 0.9014, + "step": 9239 + }, + { + "epoch": 0.6666426175101908, + "grad_norm": 4.171334335210784, + "learning_rate": 1.0566398350103342e-06, + "loss": 0.8841, + "step": 9240 + }, + { + "epoch": 0.6667147649796183, + "grad_norm": 2.2477683550178886, + "learning_rate": 1.0562277570244942e-06, + "loss": 0.9728, + "step": 9241 + }, + { + "epoch": 0.6667869124490459, + "grad_norm": 4.140699853760311, + "learning_rate": 1.0558157305745634e-06, + "loss": 0.8412, + "step": 9242 + }, + { + "epoch": 0.6668590599184734, + "grad_norm": 0.7574586730485666, + "learning_rate": 1.055403755683039e-06, + "loss": 0.815, + "step": 9243 + }, + { + "epoch": 0.6669312073879009, + "grad_norm": 3.1624454840382876, + "learning_rate": 1.05499183237242e-06, + "loss": 0.9738, + "step": 9244 + }, + { + "epoch": 0.6670033548573284, + "grad_norm": 3.305579387532581, + "learning_rate": 1.054579960665197e-06, + "loss": 0.8825, + "step": 9245 + }, + { + "epoch": 0.6670755023267559, + "grad_norm": 3.5467510453801623, + "learning_rate": 1.0541681405838645e-06, + "loss": 0.915, + "step": 9246 + }, + { + "epoch": 0.6671476497961834, + "grad_norm": 1.9671840797982394, + "learning_rate": 1.0537563721509075e-06, + "loss": 0.8132, + "step": 9247 + }, + { + "epoch": 0.6672197972656109, + "grad_norm": 2.301564211304152, + "learning_rate": 1.053344655388812e-06, + "loss": 0.9296, + "step": 9248 + }, + { + "epoch": 0.6672919447350384, + "grad_norm": 3.5469720633917383, + "learning_rate": 1.0529329903200609e-06, + "loss": 0.7999, + "step": 9249 + }, + { + "epoch": 0.6673640922044659, + "grad_norm": 2.4208660362078884, + "learning_rate": 1.052521376967133e-06, + "loss": 0.9984, + "step": 9250 + }, + { + "epoch": 0.6674362396738934, + "grad_norm": 2.1342162937569373, + "learning_rate": 1.052109815352506e-06, + "loss": 0.9382, + "step": 9251 + }, + { + "epoch": 0.667508387143321, + "grad_norm": 2.432203554004638, + "learning_rate": 1.0516983054986516e-06, + "loss": 0.9017, + "step": 9252 + }, + { + "epoch": 0.6675805346127485, + "grad_norm": 3.032144001744385, + "learning_rate": 1.051286847428044e-06, + "loss": 0.9674, + "step": 9253 + }, + { + "epoch": 0.667652682082176, + "grad_norm": 3.182173972859196, + "learning_rate": 1.0508754411631482e-06, + "loss": 0.9784, + "step": 9254 + }, + { + "epoch": 0.6677248295516035, + "grad_norm": 3.2898357143951675, + "learning_rate": 1.0504640867264317e-06, + "loss": 0.8187, + "step": 9255 + }, + { + "epoch": 0.667796977021031, + "grad_norm": 3.288238349647391, + "learning_rate": 1.0500527841403558e-06, + "loss": 0.8768, + "step": 9256 + }, + { + "epoch": 0.6678691244904585, + "grad_norm": 5.413454051871921, + "learning_rate": 1.0496415334273811e-06, + "loss": 0.9074, + "step": 9257 + }, + { + "epoch": 0.667941271959886, + "grad_norm": 2.863180327881187, + "learning_rate": 1.0492303346099643e-06, + "loss": 0.9011, + "step": 9258 + }, + { + "epoch": 0.6680134194293135, + "grad_norm": 3.30407290684432, + "learning_rate": 1.04881918771056e-06, + "loss": 0.8641, + "step": 9259 + }, + { + "epoch": 0.668085566898741, + "grad_norm": 1.9637046833989333, + "learning_rate": 1.0484080927516178e-06, + "loss": 0.9651, + "step": 9260 + }, + { + "epoch": 0.6681577143681685, + "grad_norm": 3.3381013307087595, + "learning_rate": 1.0479970497555868e-06, + "loss": 0.9341, + "step": 9261 + }, + { + "epoch": 0.668229861837596, + "grad_norm": 3.586391711089447, + "learning_rate": 1.0475860587449127e-06, + "loss": 0.9992, + "step": 9262 + }, + { + "epoch": 0.6683020093070235, + "grad_norm": 8.208123941865129, + "learning_rate": 1.0471751197420387e-06, + "loss": 0.8152, + "step": 9263 + }, + { + "epoch": 0.6683741567764511, + "grad_norm": 2.3957249713662323, + "learning_rate": 1.0467642327694038e-06, + "loss": 0.9052, + "step": 9264 + }, + { + "epoch": 0.6684463042458786, + "grad_norm": 2.96067547204915, + "learning_rate": 1.0463533978494454e-06, + "loss": 0.8327, + "step": 9265 + }, + { + "epoch": 0.6685184517153061, + "grad_norm": 3.035988633397151, + "learning_rate": 1.0459426150045985e-06, + "loss": 0.8314, + "step": 9266 + }, + { + "epoch": 0.6685905991847336, + "grad_norm": 2.759618756597131, + "learning_rate": 1.0455318842572922e-06, + "loss": 0.8455, + "step": 9267 + }, + { + "epoch": 0.6686627466541611, + "grad_norm": 4.081810466143493, + "learning_rate": 1.0451212056299578e-06, + "loss": 0.9877, + "step": 9268 + }, + { + "epoch": 0.6687348941235887, + "grad_norm": 7.081538103779659, + "learning_rate": 1.044710579145019e-06, + "loss": 0.9391, + "step": 9269 + }, + { + "epoch": 0.6688070415930161, + "grad_norm": 3.622731386828292, + "learning_rate": 1.0443000048248986e-06, + "loss": 0.8585, + "step": 9270 + }, + { + "epoch": 0.6688791890624436, + "grad_norm": 4.967482495135493, + "learning_rate": 1.0438894826920175e-06, + "loss": 0.8855, + "step": 9271 + }, + { + "epoch": 0.6689513365318711, + "grad_norm": 0.8358310382110434, + "learning_rate": 1.043479012768792e-06, + "loss": 0.8245, + "step": 9272 + }, + { + "epoch": 0.6690234840012986, + "grad_norm": 2.5237488923582885, + "learning_rate": 1.0430685950776368e-06, + "loss": 0.8052, + "step": 9273 + }, + { + "epoch": 0.6690956314707261, + "grad_norm": 2.964067963236679, + "learning_rate": 1.0426582296409634e-06, + "loss": 0.8945, + "step": 9274 + }, + { + "epoch": 0.6691677789401537, + "grad_norm": 3.5092381764045815, + "learning_rate": 1.042247916481181e-06, + "loss": 0.8636, + "step": 9275 + }, + { + "epoch": 0.6692399264095812, + "grad_norm": 2.4729306519221903, + "learning_rate": 1.0418376556206929e-06, + "loss": 0.8865, + "step": 9276 + }, + { + "epoch": 0.6693120738790087, + "grad_norm": 3.749775180435301, + "learning_rate": 1.041427447081904e-06, + "loss": 0.9319, + "step": 9277 + }, + { + "epoch": 0.6693842213484362, + "grad_norm": 6.0403285506869695, + "learning_rate": 1.0410172908872138e-06, + "loss": 0.8802, + "step": 9278 + }, + { + "epoch": 0.6694563688178637, + "grad_norm": 2.3219096534165904, + "learning_rate": 1.0406071870590189e-06, + "loss": 0.8642, + "step": 9279 + }, + { + "epoch": 0.6695285162872913, + "grad_norm": 2.955929348030307, + "learning_rate": 1.0401971356197142e-06, + "loss": 0.8587, + "step": 9280 + }, + { + "epoch": 0.6696006637567188, + "grad_norm": 2.7243227790856164, + "learning_rate": 1.0397871365916918e-06, + "loss": 0.8139, + "step": 9281 + }, + { + "epoch": 0.6696728112261462, + "grad_norm": 3.36367848672196, + "learning_rate": 1.0393771899973374e-06, + "loss": 0.8607, + "step": 9282 + }, + { + "epoch": 0.6697449586955737, + "grad_norm": 2.0524254011938705, + "learning_rate": 1.0389672958590402e-06, + "loss": 1.0023, + "step": 9283 + }, + { + "epoch": 0.6698171061650012, + "grad_norm": 3.6473115086107915, + "learning_rate": 1.0385574541991808e-06, + "loss": 0.6802, + "step": 9284 + }, + { + "epoch": 0.6698892536344287, + "grad_norm": 2.848577907531384, + "learning_rate": 1.0381476650401393e-06, + "loss": 0.9056, + "step": 9285 + }, + { + "epoch": 0.6699614011038563, + "grad_norm": 2.664252042320194, + "learning_rate": 1.0377379284042935e-06, + "loss": 0.902, + "step": 9286 + }, + { + "epoch": 0.6700335485732838, + "grad_norm": 3.469844336192508, + "learning_rate": 1.0373282443140169e-06, + "loss": 0.8071, + "step": 9287 + }, + { + "epoch": 0.6701056960427113, + "grad_norm": 4.107165768956316, + "learning_rate": 1.0369186127916816e-06, + "loss": 0.9903, + "step": 9288 + }, + { + "epoch": 0.6701778435121388, + "grad_norm": 2.8745319151262003, + "learning_rate": 1.0365090338596556e-06, + "loss": 0.9847, + "step": 9289 + }, + { + "epoch": 0.6702499909815663, + "grad_norm": 3.1167397237228727, + "learning_rate": 1.0360995075403049e-06, + "loss": 0.8577, + "step": 9290 + }, + { + "epoch": 0.6703221384509939, + "grad_norm": 2.5960993410567426, + "learning_rate": 1.0356900338559914e-06, + "loss": 0.8896, + "step": 9291 + }, + { + "epoch": 0.6703942859204214, + "grad_norm": 2.6757791115411163, + "learning_rate": 1.0352806128290757e-06, + "loss": 1.0356, + "step": 9292 + }, + { + "epoch": 0.6704664333898489, + "grad_norm": 0.7769143533598234, + "learning_rate": 1.0348712444819138e-06, + "loss": 0.8061, + "step": 9293 + }, + { + "epoch": 0.6705385808592763, + "grad_norm": 3.1471279250994737, + "learning_rate": 1.0344619288368611e-06, + "loss": 0.8347, + "step": 9294 + }, + { + "epoch": 0.6706107283287038, + "grad_norm": 2.31195453706017, + "learning_rate": 1.0340526659162676e-06, + "loss": 0.8381, + "step": 9295 + }, + { + "epoch": 0.6706828757981314, + "grad_norm": 2.7702038463406917, + "learning_rate": 1.0336434557424825e-06, + "loss": 0.96, + "step": 9296 + }, + { + "epoch": 0.6707550232675589, + "grad_norm": 3.010301387563997, + "learning_rate": 1.033234298337851e-06, + "loss": 0.8351, + "step": 9297 + }, + { + "epoch": 0.6708271707369864, + "grad_norm": 2.3334130773087454, + "learning_rate": 1.0328251937247162e-06, + "loss": 0.9294, + "step": 9298 + }, + { + "epoch": 0.6708993182064139, + "grad_norm": 4.103927192220624, + "learning_rate": 1.0324161419254166e-06, + "loss": 0.9708, + "step": 9299 + }, + { + "epoch": 0.6709714656758414, + "grad_norm": 2.2292456806000267, + "learning_rate": 1.0320071429622895e-06, + "loss": 1.0073, + "step": 9300 + }, + { + "epoch": 0.671043613145269, + "grad_norm": 3.7117157130147675, + "learning_rate": 1.031598196857669e-06, + "loss": 0.9299, + "step": 9301 + }, + { + "epoch": 0.6711157606146965, + "grad_norm": 2.370766832416833, + "learning_rate": 1.0311893036338858e-06, + "loss": 0.8075, + "step": 9302 + }, + { + "epoch": 0.671187908084124, + "grad_norm": 3.026786743112446, + "learning_rate": 1.0307804633132687e-06, + "loss": 0.9722, + "step": 9303 + }, + { + "epoch": 0.6712600555535515, + "grad_norm": 3.1955737914159137, + "learning_rate": 1.0303716759181424e-06, + "loss": 0.9143, + "step": 9304 + }, + { + "epoch": 0.671332203022979, + "grad_norm": 2.0090660605233652, + "learning_rate": 1.0299629414708305e-06, + "loss": 0.856, + "step": 9305 + }, + { + "epoch": 0.6714043504924064, + "grad_norm": 2.810686840948899, + "learning_rate": 1.0295542599936493e-06, + "loss": 0.941, + "step": 9306 + }, + { + "epoch": 0.671476497961834, + "grad_norm": 3.3854028946033976, + "learning_rate": 1.0291456315089195e-06, + "loss": 0.8642, + "step": 9307 + }, + { + "epoch": 0.6715486454312615, + "grad_norm": 2.270589611261748, + "learning_rate": 1.0287370560389518e-06, + "loss": 0.9266, + "step": 9308 + }, + { + "epoch": 0.671620792900689, + "grad_norm": 3.458837694518418, + "learning_rate": 1.028328533606058e-06, + "loss": 0.8992, + "step": 9309 + }, + { + "epoch": 0.6716929403701165, + "grad_norm": 2.8403700979618827, + "learning_rate": 1.0279200642325462e-06, + "loss": 0.9895, + "step": 9310 + }, + { + "epoch": 0.671765087839544, + "grad_norm": 2.250470112325633, + "learning_rate": 1.027511647940721e-06, + "loss": 0.8651, + "step": 9311 + }, + { + "epoch": 0.6718372353089715, + "grad_norm": 2.2743871219623313, + "learning_rate": 1.0271032847528847e-06, + "loss": 0.9088, + "step": 9312 + }, + { + "epoch": 0.6719093827783991, + "grad_norm": 4.2682118379599165, + "learning_rate": 1.026694974691337e-06, + "loss": 0.8637, + "step": 9313 + }, + { + "epoch": 0.6719815302478266, + "grad_norm": 2.671302600248105, + "learning_rate": 1.0262867177783733e-06, + "loss": 0.9169, + "step": 9314 + }, + { + "epoch": 0.6720536777172541, + "grad_norm": 5.735883550720139, + "learning_rate": 1.0258785140362873e-06, + "loss": 0.9458, + "step": 9315 + }, + { + "epoch": 0.6721258251866816, + "grad_norm": 0.7936291880542914, + "learning_rate": 1.0254703634873693e-06, + "loss": 0.8096, + "step": 9316 + }, + { + "epoch": 0.672197972656109, + "grad_norm": 4.818510983250233, + "learning_rate": 1.0250622661539074e-06, + "loss": 0.934, + "step": 9317 + }, + { + "epoch": 0.6722701201255366, + "grad_norm": 2.6863303189631327, + "learning_rate": 1.024654222058186e-06, + "loss": 0.8404, + "step": 9318 + }, + { + "epoch": 0.6723422675949641, + "grad_norm": 3.310218025011028, + "learning_rate": 1.0242462312224871e-06, + "loss": 0.7911, + "step": 9319 + }, + { + "epoch": 0.6724144150643916, + "grad_norm": 4.0863881783448095, + "learning_rate": 1.02383829366909e-06, + "loss": 0.9502, + "step": 9320 + }, + { + "epoch": 0.6724865625338191, + "grad_norm": 3.457728428773144, + "learning_rate": 1.0234304094202684e-06, + "loss": 0.8429, + "step": 9321 + }, + { + "epoch": 0.6725587100032466, + "grad_norm": 3.218514887546718, + "learning_rate": 1.0230225784982986e-06, + "loss": 0.8291, + "step": 9322 + }, + { + "epoch": 0.6726308574726741, + "grad_norm": 3.2137818743899262, + "learning_rate": 1.0226148009254486e-06, + "loss": 0.8637, + "step": 9323 + }, + { + "epoch": 0.6727030049421017, + "grad_norm": 3.144422218849286, + "learning_rate": 1.0222070767239857e-06, + "loss": 0.9443, + "step": 9324 + }, + { + "epoch": 0.6727751524115292, + "grad_norm": 2.3341400477634675, + "learning_rate": 1.021799405916175e-06, + "loss": 0.9566, + "step": 9325 + }, + { + "epoch": 0.6728472998809567, + "grad_norm": 0.7102880340940978, + "learning_rate": 1.0213917885242778e-06, + "loss": 0.7781, + "step": 9326 + }, + { + "epoch": 0.6729194473503842, + "grad_norm": 2.5386908156437804, + "learning_rate": 1.0209842245705519e-06, + "loss": 0.9658, + "step": 9327 + }, + { + "epoch": 0.6729915948198117, + "grad_norm": 1.9447708294948662, + "learning_rate": 1.0205767140772535e-06, + "loss": 0.9545, + "step": 9328 + }, + { + "epoch": 0.6730637422892392, + "grad_norm": 3.9059570202628477, + "learning_rate": 1.0201692570666358e-06, + "loss": 0.9082, + "step": 9329 + }, + { + "epoch": 0.6731358897586667, + "grad_norm": 3.781446467568325, + "learning_rate": 1.019761853560947e-06, + "loss": 0.881, + "step": 9330 + }, + { + "epoch": 0.6732080372280942, + "grad_norm": 3.1069969399802098, + "learning_rate": 1.0193545035824346e-06, + "loss": 0.8132, + "step": 9331 + }, + { + "epoch": 0.6732801846975217, + "grad_norm": 2.921633786939313, + "learning_rate": 1.0189472071533428e-06, + "loss": 0.9847, + "step": 9332 + }, + { + "epoch": 0.6733523321669492, + "grad_norm": 0.7554351246874491, + "learning_rate": 1.0185399642959118e-06, + "loss": 0.7698, + "step": 9333 + }, + { + "epoch": 0.6734244796363767, + "grad_norm": 2.4939186517194267, + "learning_rate": 1.0181327750323804e-06, + "loss": 0.9302, + "step": 9334 + }, + { + "epoch": 0.6734966271058043, + "grad_norm": 4.012180855201289, + "learning_rate": 1.017725639384984e-06, + "loss": 0.9383, + "step": 9335 + }, + { + "epoch": 0.6735687745752318, + "grad_norm": 2.9248262076628992, + "learning_rate": 1.0173185573759529e-06, + "loss": 0.8878, + "step": 9336 + }, + { + "epoch": 0.6736409220446593, + "grad_norm": 3.2741779955632926, + "learning_rate": 1.016911529027519e-06, + "loss": 0.8679, + "step": 9337 + }, + { + "epoch": 0.6737130695140868, + "grad_norm": 2.2125725513351817, + "learning_rate": 1.0165045543619066e-06, + "loss": 0.8899, + "step": 9338 + }, + { + "epoch": 0.6737852169835143, + "grad_norm": 2.840609986251921, + "learning_rate": 1.0160976334013396e-06, + "loss": 0.9116, + "step": 9339 + }, + { + "epoch": 0.6738573644529419, + "grad_norm": 2.601699138180495, + "learning_rate": 1.0156907661680385e-06, + "loss": 0.9348, + "step": 9340 + }, + { + "epoch": 0.6739295119223693, + "grad_norm": 2.2309243827463052, + "learning_rate": 1.015283952684221e-06, + "loss": 0.9127, + "step": 9341 + }, + { + "epoch": 0.6740016593917968, + "grad_norm": 2.8529815069509135, + "learning_rate": 1.0148771929721017e-06, + "loss": 0.9134, + "step": 9342 + }, + { + "epoch": 0.6740738068612243, + "grad_norm": 2.9531853886641346, + "learning_rate": 1.0144704870538916e-06, + "loss": 0.8676, + "step": 9343 + }, + { + "epoch": 0.6741459543306518, + "grad_norm": 2.442515861123944, + "learning_rate": 1.0140638349518012e-06, + "loss": 0.8969, + "step": 9344 + }, + { + "epoch": 0.6742181018000794, + "grad_norm": 2.8265545104226497, + "learning_rate": 1.013657236688033e-06, + "loss": 0.9341, + "step": 9345 + }, + { + "epoch": 0.6742902492695069, + "grad_norm": 2.863759998718683, + "learning_rate": 1.0132506922847936e-06, + "loss": 0.8926, + "step": 9346 + }, + { + "epoch": 0.6743623967389344, + "grad_norm": 3.138646351794171, + "learning_rate": 1.0128442017642802e-06, + "loss": 0.8467, + "step": 9347 + }, + { + "epoch": 0.6744345442083619, + "grad_norm": 3.073418907027835, + "learning_rate": 1.0124377651486906e-06, + "loss": 0.9204, + "step": 9348 + }, + { + "epoch": 0.6745066916777894, + "grad_norm": 5.499138851373863, + "learning_rate": 1.012031382460219e-06, + "loss": 1.0135, + "step": 9349 + }, + { + "epoch": 0.674578839147217, + "grad_norm": 3.1377978909177764, + "learning_rate": 1.011625053721056e-06, + "loss": 0.8185, + "step": 9350 + }, + { + "epoch": 0.6746509866166445, + "grad_norm": 4.585105807582158, + "learning_rate": 1.01121877895339e-06, + "loss": 0.9135, + "step": 9351 + }, + { + "epoch": 0.674723134086072, + "grad_norm": 2.180454023660691, + "learning_rate": 1.0108125581794072e-06, + "loss": 0.8703, + "step": 9352 + }, + { + "epoch": 0.6747952815554994, + "grad_norm": 4.461089711172912, + "learning_rate": 1.0104063914212876e-06, + "loss": 0.9274, + "step": 9353 + }, + { + "epoch": 0.6748674290249269, + "grad_norm": 2.9530265024703035, + "learning_rate": 1.0100002787012118e-06, + "loss": 0.9583, + "step": 9354 + }, + { + "epoch": 0.6749395764943544, + "grad_norm": 3.9167155945880423, + "learning_rate": 1.009594220041356e-06, + "loss": 0.9122, + "step": 9355 + }, + { + "epoch": 0.675011723963782, + "grad_norm": 2.3575497065590123, + "learning_rate": 1.0091882154638934e-06, + "loss": 0.9574, + "step": 9356 + }, + { + "epoch": 0.6750838714332095, + "grad_norm": 3.076979215733375, + "learning_rate": 1.0087822649909945e-06, + "loss": 0.8365, + "step": 9357 + }, + { + "epoch": 0.675156018902637, + "grad_norm": 3.0493926772512, + "learning_rate": 1.0083763686448267e-06, + "loss": 0.954, + "step": 9358 + }, + { + "epoch": 0.6752281663720645, + "grad_norm": 2.6538751195519734, + "learning_rate": 1.0079705264475556e-06, + "loss": 0.9721, + "step": 9359 + }, + { + "epoch": 0.675300313841492, + "grad_norm": 6.123657079343244, + "learning_rate": 1.0075647384213397e-06, + "loss": 0.8682, + "step": 9360 + }, + { + "epoch": 0.6753724613109195, + "grad_norm": 2.4326875570035704, + "learning_rate": 1.0071590045883413e-06, + "loss": 0.9772, + "step": 9361 + }, + { + "epoch": 0.6754446087803471, + "grad_norm": 2.4205913464338966, + "learning_rate": 1.0067533249707137e-06, + "loss": 1.0452, + "step": 9362 + }, + { + "epoch": 0.6755167562497746, + "grad_norm": 6.840290466744347, + "learning_rate": 1.0063476995906097e-06, + "loss": 0.9723, + "step": 9363 + }, + { + "epoch": 0.675588903719202, + "grad_norm": 2.711078387290506, + "learning_rate": 1.0059421284701798e-06, + "loss": 0.9108, + "step": 9364 + }, + { + "epoch": 0.6756610511886295, + "grad_norm": 2.5432715638518744, + "learning_rate": 1.0055366116315705e-06, + "loss": 0.9964, + "step": 9365 + }, + { + "epoch": 0.675733198658057, + "grad_norm": 2.2787542408178125, + "learning_rate": 1.005131149096925e-06, + "loss": 0.8695, + "step": 9366 + }, + { + "epoch": 0.6758053461274846, + "grad_norm": 1.9101030775287149, + "learning_rate": 1.004725740888385e-06, + "loss": 0.9195, + "step": 9367 + }, + { + "epoch": 0.6758774935969121, + "grad_norm": 2.4520377890333696, + "learning_rate": 1.0043203870280885e-06, + "loss": 0.9588, + "step": 9368 + }, + { + "epoch": 0.6759496410663396, + "grad_norm": 1.920606365020352, + "learning_rate": 1.0039150875381692e-06, + "loss": 0.9452, + "step": 9369 + }, + { + "epoch": 0.6760217885357671, + "grad_norm": 0.7340264406275252, + "learning_rate": 1.0035098424407592e-06, + "loss": 0.8176, + "step": 9370 + }, + { + "epoch": 0.6760939360051946, + "grad_norm": 0.7937377057662781, + "learning_rate": 1.0031046517579882e-06, + "loss": 0.8574, + "step": 9371 + }, + { + "epoch": 0.6761660834746221, + "grad_norm": 2.0454013557454207, + "learning_rate": 1.0026995155119816e-06, + "loss": 0.9337, + "step": 9372 + }, + { + "epoch": 0.6762382309440497, + "grad_norm": 7.960611413448576, + "learning_rate": 1.0022944337248626e-06, + "loss": 0.962, + "step": 9373 + }, + { + "epoch": 0.6763103784134772, + "grad_norm": 2.371997139541419, + "learning_rate": 1.001889406418752e-06, + "loss": 0.9913, + "step": 9374 + }, + { + "epoch": 0.6763825258829047, + "grad_norm": 3.43596198614051, + "learning_rate": 1.0014844336157646e-06, + "loss": 1.0213, + "step": 9375 + }, + { + "epoch": 0.6764546733523321, + "grad_norm": 2.715412151744024, + "learning_rate": 1.0010795153380171e-06, + "loss": 0.8476, + "step": 9376 + }, + { + "epoch": 0.6765268208217596, + "grad_norm": 2.408167384764949, + "learning_rate": 1.000674651607619e-06, + "loss": 1.001, + "step": 9377 + }, + { + "epoch": 0.6765989682911872, + "grad_norm": 2.5901508408148413, + "learning_rate": 1.0002698424466784e-06, + "loss": 0.9452, + "step": 9378 + }, + { + "epoch": 0.6766711157606147, + "grad_norm": 2.660495551564599, + "learning_rate": 9.99865087877301e-07, + "loss": 0.9371, + "step": 9379 + }, + { + "epoch": 0.6767432632300422, + "grad_norm": 2.3149904810336936, + "learning_rate": 9.994603879215891e-07, + "loss": 0.9317, + "step": 9380 + }, + { + "epoch": 0.6768154106994697, + "grad_norm": 3.657805934999515, + "learning_rate": 9.990557426016412e-07, + "loss": 0.8066, + "step": 9381 + }, + { + "epoch": 0.6768875581688972, + "grad_norm": 1.936758884077179, + "learning_rate": 9.98651151939554e-07, + "loss": 0.8755, + "step": 9382 + }, + { + "epoch": 0.6769597056383247, + "grad_norm": 3.1259491051394876, + "learning_rate": 9.982466159574213e-07, + "loss": 0.9398, + "step": 9383 + }, + { + "epoch": 0.6770318531077523, + "grad_norm": 2.7769332131640354, + "learning_rate": 9.97842134677331e-07, + "loss": 0.9059, + "step": 9384 + }, + { + "epoch": 0.6771040005771798, + "grad_norm": 2.8467341265699417, + "learning_rate": 9.974377081213732e-07, + "loss": 0.9916, + "step": 9385 + }, + { + "epoch": 0.6771761480466073, + "grad_norm": 2.782817613103283, + "learning_rate": 9.970333363116302e-07, + "loss": 0.8721, + "step": 9386 + }, + { + "epoch": 0.6772482955160348, + "grad_norm": 2.533186183805148, + "learning_rate": 9.966290192701839e-07, + "loss": 0.9276, + "step": 9387 + }, + { + "epoch": 0.6773204429854622, + "grad_norm": 2.3511881482812393, + "learning_rate": 9.962247570191122e-07, + "loss": 0.9639, + "step": 9388 + }, + { + "epoch": 0.6773925904548898, + "grad_norm": 2.358507110429712, + "learning_rate": 9.95820549580491e-07, + "loss": 0.9327, + "step": 9389 + }, + { + "epoch": 0.6774647379243173, + "grad_norm": 2.450752714876864, + "learning_rate": 9.95416396976392e-07, + "loss": 0.9279, + "step": 9390 + }, + { + "epoch": 0.6775368853937448, + "grad_norm": 5.920734386257116, + "learning_rate": 9.950122992288857e-07, + "loss": 0.9437, + "step": 9391 + }, + { + "epoch": 0.6776090328631723, + "grad_norm": 0.6768882812859554, + "learning_rate": 9.946082563600365e-07, + "loss": 0.8162, + "step": 9392 + }, + { + "epoch": 0.6776811803325998, + "grad_norm": 2.760485426328302, + "learning_rate": 9.942042683919088e-07, + "loss": 0.8917, + "step": 9393 + }, + { + "epoch": 0.6777533278020273, + "grad_norm": 2.2944950567189326, + "learning_rate": 9.938003353465622e-07, + "loss": 1.0191, + "step": 9394 + }, + { + "epoch": 0.6778254752714549, + "grad_norm": 3.779248677421995, + "learning_rate": 9.933964572460549e-07, + "loss": 0.8532, + "step": 9395 + }, + { + "epoch": 0.6778976227408824, + "grad_norm": 3.473269980128006, + "learning_rate": 9.929926341124415e-07, + "loss": 0.8521, + "step": 9396 + }, + { + "epoch": 0.6779697702103099, + "grad_norm": 2.3569133118333117, + "learning_rate": 9.92588865967771e-07, + "loss": 0.909, + "step": 9397 + }, + { + "epoch": 0.6780419176797374, + "grad_norm": 6.860970626539107, + "learning_rate": 9.921851528340944e-07, + "loss": 0.895, + "step": 9398 + }, + { + "epoch": 0.678114065149165, + "grad_norm": 3.3904909731485002, + "learning_rate": 9.917814947334547e-07, + "loss": 0.9138, + "step": 9399 + }, + { + "epoch": 0.6781862126185924, + "grad_norm": 3.6160292284889817, + "learning_rate": 9.913778916878965e-07, + "loss": 0.8807, + "step": 9400 + }, + { + "epoch": 0.6782583600880199, + "grad_norm": 2.650697447207908, + "learning_rate": 9.90974343719457e-07, + "loss": 0.9721, + "step": 9401 + }, + { + "epoch": 0.6783305075574474, + "grad_norm": 2.1970646609487052, + "learning_rate": 9.905708508501738e-07, + "loss": 0.9094, + "step": 9402 + }, + { + "epoch": 0.6784026550268749, + "grad_norm": 2.355480896268434, + "learning_rate": 9.901674131020792e-07, + "loss": 0.9183, + "step": 9403 + }, + { + "epoch": 0.6784748024963024, + "grad_norm": 2.1762609312603525, + "learning_rate": 9.89764030497204e-07, + "loss": 0.9656, + "step": 9404 + }, + { + "epoch": 0.67854694996573, + "grad_norm": 8.434007614179345, + "learning_rate": 9.893607030575754e-07, + "loss": 0.9649, + "step": 9405 + }, + { + "epoch": 0.6786190974351575, + "grad_norm": 3.4668869651586585, + "learning_rate": 9.889574308052176e-07, + "loss": 0.9606, + "step": 9406 + }, + { + "epoch": 0.678691244904585, + "grad_norm": 2.5534820979267456, + "learning_rate": 9.885542137621526e-07, + "loss": 0.8412, + "step": 9407 + }, + { + "epoch": 0.6787633923740125, + "grad_norm": 2.7189318716669075, + "learning_rate": 9.881510519503967e-07, + "loss": 0.9453, + "step": 9408 + }, + { + "epoch": 0.67883553984344, + "grad_norm": 3.3928742573241912, + "learning_rate": 9.877479453919662e-07, + "loss": 0.957, + "step": 9409 + }, + { + "epoch": 0.6789076873128675, + "grad_norm": 2.885861818057539, + "learning_rate": 9.873448941088732e-07, + "loss": 0.9162, + "step": 9410 + }, + { + "epoch": 0.6789798347822951, + "grad_norm": 4.031503122352395, + "learning_rate": 9.86941898123127e-07, + "loss": 0.9354, + "step": 9411 + }, + { + "epoch": 0.6790519822517225, + "grad_norm": 2.71811773906623, + "learning_rate": 9.865389574567333e-07, + "loss": 0.9018, + "step": 9412 + }, + { + "epoch": 0.67912412972115, + "grad_norm": 3.905704551761986, + "learning_rate": 9.861360721316962e-07, + "loss": 0.9441, + "step": 9413 + }, + { + "epoch": 0.6791962771905775, + "grad_norm": 3.3782396662304284, + "learning_rate": 9.857332421700135e-07, + "loss": 0.9997, + "step": 9414 + }, + { + "epoch": 0.679268424660005, + "grad_norm": 0.7047368330508675, + "learning_rate": 9.853304675936854e-07, + "loss": 0.7255, + "step": 9415 + }, + { + "epoch": 0.6793405721294326, + "grad_norm": 6.643768212911596, + "learning_rate": 9.849277484247033e-07, + "loss": 0.8993, + "step": 9416 + }, + { + "epoch": 0.6794127195988601, + "grad_norm": 1.9206133787558932, + "learning_rate": 9.845250846850595e-07, + "loss": 0.8486, + "step": 9417 + }, + { + "epoch": 0.6794848670682876, + "grad_norm": 3.0903616180667144, + "learning_rate": 9.841224763967416e-07, + "loss": 0.9988, + "step": 9418 + }, + { + "epoch": 0.6795570145377151, + "grad_norm": 10.469370900430516, + "learning_rate": 9.837199235817343e-07, + "loss": 0.9243, + "step": 9419 + }, + { + "epoch": 0.6796291620071426, + "grad_norm": 3.0364695178568044, + "learning_rate": 9.833174262620208e-07, + "loss": 0.8762, + "step": 9420 + }, + { + "epoch": 0.6797013094765701, + "grad_norm": 0.8022953141402867, + "learning_rate": 9.829149844595775e-07, + "loss": 0.8254, + "step": 9421 + }, + { + "epoch": 0.6797734569459977, + "grad_norm": 3.4636595894761397, + "learning_rate": 9.825125981963834e-07, + "loss": 0.9413, + "step": 9422 + }, + { + "epoch": 0.6798456044154251, + "grad_norm": 3.0454599077651126, + "learning_rate": 9.821102674944089e-07, + "loss": 0.7865, + "step": 9423 + }, + { + "epoch": 0.6799177518848526, + "grad_norm": 3.4827373402402193, + "learning_rate": 9.817079923756244e-07, + "loss": 0.9375, + "step": 9424 + }, + { + "epoch": 0.6799898993542801, + "grad_norm": 4.191411028145576, + "learning_rate": 9.81305772861997e-07, + "loss": 0.9783, + "step": 9425 + }, + { + "epoch": 0.6800620468237076, + "grad_norm": 2.8079693223467523, + "learning_rate": 9.809036089754903e-07, + "loss": 0.8828, + "step": 9426 + }, + { + "epoch": 0.6801341942931352, + "grad_norm": 2.852260890771719, + "learning_rate": 9.805015007380648e-07, + "loss": 0.9512, + "step": 9427 + }, + { + "epoch": 0.6802063417625627, + "grad_norm": 2.853732853427326, + "learning_rate": 9.800994481716784e-07, + "loss": 1.0139, + "step": 9428 + }, + { + "epoch": 0.6802784892319902, + "grad_norm": 3.8640582746767236, + "learning_rate": 9.796974512982857e-07, + "loss": 0.9634, + "step": 9429 + }, + { + "epoch": 0.6803506367014177, + "grad_norm": 2.2771413722993255, + "learning_rate": 9.79295510139839e-07, + "loss": 0.927, + "step": 9430 + }, + { + "epoch": 0.6804227841708452, + "grad_norm": 0.7334187145452347, + "learning_rate": 9.788936247182848e-07, + "loss": 0.7946, + "step": 9431 + }, + { + "epoch": 0.6804949316402727, + "grad_norm": 2.069088901754059, + "learning_rate": 9.7849179505557e-07, + "loss": 0.899, + "step": 9432 + }, + { + "epoch": 0.6805670791097003, + "grad_norm": 2.987656791115985, + "learning_rate": 9.780900211736368e-07, + "loss": 0.9521, + "step": 9433 + }, + { + "epoch": 0.6806392265791278, + "grad_norm": 4.46093664486638, + "learning_rate": 9.776883030944243e-07, + "loss": 0.8288, + "step": 9434 + }, + { + "epoch": 0.6807113740485552, + "grad_norm": 26.96425268483372, + "learning_rate": 9.772866408398702e-07, + "loss": 0.965, + "step": 9435 + }, + { + "epoch": 0.6807835215179827, + "grad_norm": 2.9906223277176704, + "learning_rate": 9.768850344319051e-07, + "loss": 0.9022, + "step": 9436 + }, + { + "epoch": 0.6808556689874102, + "grad_norm": 3.154674476721354, + "learning_rate": 9.76483483892462e-07, + "loss": 0.8581, + "step": 9437 + }, + { + "epoch": 0.6809278164568378, + "grad_norm": 2.6350498371369673, + "learning_rate": 9.760819892434657e-07, + "loss": 0.8308, + "step": 9438 + }, + { + "epoch": 0.6809999639262653, + "grad_norm": 2.671104900183979, + "learning_rate": 9.756805505068431e-07, + "loss": 0.8841, + "step": 9439 + }, + { + "epoch": 0.6810721113956928, + "grad_norm": 3.598878336052771, + "learning_rate": 9.752791677045128e-07, + "loss": 0.9589, + "step": 9440 + }, + { + "epoch": 0.6811442588651203, + "grad_norm": 3.283394539612745, + "learning_rate": 9.74877840858394e-07, + "loss": 0.9556, + "step": 9441 + }, + { + "epoch": 0.6812164063345478, + "grad_norm": 3.633449047474152, + "learning_rate": 9.744765699904015e-07, + "loss": 0.9279, + "step": 9442 + }, + { + "epoch": 0.6812885538039753, + "grad_norm": 4.6102122435460595, + "learning_rate": 9.74075355122447e-07, + "loss": 0.9287, + "step": 9443 + }, + { + "epoch": 0.6813607012734029, + "grad_norm": 2.4887364328992705, + "learning_rate": 9.7367419627644e-07, + "loss": 0.8266, + "step": 9444 + }, + { + "epoch": 0.6814328487428304, + "grad_norm": 3.140936688950699, + "learning_rate": 9.732730934742855e-07, + "loss": 0.9374, + "step": 9445 + }, + { + "epoch": 0.6815049962122579, + "grad_norm": 4.067717971822901, + "learning_rate": 9.728720467378865e-07, + "loss": 0.9446, + "step": 9446 + }, + { + "epoch": 0.6815771436816853, + "grad_norm": 3.0788080842040726, + "learning_rate": 9.724710560891424e-07, + "loss": 0.9014, + "step": 9447 + }, + { + "epoch": 0.6816492911511128, + "grad_norm": 4.589706821428253, + "learning_rate": 9.720701215499502e-07, + "loss": 1.0074, + "step": 9448 + }, + { + "epoch": 0.6817214386205404, + "grad_norm": 2.3726439585287276, + "learning_rate": 9.716692431422036e-07, + "loss": 0.9207, + "step": 9449 + }, + { + "epoch": 0.6817935860899679, + "grad_norm": 2.6440797135748917, + "learning_rate": 9.712684208877927e-07, + "loss": 0.9034, + "step": 9450 + }, + { + "epoch": 0.6818657335593954, + "grad_norm": 8.406203259192777, + "learning_rate": 9.708676548086055e-07, + "loss": 0.9318, + "step": 9451 + }, + { + "epoch": 0.6819378810288229, + "grad_norm": 7.750273176731569, + "learning_rate": 9.704669449265264e-07, + "loss": 0.9814, + "step": 9452 + }, + { + "epoch": 0.6820100284982504, + "grad_norm": 2.413069239843838, + "learning_rate": 9.700662912634347e-07, + "loss": 0.9092, + "step": 9453 + }, + { + "epoch": 0.682082175967678, + "grad_norm": 2.269590393451345, + "learning_rate": 9.696656938412117e-07, + "loss": 0.6854, + "step": 9454 + }, + { + "epoch": 0.6821543234371055, + "grad_norm": 3.1573991855163355, + "learning_rate": 9.692651526817302e-07, + "loss": 0.9236, + "step": 9455 + }, + { + "epoch": 0.682226470906533, + "grad_norm": 3.4085409876857704, + "learning_rate": 9.68864667806863e-07, + "loss": 0.8531, + "step": 9456 + }, + { + "epoch": 0.6822986183759605, + "grad_norm": 3.7026845935703045, + "learning_rate": 9.684642392384795e-07, + "loss": 0.9463, + "step": 9457 + }, + { + "epoch": 0.682370765845388, + "grad_norm": 2.7542300203070806, + "learning_rate": 9.680638669984448e-07, + "loss": 0.9356, + "step": 9458 + }, + { + "epoch": 0.6824429133148154, + "grad_norm": 2.8617005610324053, + "learning_rate": 9.676635511086234e-07, + "loss": 0.9442, + "step": 9459 + }, + { + "epoch": 0.682515060784243, + "grad_norm": 2.285745867442885, + "learning_rate": 9.672632915908724e-07, + "loss": 0.8651, + "step": 9460 + }, + { + "epoch": 0.6825872082536705, + "grad_norm": 2.4375543832825475, + "learning_rate": 9.66863088467051e-07, + "loss": 1.0362, + "step": 9461 + }, + { + "epoch": 0.682659355723098, + "grad_norm": 2.3028738353324396, + "learning_rate": 9.664629417590115e-07, + "loss": 0.8234, + "step": 9462 + }, + { + "epoch": 0.6827315031925255, + "grad_norm": 2.496057739953229, + "learning_rate": 9.660628514886044e-07, + "loss": 0.9829, + "step": 9463 + }, + { + "epoch": 0.682803650661953, + "grad_norm": 5.56192819463804, + "learning_rate": 9.656628176776778e-07, + "loss": 0.9271, + "step": 9464 + }, + { + "epoch": 0.6828757981313806, + "grad_norm": 2.5726041925829857, + "learning_rate": 9.652628403480757e-07, + "loss": 0.9865, + "step": 9465 + }, + { + "epoch": 0.6829479456008081, + "grad_norm": 2.5015673492594317, + "learning_rate": 9.64862919521639e-07, + "loss": 0.9373, + "step": 9466 + }, + { + "epoch": 0.6830200930702356, + "grad_norm": 3.4511367804702626, + "learning_rate": 9.644630552202076e-07, + "loss": 0.9137, + "step": 9467 + }, + { + "epoch": 0.6830922405396631, + "grad_norm": 2.9543777311508954, + "learning_rate": 9.640632474656136e-07, + "loss": 0.8946, + "step": 9468 + }, + { + "epoch": 0.6831643880090906, + "grad_norm": 7.43968822441269, + "learning_rate": 9.636634962796923e-07, + "loss": 0.9174, + "step": 9469 + }, + { + "epoch": 0.683236535478518, + "grad_norm": 2.777571914250028, + "learning_rate": 9.632638016842703e-07, + "loss": 0.9281, + "step": 9470 + }, + { + "epoch": 0.6833086829479456, + "grad_norm": 0.64531128816283, + "learning_rate": 9.628641637011743e-07, + "loss": 0.7541, + "step": 9471 + }, + { + "epoch": 0.6833808304173731, + "grad_norm": 0.8496893132629912, + "learning_rate": 9.624645823522267e-07, + "loss": 0.864, + "step": 9472 + }, + { + "epoch": 0.6834529778868006, + "grad_norm": 5.804953649896044, + "learning_rate": 9.620650576592477e-07, + "loss": 0.903, + "step": 9473 + }, + { + "epoch": 0.6835251253562281, + "grad_norm": 2.7859659395675793, + "learning_rate": 9.616655896440545e-07, + "loss": 0.9106, + "step": 9474 + }, + { + "epoch": 0.6835972728256556, + "grad_norm": 2.669279298846459, + "learning_rate": 9.612661783284577e-07, + "loss": 0.8442, + "step": 9475 + }, + { + "epoch": 0.6836694202950832, + "grad_norm": 1.860345154598544, + "learning_rate": 9.608668237342715e-07, + "loss": 0.8506, + "step": 9476 + }, + { + "epoch": 0.6837415677645107, + "grad_norm": 3.500707418612001, + "learning_rate": 9.604675258832997e-07, + "loss": 0.8848, + "step": 9477 + }, + { + "epoch": 0.6838137152339382, + "grad_norm": 2.7568218407860012, + "learning_rate": 9.600682847973497e-07, + "loss": 0.9037, + "step": 9478 + }, + { + "epoch": 0.6838858627033657, + "grad_norm": 1.8210307656569107, + "learning_rate": 9.5966910049822e-07, + "loss": 0.8747, + "step": 9479 + }, + { + "epoch": 0.6839580101727932, + "grad_norm": 2.4600288772438788, + "learning_rate": 9.592699730077096e-07, + "loss": 0.8645, + "step": 9480 + }, + { + "epoch": 0.6840301576422207, + "grad_norm": 2.5854358935710677, + "learning_rate": 9.588709023476134e-07, + "loss": 0.8608, + "step": 9481 + }, + { + "epoch": 0.6841023051116482, + "grad_norm": 2.7067257927636326, + "learning_rate": 9.584718885397231e-07, + "loss": 0.9643, + "step": 9482 + }, + { + "epoch": 0.6841744525810757, + "grad_norm": 2.034988132788995, + "learning_rate": 9.58072931605828e-07, + "loss": 0.8417, + "step": 9483 + }, + { + "epoch": 0.6842466000505032, + "grad_norm": 3.4070436445508037, + "learning_rate": 9.576740315677125e-07, + "loss": 0.9917, + "step": 9484 + }, + { + "epoch": 0.6843187475199307, + "grad_norm": 8.295203088961076, + "learning_rate": 9.572751884471594e-07, + "loss": 1.0138, + "step": 9485 + }, + { + "epoch": 0.6843908949893582, + "grad_norm": 2.484996118216294, + "learning_rate": 9.568764022659483e-07, + "loss": 0.8733, + "step": 9486 + }, + { + "epoch": 0.6844630424587858, + "grad_norm": 2.803618160826743, + "learning_rate": 9.564776730458553e-07, + "loss": 0.8582, + "step": 9487 + }, + { + "epoch": 0.6845351899282133, + "grad_norm": 1.999539620341042, + "learning_rate": 9.560790008086538e-07, + "loss": 0.8253, + "step": 9488 + }, + { + "epoch": 0.6846073373976408, + "grad_norm": 6.8641399265423155, + "learning_rate": 9.55680385576114e-07, + "loss": 0.8877, + "step": 9489 + }, + { + "epoch": 0.6846794848670683, + "grad_norm": 0.7035703096629838, + "learning_rate": 9.552818273700012e-07, + "loss": 0.7934, + "step": 9490 + }, + { + "epoch": 0.6847516323364958, + "grad_norm": 3.65834754703573, + "learning_rate": 9.548833262120816e-07, + "loss": 0.983, + "step": 9491 + }, + { + "epoch": 0.6848237798059233, + "grad_norm": 0.7236861442238923, + "learning_rate": 9.544848821241133e-07, + "loss": 0.7837, + "step": 9492 + }, + { + "epoch": 0.6848959272753509, + "grad_norm": 2.8882254873995645, + "learning_rate": 9.540864951278568e-07, + "loss": 0.8361, + "step": 9493 + }, + { + "epoch": 0.6849680747447783, + "grad_norm": 2.346095831251158, + "learning_rate": 9.53688165245064e-07, + "loss": 0.8975, + "step": 9494 + }, + { + "epoch": 0.6850402222142058, + "grad_norm": 7.5283559723382, + "learning_rate": 9.532898924974875e-07, + "loss": 0.8749, + "step": 9495 + }, + { + "epoch": 0.6851123696836333, + "grad_norm": 5.723978622812464, + "learning_rate": 9.528916769068751e-07, + "loss": 0.8764, + "step": 9496 + }, + { + "epoch": 0.6851845171530608, + "grad_norm": 2.661442248351182, + "learning_rate": 9.524935184949718e-07, + "loss": 0.9421, + "step": 9497 + }, + { + "epoch": 0.6852566646224884, + "grad_norm": 3.501021644802135, + "learning_rate": 9.520954172835207e-07, + "loss": 0.9142, + "step": 9498 + }, + { + "epoch": 0.6853288120919159, + "grad_norm": 1.83394751953533, + "learning_rate": 9.516973732942579e-07, + "loss": 1.004, + "step": 9499 + }, + { + "epoch": 0.6854009595613434, + "grad_norm": 1.9145432471837542, + "learning_rate": 9.512993865489225e-07, + "loss": 0.8626, + "step": 9500 + }, + { + "epoch": 0.6854731070307709, + "grad_norm": 3.1096169842436043, + "learning_rate": 9.509014570692448e-07, + "loss": 0.9194, + "step": 9501 + }, + { + "epoch": 0.6855452545001984, + "grad_norm": 2.350744973457737, + "learning_rate": 9.505035848769547e-07, + "loss": 1.0327, + "step": 9502 + }, + { + "epoch": 0.685617401969626, + "grad_norm": 0.7324050697941838, + "learning_rate": 9.501057699937788e-07, + "loss": 0.79, + "step": 9503 + }, + { + "epoch": 0.6856895494390535, + "grad_norm": 4.831051430316519, + "learning_rate": 9.497080124414403e-07, + "loss": 0.927, + "step": 9504 + }, + { + "epoch": 0.685761696908481, + "grad_norm": 2.1675643406242524, + "learning_rate": 9.49310312241659e-07, + "loss": 0.9002, + "step": 9505 + }, + { + "epoch": 0.6858338443779084, + "grad_norm": 2.065973074206721, + "learning_rate": 9.489126694161529e-07, + "loss": 0.9223, + "step": 9506 + }, + { + "epoch": 0.6859059918473359, + "grad_norm": 4.494009905476131, + "learning_rate": 9.485150839866342e-07, + "loss": 0.9514, + "step": 9507 + }, + { + "epoch": 0.6859781393167634, + "grad_norm": 2.8580974789246856, + "learning_rate": 9.481175559748142e-07, + "loss": 0.9132, + "step": 9508 + }, + { + "epoch": 0.686050286786191, + "grad_norm": 2.968480629997674, + "learning_rate": 9.477200854024007e-07, + "loss": 0.8298, + "step": 9509 + }, + { + "epoch": 0.6861224342556185, + "grad_norm": 4.574486233572844, + "learning_rate": 9.473226722910979e-07, + "loss": 0.9344, + "step": 9510 + }, + { + "epoch": 0.686194581725046, + "grad_norm": 4.73868699313906, + "learning_rate": 9.46925316662607e-07, + "loss": 0.9007, + "step": 9511 + }, + { + "epoch": 0.6862667291944735, + "grad_norm": 2.0101268210186065, + "learning_rate": 9.465280185386262e-07, + "loss": 0.9214, + "step": 9512 + }, + { + "epoch": 0.686338876663901, + "grad_norm": 2.6745879916615904, + "learning_rate": 9.461307779408514e-07, + "loss": 0.9642, + "step": 9513 + }, + { + "epoch": 0.6864110241333286, + "grad_norm": 3.281514184398298, + "learning_rate": 9.457335948909717e-07, + "loss": 0.9755, + "step": 9514 + }, + { + "epoch": 0.6864831716027561, + "grad_norm": 4.651985212294346, + "learning_rate": 9.453364694106794e-07, + "loss": 0.8857, + "step": 9515 + }, + { + "epoch": 0.6865553190721836, + "grad_norm": 2.7559983984190963, + "learning_rate": 9.449394015216567e-07, + "loss": 1.0217, + "step": 9516 + }, + { + "epoch": 0.686627466541611, + "grad_norm": 2.300620521928636, + "learning_rate": 9.44542391245589e-07, + "loss": 0.8913, + "step": 9517 + }, + { + "epoch": 0.6866996140110385, + "grad_norm": 2.1940946085516995, + "learning_rate": 9.441454386041536e-07, + "loss": 0.8427, + "step": 9518 + }, + { + "epoch": 0.686771761480466, + "grad_norm": 3.392579530530077, + "learning_rate": 9.437485436190269e-07, + "loss": 0.9886, + "step": 9519 + }, + { + "epoch": 0.6868439089498936, + "grad_norm": 3.0801929943505635, + "learning_rate": 9.433517063118823e-07, + "loss": 0.8961, + "step": 9520 + }, + { + "epoch": 0.6869160564193211, + "grad_norm": 1.9789575232042882, + "learning_rate": 9.429549267043895e-07, + "loss": 0.933, + "step": 9521 + }, + { + "epoch": 0.6869882038887486, + "grad_norm": 3.66787904867353, + "learning_rate": 9.425582048182157e-07, + "loss": 0.9259, + "step": 9522 + }, + { + "epoch": 0.6870603513581761, + "grad_norm": 2.3177731165600814, + "learning_rate": 9.421615406750232e-07, + "loss": 0.9424, + "step": 9523 + }, + { + "epoch": 0.6871324988276036, + "grad_norm": 3.202827998092275, + "learning_rate": 9.417649342964728e-07, + "loss": 0.8609, + "step": 9524 + }, + { + "epoch": 0.6872046462970312, + "grad_norm": 2.831661984277199, + "learning_rate": 9.41368385704222e-07, + "loss": 0.8322, + "step": 9525 + }, + { + "epoch": 0.6872767937664587, + "grad_norm": 0.6908442912902658, + "learning_rate": 9.409718949199248e-07, + "loss": 0.7275, + "step": 9526 + }, + { + "epoch": 0.6873489412358862, + "grad_norm": 2.2520968944648243, + "learning_rate": 9.405754619652319e-07, + "loss": 0.8851, + "step": 9527 + }, + { + "epoch": 0.6874210887053137, + "grad_norm": 3.9012786969782844, + "learning_rate": 9.40179086861792e-07, + "loss": 1.0134, + "step": 9528 + }, + { + "epoch": 0.6874932361747411, + "grad_norm": 2.6719675326916037, + "learning_rate": 9.397827696312472e-07, + "loss": 0.8736, + "step": 9529 + }, + { + "epoch": 0.6875653836441686, + "grad_norm": 2.9047274959738822, + "learning_rate": 9.393865102952423e-07, + "loss": 0.8939, + "step": 9530 + }, + { + "epoch": 0.6876375311135962, + "grad_norm": 4.831834867737816, + "learning_rate": 9.389903088754127e-07, + "loss": 0.8716, + "step": 9531 + }, + { + "epoch": 0.6877096785830237, + "grad_norm": 2.880228512758244, + "learning_rate": 9.38594165393395e-07, + "loss": 0.9251, + "step": 9532 + }, + { + "epoch": 0.6877818260524512, + "grad_norm": 3.7197955288631515, + "learning_rate": 9.381980798708207e-07, + "loss": 0.9783, + "step": 9533 + }, + { + "epoch": 0.6878539735218787, + "grad_norm": 2.339813486963881, + "learning_rate": 9.378020523293184e-07, + "loss": 0.9754, + "step": 9534 + }, + { + "epoch": 0.6879261209913062, + "grad_norm": 2.055305765055972, + "learning_rate": 9.374060827905139e-07, + "loss": 0.9173, + "step": 9535 + }, + { + "epoch": 0.6879982684607338, + "grad_norm": 3.8908209693915414, + "learning_rate": 9.370101712760299e-07, + "loss": 0.9695, + "step": 9536 + }, + { + "epoch": 0.6880704159301613, + "grad_norm": 2.6866033078367186, + "learning_rate": 9.366143178074859e-07, + "loss": 0.9406, + "step": 9537 + }, + { + "epoch": 0.6881425633995888, + "grad_norm": 3.911321902615394, + "learning_rate": 9.36218522406496e-07, + "loss": 0.8048, + "step": 9538 + }, + { + "epoch": 0.6882147108690163, + "grad_norm": 0.7760919814898941, + "learning_rate": 9.35822785094676e-07, + "loss": 0.7924, + "step": 9539 + }, + { + "epoch": 0.6882868583384438, + "grad_norm": 2.2081727983196355, + "learning_rate": 9.354271058936339e-07, + "loss": 0.9718, + "step": 9540 + }, + { + "epoch": 0.6883590058078712, + "grad_norm": 3.461439384770578, + "learning_rate": 9.350314848249763e-07, + "loss": 0.8436, + "step": 9541 + }, + { + "epoch": 0.6884311532772988, + "grad_norm": 3.226071077844411, + "learning_rate": 9.34635921910307e-07, + "loss": 0.9122, + "step": 9542 + }, + { + "epoch": 0.6885033007467263, + "grad_norm": 6.481784827182898, + "learning_rate": 9.342404171712262e-07, + "loss": 0.8301, + "step": 9543 + }, + { + "epoch": 0.6885754482161538, + "grad_norm": 3.3544083186386024, + "learning_rate": 9.33844970629331e-07, + "loss": 0.8014, + "step": 9544 + }, + { + "epoch": 0.6886475956855813, + "grad_norm": 4.627707436403403, + "learning_rate": 9.334495823062159e-07, + "loss": 0.8637, + "step": 9545 + }, + { + "epoch": 0.6887197431550088, + "grad_norm": 2.1530496924619387, + "learning_rate": 9.330542522234699e-07, + "loss": 0.9768, + "step": 9546 + }, + { + "epoch": 0.6887918906244364, + "grad_norm": 2.110964254101022, + "learning_rate": 9.326589804026816e-07, + "loss": 1.05, + "step": 9547 + }, + { + "epoch": 0.6888640380938639, + "grad_norm": 2.2280165893375394, + "learning_rate": 9.322637668654354e-07, + "loss": 0.8701, + "step": 9548 + }, + { + "epoch": 0.6889361855632914, + "grad_norm": 2.641389815046105, + "learning_rate": 9.318686116333121e-07, + "loss": 0.8709, + "step": 9549 + }, + { + "epoch": 0.6890083330327189, + "grad_norm": 2.84561012550721, + "learning_rate": 9.314735147278898e-07, + "loss": 0.87, + "step": 9550 + }, + { + "epoch": 0.6890804805021464, + "grad_norm": 3.089397258304069, + "learning_rate": 9.310784761707436e-07, + "loss": 1.0013, + "step": 9551 + }, + { + "epoch": 0.689152627971574, + "grad_norm": 2.8110916850290395, + "learning_rate": 9.306834959834451e-07, + "loss": 0.8795, + "step": 9552 + }, + { + "epoch": 0.6892247754410014, + "grad_norm": 2.808732540006742, + "learning_rate": 9.302885741875612e-07, + "loss": 0.8945, + "step": 9553 + }, + { + "epoch": 0.6892969229104289, + "grad_norm": 2.8058323980620092, + "learning_rate": 9.298937108046598e-07, + "loss": 0.9465, + "step": 9554 + }, + { + "epoch": 0.6893690703798564, + "grad_norm": 2.735638397441964, + "learning_rate": 9.294989058563008e-07, + "loss": 0.964, + "step": 9555 + }, + { + "epoch": 0.6894412178492839, + "grad_norm": 2.9528118956556457, + "learning_rate": 9.291041593640434e-07, + "loss": 0.7847, + "step": 9556 + }, + { + "epoch": 0.6895133653187114, + "grad_norm": 2.5877035051749226, + "learning_rate": 9.287094713494437e-07, + "loss": 0.9392, + "step": 9557 + }, + { + "epoch": 0.689585512788139, + "grad_norm": 2.8383662645001677, + "learning_rate": 9.283148418340541e-07, + "loss": 0.8637, + "step": 9558 + }, + { + "epoch": 0.6896576602575665, + "grad_norm": 2.65231058046615, + "learning_rate": 9.279202708394236e-07, + "loss": 0.9153, + "step": 9559 + }, + { + "epoch": 0.689729807726994, + "grad_norm": 3.8536352805089256, + "learning_rate": 9.275257583870984e-07, + "loss": 0.9047, + "step": 9560 + }, + { + "epoch": 0.6898019551964215, + "grad_norm": 2.7838580301303777, + "learning_rate": 9.271313044986222e-07, + "loss": 0.8339, + "step": 9561 + }, + { + "epoch": 0.689874102665849, + "grad_norm": 0.8237280085266313, + "learning_rate": 9.26736909195533e-07, + "loss": 0.8098, + "step": 9562 + }, + { + "epoch": 0.6899462501352766, + "grad_norm": 2.4488284638137325, + "learning_rate": 9.26342572499368e-07, + "loss": 0.9314, + "step": 9563 + }, + { + "epoch": 0.6900183976047041, + "grad_norm": 2.004234361463596, + "learning_rate": 9.259482944316608e-07, + "loss": 0.9311, + "step": 9564 + }, + { + "epoch": 0.6900905450741315, + "grad_norm": 0.8184307859304873, + "learning_rate": 9.255540750139412e-07, + "loss": 0.8548, + "step": 9565 + }, + { + "epoch": 0.690162692543559, + "grad_norm": 0.8591034546904427, + "learning_rate": 9.251599142677362e-07, + "loss": 0.8609, + "step": 9566 + }, + { + "epoch": 0.6902348400129865, + "grad_norm": 3.0226792757365586, + "learning_rate": 9.247658122145698e-07, + "loss": 0.9395, + "step": 9567 + }, + { + "epoch": 0.690306987482414, + "grad_norm": 2.9945102371445933, + "learning_rate": 9.243717688759607e-07, + "loss": 0.9007, + "step": 9568 + }, + { + "epoch": 0.6903791349518416, + "grad_norm": 6.124604192907113, + "learning_rate": 9.239777842734291e-07, + "loss": 0.9336, + "step": 9569 + }, + { + "epoch": 0.6904512824212691, + "grad_norm": 4.0036906858892545, + "learning_rate": 9.235838584284865e-07, + "loss": 0.9243, + "step": 9570 + }, + { + "epoch": 0.6905234298906966, + "grad_norm": 3.569901924816369, + "learning_rate": 9.231899913626447e-07, + "loss": 0.8482, + "step": 9571 + }, + { + "epoch": 0.6905955773601241, + "grad_norm": 2.2368752955404267, + "learning_rate": 9.227961830974111e-07, + "loss": 0.9443, + "step": 9572 + }, + { + "epoch": 0.6906677248295516, + "grad_norm": 5.886184111214436, + "learning_rate": 9.224024336542904e-07, + "loss": 0.9335, + "step": 9573 + }, + { + "epoch": 0.6907398722989792, + "grad_norm": 4.307482246632447, + "learning_rate": 9.220087430547836e-07, + "loss": 1.0658, + "step": 9574 + }, + { + "epoch": 0.6908120197684067, + "grad_norm": 3.104029215045968, + "learning_rate": 9.216151113203887e-07, + "loss": 0.8196, + "step": 9575 + }, + { + "epoch": 0.6908841672378341, + "grad_norm": 4.066025832499917, + "learning_rate": 9.212215384726015e-07, + "loss": 0.9493, + "step": 9576 + }, + { + "epoch": 0.6909563147072616, + "grad_norm": 2.619264330918777, + "learning_rate": 9.208280245329117e-07, + "loss": 0.9384, + "step": 9577 + }, + { + "epoch": 0.6910284621766891, + "grad_norm": 2.0793092264700426, + "learning_rate": 9.204345695228084e-07, + "loss": 1.0073, + "step": 9578 + }, + { + "epoch": 0.6911006096461166, + "grad_norm": 2.6483643108146264, + "learning_rate": 9.20041173463777e-07, + "loss": 0.9225, + "step": 9579 + }, + { + "epoch": 0.6911727571155442, + "grad_norm": 1.9233031510288352, + "learning_rate": 9.196478363772992e-07, + "loss": 0.9611, + "step": 9580 + }, + { + "epoch": 0.6912449045849717, + "grad_norm": 3.322899526394412, + "learning_rate": 9.192545582848538e-07, + "loss": 0.8917, + "step": 9581 + }, + { + "epoch": 0.6913170520543992, + "grad_norm": 2.419685112036761, + "learning_rate": 9.18861339207916e-07, + "loss": 0.9637, + "step": 9582 + }, + { + "epoch": 0.6913891995238267, + "grad_norm": 1.6488183471997084, + "learning_rate": 9.184681791679585e-07, + "loss": 0.8937, + "step": 9583 + }, + { + "epoch": 0.6914613469932542, + "grad_norm": 3.5150963958068013, + "learning_rate": 9.180750781864505e-07, + "loss": 0.9521, + "step": 9584 + }, + { + "epoch": 0.6915334944626818, + "grad_norm": 2.5309243640306778, + "learning_rate": 9.176820362848569e-07, + "loss": 0.9607, + "step": 9585 + }, + { + "epoch": 0.6916056419321093, + "grad_norm": 0.7921896428487414, + "learning_rate": 9.172890534846405e-07, + "loss": 0.8075, + "step": 9586 + }, + { + "epoch": 0.6916777894015368, + "grad_norm": 2.996264197960843, + "learning_rate": 9.168961298072608e-07, + "loss": 0.856, + "step": 9587 + }, + { + "epoch": 0.6917499368709642, + "grad_norm": 3.3046002027015042, + "learning_rate": 9.16503265274174e-07, + "loss": 0.8029, + "step": 9588 + }, + { + "epoch": 0.6918220843403917, + "grad_norm": 3.0773131563438634, + "learning_rate": 9.16110459906833e-07, + "loss": 0.8804, + "step": 9589 + }, + { + "epoch": 0.6918942318098192, + "grad_norm": 3.6917504851482428, + "learning_rate": 9.157177137266872e-07, + "loss": 0.9113, + "step": 9590 + }, + { + "epoch": 0.6919663792792468, + "grad_norm": 3.574456745833139, + "learning_rate": 9.15325026755184e-07, + "loss": 0.8627, + "step": 9591 + }, + { + "epoch": 0.6920385267486743, + "grad_norm": 2.0246228608630417, + "learning_rate": 9.149323990137643e-07, + "loss": 0.9853, + "step": 9592 + }, + { + "epoch": 0.6921106742181018, + "grad_norm": 2.2603898171325953, + "learning_rate": 9.14539830523871e-07, + "loss": 0.9478, + "step": 9593 + }, + { + "epoch": 0.6921828216875293, + "grad_norm": 3.4805231400509573, + "learning_rate": 9.141473213069384e-07, + "loss": 0.9714, + "step": 9594 + }, + { + "epoch": 0.6922549691569568, + "grad_norm": 3.119753285948353, + "learning_rate": 9.137548713844013e-07, + "loss": 0.8055, + "step": 9595 + }, + { + "epoch": 0.6923271166263844, + "grad_norm": 2.8550769625844192, + "learning_rate": 9.133624807776892e-07, + "loss": 0.8542, + "step": 9596 + }, + { + "epoch": 0.6923992640958119, + "grad_norm": 2.6187767555490584, + "learning_rate": 9.129701495082294e-07, + "loss": 0.8841, + "step": 9597 + }, + { + "epoch": 0.6924714115652394, + "grad_norm": 2.636824632567327, + "learning_rate": 9.125778775974458e-07, + "loss": 0.8734, + "step": 9598 + }, + { + "epoch": 0.6925435590346669, + "grad_norm": 2.1226835527794714, + "learning_rate": 9.121856650667594e-07, + "loss": 0.8831, + "step": 9599 + }, + { + "epoch": 0.6926157065040943, + "grad_norm": 3.98102659260373, + "learning_rate": 9.117935119375863e-07, + "loss": 0.9136, + "step": 9600 + }, + { + "epoch": 0.6926878539735218, + "grad_norm": 2.3967213408592123, + "learning_rate": 9.114014182313412e-07, + "loss": 0.8513, + "step": 9601 + }, + { + "epoch": 0.6927600014429494, + "grad_norm": 2.7234318201877987, + "learning_rate": 9.110093839694346e-07, + "loss": 0.8849, + "step": 9602 + }, + { + "epoch": 0.6928321489123769, + "grad_norm": 2.990510714747896, + "learning_rate": 9.106174091732743e-07, + "loss": 0.9714, + "step": 9603 + }, + { + "epoch": 0.6929042963818044, + "grad_norm": 2.379627938880875, + "learning_rate": 9.102254938642646e-07, + "loss": 0.8352, + "step": 9604 + }, + { + "epoch": 0.6929764438512319, + "grad_norm": 2.9450482722990934, + "learning_rate": 9.098336380638064e-07, + "loss": 0.9674, + "step": 9605 + }, + { + "epoch": 0.6930485913206594, + "grad_norm": 2.4856309894799424, + "learning_rate": 9.094418417932983e-07, + "loss": 0.9364, + "step": 9606 + }, + { + "epoch": 0.693120738790087, + "grad_norm": 0.6887722597834819, + "learning_rate": 9.090501050741326e-07, + "loss": 0.7813, + "step": 9607 + }, + { + "epoch": 0.6931928862595145, + "grad_norm": 4.803582189935835, + "learning_rate": 9.086584279277037e-07, + "loss": 0.9248, + "step": 9608 + }, + { + "epoch": 0.693265033728942, + "grad_norm": 4.790628049766132, + "learning_rate": 9.082668103753972e-07, + "loss": 0.911, + "step": 9609 + }, + { + "epoch": 0.6933371811983695, + "grad_norm": 3.9609114967711543, + "learning_rate": 9.078752524385989e-07, + "loss": 0.9198, + "step": 9610 + }, + { + "epoch": 0.693409328667797, + "grad_norm": 3.4896013553590515, + "learning_rate": 9.074837541386899e-07, + "loss": 0.9366, + "step": 9611 + }, + { + "epoch": 0.6934814761372244, + "grad_norm": 2.3295070974909566, + "learning_rate": 9.070923154970489e-07, + "loss": 0.9857, + "step": 9612 + }, + { + "epoch": 0.693553623606652, + "grad_norm": 2.6756255835479266, + "learning_rate": 9.067009365350506e-07, + "loss": 0.9911, + "step": 9613 + }, + { + "epoch": 0.6936257710760795, + "grad_norm": 3.3393898460290927, + "learning_rate": 9.06309617274067e-07, + "loss": 0.9256, + "step": 9614 + }, + { + "epoch": 0.693697918545507, + "grad_norm": 2.2614302113484643, + "learning_rate": 9.059183577354672e-07, + "loss": 0.9036, + "step": 9615 + }, + { + "epoch": 0.6937700660149345, + "grad_norm": 2.6783079817054167, + "learning_rate": 9.055271579406152e-07, + "loss": 0.963, + "step": 9616 + }, + { + "epoch": 0.693842213484362, + "grad_norm": 2.3720266904310305, + "learning_rate": 9.051360179108733e-07, + "loss": 1.0187, + "step": 9617 + }, + { + "epoch": 0.6939143609537896, + "grad_norm": 2.878771754193816, + "learning_rate": 9.047449376676004e-07, + "loss": 0.9688, + "step": 9618 + }, + { + "epoch": 0.6939865084232171, + "grad_norm": 0.7489919245530317, + "learning_rate": 9.043539172321519e-07, + "loss": 0.8186, + "step": 9619 + }, + { + "epoch": 0.6940586558926446, + "grad_norm": 2.4028577797371935, + "learning_rate": 9.039629566258802e-07, + "loss": 0.9174, + "step": 9620 + }, + { + "epoch": 0.6941308033620721, + "grad_norm": 2.3690068772466955, + "learning_rate": 9.035720558701348e-07, + "loss": 0.9786, + "step": 9621 + }, + { + "epoch": 0.6942029508314996, + "grad_norm": 2.7322451770902907, + "learning_rate": 9.03181214986259e-07, + "loss": 0.9008, + "step": 9622 + }, + { + "epoch": 0.694275098300927, + "grad_norm": 2.330250736055973, + "learning_rate": 9.027904339955983e-07, + "loss": 0.9032, + "step": 9623 + }, + { + "epoch": 0.6943472457703546, + "grad_norm": 2.989758653628383, + "learning_rate": 9.023997129194896e-07, + "loss": 0.7876, + "step": 9624 + }, + { + "epoch": 0.6944193932397821, + "grad_norm": 2.496218682686273, + "learning_rate": 9.020090517792694e-07, + "loss": 1.0122, + "step": 9625 + }, + { + "epoch": 0.6944915407092096, + "grad_norm": 4.057457009285761, + "learning_rate": 9.016184505962701e-07, + "loss": 0.9995, + "step": 9626 + }, + { + "epoch": 0.6945636881786371, + "grad_norm": 9.517760289337664, + "learning_rate": 9.012279093918211e-07, + "loss": 0.9639, + "step": 9627 + }, + { + "epoch": 0.6946358356480646, + "grad_norm": 2.81406452320795, + "learning_rate": 9.008374281872486e-07, + "loss": 0.9103, + "step": 9628 + }, + { + "epoch": 0.6947079831174922, + "grad_norm": 3.992170300232029, + "learning_rate": 9.004470070038752e-07, + "loss": 0.9494, + "step": 9629 + }, + { + "epoch": 0.6947801305869197, + "grad_norm": 4.262103181417918, + "learning_rate": 9.000566458630209e-07, + "loss": 0.8348, + "step": 9630 + }, + { + "epoch": 0.6948522780563472, + "grad_norm": 2.3339820255283676, + "learning_rate": 8.99666344786e-07, + "loss": 0.8955, + "step": 9631 + }, + { + "epoch": 0.6949244255257747, + "grad_norm": 2.553532143747004, + "learning_rate": 8.992761037941284e-07, + "loss": 0.9421, + "step": 9632 + }, + { + "epoch": 0.6949965729952022, + "grad_norm": 2.6975223935588044, + "learning_rate": 8.988859229087129e-07, + "loss": 0.8806, + "step": 9633 + }, + { + "epoch": 0.6950687204646298, + "grad_norm": 3.0221928836607708, + "learning_rate": 8.984958021510611e-07, + "loss": 0.9359, + "step": 9634 + }, + { + "epoch": 0.6951408679340572, + "grad_norm": 2.330088868799885, + "learning_rate": 8.98105741542476e-07, + "loss": 0.8586, + "step": 9635 + }, + { + "epoch": 0.6952130154034847, + "grad_norm": 2.311163438709969, + "learning_rate": 8.977157411042573e-07, + "loss": 0.8829, + "step": 9636 + }, + { + "epoch": 0.6952851628729122, + "grad_norm": 2.2961835793662857, + "learning_rate": 8.973258008577014e-07, + "loss": 0.9619, + "step": 9637 + }, + { + "epoch": 0.6953573103423397, + "grad_norm": 3.1113272054494874, + "learning_rate": 8.969359208241023e-07, + "loss": 0.9938, + "step": 9638 + }, + { + "epoch": 0.6954294578117672, + "grad_norm": 2.3046522234786972, + "learning_rate": 8.965461010247484e-07, + "loss": 0.9283, + "step": 9639 + }, + { + "epoch": 0.6955016052811948, + "grad_norm": 2.3448235659729266, + "learning_rate": 8.96156341480927e-07, + "loss": 0.9169, + "step": 9640 + }, + { + "epoch": 0.6955737527506223, + "grad_norm": 2.400534343840098, + "learning_rate": 8.957666422139216e-07, + "loss": 0.9007, + "step": 9641 + }, + { + "epoch": 0.6956459002200498, + "grad_norm": 3.761189995005282, + "learning_rate": 8.953770032450121e-07, + "loss": 0.8994, + "step": 9642 + }, + { + "epoch": 0.6957180476894773, + "grad_norm": 2.275075420764335, + "learning_rate": 8.949874245954754e-07, + "loss": 0.9834, + "step": 9643 + }, + { + "epoch": 0.6957901951589048, + "grad_norm": 5.078122182992007, + "learning_rate": 8.945979062865847e-07, + "loss": 0.8795, + "step": 9644 + }, + { + "epoch": 0.6958623426283324, + "grad_norm": 4.011494096299227, + "learning_rate": 8.942084483396111e-07, + "loss": 0.9415, + "step": 9645 + }, + { + "epoch": 0.6959344900977599, + "grad_norm": 2.888295652742102, + "learning_rate": 8.93819050775819e-07, + "loss": 0.9759, + "step": 9646 + }, + { + "epoch": 0.6960066375671873, + "grad_norm": 2.9582089859166323, + "learning_rate": 8.934297136164752e-07, + "loss": 0.897, + "step": 9647 + }, + { + "epoch": 0.6960787850366148, + "grad_norm": 2.8169183569720637, + "learning_rate": 8.930404368828375e-07, + "loss": 0.8614, + "step": 9648 + }, + { + "epoch": 0.6961509325060423, + "grad_norm": 0.8443955671514822, + "learning_rate": 8.926512205961638e-07, + "loss": 0.8144, + "step": 9649 + }, + { + "epoch": 0.6962230799754698, + "grad_norm": 2.2334270467122583, + "learning_rate": 8.922620647777077e-07, + "loss": 0.9542, + "step": 9650 + }, + { + "epoch": 0.6962952274448974, + "grad_norm": 3.421182222653179, + "learning_rate": 8.918729694487197e-07, + "loss": 0.9124, + "step": 9651 + }, + { + "epoch": 0.6963673749143249, + "grad_norm": 3.658333600391097, + "learning_rate": 8.914839346304463e-07, + "loss": 0.9087, + "step": 9652 + }, + { + "epoch": 0.6964395223837524, + "grad_norm": 2.929636067256864, + "learning_rate": 8.91094960344132e-07, + "loss": 0.9229, + "step": 9653 + }, + { + "epoch": 0.6965116698531799, + "grad_norm": 0.6747728000730652, + "learning_rate": 8.907060466110174e-07, + "loss": 0.7462, + "step": 9654 + }, + { + "epoch": 0.6965838173226074, + "grad_norm": 9.059711316627313, + "learning_rate": 8.903171934523385e-07, + "loss": 0.834, + "step": 9655 + }, + { + "epoch": 0.696655964792035, + "grad_norm": 2.112190007144655, + "learning_rate": 8.899284008893298e-07, + "loss": 0.8717, + "step": 9656 + }, + { + "epoch": 0.6967281122614625, + "grad_norm": 3.0129206889969797, + "learning_rate": 8.895396689432217e-07, + "loss": 0.9133, + "step": 9657 + }, + { + "epoch": 0.69680025973089, + "grad_norm": 2.2061293658163024, + "learning_rate": 8.891509976352416e-07, + "loss": 0.8029, + "step": 9658 + }, + { + "epoch": 0.6968724072003174, + "grad_norm": 2.131730360561382, + "learning_rate": 8.887623869866132e-07, + "loss": 0.9492, + "step": 9659 + }, + { + "epoch": 0.6969445546697449, + "grad_norm": 3.463560053649894, + "learning_rate": 8.883738370185579e-07, + "loss": 0.9446, + "step": 9660 + }, + { + "epoch": 0.6970167021391724, + "grad_norm": 3.3852514764635915, + "learning_rate": 8.879853477522907e-07, + "loss": 0.8863, + "step": 9661 + }, + { + "epoch": 0.6970888496086, + "grad_norm": 3.1619463590527364, + "learning_rate": 8.875969192090289e-07, + "loss": 0.9488, + "step": 9662 + }, + { + "epoch": 0.6971609970780275, + "grad_norm": 3.618210584004225, + "learning_rate": 8.872085514099803e-07, + "loss": 0.925, + "step": 9663 + }, + { + "epoch": 0.697233144547455, + "grad_norm": 2.7840836052453763, + "learning_rate": 8.868202443763534e-07, + "loss": 0.9969, + "step": 9664 + }, + { + "epoch": 0.6973052920168825, + "grad_norm": 2.738824591828748, + "learning_rate": 8.864319981293522e-07, + "loss": 0.9916, + "step": 9665 + }, + { + "epoch": 0.69737743948631, + "grad_norm": 2.881036474958959, + "learning_rate": 8.860438126901772e-07, + "loss": 0.9641, + "step": 9666 + }, + { + "epoch": 0.6974495869557376, + "grad_norm": 0.8284691509357948, + "learning_rate": 8.856556880800268e-07, + "loss": 0.8262, + "step": 9667 + }, + { + "epoch": 0.6975217344251651, + "grad_norm": 2.2704524732839406, + "learning_rate": 8.852676243200925e-07, + "loss": 0.9344, + "step": 9668 + }, + { + "epoch": 0.6975938818945926, + "grad_norm": 5.735252708182002, + "learning_rate": 8.848796214315681e-07, + "loss": 1.0832, + "step": 9669 + }, + { + "epoch": 0.69766602936402, + "grad_norm": 2.9597737383067333, + "learning_rate": 8.844916794356381e-07, + "loss": 0.9224, + "step": 9670 + }, + { + "epoch": 0.6977381768334475, + "grad_norm": 2.5217440092534305, + "learning_rate": 8.841037983534893e-07, + "loss": 0.7744, + "step": 9671 + }, + { + "epoch": 0.697810324302875, + "grad_norm": 2.8035779368053424, + "learning_rate": 8.837159782063006e-07, + "loss": 0.9325, + "step": 9672 + }, + { + "epoch": 0.6978824717723026, + "grad_norm": 0.7395924222862281, + "learning_rate": 8.8332821901525e-07, + "loss": 0.8097, + "step": 9673 + }, + { + "epoch": 0.6979546192417301, + "grad_norm": 2.5371463059343, + "learning_rate": 8.829405208015117e-07, + "loss": 0.9111, + "step": 9674 + }, + { + "epoch": 0.6980267667111576, + "grad_norm": 12.663720909373668, + "learning_rate": 8.825528835862562e-07, + "loss": 0.9702, + "step": 9675 + }, + { + "epoch": 0.6980989141805851, + "grad_norm": 3.001345650555985, + "learning_rate": 8.82165307390651e-07, + "loss": 0.8315, + "step": 9676 + }, + { + "epoch": 0.6981710616500126, + "grad_norm": 1.8662102348450078, + "learning_rate": 8.817777922358612e-07, + "loss": 0.9225, + "step": 9677 + }, + { + "epoch": 0.6982432091194402, + "grad_norm": 2.6567013525363747, + "learning_rate": 8.813903381430456e-07, + "loss": 0.9268, + "step": 9678 + }, + { + "epoch": 0.6983153565888677, + "grad_norm": 3.6649230076715997, + "learning_rate": 8.810029451333629e-07, + "loss": 0.8287, + "step": 9679 + }, + { + "epoch": 0.6983875040582952, + "grad_norm": 3.919991941638371, + "learning_rate": 8.806156132279669e-07, + "loss": 0.9304, + "step": 9680 + }, + { + "epoch": 0.6984596515277227, + "grad_norm": 26.869329661620583, + "learning_rate": 8.802283424480086e-07, + "loss": 0.8689, + "step": 9681 + }, + { + "epoch": 0.6985317989971501, + "grad_norm": 2.740626311872771, + "learning_rate": 8.79841132814636e-07, + "loss": 0.9317, + "step": 9682 + }, + { + "epoch": 0.6986039464665776, + "grad_norm": 2.7071614199741374, + "learning_rate": 8.79453984348991e-07, + "loss": 1.0397, + "step": 9683 + }, + { + "epoch": 0.6986760939360052, + "grad_norm": 0.7590639065839001, + "learning_rate": 8.79066897072217e-07, + "loss": 0.7543, + "step": 9684 + }, + { + "epoch": 0.6987482414054327, + "grad_norm": 2.596668944923012, + "learning_rate": 8.786798710054491e-07, + "loss": 0.9613, + "step": 9685 + }, + { + "epoch": 0.6988203888748602, + "grad_norm": 3.185199430920125, + "learning_rate": 8.782929061698237e-07, + "loss": 0.9648, + "step": 9686 + }, + { + "epoch": 0.6988925363442877, + "grad_norm": 2.210686443619256, + "learning_rate": 8.779060025864695e-07, + "loss": 0.9472, + "step": 9687 + }, + { + "epoch": 0.6989646838137152, + "grad_norm": 2.295847448823694, + "learning_rate": 8.775191602765147e-07, + "loss": 0.9823, + "step": 9688 + }, + { + "epoch": 0.6990368312831428, + "grad_norm": 5.502203066629383, + "learning_rate": 8.771323792610832e-07, + "loss": 0.9299, + "step": 9689 + }, + { + "epoch": 0.6991089787525703, + "grad_norm": 3.005734684609188, + "learning_rate": 8.767456595612958e-07, + "loss": 0.9361, + "step": 9690 + }, + { + "epoch": 0.6991811262219978, + "grad_norm": 2.4766068800550847, + "learning_rate": 8.763590011982706e-07, + "loss": 0.9927, + "step": 9691 + }, + { + "epoch": 0.6992532736914253, + "grad_norm": 8.629026495880144, + "learning_rate": 8.759724041931194e-07, + "loss": 0.9513, + "step": 9692 + }, + { + "epoch": 0.6993254211608528, + "grad_norm": 2.3232502947197236, + "learning_rate": 8.755858685669555e-07, + "loss": 0.8869, + "step": 9693 + }, + { + "epoch": 0.6993975686302802, + "grad_norm": 3.6566274073210794, + "learning_rate": 8.751993943408842e-07, + "loss": 0.8977, + "step": 9694 + }, + { + "epoch": 0.6994697160997078, + "grad_norm": 2.6342177447063633, + "learning_rate": 8.748129815360099e-07, + "loss": 1.0178, + "step": 9695 + }, + { + "epoch": 0.6995418635691353, + "grad_norm": 4.019194089281997, + "learning_rate": 8.744266301734336e-07, + "loss": 0.9474, + "step": 9696 + }, + { + "epoch": 0.6996140110385628, + "grad_norm": 2.2491245685984675, + "learning_rate": 8.740403402742522e-07, + "loss": 0.8706, + "step": 9697 + }, + { + "epoch": 0.6996861585079903, + "grad_norm": 3.642253855162594, + "learning_rate": 8.736541118595597e-07, + "loss": 0.7277, + "step": 9698 + }, + { + "epoch": 0.6997583059774178, + "grad_norm": 3.523125949796556, + "learning_rate": 8.732679449504475e-07, + "loss": 0.8876, + "step": 9699 + }, + { + "epoch": 0.6998304534468454, + "grad_norm": 3.4578649517757687, + "learning_rate": 8.72881839568e-07, + "loss": 0.8646, + "step": 9700 + }, + { + "epoch": 0.6999026009162729, + "grad_norm": 2.2520814380996543, + "learning_rate": 8.724957957333044e-07, + "loss": 0.8707, + "step": 9701 + }, + { + "epoch": 0.6999747483857004, + "grad_norm": 5.178869766887166, + "learning_rate": 8.721098134674389e-07, + "loss": 0.9426, + "step": 9702 + }, + { + "epoch": 0.7000468958551279, + "grad_norm": 2.651075641486822, + "learning_rate": 8.717238927914809e-07, + "loss": 0.9716, + "step": 9703 + }, + { + "epoch": 0.7001190433245554, + "grad_norm": 2.776437775274408, + "learning_rate": 8.713380337265046e-07, + "loss": 0.9478, + "step": 9704 + }, + { + "epoch": 0.700191190793983, + "grad_norm": 4.040468779078972, + "learning_rate": 8.709522362935802e-07, + "loss": 0.8978, + "step": 9705 + }, + { + "epoch": 0.7002633382634104, + "grad_norm": 2.5594139124109496, + "learning_rate": 8.70566500513775e-07, + "loss": 0.907, + "step": 9706 + }, + { + "epoch": 0.7003354857328379, + "grad_norm": 2.691213836214329, + "learning_rate": 8.701808264081509e-07, + "loss": 0.7519, + "step": 9707 + }, + { + "epoch": 0.7004076332022654, + "grad_norm": 2.3495957777687404, + "learning_rate": 8.697952139977708e-07, + "loss": 0.9138, + "step": 9708 + }, + { + "epoch": 0.7004797806716929, + "grad_norm": 3.1012792962446047, + "learning_rate": 8.694096633036889e-07, + "loss": 0.9708, + "step": 9709 + }, + { + "epoch": 0.7005519281411204, + "grad_norm": 3.0145935508345727, + "learning_rate": 8.690241743469615e-07, + "loss": 0.8824, + "step": 9710 + }, + { + "epoch": 0.700624075610548, + "grad_norm": 3.1916648731940453, + "learning_rate": 8.686387471486363e-07, + "loss": 0.9966, + "step": 9711 + }, + { + "epoch": 0.7006962230799755, + "grad_norm": 2.974521207348145, + "learning_rate": 8.68253381729761e-07, + "loss": 0.9199, + "step": 9712 + }, + { + "epoch": 0.700768370549403, + "grad_norm": 2.7616584048227213, + "learning_rate": 8.678680781113789e-07, + "loss": 0.986, + "step": 9713 + }, + { + "epoch": 0.7008405180188305, + "grad_norm": 3.643100797160837, + "learning_rate": 8.674828363145301e-07, + "loss": 0.8827, + "step": 9714 + }, + { + "epoch": 0.700912665488258, + "grad_norm": 3.949711105791646, + "learning_rate": 8.670976563602519e-07, + "loss": 0.9132, + "step": 9715 + }, + { + "epoch": 0.7009848129576856, + "grad_norm": 4.408618912463633, + "learning_rate": 8.667125382695761e-07, + "loss": 0.9284, + "step": 9716 + }, + { + "epoch": 0.7010569604271131, + "grad_norm": 2.797175897363885, + "learning_rate": 8.663274820635333e-07, + "loss": 0.9547, + "step": 9717 + }, + { + "epoch": 0.7011291078965405, + "grad_norm": 2.943533848563519, + "learning_rate": 8.6594248776315e-07, + "loss": 0.9574, + "step": 9718 + }, + { + "epoch": 0.701201255365968, + "grad_norm": 2.6106023585913305, + "learning_rate": 8.655575553894494e-07, + "loss": 0.9893, + "step": 9719 + }, + { + "epoch": 0.7012734028353955, + "grad_norm": 2.108310911438821, + "learning_rate": 8.651726849634509e-07, + "loss": 0.9466, + "step": 9720 + }, + { + "epoch": 0.701345550304823, + "grad_norm": 7.588475785834548, + "learning_rate": 8.64787876506172e-07, + "loss": 0.9221, + "step": 9721 + }, + { + "epoch": 0.7014176977742506, + "grad_norm": 2.8828593097804465, + "learning_rate": 8.644031300386232e-07, + "loss": 0.9484, + "step": 9722 + }, + { + "epoch": 0.7014898452436781, + "grad_norm": 3.576205742517813, + "learning_rate": 8.640184455818171e-07, + "loss": 0.8831, + "step": 9723 + }, + { + "epoch": 0.7015619927131056, + "grad_norm": 3.534023579767616, + "learning_rate": 8.636338231567571e-07, + "loss": 0.992, + "step": 9724 + }, + { + "epoch": 0.7016341401825331, + "grad_norm": 3.4332590471644466, + "learning_rate": 8.632492627844486e-07, + "loss": 0.9016, + "step": 9725 + }, + { + "epoch": 0.7017062876519606, + "grad_norm": 0.8130603471953711, + "learning_rate": 8.62864764485889e-07, + "loss": 0.8449, + "step": 9726 + }, + { + "epoch": 0.7017784351213882, + "grad_norm": 4.129021735908576, + "learning_rate": 8.624803282820754e-07, + "loss": 0.9267, + "step": 9727 + }, + { + "epoch": 0.7018505825908157, + "grad_norm": 2.8758626555859625, + "learning_rate": 8.620959541939998e-07, + "loss": 0.9641, + "step": 9728 + }, + { + "epoch": 0.7019227300602431, + "grad_norm": 0.7403584882011438, + "learning_rate": 8.617116422426518e-07, + "loss": 0.8028, + "step": 9729 + }, + { + "epoch": 0.7019948775296706, + "grad_norm": 3.735629840533047, + "learning_rate": 8.613273924490181e-07, + "loss": 0.8864, + "step": 9730 + }, + { + "epoch": 0.7020670249990981, + "grad_norm": 3.243270215253109, + "learning_rate": 8.609432048340788e-07, + "loss": 0.9702, + "step": 9731 + }, + { + "epoch": 0.7021391724685256, + "grad_norm": 2.3076102651782016, + "learning_rate": 8.605590794188158e-07, + "loss": 0.9274, + "step": 9732 + }, + { + "epoch": 0.7022113199379532, + "grad_norm": 2.8255040790863917, + "learning_rate": 8.601750162242028e-07, + "loss": 0.8347, + "step": 9733 + }, + { + "epoch": 0.7022834674073807, + "grad_norm": 2.358136692093962, + "learning_rate": 8.597910152712125e-07, + "loss": 0.9391, + "step": 9734 + }, + { + "epoch": 0.7023556148768082, + "grad_norm": 7.966671900833943, + "learning_rate": 8.594070765808141e-07, + "loss": 0.8903, + "step": 9735 + }, + { + "epoch": 0.7024277623462357, + "grad_norm": 2.4834196545065446, + "learning_rate": 8.590232001739728e-07, + "loss": 0.9782, + "step": 9736 + }, + { + "epoch": 0.7024999098156632, + "grad_norm": 2.889094591968275, + "learning_rate": 8.586393860716508e-07, + "loss": 0.8386, + "step": 9737 + }, + { + "epoch": 0.7025720572850908, + "grad_norm": 2.4771611305470453, + "learning_rate": 8.582556342948075e-07, + "loss": 0.8416, + "step": 9738 + }, + { + "epoch": 0.7026442047545183, + "grad_norm": 8.133827930274675, + "learning_rate": 8.578719448643968e-07, + "loss": 0.9501, + "step": 9739 + }, + { + "epoch": 0.7027163522239458, + "grad_norm": 3.013585324877154, + "learning_rate": 8.574883178013712e-07, + "loss": 0.8256, + "step": 9740 + }, + { + "epoch": 0.7027884996933732, + "grad_norm": 2.6342184687726915, + "learning_rate": 8.571047531266789e-07, + "loss": 0.9457, + "step": 9741 + }, + { + "epoch": 0.7028606471628007, + "grad_norm": 5.641205733987967, + "learning_rate": 8.567212508612653e-07, + "loss": 0.9285, + "step": 9742 + }, + { + "epoch": 0.7029327946322282, + "grad_norm": 2.7136833949549457, + "learning_rate": 8.56337811026072e-07, + "loss": 0.9106, + "step": 9743 + }, + { + "epoch": 0.7030049421016558, + "grad_norm": 0.8624502471727089, + "learning_rate": 8.55954433642037e-07, + "loss": 0.808, + "step": 9744 + }, + { + "epoch": 0.7030770895710833, + "grad_norm": 3.187477410928307, + "learning_rate": 8.55571118730096e-07, + "loss": 0.847, + "step": 9745 + }, + { + "epoch": 0.7031492370405108, + "grad_norm": 3.627502629852919, + "learning_rate": 8.551878663111784e-07, + "loss": 1.1195, + "step": 9746 + }, + { + "epoch": 0.7032213845099383, + "grad_norm": 1.8799764079784789, + "learning_rate": 8.548046764062146e-07, + "loss": 0.9352, + "step": 9747 + }, + { + "epoch": 0.7032935319793658, + "grad_norm": 5.268243789338281, + "learning_rate": 8.544215490361277e-07, + "loss": 1.0001, + "step": 9748 + }, + { + "epoch": 0.7033656794487934, + "grad_norm": 2.77047588557234, + "learning_rate": 8.540384842218391e-07, + "loss": 0.9019, + "step": 9749 + }, + { + "epoch": 0.7034378269182209, + "grad_norm": 2.1960482738865816, + "learning_rate": 8.536554819842665e-07, + "loss": 1.0066, + "step": 9750 + }, + { + "epoch": 0.7035099743876484, + "grad_norm": 2.702959265065742, + "learning_rate": 8.532725423443246e-07, + "loss": 0.9376, + "step": 9751 + }, + { + "epoch": 0.7035821218570759, + "grad_norm": 4.445971931125242, + "learning_rate": 8.52889665322924e-07, + "loss": 0.9112, + "step": 9752 + }, + { + "epoch": 0.7036542693265033, + "grad_norm": 2.351440426518982, + "learning_rate": 8.525068509409726e-07, + "loss": 0.9797, + "step": 9753 + }, + { + "epoch": 0.7037264167959308, + "grad_norm": 3.0816243220945045, + "learning_rate": 8.521240992193746e-07, + "loss": 0.8687, + "step": 9754 + }, + { + "epoch": 0.7037985642653584, + "grad_norm": 2.486547707617592, + "learning_rate": 8.517414101790296e-07, + "loss": 0.9819, + "step": 9755 + }, + { + "epoch": 0.7038707117347859, + "grad_norm": 2.5717058202598237, + "learning_rate": 8.513587838408356e-07, + "loss": 0.9334, + "step": 9756 + }, + { + "epoch": 0.7039428592042134, + "grad_norm": 5.093987020583105, + "learning_rate": 8.50976220225686e-07, + "loss": 0.8467, + "step": 9757 + }, + { + "epoch": 0.7040150066736409, + "grad_norm": 2.356074772397914, + "learning_rate": 8.505937193544719e-07, + "loss": 0.8047, + "step": 9758 + }, + { + "epoch": 0.7040871541430684, + "grad_norm": 3.622966327440213, + "learning_rate": 8.502112812480795e-07, + "loss": 0.9654, + "step": 9759 + }, + { + "epoch": 0.704159301612496, + "grad_norm": 3.491262018857073, + "learning_rate": 8.498289059273935e-07, + "loss": 0.9723, + "step": 9760 + }, + { + "epoch": 0.7042314490819235, + "grad_norm": 1.8537538976236063, + "learning_rate": 8.494465934132917e-07, + "loss": 0.9548, + "step": 9761 + }, + { + "epoch": 0.704303596551351, + "grad_norm": 3.387223185247356, + "learning_rate": 8.490643437266536e-07, + "loss": 0.7884, + "step": 9762 + }, + { + "epoch": 0.7043757440207785, + "grad_norm": 3.212065190082442, + "learning_rate": 8.486821568883505e-07, + "loss": 0.9156, + "step": 9763 + }, + { + "epoch": 0.704447891490206, + "grad_norm": 2.7268056679094705, + "learning_rate": 8.483000329192527e-07, + "loss": 0.9054, + "step": 9764 + }, + { + "epoch": 0.7045200389596334, + "grad_norm": 3.1428451661711545, + "learning_rate": 8.479179718402268e-07, + "loss": 1.0103, + "step": 9765 + }, + { + "epoch": 0.704592186429061, + "grad_norm": 2.645458244909854, + "learning_rate": 8.475359736721352e-07, + "loss": 0.9485, + "step": 9766 + }, + { + "epoch": 0.7046643338984885, + "grad_norm": 2.60911439976284, + "learning_rate": 8.471540384358382e-07, + "loss": 0.904, + "step": 9767 + }, + { + "epoch": 0.704736481367916, + "grad_norm": 10.5612943965716, + "learning_rate": 8.467721661521912e-07, + "loss": 0.8759, + "step": 9768 + }, + { + "epoch": 0.7048086288373435, + "grad_norm": 2.8402507338185887, + "learning_rate": 8.463903568420481e-07, + "loss": 0.9235, + "step": 9769 + }, + { + "epoch": 0.704880776306771, + "grad_norm": 2.274157537896535, + "learning_rate": 8.460086105262561e-07, + "loss": 0.9297, + "step": 9770 + }, + { + "epoch": 0.7049529237761986, + "grad_norm": 2.34583017225939, + "learning_rate": 8.456269272256622e-07, + "loss": 0.9039, + "step": 9771 + }, + { + "epoch": 0.7050250712456261, + "grad_norm": 6.019495127852358, + "learning_rate": 8.452453069611083e-07, + "loss": 0.836, + "step": 9772 + }, + { + "epoch": 0.7050972187150536, + "grad_norm": 2.2040633780130046, + "learning_rate": 8.448637497534334e-07, + "loss": 0.7997, + "step": 9773 + }, + { + "epoch": 0.7051693661844811, + "grad_norm": 1.994146483409918, + "learning_rate": 8.444822556234729e-07, + "loss": 0.8753, + "step": 9774 + }, + { + "epoch": 0.7052415136539086, + "grad_norm": 2.18724723445479, + "learning_rate": 8.441008245920591e-07, + "loss": 0.8307, + "step": 9775 + }, + { + "epoch": 0.705313661123336, + "grad_norm": 3.3558791312638694, + "learning_rate": 8.437194566800201e-07, + "loss": 0.9565, + "step": 9776 + }, + { + "epoch": 0.7053858085927636, + "grad_norm": 2.8404474889101294, + "learning_rate": 8.433381519081819e-07, + "loss": 0.8527, + "step": 9777 + }, + { + "epoch": 0.7054579560621911, + "grad_norm": 1.7497276366682764, + "learning_rate": 8.429569102973648e-07, + "loss": 0.9381, + "step": 9778 + }, + { + "epoch": 0.7055301035316186, + "grad_norm": 2.7183353502494896, + "learning_rate": 8.425757318683874e-07, + "loss": 0.9062, + "step": 9779 + }, + { + "epoch": 0.7056022510010461, + "grad_norm": 2.1964645892301435, + "learning_rate": 8.421946166420649e-07, + "loss": 0.6948, + "step": 9780 + }, + { + "epoch": 0.7056743984704736, + "grad_norm": 2.1044001937549983, + "learning_rate": 8.418135646392081e-07, + "loss": 0.835, + "step": 9781 + }, + { + "epoch": 0.7057465459399012, + "grad_norm": 2.746558203021638, + "learning_rate": 8.414325758806253e-07, + "loss": 0.929, + "step": 9782 + }, + { + "epoch": 0.7058186934093287, + "grad_norm": 6.119749660483051, + "learning_rate": 8.410516503871206e-07, + "loss": 0.8563, + "step": 9783 + }, + { + "epoch": 0.7058908408787562, + "grad_norm": 2.328509100649044, + "learning_rate": 8.406707881794955e-07, + "loss": 0.8693, + "step": 9784 + }, + { + "epoch": 0.7059629883481837, + "grad_norm": 5.1734109770136145, + "learning_rate": 8.402899892785458e-07, + "loss": 0.8973, + "step": 9785 + }, + { + "epoch": 0.7060351358176112, + "grad_norm": 4.011521435781732, + "learning_rate": 8.399092537050678e-07, + "loss": 0.9303, + "step": 9786 + }, + { + "epoch": 0.7061072832870388, + "grad_norm": 2.4051544866293355, + "learning_rate": 8.395285814798505e-07, + "loss": 0.9792, + "step": 9787 + }, + { + "epoch": 0.7061794307564662, + "grad_norm": 4.119384179618006, + "learning_rate": 8.391479726236815e-07, + "loss": 0.9129, + "step": 9788 + }, + { + "epoch": 0.7062515782258937, + "grad_norm": 2.3907439757018603, + "learning_rate": 8.387674271573441e-07, + "loss": 0.9621, + "step": 9789 + }, + { + "epoch": 0.7063237256953212, + "grad_norm": 2.9530491087512054, + "learning_rate": 8.38386945101619e-07, + "loss": 0.8936, + "step": 9790 + }, + { + "epoch": 0.7063958731647487, + "grad_norm": 2.0947509195116543, + "learning_rate": 8.380065264772827e-07, + "loss": 0.9596, + "step": 9791 + }, + { + "epoch": 0.7064680206341762, + "grad_norm": 2.8275532855907817, + "learning_rate": 8.376261713051089e-07, + "loss": 0.9794, + "step": 9792 + }, + { + "epoch": 0.7065401681036038, + "grad_norm": 3.2080855500950274, + "learning_rate": 8.372458796058666e-07, + "loss": 0.8977, + "step": 9793 + }, + { + "epoch": 0.7066123155730313, + "grad_norm": 4.072049698262189, + "learning_rate": 8.368656514003221e-07, + "loss": 0.9973, + "step": 9794 + }, + { + "epoch": 0.7066844630424588, + "grad_norm": 0.7814090948238049, + "learning_rate": 8.364854867092388e-07, + "loss": 0.7681, + "step": 9795 + }, + { + "epoch": 0.7067566105118863, + "grad_norm": 4.48900851341047, + "learning_rate": 8.361053855533757e-07, + "loss": 0.9122, + "step": 9796 + }, + { + "epoch": 0.7068287579813138, + "grad_norm": 2.3801991624606784, + "learning_rate": 8.357253479534891e-07, + "loss": 0.8315, + "step": 9797 + }, + { + "epoch": 0.7069009054507414, + "grad_norm": 3.179148635379825, + "learning_rate": 8.353453739303311e-07, + "loss": 0.9154, + "step": 9798 + }, + { + "epoch": 0.7069730529201689, + "grad_norm": 0.753599272376922, + "learning_rate": 8.349654635046517e-07, + "loss": 0.8685, + "step": 9799 + }, + { + "epoch": 0.7070452003895963, + "grad_norm": 2.4777791497474295, + "learning_rate": 8.345856166971938e-07, + "loss": 0.9689, + "step": 9800 + }, + { + "epoch": 0.7071173478590238, + "grad_norm": 2.014636958242563, + "learning_rate": 8.34205833528703e-07, + "loss": 0.9818, + "step": 9801 + }, + { + "epoch": 0.7071894953284513, + "grad_norm": 8.97805696668119, + "learning_rate": 8.338261140199149e-07, + "loss": 0.9257, + "step": 9802 + }, + { + "epoch": 0.7072616427978788, + "grad_norm": 2.88078290417574, + "learning_rate": 8.334464581915659e-07, + "loss": 0.8754, + "step": 9803 + }, + { + "epoch": 0.7073337902673064, + "grad_norm": 2.8468937526717544, + "learning_rate": 8.330668660643874e-07, + "loss": 0.8641, + "step": 9804 + }, + { + "epoch": 0.7074059377367339, + "grad_norm": 2.170917327290736, + "learning_rate": 8.326873376591075e-07, + "loss": 0.9707, + "step": 9805 + }, + { + "epoch": 0.7074780852061614, + "grad_norm": 2.648491602882352, + "learning_rate": 8.323078729964508e-07, + "loss": 0.8336, + "step": 9806 + }, + { + "epoch": 0.7075502326755889, + "grad_norm": 2.522585887524253, + "learning_rate": 8.319284720971385e-07, + "loss": 0.914, + "step": 9807 + }, + { + "epoch": 0.7076223801450164, + "grad_norm": 2.1658219989608707, + "learning_rate": 8.315491349818891e-07, + "loss": 0.8474, + "step": 9808 + }, + { + "epoch": 0.707694527614444, + "grad_norm": 2.747421269351794, + "learning_rate": 8.311698616714151e-07, + "loss": 1.0038, + "step": 9809 + }, + { + "epoch": 0.7077666750838715, + "grad_norm": 6.8767104361998435, + "learning_rate": 8.307906521864283e-07, + "loss": 0.8546, + "step": 9810 + }, + { + "epoch": 0.707838822553299, + "grad_norm": 3.3628106579980694, + "learning_rate": 8.304115065476355e-07, + "loss": 0.8489, + "step": 9811 + }, + { + "epoch": 0.7079109700227264, + "grad_norm": 2.9586132257433477, + "learning_rate": 8.300324247757409e-07, + "loss": 0.8816, + "step": 9812 + }, + { + "epoch": 0.7079831174921539, + "grad_norm": 4.932712986751608, + "learning_rate": 8.296534068914443e-07, + "loss": 0.9121, + "step": 9813 + }, + { + "epoch": 0.7080552649615814, + "grad_norm": 2.1577091532051855, + "learning_rate": 8.292744529154434e-07, + "loss": 0.9097, + "step": 9814 + }, + { + "epoch": 0.708127412431009, + "grad_norm": 5.548801828904164, + "learning_rate": 8.288955628684293e-07, + "loss": 0.8585, + "step": 9815 + }, + { + "epoch": 0.7081995599004365, + "grad_norm": 2.8397885071892177, + "learning_rate": 8.285167367710948e-07, + "loss": 0.8942, + "step": 9816 + }, + { + "epoch": 0.708271707369864, + "grad_norm": 9.891448024356933, + "learning_rate": 8.281379746441239e-07, + "loss": 0.9337, + "step": 9817 + }, + { + "epoch": 0.7083438548392915, + "grad_norm": 2.518065222709752, + "learning_rate": 8.277592765082e-07, + "loss": 1.0128, + "step": 9818 + }, + { + "epoch": 0.708416002308719, + "grad_norm": 3.0822097861984825, + "learning_rate": 8.273806423840026e-07, + "loss": 0.9896, + "step": 9819 + }, + { + "epoch": 0.7084881497781466, + "grad_norm": 3.319328323808935, + "learning_rate": 8.270020722922073e-07, + "loss": 0.8882, + "step": 9820 + }, + { + "epoch": 0.7085602972475741, + "grad_norm": 2.1471286185214606, + "learning_rate": 8.266235662534869e-07, + "loss": 0.8022, + "step": 9821 + }, + { + "epoch": 0.7086324447170016, + "grad_norm": 2.731473680498247, + "learning_rate": 8.262451242885096e-07, + "loss": 0.845, + "step": 9822 + }, + { + "epoch": 0.7087045921864291, + "grad_norm": 3.6304096766296245, + "learning_rate": 8.258667464179416e-07, + "loss": 0.971, + "step": 9823 + }, + { + "epoch": 0.7087767396558565, + "grad_norm": 2.4228835436283576, + "learning_rate": 8.254884326624427e-07, + "loss": 0.9212, + "step": 9824 + }, + { + "epoch": 0.708848887125284, + "grad_norm": 3.6293862213281867, + "learning_rate": 8.251101830426741e-07, + "loss": 1.0141, + "step": 9825 + }, + { + "epoch": 0.7089210345947116, + "grad_norm": 3.748798813764423, + "learning_rate": 8.247319975792883e-07, + "loss": 0.9923, + "step": 9826 + }, + { + "epoch": 0.7089931820641391, + "grad_norm": 1.9841145021560664, + "learning_rate": 8.243538762929376e-07, + "loss": 1.0307, + "step": 9827 + }, + { + "epoch": 0.7090653295335666, + "grad_norm": 2.675466076186166, + "learning_rate": 8.239758192042696e-07, + "loss": 0.9405, + "step": 9828 + }, + { + "epoch": 0.7091374770029941, + "grad_norm": 0.7360851585163728, + "learning_rate": 8.235978263339285e-07, + "loss": 0.8062, + "step": 9829 + }, + { + "epoch": 0.7092096244724216, + "grad_norm": 3.9401138349247637, + "learning_rate": 8.232198977025553e-07, + "loss": 0.8807, + "step": 9830 + }, + { + "epoch": 0.7092817719418492, + "grad_norm": 4.322242678147402, + "learning_rate": 8.228420333307877e-07, + "loss": 0.9573, + "step": 9831 + }, + { + "epoch": 0.7093539194112767, + "grad_norm": 0.7346992994080569, + "learning_rate": 8.224642332392587e-07, + "loss": 0.822, + "step": 9832 + }, + { + "epoch": 0.7094260668807042, + "grad_norm": 2.118219103170601, + "learning_rate": 8.220864974485984e-07, + "loss": 0.9267, + "step": 9833 + }, + { + "epoch": 0.7094982143501317, + "grad_norm": 2.827009202233077, + "learning_rate": 8.217088259794343e-07, + "loss": 0.8955, + "step": 9834 + }, + { + "epoch": 0.7095703618195591, + "grad_norm": 3.473091089681369, + "learning_rate": 8.213312188523891e-07, + "loss": 0.9783, + "step": 9835 + }, + { + "epoch": 0.7096425092889866, + "grad_norm": 3.263186187491955, + "learning_rate": 8.209536760880829e-07, + "loss": 0.8921, + "step": 9836 + }, + { + "epoch": 0.7097146567584142, + "grad_norm": 5.365248239538875, + "learning_rate": 8.205761977071315e-07, + "loss": 0.9662, + "step": 9837 + }, + { + "epoch": 0.7097868042278417, + "grad_norm": 2.3282749204560464, + "learning_rate": 8.201987837301487e-07, + "loss": 0.9462, + "step": 9838 + }, + { + "epoch": 0.7098589516972692, + "grad_norm": 6.707067081842259, + "learning_rate": 8.198214341777416e-07, + "loss": 0.8762, + "step": 9839 + }, + { + "epoch": 0.7099310991666967, + "grad_norm": 2.3085348828551164, + "learning_rate": 8.194441490705184e-07, + "loss": 1.0099, + "step": 9840 + }, + { + "epoch": 0.7100032466361242, + "grad_norm": 3.027387064962304, + "learning_rate": 8.190669284290794e-07, + "loss": 0.9157, + "step": 9841 + }, + { + "epoch": 0.7100753941055518, + "grad_norm": 4.238004698278641, + "learning_rate": 8.186897722740236e-07, + "loss": 0.9745, + "step": 9842 + }, + { + "epoch": 0.7101475415749793, + "grad_norm": 2.7168099507964003, + "learning_rate": 8.183126806259463e-07, + "loss": 0.9845, + "step": 9843 + }, + { + "epoch": 0.7102196890444068, + "grad_norm": 0.6812321004353389, + "learning_rate": 8.179356535054392e-07, + "loss": 0.7423, + "step": 9844 + }, + { + "epoch": 0.7102918365138343, + "grad_norm": 2.033523578290435, + "learning_rate": 8.1755869093309e-07, + "loss": 1.0216, + "step": 9845 + }, + { + "epoch": 0.7103639839832618, + "grad_norm": 2.451600785743611, + "learning_rate": 8.171817929294836e-07, + "loss": 0.8277, + "step": 9846 + }, + { + "epoch": 0.7104361314526892, + "grad_norm": 2.3318133739629405, + "learning_rate": 8.168049595152014e-07, + "loss": 1.0011, + "step": 9847 + }, + { + "epoch": 0.7105082789221168, + "grad_norm": 3.4418412971682146, + "learning_rate": 8.164281907108197e-07, + "loss": 0.9323, + "step": 9848 + }, + { + "epoch": 0.7105804263915443, + "grad_norm": 5.01059439721266, + "learning_rate": 8.16051486536913e-07, + "loss": 0.9374, + "step": 9849 + }, + { + "epoch": 0.7106525738609718, + "grad_norm": 3.0587306278240436, + "learning_rate": 8.156748470140518e-07, + "loss": 1.021, + "step": 9850 + }, + { + "epoch": 0.7107247213303993, + "grad_norm": 2.7740177890672006, + "learning_rate": 8.15298272162803e-07, + "loss": 0.9431, + "step": 9851 + }, + { + "epoch": 0.7107968687998268, + "grad_norm": 5.402818241093167, + "learning_rate": 8.149217620037301e-07, + "loss": 0.9731, + "step": 9852 + }, + { + "epoch": 0.7108690162692544, + "grad_norm": 3.629283873062896, + "learning_rate": 8.145453165573935e-07, + "loss": 0.8375, + "step": 9853 + }, + { + "epoch": 0.7109411637386819, + "grad_norm": 5.594055167457033, + "learning_rate": 8.14168935844347e-07, + "loss": 0.917, + "step": 9854 + }, + { + "epoch": 0.7110133112081094, + "grad_norm": 0.7288120270205671, + "learning_rate": 8.137926198851468e-07, + "loss": 0.7895, + "step": 9855 + }, + { + "epoch": 0.7110854586775369, + "grad_norm": 0.693340118570627, + "learning_rate": 8.134163687003396e-07, + "loss": 0.7527, + "step": 9856 + }, + { + "epoch": 0.7111576061469644, + "grad_norm": 3.5391415888220665, + "learning_rate": 8.13040182310472e-07, + "loss": 0.9185, + "step": 9857 + }, + { + "epoch": 0.711229753616392, + "grad_norm": 2.0856459053050385, + "learning_rate": 8.126640607360862e-07, + "loss": 1.0024, + "step": 9858 + }, + { + "epoch": 0.7113019010858194, + "grad_norm": 2.69565347919593, + "learning_rate": 8.122880039977205e-07, + "loss": 0.9041, + "step": 9859 + }, + { + "epoch": 0.7113740485552469, + "grad_norm": 3.52814556484125, + "learning_rate": 8.1191201211591e-07, + "loss": 0.9083, + "step": 9860 + }, + { + "epoch": 0.7114461960246744, + "grad_norm": 2.971936252736931, + "learning_rate": 8.115360851111865e-07, + "loss": 0.805, + "step": 9861 + }, + { + "epoch": 0.7115183434941019, + "grad_norm": 2.4458529816530934, + "learning_rate": 8.111602230040789e-07, + "loss": 0.8896, + "step": 9862 + }, + { + "epoch": 0.7115904909635294, + "grad_norm": 3.829303544468779, + "learning_rate": 8.107844258151087e-07, + "loss": 0.948, + "step": 9863 + }, + { + "epoch": 0.711662638432957, + "grad_norm": 0.7897154872751865, + "learning_rate": 8.104086935648003e-07, + "loss": 0.7857, + "step": 9864 + }, + { + "epoch": 0.7117347859023845, + "grad_norm": 2.6978907540941335, + "learning_rate": 8.100330262736687e-07, + "loss": 0.8784, + "step": 9865 + }, + { + "epoch": 0.711806933371812, + "grad_norm": 2.200243424906711, + "learning_rate": 8.096574239622285e-07, + "loss": 0.917, + "step": 9866 + }, + { + "epoch": 0.7118790808412395, + "grad_norm": 4.179478263965487, + "learning_rate": 8.092818866509896e-07, + "loss": 0.8248, + "step": 9867 + }, + { + "epoch": 0.711951228310667, + "grad_norm": 2.2107372395432856, + "learning_rate": 8.089064143604594e-07, + "loss": 0.9715, + "step": 9868 + }, + { + "epoch": 0.7120233757800946, + "grad_norm": 4.347877572210902, + "learning_rate": 8.085310071111401e-07, + "loss": 0.8863, + "step": 9869 + }, + { + "epoch": 0.7120955232495221, + "grad_norm": 4.806317804897509, + "learning_rate": 8.08155664923533e-07, + "loss": 0.9025, + "step": 9870 + }, + { + "epoch": 0.7121676707189495, + "grad_norm": 3.151579167364981, + "learning_rate": 8.077803878181322e-07, + "loss": 0.9464, + "step": 9871 + }, + { + "epoch": 0.712239818188377, + "grad_norm": 2.7871947459222826, + "learning_rate": 8.074051758154308e-07, + "loss": 0.91, + "step": 9872 + }, + { + "epoch": 0.7123119656578045, + "grad_norm": 3.823513320526509, + "learning_rate": 8.07030028935918e-07, + "loss": 0.9013, + "step": 9873 + }, + { + "epoch": 0.712384113127232, + "grad_norm": 4.328461671810891, + "learning_rate": 8.066549472000792e-07, + "loss": 1.0473, + "step": 9874 + }, + { + "epoch": 0.7124562605966596, + "grad_norm": 2.509292593585082, + "learning_rate": 8.062799306283968e-07, + "loss": 0.874, + "step": 9875 + }, + { + "epoch": 0.7125284080660871, + "grad_norm": 3.9957103377510297, + "learning_rate": 8.059049792413468e-07, + "loss": 0.9641, + "step": 9876 + }, + { + "epoch": 0.7126005555355146, + "grad_norm": 2.282591333974834, + "learning_rate": 8.055300930594071e-07, + "loss": 0.8488, + "step": 9877 + }, + { + "epoch": 0.7126727030049421, + "grad_norm": 2.6184243994436436, + "learning_rate": 8.051552721030457e-07, + "loss": 0.7845, + "step": 9878 + }, + { + "epoch": 0.7127448504743696, + "grad_norm": 5.068798526856179, + "learning_rate": 8.047805163927332e-07, + "loss": 0.9748, + "step": 9879 + }, + { + "epoch": 0.7128169979437972, + "grad_norm": 3.9047050167330433, + "learning_rate": 8.044058259489315e-07, + "loss": 0.8813, + "step": 9880 + }, + { + "epoch": 0.7128891454132247, + "grad_norm": 2.4943756733594507, + "learning_rate": 8.040312007921015e-07, + "loss": 0.9931, + "step": 9881 + }, + { + "epoch": 0.7129612928826521, + "grad_norm": 2.2240383899106018, + "learning_rate": 8.036566409427006e-07, + "loss": 0.8602, + "step": 9882 + }, + { + "epoch": 0.7130334403520796, + "grad_norm": 3.541147725102489, + "learning_rate": 8.032821464211817e-07, + "loss": 0.9554, + "step": 9883 + }, + { + "epoch": 0.7131055878215071, + "grad_norm": 1.9499701081943699, + "learning_rate": 8.029077172479945e-07, + "loss": 0.8644, + "step": 9884 + }, + { + "epoch": 0.7131777352909346, + "grad_norm": 3.491964516571948, + "learning_rate": 8.025333534435857e-07, + "loss": 0.8228, + "step": 9885 + }, + { + "epoch": 0.7132498827603622, + "grad_norm": 0.8119803747755797, + "learning_rate": 8.021590550283981e-07, + "loss": 0.8171, + "step": 9886 + }, + { + "epoch": 0.7133220302297897, + "grad_norm": 2.614760181378366, + "learning_rate": 8.017848220228696e-07, + "loss": 0.8029, + "step": 9887 + }, + { + "epoch": 0.7133941776992172, + "grad_norm": 2.112948632991152, + "learning_rate": 8.014106544474361e-07, + "loss": 0.9263, + "step": 9888 + }, + { + "epoch": 0.7134663251686447, + "grad_norm": 2.9162018723487795, + "learning_rate": 8.0103655232253e-07, + "loss": 0.9107, + "step": 9889 + }, + { + "epoch": 0.7135384726380722, + "grad_norm": 2.810304420516418, + "learning_rate": 8.006625156685793e-07, + "loss": 0.9396, + "step": 9890 + }, + { + "epoch": 0.7136106201074998, + "grad_norm": 8.349222167556848, + "learning_rate": 8.00288544506009e-07, + "loss": 0.8299, + "step": 9891 + }, + { + "epoch": 0.7136827675769273, + "grad_norm": 2.7539684964984277, + "learning_rate": 7.999146388552407e-07, + "loss": 0.9619, + "step": 9892 + }, + { + "epoch": 0.7137549150463548, + "grad_norm": 3.1183828867881638, + "learning_rate": 7.995407987366898e-07, + "loss": 0.972, + "step": 9893 + }, + { + "epoch": 0.7138270625157822, + "grad_norm": 0.7119587699760949, + "learning_rate": 7.991670241707735e-07, + "loss": 0.7399, + "step": 9894 + }, + { + "epoch": 0.7138992099852097, + "grad_norm": 6.438835227785859, + "learning_rate": 7.987933151779001e-07, + "loss": 0.9208, + "step": 9895 + }, + { + "epoch": 0.7139713574546372, + "grad_norm": 6.94619096622091, + "learning_rate": 7.984196717784768e-07, + "loss": 1.003, + "step": 9896 + }, + { + "epoch": 0.7140435049240648, + "grad_norm": 3.4649667814789984, + "learning_rate": 7.980460939929072e-07, + "loss": 0.8665, + "step": 9897 + }, + { + "epoch": 0.7141156523934923, + "grad_norm": 9.336699514534407, + "learning_rate": 7.97672581841591e-07, + "loss": 0.8611, + "step": 9898 + }, + { + "epoch": 0.7141877998629198, + "grad_norm": 3.1320662905911214, + "learning_rate": 7.972991353449241e-07, + "loss": 0.9539, + "step": 9899 + }, + { + "epoch": 0.7142599473323473, + "grad_norm": 2.5728543128567436, + "learning_rate": 7.969257545232993e-07, + "loss": 0.9617, + "step": 9900 + }, + { + "epoch": 0.7143320948017748, + "grad_norm": 5.0734381541494615, + "learning_rate": 7.965524393971062e-07, + "loss": 0.969, + "step": 9901 + }, + { + "epoch": 0.7144042422712024, + "grad_norm": 29.265385159855448, + "learning_rate": 7.961791899867285e-07, + "loss": 0.985, + "step": 9902 + }, + { + "epoch": 0.7144763897406299, + "grad_norm": 3.4005229996491098, + "learning_rate": 7.95806006312549e-07, + "loss": 0.9636, + "step": 9903 + }, + { + "epoch": 0.7145485372100574, + "grad_norm": 2.7620182117798153, + "learning_rate": 7.954328883949455e-07, + "loss": 0.9068, + "step": 9904 + }, + { + "epoch": 0.7146206846794849, + "grad_norm": 3.168729829441858, + "learning_rate": 7.950598362542929e-07, + "loss": 0.8909, + "step": 9905 + }, + { + "epoch": 0.7146928321489123, + "grad_norm": 2.666604538034852, + "learning_rate": 7.946868499109623e-07, + "loss": 0.8716, + "step": 9906 + }, + { + "epoch": 0.7147649796183398, + "grad_norm": 4.789161668656154, + "learning_rate": 7.943139293853207e-07, + "loss": 1.0308, + "step": 9907 + }, + { + "epoch": 0.7148371270877674, + "grad_norm": 2.041523934909077, + "learning_rate": 7.939410746977322e-07, + "loss": 0.8809, + "step": 9908 + }, + { + "epoch": 0.7149092745571949, + "grad_norm": 2.295736536753576, + "learning_rate": 7.935682858685575e-07, + "loss": 0.9001, + "step": 9909 + }, + { + "epoch": 0.7149814220266224, + "grad_norm": 3.3080926040455574, + "learning_rate": 7.931955629181521e-07, + "loss": 0.852, + "step": 9910 + }, + { + "epoch": 0.7150535694960499, + "grad_norm": 3.7714628814606344, + "learning_rate": 7.928229058668694e-07, + "loss": 0.9021, + "step": 9911 + }, + { + "epoch": 0.7151257169654774, + "grad_norm": 3.0844121712374366, + "learning_rate": 7.924503147350592e-07, + "loss": 0.9499, + "step": 9912 + }, + { + "epoch": 0.715197864434905, + "grad_norm": 2.0243812039437175, + "learning_rate": 7.92077789543067e-07, + "loss": 0.9047, + "step": 9913 + }, + { + "epoch": 0.7152700119043325, + "grad_norm": 3.098047065855104, + "learning_rate": 7.917053303112359e-07, + "loss": 0.7352, + "step": 9914 + }, + { + "epoch": 0.71534215937376, + "grad_norm": 2.7602033328746374, + "learning_rate": 7.913329370599025e-07, + "loss": 0.9098, + "step": 9915 + }, + { + "epoch": 0.7154143068431875, + "grad_norm": 2.2685170624166764, + "learning_rate": 7.909606098094041e-07, + "loss": 0.9405, + "step": 9916 + }, + { + "epoch": 0.715486454312615, + "grad_norm": 7.347692851533638, + "learning_rate": 7.9058834858007e-07, + "loss": 0.9922, + "step": 9917 + }, + { + "epoch": 0.7155586017820424, + "grad_norm": 2.4325008479368764, + "learning_rate": 7.902161533922305e-07, + "loss": 0.8909, + "step": 9918 + }, + { + "epoch": 0.71563074925147, + "grad_norm": 0.84705637435933, + "learning_rate": 7.898440242662074e-07, + "loss": 0.8182, + "step": 9919 + }, + { + "epoch": 0.7157028967208975, + "grad_norm": 2.593653665615015, + "learning_rate": 7.894719612223226e-07, + "loss": 0.9224, + "step": 9920 + }, + { + "epoch": 0.715775044190325, + "grad_norm": 2.664706960459185, + "learning_rate": 7.890999642808926e-07, + "loss": 0.9412, + "step": 9921 + }, + { + "epoch": 0.7158471916597525, + "grad_norm": 2.947986633405054, + "learning_rate": 7.887280334622311e-07, + "loss": 0.8418, + "step": 9922 + }, + { + "epoch": 0.71591933912918, + "grad_norm": 2.3239781872288683, + "learning_rate": 7.883561687866476e-07, + "loss": 0.9045, + "step": 9923 + }, + { + "epoch": 0.7159914865986076, + "grad_norm": 1.8598875092770846, + "learning_rate": 7.879843702744491e-07, + "loss": 1.0246, + "step": 9924 + }, + { + "epoch": 0.7160636340680351, + "grad_norm": 2.5074160253548956, + "learning_rate": 7.876126379459367e-07, + "loss": 0.8472, + "step": 9925 + }, + { + "epoch": 0.7161357815374626, + "grad_norm": 2.9412141410434773, + "learning_rate": 7.872409718214099e-07, + "loss": 0.8446, + "step": 9926 + }, + { + "epoch": 0.7162079290068901, + "grad_norm": 2.62821980961939, + "learning_rate": 7.868693719211643e-07, + "loss": 0.9475, + "step": 9927 + }, + { + "epoch": 0.7162800764763176, + "grad_norm": 0.7641282535023805, + "learning_rate": 7.864978382654915e-07, + "loss": 0.7929, + "step": 9928 + }, + { + "epoch": 0.716352223945745, + "grad_norm": 4.314677255523979, + "learning_rate": 7.861263708746795e-07, + "loss": 0.9681, + "step": 9929 + }, + { + "epoch": 0.7164243714151726, + "grad_norm": 2.3923892822477115, + "learning_rate": 7.857549697690127e-07, + "loss": 0.9915, + "step": 9930 + }, + { + "epoch": 0.7164965188846001, + "grad_norm": 5.336108181235701, + "learning_rate": 7.853836349687728e-07, + "loss": 0.9002, + "step": 9931 + }, + { + "epoch": 0.7165686663540276, + "grad_norm": 2.889317232149001, + "learning_rate": 7.850123664942348e-07, + "loss": 0.9065, + "step": 9932 + }, + { + "epoch": 0.7166408138234551, + "grad_norm": 2.3211102382673894, + "learning_rate": 7.846411643656751e-07, + "loss": 0.8895, + "step": 9933 + }, + { + "epoch": 0.7167129612928826, + "grad_norm": 2.7072906149705225, + "learning_rate": 7.842700286033616e-07, + "loss": 0.9562, + "step": 9934 + }, + { + "epoch": 0.7167851087623102, + "grad_norm": 2.0595462340045785, + "learning_rate": 7.838989592275616e-07, + "loss": 0.9008, + "step": 9935 + }, + { + "epoch": 0.7168572562317377, + "grad_norm": 1.985866796884182, + "learning_rate": 7.835279562585373e-07, + "loss": 0.9041, + "step": 9936 + }, + { + "epoch": 0.7169294037011652, + "grad_norm": 3.9376668743315513, + "learning_rate": 7.831570197165481e-07, + "loss": 0.9071, + "step": 9937 + }, + { + "epoch": 0.7170015511705927, + "grad_norm": 2.290950790489525, + "learning_rate": 7.827861496218502e-07, + "loss": 0.9004, + "step": 9938 + }, + { + "epoch": 0.7170736986400202, + "grad_norm": 2.871303836116846, + "learning_rate": 7.824153459946934e-07, + "loss": 0.9123, + "step": 9939 + }, + { + "epoch": 0.7171458461094478, + "grad_norm": 2.4962954730694817, + "learning_rate": 7.820446088553285e-07, + "loss": 0.947, + "step": 9940 + }, + { + "epoch": 0.7172179935788752, + "grad_norm": 6.354939931710176, + "learning_rate": 7.81673938223998e-07, + "loss": 0.9326, + "step": 9941 + }, + { + "epoch": 0.7172901410483027, + "grad_norm": 2.0256888924997285, + "learning_rate": 7.813033341209439e-07, + "loss": 0.9748, + "step": 9942 + }, + { + "epoch": 0.7173622885177302, + "grad_norm": 2.784083862154222, + "learning_rate": 7.809327965664032e-07, + "loss": 0.7691, + "step": 9943 + }, + { + "epoch": 0.7174344359871577, + "grad_norm": 2.469262564843144, + "learning_rate": 7.805623255806095e-07, + "loss": 0.8453, + "step": 9944 + }, + { + "epoch": 0.7175065834565852, + "grad_norm": 3.1517130659276336, + "learning_rate": 7.801919211837931e-07, + "loss": 0.959, + "step": 9945 + }, + { + "epoch": 0.7175787309260128, + "grad_norm": 3.10245745067944, + "learning_rate": 7.79821583396181e-07, + "loss": 0.828, + "step": 9946 + }, + { + "epoch": 0.7176508783954403, + "grad_norm": 2.5649807714741697, + "learning_rate": 7.794513122379938e-07, + "loss": 0.9994, + "step": 9947 + }, + { + "epoch": 0.7177230258648678, + "grad_norm": 2.9915888812508693, + "learning_rate": 7.790811077294537e-07, + "loss": 0.9571, + "step": 9948 + }, + { + "epoch": 0.7177951733342953, + "grad_norm": 10.007938285948388, + "learning_rate": 7.787109698907737e-07, + "loss": 0.9603, + "step": 9949 + }, + { + "epoch": 0.7178673208037228, + "grad_norm": 2.7310211539690634, + "learning_rate": 7.783408987421667e-07, + "loss": 0.9557, + "step": 9950 + }, + { + "epoch": 0.7179394682731504, + "grad_norm": 3.4388811977905096, + "learning_rate": 7.779708943038408e-07, + "loss": 0.894, + "step": 9951 + }, + { + "epoch": 0.7180116157425779, + "grad_norm": 3.2452028523243928, + "learning_rate": 7.776009565960007e-07, + "loss": 0.9009, + "step": 9952 + }, + { + "epoch": 0.7180837632120053, + "grad_norm": 2.755605965797544, + "learning_rate": 7.772310856388475e-07, + "loss": 0.8616, + "step": 9953 + }, + { + "epoch": 0.7181559106814328, + "grad_norm": 2.2865083264654538, + "learning_rate": 7.768612814525772e-07, + "loss": 0.928, + "step": 9954 + }, + { + "epoch": 0.7182280581508603, + "grad_norm": 4.287556701819053, + "learning_rate": 7.764915440573856e-07, + "loss": 0.8493, + "step": 9955 + }, + { + "epoch": 0.7183002056202878, + "grad_norm": 5.508886267783803, + "learning_rate": 7.7612187347346e-07, + "loss": 0.8676, + "step": 9956 + }, + { + "epoch": 0.7183723530897154, + "grad_norm": 2.131956830094015, + "learning_rate": 7.757522697209899e-07, + "loss": 0.9354, + "step": 9957 + }, + { + "epoch": 0.7184445005591429, + "grad_norm": 4.053497198540319, + "learning_rate": 7.753827328201554e-07, + "loss": 0.931, + "step": 9958 + }, + { + "epoch": 0.7185166480285704, + "grad_norm": 2.0953701071106043, + "learning_rate": 7.750132627911367e-07, + "loss": 0.8008, + "step": 9959 + }, + { + "epoch": 0.7185887954979979, + "grad_norm": 5.203081560383137, + "learning_rate": 7.746438596541089e-07, + "loss": 0.8702, + "step": 9960 + }, + { + "epoch": 0.7186609429674254, + "grad_norm": 3.2490555784873254, + "learning_rate": 7.742745234292439e-07, + "loss": 0.9041, + "step": 9961 + }, + { + "epoch": 0.718733090436853, + "grad_norm": 3.683446822568495, + "learning_rate": 7.739052541367102e-07, + "loss": 1.0243, + "step": 9962 + }, + { + "epoch": 0.7188052379062805, + "grad_norm": 5.024386445350485, + "learning_rate": 7.735360517966711e-07, + "loss": 0.9286, + "step": 9963 + }, + { + "epoch": 0.718877385375708, + "grad_norm": 2.2034087302753904, + "learning_rate": 7.73166916429288e-07, + "loss": 1.012, + "step": 9964 + }, + { + "epoch": 0.7189495328451354, + "grad_norm": 2.3307433285984436, + "learning_rate": 7.727978480547182e-07, + "loss": 0.9868, + "step": 9965 + }, + { + "epoch": 0.7190216803145629, + "grad_norm": 2.274744765667543, + "learning_rate": 7.724288466931146e-07, + "loss": 0.9201, + "step": 9966 + }, + { + "epoch": 0.7190938277839904, + "grad_norm": 2.2930588972577057, + "learning_rate": 7.720599123646276e-07, + "loss": 0.8492, + "step": 9967 + }, + { + "epoch": 0.719165975253418, + "grad_norm": 3.310028593740821, + "learning_rate": 7.716910450894039e-07, + "loss": 0.9351, + "step": 9968 + }, + { + "epoch": 0.7192381227228455, + "grad_norm": 4.422117624265202, + "learning_rate": 7.713222448875835e-07, + "loss": 0.8979, + "step": 9969 + }, + { + "epoch": 0.719310270192273, + "grad_norm": 2.462655766396735, + "learning_rate": 7.709535117793085e-07, + "loss": 0.9823, + "step": 9970 + }, + { + "epoch": 0.7193824176617005, + "grad_norm": 3.8022892029648925, + "learning_rate": 7.705848457847111e-07, + "loss": 0.9107, + "step": 9971 + }, + { + "epoch": 0.719454565131128, + "grad_norm": 2.5894522852945916, + "learning_rate": 7.702162469239252e-07, + "loss": 1.0086, + "step": 9972 + }, + { + "epoch": 0.7195267126005556, + "grad_norm": 2.8410334764473357, + "learning_rate": 7.698477152170773e-07, + "loss": 0.8468, + "step": 9973 + }, + { + "epoch": 0.7195988600699831, + "grad_norm": 3.17620869255003, + "learning_rate": 7.694792506842916e-07, + "loss": 0.9953, + "step": 9974 + }, + { + "epoch": 0.7196710075394106, + "grad_norm": 2.245618793193373, + "learning_rate": 7.691108533456887e-07, + "loss": 1.0064, + "step": 9975 + }, + { + "epoch": 0.7197431550088381, + "grad_norm": 3.122504191338541, + "learning_rate": 7.687425232213855e-07, + "loss": 0.935, + "step": 9976 + }, + { + "epoch": 0.7198153024782655, + "grad_norm": 2.9218216345498464, + "learning_rate": 7.68374260331496e-07, + "loss": 0.9552, + "step": 9977 + }, + { + "epoch": 0.719887449947693, + "grad_norm": 3.888000706260523, + "learning_rate": 7.680060646961273e-07, + "loss": 0.9874, + "step": 9978 + }, + { + "epoch": 0.7199595974171206, + "grad_norm": 3.1270953973431115, + "learning_rate": 7.676379363353882e-07, + "loss": 0.8955, + "step": 9979 + }, + { + "epoch": 0.7200317448865481, + "grad_norm": 17.66696905681042, + "learning_rate": 7.672698752693787e-07, + "loss": 0.9628, + "step": 9980 + }, + { + "epoch": 0.7201038923559756, + "grad_norm": 2.891982630929456, + "learning_rate": 7.669018815181978e-07, + "loss": 0.9694, + "step": 9981 + }, + { + "epoch": 0.7201760398254031, + "grad_norm": 0.7701358646814629, + "learning_rate": 7.665339551019403e-07, + "loss": 0.7654, + "step": 9982 + }, + { + "epoch": 0.7202481872948306, + "grad_norm": 3.0983785818926255, + "learning_rate": 7.661660960406975e-07, + "loss": 0.9564, + "step": 9983 + }, + { + "epoch": 0.7203203347642582, + "grad_norm": 0.631548525784466, + "learning_rate": 7.657983043545568e-07, + "loss": 0.7531, + "step": 9984 + }, + { + "epoch": 0.7203924822336857, + "grad_norm": 7.075535180436856, + "learning_rate": 7.654305800636023e-07, + "loss": 0.9374, + "step": 9985 + }, + { + "epoch": 0.7204646297031132, + "grad_norm": 2.3737027490301252, + "learning_rate": 7.650629231879131e-07, + "loss": 0.9314, + "step": 9986 + }, + { + "epoch": 0.7205367771725407, + "grad_norm": 2.7166092435544584, + "learning_rate": 7.646953337475659e-07, + "loss": 0.8614, + "step": 9987 + }, + { + "epoch": 0.7206089246419681, + "grad_norm": 2.1593666220031826, + "learning_rate": 7.643278117626338e-07, + "loss": 0.8639, + "step": 9988 + }, + { + "epoch": 0.7206810721113956, + "grad_norm": 4.19492672188995, + "learning_rate": 7.639603572531854e-07, + "loss": 0.8505, + "step": 9989 + }, + { + "epoch": 0.7207532195808232, + "grad_norm": 2.387464241439103, + "learning_rate": 7.635929702392863e-07, + "loss": 0.82, + "step": 9990 + }, + { + "epoch": 0.7208253670502507, + "grad_norm": 2.510390528203821, + "learning_rate": 7.632256507409982e-07, + "loss": 0.9008, + "step": 9991 + }, + { + "epoch": 0.7208975145196782, + "grad_norm": 4.044637761252666, + "learning_rate": 7.628583987783794e-07, + "loss": 0.8354, + "step": 9992 + }, + { + "epoch": 0.7209696619891057, + "grad_norm": 4.621891110942171, + "learning_rate": 7.624912143714826e-07, + "loss": 1.0283, + "step": 9993 + }, + { + "epoch": 0.7210418094585332, + "grad_norm": 2.4064819422317205, + "learning_rate": 7.621240975403607e-07, + "loss": 0.989, + "step": 9994 + }, + { + "epoch": 0.7211139569279608, + "grad_norm": 2.3639858235936213, + "learning_rate": 7.61757048305058e-07, + "loss": 0.8767, + "step": 9995 + }, + { + "epoch": 0.7211861043973883, + "grad_norm": 2.2045643751124375, + "learning_rate": 7.613900666856203e-07, + "loss": 0.9913, + "step": 9996 + }, + { + "epoch": 0.7212582518668158, + "grad_norm": 2.5651166628357314, + "learning_rate": 7.610231527020854e-07, + "loss": 0.8599, + "step": 9997 + }, + { + "epoch": 0.7213303993362433, + "grad_norm": 2.10753511790999, + "learning_rate": 7.606563063744895e-07, + "loss": 0.9213, + "step": 9998 + }, + { + "epoch": 0.7214025468056708, + "grad_norm": 2.068716333582095, + "learning_rate": 7.602895277228647e-07, + "loss": 1.0084, + "step": 9999 + }, + { + "epoch": 0.7214746942750982, + "grad_norm": 0.7854938066142467, + "learning_rate": 7.599228167672398e-07, + "loss": 0.8188, + "step": 10000 + }, + { + "epoch": 0.7215468417445258, + "grad_norm": 5.039018592347466, + "learning_rate": 7.595561735276398e-07, + "loss": 0.8215, + "step": 10001 + }, + { + "epoch": 0.7216189892139533, + "grad_norm": 7.698047628222168, + "learning_rate": 7.591895980240846e-07, + "loss": 0.9454, + "step": 10002 + }, + { + "epoch": 0.7216911366833808, + "grad_norm": 2.4629383490927834, + "learning_rate": 7.58823090276592e-07, + "loss": 0.8304, + "step": 10003 + }, + { + "epoch": 0.7217632841528083, + "grad_norm": 2.7808541862781255, + "learning_rate": 7.584566503051759e-07, + "loss": 0.8742, + "step": 10004 + }, + { + "epoch": 0.7218354316222358, + "grad_norm": 0.7394722510361631, + "learning_rate": 7.580902781298459e-07, + "loss": 0.8101, + "step": 10005 + }, + { + "epoch": 0.7219075790916634, + "grad_norm": 2.4303228890607875, + "learning_rate": 7.577239737706087e-07, + "loss": 0.9416, + "step": 10006 + }, + { + "epoch": 0.7219797265610909, + "grad_norm": 0.726816645647252, + "learning_rate": 7.573577372474671e-07, + "loss": 0.8608, + "step": 10007 + }, + { + "epoch": 0.7220518740305184, + "grad_norm": 3.364560122257318, + "learning_rate": 7.569915685804178e-07, + "loss": 1.0028, + "step": 10008 + }, + { + "epoch": 0.7221240214999459, + "grad_norm": 2.188462290957761, + "learning_rate": 7.56625467789459e-07, + "loss": 1.0031, + "step": 10009 + }, + { + "epoch": 0.7221961689693734, + "grad_norm": 3.150083958551399, + "learning_rate": 7.562594348945801e-07, + "loss": 0.7997, + "step": 10010 + }, + { + "epoch": 0.722268316438801, + "grad_norm": 5.070349177388048, + "learning_rate": 7.558934699157691e-07, + "loss": 0.9243, + "step": 10011 + }, + { + "epoch": 0.7223404639082284, + "grad_norm": 4.25671484561222, + "learning_rate": 7.555275728730101e-07, + "loss": 0.9819, + "step": 10012 + }, + { + "epoch": 0.7224126113776559, + "grad_norm": 8.540938580503756, + "learning_rate": 7.551617437862837e-07, + "loss": 0.8742, + "step": 10013 + }, + { + "epoch": 0.7224847588470834, + "grad_norm": 2.1680328634593606, + "learning_rate": 7.54795982675566e-07, + "loss": 0.8559, + "step": 10014 + }, + { + "epoch": 0.7225569063165109, + "grad_norm": 2.5269632657411205, + "learning_rate": 7.544302895608302e-07, + "loss": 0.827, + "step": 10015 + }, + { + "epoch": 0.7226290537859384, + "grad_norm": 2.497266992156575, + "learning_rate": 7.54064664462046e-07, + "loss": 1.0022, + "step": 10016 + }, + { + "epoch": 0.722701201255366, + "grad_norm": 4.069698353304536, + "learning_rate": 7.53699107399177e-07, + "loss": 0.9819, + "step": 10017 + }, + { + "epoch": 0.7227733487247935, + "grad_norm": 2.2320327753443006, + "learning_rate": 7.533336183921872e-07, + "loss": 0.9313, + "step": 10018 + }, + { + "epoch": 0.722845496194221, + "grad_norm": 2.819840410194315, + "learning_rate": 7.529681974610329e-07, + "loss": 0.9873, + "step": 10019 + }, + { + "epoch": 0.7229176436636485, + "grad_norm": 2.1941888180126607, + "learning_rate": 7.526028446256689e-07, + "loss": 0.9774, + "step": 10020 + }, + { + "epoch": 0.722989791133076, + "grad_norm": 2.606864223782626, + "learning_rate": 7.52237559906046e-07, + "loss": 0.9678, + "step": 10021 + }, + { + "epoch": 0.7230619386025036, + "grad_norm": 11.951894978584006, + "learning_rate": 7.518723433221109e-07, + "loss": 0.9837, + "step": 10022 + }, + { + "epoch": 0.7231340860719311, + "grad_norm": 2.0330486843586435, + "learning_rate": 7.515071948938067e-07, + "loss": 0.8919, + "step": 10023 + }, + { + "epoch": 0.7232062335413585, + "grad_norm": 1.972460988266938, + "learning_rate": 7.511421146410735e-07, + "loss": 0.9425, + "step": 10024 + }, + { + "epoch": 0.723278381010786, + "grad_norm": 3.017357521264726, + "learning_rate": 7.507771025838456e-07, + "loss": 0.8383, + "step": 10025 + }, + { + "epoch": 0.7233505284802135, + "grad_norm": 3.615468483902641, + "learning_rate": 7.504121587420555e-07, + "loss": 0.9779, + "step": 10026 + }, + { + "epoch": 0.723422675949641, + "grad_norm": 2.4050104490755033, + "learning_rate": 7.500472831356319e-07, + "loss": 0.8668, + "step": 10027 + }, + { + "epoch": 0.7234948234190686, + "grad_norm": 2.73590829638674, + "learning_rate": 7.496824757844988e-07, + "loss": 0.8602, + "step": 10028 + }, + { + "epoch": 0.7235669708884961, + "grad_norm": 3.144629966181481, + "learning_rate": 7.493177367085772e-07, + "loss": 0.8978, + "step": 10029 + }, + { + "epoch": 0.7236391183579236, + "grad_norm": 2.383222600947659, + "learning_rate": 7.489530659277842e-07, + "loss": 0.9254, + "step": 10030 + }, + { + "epoch": 0.7237112658273511, + "grad_norm": 2.05789375347515, + "learning_rate": 7.485884634620336e-07, + "loss": 0.8481, + "step": 10031 + }, + { + "epoch": 0.7237834132967786, + "grad_norm": 4.407752896743973, + "learning_rate": 7.482239293312334e-07, + "loss": 0.932, + "step": 10032 + }, + { + "epoch": 0.7238555607662062, + "grad_norm": 2.4036794035369593, + "learning_rate": 7.478594635552915e-07, + "loss": 0.9209, + "step": 10033 + }, + { + "epoch": 0.7239277082356337, + "grad_norm": 2.6845419817359684, + "learning_rate": 7.474950661541086e-07, + "loss": 0.9697, + "step": 10034 + }, + { + "epoch": 0.7239998557050611, + "grad_norm": 4.47120716325067, + "learning_rate": 7.471307371475834e-07, + "loss": 0.9536, + "step": 10035 + }, + { + "epoch": 0.7240720031744886, + "grad_norm": 3.2362341213905013, + "learning_rate": 7.467664765556108e-07, + "loss": 0.8849, + "step": 10036 + }, + { + "epoch": 0.7241441506439161, + "grad_norm": 2.3900430008279945, + "learning_rate": 7.464022843980815e-07, + "loss": 0.8494, + "step": 10037 + }, + { + "epoch": 0.7242162981133436, + "grad_norm": 3.0162863525021173, + "learning_rate": 7.460381606948827e-07, + "loss": 0.8471, + "step": 10038 + }, + { + "epoch": 0.7242884455827712, + "grad_norm": 2.4348137554403704, + "learning_rate": 7.45674105465898e-07, + "loss": 0.8951, + "step": 10039 + }, + { + "epoch": 0.7243605930521987, + "grad_norm": 3.4965334482577393, + "learning_rate": 7.453101187310078e-07, + "loss": 0.9324, + "step": 10040 + }, + { + "epoch": 0.7244327405216262, + "grad_norm": 2.295058693561206, + "learning_rate": 7.449462005100866e-07, + "loss": 0.8545, + "step": 10041 + }, + { + "epoch": 0.7245048879910537, + "grad_norm": 2.676992679550313, + "learning_rate": 7.445823508230073e-07, + "loss": 0.9212, + "step": 10042 + }, + { + "epoch": 0.7245770354604812, + "grad_norm": 3.6540711260274317, + "learning_rate": 7.442185696896384e-07, + "loss": 0.8197, + "step": 10043 + }, + { + "epoch": 0.7246491829299088, + "grad_norm": 2.574962641620924, + "learning_rate": 7.438548571298449e-07, + "loss": 0.9327, + "step": 10044 + }, + { + "epoch": 0.7247213303993363, + "grad_norm": 2.511241909107107, + "learning_rate": 7.434912131634872e-07, + "loss": 0.7988, + "step": 10045 + }, + { + "epoch": 0.7247934778687638, + "grad_norm": 3.1067940433186263, + "learning_rate": 7.431276378104239e-07, + "loss": 0.924, + "step": 10046 + }, + { + "epoch": 0.7248656253381912, + "grad_norm": 5.193872421419346, + "learning_rate": 7.427641310905059e-07, + "loss": 0.8793, + "step": 10047 + }, + { + "epoch": 0.7249377728076187, + "grad_norm": 2.5976612980155367, + "learning_rate": 7.424006930235862e-07, + "loss": 0.8205, + "step": 10048 + }, + { + "epoch": 0.7250099202770462, + "grad_norm": 2.3442208389367507, + "learning_rate": 7.420373236295083e-07, + "loss": 1.0184, + "step": 10049 + }, + { + "epoch": 0.7250820677464738, + "grad_norm": 2.8083259974835704, + "learning_rate": 7.416740229281153e-07, + "loss": 0.9221, + "step": 10050 + }, + { + "epoch": 0.7251542152159013, + "grad_norm": 3.0555284518186125, + "learning_rate": 7.41310790939246e-07, + "loss": 0.9619, + "step": 10051 + }, + { + "epoch": 0.7252263626853288, + "grad_norm": 3.688017857282853, + "learning_rate": 7.409476276827347e-07, + "loss": 0.9472, + "step": 10052 + }, + { + "epoch": 0.7252985101547563, + "grad_norm": 2.262684143328605, + "learning_rate": 7.405845331784126e-07, + "loss": 0.9765, + "step": 10053 + }, + { + "epoch": 0.7253706576241838, + "grad_norm": 2.753093453441966, + "learning_rate": 7.402215074461072e-07, + "loss": 0.8521, + "step": 10054 + }, + { + "epoch": 0.7254428050936114, + "grad_norm": 3.897487119368336, + "learning_rate": 7.39858550505642e-07, + "loss": 0.9209, + "step": 10055 + }, + { + "epoch": 0.7255149525630389, + "grad_norm": 3.5169165379369542, + "learning_rate": 7.394956623768361e-07, + "loss": 0.9125, + "step": 10056 + }, + { + "epoch": 0.7255871000324664, + "grad_norm": 3.077542012576329, + "learning_rate": 7.391328430795057e-07, + "loss": 0.9449, + "step": 10057 + }, + { + "epoch": 0.7256592475018939, + "grad_norm": 2.28960581506088, + "learning_rate": 7.387700926334633e-07, + "loss": 0.9013, + "step": 10058 + }, + { + "epoch": 0.7257313949713213, + "grad_norm": 1.729797095158123, + "learning_rate": 7.38407411058517e-07, + "loss": 0.905, + "step": 10059 + }, + { + "epoch": 0.7258035424407489, + "grad_norm": 2.3995745321479913, + "learning_rate": 7.380447983744719e-07, + "loss": 0.8766, + "step": 10060 + }, + { + "epoch": 0.7258756899101764, + "grad_norm": 2.8409208540239423, + "learning_rate": 7.376822546011287e-07, + "loss": 0.9374, + "step": 10061 + }, + { + "epoch": 0.7259478373796039, + "grad_norm": 2.337327932460716, + "learning_rate": 7.373197797582848e-07, + "loss": 0.9503, + "step": 10062 + }, + { + "epoch": 0.7260199848490314, + "grad_norm": 2.6641941429569216, + "learning_rate": 7.36957373865734e-07, + "loss": 0.9486, + "step": 10063 + }, + { + "epoch": 0.7260921323184589, + "grad_norm": 4.473101651198864, + "learning_rate": 7.365950369432647e-07, + "loss": 0.989, + "step": 10064 + }, + { + "epoch": 0.7261642797878864, + "grad_norm": 3.9662108220841317, + "learning_rate": 7.362327690106635e-07, + "loss": 0.7987, + "step": 10065 + }, + { + "epoch": 0.726236427257314, + "grad_norm": 2.2426783858403936, + "learning_rate": 7.358705700877124e-07, + "loss": 0.9425, + "step": 10066 + }, + { + "epoch": 0.7263085747267415, + "grad_norm": 2.6713409782486233, + "learning_rate": 7.3550844019419e-07, + "loss": 0.9867, + "step": 10067 + }, + { + "epoch": 0.726380722196169, + "grad_norm": 3.246821683385334, + "learning_rate": 7.351463793498704e-07, + "loss": 0.9705, + "step": 10068 + }, + { + "epoch": 0.7264528696655965, + "grad_norm": 3.048390798831794, + "learning_rate": 7.34784387574525e-07, + "loss": 0.878, + "step": 10069 + }, + { + "epoch": 0.726525017135024, + "grad_norm": 4.987273231647668, + "learning_rate": 7.344224648879211e-07, + "loss": 0.8987, + "step": 10070 + }, + { + "epoch": 0.7265971646044515, + "grad_norm": 4.971144189358382, + "learning_rate": 7.3406061130982e-07, + "loss": 0.9158, + "step": 10071 + }, + { + "epoch": 0.726669312073879, + "grad_norm": 3.2528972916148033, + "learning_rate": 7.33698826859984e-07, + "loss": 0.9574, + "step": 10072 + }, + { + "epoch": 0.7267414595433065, + "grad_norm": 3.8700671639146584, + "learning_rate": 7.333371115581665e-07, + "loss": 0.9484, + "step": 10073 + }, + { + "epoch": 0.726813607012734, + "grad_norm": 2.9675589330488727, + "learning_rate": 7.329754654241205e-07, + "loss": 0.9636, + "step": 10074 + }, + { + "epoch": 0.7268857544821615, + "grad_norm": 2.201835200664439, + "learning_rate": 7.326138884775939e-07, + "loss": 0.9215, + "step": 10075 + }, + { + "epoch": 0.726957901951589, + "grad_norm": 2.0664692456282627, + "learning_rate": 7.322523807383312e-07, + "loss": 0.8751, + "step": 10076 + }, + { + "epoch": 0.7270300494210166, + "grad_norm": 2.561092827713976, + "learning_rate": 7.318909422260729e-07, + "loss": 0.8964, + "step": 10077 + }, + { + "epoch": 0.7271021968904441, + "grad_norm": 3.46704788373873, + "learning_rate": 7.315295729605563e-07, + "loss": 0.8635, + "step": 10078 + }, + { + "epoch": 0.7271743443598716, + "grad_norm": 3.7331395360055972, + "learning_rate": 7.311682729615137e-07, + "loss": 0.9738, + "step": 10079 + }, + { + "epoch": 0.7272464918292991, + "grad_norm": 2.4913942994064078, + "learning_rate": 7.308070422486746e-07, + "loss": 0.895, + "step": 10080 + }, + { + "epoch": 0.7273186392987266, + "grad_norm": 3.585051117430183, + "learning_rate": 7.304458808417644e-07, + "loss": 0.8292, + "step": 10081 + }, + { + "epoch": 0.727390786768154, + "grad_norm": 0.7064023672164662, + "learning_rate": 7.30084788760505e-07, + "loss": 0.7854, + "step": 10082 + }, + { + "epoch": 0.7274629342375816, + "grad_norm": 2.0427947398781114, + "learning_rate": 7.297237660246142e-07, + "loss": 0.9125, + "step": 10083 + }, + { + "epoch": 0.7275350817070091, + "grad_norm": 3.097161771890478, + "learning_rate": 7.29362812653806e-07, + "loss": 1.0746, + "step": 10084 + }, + { + "epoch": 0.7276072291764366, + "grad_norm": 2.2620856690782665, + "learning_rate": 7.290019286677917e-07, + "loss": 0.887, + "step": 10085 + }, + { + "epoch": 0.7276793766458641, + "grad_norm": 3.430272636810958, + "learning_rate": 7.286411140862759e-07, + "loss": 0.8588, + "step": 10086 + }, + { + "epoch": 0.7277515241152916, + "grad_norm": 3.36330393430213, + "learning_rate": 7.282803689289634e-07, + "loss": 0.954, + "step": 10087 + }, + { + "epoch": 0.7278236715847192, + "grad_norm": 2.4623694719294833, + "learning_rate": 7.279196932155516e-07, + "loss": 1.0212, + "step": 10088 + }, + { + "epoch": 0.7278958190541467, + "grad_norm": 2.087980034736387, + "learning_rate": 7.275590869657364e-07, + "loss": 0.9387, + "step": 10089 + }, + { + "epoch": 0.7279679665235742, + "grad_norm": 4.012278071175348, + "learning_rate": 7.271985501992088e-07, + "loss": 0.9421, + "step": 10090 + }, + { + "epoch": 0.7280401139930017, + "grad_norm": 2.6510498306558405, + "learning_rate": 7.268380829356569e-07, + "loss": 0.9378, + "step": 10091 + }, + { + "epoch": 0.7281122614624292, + "grad_norm": 4.633060545175164, + "learning_rate": 7.26477685194764e-07, + "loss": 0.8592, + "step": 10092 + }, + { + "epoch": 0.7281844089318568, + "grad_norm": 2.5821523940945323, + "learning_rate": 7.261173569962105e-07, + "loss": 0.8986, + "step": 10093 + }, + { + "epoch": 0.7282565564012842, + "grad_norm": 0.8497575652573307, + "learning_rate": 7.257570983596731e-07, + "loss": 0.8405, + "step": 10094 + }, + { + "epoch": 0.7283287038707117, + "grad_norm": 2.761149864470627, + "learning_rate": 7.253969093048227e-07, + "loss": 0.9092, + "step": 10095 + }, + { + "epoch": 0.7284008513401392, + "grad_norm": 2.327484349936738, + "learning_rate": 7.250367898513286e-07, + "loss": 0.9436, + "step": 10096 + }, + { + "epoch": 0.7284729988095667, + "grad_norm": 2.4076297321798905, + "learning_rate": 7.246767400188556e-07, + "loss": 0.9167, + "step": 10097 + }, + { + "epoch": 0.7285451462789942, + "grad_norm": 2.069386516275163, + "learning_rate": 7.243167598270648e-07, + "loss": 0.9879, + "step": 10098 + }, + { + "epoch": 0.7286172937484218, + "grad_norm": 2.660339080308523, + "learning_rate": 7.239568492956134e-07, + "loss": 0.9095, + "step": 10099 + }, + { + "epoch": 0.7286894412178493, + "grad_norm": 2.4712741357823926, + "learning_rate": 7.235970084441554e-07, + "loss": 1.0371, + "step": 10100 + }, + { + "epoch": 0.7287615886872768, + "grad_norm": 3.7616868062311926, + "learning_rate": 7.232372372923381e-07, + "loss": 0.9112, + "step": 10101 + }, + { + "epoch": 0.7288337361567043, + "grad_norm": 3.449679904407233, + "learning_rate": 7.228775358598105e-07, + "loss": 0.999, + "step": 10102 + }, + { + "epoch": 0.7289058836261318, + "grad_norm": 2.908525785911903, + "learning_rate": 7.225179041662119e-07, + "loss": 0.9651, + "step": 10103 + }, + { + "epoch": 0.7289780310955594, + "grad_norm": 3.5004393438067756, + "learning_rate": 7.221583422311819e-07, + "loss": 0.8954, + "step": 10104 + }, + { + "epoch": 0.7290501785649869, + "grad_norm": 3.244134598645441, + "learning_rate": 7.217988500743541e-07, + "loss": 0.8867, + "step": 10105 + }, + { + "epoch": 0.7291223260344143, + "grad_norm": 2.4024030972725345, + "learning_rate": 7.214394277153595e-07, + "loss": 0.9031, + "step": 10106 + }, + { + "epoch": 0.7291944735038418, + "grad_norm": 2.714716407478812, + "learning_rate": 7.210800751738246e-07, + "loss": 0.8433, + "step": 10107 + }, + { + "epoch": 0.7292666209732693, + "grad_norm": 3.186081813894474, + "learning_rate": 7.207207924693726e-07, + "loss": 0.8764, + "step": 10108 + }, + { + "epoch": 0.7293387684426968, + "grad_norm": 2.5006837863395432, + "learning_rate": 7.20361579621623e-07, + "loss": 0.9552, + "step": 10109 + }, + { + "epoch": 0.7294109159121244, + "grad_norm": 3.937098043603162, + "learning_rate": 7.200024366501889e-07, + "loss": 0.7978, + "step": 10110 + }, + { + "epoch": 0.7294830633815519, + "grad_norm": 3.022350500304671, + "learning_rate": 7.196433635746848e-07, + "loss": 0.9253, + "step": 10111 + }, + { + "epoch": 0.7295552108509794, + "grad_norm": 2.8630209436266827, + "learning_rate": 7.192843604147163e-07, + "loss": 0.8943, + "step": 10112 + }, + { + "epoch": 0.7296273583204069, + "grad_norm": 2.1207721389106005, + "learning_rate": 7.189254271898879e-07, + "loss": 0.899, + "step": 10113 + }, + { + "epoch": 0.7296995057898344, + "grad_norm": 3.3315350609173517, + "learning_rate": 7.185665639197996e-07, + "loss": 0.952, + "step": 10114 + }, + { + "epoch": 0.729771653259262, + "grad_norm": 3.9141597164667083, + "learning_rate": 7.182077706240473e-07, + "loss": 0.9178, + "step": 10115 + }, + { + "epoch": 0.7298438007286895, + "grad_norm": 2.250654019563555, + "learning_rate": 7.178490473222236e-07, + "loss": 0.8641, + "step": 10116 + }, + { + "epoch": 0.729915948198117, + "grad_norm": 2.743742499200044, + "learning_rate": 7.174903940339178e-07, + "loss": 0.9077, + "step": 10117 + }, + { + "epoch": 0.7299880956675444, + "grad_norm": 2.8203337185132824, + "learning_rate": 7.171318107787135e-07, + "loss": 0.8432, + "step": 10118 + }, + { + "epoch": 0.7300602431369719, + "grad_norm": 2.2359909936249247, + "learning_rate": 7.167732975761915e-07, + "loss": 0.9316, + "step": 10119 + }, + { + "epoch": 0.7301323906063995, + "grad_norm": 2.312981632699042, + "learning_rate": 7.164148544459296e-07, + "loss": 0.853, + "step": 10120 + }, + { + "epoch": 0.730204538075827, + "grad_norm": 2.4294144946225478, + "learning_rate": 7.16056481407501e-07, + "loss": 1.0089, + "step": 10121 + }, + { + "epoch": 0.7302766855452545, + "grad_norm": 0.7806401733682998, + "learning_rate": 7.156981784804755e-07, + "loss": 0.7737, + "step": 10122 + }, + { + "epoch": 0.730348833014682, + "grad_norm": 3.1924035744678947, + "learning_rate": 7.153399456844167e-07, + "loss": 0.891, + "step": 10123 + }, + { + "epoch": 0.7304209804841095, + "grad_norm": 3.2349229431408535, + "learning_rate": 7.149817830388893e-07, + "loss": 0.8864, + "step": 10124 + }, + { + "epoch": 0.730493127953537, + "grad_norm": 3.4710643024384606, + "learning_rate": 7.146236905634484e-07, + "loss": 0.9343, + "step": 10125 + }, + { + "epoch": 0.7305652754229646, + "grad_norm": 2.4590368288004, + "learning_rate": 7.142656682776509e-07, + "loss": 0.9448, + "step": 10126 + }, + { + "epoch": 0.7306374228923921, + "grad_norm": 2.5516843730462506, + "learning_rate": 7.139077162010447e-07, + "loss": 0.8747, + "step": 10127 + }, + { + "epoch": 0.7307095703618196, + "grad_norm": 2.1854762934383225, + "learning_rate": 7.135498343531774e-07, + "loss": 0.9137, + "step": 10128 + }, + { + "epoch": 0.7307817178312471, + "grad_norm": 3.5673015427541355, + "learning_rate": 7.131920227535912e-07, + "loss": 0.9269, + "step": 10129 + }, + { + "epoch": 0.7308538653006745, + "grad_norm": 2.3402981579575814, + "learning_rate": 7.128342814218251e-07, + "loss": 0.7524, + "step": 10130 + }, + { + "epoch": 0.730926012770102, + "grad_norm": 2.6141211923233967, + "learning_rate": 7.12476610377414e-07, + "loss": 0.852, + "step": 10131 + }, + { + "epoch": 0.7309981602395296, + "grad_norm": 3.3918537778736977, + "learning_rate": 7.12119009639889e-07, + "loss": 1.0227, + "step": 10132 + }, + { + "epoch": 0.7310703077089571, + "grad_norm": 3.306215767061227, + "learning_rate": 7.117614792287778e-07, + "loss": 0.9482, + "step": 10133 + }, + { + "epoch": 0.7311424551783846, + "grad_norm": 3.0946646215006677, + "learning_rate": 7.114040191636028e-07, + "loss": 0.9326, + "step": 10134 + }, + { + "epoch": 0.7312146026478121, + "grad_norm": 2.4447982296058477, + "learning_rate": 7.110466294638842e-07, + "loss": 0.9353, + "step": 10135 + }, + { + "epoch": 0.7312867501172396, + "grad_norm": 2.8925491115189086, + "learning_rate": 7.106893101491373e-07, + "loss": 0.8655, + "step": 10136 + }, + { + "epoch": 0.7313588975866672, + "grad_norm": 2.11035088464531, + "learning_rate": 7.103320612388746e-07, + "loss": 1.0477, + "step": 10137 + }, + { + "epoch": 0.7314310450560947, + "grad_norm": 2.920965859040913, + "learning_rate": 7.099748827526037e-07, + "loss": 0.9648, + "step": 10138 + }, + { + "epoch": 0.7315031925255222, + "grad_norm": 2.159207071711241, + "learning_rate": 7.096177747098298e-07, + "loss": 0.9539, + "step": 10139 + }, + { + "epoch": 0.7315753399949497, + "grad_norm": 3.00287649216095, + "learning_rate": 7.092607371300507e-07, + "loss": 0.9392, + "step": 10140 + }, + { + "epoch": 0.7316474874643771, + "grad_norm": 3.519823068593662, + "learning_rate": 7.089037700327662e-07, + "loss": 0.9512, + "step": 10141 + }, + { + "epoch": 0.7317196349338047, + "grad_norm": 2.2675914780439634, + "learning_rate": 7.085468734374666e-07, + "loss": 0.8087, + "step": 10142 + }, + { + "epoch": 0.7317917824032322, + "grad_norm": 0.7838601569435811, + "learning_rate": 7.081900473636416e-07, + "loss": 0.7809, + "step": 10143 + }, + { + "epoch": 0.7318639298726597, + "grad_norm": 2.9142508356427443, + "learning_rate": 7.078332918307757e-07, + "loss": 1.0104, + "step": 10144 + }, + { + "epoch": 0.7319360773420872, + "grad_norm": 1.9715574924794188, + "learning_rate": 7.074766068583504e-07, + "loss": 0.9452, + "step": 10145 + }, + { + "epoch": 0.7320082248115147, + "grad_norm": 2.7288503584883905, + "learning_rate": 7.071199924658435e-07, + "loss": 0.9353, + "step": 10146 + }, + { + "epoch": 0.7320803722809422, + "grad_norm": 2.7639768182856135, + "learning_rate": 7.067634486727265e-07, + "loss": 0.9349, + "step": 10147 + }, + { + "epoch": 0.7321525197503698, + "grad_norm": 2.4931661663352855, + "learning_rate": 7.064069754984716e-07, + "loss": 0.9321, + "step": 10148 + }, + { + "epoch": 0.7322246672197973, + "grad_norm": 2.069412208436911, + "learning_rate": 7.060505729625414e-07, + "loss": 0.9175, + "step": 10149 + }, + { + "epoch": 0.7322968146892248, + "grad_norm": 2.081233975918688, + "learning_rate": 7.056942410844012e-07, + "loss": 0.8709, + "step": 10150 + }, + { + "epoch": 0.7323689621586523, + "grad_norm": 2.510824420996277, + "learning_rate": 7.053379798835059e-07, + "loss": 0.8239, + "step": 10151 + }, + { + "epoch": 0.7324411096280798, + "grad_norm": 2.6425845307514284, + "learning_rate": 7.04981789379311e-07, + "loss": 0.9067, + "step": 10152 + }, + { + "epoch": 0.7325132570975073, + "grad_norm": 2.3283436307722627, + "learning_rate": 7.046256695912669e-07, + "loss": 0.9118, + "step": 10153 + }, + { + "epoch": 0.7325854045669348, + "grad_norm": 2.685899035859103, + "learning_rate": 7.042696205388195e-07, + "loss": 0.929, + "step": 10154 + }, + { + "epoch": 0.7326575520363623, + "grad_norm": 0.6966077120908502, + "learning_rate": 7.039136422414114e-07, + "loss": 0.7653, + "step": 10155 + }, + { + "epoch": 0.7327296995057898, + "grad_norm": 3.0613123868143686, + "learning_rate": 7.035577347184821e-07, + "loss": 0.8729, + "step": 10156 + }, + { + "epoch": 0.7328018469752173, + "grad_norm": 2.594356052418633, + "learning_rate": 7.032018979894649e-07, + "loss": 0.7888, + "step": 10157 + }, + { + "epoch": 0.7328739944446448, + "grad_norm": 5.810577731010509, + "learning_rate": 7.028461320737915e-07, + "loss": 0.9552, + "step": 10158 + }, + { + "epoch": 0.7329461419140724, + "grad_norm": 3.097612070391522, + "learning_rate": 7.024904369908891e-07, + "loss": 0.9064, + "step": 10159 + }, + { + "epoch": 0.7330182893834999, + "grad_norm": 1.8893714090578793, + "learning_rate": 7.021348127601805e-07, + "loss": 0.8638, + "step": 10160 + }, + { + "epoch": 0.7330904368529274, + "grad_norm": 2.2247100083897324, + "learning_rate": 7.017792594010858e-07, + "loss": 0.9031, + "step": 10161 + }, + { + "epoch": 0.7331625843223549, + "grad_norm": 2.152425719429391, + "learning_rate": 7.014237769330189e-07, + "loss": 0.9328, + "step": 10162 + }, + { + "epoch": 0.7332347317917824, + "grad_norm": 5.078063964476947, + "learning_rate": 7.010683653753935e-07, + "loss": 0.9387, + "step": 10163 + }, + { + "epoch": 0.73330687926121, + "grad_norm": 0.7200976467145501, + "learning_rate": 7.007130247476149e-07, + "loss": 0.7703, + "step": 10164 + }, + { + "epoch": 0.7333790267306374, + "grad_norm": 2.663530134330989, + "learning_rate": 7.003577550690896e-07, + "loss": 0.9278, + "step": 10165 + }, + { + "epoch": 0.7334511742000649, + "grad_norm": 2.9771223367779314, + "learning_rate": 7.000025563592154e-07, + "loss": 0.9048, + "step": 10166 + }, + { + "epoch": 0.7335233216694924, + "grad_norm": 0.6589323809089375, + "learning_rate": 6.996474286373893e-07, + "loss": 0.7823, + "step": 10167 + }, + { + "epoch": 0.7335954691389199, + "grad_norm": 3.6299269506497267, + "learning_rate": 6.992923719230033e-07, + "loss": 0.8362, + "step": 10168 + }, + { + "epoch": 0.7336676166083475, + "grad_norm": 2.6942274480355306, + "learning_rate": 6.989373862354457e-07, + "loss": 0.9007, + "step": 10169 + }, + { + "epoch": 0.733739764077775, + "grad_norm": 4.037600933502192, + "learning_rate": 6.98582471594102e-07, + "loss": 0.8262, + "step": 10170 + }, + { + "epoch": 0.7338119115472025, + "grad_norm": 3.77252957180155, + "learning_rate": 6.982276280183503e-07, + "loss": 0.8762, + "step": 10171 + }, + { + "epoch": 0.73388405901663, + "grad_norm": 2.801157991191102, + "learning_rate": 6.978728555275704e-07, + "loss": 0.9349, + "step": 10172 + }, + { + "epoch": 0.7339562064860575, + "grad_norm": 2.6373152898679724, + "learning_rate": 6.975181541411328e-07, + "loss": 0.9599, + "step": 10173 + }, + { + "epoch": 0.734028353955485, + "grad_norm": 3.6644415750820674, + "learning_rate": 6.97163523878407e-07, + "loss": 0.9798, + "step": 10174 + }, + { + "epoch": 0.7341005014249126, + "grad_norm": 0.9044859258063898, + "learning_rate": 6.968089647587583e-07, + "loss": 0.9231, + "step": 10175 + }, + { + "epoch": 0.7341726488943401, + "grad_norm": 3.998617887136031, + "learning_rate": 6.964544768015479e-07, + "loss": 1.0178, + "step": 10176 + }, + { + "epoch": 0.7342447963637675, + "grad_norm": 2.023439620911654, + "learning_rate": 6.961000600261329e-07, + "loss": 0.9216, + "step": 10177 + }, + { + "epoch": 0.734316943833195, + "grad_norm": 3.12090017317603, + "learning_rate": 6.957457144518675e-07, + "loss": 0.947, + "step": 10178 + }, + { + "epoch": 0.7343890913026225, + "grad_norm": 3.3661909642303445, + "learning_rate": 6.953914400980989e-07, + "loss": 0.9291, + "step": 10179 + }, + { + "epoch": 0.73446123877205, + "grad_norm": 2.8735392632856165, + "learning_rate": 6.950372369841757e-07, + "loss": 0.9055, + "step": 10180 + }, + { + "epoch": 0.7345333862414776, + "grad_norm": 2.0670671469786384, + "learning_rate": 6.946831051294376e-07, + "loss": 0.9938, + "step": 10181 + }, + { + "epoch": 0.7346055337109051, + "grad_norm": 2.7661604713124937, + "learning_rate": 6.943290445532228e-07, + "loss": 0.9152, + "step": 10182 + }, + { + "epoch": 0.7346776811803326, + "grad_norm": 2.510792705394145, + "learning_rate": 6.939750552748658e-07, + "loss": 0.8669, + "step": 10183 + }, + { + "epoch": 0.7347498286497601, + "grad_norm": 8.371785016517862, + "learning_rate": 6.936211373136959e-07, + "loss": 0.9063, + "step": 10184 + }, + { + "epoch": 0.7348219761191876, + "grad_norm": 6.283241373834467, + "learning_rate": 6.932672906890407e-07, + "loss": 0.9599, + "step": 10185 + }, + { + "epoch": 0.7348941235886152, + "grad_norm": 2.200689064833309, + "learning_rate": 6.929135154202197e-07, + "loss": 0.8644, + "step": 10186 + }, + { + "epoch": 0.7349662710580427, + "grad_norm": 5.404639737441185, + "learning_rate": 6.925598115265545e-07, + "loss": 0.8226, + "step": 10187 + }, + { + "epoch": 0.7350384185274701, + "grad_norm": 3.928450342269176, + "learning_rate": 6.922061790273566e-07, + "loss": 0.9048, + "step": 10188 + }, + { + "epoch": 0.7351105659968976, + "grad_norm": 3.44472341193972, + "learning_rate": 6.918526179419395e-07, + "loss": 0.8891, + "step": 10189 + }, + { + "epoch": 0.7351827134663251, + "grad_norm": 3.7531466951148165, + "learning_rate": 6.914991282896074e-07, + "loss": 0.868, + "step": 10190 + }, + { + "epoch": 0.7352548609357527, + "grad_norm": 6.396648375760171, + "learning_rate": 6.911457100896641e-07, + "loss": 1.008, + "step": 10191 + }, + { + "epoch": 0.7353270084051802, + "grad_norm": 3.6164788691081227, + "learning_rate": 6.907923633614084e-07, + "loss": 0.9552, + "step": 10192 + }, + { + "epoch": 0.7353991558746077, + "grad_norm": 2.9526722071441993, + "learning_rate": 6.904390881241351e-07, + "loss": 0.9683, + "step": 10193 + }, + { + "epoch": 0.7354713033440352, + "grad_norm": 5.927155166306895, + "learning_rate": 6.900858843971362e-07, + "loss": 0.9787, + "step": 10194 + }, + { + "epoch": 0.7355434508134627, + "grad_norm": 3.5438499154027943, + "learning_rate": 6.897327521996972e-07, + "loss": 0.9796, + "step": 10195 + }, + { + "epoch": 0.7356155982828902, + "grad_norm": 6.801410573624829, + "learning_rate": 6.893796915511023e-07, + "loss": 0.9295, + "step": 10196 + }, + { + "epoch": 0.7356877457523178, + "grad_norm": 1.8918765630800654, + "learning_rate": 6.890267024706306e-07, + "loss": 0.8591, + "step": 10197 + }, + { + "epoch": 0.7357598932217453, + "grad_norm": 2.8548117324030002, + "learning_rate": 6.886737849775577e-07, + "loss": 0.9528, + "step": 10198 + }, + { + "epoch": 0.7358320406911728, + "grad_norm": 3.1040831347668423, + "learning_rate": 6.883209390911548e-07, + "loss": 0.9439, + "step": 10199 + }, + { + "epoch": 0.7359041881606002, + "grad_norm": 6.337394349310438, + "learning_rate": 6.879681648306908e-07, + "loss": 0.9382, + "step": 10200 + }, + { + "epoch": 0.7359763356300277, + "grad_norm": 2.5662236025307763, + "learning_rate": 6.876154622154267e-07, + "loss": 0.8734, + "step": 10201 + }, + { + "epoch": 0.7360484830994553, + "grad_norm": 7.530452701896999, + "learning_rate": 6.872628312646254e-07, + "loss": 1.0059, + "step": 10202 + }, + { + "epoch": 0.7361206305688828, + "grad_norm": 3.3537756808474604, + "learning_rate": 6.869102719975399e-07, + "loss": 0.9751, + "step": 10203 + }, + { + "epoch": 0.7361927780383103, + "grad_norm": 6.073112087693384, + "learning_rate": 6.865577844334249e-07, + "loss": 0.9151, + "step": 10204 + }, + { + "epoch": 0.7362649255077378, + "grad_norm": 3.473920525493329, + "learning_rate": 6.862053685915266e-07, + "loss": 0.8976, + "step": 10205 + }, + { + "epoch": 0.7363370729771653, + "grad_norm": 0.7693689630242135, + "learning_rate": 6.858530244910894e-07, + "loss": 0.7731, + "step": 10206 + }, + { + "epoch": 0.7364092204465928, + "grad_norm": 2.6372723485679748, + "learning_rate": 6.855007521513538e-07, + "loss": 0.8642, + "step": 10207 + }, + { + "epoch": 0.7364813679160204, + "grad_norm": 2.949954798723899, + "learning_rate": 6.85148551591556e-07, + "loss": 1.0643, + "step": 10208 + }, + { + "epoch": 0.7365535153854479, + "grad_norm": 3.874752528994556, + "learning_rate": 6.847964228309291e-07, + "loss": 0.8523, + "step": 10209 + }, + { + "epoch": 0.7366256628548754, + "grad_norm": 2.2985646786438663, + "learning_rate": 6.844443658886994e-07, + "loss": 0.9029, + "step": 10210 + }, + { + "epoch": 0.7366978103243029, + "grad_norm": 3.241887163706064, + "learning_rate": 6.840923807840944e-07, + "loss": 0.8733, + "step": 10211 + }, + { + "epoch": 0.7367699577937303, + "grad_norm": 2.630707756946734, + "learning_rate": 6.837404675363323e-07, + "loss": 0.9765, + "step": 10212 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 13.402276008098983, + "learning_rate": 6.833886261646306e-07, + "loss": 0.9558, + "step": 10213 + }, + { + "epoch": 0.7369142527325854, + "grad_norm": 2.9632253539498166, + "learning_rate": 6.830368566882021e-07, + "loss": 0.951, + "step": 10214 + }, + { + "epoch": 0.7369864002020129, + "grad_norm": 2.4182536954099274, + "learning_rate": 6.826851591262559e-07, + "loss": 0.9024, + "step": 10215 + }, + { + "epoch": 0.7370585476714404, + "grad_norm": 4.152151970009728, + "learning_rate": 6.823335334979963e-07, + "loss": 0.84, + "step": 10216 + }, + { + "epoch": 0.7371306951408679, + "grad_norm": 2.9699132949040865, + "learning_rate": 6.819819798226254e-07, + "loss": 0.9461, + "step": 10217 + }, + { + "epoch": 0.7372028426102955, + "grad_norm": 4.754582453384343, + "learning_rate": 6.816304981193381e-07, + "loss": 1.0292, + "step": 10218 + }, + { + "epoch": 0.737274990079723, + "grad_norm": 2.4310274507705887, + "learning_rate": 6.812790884073303e-07, + "loss": 0.8845, + "step": 10219 + }, + { + "epoch": 0.7373471375491505, + "grad_norm": 2.3326377399493863, + "learning_rate": 6.809277507057889e-07, + "loss": 0.9089, + "step": 10220 + }, + { + "epoch": 0.737419285018578, + "grad_norm": 1.9353084475097408, + "learning_rate": 6.805764850339002e-07, + "loss": 0.8657, + "step": 10221 + }, + { + "epoch": 0.7374914324880055, + "grad_norm": 1.893666492799011, + "learning_rate": 6.802252914108453e-07, + "loss": 0.9378, + "step": 10222 + }, + { + "epoch": 0.737563579957433, + "grad_norm": 4.870318977515644, + "learning_rate": 6.798741698558015e-07, + "loss": 0.866, + "step": 10223 + }, + { + "epoch": 0.7376357274268605, + "grad_norm": 2.9497301075207467, + "learning_rate": 6.795231203879428e-07, + "loss": 0.9487, + "step": 10224 + }, + { + "epoch": 0.737707874896288, + "grad_norm": 2.126627915586391, + "learning_rate": 6.791721430264372e-07, + "loss": 0.9181, + "step": 10225 + }, + { + "epoch": 0.7377800223657155, + "grad_norm": 13.231755201148674, + "learning_rate": 6.788212377904528e-07, + "loss": 0.8764, + "step": 10226 + }, + { + "epoch": 0.737852169835143, + "grad_norm": 2.6615831578514406, + "learning_rate": 6.784704046991488e-07, + "loss": 0.926, + "step": 10227 + }, + { + "epoch": 0.7379243173045705, + "grad_norm": 6.6005211364301335, + "learning_rate": 6.781196437716839e-07, + "loss": 0.9154, + "step": 10228 + }, + { + "epoch": 0.737996464773998, + "grad_norm": 2.081375104563619, + "learning_rate": 6.777689550272119e-07, + "loss": 0.7153, + "step": 10229 + }, + { + "epoch": 0.7380686122434256, + "grad_norm": 22.40728374315133, + "learning_rate": 6.774183384848826e-07, + "loss": 0.8771, + "step": 10230 + }, + { + "epoch": 0.7381407597128531, + "grad_norm": 4.40852870592357, + "learning_rate": 6.770677941638416e-07, + "loss": 0.9832, + "step": 10231 + }, + { + "epoch": 0.7382129071822806, + "grad_norm": 2.2834975784519456, + "learning_rate": 6.767173220832312e-07, + "loss": 0.8848, + "step": 10232 + }, + { + "epoch": 0.7382850546517081, + "grad_norm": 1.6435270528119046, + "learning_rate": 6.763669222621899e-07, + "loss": 0.945, + "step": 10233 + }, + { + "epoch": 0.7383572021211356, + "grad_norm": 2.8674693787593544, + "learning_rate": 6.760165947198501e-07, + "loss": 0.8087, + "step": 10234 + }, + { + "epoch": 0.7384293495905631, + "grad_norm": 1.7961576273719222, + "learning_rate": 6.756663394753433e-07, + "loss": 0.95, + "step": 10235 + }, + { + "epoch": 0.7385014970599906, + "grad_norm": 0.643337716790157, + "learning_rate": 6.753161565477949e-07, + "loss": 0.7544, + "step": 10236 + }, + { + "epoch": 0.7385736445294181, + "grad_norm": 2.758313617078804, + "learning_rate": 6.749660459563276e-07, + "loss": 0.9017, + "step": 10237 + }, + { + "epoch": 0.7386457919988456, + "grad_norm": 1.737430115536182, + "learning_rate": 6.746160077200591e-07, + "loss": 0.9677, + "step": 10238 + }, + { + "epoch": 0.7387179394682731, + "grad_norm": 17.25098803358131, + "learning_rate": 6.742660418581047e-07, + "loss": 0.8832, + "step": 10239 + }, + { + "epoch": 0.7387900869377007, + "grad_norm": 2.242998567952325, + "learning_rate": 6.73916148389573e-07, + "loss": 0.8922, + "step": 10240 + }, + { + "epoch": 0.7388622344071282, + "grad_norm": 0.7262399839541133, + "learning_rate": 6.735663273335725e-07, + "loss": 0.7845, + "step": 10241 + }, + { + "epoch": 0.7389343818765557, + "grad_norm": 3.049193453848762, + "learning_rate": 6.732165787092035e-07, + "loss": 0.9163, + "step": 10242 + }, + { + "epoch": 0.7390065293459832, + "grad_norm": 3.6078026719783445, + "learning_rate": 6.728669025355669e-07, + "loss": 0.8172, + "step": 10243 + }, + { + "epoch": 0.7390786768154107, + "grad_norm": 3.2427098600288193, + "learning_rate": 6.725172988317549e-07, + "loss": 0.9616, + "step": 10244 + }, + { + "epoch": 0.7391508242848382, + "grad_norm": 1.909414634979987, + "learning_rate": 6.721677676168593e-07, + "loss": 0.9678, + "step": 10245 + }, + { + "epoch": 0.7392229717542658, + "grad_norm": 2.98316859282382, + "learning_rate": 6.718183089099662e-07, + "loss": 0.9353, + "step": 10246 + }, + { + "epoch": 0.7392951192236932, + "grad_norm": 2.83028047310819, + "learning_rate": 6.714689227301589e-07, + "loss": 0.7554, + "step": 10247 + }, + { + "epoch": 0.7393672666931207, + "grad_norm": 2.787113652162538, + "learning_rate": 6.711196090965159e-07, + "loss": 0.8454, + "step": 10248 + }, + { + "epoch": 0.7394394141625482, + "grad_norm": 2.42137455845845, + "learning_rate": 6.707703680281114e-07, + "loss": 0.8851, + "step": 10249 + }, + { + "epoch": 0.7395115616319757, + "grad_norm": 2.5808720605260453, + "learning_rate": 6.704211995440161e-07, + "loss": 0.9549, + "step": 10250 + }, + { + "epoch": 0.7395837091014033, + "grad_norm": 3.6047198508687286, + "learning_rate": 6.700721036632975e-07, + "loss": 0.8714, + "step": 10251 + }, + { + "epoch": 0.7396558565708308, + "grad_norm": 2.9727435105458033, + "learning_rate": 6.697230804050178e-07, + "loss": 0.832, + "step": 10252 + }, + { + "epoch": 0.7397280040402583, + "grad_norm": 1.982083777549982, + "learning_rate": 6.693741297882364e-07, + "loss": 0.9425, + "step": 10253 + }, + { + "epoch": 0.7398001515096858, + "grad_norm": 2.3052685021432704, + "learning_rate": 6.690252518320078e-07, + "loss": 0.9496, + "step": 10254 + }, + { + "epoch": 0.7398722989791133, + "grad_norm": 3.276464405692517, + "learning_rate": 6.686764465553831e-07, + "loss": 0.9107, + "step": 10255 + }, + { + "epoch": 0.7399444464485408, + "grad_norm": 2.7302219786891837, + "learning_rate": 6.683277139774101e-07, + "loss": 0.9542, + "step": 10256 + }, + { + "epoch": 0.7400165939179684, + "grad_norm": 2.999870615394345, + "learning_rate": 6.679790541171302e-07, + "loss": 0.9361, + "step": 10257 + }, + { + "epoch": 0.7400887413873959, + "grad_norm": 3.0246648933865643, + "learning_rate": 6.67630466993583e-07, + "loss": 0.9236, + "step": 10258 + }, + { + "epoch": 0.7401608888568233, + "grad_norm": 2.6527285402088037, + "learning_rate": 6.67281952625804e-07, + "loss": 0.8136, + "step": 10259 + }, + { + "epoch": 0.7402330363262508, + "grad_norm": 1.9893957585561945, + "learning_rate": 6.66933511032824e-07, + "loss": 0.8675, + "step": 10260 + }, + { + "epoch": 0.7403051837956783, + "grad_norm": 3.336003251510975, + "learning_rate": 6.665851422336699e-07, + "loss": 0.935, + "step": 10261 + }, + { + "epoch": 0.7403773312651059, + "grad_norm": 2.901245566598512, + "learning_rate": 6.662368462473653e-07, + "loss": 0.8921, + "step": 10262 + }, + { + "epoch": 0.7404494787345334, + "grad_norm": 2.405418153127236, + "learning_rate": 6.658886230929298e-07, + "loss": 0.8548, + "step": 10263 + }, + { + "epoch": 0.7405216262039609, + "grad_norm": 5.135053682427394, + "learning_rate": 6.655404727893763e-07, + "loss": 0.936, + "step": 10264 + }, + { + "epoch": 0.7405937736733884, + "grad_norm": 4.1810551482502545, + "learning_rate": 6.65192395355719e-07, + "loss": 0.8754, + "step": 10265 + }, + { + "epoch": 0.7406659211428159, + "grad_norm": 3.524754314112335, + "learning_rate": 6.648443908109631e-07, + "loss": 0.9047, + "step": 10266 + }, + { + "epoch": 0.7407380686122434, + "grad_norm": 2.4572107586080683, + "learning_rate": 6.644964591741122e-07, + "loss": 0.9198, + "step": 10267 + }, + { + "epoch": 0.740810216081671, + "grad_norm": 2.10584783591603, + "learning_rate": 6.641486004641661e-07, + "loss": 0.8736, + "step": 10268 + }, + { + "epoch": 0.7408823635510985, + "grad_norm": 2.264236024429103, + "learning_rate": 6.638008147001197e-07, + "loss": 0.8818, + "step": 10269 + }, + { + "epoch": 0.740954511020526, + "grad_norm": 2.181900924139598, + "learning_rate": 6.634531019009644e-07, + "loss": 0.8897, + "step": 10270 + }, + { + "epoch": 0.7410266584899534, + "grad_norm": 2.204099615431683, + "learning_rate": 6.631054620856879e-07, + "loss": 0.9326, + "step": 10271 + }, + { + "epoch": 0.7410988059593809, + "grad_norm": 0.70582538854353, + "learning_rate": 6.627578952732726e-07, + "loss": 0.7603, + "step": 10272 + }, + { + "epoch": 0.7411709534288085, + "grad_norm": 2.3306958641926547, + "learning_rate": 6.624104014826986e-07, + "loss": 0.959, + "step": 10273 + }, + { + "epoch": 0.741243100898236, + "grad_norm": 2.562824972992421, + "learning_rate": 6.620629807329405e-07, + "loss": 0.9587, + "step": 10274 + }, + { + "epoch": 0.7413152483676635, + "grad_norm": 0.7213991107550234, + "learning_rate": 6.617156330429705e-07, + "loss": 0.8021, + "step": 10275 + }, + { + "epoch": 0.741387395837091, + "grad_norm": 0.6961643701216677, + "learning_rate": 6.613683584317556e-07, + "loss": 0.8195, + "step": 10276 + }, + { + "epoch": 0.7414595433065185, + "grad_norm": 0.7776471796593263, + "learning_rate": 6.610211569182591e-07, + "loss": 0.7965, + "step": 10277 + }, + { + "epoch": 0.741531690775946, + "grad_norm": 1.991992716523149, + "learning_rate": 6.606740285214412e-07, + "loss": 0.9257, + "step": 10278 + }, + { + "epoch": 0.7416038382453736, + "grad_norm": 6.696240284043755, + "learning_rate": 6.60326973260255e-07, + "loss": 0.9117, + "step": 10279 + }, + { + "epoch": 0.7416759857148011, + "grad_norm": 1.9533194483284135, + "learning_rate": 6.599799911536552e-07, + "loss": 0.9481, + "step": 10280 + }, + { + "epoch": 0.7417481331842286, + "grad_norm": 0.6691472549243641, + "learning_rate": 6.596330822205867e-07, + "loss": 0.8011, + "step": 10281 + }, + { + "epoch": 0.7418202806536561, + "grad_norm": 0.6925957786689731, + "learning_rate": 6.592862464799935e-07, + "loss": 0.799, + "step": 10282 + }, + { + "epoch": 0.7418924281230835, + "grad_norm": 2.3728876256686733, + "learning_rate": 6.589394839508156e-07, + "loss": 0.9678, + "step": 10283 + }, + { + "epoch": 0.741964575592511, + "grad_norm": 2.992248853248684, + "learning_rate": 6.585927946519878e-07, + "loss": 0.9074, + "step": 10284 + }, + { + "epoch": 0.7420367230619386, + "grad_norm": 2.3475808942382956, + "learning_rate": 6.582461786024418e-07, + "loss": 0.8644, + "step": 10285 + }, + { + "epoch": 0.7421088705313661, + "grad_norm": 2.0774409953895083, + "learning_rate": 6.578996358211051e-07, + "loss": 0.9559, + "step": 10286 + }, + { + "epoch": 0.7421810180007936, + "grad_norm": 2.128633758184614, + "learning_rate": 6.575531663269016e-07, + "loss": 0.8697, + "step": 10287 + }, + { + "epoch": 0.7422531654702211, + "grad_norm": 2.4663787269929527, + "learning_rate": 6.572067701387493e-07, + "loss": 0.9401, + "step": 10288 + }, + { + "epoch": 0.7423253129396487, + "grad_norm": 2.861204468851054, + "learning_rate": 6.568604472755646e-07, + "loss": 0.9147, + "step": 10289 + }, + { + "epoch": 0.7423974604090762, + "grad_norm": 3.661243387192224, + "learning_rate": 6.565141977562587e-07, + "loss": 0.904, + "step": 10290 + }, + { + "epoch": 0.7424696078785037, + "grad_norm": 0.7542049231406741, + "learning_rate": 6.561680215997391e-07, + "loss": 0.8258, + "step": 10291 + }, + { + "epoch": 0.7425417553479312, + "grad_norm": 3.3197749162691648, + "learning_rate": 6.55821918824909e-07, + "loss": 0.8563, + "step": 10292 + }, + { + "epoch": 0.7426139028173587, + "grad_norm": 2.847256353737135, + "learning_rate": 6.554758894506689e-07, + "loss": 0.9198, + "step": 10293 + }, + { + "epoch": 0.7426860502867861, + "grad_norm": 2.870834650298667, + "learning_rate": 6.551299334959115e-07, + "loss": 0.9465, + "step": 10294 + }, + { + "epoch": 0.7427581977562137, + "grad_norm": 3.7904844152635633, + "learning_rate": 6.547840509795315e-07, + "loss": 0.8727, + "step": 10295 + }, + { + "epoch": 0.7428303452256412, + "grad_norm": 1.9267426541907213, + "learning_rate": 6.54438241920414e-07, + "loss": 0.9932, + "step": 10296 + }, + { + "epoch": 0.7429024926950687, + "grad_norm": 2.1401796643428104, + "learning_rate": 6.540925063374427e-07, + "loss": 0.9164, + "step": 10297 + }, + { + "epoch": 0.7429746401644962, + "grad_norm": 2.688406503071445, + "learning_rate": 6.537468442494976e-07, + "loss": 0.8628, + "step": 10298 + }, + { + "epoch": 0.7430467876339237, + "grad_norm": 3.1345722840007197, + "learning_rate": 6.534012556754536e-07, + "loss": 0.9724, + "step": 10299 + }, + { + "epoch": 0.7431189351033513, + "grad_norm": 4.683663387592183, + "learning_rate": 6.530557406341819e-07, + "loss": 0.9563, + "step": 10300 + }, + { + "epoch": 0.7431910825727788, + "grad_norm": 2.1673186006063863, + "learning_rate": 6.527102991445501e-07, + "loss": 0.9269, + "step": 10301 + }, + { + "epoch": 0.7432632300422063, + "grad_norm": 2.177901982584581, + "learning_rate": 6.52364931225422e-07, + "loss": 0.8905, + "step": 10302 + }, + { + "epoch": 0.7433353775116338, + "grad_norm": 2.738748768989429, + "learning_rate": 6.52019636895655e-07, + "loss": 0.8906, + "step": 10303 + }, + { + "epoch": 0.7434075249810613, + "grad_norm": 2.0941510172313724, + "learning_rate": 6.51674416174107e-07, + "loss": 0.9206, + "step": 10304 + }, + { + "epoch": 0.7434796724504888, + "grad_norm": 3.634463486779164, + "learning_rate": 6.513292690796268e-07, + "loss": 0.8996, + "step": 10305 + }, + { + "epoch": 0.7435518199199163, + "grad_norm": 2.8055398225080874, + "learning_rate": 6.509841956310629e-07, + "loss": 0.8068, + "step": 10306 + }, + { + "epoch": 0.7436239673893438, + "grad_norm": 14.238625788869822, + "learning_rate": 6.506391958472581e-07, + "loss": 0.7912, + "step": 10307 + }, + { + "epoch": 0.7436961148587713, + "grad_norm": 3.0938167372161853, + "learning_rate": 6.502942697470517e-07, + "loss": 0.7859, + "step": 10308 + }, + { + "epoch": 0.7437682623281988, + "grad_norm": 2.82929201443002, + "learning_rate": 6.499494173492788e-07, + "loss": 0.8941, + "step": 10309 + }, + { + "epoch": 0.7438404097976263, + "grad_norm": 4.733353608495423, + "learning_rate": 6.496046386727709e-07, + "loss": 0.7798, + "step": 10310 + }, + { + "epoch": 0.7439125572670539, + "grad_norm": 2.5321185658673464, + "learning_rate": 6.492599337363542e-07, + "loss": 0.9872, + "step": 10311 + }, + { + "epoch": 0.7439847047364814, + "grad_norm": 1.7974617704907474, + "learning_rate": 6.489153025588525e-07, + "loss": 0.9121, + "step": 10312 + }, + { + "epoch": 0.7440568522059089, + "grad_norm": 4.124507990307374, + "learning_rate": 6.485707451590843e-07, + "loss": 0.8544, + "step": 10313 + }, + { + "epoch": 0.7441289996753364, + "grad_norm": 5.684471581693934, + "learning_rate": 6.482262615558649e-07, + "loss": 0.8499, + "step": 10314 + }, + { + "epoch": 0.7442011471447639, + "grad_norm": 2.2426383067064184, + "learning_rate": 6.478818517680054e-07, + "loss": 0.9417, + "step": 10315 + }, + { + "epoch": 0.7442732946141914, + "grad_norm": 1.791666126990422, + "learning_rate": 6.475375158143124e-07, + "loss": 0.897, + "step": 10316 + }, + { + "epoch": 0.744345442083619, + "grad_norm": 4.002345589513168, + "learning_rate": 6.471932537135898e-07, + "loss": 0.7921, + "step": 10317 + }, + { + "epoch": 0.7444175895530464, + "grad_norm": 2.5647175195721252, + "learning_rate": 6.468490654846343e-07, + "loss": 0.8397, + "step": 10318 + }, + { + "epoch": 0.7444897370224739, + "grad_norm": 2.042831620524725, + "learning_rate": 6.465049511462437e-07, + "loss": 1.0155, + "step": 10319 + }, + { + "epoch": 0.7445618844919014, + "grad_norm": 0.7840624349874604, + "learning_rate": 6.461609107172063e-07, + "loss": 0.7981, + "step": 10320 + }, + { + "epoch": 0.7446340319613289, + "grad_norm": 2.78498914733635, + "learning_rate": 6.4581694421631e-07, + "loss": 0.9503, + "step": 10321 + }, + { + "epoch": 0.7447061794307565, + "grad_norm": 3.0412290249566487, + "learning_rate": 6.454730516623373e-07, + "loss": 0.8867, + "step": 10322 + }, + { + "epoch": 0.744778326900184, + "grad_norm": 2.116108637063556, + "learning_rate": 6.451292330740669e-07, + "loss": 0.799, + "step": 10323 + }, + { + "epoch": 0.7448504743696115, + "grad_norm": 2.2930276008605097, + "learning_rate": 6.447854884702737e-07, + "loss": 0.8766, + "step": 10324 + }, + { + "epoch": 0.744922621839039, + "grad_norm": 2.145980595347192, + "learning_rate": 6.444418178697279e-07, + "loss": 0.9711, + "step": 10325 + }, + { + "epoch": 0.7449947693084665, + "grad_norm": 3.711161845603435, + "learning_rate": 6.440982212911971e-07, + "loss": 0.9721, + "step": 10326 + }, + { + "epoch": 0.745066916777894, + "grad_norm": 3.0655424614390423, + "learning_rate": 6.437546987534426e-07, + "loss": 1.0606, + "step": 10327 + }, + { + "epoch": 0.7451390642473216, + "grad_norm": 0.7537870955314302, + "learning_rate": 6.434112502752231e-07, + "loss": 0.8318, + "step": 10328 + }, + { + "epoch": 0.7452112117167491, + "grad_norm": 3.0634160131587964, + "learning_rate": 6.430678758752934e-07, + "loss": 0.8595, + "step": 10329 + }, + { + "epoch": 0.7452833591861765, + "grad_norm": 2.8564436431619056, + "learning_rate": 6.427245755724038e-07, + "loss": 1.073, + "step": 10330 + }, + { + "epoch": 0.745355506655604, + "grad_norm": 3.0632457214489968, + "learning_rate": 6.423813493853006e-07, + "loss": 0.9873, + "step": 10331 + }, + { + "epoch": 0.7454276541250315, + "grad_norm": 4.02214002706754, + "learning_rate": 6.42038197332727e-07, + "loss": 0.9363, + "step": 10332 + }, + { + "epoch": 0.745499801594459, + "grad_norm": 2.1663145610612093, + "learning_rate": 6.416951194334188e-07, + "loss": 0.9048, + "step": 10333 + }, + { + "epoch": 0.7455719490638866, + "grad_norm": 4.144292855125343, + "learning_rate": 6.413521157061135e-07, + "loss": 0.8865, + "step": 10334 + }, + { + "epoch": 0.7456440965333141, + "grad_norm": 2.333326839256332, + "learning_rate": 6.410091861695388e-07, + "loss": 0.8431, + "step": 10335 + }, + { + "epoch": 0.7457162440027416, + "grad_norm": 2.650901525924055, + "learning_rate": 6.406663308424216e-07, + "loss": 0.9704, + "step": 10336 + }, + { + "epoch": 0.7457883914721691, + "grad_norm": 2.7473304101949556, + "learning_rate": 6.40323549743484e-07, + "loss": 0.9224, + "step": 10337 + }, + { + "epoch": 0.7458605389415967, + "grad_norm": 2.558485748108186, + "learning_rate": 6.399808428914441e-07, + "loss": 0.8853, + "step": 10338 + }, + { + "epoch": 0.7459326864110242, + "grad_norm": 2.7280883891477354, + "learning_rate": 6.396382103050157e-07, + "loss": 0.9037, + "step": 10339 + }, + { + "epoch": 0.7460048338804517, + "grad_norm": 2.707985185596734, + "learning_rate": 6.392956520029089e-07, + "loss": 0.9087, + "step": 10340 + }, + { + "epoch": 0.7460769813498791, + "grad_norm": 2.6076617069885586, + "learning_rate": 6.389531680038299e-07, + "loss": 0.9494, + "step": 10341 + }, + { + "epoch": 0.7461491288193066, + "grad_norm": 3.1413535416551626, + "learning_rate": 6.386107583264785e-07, + "loss": 0.8576, + "step": 10342 + }, + { + "epoch": 0.7462212762887341, + "grad_norm": 2.536426102722639, + "learning_rate": 6.382684229895555e-07, + "loss": 0.9104, + "step": 10343 + }, + { + "epoch": 0.7462934237581617, + "grad_norm": 2.528768759717763, + "learning_rate": 6.379261620117522e-07, + "loss": 0.8803, + "step": 10344 + }, + { + "epoch": 0.7463655712275892, + "grad_norm": 4.15760654884103, + "learning_rate": 6.375839754117589e-07, + "loss": 0.9122, + "step": 10345 + }, + { + "epoch": 0.7464377186970167, + "grad_norm": 3.063429866455554, + "learning_rate": 6.372418632082612e-07, + "loss": 0.8788, + "step": 10346 + }, + { + "epoch": 0.7465098661664442, + "grad_norm": 5.021260265808019, + "learning_rate": 6.368998254199407e-07, + "loss": 0.9125, + "step": 10347 + }, + { + "epoch": 0.7465820136358717, + "grad_norm": 2.851130539040079, + "learning_rate": 6.365578620654746e-07, + "loss": 0.8598, + "step": 10348 + }, + { + "epoch": 0.7466541611052993, + "grad_norm": 2.4743206093541628, + "learning_rate": 6.36215973163537e-07, + "loss": 0.8757, + "step": 10349 + }, + { + "epoch": 0.7467263085747268, + "grad_norm": 2.1268712948417394, + "learning_rate": 6.35874158732796e-07, + "loss": 0.8546, + "step": 10350 + }, + { + "epoch": 0.7467984560441543, + "grad_norm": 3.1896422424125435, + "learning_rate": 6.355324187919173e-07, + "loss": 0.9304, + "step": 10351 + }, + { + "epoch": 0.7468706035135818, + "grad_norm": 1.8789431752722325, + "learning_rate": 6.351907533595624e-07, + "loss": 0.7849, + "step": 10352 + }, + { + "epoch": 0.7469427509830092, + "grad_norm": 5.569407428334113, + "learning_rate": 6.348491624543877e-07, + "loss": 0.9434, + "step": 10353 + }, + { + "epoch": 0.7470148984524367, + "grad_norm": 3.0791893682473224, + "learning_rate": 6.34507646095047e-07, + "loss": 0.8622, + "step": 10354 + }, + { + "epoch": 0.7470870459218643, + "grad_norm": 9.582191288665559, + "learning_rate": 6.341662043001885e-07, + "loss": 0.9212, + "step": 10355 + }, + { + "epoch": 0.7471591933912918, + "grad_norm": 3.9089219377383366, + "learning_rate": 6.338248370884583e-07, + "loss": 0.8587, + "step": 10356 + }, + { + "epoch": 0.7472313408607193, + "grad_norm": 0.7291775884264473, + "learning_rate": 6.334835444784949e-07, + "loss": 0.7397, + "step": 10357 + }, + { + "epoch": 0.7473034883301468, + "grad_norm": 4.5288704773821165, + "learning_rate": 6.33142326488938e-07, + "loss": 0.8985, + "step": 10358 + }, + { + "epoch": 0.7473756357995743, + "grad_norm": 2.461247694054139, + "learning_rate": 6.32801183138418e-07, + "loss": 0.9075, + "step": 10359 + }, + { + "epoch": 0.7474477832690019, + "grad_norm": 13.705034007740378, + "learning_rate": 6.324601144455639e-07, + "loss": 0.9603, + "step": 10360 + }, + { + "epoch": 0.7475199307384294, + "grad_norm": 2.788198727332204, + "learning_rate": 6.321191204290009e-07, + "loss": 0.9664, + "step": 10361 + }, + { + "epoch": 0.7475920782078569, + "grad_norm": 2.0754844697441537, + "learning_rate": 6.317782011073487e-07, + "loss": 0.9494, + "step": 10362 + }, + { + "epoch": 0.7476642256772844, + "grad_norm": 2.3506119032892805, + "learning_rate": 6.314373564992242e-07, + "loss": 0.9656, + "step": 10363 + }, + { + "epoch": 0.7477363731467119, + "grad_norm": 4.006491638609039, + "learning_rate": 6.310965866232392e-07, + "loss": 0.8949, + "step": 10364 + }, + { + "epoch": 0.7478085206161393, + "grad_norm": 2.5125782209527827, + "learning_rate": 6.307558914980029e-07, + "loss": 0.8295, + "step": 10365 + }, + { + "epoch": 0.7478806680855669, + "grad_norm": 2.516250153573302, + "learning_rate": 6.304152711421178e-07, + "loss": 0.8268, + "step": 10366 + }, + { + "epoch": 0.7479528155549944, + "grad_norm": 3.0334085860367224, + "learning_rate": 6.300747255741847e-07, + "loss": 0.8502, + "step": 10367 + }, + { + "epoch": 0.7480249630244219, + "grad_norm": 2.8722715284623304, + "learning_rate": 6.297342548127997e-07, + "loss": 0.8919, + "step": 10368 + }, + { + "epoch": 0.7480971104938494, + "grad_norm": 2.864092994107217, + "learning_rate": 6.293938588765544e-07, + "loss": 0.841, + "step": 10369 + }, + { + "epoch": 0.7481692579632769, + "grad_norm": 6.0366123850147995, + "learning_rate": 6.290535377840367e-07, + "loss": 0.8324, + "step": 10370 + }, + { + "epoch": 0.7482414054327045, + "grad_norm": 4.192454353680005, + "learning_rate": 6.28713291553831e-07, + "loss": 0.973, + "step": 10371 + }, + { + "epoch": 0.748313552902132, + "grad_norm": 5.308326248491591, + "learning_rate": 6.283731202045146e-07, + "loss": 0.9228, + "step": 10372 + }, + { + "epoch": 0.7483857003715595, + "grad_norm": 2.4014641865131416, + "learning_rate": 6.28033023754666e-07, + "loss": 0.9888, + "step": 10373 + }, + { + "epoch": 0.748457847840987, + "grad_norm": 2.7714392972241413, + "learning_rate": 6.276930022228543e-07, + "loss": 0.8255, + "step": 10374 + }, + { + "epoch": 0.7485299953104145, + "grad_norm": 3.8341341149757784, + "learning_rate": 6.273530556276478e-07, + "loss": 0.9487, + "step": 10375 + }, + { + "epoch": 0.748602142779842, + "grad_norm": 4.370852439021612, + "learning_rate": 6.270131839876093e-07, + "loss": 0.9251, + "step": 10376 + }, + { + "epoch": 0.7486742902492695, + "grad_norm": 2.8578395368778984, + "learning_rate": 6.266733873212986e-07, + "loss": 0.8494, + "step": 10377 + }, + { + "epoch": 0.748746437718697, + "grad_norm": 2.827981007319319, + "learning_rate": 6.263336656472702e-07, + "loss": 0.9497, + "step": 10378 + }, + { + "epoch": 0.7488185851881245, + "grad_norm": 5.232452312749749, + "learning_rate": 6.259940189840751e-07, + "loss": 0.8698, + "step": 10379 + }, + { + "epoch": 0.748890732657552, + "grad_norm": 4.997532808044432, + "learning_rate": 6.256544473502608e-07, + "loss": 0.8358, + "step": 10380 + }, + { + "epoch": 0.7489628801269795, + "grad_norm": 1.8210905974306422, + "learning_rate": 6.253149507643689e-07, + "loss": 0.8562, + "step": 10381 + }, + { + "epoch": 0.749035027596407, + "grad_norm": 2.6261470649951084, + "learning_rate": 6.249755292449388e-07, + "loss": 0.924, + "step": 10382 + }, + { + "epoch": 0.7491071750658346, + "grad_norm": 3.4013378876950067, + "learning_rate": 6.246361828105046e-07, + "loss": 0.9627, + "step": 10383 + }, + { + "epoch": 0.7491793225352621, + "grad_norm": 3.6323770897924037, + "learning_rate": 6.242969114795971e-07, + "loss": 0.8884, + "step": 10384 + }, + { + "epoch": 0.7492514700046896, + "grad_norm": 3.07935459731791, + "learning_rate": 6.239577152707427e-07, + "loss": 0.9309, + "step": 10385 + }, + { + "epoch": 0.7493236174741171, + "grad_norm": 2.3877150833153853, + "learning_rate": 6.236185942024635e-07, + "loss": 0.9576, + "step": 10386 + }, + { + "epoch": 0.7493957649435447, + "grad_norm": 2.0616896222172127, + "learning_rate": 6.232795482932778e-07, + "loss": 0.9427, + "step": 10387 + }, + { + "epoch": 0.7494679124129722, + "grad_norm": 5.1541619899341, + "learning_rate": 6.229405775616999e-07, + "loss": 0.9563, + "step": 10388 + }, + { + "epoch": 0.7495400598823996, + "grad_norm": 2.863413142852452, + "learning_rate": 6.226016820262388e-07, + "loss": 0.9766, + "step": 10389 + }, + { + "epoch": 0.7496122073518271, + "grad_norm": 2.655384326568086, + "learning_rate": 6.222628617054011e-07, + "loss": 0.9574, + "step": 10390 + }, + { + "epoch": 0.7496843548212546, + "grad_norm": 3.9940014206335057, + "learning_rate": 6.21924116617688e-07, + "loss": 0.8305, + "step": 10391 + }, + { + "epoch": 0.7497565022906821, + "grad_norm": 3.2018267722130522, + "learning_rate": 6.215854467815975e-07, + "loss": 0.9567, + "step": 10392 + }, + { + "epoch": 0.7498286497601097, + "grad_norm": 2.63703819317838, + "learning_rate": 6.212468522156238e-07, + "loss": 0.9654, + "step": 10393 + }, + { + "epoch": 0.7499007972295372, + "grad_norm": 1.9762344389658117, + "learning_rate": 6.209083329382543e-07, + "loss": 0.8783, + "step": 10394 + }, + { + "epoch": 0.7499729446989647, + "grad_norm": 3.657957542310689, + "learning_rate": 6.205698889679767e-07, + "loss": 0.9243, + "step": 10395 + }, + { + "epoch": 0.7500450921683922, + "grad_norm": 2.9732623691672018, + "learning_rate": 6.202315203232698e-07, + "loss": 1.0569, + "step": 10396 + }, + { + "epoch": 0.7501172396378197, + "grad_norm": 3.184980556123271, + "learning_rate": 6.198932270226129e-07, + "loss": 0.9404, + "step": 10397 + }, + { + "epoch": 0.7501893871072473, + "grad_norm": 3.8532045280373706, + "learning_rate": 6.195550090844775e-07, + "loss": 0.968, + "step": 10398 + }, + { + "epoch": 0.7502615345766748, + "grad_norm": 5.134360720056793, + "learning_rate": 6.192168665273328e-07, + "loss": 0.802, + "step": 10399 + }, + { + "epoch": 0.7503336820461022, + "grad_norm": 3.393374264538316, + "learning_rate": 6.188787993696434e-07, + "loss": 0.9582, + "step": 10400 + }, + { + "epoch": 0.7504058295155297, + "grad_norm": 2.6839832983790686, + "learning_rate": 6.185408076298699e-07, + "loss": 0.9161, + "step": 10401 + }, + { + "epoch": 0.7504779769849572, + "grad_norm": 3.087420993056616, + "learning_rate": 6.182028913264692e-07, + "loss": 0.8319, + "step": 10402 + }, + { + "epoch": 0.7505501244543847, + "grad_norm": 4.032093048656906, + "learning_rate": 6.17865050477894e-07, + "loss": 0.9122, + "step": 10403 + }, + { + "epoch": 0.7506222719238123, + "grad_norm": 0.8059488428980927, + "learning_rate": 6.17527285102591e-07, + "loss": 0.8499, + "step": 10404 + }, + { + "epoch": 0.7506944193932398, + "grad_norm": 2.4379019772583588, + "learning_rate": 6.171895952190054e-07, + "loss": 0.9369, + "step": 10405 + }, + { + "epoch": 0.7507665668626673, + "grad_norm": 4.334077893383215, + "learning_rate": 6.16851980845577e-07, + "loss": 0.8815, + "step": 10406 + }, + { + "epoch": 0.7508387143320948, + "grad_norm": 1.6664941062605567, + "learning_rate": 6.165144420007418e-07, + "loss": 0.9242, + "step": 10407 + }, + { + "epoch": 0.7509108618015223, + "grad_norm": 3.957370574049335, + "learning_rate": 6.161769787029312e-07, + "loss": 0.9899, + "step": 10408 + }, + { + "epoch": 0.7509830092709499, + "grad_norm": 2.3526243936241897, + "learning_rate": 6.158395909705732e-07, + "loss": 0.8133, + "step": 10409 + }, + { + "epoch": 0.7510551567403774, + "grad_norm": 1.7855912561269192, + "learning_rate": 6.155022788220918e-07, + "loss": 0.9542, + "step": 10410 + }, + { + "epoch": 0.7511273042098049, + "grad_norm": 4.3879694934712985, + "learning_rate": 6.151650422759042e-07, + "loss": 0.9618, + "step": 10411 + }, + { + "epoch": 0.7511994516792323, + "grad_norm": 2.7536814063520874, + "learning_rate": 6.148278813504288e-07, + "loss": 0.9371, + "step": 10412 + }, + { + "epoch": 0.7512715991486598, + "grad_norm": 2.668423659944008, + "learning_rate": 6.144907960640742e-07, + "loss": 0.8401, + "step": 10413 + }, + { + "epoch": 0.7513437466180873, + "grad_norm": 26.202111731303024, + "learning_rate": 6.141537864352484e-07, + "loss": 0.8721, + "step": 10414 + }, + { + "epoch": 0.7514158940875149, + "grad_norm": 4.226052752373629, + "learning_rate": 6.138168524823539e-07, + "loss": 0.9307, + "step": 10415 + }, + { + "epoch": 0.7514880415569424, + "grad_norm": 3.121154555395646, + "learning_rate": 6.134799942237898e-07, + "loss": 0.9612, + "step": 10416 + }, + { + "epoch": 0.7515601890263699, + "grad_norm": 2.7828402312758844, + "learning_rate": 6.131432116779511e-07, + "loss": 0.9531, + "step": 10417 + }, + { + "epoch": 0.7516323364957974, + "grad_norm": 8.458709507988045, + "learning_rate": 6.128065048632263e-07, + "loss": 0.9185, + "step": 10418 + }, + { + "epoch": 0.7517044839652249, + "grad_norm": 0.8889858822919164, + "learning_rate": 6.124698737980043e-07, + "loss": 0.8416, + "step": 10419 + }, + { + "epoch": 0.7517766314346525, + "grad_norm": 2.2108912378869707, + "learning_rate": 6.121333185006654e-07, + "loss": 0.9116, + "step": 10420 + }, + { + "epoch": 0.75184877890408, + "grad_norm": 3.1768669346555596, + "learning_rate": 6.117968389895885e-07, + "loss": 0.9257, + "step": 10421 + }, + { + "epoch": 0.7519209263735075, + "grad_norm": 2.3818416431166285, + "learning_rate": 6.11460435283147e-07, + "loss": 0.796, + "step": 10422 + }, + { + "epoch": 0.751993073842935, + "grad_norm": 2.135070834805021, + "learning_rate": 6.111241073997109e-07, + "loss": 0.9041, + "step": 10423 + }, + { + "epoch": 0.7520652213123624, + "grad_norm": 0.8085249719531931, + "learning_rate": 6.10787855357646e-07, + "loss": 0.8051, + "step": 10424 + }, + { + "epoch": 0.7521373687817899, + "grad_norm": 3.0850465671480825, + "learning_rate": 6.104516791753143e-07, + "loss": 0.8731, + "step": 10425 + }, + { + "epoch": 0.7522095162512175, + "grad_norm": 6.176147836111589, + "learning_rate": 6.101155788710713e-07, + "loss": 0.9605, + "step": 10426 + }, + { + "epoch": 0.752281663720645, + "grad_norm": 43.032424412078505, + "learning_rate": 6.097795544632722e-07, + "loss": 0.9034, + "step": 10427 + }, + { + "epoch": 0.7523538111900725, + "grad_norm": 22.023338423198567, + "learning_rate": 6.094436059702651e-07, + "loss": 1.0166, + "step": 10428 + }, + { + "epoch": 0.7524259586595, + "grad_norm": 2.2829840353475657, + "learning_rate": 6.091077334103947e-07, + "loss": 0.9144, + "step": 10429 + }, + { + "epoch": 0.7524981061289275, + "grad_norm": 2.8034942553852376, + "learning_rate": 6.087719368020021e-07, + "loss": 0.938, + "step": 10430 + }, + { + "epoch": 0.752570253598355, + "grad_norm": 4.605019583893427, + "learning_rate": 6.084362161634237e-07, + "loss": 0.8262, + "step": 10431 + }, + { + "epoch": 0.7526424010677826, + "grad_norm": 2.544485082491539, + "learning_rate": 6.081005715129932e-07, + "loss": 0.8922, + "step": 10432 + }, + { + "epoch": 0.7527145485372101, + "grad_norm": 4.840483714751392, + "learning_rate": 6.077650028690362e-07, + "loss": 0.8903, + "step": 10433 + }, + { + "epoch": 0.7527866960066376, + "grad_norm": 2.037704892768083, + "learning_rate": 6.074295102498801e-07, + "loss": 0.984, + "step": 10434 + }, + { + "epoch": 0.7528588434760651, + "grad_norm": 2.8982762705746623, + "learning_rate": 6.070940936738418e-07, + "loss": 0.9141, + "step": 10435 + }, + { + "epoch": 0.7529309909454925, + "grad_norm": 2.651103970158394, + "learning_rate": 6.067587531592402e-07, + "loss": 0.964, + "step": 10436 + }, + { + "epoch": 0.7530031384149201, + "grad_norm": 3.0758469570467626, + "learning_rate": 6.064234887243847e-07, + "loss": 0.9963, + "step": 10437 + }, + { + "epoch": 0.7530752858843476, + "grad_norm": 3.2359189388934566, + "learning_rate": 6.060883003875837e-07, + "loss": 0.9748, + "step": 10438 + }, + { + "epoch": 0.7531474333537751, + "grad_norm": 2.251270783035618, + "learning_rate": 6.057531881671405e-07, + "loss": 0.9521, + "step": 10439 + }, + { + "epoch": 0.7532195808232026, + "grad_norm": 3.2408301763959755, + "learning_rate": 6.054181520813542e-07, + "loss": 0.8674, + "step": 10440 + }, + { + "epoch": 0.7532917282926301, + "grad_norm": 3.3105046722512808, + "learning_rate": 6.050831921485209e-07, + "loss": 0.8127, + "step": 10441 + }, + { + "epoch": 0.7533638757620577, + "grad_norm": 0.7047149483691225, + "learning_rate": 6.047483083869301e-07, + "loss": 0.8023, + "step": 10442 + }, + { + "epoch": 0.7534360232314852, + "grad_norm": 4.026779653298738, + "learning_rate": 6.044135008148689e-07, + "loss": 0.9576, + "step": 10443 + }, + { + "epoch": 0.7535081707009127, + "grad_norm": 2.2951950884575347, + "learning_rate": 6.040787694506202e-07, + "loss": 0.8719, + "step": 10444 + }, + { + "epoch": 0.7535803181703402, + "grad_norm": 2.546450620527888, + "learning_rate": 6.037441143124624e-07, + "loss": 0.9233, + "step": 10445 + }, + { + "epoch": 0.7536524656397677, + "grad_norm": 4.171681831831171, + "learning_rate": 6.034095354186699e-07, + "loss": 0.8677, + "step": 10446 + }, + { + "epoch": 0.7537246131091951, + "grad_norm": 2.829989666869369, + "learning_rate": 6.030750327875132e-07, + "loss": 0.901, + "step": 10447 + }, + { + "epoch": 0.7537967605786227, + "grad_norm": 2.6573798974100287, + "learning_rate": 6.027406064372564e-07, + "loss": 0.8547, + "step": 10448 + }, + { + "epoch": 0.7538689080480502, + "grad_norm": 2.407932040386648, + "learning_rate": 6.024062563861638e-07, + "loss": 0.9515, + "step": 10449 + }, + { + "epoch": 0.7539410555174777, + "grad_norm": 1.888402909828955, + "learning_rate": 6.020719826524907e-07, + "loss": 0.9138, + "step": 10450 + }, + { + "epoch": 0.7540132029869052, + "grad_norm": 3.9168956500273433, + "learning_rate": 6.017377852544929e-07, + "loss": 0.959, + "step": 10451 + }, + { + "epoch": 0.7540853504563327, + "grad_norm": 3.0152494364684883, + "learning_rate": 6.014036642104178e-07, + "loss": 0.8354, + "step": 10452 + }, + { + "epoch": 0.7541574979257603, + "grad_norm": 2.3756396787411234, + "learning_rate": 6.010696195385112e-07, + "loss": 0.8083, + "step": 10453 + }, + { + "epoch": 0.7542296453951878, + "grad_norm": 13.596023404547088, + "learning_rate": 6.007356512570141e-07, + "loss": 0.8946, + "step": 10454 + }, + { + "epoch": 0.7543017928646153, + "grad_norm": 2.1936945807692303, + "learning_rate": 6.00401759384163e-07, + "loss": 0.854, + "step": 10455 + }, + { + "epoch": 0.7543739403340428, + "grad_norm": 2.9329516321428244, + "learning_rate": 6.000679439381915e-07, + "loss": 0.9336, + "step": 10456 + }, + { + "epoch": 0.7544460878034703, + "grad_norm": 0.813745790696281, + "learning_rate": 5.99734204937326e-07, + "loss": 0.8468, + "step": 10457 + }, + { + "epoch": 0.7545182352728979, + "grad_norm": 17.459216742671423, + "learning_rate": 5.99400542399793e-07, + "loss": 0.894, + "step": 10458 + }, + { + "epoch": 0.7545903827423253, + "grad_norm": 3.435684140636504, + "learning_rate": 5.990669563438109e-07, + "loss": 0.8884, + "step": 10459 + }, + { + "epoch": 0.7546625302117528, + "grad_norm": 2.3056921886520403, + "learning_rate": 5.987334467875964e-07, + "loss": 0.887, + "step": 10460 + }, + { + "epoch": 0.7547346776811803, + "grad_norm": 2.5410380498085723, + "learning_rate": 5.98400013749361e-07, + "loss": 0.917, + "step": 10461 + }, + { + "epoch": 0.7548068251506078, + "grad_norm": 2.74511979305506, + "learning_rate": 5.980666572473123e-07, + "loss": 0.93, + "step": 10462 + }, + { + "epoch": 0.7548789726200353, + "grad_norm": 2.6034891595089116, + "learning_rate": 5.977333772996535e-07, + "loss": 0.8808, + "step": 10463 + }, + { + "epoch": 0.7549511200894629, + "grad_norm": 2.177951572689006, + "learning_rate": 5.974001739245848e-07, + "loss": 0.9272, + "step": 10464 + }, + { + "epoch": 0.7550232675588904, + "grad_norm": 2.6670310745002306, + "learning_rate": 5.970670471402999e-07, + "loss": 0.8782, + "step": 10465 + }, + { + "epoch": 0.7550954150283179, + "grad_norm": 2.216188913741697, + "learning_rate": 5.967339969649898e-07, + "loss": 0.9766, + "step": 10466 + }, + { + "epoch": 0.7551675624977454, + "grad_norm": 0.6831693585330293, + "learning_rate": 5.964010234168416e-07, + "loss": 0.8105, + "step": 10467 + }, + { + "epoch": 0.7552397099671729, + "grad_norm": 2.2591631060546074, + "learning_rate": 5.960681265140373e-07, + "loss": 0.9437, + "step": 10468 + }, + { + "epoch": 0.7553118574366005, + "grad_norm": 1.6688413816427041, + "learning_rate": 5.957353062747554e-07, + "loss": 0.867, + "step": 10469 + }, + { + "epoch": 0.755384004906028, + "grad_norm": 2.7739648451939374, + "learning_rate": 5.954025627171703e-07, + "loss": 0.8052, + "step": 10470 + }, + { + "epoch": 0.7554561523754554, + "grad_norm": 3.1626481274076967, + "learning_rate": 5.95069895859452e-07, + "loss": 0.8844, + "step": 10471 + }, + { + "epoch": 0.7555282998448829, + "grad_norm": 3.3613613200169445, + "learning_rate": 5.947373057197647e-07, + "loss": 0.8645, + "step": 10472 + }, + { + "epoch": 0.7556004473143104, + "grad_norm": 2.4001980262757767, + "learning_rate": 5.944047923162723e-07, + "loss": 0.8858, + "step": 10473 + }, + { + "epoch": 0.7556725947837379, + "grad_norm": 2.5108323023562167, + "learning_rate": 5.940723556671297e-07, + "loss": 0.9315, + "step": 10474 + }, + { + "epoch": 0.7557447422531655, + "grad_norm": 0.865379200820612, + "learning_rate": 5.937399957904923e-07, + "loss": 0.8272, + "step": 10475 + }, + { + "epoch": 0.755816889722593, + "grad_norm": 7.151357252506584, + "learning_rate": 5.934077127045072e-07, + "loss": 0.9728, + "step": 10476 + }, + { + "epoch": 0.7558890371920205, + "grad_norm": 3.1585228922649766, + "learning_rate": 5.9307550642732e-07, + "loss": 1.0329, + "step": 10477 + }, + { + "epoch": 0.755961184661448, + "grad_norm": 2.6326417952530186, + "learning_rate": 5.927433769770711e-07, + "loss": 0.7791, + "step": 10478 + }, + { + "epoch": 0.7560333321308755, + "grad_norm": 2.24895283384889, + "learning_rate": 5.924113243718969e-07, + "loss": 0.9552, + "step": 10479 + }, + { + "epoch": 0.756105479600303, + "grad_norm": 2.1684963374794113, + "learning_rate": 5.920793486299302e-07, + "loss": 0.9524, + "step": 10480 + }, + { + "epoch": 0.7561776270697306, + "grad_norm": 5.020358600042677, + "learning_rate": 5.917474497692978e-07, + "loss": 0.8376, + "step": 10481 + }, + { + "epoch": 0.7562497745391581, + "grad_norm": 2.518615461594681, + "learning_rate": 5.91415627808124e-07, + "loss": 0.8193, + "step": 10482 + }, + { + "epoch": 0.7563219220085855, + "grad_norm": 2.979912742258916, + "learning_rate": 5.910838827645281e-07, + "loss": 1.0582, + "step": 10483 + }, + { + "epoch": 0.756394069478013, + "grad_norm": 2.5936567910233577, + "learning_rate": 5.90752214656626e-07, + "loss": 0.9384, + "step": 10484 + }, + { + "epoch": 0.7564662169474405, + "grad_norm": 2.92798990530348, + "learning_rate": 5.904206235025284e-07, + "loss": 1.0497, + "step": 10485 + }, + { + "epoch": 0.7565383644168681, + "grad_norm": 2.4380755600834085, + "learning_rate": 5.900891093203433e-07, + "loss": 0.8096, + "step": 10486 + }, + { + "epoch": 0.7566105118862956, + "grad_norm": 3.556890789956139, + "learning_rate": 5.897576721281712e-07, + "loss": 0.8693, + "step": 10487 + }, + { + "epoch": 0.7566826593557231, + "grad_norm": 2.6129203056740384, + "learning_rate": 5.894263119441134e-07, + "loss": 0.8682, + "step": 10488 + }, + { + "epoch": 0.7567548068251506, + "grad_norm": 4.414932246309331, + "learning_rate": 5.890950287862624e-07, + "loss": 0.8969, + "step": 10489 + }, + { + "epoch": 0.7568269542945781, + "grad_norm": 2.2212486704155285, + "learning_rate": 5.887638226727087e-07, + "loss": 0.9114, + "step": 10490 + }, + { + "epoch": 0.7568991017640057, + "grad_norm": 2.504532615170519, + "learning_rate": 5.884326936215387e-07, + "loss": 0.9066, + "step": 10491 + }, + { + "epoch": 0.7569712492334332, + "grad_norm": 3.6821818391571393, + "learning_rate": 5.881016416508338e-07, + "loss": 0.9569, + "step": 10492 + }, + { + "epoch": 0.7570433967028607, + "grad_norm": 3.1343494174632083, + "learning_rate": 5.877706667786713e-07, + "loss": 0.9272, + "step": 10493 + }, + { + "epoch": 0.7571155441722881, + "grad_norm": 3.0023791098417045, + "learning_rate": 5.87439769023125e-07, + "loss": 0.946, + "step": 10494 + }, + { + "epoch": 0.7571876916417156, + "grad_norm": 0.7422403818665616, + "learning_rate": 5.871089484022644e-07, + "loss": 0.7978, + "step": 10495 + }, + { + "epoch": 0.7572598391111431, + "grad_norm": 4.544758831881602, + "learning_rate": 5.867782049341526e-07, + "loss": 0.8499, + "step": 10496 + }, + { + "epoch": 0.7573319865805707, + "grad_norm": 2.7521307233435666, + "learning_rate": 5.864475386368526e-07, + "loss": 0.7849, + "step": 10497 + }, + { + "epoch": 0.7574041340499982, + "grad_norm": 2.3026902677471077, + "learning_rate": 5.86116949528419e-07, + "loss": 0.8219, + "step": 10498 + }, + { + "epoch": 0.7574762815194257, + "grad_norm": 2.06609793600215, + "learning_rate": 5.857864376269051e-07, + "loss": 0.9559, + "step": 10499 + }, + { + "epoch": 0.7575484289888532, + "grad_norm": 2.019347191692633, + "learning_rate": 5.854560029503584e-07, + "loss": 1.0066, + "step": 10500 + }, + { + "epoch": 0.7576205764582807, + "grad_norm": 3.680817189704074, + "learning_rate": 5.85125645516823e-07, + "loss": 0.9134, + "step": 10501 + }, + { + "epoch": 0.7576927239277083, + "grad_norm": 2.2809643501261982, + "learning_rate": 5.847953653443383e-07, + "loss": 0.8832, + "step": 10502 + }, + { + "epoch": 0.7577648713971358, + "grad_norm": 1.949429181499613, + "learning_rate": 5.844651624509405e-07, + "loss": 0.9072, + "step": 10503 + }, + { + "epoch": 0.7578370188665633, + "grad_norm": 2.0228426143503206, + "learning_rate": 5.841350368546594e-07, + "loss": 0.9518, + "step": 10504 + }, + { + "epoch": 0.7579091663359908, + "grad_norm": 3.062092968108074, + "learning_rate": 5.838049885735223e-07, + "loss": 0.9421, + "step": 10505 + }, + { + "epoch": 0.7579813138054182, + "grad_norm": 10.835160164980547, + "learning_rate": 5.834750176255525e-07, + "loss": 0.8763, + "step": 10506 + }, + { + "epoch": 0.7580534612748457, + "grad_norm": 2.4287032804840094, + "learning_rate": 5.831451240287682e-07, + "loss": 0.9323, + "step": 10507 + }, + { + "epoch": 0.7581256087442733, + "grad_norm": 2.6188564160559626, + "learning_rate": 5.828153078011837e-07, + "loss": 0.9336, + "step": 10508 + }, + { + "epoch": 0.7581977562137008, + "grad_norm": 2.20708040165816, + "learning_rate": 5.824855689608088e-07, + "loss": 0.9046, + "step": 10509 + }, + { + "epoch": 0.7582699036831283, + "grad_norm": 5.02691880974087, + "learning_rate": 5.821559075256504e-07, + "loss": 0.8455, + "step": 10510 + }, + { + "epoch": 0.7583420511525558, + "grad_norm": 2.931733660457473, + "learning_rate": 5.818263235137078e-07, + "loss": 0.9301, + "step": 10511 + }, + { + "epoch": 0.7584141986219833, + "grad_norm": 2.339209774785682, + "learning_rate": 5.81496816942981e-07, + "loss": 0.9626, + "step": 10512 + }, + { + "epoch": 0.7584863460914109, + "grad_norm": 2.3317070357032206, + "learning_rate": 5.811673878314613e-07, + "loss": 0.8967, + "step": 10513 + }, + { + "epoch": 0.7585584935608384, + "grad_norm": 15.274627810874549, + "learning_rate": 5.808380361971384e-07, + "loss": 0.8432, + "step": 10514 + }, + { + "epoch": 0.7586306410302659, + "grad_norm": 2.463410119273883, + "learning_rate": 5.805087620579965e-07, + "loss": 0.929, + "step": 10515 + }, + { + "epoch": 0.7587027884996934, + "grad_norm": 2.7423384317882595, + "learning_rate": 5.801795654320165e-07, + "loss": 0.8486, + "step": 10516 + }, + { + "epoch": 0.7587749359691209, + "grad_norm": 2.72147872615407, + "learning_rate": 5.798504463371742e-07, + "loss": 0.89, + "step": 10517 + }, + { + "epoch": 0.7588470834385483, + "grad_norm": 2.0843834964058585, + "learning_rate": 5.79521404791442e-07, + "loss": 0.8116, + "step": 10518 + }, + { + "epoch": 0.7589192309079759, + "grad_norm": 8.97760487107733, + "learning_rate": 5.791924408127881e-07, + "loss": 0.8586, + "step": 10519 + }, + { + "epoch": 0.7589913783774034, + "grad_norm": 4.017281157367364, + "learning_rate": 5.788635544191749e-07, + "loss": 0.9783, + "step": 10520 + }, + { + "epoch": 0.7590635258468309, + "grad_norm": 2.627437368239905, + "learning_rate": 5.785347456285619e-07, + "loss": 0.9939, + "step": 10521 + }, + { + "epoch": 0.7591356733162584, + "grad_norm": 2.610035430112329, + "learning_rate": 5.782060144589045e-07, + "loss": 0.8932, + "step": 10522 + }, + { + "epoch": 0.7592078207856859, + "grad_norm": 3.857185638533601, + "learning_rate": 5.778773609281531e-07, + "loss": 1.0813, + "step": 10523 + }, + { + "epoch": 0.7592799682551135, + "grad_norm": 2.7431736229919728, + "learning_rate": 5.775487850542546e-07, + "loss": 0.826, + "step": 10524 + }, + { + "epoch": 0.759352115724541, + "grad_norm": 2.5480360883256084, + "learning_rate": 5.772202868551519e-07, + "loss": 0.8935, + "step": 10525 + }, + { + "epoch": 0.7594242631939685, + "grad_norm": 3.252099459451546, + "learning_rate": 5.768918663487811e-07, + "loss": 0.9599, + "step": 10526 + }, + { + "epoch": 0.759496410663396, + "grad_norm": 2.6773684061661043, + "learning_rate": 5.765635235530786e-07, + "loss": 0.9442, + "step": 10527 + }, + { + "epoch": 0.7595685581328235, + "grad_norm": 0.761104629617398, + "learning_rate": 5.762352584859722e-07, + "loss": 0.7837, + "step": 10528 + }, + { + "epoch": 0.759640705602251, + "grad_norm": 3.8208577833707222, + "learning_rate": 5.759070711653875e-07, + "loss": 0.9705, + "step": 10529 + }, + { + "epoch": 0.7597128530716785, + "grad_norm": 3.3302903272748505, + "learning_rate": 5.755789616092459e-07, + "loss": 0.8309, + "step": 10530 + }, + { + "epoch": 0.759785000541106, + "grad_norm": 2.2807912495913882, + "learning_rate": 5.752509298354642e-07, + "loss": 0.9886, + "step": 10531 + }, + { + "epoch": 0.7598571480105335, + "grad_norm": 5.154134790460221, + "learning_rate": 5.749229758619549e-07, + "loss": 0.9924, + "step": 10532 + }, + { + "epoch": 0.759929295479961, + "grad_norm": 3.0726675848578435, + "learning_rate": 5.745950997066266e-07, + "loss": 0.9416, + "step": 10533 + }, + { + "epoch": 0.7600014429493885, + "grad_norm": 0.8398066312881957, + "learning_rate": 5.742673013873838e-07, + "loss": 0.7658, + "step": 10534 + }, + { + "epoch": 0.7600735904188161, + "grad_norm": 3.240877406150121, + "learning_rate": 5.739395809221252e-07, + "loss": 0.8769, + "step": 10535 + }, + { + "epoch": 0.7601457378882436, + "grad_norm": 2.7694039254990557, + "learning_rate": 5.73611938328747e-07, + "loss": 0.8643, + "step": 10536 + }, + { + "epoch": 0.7602178853576711, + "grad_norm": 6.7156739756350206, + "learning_rate": 5.732843736251405e-07, + "loss": 0.854, + "step": 10537 + }, + { + "epoch": 0.7602900328270986, + "grad_norm": 3.0351080184894452, + "learning_rate": 5.729568868291928e-07, + "loss": 0.962, + "step": 10538 + }, + { + "epoch": 0.7603621802965261, + "grad_norm": 23.381525607754806, + "learning_rate": 5.726294779587868e-07, + "loss": 0.8656, + "step": 10539 + }, + { + "epoch": 0.7604343277659537, + "grad_norm": 3.3684156262517186, + "learning_rate": 5.72302147031801e-07, + "loss": 0.8835, + "step": 10540 + }, + { + "epoch": 0.7605064752353812, + "grad_norm": 4.267904825166522, + "learning_rate": 5.719748940661097e-07, + "loss": 0.9156, + "step": 10541 + }, + { + "epoch": 0.7605786227048086, + "grad_norm": 2.4254713367285436, + "learning_rate": 5.716477190795839e-07, + "loss": 0.8768, + "step": 10542 + }, + { + "epoch": 0.7606507701742361, + "grad_norm": 3.1499599817549613, + "learning_rate": 5.713206220900879e-07, + "loss": 0.9604, + "step": 10543 + }, + { + "epoch": 0.7607229176436636, + "grad_norm": 2.8183235911677125, + "learning_rate": 5.70993603115484e-07, + "loss": 0.897, + "step": 10544 + }, + { + "epoch": 0.7607950651130911, + "grad_norm": 174.8465733686251, + "learning_rate": 5.706666621736293e-07, + "loss": 0.9111, + "step": 10545 + }, + { + "epoch": 0.7608672125825187, + "grad_norm": 2.5930516785143887, + "learning_rate": 5.703397992823771e-07, + "loss": 0.9463, + "step": 10546 + }, + { + "epoch": 0.7609393600519462, + "grad_norm": 0.7460290371456907, + "learning_rate": 5.700130144595761e-07, + "loss": 0.8366, + "step": 10547 + }, + { + "epoch": 0.7610115075213737, + "grad_norm": 2.1864504885036795, + "learning_rate": 5.696863077230704e-07, + "loss": 0.9276, + "step": 10548 + }, + { + "epoch": 0.7610836549908012, + "grad_norm": 2.7567510966787165, + "learning_rate": 5.693596790907016e-07, + "loss": 1.0168, + "step": 10549 + }, + { + "epoch": 0.7611558024602287, + "grad_norm": 2.349970078785111, + "learning_rate": 5.690331285803032e-07, + "loss": 0.9307, + "step": 10550 + }, + { + "epoch": 0.7612279499296563, + "grad_norm": 2.2133768057671204, + "learning_rate": 5.687066562097099e-07, + "loss": 0.9598, + "step": 10551 + }, + { + "epoch": 0.7613000973990838, + "grad_norm": 2.233581829005876, + "learning_rate": 5.683802619967466e-07, + "loss": 0.9337, + "step": 10552 + }, + { + "epoch": 0.7613722448685112, + "grad_norm": 2.41596105293435, + "learning_rate": 5.680539459592377e-07, + "loss": 0.9112, + "step": 10553 + }, + { + "epoch": 0.7614443923379387, + "grad_norm": 3.476643559228821, + "learning_rate": 5.67727708115002e-07, + "loss": 0.8523, + "step": 10554 + }, + { + "epoch": 0.7615165398073662, + "grad_norm": 2.068742725555446, + "learning_rate": 5.674015484818541e-07, + "loss": 0.8742, + "step": 10555 + }, + { + "epoch": 0.7615886872767937, + "grad_norm": 2.3849122848215454, + "learning_rate": 5.670754670776043e-07, + "loss": 0.9734, + "step": 10556 + }, + { + "epoch": 0.7616608347462213, + "grad_norm": 2.9281006444612245, + "learning_rate": 5.667494639200594e-07, + "loss": 0.8536, + "step": 10557 + }, + { + "epoch": 0.7617329822156488, + "grad_norm": 2.3426096876065414, + "learning_rate": 5.664235390270198e-07, + "loss": 0.8645, + "step": 10558 + }, + { + "epoch": 0.7618051296850763, + "grad_norm": 2.9411128126997044, + "learning_rate": 5.660976924162839e-07, + "loss": 0.8984, + "step": 10559 + }, + { + "epoch": 0.7618772771545038, + "grad_norm": 3.6640975072038358, + "learning_rate": 5.657719241056449e-07, + "loss": 1.0223, + "step": 10560 + }, + { + "epoch": 0.7619494246239313, + "grad_norm": 3.826458751289158, + "learning_rate": 5.654462341128918e-07, + "loss": 1.0008, + "step": 10561 + }, + { + "epoch": 0.7620215720933589, + "grad_norm": 2.839587004545604, + "learning_rate": 5.651206224558091e-07, + "loss": 0.9386, + "step": 10562 + }, + { + "epoch": 0.7620937195627864, + "grad_norm": 3.2462830930131323, + "learning_rate": 5.647950891521778e-07, + "loss": 0.9019, + "step": 10563 + }, + { + "epoch": 0.7621658670322139, + "grad_norm": 2.9449597053815184, + "learning_rate": 5.644696342197744e-07, + "loss": 0.7946, + "step": 10564 + }, + { + "epoch": 0.7622380145016413, + "grad_norm": 3.916290806260334, + "learning_rate": 5.641442576763687e-07, + "loss": 0.7813, + "step": 10565 + }, + { + "epoch": 0.7623101619710688, + "grad_norm": 3.4818218097174394, + "learning_rate": 5.638189595397309e-07, + "loss": 0.9362, + "step": 10566 + }, + { + "epoch": 0.7623823094404963, + "grad_norm": 2.437012941654046, + "learning_rate": 5.634937398276225e-07, + "loss": 0.9058, + "step": 10567 + }, + { + "epoch": 0.7624544569099239, + "grad_norm": 2.7591571933587935, + "learning_rate": 5.631685985578034e-07, + "loss": 0.82, + "step": 10568 + }, + { + "epoch": 0.7625266043793514, + "grad_norm": 1.6481002534954903, + "learning_rate": 5.628435357480281e-07, + "loss": 1.0043, + "step": 10569 + }, + { + "epoch": 0.7625987518487789, + "grad_norm": 2.533165665984126, + "learning_rate": 5.625185514160471e-07, + "loss": 0.9308, + "step": 10570 + }, + { + "epoch": 0.7626708993182064, + "grad_norm": 2.777039759399722, + "learning_rate": 5.621936455796066e-07, + "loss": 0.8158, + "step": 10571 + }, + { + "epoch": 0.7627430467876339, + "grad_norm": 2.8842331608606986, + "learning_rate": 5.618688182564486e-07, + "loss": 0.933, + "step": 10572 + }, + { + "epoch": 0.7628151942570615, + "grad_norm": 2.101430669003481, + "learning_rate": 5.615440694643114e-07, + "loss": 0.8852, + "step": 10573 + }, + { + "epoch": 0.762887341726489, + "grad_norm": 2.646499599961972, + "learning_rate": 5.612193992209268e-07, + "loss": 0.8658, + "step": 10574 + }, + { + "epoch": 0.7629594891959165, + "grad_norm": 2.826607903178608, + "learning_rate": 5.608948075440247e-07, + "loss": 0.9755, + "step": 10575 + }, + { + "epoch": 0.763031636665344, + "grad_norm": 2.1701521592476203, + "learning_rate": 5.605702944513298e-07, + "loss": 0.9267, + "step": 10576 + }, + { + "epoch": 0.7631037841347714, + "grad_norm": 2.109263212279477, + "learning_rate": 5.602458599605624e-07, + "loss": 1.0225, + "step": 10577 + }, + { + "epoch": 0.7631759316041989, + "grad_norm": 2.0469761015405923, + "learning_rate": 5.59921504089439e-07, + "loss": 1.0247, + "step": 10578 + }, + { + "epoch": 0.7632480790736265, + "grad_norm": 3.9017086638730087, + "learning_rate": 5.595972268556716e-07, + "loss": 0.9795, + "step": 10579 + }, + { + "epoch": 0.763320226543054, + "grad_norm": 2.2939112219423037, + "learning_rate": 5.592730282769664e-07, + "loss": 0.9372, + "step": 10580 + }, + { + "epoch": 0.7633923740124815, + "grad_norm": 3.104215088195684, + "learning_rate": 5.58948908371029e-07, + "loss": 0.8384, + "step": 10581 + }, + { + "epoch": 0.763464521481909, + "grad_norm": 2.56236722648711, + "learning_rate": 5.586248671555565e-07, + "loss": 0.9907, + "step": 10582 + }, + { + "epoch": 0.7635366689513365, + "grad_norm": 2.301300974565715, + "learning_rate": 5.583009046482441e-07, + "loss": 0.8429, + "step": 10583 + }, + { + "epoch": 0.7636088164207641, + "grad_norm": 2.749170091420107, + "learning_rate": 5.579770208667824e-07, + "loss": 0.8677, + "step": 10584 + }, + { + "epoch": 0.7636809638901916, + "grad_norm": 3.049961346240639, + "learning_rate": 5.576532158288574e-07, + "loss": 0.8206, + "step": 10585 + }, + { + "epoch": 0.7637531113596191, + "grad_norm": 2.64322665109499, + "learning_rate": 5.573294895521508e-07, + "loss": 1.013, + "step": 10586 + }, + { + "epoch": 0.7638252588290466, + "grad_norm": 1.769182655069366, + "learning_rate": 5.570058420543404e-07, + "loss": 0.9028, + "step": 10587 + }, + { + "epoch": 0.7638974062984741, + "grad_norm": 0.7264864892064279, + "learning_rate": 5.566822733530996e-07, + "loss": 0.7634, + "step": 10588 + }, + { + "epoch": 0.7639695537679015, + "grad_norm": 2.795213429554218, + "learning_rate": 5.563587834660956e-07, + "loss": 0.9052, + "step": 10589 + }, + { + "epoch": 0.7640417012373291, + "grad_norm": 2.5072031201738176, + "learning_rate": 5.560353724109959e-07, + "loss": 0.9794, + "step": 10590 + }, + { + "epoch": 0.7641138487067566, + "grad_norm": 4.864355546191164, + "learning_rate": 5.55712040205458e-07, + "loss": 0.9421, + "step": 10591 + }, + { + "epoch": 0.7641859961761841, + "grad_norm": 3.5476229379010227, + "learning_rate": 5.553887868671394e-07, + "loss": 0.8068, + "step": 10592 + }, + { + "epoch": 0.7642581436456116, + "grad_norm": 2.4889922032124945, + "learning_rate": 5.550656124136912e-07, + "loss": 0.9254, + "step": 10593 + }, + { + "epoch": 0.7643302911150391, + "grad_norm": 3.5064561426283065, + "learning_rate": 5.547425168627611e-07, + "loss": 0.9236, + "step": 10594 + }, + { + "epoch": 0.7644024385844667, + "grad_norm": 2.784536327479229, + "learning_rate": 5.544195002319918e-07, + "loss": 0.9165, + "step": 10595 + }, + { + "epoch": 0.7644745860538942, + "grad_norm": 2.126845848383673, + "learning_rate": 5.540965625390228e-07, + "loss": 0.9533, + "step": 10596 + }, + { + "epoch": 0.7645467335233217, + "grad_norm": 2.675746945349215, + "learning_rate": 5.537737038014876e-07, + "loss": 0.8638, + "step": 10597 + }, + { + "epoch": 0.7646188809927492, + "grad_norm": 5.826568968511762, + "learning_rate": 5.534509240370164e-07, + "loss": 0.9593, + "step": 10598 + }, + { + "epoch": 0.7646910284621767, + "grad_norm": 2.4984066654137176, + "learning_rate": 5.531282232632355e-07, + "loss": 0.8679, + "step": 10599 + }, + { + "epoch": 0.7647631759316041, + "grad_norm": 2.4996459710264474, + "learning_rate": 5.528056014977662e-07, + "loss": 0.959, + "step": 10600 + }, + { + "epoch": 0.7648353234010317, + "grad_norm": 2.0896168237996164, + "learning_rate": 5.524830587582263e-07, + "loss": 0.9147, + "step": 10601 + }, + { + "epoch": 0.7649074708704592, + "grad_norm": 2.50935767750919, + "learning_rate": 5.521605950622268e-07, + "loss": 0.9858, + "step": 10602 + }, + { + "epoch": 0.7649796183398867, + "grad_norm": 2.9746658009257554, + "learning_rate": 5.518382104273787e-07, + "loss": 0.7893, + "step": 10603 + }, + { + "epoch": 0.7650517658093142, + "grad_norm": 4.128378900290082, + "learning_rate": 5.515159048712836e-07, + "loss": 0.9698, + "step": 10604 + }, + { + "epoch": 0.7651239132787417, + "grad_norm": 2.7723996221810414, + "learning_rate": 5.511936784115442e-07, + "loss": 0.9191, + "step": 10605 + }, + { + "epoch": 0.7651960607481693, + "grad_norm": 4.843542082231117, + "learning_rate": 5.508715310657542e-07, + "loss": 0.8288, + "step": 10606 + }, + { + "epoch": 0.7652682082175968, + "grad_norm": 4.436796750502939, + "learning_rate": 5.505494628515053e-07, + "loss": 0.9534, + "step": 10607 + }, + { + "epoch": 0.7653403556870243, + "grad_norm": 2.7092234884839197, + "learning_rate": 5.502274737863846e-07, + "loss": 0.9131, + "step": 10608 + }, + { + "epoch": 0.7654125031564518, + "grad_norm": 2.136095029188628, + "learning_rate": 5.499055638879746e-07, + "loss": 0.8468, + "step": 10609 + }, + { + "epoch": 0.7654846506258793, + "grad_norm": 2.451772426275106, + "learning_rate": 5.495837331738538e-07, + "loss": 0.8605, + "step": 10610 + }, + { + "epoch": 0.7655567980953069, + "grad_norm": 2.603765248195851, + "learning_rate": 5.49261981661596e-07, + "loss": 0.9258, + "step": 10611 + }, + { + "epoch": 0.7656289455647343, + "grad_norm": 3.1244990137971302, + "learning_rate": 5.489403093687714e-07, + "loss": 0.9754, + "step": 10612 + }, + { + "epoch": 0.7657010930341618, + "grad_norm": 3.12405136968796, + "learning_rate": 5.486187163129445e-07, + "loss": 0.9602, + "step": 10613 + }, + { + "epoch": 0.7657732405035893, + "grad_norm": 2.604983077186487, + "learning_rate": 5.482972025116765e-07, + "loss": 0.954, + "step": 10614 + }, + { + "epoch": 0.7658453879730168, + "grad_norm": 2.876067087921642, + "learning_rate": 5.479757679825245e-07, + "loss": 0.8336, + "step": 10615 + }, + { + "epoch": 0.7659175354424443, + "grad_norm": 2.762157874692419, + "learning_rate": 5.476544127430405e-07, + "loss": 0.9434, + "step": 10616 + }, + { + "epoch": 0.7659896829118719, + "grad_norm": 3.079775295235129, + "learning_rate": 5.473331368107725e-07, + "loss": 0.9246, + "step": 10617 + }, + { + "epoch": 0.7660618303812994, + "grad_norm": 3.1410242842227674, + "learning_rate": 5.47011940203265e-07, + "loss": 0.9012, + "step": 10618 + }, + { + "epoch": 0.7661339778507269, + "grad_norm": 4.5239805643353925, + "learning_rate": 5.466908229380554e-07, + "loss": 0.9757, + "step": 10619 + }, + { + "epoch": 0.7662061253201544, + "grad_norm": 2.9254671750754353, + "learning_rate": 5.463697850326811e-07, + "loss": 0.8947, + "step": 10620 + }, + { + "epoch": 0.7662782727895819, + "grad_norm": 3.068669391513465, + "learning_rate": 5.460488265046713e-07, + "loss": 0.9319, + "step": 10621 + }, + { + "epoch": 0.7663504202590095, + "grad_norm": 2.5093045653532933, + "learning_rate": 5.457279473715526e-07, + "loss": 0.8887, + "step": 10622 + }, + { + "epoch": 0.766422567728437, + "grad_norm": 3.405451269732824, + "learning_rate": 5.454071476508473e-07, + "loss": 0.9372, + "step": 10623 + }, + { + "epoch": 0.7664947151978644, + "grad_norm": 2.033508102014411, + "learning_rate": 5.450864273600728e-07, + "loss": 0.9205, + "step": 10624 + }, + { + "epoch": 0.7665668626672919, + "grad_norm": 2.6390240573285215, + "learning_rate": 5.447657865167428e-07, + "loss": 0.9706, + "step": 10625 + }, + { + "epoch": 0.7666390101367194, + "grad_norm": 3.5158102707258267, + "learning_rate": 5.444452251383658e-07, + "loss": 0.9554, + "step": 10626 + }, + { + "epoch": 0.7667111576061469, + "grad_norm": 4.078793525709068, + "learning_rate": 5.441247432424476e-07, + "loss": 0.942, + "step": 10627 + }, + { + "epoch": 0.7667833050755745, + "grad_norm": 4.923581557132109, + "learning_rate": 5.438043408464866e-07, + "loss": 0.9351, + "step": 10628 + }, + { + "epoch": 0.766855452545002, + "grad_norm": 2.5676307485954464, + "learning_rate": 5.434840179679809e-07, + "loss": 0.819, + "step": 10629 + }, + { + "epoch": 0.7669276000144295, + "grad_norm": 2.1858711717970944, + "learning_rate": 5.431637746244204e-07, + "loss": 0.9554, + "step": 10630 + }, + { + "epoch": 0.766999747483857, + "grad_norm": 2.2643354231650075, + "learning_rate": 5.428436108332935e-07, + "loss": 0.8059, + "step": 10631 + }, + { + "epoch": 0.7670718949532845, + "grad_norm": 1.8425551600073409, + "learning_rate": 5.425235266120826e-07, + "loss": 0.882, + "step": 10632 + }, + { + "epoch": 0.7671440424227121, + "grad_norm": 3.0548598296978478, + "learning_rate": 5.422035219782666e-07, + "loss": 0.7205, + "step": 10633 + }, + { + "epoch": 0.7672161898921396, + "grad_norm": 2.6048576862318678, + "learning_rate": 5.418835969493196e-07, + "loss": 0.7982, + "step": 10634 + }, + { + "epoch": 0.7672883373615671, + "grad_norm": 2.2415249473652183, + "learning_rate": 5.415637515427123e-07, + "loss": 0.9072, + "step": 10635 + }, + { + "epoch": 0.7673604848309945, + "grad_norm": 2.524654602750582, + "learning_rate": 5.412439857759088e-07, + "loss": 0.9701, + "step": 10636 + }, + { + "epoch": 0.767432632300422, + "grad_norm": 2.3273887750291817, + "learning_rate": 5.409242996663714e-07, + "loss": 0.7932, + "step": 10637 + }, + { + "epoch": 0.7675047797698495, + "grad_norm": 2.716108771144752, + "learning_rate": 5.406046932315564e-07, + "loss": 0.8499, + "step": 10638 + }, + { + "epoch": 0.7675769272392771, + "grad_norm": 2.4646337452943397, + "learning_rate": 5.402851664889168e-07, + "loss": 0.9006, + "step": 10639 + }, + { + "epoch": 0.7676490747087046, + "grad_norm": 3.1859083502662764, + "learning_rate": 5.39965719455901e-07, + "loss": 0.8116, + "step": 10640 + }, + { + "epoch": 0.7677212221781321, + "grad_norm": 4.232783732944882, + "learning_rate": 5.396463521499515e-07, + "loss": 0.969, + "step": 10641 + }, + { + "epoch": 0.7677933696475596, + "grad_norm": 2.537226650003174, + "learning_rate": 5.393270645885095e-07, + "loss": 0.8622, + "step": 10642 + }, + { + "epoch": 0.7678655171169871, + "grad_norm": 1.975350049541105, + "learning_rate": 5.390078567890084e-07, + "loss": 0.8216, + "step": 10643 + }, + { + "epoch": 0.7679376645864147, + "grad_norm": 2.6931347881288876, + "learning_rate": 5.386887287688808e-07, + "loss": 0.9538, + "step": 10644 + }, + { + "epoch": 0.7680098120558422, + "grad_norm": 2.961466316769672, + "learning_rate": 5.383696805455515e-07, + "loss": 0.7866, + "step": 10645 + }, + { + "epoch": 0.7680819595252697, + "grad_norm": 3.98673301184404, + "learning_rate": 5.380507121364435e-07, + "loss": 0.9845, + "step": 10646 + }, + { + "epoch": 0.7681541069946971, + "grad_norm": 2.397302912148149, + "learning_rate": 5.377318235589739e-07, + "loss": 0.8082, + "step": 10647 + }, + { + "epoch": 0.7682262544641246, + "grad_norm": 2.0602138449032315, + "learning_rate": 5.374130148305563e-07, + "loss": 0.9082, + "step": 10648 + }, + { + "epoch": 0.7682984019335521, + "grad_norm": 2.935726360790783, + "learning_rate": 5.370942859685996e-07, + "loss": 0.9457, + "step": 10649 + }, + { + "epoch": 0.7683705494029797, + "grad_norm": 1.6820740534416423, + "learning_rate": 5.367756369905085e-07, + "loss": 0.8829, + "step": 10650 + }, + { + "epoch": 0.7684426968724072, + "grad_norm": 2.945986076254743, + "learning_rate": 5.364570679136839e-07, + "loss": 1.0162, + "step": 10651 + }, + { + "epoch": 0.7685148443418347, + "grad_norm": 4.72922893376747, + "learning_rate": 5.361385787555204e-07, + "loss": 0.8586, + "step": 10652 + }, + { + "epoch": 0.7685869918112622, + "grad_norm": 2.5321154586597916, + "learning_rate": 5.358201695334099e-07, + "loss": 0.8513, + "step": 10653 + }, + { + "epoch": 0.7686591392806897, + "grad_norm": 2.093988433470748, + "learning_rate": 5.355018402647397e-07, + "loss": 0.9344, + "step": 10654 + }, + { + "epoch": 0.7687312867501173, + "grad_norm": 2.1591963609925537, + "learning_rate": 5.351835909668929e-07, + "loss": 0.8508, + "step": 10655 + }, + { + "epoch": 0.7688034342195448, + "grad_norm": 3.3193454186757796, + "learning_rate": 5.348654216572475e-07, + "loss": 0.7876, + "step": 10656 + }, + { + "epoch": 0.7688755816889723, + "grad_norm": 3.1044780566484, + "learning_rate": 5.345473323531782e-07, + "loss": 0.9647, + "step": 10657 + }, + { + "epoch": 0.7689477291583998, + "grad_norm": 1.7755629788409208, + "learning_rate": 5.342293230720529e-07, + "loss": 0.8918, + "step": 10658 + }, + { + "epoch": 0.7690198766278272, + "grad_norm": 3.5885253079082564, + "learning_rate": 5.339113938312396e-07, + "loss": 0.8617, + "step": 10659 + }, + { + "epoch": 0.7690920240972547, + "grad_norm": 4.596836013680488, + "learning_rate": 5.33593544648097e-07, + "loss": 0.9439, + "step": 10660 + }, + { + "epoch": 0.7691641715666823, + "grad_norm": 3.382171217678212, + "learning_rate": 5.332757755399826e-07, + "loss": 0.9177, + "step": 10661 + }, + { + "epoch": 0.7692363190361098, + "grad_norm": 3.509828527669142, + "learning_rate": 5.329580865242484e-07, + "loss": 0.9478, + "step": 10662 + }, + { + "epoch": 0.7693084665055373, + "grad_norm": 3.1080047808773625, + "learning_rate": 5.326404776182422e-07, + "loss": 0.9338, + "step": 10663 + }, + { + "epoch": 0.7693806139749648, + "grad_norm": 1.9804369079317365, + "learning_rate": 5.323229488393084e-07, + "loss": 0.7685, + "step": 10664 + }, + { + "epoch": 0.7694527614443923, + "grad_norm": 3.0616457000607036, + "learning_rate": 5.320055002047839e-07, + "loss": 0.8437, + "step": 10665 + }, + { + "epoch": 0.7695249089138199, + "grad_norm": 3.2770883963113917, + "learning_rate": 5.316881317320058e-07, + "loss": 0.8793, + "step": 10666 + }, + { + "epoch": 0.7695970563832474, + "grad_norm": 2.54233384886375, + "learning_rate": 5.313708434383022e-07, + "loss": 0.9158, + "step": 10667 + }, + { + "epoch": 0.7696692038526749, + "grad_norm": 3.7170531945741567, + "learning_rate": 5.310536353410013e-07, + "loss": 0.8692, + "step": 10668 + }, + { + "epoch": 0.7697413513221024, + "grad_norm": 3.7549116393968225, + "learning_rate": 5.30736507457423e-07, + "loss": 0.927, + "step": 10669 + }, + { + "epoch": 0.7698134987915299, + "grad_norm": 2.743817054433156, + "learning_rate": 5.30419459804885e-07, + "loss": 0.9757, + "step": 10670 + }, + { + "epoch": 0.7698856462609573, + "grad_norm": 2.293799904428544, + "learning_rate": 5.301024924007e-07, + "loss": 0.9976, + "step": 10671 + }, + { + "epoch": 0.7699577937303849, + "grad_norm": 0.8076315051727384, + "learning_rate": 5.297856052621768e-07, + "loss": 0.8476, + "step": 10672 + }, + { + "epoch": 0.7700299411998124, + "grad_norm": 5.886355848966665, + "learning_rate": 5.294687984066189e-07, + "loss": 0.9951, + "step": 10673 + }, + { + "epoch": 0.7701020886692399, + "grad_norm": 2.111273017608763, + "learning_rate": 5.291520718513267e-07, + "loss": 0.8942, + "step": 10674 + }, + { + "epoch": 0.7701742361386674, + "grad_norm": 8.528683454513079, + "learning_rate": 5.288354256135945e-07, + "loss": 0.9514, + "step": 10675 + }, + { + "epoch": 0.7702463836080949, + "grad_norm": 6.297399607876974, + "learning_rate": 5.285188597107137e-07, + "loss": 0.9703, + "step": 10676 + }, + { + "epoch": 0.7703185310775225, + "grad_norm": 2.535524597211146, + "learning_rate": 5.282023741599706e-07, + "loss": 0.8451, + "step": 10677 + }, + { + "epoch": 0.77039067854695, + "grad_norm": 2.540737409556538, + "learning_rate": 5.278859689786477e-07, + "loss": 0.7894, + "step": 10678 + }, + { + "epoch": 0.7704628260163775, + "grad_norm": 3.0039552999591774, + "learning_rate": 5.275696441840227e-07, + "loss": 0.9629, + "step": 10679 + }, + { + "epoch": 0.770534973485805, + "grad_norm": 4.613924942523896, + "learning_rate": 5.272533997933676e-07, + "loss": 0.8221, + "step": 10680 + }, + { + "epoch": 0.7706071209552325, + "grad_norm": 2.4250486192057004, + "learning_rate": 5.269372358239539e-07, + "loss": 0.9639, + "step": 10681 + }, + { + "epoch": 0.7706792684246601, + "grad_norm": 3.5840265505437374, + "learning_rate": 5.26621152293043e-07, + "loss": 0.9127, + "step": 10682 + }, + { + "epoch": 0.7707514158940875, + "grad_norm": 3.51815935127266, + "learning_rate": 5.263051492178983e-07, + "loss": 0.8428, + "step": 10683 + }, + { + "epoch": 0.770823563363515, + "grad_norm": 2.323865642491628, + "learning_rate": 5.259892266157731e-07, + "loss": 0.7825, + "step": 10684 + }, + { + "epoch": 0.7708957108329425, + "grad_norm": 3.048507018594419, + "learning_rate": 5.256733845039198e-07, + "loss": 0.8056, + "step": 10685 + }, + { + "epoch": 0.77096785830237, + "grad_norm": 2.568280654687146, + "learning_rate": 5.253576228995851e-07, + "loss": 0.9388, + "step": 10686 + }, + { + "epoch": 0.7710400057717975, + "grad_norm": 2.6931107083302384, + "learning_rate": 5.250419418200118e-07, + "loss": 0.8923, + "step": 10687 + }, + { + "epoch": 0.7711121532412251, + "grad_norm": 2.3229827415294864, + "learning_rate": 5.247263412824386e-07, + "loss": 0.9322, + "step": 10688 + }, + { + "epoch": 0.7711843007106526, + "grad_norm": 2.4944643723112634, + "learning_rate": 5.244108213040972e-07, + "loss": 0.9233, + "step": 10689 + }, + { + "epoch": 0.7712564481800801, + "grad_norm": 2.602735741718133, + "learning_rate": 5.240953819022195e-07, + "loss": 0.9313, + "step": 10690 + }, + { + "epoch": 0.7713285956495076, + "grad_norm": 2.925618104753394, + "learning_rate": 5.23780023094029e-07, + "loss": 0.8936, + "step": 10691 + }, + { + "epoch": 0.7714007431189351, + "grad_norm": 3.2171844823303446, + "learning_rate": 5.234647448967469e-07, + "loss": 0.8196, + "step": 10692 + }, + { + "epoch": 0.7714728905883627, + "grad_norm": 2.7657274992316405, + "learning_rate": 5.231495473275887e-07, + "loss": 0.9574, + "step": 10693 + }, + { + "epoch": 0.7715450380577902, + "grad_norm": 62.33319929519902, + "learning_rate": 5.228344304037668e-07, + "loss": 0.9554, + "step": 10694 + }, + { + "epoch": 0.7716171855272176, + "grad_norm": 2.328157667916515, + "learning_rate": 5.225193941424883e-07, + "loss": 0.9414, + "step": 10695 + }, + { + "epoch": 0.7716893329966451, + "grad_norm": 2.615232004701596, + "learning_rate": 5.22204438560957e-07, + "loss": 0.9401, + "step": 10696 + }, + { + "epoch": 0.7717614804660726, + "grad_norm": 2.845938375013591, + "learning_rate": 5.218895636763693e-07, + "loss": 0.7695, + "step": 10697 + }, + { + "epoch": 0.7718336279355001, + "grad_norm": 3.383476068902792, + "learning_rate": 5.21574769505922e-07, + "loss": 0.9154, + "step": 10698 + }, + { + "epoch": 0.7719057754049277, + "grad_norm": 3.718285026652571, + "learning_rate": 5.212600560668032e-07, + "loss": 0.7913, + "step": 10699 + }, + { + "epoch": 0.7719779228743552, + "grad_norm": 2.330782199590144, + "learning_rate": 5.209454233761985e-07, + "loss": 0.8303, + "step": 10700 + }, + { + "epoch": 0.7720500703437827, + "grad_norm": 3.2265236323419413, + "learning_rate": 5.206308714512888e-07, + "loss": 0.8654, + "step": 10701 + }, + { + "epoch": 0.7721222178132102, + "grad_norm": 3.696890818545114, + "learning_rate": 5.203164003092511e-07, + "loss": 0.8978, + "step": 10702 + }, + { + "epoch": 0.7721943652826377, + "grad_norm": 2.661408206793628, + "learning_rate": 5.200020099672575e-07, + "loss": 0.9221, + "step": 10703 + }, + { + "epoch": 0.7722665127520653, + "grad_norm": 4.283698057288878, + "learning_rate": 5.196877004424743e-07, + "loss": 0.9449, + "step": 10704 + }, + { + "epoch": 0.7723386602214928, + "grad_norm": 5.260173386591628, + "learning_rate": 5.193734717520668e-07, + "loss": 0.9006, + "step": 10705 + }, + { + "epoch": 0.7724108076909202, + "grad_norm": 2.918704674182994, + "learning_rate": 5.190593239131926e-07, + "loss": 0.8946, + "step": 10706 + }, + { + "epoch": 0.7724829551603477, + "grad_norm": 3.985312916185753, + "learning_rate": 5.187452569430064e-07, + "loss": 0.8481, + "step": 10707 + }, + { + "epoch": 0.7725551026297752, + "grad_norm": 2.9601060812215767, + "learning_rate": 5.184312708586583e-07, + "loss": 1.013, + "step": 10708 + }, + { + "epoch": 0.7726272500992027, + "grad_norm": 3.58385731318533, + "learning_rate": 5.18117365677294e-07, + "loss": 0.8706, + "step": 10709 + }, + { + "epoch": 0.7726993975686303, + "grad_norm": 1.9579170948046227, + "learning_rate": 5.178035414160544e-07, + "loss": 0.9365, + "step": 10710 + }, + { + "epoch": 0.7727715450380578, + "grad_norm": 3.0856530167032306, + "learning_rate": 5.174897980920767e-07, + "loss": 0.9655, + "step": 10711 + }, + { + "epoch": 0.7728436925074853, + "grad_norm": 2.9746686863136476, + "learning_rate": 5.171761357224935e-07, + "loss": 0.9795, + "step": 10712 + }, + { + "epoch": 0.7729158399769128, + "grad_norm": 8.393142862297388, + "learning_rate": 5.168625543244318e-07, + "loss": 0.9489, + "step": 10713 + }, + { + "epoch": 0.7729879874463403, + "grad_norm": 3.019636421296658, + "learning_rate": 5.165490539150157e-07, + "loss": 0.9356, + "step": 10714 + }, + { + "epoch": 0.7730601349157679, + "grad_norm": 2.5582813800254196, + "learning_rate": 5.162356345113641e-07, + "loss": 0.9302, + "step": 10715 + }, + { + "epoch": 0.7731322823851954, + "grad_norm": 1.9884361101859376, + "learning_rate": 5.159222961305918e-07, + "loss": 0.9122, + "step": 10716 + }, + { + "epoch": 0.7732044298546229, + "grad_norm": 3.3298934035093906, + "learning_rate": 5.156090387898091e-07, + "loss": 0.856, + "step": 10717 + }, + { + "epoch": 0.7732765773240503, + "grad_norm": 1.8199641995524494, + "learning_rate": 5.152958625061223e-07, + "loss": 0.9922, + "step": 10718 + }, + { + "epoch": 0.7733487247934778, + "grad_norm": 2.867977688286262, + "learning_rate": 5.149827672966309e-07, + "loss": 0.9776, + "step": 10719 + }, + { + "epoch": 0.7734208722629053, + "grad_norm": 0.8162780369007924, + "learning_rate": 5.146697531784348e-07, + "loss": 0.8375, + "step": 10720 + }, + { + "epoch": 0.7734930197323329, + "grad_norm": 2.4889044587560356, + "learning_rate": 5.143568201686235e-07, + "loss": 0.8221, + "step": 10721 + }, + { + "epoch": 0.7735651672017604, + "grad_norm": 2.458822740449465, + "learning_rate": 5.140439682842879e-07, + "loss": 0.9918, + "step": 10722 + }, + { + "epoch": 0.7736373146711879, + "grad_norm": 2.6329687055212316, + "learning_rate": 5.137311975425098e-07, + "loss": 0.9203, + "step": 10723 + }, + { + "epoch": 0.7737094621406154, + "grad_norm": 2.1716298575610744, + "learning_rate": 5.13418507960369e-07, + "loss": 0.938, + "step": 10724 + }, + { + "epoch": 0.7737816096100429, + "grad_norm": 2.4167418194189705, + "learning_rate": 5.131058995549402e-07, + "loss": 0.939, + "step": 10725 + }, + { + "epoch": 0.7738537570794705, + "grad_norm": 2.2430756301952637, + "learning_rate": 5.12793372343294e-07, + "loss": 0.8662, + "step": 10726 + }, + { + "epoch": 0.773925904548898, + "grad_norm": 2.210599947661511, + "learning_rate": 5.124809263424969e-07, + "loss": 1.006, + "step": 10727 + }, + { + "epoch": 0.7739980520183255, + "grad_norm": 3.074341283195723, + "learning_rate": 5.121685615696094e-07, + "loss": 0.8649, + "step": 10728 + }, + { + "epoch": 0.774070199487753, + "grad_norm": 0.8605113059791228, + "learning_rate": 5.118562780416889e-07, + "loss": 0.8297, + "step": 10729 + }, + { + "epoch": 0.7741423469571804, + "grad_norm": 2.135225824031406, + "learning_rate": 5.11544075775788e-07, + "loss": 0.9127, + "step": 10730 + }, + { + "epoch": 0.774214494426608, + "grad_norm": 2.989197674021409, + "learning_rate": 5.11231954788955e-07, + "loss": 0.9378, + "step": 10731 + }, + { + "epoch": 0.7742866418960355, + "grad_norm": 0.7783877392595204, + "learning_rate": 5.10919915098234e-07, + "loss": 0.8362, + "step": 10732 + }, + { + "epoch": 0.774358789365463, + "grad_norm": 2.9151233041114795, + "learning_rate": 5.106079567206638e-07, + "loss": 0.9707, + "step": 10733 + }, + { + "epoch": 0.7744309368348905, + "grad_norm": 2.1084300997796293, + "learning_rate": 5.102960796732796e-07, + "loss": 0.9757, + "step": 10734 + }, + { + "epoch": 0.774503084304318, + "grad_norm": 1.928261419135098, + "learning_rate": 5.099842839731124e-07, + "loss": 0.898, + "step": 10735 + }, + { + "epoch": 0.7745752317737455, + "grad_norm": 2.226318881443388, + "learning_rate": 5.096725696371871e-07, + "loss": 1.0264, + "step": 10736 + }, + { + "epoch": 0.7746473792431731, + "grad_norm": 15.685854152956175, + "learning_rate": 5.093609366825256e-07, + "loss": 0.8238, + "step": 10737 + }, + { + "epoch": 0.7747195267126006, + "grad_norm": 4.002561940864139, + "learning_rate": 5.090493851261452e-07, + "loss": 0.983, + "step": 10738 + }, + { + "epoch": 0.7747916741820281, + "grad_norm": 2.5331654777464667, + "learning_rate": 5.087379149850588e-07, + "loss": 0.8599, + "step": 10739 + }, + { + "epoch": 0.7748638216514556, + "grad_norm": 2.410420160667461, + "learning_rate": 5.084265262762741e-07, + "loss": 0.9311, + "step": 10740 + }, + { + "epoch": 0.7749359691208831, + "grad_norm": 2.9414467777129953, + "learning_rate": 5.081152190167954e-07, + "loss": 0.8789, + "step": 10741 + }, + { + "epoch": 0.7750081165903105, + "grad_norm": 2.630477186506138, + "learning_rate": 5.078039932236224e-07, + "loss": 0.8827, + "step": 10742 + }, + { + "epoch": 0.7750802640597381, + "grad_norm": 4.0220669976965775, + "learning_rate": 5.074928489137482e-07, + "loss": 0.9513, + "step": 10743 + }, + { + "epoch": 0.7751524115291656, + "grad_norm": 6.254798267993222, + "learning_rate": 5.071817861041657e-07, + "loss": 1.0388, + "step": 10744 + }, + { + "epoch": 0.7752245589985931, + "grad_norm": 3.2202225066839265, + "learning_rate": 5.068708048118589e-07, + "loss": 0.9492, + "step": 10745 + }, + { + "epoch": 0.7752967064680206, + "grad_norm": 2.7243511337351567, + "learning_rate": 5.065599050538101e-07, + "loss": 0.8781, + "step": 10746 + }, + { + "epoch": 0.7753688539374481, + "grad_norm": 0.7267836777580545, + "learning_rate": 5.062490868469964e-07, + "loss": 0.7846, + "step": 10747 + }, + { + "epoch": 0.7754410014068757, + "grad_norm": 2.1224259887300474, + "learning_rate": 5.059383502083903e-07, + "loss": 0.8709, + "step": 10748 + }, + { + "epoch": 0.7755131488763032, + "grad_norm": 2.0288691726252104, + "learning_rate": 5.056276951549601e-07, + "loss": 0.9713, + "step": 10749 + }, + { + "epoch": 0.7755852963457307, + "grad_norm": 4.346989583110241, + "learning_rate": 5.053171217036702e-07, + "loss": 0.8261, + "step": 10750 + }, + { + "epoch": 0.7756574438151582, + "grad_norm": 2.8056853068178467, + "learning_rate": 5.050066298714784e-07, + "loss": 0.8682, + "step": 10751 + }, + { + "epoch": 0.7757295912845857, + "grad_norm": 3.4049653596838505, + "learning_rate": 5.046962196753404e-07, + "loss": 0.8443, + "step": 10752 + }, + { + "epoch": 0.7758017387540131, + "grad_norm": 2.52589245122806, + "learning_rate": 5.043858911322063e-07, + "loss": 1.0243, + "step": 10753 + }, + { + "epoch": 0.7758738862234407, + "grad_norm": 2.514527833879744, + "learning_rate": 5.040756442590222e-07, + "loss": 0.8456, + "step": 10754 + }, + { + "epoch": 0.7759460336928682, + "grad_norm": 2.7432272480429902, + "learning_rate": 5.037654790727295e-07, + "loss": 0.8765, + "step": 10755 + }, + { + "epoch": 0.7760181811622957, + "grad_norm": 2.1123443049552795, + "learning_rate": 5.03455395590265e-07, + "loss": 0.8564, + "step": 10756 + }, + { + "epoch": 0.7760903286317232, + "grad_norm": 2.4321922810171897, + "learning_rate": 5.031453938285619e-07, + "loss": 0.9098, + "step": 10757 + }, + { + "epoch": 0.7761624761011507, + "grad_norm": 3.447961044410092, + "learning_rate": 5.028354738045466e-07, + "loss": 0.8242, + "step": 10758 + }, + { + "epoch": 0.7762346235705783, + "grad_norm": 3.152515127824853, + "learning_rate": 5.02525635535145e-07, + "loss": 0.9179, + "step": 10759 + }, + { + "epoch": 0.7763067710400058, + "grad_norm": 2.9405367149859183, + "learning_rate": 5.022158790372744e-07, + "loss": 0.9334, + "step": 10760 + }, + { + "epoch": 0.7763789185094333, + "grad_norm": 2.564871830108244, + "learning_rate": 5.019062043278499e-07, + "loss": 0.7908, + "step": 10761 + }, + { + "epoch": 0.7764510659788608, + "grad_norm": 0.8973141821085391, + "learning_rate": 5.015966114237818e-07, + "loss": 0.8997, + "step": 10762 + }, + { + "epoch": 0.7765232134482883, + "grad_norm": 3.2051623308370205, + "learning_rate": 5.012871003419759e-07, + "loss": 0.9025, + "step": 10763 + }, + { + "epoch": 0.7765953609177159, + "grad_norm": 4.432336731564688, + "learning_rate": 5.009776710993334e-07, + "loss": 0.9279, + "step": 10764 + }, + { + "epoch": 0.7766675083871433, + "grad_norm": 2.0241138403092584, + "learning_rate": 5.006683237127511e-07, + "loss": 0.9185, + "step": 10765 + }, + { + "epoch": 0.7767396558565708, + "grad_norm": 2.322479469200358, + "learning_rate": 5.003590581991219e-07, + "loss": 1.0127, + "step": 10766 + }, + { + "epoch": 0.7768118033259983, + "grad_norm": 0.7676274329890643, + "learning_rate": 5.000498745753326e-07, + "loss": 0.7992, + "step": 10767 + }, + { + "epoch": 0.7768839507954258, + "grad_norm": 2.705064287391277, + "learning_rate": 4.997407728582668e-07, + "loss": 0.8572, + "step": 10768 + }, + { + "epoch": 0.7769560982648533, + "grad_norm": 2.409056871837871, + "learning_rate": 4.994317530648038e-07, + "loss": 0.9472, + "step": 10769 + }, + { + "epoch": 0.7770282457342809, + "grad_norm": 3.1019360680057733, + "learning_rate": 4.991228152118176e-07, + "loss": 0.8563, + "step": 10770 + }, + { + "epoch": 0.7771003932037084, + "grad_norm": 2.409892112564722, + "learning_rate": 4.988139593161787e-07, + "loss": 0.9988, + "step": 10771 + }, + { + "epoch": 0.7771725406731359, + "grad_norm": 2.3313119056807055, + "learning_rate": 4.985051853947526e-07, + "loss": 1.003, + "step": 10772 + }, + { + "epoch": 0.7772446881425634, + "grad_norm": 2.4340651347990536, + "learning_rate": 4.98196493464399e-07, + "loss": 1.031, + "step": 10773 + }, + { + "epoch": 0.7773168356119909, + "grad_norm": 8.371506602920073, + "learning_rate": 4.978878835419766e-07, + "loss": 0.8173, + "step": 10774 + }, + { + "epoch": 0.7773889830814185, + "grad_norm": 0.7336209260662018, + "learning_rate": 4.975793556443355e-07, + "loss": 0.7842, + "step": 10775 + }, + { + "epoch": 0.777461130550846, + "grad_norm": 2.3491849825188917, + "learning_rate": 4.972709097883241e-07, + "loss": 1.0089, + "step": 10776 + }, + { + "epoch": 0.7775332780202734, + "grad_norm": 2.205695634307048, + "learning_rate": 4.969625459907856e-07, + "loss": 0.8331, + "step": 10777 + }, + { + "epoch": 0.7776054254897009, + "grad_norm": 2.3520343091758944, + "learning_rate": 4.966542642685581e-07, + "loss": 0.9684, + "step": 10778 + }, + { + "epoch": 0.7776775729591284, + "grad_norm": 1.872630020231634, + "learning_rate": 4.963460646384763e-07, + "loss": 0.9829, + "step": 10779 + }, + { + "epoch": 0.777749720428556, + "grad_norm": 2.350701361351286, + "learning_rate": 4.960379471173692e-07, + "loss": 0.9487, + "step": 10780 + }, + { + "epoch": 0.7778218678979835, + "grad_norm": 2.466724675587567, + "learning_rate": 4.957299117220632e-07, + "loss": 0.9895, + "step": 10781 + }, + { + "epoch": 0.777894015367411, + "grad_norm": 2.290068421653487, + "learning_rate": 4.954219584693768e-07, + "loss": 0.9562, + "step": 10782 + }, + { + "epoch": 0.7779661628368385, + "grad_norm": 2.6727204351626703, + "learning_rate": 4.951140873761288e-07, + "loss": 0.8824, + "step": 10783 + }, + { + "epoch": 0.778038310306266, + "grad_norm": 2.424581283128808, + "learning_rate": 4.94806298459129e-07, + "loss": 0.8908, + "step": 10784 + }, + { + "epoch": 0.7781104577756935, + "grad_norm": 7.845940637352751, + "learning_rate": 4.944985917351852e-07, + "loss": 0.8565, + "step": 10785 + }, + { + "epoch": 0.7781826052451211, + "grad_norm": 3.4225163664332445, + "learning_rate": 4.941909672211e-07, + "loss": 0.9032, + "step": 10786 + }, + { + "epoch": 0.7782547527145486, + "grad_norm": 2.9801108369120706, + "learning_rate": 4.938834249336719e-07, + "loss": 0.974, + "step": 10787 + }, + { + "epoch": 0.7783269001839761, + "grad_norm": 2.800070019255221, + "learning_rate": 4.935759648896945e-07, + "loss": 0.8053, + "step": 10788 + }, + { + "epoch": 0.7783990476534035, + "grad_norm": 3.2993673874192195, + "learning_rate": 4.932685871059577e-07, + "loss": 0.9267, + "step": 10789 + }, + { + "epoch": 0.778471195122831, + "grad_norm": 2.3634484097971558, + "learning_rate": 4.92961291599245e-07, + "loss": 0.8861, + "step": 10790 + }, + { + "epoch": 0.7785433425922585, + "grad_norm": 2.6394946144888745, + "learning_rate": 4.92654078386337e-07, + "loss": 0.9921, + "step": 10791 + }, + { + "epoch": 0.7786154900616861, + "grad_norm": 10.209344017184966, + "learning_rate": 4.923469474840101e-07, + "loss": 1.0037, + "step": 10792 + }, + { + "epoch": 0.7786876375311136, + "grad_norm": 4.404343212755131, + "learning_rate": 4.920398989090353e-07, + "loss": 0.8855, + "step": 10793 + }, + { + "epoch": 0.7787597850005411, + "grad_norm": 2.045637149559093, + "learning_rate": 4.917329326781792e-07, + "loss": 0.8733, + "step": 10794 + }, + { + "epoch": 0.7788319324699686, + "grad_norm": 2.630011814946662, + "learning_rate": 4.914260488082042e-07, + "loss": 0.9331, + "step": 10795 + }, + { + "epoch": 0.7789040799393961, + "grad_norm": 2.070404050710156, + "learning_rate": 4.911192473158687e-07, + "loss": 0.8744, + "step": 10796 + }, + { + "epoch": 0.7789762274088237, + "grad_norm": 4.56370821070391, + "learning_rate": 4.908125282179245e-07, + "loss": 0.9466, + "step": 10797 + }, + { + "epoch": 0.7790483748782512, + "grad_norm": 0.7985181136970042, + "learning_rate": 4.905058915311223e-07, + "loss": 0.8491, + "step": 10798 + }, + { + "epoch": 0.7791205223476787, + "grad_norm": 2.5602244996934482, + "learning_rate": 4.901993372722046e-07, + "loss": 1.0309, + "step": 10799 + }, + { + "epoch": 0.7791926698171062, + "grad_norm": 3.8611302882266396, + "learning_rate": 4.898928654579124e-07, + "loss": 0.8575, + "step": 10800 + }, + { + "epoch": 0.7792648172865336, + "grad_norm": 3.3392317718741755, + "learning_rate": 4.895864761049804e-07, + "loss": 1.0105, + "step": 10801 + }, + { + "epoch": 0.7793369647559611, + "grad_norm": 4.0901494390146285, + "learning_rate": 4.892801692301396e-07, + "loss": 0.9205, + "step": 10802 + }, + { + "epoch": 0.7794091122253887, + "grad_norm": 3.637186022444444, + "learning_rate": 4.889739448501162e-07, + "loss": 0.956, + "step": 10803 + }, + { + "epoch": 0.7794812596948162, + "grad_norm": 4.0242658336494035, + "learning_rate": 4.886678029816323e-07, + "loss": 0.8978, + "step": 10804 + }, + { + "epoch": 0.7795534071642437, + "grad_norm": 3.061478892219792, + "learning_rate": 4.883617436414051e-07, + "loss": 1.0397, + "step": 10805 + }, + { + "epoch": 0.7796255546336712, + "grad_norm": 2.7169571149837175, + "learning_rate": 4.880557668461469e-07, + "loss": 0.7764, + "step": 10806 + }, + { + "epoch": 0.7796977021030987, + "grad_norm": 5.158986977785744, + "learning_rate": 4.877498726125661e-07, + "loss": 0.9037, + "step": 10807 + }, + { + "epoch": 0.7797698495725263, + "grad_norm": 3.1392369785304592, + "learning_rate": 4.874440609573667e-07, + "loss": 0.9746, + "step": 10808 + }, + { + "epoch": 0.7798419970419538, + "grad_norm": 0.6948893684731129, + "learning_rate": 4.871383318972477e-07, + "loss": 0.7629, + "step": 10809 + }, + { + "epoch": 0.7799141445113813, + "grad_norm": 2.1526308508609007, + "learning_rate": 4.868326854489042e-07, + "loss": 0.9294, + "step": 10810 + }, + { + "epoch": 0.7799862919808088, + "grad_norm": 3.5057628055119108, + "learning_rate": 4.865271216290267e-07, + "loss": 0.8782, + "step": 10811 + }, + { + "epoch": 0.7800584394502362, + "grad_norm": 4.0397757337661675, + "learning_rate": 4.862216404542992e-07, + "loss": 0.836, + "step": 10812 + }, + { + "epoch": 0.7801305869196637, + "grad_norm": 2.422341086546049, + "learning_rate": 4.859162419414051e-07, + "loss": 0.8885, + "step": 10813 + }, + { + "epoch": 0.7802027343890913, + "grad_norm": 2.6128871831587532, + "learning_rate": 4.856109261070198e-07, + "loss": 0.963, + "step": 10814 + }, + { + "epoch": 0.7802748818585188, + "grad_norm": 3.958080096258902, + "learning_rate": 4.853056929678156e-07, + "loss": 0.9616, + "step": 10815 + }, + { + "epoch": 0.7803470293279463, + "grad_norm": 2.440159544966049, + "learning_rate": 4.850005425404601e-07, + "loss": 0.9347, + "step": 10816 + }, + { + "epoch": 0.7804191767973738, + "grad_norm": 2.535314804785156, + "learning_rate": 4.84695474841617e-07, + "loss": 0.8588, + "step": 10817 + }, + { + "epoch": 0.7804913242668013, + "grad_norm": 3.107174655519832, + "learning_rate": 4.843904898879445e-07, + "loss": 0.9043, + "step": 10818 + }, + { + "epoch": 0.7805634717362289, + "grad_norm": 1.891193146894273, + "learning_rate": 4.840855876960965e-07, + "loss": 0.9176, + "step": 10819 + }, + { + "epoch": 0.7806356192056564, + "grad_norm": 4.456790526406267, + "learning_rate": 4.837807682827235e-07, + "loss": 0.9881, + "step": 10820 + }, + { + "epoch": 0.7807077666750839, + "grad_norm": 2.3851642947119083, + "learning_rate": 4.834760316644689e-07, + "loss": 0.8961, + "step": 10821 + }, + { + "epoch": 0.7807799141445114, + "grad_norm": 2.136915905659394, + "learning_rate": 4.831713778579754e-07, + "loss": 0.9245, + "step": 10822 + }, + { + "epoch": 0.7808520616139389, + "grad_norm": 1.965212595846821, + "learning_rate": 4.828668068798771e-07, + "loss": 0.9923, + "step": 10823 + }, + { + "epoch": 0.7809242090833663, + "grad_norm": 3.9261298333269172, + "learning_rate": 4.825623187468062e-07, + "loss": 0.7888, + "step": 10824 + }, + { + "epoch": 0.7809963565527939, + "grad_norm": 2.201031928365106, + "learning_rate": 4.822579134753899e-07, + "loss": 0.931, + "step": 10825 + }, + { + "epoch": 0.7810685040222214, + "grad_norm": 2.5203557048010397, + "learning_rate": 4.819535910822503e-07, + "loss": 0.9477, + "step": 10826 + }, + { + "epoch": 0.7811406514916489, + "grad_norm": 2.9378817695734774, + "learning_rate": 4.816493515840055e-07, + "loss": 0.9957, + "step": 10827 + }, + { + "epoch": 0.7812127989610764, + "grad_norm": 2.1342849958217607, + "learning_rate": 4.813451949972696e-07, + "loss": 0.9068, + "step": 10828 + }, + { + "epoch": 0.781284946430504, + "grad_norm": 2.3397755811947487, + "learning_rate": 4.8104112133865e-07, + "loss": 0.9274, + "step": 10829 + }, + { + "epoch": 0.7813570938999315, + "grad_norm": 2.3213460656061025, + "learning_rate": 4.807371306247518e-07, + "loss": 0.8801, + "step": 10830 + }, + { + "epoch": 0.781429241369359, + "grad_norm": 2.68179514218207, + "learning_rate": 4.804332228721748e-07, + "loss": 0.9904, + "step": 10831 + }, + { + "epoch": 0.7815013888387865, + "grad_norm": 4.283356753828287, + "learning_rate": 4.801293980975141e-07, + "loss": 1.0189, + "step": 10832 + }, + { + "epoch": 0.781573536308214, + "grad_norm": 2.0038022614801076, + "learning_rate": 4.798256563173608e-07, + "loss": 0.9766, + "step": 10833 + }, + { + "epoch": 0.7816456837776415, + "grad_norm": 2.479836979707104, + "learning_rate": 4.795219975483009e-07, + "loss": 1.0183, + "step": 10834 + }, + { + "epoch": 0.7817178312470691, + "grad_norm": 8.316642933456816, + "learning_rate": 4.792184218069166e-07, + "loss": 0.8902, + "step": 10835 + }, + { + "epoch": 0.7817899787164965, + "grad_norm": 0.710448547572707, + "learning_rate": 4.789149291097833e-07, + "loss": 0.8046, + "step": 10836 + }, + { + "epoch": 0.781862126185924, + "grad_norm": 2.0695385907951276, + "learning_rate": 4.786115194734763e-07, + "loss": 0.9471, + "step": 10837 + }, + { + "epoch": 0.7819342736553515, + "grad_norm": 2.1318997955893324, + "learning_rate": 4.783081929145614e-07, + "loss": 0.9864, + "step": 10838 + }, + { + "epoch": 0.782006421124779, + "grad_norm": 0.743217636763905, + "learning_rate": 4.78004949449603e-07, + "loss": 0.8291, + "step": 10839 + }, + { + "epoch": 0.7820785685942065, + "grad_norm": 3.8111838898847545, + "learning_rate": 4.777017890951603e-07, + "loss": 0.8807, + "step": 10840 + }, + { + "epoch": 0.7821507160636341, + "grad_norm": 2.6781789909983065, + "learning_rate": 4.773987118677873e-07, + "loss": 0.9184, + "step": 10841 + }, + { + "epoch": 0.7822228635330616, + "grad_norm": 4.631833569700797, + "learning_rate": 4.770957177840342e-07, + "loss": 0.8681, + "step": 10842 + }, + { + "epoch": 0.7822950110024891, + "grad_norm": 2.36148118752783, + "learning_rate": 4.7679280686044655e-07, + "loss": 0.9738, + "step": 10843 + }, + { + "epoch": 0.7823671584719166, + "grad_norm": 7.64855969402517, + "learning_rate": 4.764899791135655e-07, + "loss": 0.8639, + "step": 10844 + }, + { + "epoch": 0.7824393059413441, + "grad_norm": 2.8127193789223677, + "learning_rate": 4.7618723455992605e-07, + "loss": 1.0073, + "step": 10845 + }, + { + "epoch": 0.7825114534107717, + "grad_norm": 2.260333808282689, + "learning_rate": 4.75884573216061e-07, + "loss": 0.882, + "step": 10846 + }, + { + "epoch": 0.7825836008801992, + "grad_norm": 2.4736628841194226, + "learning_rate": 4.755819950984972e-07, + "loss": 1.0068, + "step": 10847 + }, + { + "epoch": 0.7826557483496266, + "grad_norm": 4.323260386333675, + "learning_rate": 4.7527950022375793e-07, + "loss": 0.8923, + "step": 10848 + }, + { + "epoch": 0.7827278958190541, + "grad_norm": 2.838558446618742, + "learning_rate": 4.7497708860835993e-07, + "loss": 1.0168, + "step": 10849 + }, + { + "epoch": 0.7828000432884816, + "grad_norm": 2.280039322329957, + "learning_rate": 4.7467476026881814e-07, + "loss": 0.7956, + "step": 10850 + }, + { + "epoch": 0.7828721907579091, + "grad_norm": 3.766864113710457, + "learning_rate": 4.743725152216418e-07, + "loss": 0.9101, + "step": 10851 + }, + { + "epoch": 0.7829443382273367, + "grad_norm": 2.653780698234828, + "learning_rate": 4.740703534833339e-07, + "loss": 1.0179, + "step": 10852 + }, + { + "epoch": 0.7830164856967642, + "grad_norm": 2.1470234604412357, + "learning_rate": 4.737682750703953e-07, + "loss": 0.8612, + "step": 10853 + }, + { + "epoch": 0.7830886331661917, + "grad_norm": 14.106066952837338, + "learning_rate": 4.7346627999932075e-07, + "loss": 0.9414, + "step": 10854 + }, + { + "epoch": 0.7831607806356192, + "grad_norm": 2.3793363636468197, + "learning_rate": 4.731643682866029e-07, + "loss": 0.9635, + "step": 10855 + }, + { + "epoch": 0.7832329281050467, + "grad_norm": 4.222278165167796, + "learning_rate": 4.7286253994872604e-07, + "loss": 0.9445, + "step": 10856 + }, + { + "epoch": 0.7833050755744743, + "grad_norm": 1.1817331325165708, + "learning_rate": 4.725607950021726e-07, + "loss": 0.8964, + "step": 10857 + }, + { + "epoch": 0.7833772230439018, + "grad_norm": 2.8482864582571423, + "learning_rate": 4.722591334634203e-07, + "loss": 0.7842, + "step": 10858 + }, + { + "epoch": 0.7834493705133292, + "grad_norm": 2.9939810454661115, + "learning_rate": 4.7195755534894075e-07, + "loss": 0.9489, + "step": 10859 + }, + { + "epoch": 0.7835215179827567, + "grad_norm": 2.6867473346756974, + "learning_rate": 4.716560606752018e-07, + "loss": 0.9135, + "step": 10860 + }, + { + "epoch": 0.7835936654521842, + "grad_norm": 2.26373412052443, + "learning_rate": 4.713546494586689e-07, + "loss": 1.029, + "step": 10861 + }, + { + "epoch": 0.7836658129216117, + "grad_norm": 2.7163157472408277, + "learning_rate": 4.71053321715799e-07, + "loss": 0.9425, + "step": 10862 + }, + { + "epoch": 0.7837379603910393, + "grad_norm": 2.3200201498998156, + "learning_rate": 4.7075207746304736e-07, + "loss": 0.9431, + "step": 10863 + }, + { + "epoch": 0.7838101078604668, + "grad_norm": 3.1760156223575904, + "learning_rate": 4.7045091671686354e-07, + "loss": 0.9685, + "step": 10864 + }, + { + "epoch": 0.7838822553298943, + "grad_norm": 2.5359485487306483, + "learning_rate": 4.701498394936938e-07, + "loss": 0.9806, + "step": 10865 + }, + { + "epoch": 0.7839544027993218, + "grad_norm": 2.5808379724214725, + "learning_rate": 4.6984884580997677e-07, + "loss": 0.9438, + "step": 10866 + }, + { + "epoch": 0.7840265502687493, + "grad_norm": 2.3844198840155175, + "learning_rate": 4.6954793568215035e-07, + "loss": 0.8573, + "step": 10867 + }, + { + "epoch": 0.7840986977381769, + "grad_norm": 2.2826773998433003, + "learning_rate": 4.6924710912664613e-07, + "loss": 0.8832, + "step": 10868 + }, + { + "epoch": 0.7841708452076044, + "grad_norm": 2.3621168562611494, + "learning_rate": 4.6894636615989004e-07, + "loss": 0.8679, + "step": 10869 + }, + { + "epoch": 0.7842429926770319, + "grad_norm": 2.5413029102947484, + "learning_rate": 4.686457067983054e-07, + "loss": 0.9522, + "step": 10870 + }, + { + "epoch": 0.7843151401464593, + "grad_norm": 2.520572417656868, + "learning_rate": 4.6834513105831017e-07, + "loss": 0.9479, + "step": 10871 + }, + { + "epoch": 0.7843872876158868, + "grad_norm": 2.0983791135844334, + "learning_rate": 4.6804463895631686e-07, + "loss": 0.903, + "step": 10872 + }, + { + "epoch": 0.7844594350853143, + "grad_norm": 2.585037213026287, + "learning_rate": 4.67744230508734e-07, + "loss": 0.8332, + "step": 10873 + }, + { + "epoch": 0.7845315825547419, + "grad_norm": 3.7673971351194253, + "learning_rate": 4.674439057319672e-07, + "loss": 0.8381, + "step": 10874 + }, + { + "epoch": 0.7846037300241694, + "grad_norm": 2.1536693886910574, + "learning_rate": 4.671436646424158e-07, + "loss": 0.9307, + "step": 10875 + }, + { + "epoch": 0.7846758774935969, + "grad_norm": 3.0739998842535945, + "learning_rate": 4.6684350725647404e-07, + "loss": 0.8982, + "step": 10876 + }, + { + "epoch": 0.7847480249630244, + "grad_norm": 2.307731351155334, + "learning_rate": 4.6654343359053274e-07, + "loss": 0.8891, + "step": 10877 + }, + { + "epoch": 0.784820172432452, + "grad_norm": 2.2718410064378878, + "learning_rate": 4.662434436609786e-07, + "loss": 1.0188, + "step": 10878 + }, + { + "epoch": 0.7848923199018795, + "grad_norm": 3.573456096058686, + "learning_rate": 4.659435374841909e-07, + "loss": 0.8281, + "step": 10879 + }, + { + "epoch": 0.784964467371307, + "grad_norm": 2.7819683561685062, + "learning_rate": 4.6564371507654823e-07, + "loss": 0.8659, + "step": 10880 + }, + { + "epoch": 0.7850366148407345, + "grad_norm": 3.2315456528128115, + "learning_rate": 4.6534397645442246e-07, + "loss": 0.861, + "step": 10881 + }, + { + "epoch": 0.785108762310162, + "grad_norm": 2.588251835788049, + "learning_rate": 4.650443216341815e-07, + "loss": 0.9383, + "step": 10882 + }, + { + "epoch": 0.7851809097795894, + "grad_norm": 2.251017340499674, + "learning_rate": 4.6474475063218753e-07, + "loss": 0.921, + "step": 10883 + }, + { + "epoch": 0.785253057249017, + "grad_norm": 2.0945954395149475, + "learning_rate": 4.644452634647986e-07, + "loss": 0.9718, + "step": 10884 + }, + { + "epoch": 0.7853252047184445, + "grad_norm": 2.719411396825483, + "learning_rate": 4.6414586014837074e-07, + "loss": 0.9443, + "step": 10885 + }, + { + "epoch": 0.785397352187872, + "grad_norm": 2.905900503493892, + "learning_rate": 4.638465406992511e-07, + "loss": 0.8969, + "step": 10886 + }, + { + "epoch": 0.7854694996572995, + "grad_norm": 2.8226411035101244, + "learning_rate": 4.635473051337855e-07, + "loss": 0.9876, + "step": 10887 + }, + { + "epoch": 0.785541647126727, + "grad_norm": 3.533591245439086, + "learning_rate": 4.6324815346831413e-07, + "loss": 0.8791, + "step": 10888 + }, + { + "epoch": 0.7856137945961545, + "grad_norm": 2.087253569716539, + "learning_rate": 4.62949085719172e-07, + "loss": 0.8253, + "step": 10889 + }, + { + "epoch": 0.7856859420655821, + "grad_norm": 1.9476712798186318, + "learning_rate": 4.626501019026894e-07, + "loss": 0.8428, + "step": 10890 + }, + { + "epoch": 0.7857580895350096, + "grad_norm": 2.385228867335953, + "learning_rate": 4.623512020351945e-07, + "loss": 0.9043, + "step": 10891 + }, + { + "epoch": 0.7858302370044371, + "grad_norm": 3.2415154553394605, + "learning_rate": 4.620523861330088e-07, + "loss": 0.8655, + "step": 10892 + }, + { + "epoch": 0.7859023844738646, + "grad_norm": 3.292171938460641, + "learning_rate": 4.617536542124483e-07, + "loss": 0.9446, + "step": 10893 + }, + { + "epoch": 0.7859745319432921, + "grad_norm": 0.9660466838182897, + "learning_rate": 4.6145500628982635e-07, + "loss": 0.8915, + "step": 10894 + }, + { + "epoch": 0.7860466794127196, + "grad_norm": 3.9690325779076425, + "learning_rate": 4.611564423814518e-07, + "loss": 0.9634, + "step": 10895 + }, + { + "epoch": 0.7861188268821471, + "grad_norm": 1.9006508691163095, + "learning_rate": 4.608579625036266e-07, + "loss": 0.9168, + "step": 10896 + }, + { + "epoch": 0.7861909743515746, + "grad_norm": 4.3696555054661, + "learning_rate": 4.6055956667264983e-07, + "loss": 0.869, + "step": 10897 + }, + { + "epoch": 0.7862631218210021, + "grad_norm": 2.3838720748680995, + "learning_rate": 4.602612549048168e-07, + "loss": 0.9588, + "step": 10898 + }, + { + "epoch": 0.7863352692904296, + "grad_norm": 2.9871806592530734, + "learning_rate": 4.599630272164172e-07, + "loss": 0.987, + "step": 10899 + }, + { + "epoch": 0.7864074167598571, + "grad_norm": 2.364280502040289, + "learning_rate": 4.596648836237353e-07, + "loss": 0.8776, + "step": 10900 + }, + { + "epoch": 0.7864795642292847, + "grad_norm": 3.1418084339430536, + "learning_rate": 4.5936682414305194e-07, + "loss": 0.8996, + "step": 10901 + }, + { + "epoch": 0.7865517116987122, + "grad_norm": 3.891924660168129, + "learning_rate": 4.590688487906436e-07, + "loss": 0.7821, + "step": 10902 + }, + { + "epoch": 0.7866238591681397, + "grad_norm": 3.1561889642535936, + "learning_rate": 4.5877095758278006e-07, + "loss": 0.9779, + "step": 10903 + }, + { + "epoch": 0.7866960066375672, + "grad_norm": 2.5675549774078292, + "learning_rate": 4.5847315053572954e-07, + "loss": 0.8606, + "step": 10904 + }, + { + "epoch": 0.7867681541069947, + "grad_norm": 2.7229843489259813, + "learning_rate": 4.5817542766575437e-07, + "loss": 0.9543, + "step": 10905 + }, + { + "epoch": 0.7868403015764222, + "grad_norm": 3.1379886018778236, + "learning_rate": 4.57877788989111e-07, + "loss": 1.0188, + "step": 10906 + }, + { + "epoch": 0.7869124490458497, + "grad_norm": 0.8111081305830844, + "learning_rate": 4.575802345220525e-07, + "loss": 0.8217, + "step": 10907 + }, + { + "epoch": 0.7869845965152772, + "grad_norm": 8.25217894476518, + "learning_rate": 4.5728276428082726e-07, + "loss": 0.9914, + "step": 10908 + }, + { + "epoch": 0.7870567439847047, + "grad_norm": 0.7029967297007218, + "learning_rate": 4.5698537828168036e-07, + "loss": 0.8067, + "step": 10909 + }, + { + "epoch": 0.7871288914541322, + "grad_norm": 3.251295858409556, + "learning_rate": 4.566880765408494e-07, + "loss": 0.9512, + "step": 10910 + }, + { + "epoch": 0.7872010389235597, + "grad_norm": 4.956435772756266, + "learning_rate": 4.563908590745693e-07, + "loss": 0.9373, + "step": 10911 + }, + { + "epoch": 0.7872731863929873, + "grad_norm": 3.235555829463267, + "learning_rate": 4.560937258990709e-07, + "loss": 0.8867, + "step": 10912 + }, + { + "epoch": 0.7873453338624148, + "grad_norm": 2.167662122882506, + "learning_rate": 4.557966770305781e-07, + "loss": 0.8578, + "step": 10913 + }, + { + "epoch": 0.7874174813318423, + "grad_norm": 2.435916875589205, + "learning_rate": 4.554997124853117e-07, + "loss": 0.9548, + "step": 10914 + }, + { + "epoch": 0.7874896288012698, + "grad_norm": 3.3629903102602445, + "learning_rate": 4.5520283227948964e-07, + "loss": 0.7979, + "step": 10915 + }, + { + "epoch": 0.7875617762706973, + "grad_norm": 2.8077952559020685, + "learning_rate": 4.549060364293218e-07, + "loss": 0.9089, + "step": 10916 + }, + { + "epoch": 0.7876339237401249, + "grad_norm": 2.5398302062045164, + "learning_rate": 4.5460932495101545e-07, + "loss": 0.8821, + "step": 10917 + }, + { + "epoch": 0.7877060712095523, + "grad_norm": 2.0895704999880156, + "learning_rate": 4.543126978607732e-07, + "loss": 0.9776, + "step": 10918 + }, + { + "epoch": 0.7877782186789798, + "grad_norm": 4.5390959063825544, + "learning_rate": 4.54016155174793e-07, + "loss": 0.8358, + "step": 10919 + }, + { + "epoch": 0.7878503661484073, + "grad_norm": 2.3676138012452377, + "learning_rate": 4.537196969092669e-07, + "loss": 0.8455, + "step": 10920 + }, + { + "epoch": 0.7879225136178348, + "grad_norm": 2.0735149446698093, + "learning_rate": 4.5342332308038346e-07, + "loss": 0.8762, + "step": 10921 + }, + { + "epoch": 0.7879946610872623, + "grad_norm": 2.6973330677835006, + "learning_rate": 4.5312703370432845e-07, + "loss": 0.9868, + "step": 10922 + }, + { + "epoch": 0.7880668085566899, + "grad_norm": 3.149029773066256, + "learning_rate": 4.528308287972791e-07, + "loss": 0.9782, + "step": 10923 + }, + { + "epoch": 0.7881389560261174, + "grad_norm": 3.551480030423111, + "learning_rate": 4.5253470837541074e-07, + "loss": 0.8916, + "step": 10924 + }, + { + "epoch": 0.7882111034955449, + "grad_norm": 2.532238237535754, + "learning_rate": 4.5223867245489346e-07, + "loss": 1.0049, + "step": 10925 + }, + { + "epoch": 0.7882832509649724, + "grad_norm": 10.150766247920492, + "learning_rate": 4.5194272105189314e-07, + "loss": 0.9443, + "step": 10926 + }, + { + "epoch": 0.7883553984344, + "grad_norm": 3.2576298285117318, + "learning_rate": 4.516468541825691e-07, + "loss": 0.838, + "step": 10927 + }, + { + "epoch": 0.7884275459038275, + "grad_norm": 2.1245853917177437, + "learning_rate": 4.513510718630791e-07, + "loss": 0.9329, + "step": 10928 + }, + { + "epoch": 0.788499693373255, + "grad_norm": 4.2579455433629185, + "learning_rate": 4.5105537410957464e-07, + "loss": 0.9211, + "step": 10929 + }, + { + "epoch": 0.7885718408426824, + "grad_norm": 2.207077052898616, + "learning_rate": 4.5075976093820167e-07, + "loss": 0.9286, + "step": 10930 + }, + { + "epoch": 0.7886439883121099, + "grad_norm": 4.284821517236198, + "learning_rate": 4.504642323651031e-07, + "loss": 0.9804, + "step": 10931 + }, + { + "epoch": 0.7887161357815374, + "grad_norm": 2.6437568826788733, + "learning_rate": 4.501687884064174e-07, + "loss": 0.9417, + "step": 10932 + }, + { + "epoch": 0.788788283250965, + "grad_norm": 3.4517319575062206, + "learning_rate": 4.498734290782755e-07, + "loss": 0.8049, + "step": 10933 + }, + { + "epoch": 0.7888604307203925, + "grad_norm": 2.3350270005488665, + "learning_rate": 4.495781543968078e-07, + "loss": 0.8446, + "step": 10934 + }, + { + "epoch": 0.78893257818982, + "grad_norm": 3.304972462734168, + "learning_rate": 4.492829643781377e-07, + "loss": 0.9181, + "step": 10935 + }, + { + "epoch": 0.7890047256592475, + "grad_norm": 1.8706827050665964, + "learning_rate": 4.489878590383849e-07, + "loss": 0.8327, + "step": 10936 + }, + { + "epoch": 0.789076873128675, + "grad_norm": 3.1987149280862948, + "learning_rate": 4.48692838393663e-07, + "loss": 0.8884, + "step": 10937 + }, + { + "epoch": 0.7891490205981025, + "grad_norm": 1.9388803056795945, + "learning_rate": 4.483979024600817e-07, + "loss": 0.9303, + "step": 10938 + }, + { + "epoch": 0.7892211680675301, + "grad_norm": 3.4760508643007593, + "learning_rate": 4.4810305125374825e-07, + "loss": 1.0157, + "step": 10939 + }, + { + "epoch": 0.7892933155369576, + "grad_norm": 2.8407424281391274, + "learning_rate": 4.478082847907618e-07, + "loss": 0.8637, + "step": 10940 + }, + { + "epoch": 0.7893654630063851, + "grad_norm": 1.991921081680218, + "learning_rate": 4.475136030872189e-07, + "loss": 0.9987, + "step": 10941 + }, + { + "epoch": 0.7894376104758125, + "grad_norm": 2.2980778457497326, + "learning_rate": 4.4721900615921117e-07, + "loss": 0.9994, + "step": 10942 + }, + { + "epoch": 0.78950975794524, + "grad_norm": 2.3307499776298264, + "learning_rate": 4.469244940228256e-07, + "loss": 0.8865, + "step": 10943 + }, + { + "epoch": 0.7895819054146676, + "grad_norm": 4.035896875854151, + "learning_rate": 4.466300666941432e-07, + "loss": 0.8243, + "step": 10944 + }, + { + "epoch": 0.7896540528840951, + "grad_norm": 2.180610926006773, + "learning_rate": 4.4633572418924295e-07, + "loss": 0.8594, + "step": 10945 + }, + { + "epoch": 0.7897262003535226, + "grad_norm": 3.224580243612734, + "learning_rate": 4.4604146652419784e-07, + "loss": 0.7628, + "step": 10946 + }, + { + "epoch": 0.7897983478229501, + "grad_norm": 2.3630547533185506, + "learning_rate": 4.457472937150752e-07, + "loss": 0.8872, + "step": 10947 + }, + { + "epoch": 0.7898704952923776, + "grad_norm": 3.2877560011830913, + "learning_rate": 4.454532057779392e-07, + "loss": 0.9406, + "step": 10948 + }, + { + "epoch": 0.7899426427618051, + "grad_norm": 2.745728730296563, + "learning_rate": 4.451592027288493e-07, + "loss": 0.9054, + "step": 10949 + }, + { + "epoch": 0.7900147902312327, + "grad_norm": 1.6983354048237445, + "learning_rate": 4.448652845838592e-07, + "loss": 0.9106, + "step": 10950 + }, + { + "epoch": 0.7900869377006602, + "grad_norm": 6.63673398182846, + "learning_rate": 4.445714513590184e-07, + "loss": 0.9778, + "step": 10951 + }, + { + "epoch": 0.7901590851700877, + "grad_norm": 2.881456671396136, + "learning_rate": 4.4427770307037307e-07, + "loss": 0.9413, + "step": 10952 + }, + { + "epoch": 0.7902312326395152, + "grad_norm": 1.999874349461799, + "learning_rate": 4.439840397339638e-07, + "loss": 0.9222, + "step": 10953 + }, + { + "epoch": 0.7903033801089426, + "grad_norm": 0.840567520899246, + "learning_rate": 4.4369046136582564e-07, + "loss": 0.7812, + "step": 10954 + }, + { + "epoch": 0.7903755275783702, + "grad_norm": 2.1369364346880366, + "learning_rate": 4.4339696798199e-07, + "loss": 0.9113, + "step": 10955 + }, + { + "epoch": 0.7904476750477977, + "grad_norm": 2.384257094441583, + "learning_rate": 4.43103559598484e-07, + "loss": 0.8972, + "step": 10956 + }, + { + "epoch": 0.7905198225172252, + "grad_norm": 2.750176684165659, + "learning_rate": 4.428102362313282e-07, + "loss": 0.7861, + "step": 10957 + }, + { + "epoch": 0.7905919699866527, + "grad_norm": 4.993217063606099, + "learning_rate": 4.4251699789654153e-07, + "loss": 0.8489, + "step": 10958 + }, + { + "epoch": 0.7906641174560802, + "grad_norm": 2.8983636317132784, + "learning_rate": 4.422238446101365e-07, + "loss": 0.818, + "step": 10959 + }, + { + "epoch": 0.7907362649255077, + "grad_norm": 4.146654568457254, + "learning_rate": 4.419307763881199e-07, + "loss": 0.9186, + "step": 10960 + }, + { + "epoch": 0.7908084123949353, + "grad_norm": 5.312520105660483, + "learning_rate": 4.416377932464961e-07, + "loss": 0.9954, + "step": 10961 + }, + { + "epoch": 0.7908805598643628, + "grad_norm": 2.9680777340706292, + "learning_rate": 4.4134489520126284e-07, + "loss": 0.937, + "step": 10962 + }, + { + "epoch": 0.7909527073337903, + "grad_norm": 2.6942785076506715, + "learning_rate": 4.4105208226841604e-07, + "loss": 0.9015, + "step": 10963 + }, + { + "epoch": 0.7910248548032178, + "grad_norm": 4.610640012787232, + "learning_rate": 4.407593544639434e-07, + "loss": 0.9406, + "step": 10964 + }, + { + "epoch": 0.7910970022726452, + "grad_norm": 3.0977484555850543, + "learning_rate": 4.404667118038303e-07, + "loss": 0.9749, + "step": 10965 + }, + { + "epoch": 0.7911691497420728, + "grad_norm": 3.1646682724510327, + "learning_rate": 4.401741543040576e-07, + "loss": 0.901, + "step": 10966 + }, + { + "epoch": 0.7912412972115003, + "grad_norm": 2.5282657126745, + "learning_rate": 4.398816819805993e-07, + "loss": 0.7814, + "step": 10967 + }, + { + "epoch": 0.7913134446809278, + "grad_norm": 2.7116072209490674, + "learning_rate": 4.395892948494264e-07, + "loss": 1.0156, + "step": 10968 + }, + { + "epoch": 0.7913855921503553, + "grad_norm": 2.4310534400228785, + "learning_rate": 4.3929699292650647e-07, + "loss": 0.8659, + "step": 10969 + }, + { + "epoch": 0.7914577396197828, + "grad_norm": 2.6489903595958846, + "learning_rate": 4.3900477622780063e-07, + "loss": 0.9297, + "step": 10970 + }, + { + "epoch": 0.7915298870892103, + "grad_norm": 3.772584933367011, + "learning_rate": 4.3871264476926463e-07, + "loss": 0.9229, + "step": 10971 + }, + { + "epoch": 0.7916020345586379, + "grad_norm": 8.338300775737235, + "learning_rate": 4.3842059856685163e-07, + "loss": 0.8995, + "step": 10972 + }, + { + "epoch": 0.7916741820280654, + "grad_norm": 2.533350226279035, + "learning_rate": 4.381286376365094e-07, + "loss": 0.8596, + "step": 10973 + }, + { + "epoch": 0.7917463294974929, + "grad_norm": 2.2718825643169507, + "learning_rate": 4.3783676199418005e-07, + "loss": 0.8775, + "step": 10974 + }, + { + "epoch": 0.7918184769669204, + "grad_norm": 2.1147842562325208, + "learning_rate": 4.375449716558015e-07, + "loss": 0.9215, + "step": 10975 + }, + { + "epoch": 0.791890624436348, + "grad_norm": 2.195825808643089, + "learning_rate": 4.372532666373092e-07, + "loss": 0.8381, + "step": 10976 + }, + { + "epoch": 0.7919627719057754, + "grad_norm": 3.0964773467479687, + "learning_rate": 4.3696164695463044e-07, + "loss": 0.913, + "step": 10977 + }, + { + "epoch": 0.7920349193752029, + "grad_norm": 3.834302627639063, + "learning_rate": 4.366701126236898e-07, + "loss": 0.8318, + "step": 10978 + }, + { + "epoch": 0.7921070668446304, + "grad_norm": 3.5946494925604684, + "learning_rate": 4.363786636604072e-07, + "loss": 0.8188, + "step": 10979 + }, + { + "epoch": 0.7921792143140579, + "grad_norm": 2.9934911054508055, + "learning_rate": 4.360873000806979e-07, + "loss": 0.9131, + "step": 10980 + }, + { + "epoch": 0.7922513617834854, + "grad_norm": 2.665170925592216, + "learning_rate": 4.357960219004708e-07, + "loss": 0.9039, + "step": 10981 + }, + { + "epoch": 0.792323509252913, + "grad_norm": 3.013839430239622, + "learning_rate": 4.355048291356329e-07, + "loss": 0.8863, + "step": 10982 + }, + { + "epoch": 0.7923956567223405, + "grad_norm": 2.538970194385853, + "learning_rate": 4.3521372180208524e-07, + "loss": 0.8603, + "step": 10983 + }, + { + "epoch": 0.792467804191768, + "grad_norm": 3.203814771862947, + "learning_rate": 4.3492269991572295e-07, + "loss": 0.9379, + "step": 10984 + }, + { + "epoch": 0.7925399516611955, + "grad_norm": 2.488897370095032, + "learning_rate": 4.3463176349243834e-07, + "loss": 1.0162, + "step": 10985 + }, + { + "epoch": 0.792612099130623, + "grad_norm": 2.3277051918388474, + "learning_rate": 4.3434091254811786e-07, + "loss": 0.851, + "step": 10986 + }, + { + "epoch": 0.7926842466000505, + "grad_norm": 3.0639234076505923, + "learning_rate": 4.340501470986455e-07, + "loss": 0.8613, + "step": 10987 + }, + { + "epoch": 0.7927563940694781, + "grad_norm": 2.3418450306831997, + "learning_rate": 4.3375946715989696e-07, + "loss": 0.7774, + "step": 10988 + }, + { + "epoch": 0.7928285415389055, + "grad_norm": 2.0979249238726476, + "learning_rate": 4.3346887274774577e-07, + "loss": 0.8985, + "step": 10989 + }, + { + "epoch": 0.792900689008333, + "grad_norm": 3.7469106028235792, + "learning_rate": 4.3317836387806104e-07, + "loss": 0.8968, + "step": 10990 + }, + { + "epoch": 0.7929728364777605, + "grad_norm": 2.623682236289346, + "learning_rate": 4.3288794056670517e-07, + "loss": 0.8403, + "step": 10991 + }, + { + "epoch": 0.793044983947188, + "grad_norm": 0.7662368587219575, + "learning_rate": 4.3259760282953704e-07, + "loss": 0.7831, + "step": 10992 + }, + { + "epoch": 0.7931171314166156, + "grad_norm": 2.7479384237550724, + "learning_rate": 4.3230735068241263e-07, + "loss": 0.9162, + "step": 10993 + }, + { + "epoch": 0.7931892788860431, + "grad_norm": 5.4337811962186064, + "learning_rate": 4.320171841411797e-07, + "loss": 0.8713, + "step": 10994 + }, + { + "epoch": 0.7932614263554706, + "grad_norm": 2.4305565558537436, + "learning_rate": 4.31727103221684e-07, + "loss": 0.9546, + "step": 10995 + }, + { + "epoch": 0.7933335738248981, + "grad_norm": 3.42750984173787, + "learning_rate": 4.3143710793976563e-07, + "loss": 0.9189, + "step": 10996 + }, + { + "epoch": 0.7934057212943256, + "grad_norm": 0.6184187570809095, + "learning_rate": 4.3114719831126067e-07, + "loss": 0.7375, + "step": 10997 + }, + { + "epoch": 0.7934778687637531, + "grad_norm": 2.6150534056540047, + "learning_rate": 4.30857374351999e-07, + "loss": 0.819, + "step": 10998 + }, + { + "epoch": 0.7935500162331807, + "grad_norm": 2.606224030655755, + "learning_rate": 4.305676360778068e-07, + "loss": 0.8838, + "step": 10999 + }, + { + "epoch": 0.7936221637026082, + "grad_norm": 2.375809380999189, + "learning_rate": 4.3027798350450717e-07, + "loss": 0.9555, + "step": 11000 + }, + { + "epoch": 0.7936943111720356, + "grad_norm": 0.7821379383950243, + "learning_rate": 4.2998841664791554e-07, + "loss": 0.8353, + "step": 11001 + }, + { + "epoch": 0.7937664586414631, + "grad_norm": 4.259521589324542, + "learning_rate": 4.296989355238445e-07, + "loss": 0.9257, + "step": 11002 + }, + { + "epoch": 0.7938386061108906, + "grad_norm": 1.9545153132648994, + "learning_rate": 4.294095401481019e-07, + "loss": 0.9375, + "step": 11003 + }, + { + "epoch": 0.7939107535803182, + "grad_norm": 2.480329951210144, + "learning_rate": 4.291202305364898e-07, + "loss": 0.8448, + "step": 11004 + }, + { + "epoch": 0.7939829010497457, + "grad_norm": 1.9526347040851384, + "learning_rate": 4.288310067048062e-07, + "loss": 0.966, + "step": 11005 + }, + { + "epoch": 0.7940550485191732, + "grad_norm": 2.783828739993407, + "learning_rate": 4.285418686688453e-07, + "loss": 0.8268, + "step": 11006 + }, + { + "epoch": 0.7941271959886007, + "grad_norm": 2.9251828976041883, + "learning_rate": 4.282528164443966e-07, + "loss": 0.9636, + "step": 11007 + }, + { + "epoch": 0.7941993434580282, + "grad_norm": 3.619883115788776, + "learning_rate": 4.279638500472422e-07, + "loss": 0.9324, + "step": 11008 + }, + { + "epoch": 0.7942714909274557, + "grad_norm": 2.915444054257996, + "learning_rate": 4.2767496949316274e-07, + "loss": 0.9109, + "step": 11009 + }, + { + "epoch": 0.7943436383968833, + "grad_norm": 2.569773701601253, + "learning_rate": 4.2738617479793326e-07, + "loss": 0.8982, + "step": 11010 + }, + { + "epoch": 0.7944157858663108, + "grad_norm": 2.5521018098953827, + "learning_rate": 4.270974659773219e-07, + "loss": 0.9413, + "step": 11011 + }, + { + "epoch": 0.7944879333357382, + "grad_norm": 3.142238676848516, + "learning_rate": 4.2680884304709575e-07, + "loss": 0.9299, + "step": 11012 + }, + { + "epoch": 0.7945600808051657, + "grad_norm": 3.336303404991256, + "learning_rate": 4.2652030602301514e-07, + "loss": 0.9459, + "step": 11013 + }, + { + "epoch": 0.7946322282745932, + "grad_norm": 2.849594314880526, + "learning_rate": 4.262318549208361e-07, + "loss": 0.8809, + "step": 11014 + }, + { + "epoch": 0.7947043757440208, + "grad_norm": 2.2179057704560656, + "learning_rate": 4.259434897563092e-07, + "loss": 0.9477, + "step": 11015 + }, + { + "epoch": 0.7947765232134483, + "grad_norm": 3.2421133745291484, + "learning_rate": 4.2565521054518074e-07, + "loss": 0.9059, + "step": 11016 + }, + { + "epoch": 0.7948486706828758, + "grad_norm": 3.3837838485259253, + "learning_rate": 4.2536701730319426e-07, + "loss": 0.9356, + "step": 11017 + }, + { + "epoch": 0.7949208181523033, + "grad_norm": 2.0535419011806857, + "learning_rate": 4.2507891004608543e-07, + "loss": 1.0224, + "step": 11018 + }, + { + "epoch": 0.7949929656217308, + "grad_norm": 2.0281749043254136, + "learning_rate": 4.2479088878958725e-07, + "loss": 0.8513, + "step": 11019 + }, + { + "epoch": 0.7950651130911583, + "grad_norm": 3.6532681034928896, + "learning_rate": 4.245029535494278e-07, + "loss": 0.9741, + "step": 11020 + }, + { + "epoch": 0.7951372605605859, + "grad_norm": 2.099662517496069, + "learning_rate": 4.2421510434132913e-07, + "loss": 0.9949, + "step": 11021 + }, + { + "epoch": 0.7952094080300134, + "grad_norm": 4.245254110721714, + "learning_rate": 4.2392734118101047e-07, + "loss": 0.9545, + "step": 11022 + }, + { + "epoch": 0.7952815554994409, + "grad_norm": 3.4782547507543233, + "learning_rate": 4.236396640841846e-07, + "loss": 0.8556, + "step": 11023 + }, + { + "epoch": 0.7953537029688683, + "grad_norm": 2.1596100653023624, + "learning_rate": 4.233520730665623e-07, + "loss": 0.9034, + "step": 11024 + }, + { + "epoch": 0.7954258504382958, + "grad_norm": 5.146241618444489, + "learning_rate": 4.230645681438463e-07, + "loss": 0.8512, + "step": 11025 + }, + { + "epoch": 0.7954979979077234, + "grad_norm": 2.0780918434194096, + "learning_rate": 4.227771493317365e-07, + "loss": 0.9676, + "step": 11026 + }, + { + "epoch": 0.7955701453771509, + "grad_norm": 2.6266560553160487, + "learning_rate": 4.224898166459283e-07, + "loss": 0.7783, + "step": 11027 + }, + { + "epoch": 0.7956422928465784, + "grad_norm": 0.6630946342785605, + "learning_rate": 4.2220257010211127e-07, + "loss": 0.792, + "step": 11028 + }, + { + "epoch": 0.7957144403160059, + "grad_norm": 2.4282681652463003, + "learning_rate": 4.219154097159703e-07, + "loss": 0.8763, + "step": 11029 + }, + { + "epoch": 0.7957865877854334, + "grad_norm": 2.3469231043125705, + "learning_rate": 4.2162833550318754e-07, + "loss": 0.9341, + "step": 11030 + }, + { + "epoch": 0.795858735254861, + "grad_norm": 2.6739158643840475, + "learning_rate": 4.2134134747943897e-07, + "loss": 0.8963, + "step": 11031 + }, + { + "epoch": 0.7959308827242885, + "grad_norm": 2.5507857365006488, + "learning_rate": 4.2105444566039485e-07, + "loss": 0.8197, + "step": 11032 + }, + { + "epoch": 0.796003030193716, + "grad_norm": 2.656449344111114, + "learning_rate": 4.2076763006172245e-07, + "loss": 0.9393, + "step": 11033 + }, + { + "epoch": 0.7960751776631435, + "grad_norm": 3.2271411737061175, + "learning_rate": 4.2048090069908414e-07, + "loss": 0.7986, + "step": 11034 + }, + { + "epoch": 0.796147325132571, + "grad_norm": 2.3651988926942176, + "learning_rate": 4.201942575881356e-07, + "loss": 0.8852, + "step": 11035 + }, + { + "epoch": 0.7962194726019984, + "grad_norm": 2.8721583047129458, + "learning_rate": 4.199077007445311e-07, + "loss": 0.9889, + "step": 11036 + }, + { + "epoch": 0.796291620071426, + "grad_norm": 3.0632370042512003, + "learning_rate": 4.196212301839181e-07, + "loss": 0.8812, + "step": 11037 + }, + { + "epoch": 0.7963637675408535, + "grad_norm": 2.0199462936362456, + "learning_rate": 4.19334845921939e-07, + "loss": 0.9227, + "step": 11038 + }, + { + "epoch": 0.796435915010281, + "grad_norm": 2.7307898860575914, + "learning_rate": 4.1904854797423253e-07, + "loss": 0.9088, + "step": 11039 + }, + { + "epoch": 0.7965080624797085, + "grad_norm": 4.100226949597791, + "learning_rate": 4.1876233635643165e-07, + "loss": 0.9669, + "step": 11040 + }, + { + "epoch": 0.796580209949136, + "grad_norm": 2.725578064806348, + "learning_rate": 4.1847621108416733e-07, + "loss": 0.888, + "step": 11041 + }, + { + "epoch": 0.7966523574185636, + "grad_norm": 2.620590684712933, + "learning_rate": 4.1819017217306186e-07, + "loss": 0.9861, + "step": 11042 + }, + { + "epoch": 0.7967245048879911, + "grad_norm": 3.2134260572084874, + "learning_rate": 4.1790421963873543e-07, + "loss": 0.9975, + "step": 11043 + }, + { + "epoch": 0.7967966523574186, + "grad_norm": 2.1028876115386423, + "learning_rate": 4.176183534968034e-07, + "loss": 0.9275, + "step": 11044 + }, + { + "epoch": 0.7968687998268461, + "grad_norm": 2.917930674272581, + "learning_rate": 4.173325737628748e-07, + "loss": 0.8863, + "step": 11045 + }, + { + "epoch": 0.7969409472962736, + "grad_norm": 0.6633781721744775, + "learning_rate": 4.170468804525553e-07, + "loss": 0.8095, + "step": 11046 + }, + { + "epoch": 0.7970130947657011, + "grad_norm": 2.3845319703755115, + "learning_rate": 4.167612735814465e-07, + "loss": 1.0738, + "step": 11047 + }, + { + "epoch": 0.7970852422351286, + "grad_norm": 3.268139177073874, + "learning_rate": 4.164757531651426e-07, + "loss": 0.9361, + "step": 11048 + }, + { + "epoch": 0.7971573897045561, + "grad_norm": 3.9191739329106396, + "learning_rate": 4.161903192192362e-07, + "loss": 0.9362, + "step": 11049 + }, + { + "epoch": 0.7972295371739836, + "grad_norm": 2.6779489466022146, + "learning_rate": 4.159049717593135e-07, + "loss": 0.9048, + "step": 11050 + }, + { + "epoch": 0.7973016846434111, + "grad_norm": 4.658062882781456, + "learning_rate": 4.156197108009567e-07, + "loss": 0.8889, + "step": 11051 + }, + { + "epoch": 0.7973738321128386, + "grad_norm": 2.6255616994889848, + "learning_rate": 4.153345363597418e-07, + "loss": 1.0311, + "step": 11052 + }, + { + "epoch": 0.7974459795822662, + "grad_norm": 2.064858677348616, + "learning_rate": 4.1504944845124104e-07, + "loss": 0.8792, + "step": 11053 + }, + { + "epoch": 0.7975181270516937, + "grad_norm": 2.3490767917804734, + "learning_rate": 4.1476444709102367e-07, + "loss": 0.9332, + "step": 11054 + }, + { + "epoch": 0.7975902745211212, + "grad_norm": 2.453763453504513, + "learning_rate": 4.1447953229465104e-07, + "loss": 0.9375, + "step": 11055 + }, + { + "epoch": 0.7976624219905487, + "grad_norm": 4.97754049946639, + "learning_rate": 4.1419470407768165e-07, + "loss": 0.8733, + "step": 11056 + }, + { + "epoch": 0.7977345694599762, + "grad_norm": 3.404648010389032, + "learning_rate": 4.13909962455669e-07, + "loss": 0.9801, + "step": 11057 + }, + { + "epoch": 0.7978067169294037, + "grad_norm": 5.556737937142186, + "learning_rate": 4.136253074441625e-07, + "loss": 0.8549, + "step": 11058 + }, + { + "epoch": 0.7978788643988312, + "grad_norm": 2.1724422455058714, + "learning_rate": 4.1334073905870426e-07, + "loss": 1.0154, + "step": 11059 + }, + { + "epoch": 0.7979510118682587, + "grad_norm": 3.823846785289092, + "learning_rate": 4.1305625731483505e-07, + "loss": 0.975, + "step": 11060 + }, + { + "epoch": 0.7980231593376862, + "grad_norm": 2.021668710458618, + "learning_rate": 4.127718622280898e-07, + "loss": 0.9052, + "step": 11061 + }, + { + "epoch": 0.7980953068071137, + "grad_norm": 2.8978980042994533, + "learning_rate": 4.124875538139967e-07, + "loss": 0.8445, + "step": 11062 + }, + { + "epoch": 0.7981674542765412, + "grad_norm": 2.4163930562667715, + "learning_rate": 4.1220333208808154e-07, + "loss": 0.895, + "step": 11063 + }, + { + "epoch": 0.7982396017459688, + "grad_norm": 3.260244801696981, + "learning_rate": 4.1191919706586507e-07, + "loss": 0.9435, + "step": 11064 + }, + { + "epoch": 0.7983117492153963, + "grad_norm": 1.884619261439721, + "learning_rate": 4.116351487628613e-07, + "loss": 0.922, + "step": 11065 + }, + { + "epoch": 0.7983838966848238, + "grad_norm": 0.7717611082122215, + "learning_rate": 4.1135118719458274e-07, + "loss": 0.8777, + "step": 11066 + }, + { + "epoch": 0.7984560441542513, + "grad_norm": 1.9771920509581435, + "learning_rate": 4.110673123765349e-07, + "loss": 0.9409, + "step": 11067 + }, + { + "epoch": 0.7985281916236788, + "grad_norm": 2.3167409413824283, + "learning_rate": 4.107835243242197e-07, + "loss": 0.9223, + "step": 11068 + }, + { + "epoch": 0.7986003390931063, + "grad_norm": 2.2080984050618686, + "learning_rate": 4.1049982305313244e-07, + "loss": 0.8739, + "step": 11069 + }, + { + "epoch": 0.7986724865625339, + "grad_norm": 2.851878694744672, + "learning_rate": 4.102162085787657e-07, + "loss": 0.9639, + "step": 11070 + }, + { + "epoch": 0.7987446340319613, + "grad_norm": 2.722653885652892, + "learning_rate": 4.099326809166075e-07, + "loss": 0.9564, + "step": 11071 + }, + { + "epoch": 0.7988167815013888, + "grad_norm": 2.2813119749246953, + "learning_rate": 4.0964924008213786e-07, + "loss": 0.85, + "step": 11072 + }, + { + "epoch": 0.7988889289708163, + "grad_norm": 3.103819671967318, + "learning_rate": 4.093658860908367e-07, + "loss": 0.9686, + "step": 11073 + }, + { + "epoch": 0.7989610764402438, + "grad_norm": 4.3995594064157215, + "learning_rate": 4.090826189581762e-07, + "loss": 0.9311, + "step": 11074 + }, + { + "epoch": 0.7990332239096714, + "grad_norm": 5.1047005925566715, + "learning_rate": 4.0879943869962495e-07, + "loss": 0.9918, + "step": 11075 + }, + { + "epoch": 0.7991053713790989, + "grad_norm": 1.8928735146238647, + "learning_rate": 4.085163453306455e-07, + "loss": 1.0156, + "step": 11076 + }, + { + "epoch": 0.7991775188485264, + "grad_norm": 3.102147890285556, + "learning_rate": 4.082333388666963e-07, + "loss": 0.8126, + "step": 11077 + }, + { + "epoch": 0.7992496663179539, + "grad_norm": 2.5962782338222983, + "learning_rate": 4.0795041932323305e-07, + "loss": 0.9406, + "step": 11078 + }, + { + "epoch": 0.7993218137873814, + "grad_norm": 0.7310753409963652, + "learning_rate": 4.076675867157033e-07, + "loss": 0.8079, + "step": 11079 + }, + { + "epoch": 0.799393961256809, + "grad_norm": 5.34990351224506, + "learning_rate": 4.0738484105955216e-07, + "loss": 0.9618, + "step": 11080 + }, + { + "epoch": 0.7994661087262365, + "grad_norm": 4.522424139648663, + "learning_rate": 4.0710218237021964e-07, + "loss": 0.876, + "step": 11081 + }, + { + "epoch": 0.799538256195664, + "grad_norm": 2.6995394525868663, + "learning_rate": 4.0681961066313966e-07, + "loss": 0.9033, + "step": 11082 + }, + { + "epoch": 0.7996104036650914, + "grad_norm": 2.685667788471191, + "learning_rate": 4.0653712595374246e-07, + "loss": 0.9559, + "step": 11083 + }, + { + "epoch": 0.7996825511345189, + "grad_norm": 3.3777912042027483, + "learning_rate": 4.062547282574547e-07, + "loss": 0.9071, + "step": 11084 + }, + { + "epoch": 0.7997546986039464, + "grad_norm": 2.142504458695427, + "learning_rate": 4.0597241758969656e-07, + "loss": 0.8897, + "step": 11085 + }, + { + "epoch": 0.799826846073374, + "grad_norm": 2.093347994799609, + "learning_rate": 4.056901939658837e-07, + "loss": 0.9119, + "step": 11086 + }, + { + "epoch": 0.7998989935428015, + "grad_norm": 3.7009196092295076, + "learning_rate": 4.0540805740142715e-07, + "loss": 0.8846, + "step": 11087 + }, + { + "epoch": 0.799971141012229, + "grad_norm": 1.9973033964874338, + "learning_rate": 4.0512600791173424e-07, + "loss": 0.9417, + "step": 11088 + }, + { + "epoch": 0.8000432884816565, + "grad_norm": 4.148208985437599, + "learning_rate": 4.0484404551220487e-07, + "loss": 0.9334, + "step": 11089 + }, + { + "epoch": 0.800115435951084, + "grad_norm": 2.43267540419851, + "learning_rate": 4.0456217021823756e-07, + "loss": 0.9255, + "step": 11090 + }, + { + "epoch": 0.8001875834205116, + "grad_norm": 1.8795036270378576, + "learning_rate": 4.0428038204522476e-07, + "loss": 0.9161, + "step": 11091 + }, + { + "epoch": 0.8002597308899391, + "grad_norm": 2.327926627290809, + "learning_rate": 4.039986810085525e-07, + "loss": 0.9612, + "step": 11092 + }, + { + "epoch": 0.8003318783593666, + "grad_norm": 2.1090631325082514, + "learning_rate": 4.037170671236041e-07, + "loss": 0.9609, + "step": 11093 + }, + { + "epoch": 0.8004040258287941, + "grad_norm": 1.8917422185702248, + "learning_rate": 4.0343554040575746e-07, + "loss": 1.0092, + "step": 11094 + }, + { + "epoch": 0.8004761732982215, + "grad_norm": 3.3119109007787735, + "learning_rate": 4.031541008703863e-07, + "loss": 0.8589, + "step": 11095 + }, + { + "epoch": 0.800548320767649, + "grad_norm": 3.7394694928167977, + "learning_rate": 4.028727485328571e-07, + "loss": 0.9146, + "step": 11096 + }, + { + "epoch": 0.8006204682370766, + "grad_norm": 2.9551470257478933, + "learning_rate": 4.025914834085353e-07, + "loss": 0.9063, + "step": 11097 + }, + { + "epoch": 0.8006926157065041, + "grad_norm": 1.9357972199707059, + "learning_rate": 4.023103055127799e-07, + "loss": 0.7794, + "step": 11098 + }, + { + "epoch": 0.8007647631759316, + "grad_norm": 2.4184577706218584, + "learning_rate": 4.020292148609437e-07, + "loss": 0.8823, + "step": 11099 + }, + { + "epoch": 0.8008369106453591, + "grad_norm": 3.2187931094940034, + "learning_rate": 4.017482114683763e-07, + "loss": 0.8114, + "step": 11100 + }, + { + "epoch": 0.8009090581147866, + "grad_norm": 2.976448757771852, + "learning_rate": 4.0146729535042234e-07, + "loss": 0.9153, + "step": 11101 + }, + { + "epoch": 0.8009812055842142, + "grad_norm": 3.674330649110684, + "learning_rate": 4.011864665224227e-07, + "loss": 0.9321, + "step": 11102 + }, + { + "epoch": 0.8010533530536417, + "grad_norm": 20.110143639399137, + "learning_rate": 4.009057249997112e-07, + "loss": 0.9803, + "step": 11103 + }, + { + "epoch": 0.8011255005230692, + "grad_norm": 2.4684834396737116, + "learning_rate": 4.0062507079761823e-07, + "loss": 0.9377, + "step": 11104 + }, + { + "epoch": 0.8011976479924967, + "grad_norm": 2.6846957102851956, + "learning_rate": 4.003445039314699e-07, + "loss": 0.933, + "step": 11105 + }, + { + "epoch": 0.8012697954619242, + "grad_norm": 2.630462140719951, + "learning_rate": 4.0006402441658624e-07, + "loss": 0.8709, + "step": 11106 + }, + { + "epoch": 0.8013419429313516, + "grad_norm": 2.436207061592743, + "learning_rate": 3.997836322682828e-07, + "loss": 0.9613, + "step": 11107 + }, + { + "epoch": 0.8014140904007792, + "grad_norm": 2.507276531237721, + "learning_rate": 3.995033275018724e-07, + "loss": 0.9369, + "step": 11108 + }, + { + "epoch": 0.8014862378702067, + "grad_norm": 2.5069997070458405, + "learning_rate": 3.9922311013266e-07, + "loss": 0.8288, + "step": 11109 + }, + { + "epoch": 0.8015583853396342, + "grad_norm": 2.6217889673897035, + "learning_rate": 3.9894298017594765e-07, + "loss": 0.8934, + "step": 11110 + }, + { + "epoch": 0.8016305328090617, + "grad_norm": 3.6648531388109857, + "learning_rate": 3.9866293764703207e-07, + "loss": 0.9242, + "step": 11111 + }, + { + "epoch": 0.8017026802784892, + "grad_norm": 3.1542395807973826, + "learning_rate": 3.983829825612062e-07, + "loss": 0.8594, + "step": 11112 + }, + { + "epoch": 0.8017748277479168, + "grad_norm": 3.026393342497499, + "learning_rate": 3.981031149337557e-07, + "loss": 0.9656, + "step": 11113 + }, + { + "epoch": 0.8018469752173443, + "grad_norm": 3.4065541822770924, + "learning_rate": 3.978233347799644e-07, + "loss": 0.9197, + "step": 11114 + }, + { + "epoch": 0.8019191226867718, + "grad_norm": 2.1775164987002547, + "learning_rate": 3.975436421151104e-07, + "loss": 0.9479, + "step": 11115 + }, + { + "epoch": 0.8019912701561993, + "grad_norm": 4.454536936934048, + "learning_rate": 3.9726403695446555e-07, + "loss": 0.9504, + "step": 11116 + }, + { + "epoch": 0.8020634176256268, + "grad_norm": 2.0335965027938894, + "learning_rate": 3.9698451931329836e-07, + "loss": 0.968, + "step": 11117 + }, + { + "epoch": 0.8021355650950542, + "grad_norm": 3.889472514969369, + "learning_rate": 3.967050892068724e-07, + "loss": 1.0129, + "step": 11118 + }, + { + "epoch": 0.8022077125644818, + "grad_norm": 3.9491304865028543, + "learning_rate": 3.96425746650447e-07, + "loss": 0.9079, + "step": 11119 + }, + { + "epoch": 0.8022798600339093, + "grad_norm": 2.720226807534609, + "learning_rate": 3.9614649165927426e-07, + "loss": 0.7805, + "step": 11120 + }, + { + "epoch": 0.8023520075033368, + "grad_norm": 2.5780754315784096, + "learning_rate": 3.958673242486048e-07, + "loss": 1.0062, + "step": 11121 + }, + { + "epoch": 0.8024241549727643, + "grad_norm": 2.6761934951312036, + "learning_rate": 3.955882444336831e-07, + "loss": 0.8082, + "step": 11122 + }, + { + "epoch": 0.8024963024421918, + "grad_norm": 2.4417476323822127, + "learning_rate": 3.953092522297474e-07, + "loss": 0.8244, + "step": 11123 + }, + { + "epoch": 0.8025684499116194, + "grad_norm": 1.8551788424832945, + "learning_rate": 3.9503034765203315e-07, + "loss": 0.8135, + "step": 11124 + }, + { + "epoch": 0.8026405973810469, + "grad_norm": 5.161188523380729, + "learning_rate": 3.9475153071577074e-07, + "loss": 1.019, + "step": 11125 + }, + { + "epoch": 0.8027127448504744, + "grad_norm": 2.364049663622309, + "learning_rate": 3.9447280143618375e-07, + "loss": 0.9027, + "step": 11126 + }, + { + "epoch": 0.8027848923199019, + "grad_norm": 3.0042273618904334, + "learning_rate": 3.9419415982849416e-07, + "loss": 0.9255, + "step": 11127 + }, + { + "epoch": 0.8028570397893294, + "grad_norm": 0.9640785302509567, + "learning_rate": 3.9391560590791695e-07, + "loss": 0.7074, + "step": 11128 + }, + { + "epoch": 0.802929187258757, + "grad_norm": 4.82706457054016, + "learning_rate": 3.936371396896634e-07, + "loss": 0.945, + "step": 11129 + }, + { + "epoch": 0.8030013347281844, + "grad_norm": 2.2815332106280484, + "learning_rate": 3.9335876118893883e-07, + "loss": 0.9596, + "step": 11130 + }, + { + "epoch": 0.8030734821976119, + "grad_norm": 2.25328491743919, + "learning_rate": 3.9308047042094407e-07, + "loss": 1.0046, + "step": 11131 + }, + { + "epoch": 0.8031456296670394, + "grad_norm": 3.252357654784638, + "learning_rate": 3.928022674008773e-07, + "loss": 0.8264, + "step": 11132 + }, + { + "epoch": 0.8032177771364669, + "grad_norm": 2.4998639069707864, + "learning_rate": 3.925241521439284e-07, + "loss": 0.9557, + "step": 11133 + }, + { + "epoch": 0.8032899246058944, + "grad_norm": 7.35588700558376, + "learning_rate": 3.9224612466528504e-07, + "loss": 0.9672, + "step": 11134 + }, + { + "epoch": 0.803362072075322, + "grad_norm": 2.43743368205306, + "learning_rate": 3.919681849801295e-07, + "loss": 0.9291, + "step": 11135 + }, + { + "epoch": 0.8034342195447495, + "grad_norm": 3.42042350466926, + "learning_rate": 3.916903331036383e-07, + "loss": 0.9175, + "step": 11136 + }, + { + "epoch": 0.803506367014177, + "grad_norm": 4.100184850479971, + "learning_rate": 3.9141256905098353e-07, + "loss": 0.854, + "step": 11137 + }, + { + "epoch": 0.8035785144836045, + "grad_norm": 1.9043675492694605, + "learning_rate": 3.9113489283733436e-07, + "loss": 0.9199, + "step": 11138 + }, + { + "epoch": 0.803650661953032, + "grad_norm": 2.8714418368924437, + "learning_rate": 3.9085730447785317e-07, + "loss": 0.8474, + "step": 11139 + }, + { + "epoch": 0.8037228094224595, + "grad_norm": 0.7242820391114313, + "learning_rate": 3.905798039876973e-07, + "loss": 0.8242, + "step": 11140 + }, + { + "epoch": 0.8037949568918871, + "grad_norm": 2.310497757429113, + "learning_rate": 3.903023913820205e-07, + "loss": 0.968, + "step": 11141 + }, + { + "epoch": 0.8038671043613145, + "grad_norm": 2.558447634186598, + "learning_rate": 3.900250666759717e-07, + "loss": 0.9857, + "step": 11142 + }, + { + "epoch": 0.803939251830742, + "grad_norm": 2.523837125803773, + "learning_rate": 3.897478298846935e-07, + "loss": 0.8914, + "step": 11143 + }, + { + "epoch": 0.8040113993001695, + "grad_norm": 2.821242583285797, + "learning_rate": 3.8947068102332505e-07, + "loss": 0.9495, + "step": 11144 + }, + { + "epoch": 0.804083546769597, + "grad_norm": 2.3294074091641104, + "learning_rate": 3.8919362010700097e-07, + "loss": 1.0468, + "step": 11145 + }, + { + "epoch": 0.8041556942390246, + "grad_norm": 0.6569213158333561, + "learning_rate": 3.8891664715085093e-07, + "loss": 0.7518, + "step": 11146 + }, + { + "epoch": 0.8042278417084521, + "grad_norm": 4.411975360403835, + "learning_rate": 3.886397621699984e-07, + "loss": 0.9053, + "step": 11147 + }, + { + "epoch": 0.8042999891778796, + "grad_norm": 2.791424005314762, + "learning_rate": 3.8836296517956323e-07, + "loss": 0.8947, + "step": 11148 + }, + { + "epoch": 0.8043721366473071, + "grad_norm": 2.7384579066868193, + "learning_rate": 3.8808625619466093e-07, + "loss": 0.7999, + "step": 11149 + }, + { + "epoch": 0.8044442841167346, + "grad_norm": 3.119197799550347, + "learning_rate": 3.8780963523040014e-07, + "loss": 0.8717, + "step": 11150 + }, + { + "epoch": 0.8045164315861622, + "grad_norm": 2.990094680562886, + "learning_rate": 3.875331023018875e-07, + "loss": 0.9331, + "step": 11151 + }, + { + "epoch": 0.8045885790555897, + "grad_norm": 2.0454201223386055, + "learning_rate": 3.872566574242235e-07, + "loss": 0.7915, + "step": 11152 + }, + { + "epoch": 0.8046607265250172, + "grad_norm": 0.6592516372408624, + "learning_rate": 3.86980300612503e-07, + "loss": 0.7603, + "step": 11153 + }, + { + "epoch": 0.8047328739944446, + "grad_norm": 2.4285497424014935, + "learning_rate": 3.867040318818169e-07, + "loss": 0.8857, + "step": 11154 + }, + { + "epoch": 0.8048050214638721, + "grad_norm": 2.7515728527353063, + "learning_rate": 3.8642785124725076e-07, + "loss": 0.9349, + "step": 11155 + }, + { + "epoch": 0.8048771689332996, + "grad_norm": 1.7793851091468205, + "learning_rate": 3.861517587238876e-07, + "loss": 0.9472, + "step": 11156 + }, + { + "epoch": 0.8049493164027272, + "grad_norm": 2.632355692821598, + "learning_rate": 3.858757543268021e-07, + "loss": 0.9048, + "step": 11157 + }, + { + "epoch": 0.8050214638721547, + "grad_norm": 3.8851571631339414, + "learning_rate": 3.8559983807106634e-07, + "loss": 0.7847, + "step": 11158 + }, + { + "epoch": 0.8050936113415822, + "grad_norm": 3.0831770126994305, + "learning_rate": 3.8532400997174764e-07, + "loss": 0.9365, + "step": 11159 + }, + { + "epoch": 0.8051657588110097, + "grad_norm": 3.2279685145926025, + "learning_rate": 3.8504827004390706e-07, + "loss": 0.8983, + "step": 11160 + }, + { + "epoch": 0.8052379062804372, + "grad_norm": 2.7597267488304795, + "learning_rate": 3.8477261830260145e-07, + "loss": 0.8404, + "step": 11161 + }, + { + "epoch": 0.8053100537498648, + "grad_norm": 2.4300263098089134, + "learning_rate": 3.844970547628843e-07, + "loss": 0.9538, + "step": 11162 + }, + { + "epoch": 0.8053822012192923, + "grad_norm": 2.140992404566142, + "learning_rate": 3.8422157943980316e-07, + "loss": 0.9756, + "step": 11163 + }, + { + "epoch": 0.8054543486887198, + "grad_norm": 2.9140548092328977, + "learning_rate": 3.839461923483995e-07, + "loss": 0.8664, + "step": 11164 + }, + { + "epoch": 0.8055264961581472, + "grad_norm": 2.7450595172481895, + "learning_rate": 3.836708935037121e-07, + "loss": 0.9821, + "step": 11165 + }, + { + "epoch": 0.8055986436275747, + "grad_norm": 2.8029546883683985, + "learning_rate": 3.8339568292077406e-07, + "loss": 0.871, + "step": 11166 + }, + { + "epoch": 0.8056707910970022, + "grad_norm": 3.827183946433574, + "learning_rate": 3.8312056061461285e-07, + "loss": 0.9129, + "step": 11167 + }, + { + "epoch": 0.8057429385664298, + "grad_norm": 4.252900648401147, + "learning_rate": 3.8284552660025195e-07, + "loss": 0.9347, + "step": 11168 + }, + { + "epoch": 0.8058150860358573, + "grad_norm": 2.602167832341592, + "learning_rate": 3.825705808927113e-07, + "loss": 0.9242, + "step": 11169 + }, + { + "epoch": 0.8058872335052848, + "grad_norm": 2.251461296269028, + "learning_rate": 3.8229572350700303e-07, + "loss": 1.0457, + "step": 11170 + }, + { + "epoch": 0.8059593809747123, + "grad_norm": 2.547081032124853, + "learning_rate": 3.82020954458137e-07, + "loss": 0.8911, + "step": 11171 + }, + { + "epoch": 0.8060315284441398, + "grad_norm": 3.086043807504349, + "learning_rate": 3.81746273761117e-07, + "loss": 0.9199, + "step": 11172 + }, + { + "epoch": 0.8061036759135674, + "grad_norm": 3.4623198149032084, + "learning_rate": 3.814716814309429e-07, + "loss": 0.8914, + "step": 11173 + }, + { + "epoch": 0.8061758233829949, + "grad_norm": 0.8145971742577717, + "learning_rate": 3.811971774826077e-07, + "loss": 0.8267, + "step": 11174 + }, + { + "epoch": 0.8062479708524224, + "grad_norm": 2.199186768646513, + "learning_rate": 3.809227619311026e-07, + "loss": 0.8906, + "step": 11175 + }, + { + "epoch": 0.8063201183218499, + "grad_norm": 2.7873263044082397, + "learning_rate": 3.806484347914123e-07, + "loss": 0.8871, + "step": 11176 + }, + { + "epoch": 0.8063922657912773, + "grad_norm": 3.795919667060199, + "learning_rate": 3.8037419607851586e-07, + "loss": 0.8069, + "step": 11177 + }, + { + "epoch": 0.8064644132607048, + "grad_norm": 5.164920799039571, + "learning_rate": 3.8010004580738887e-07, + "loss": 0.9939, + "step": 11178 + }, + { + "epoch": 0.8065365607301324, + "grad_norm": 3.043725209849958, + "learning_rate": 3.7982598399300115e-07, + "loss": 0.8451, + "step": 11179 + }, + { + "epoch": 0.8066087081995599, + "grad_norm": 2.72327021021003, + "learning_rate": 3.7955201065032005e-07, + "loss": 0.9183, + "step": 11180 + }, + { + "epoch": 0.8066808556689874, + "grad_norm": 2.2097179702386467, + "learning_rate": 3.7927812579430405e-07, + "loss": 0.8436, + "step": 11181 + }, + { + "epoch": 0.8067530031384149, + "grad_norm": 2.115316880748945, + "learning_rate": 3.7900432943991035e-07, + "loss": 1.0403, + "step": 11182 + }, + { + "epoch": 0.8068251506078424, + "grad_norm": 3.082074569958546, + "learning_rate": 3.787306216020896e-07, + "loss": 0.9576, + "step": 11183 + }, + { + "epoch": 0.80689729807727, + "grad_norm": 15.201976768047306, + "learning_rate": 3.784570022957876e-07, + "loss": 0.7845, + "step": 11184 + }, + { + "epoch": 0.8069694455466975, + "grad_norm": 3.839198325040229, + "learning_rate": 3.7818347153594533e-07, + "loss": 0.9049, + "step": 11185 + }, + { + "epoch": 0.807041593016125, + "grad_norm": 2.6377136613983687, + "learning_rate": 3.7791002933750105e-07, + "loss": 0.9747, + "step": 11186 + }, + { + "epoch": 0.8071137404855525, + "grad_norm": 2.588726740667678, + "learning_rate": 3.776366757153846e-07, + "loss": 0.8892, + "step": 11187 + }, + { + "epoch": 0.80718588795498, + "grad_norm": 2.6559033335938675, + "learning_rate": 3.773634106845238e-07, + "loss": 0.9678, + "step": 11188 + }, + { + "epoch": 0.8072580354244074, + "grad_norm": 5.4074926105930246, + "learning_rate": 3.770902342598401e-07, + "loss": 0.9377, + "step": 11189 + }, + { + "epoch": 0.807330182893835, + "grad_norm": 0.7336030108402538, + "learning_rate": 3.768171464562515e-07, + "loss": 0.8033, + "step": 11190 + }, + { + "epoch": 0.8074023303632625, + "grad_norm": 2.351154279286522, + "learning_rate": 3.765441472886692e-07, + "loss": 0.8065, + "step": 11191 + }, + { + "epoch": 0.80747447783269, + "grad_norm": 1.9879825270209188, + "learning_rate": 3.7627123677200066e-07, + "loss": 0.9554, + "step": 11192 + }, + { + "epoch": 0.8075466253021175, + "grad_norm": 3.394623819895376, + "learning_rate": 3.759984149211501e-07, + "loss": 0.9734, + "step": 11193 + }, + { + "epoch": 0.807618772771545, + "grad_norm": 3.4297239262551655, + "learning_rate": 3.757256817510137e-07, + "loss": 0.8841, + "step": 11194 + }, + { + "epoch": 0.8076909202409726, + "grad_norm": 2.3171341316935092, + "learning_rate": 3.7545303727648505e-07, + "loss": 0.8774, + "step": 11195 + }, + { + "epoch": 0.8077630677104001, + "grad_norm": 0.8408449459565756, + "learning_rate": 3.751804815124526e-07, + "loss": 0.8786, + "step": 11196 + }, + { + "epoch": 0.8078352151798276, + "grad_norm": 2.4514376917932026, + "learning_rate": 3.749080144737986e-07, + "loss": 0.8658, + "step": 11197 + }, + { + "epoch": 0.8079073626492551, + "grad_norm": 2.3536922910871434, + "learning_rate": 3.746356361754017e-07, + "loss": 0.9142, + "step": 11198 + }, + { + "epoch": 0.8079795101186826, + "grad_norm": 1.8613089113154444, + "learning_rate": 3.7436334663213607e-07, + "loss": 0.8744, + "step": 11199 + }, + { + "epoch": 0.8080516575881102, + "grad_norm": 4.801167012445671, + "learning_rate": 3.7409114585887093e-07, + "loss": 0.7882, + "step": 11200 + }, + { + "epoch": 0.8081238050575376, + "grad_norm": 2.4460551441305327, + "learning_rate": 3.7381903387046874e-07, + "loss": 0.8805, + "step": 11201 + }, + { + "epoch": 0.8081959525269651, + "grad_norm": 2.1878740263385716, + "learning_rate": 3.7354701068178927e-07, + "loss": 0.8995, + "step": 11202 + }, + { + "epoch": 0.8082680999963926, + "grad_norm": 3.424369286151038, + "learning_rate": 3.732750763076873e-07, + "loss": 0.8578, + "step": 11203 + }, + { + "epoch": 0.8083402474658201, + "grad_norm": 2.4194350226250827, + "learning_rate": 3.730032307630102e-07, + "loss": 0.947, + "step": 11204 + }, + { + "epoch": 0.8084123949352476, + "grad_norm": 1.9802694427302456, + "learning_rate": 3.727314740626044e-07, + "loss": 0.9645, + "step": 11205 + }, + { + "epoch": 0.8084845424046752, + "grad_norm": 2.4637842576399787, + "learning_rate": 3.7245980622130893e-07, + "loss": 0.8681, + "step": 11206 + }, + { + "epoch": 0.8085566898741027, + "grad_norm": 5.073301307252981, + "learning_rate": 3.721882272539587e-07, + "loss": 0.9573, + "step": 11207 + }, + { + "epoch": 0.8086288373435302, + "grad_norm": 2.3810114823680926, + "learning_rate": 3.719167371753831e-07, + "loss": 0.7707, + "step": 11208 + }, + { + "epoch": 0.8087009848129577, + "grad_norm": 4.442862910107929, + "learning_rate": 3.7164533600040704e-07, + "loss": 0.901, + "step": 11209 + }, + { + "epoch": 0.8087731322823852, + "grad_norm": 2.233405803339679, + "learning_rate": 3.7137402374385227e-07, + "loss": 0.9041, + "step": 11210 + }, + { + "epoch": 0.8088452797518128, + "grad_norm": 3.3210737006587974, + "learning_rate": 3.7110280042053256e-07, + "loss": 1.0461, + "step": 11211 + }, + { + "epoch": 0.8089174272212402, + "grad_norm": 2.4596052166262927, + "learning_rate": 3.708316660452588e-07, + "loss": 0.94, + "step": 11212 + }, + { + "epoch": 0.8089895746906677, + "grad_norm": 2.1234762675246066, + "learning_rate": 3.7056062063283756e-07, + "loss": 0.983, + "step": 11213 + }, + { + "epoch": 0.8090617221600952, + "grad_norm": 2.630235446500621, + "learning_rate": 3.702896641980682e-07, + "loss": 0.8546, + "step": 11214 + }, + { + "epoch": 0.8091338696295227, + "grad_norm": 0.7817314953006141, + "learning_rate": 3.700187967557471e-07, + "loss": 0.836, + "step": 11215 + }, + { + "epoch": 0.8092060170989502, + "grad_norm": 9.304940041702983, + "learning_rate": 3.697480183206652e-07, + "loss": 0.8861, + "step": 11216 + }, + { + "epoch": 0.8092781645683778, + "grad_norm": 2.5222885303305045, + "learning_rate": 3.694773289076101e-07, + "loss": 0.8585, + "step": 11217 + }, + { + "epoch": 0.8093503120378053, + "grad_norm": 2.627856743692215, + "learning_rate": 3.692067285313616e-07, + "loss": 0.9122, + "step": 11218 + }, + { + "epoch": 0.8094224595072328, + "grad_norm": 10.024071522475053, + "learning_rate": 3.6893621720669654e-07, + "loss": 0.9168, + "step": 11219 + }, + { + "epoch": 0.8094946069766603, + "grad_norm": 4.177430072703683, + "learning_rate": 3.6866579494838713e-07, + "loss": 0.9064, + "step": 11220 + }, + { + "epoch": 0.8095667544460878, + "grad_norm": 2.4351427466440656, + "learning_rate": 3.683954617711993e-07, + "loss": 0.9875, + "step": 11221 + }, + { + "epoch": 0.8096389019155154, + "grad_norm": 3.6013743585196245, + "learning_rate": 3.681252176898948e-07, + "loss": 0.8857, + "step": 11222 + }, + { + "epoch": 0.8097110493849429, + "grad_norm": 2.8926028521689555, + "learning_rate": 3.678550627192316e-07, + "loss": 0.8665, + "step": 11223 + }, + { + "epoch": 0.8097831968543703, + "grad_norm": 2.4027512119517587, + "learning_rate": 3.6758499687396196e-07, + "loss": 0.835, + "step": 11224 + }, + { + "epoch": 0.8098553443237978, + "grad_norm": 3.2242880279293193, + "learning_rate": 3.673150201688322e-07, + "loss": 0.9172, + "step": 11225 + }, + { + "epoch": 0.8099274917932253, + "grad_norm": 3.3022756590726066, + "learning_rate": 3.6704513261858506e-07, + "loss": 0.9047, + "step": 11226 + }, + { + "epoch": 0.8099996392626528, + "grad_norm": 2.6010796726084413, + "learning_rate": 3.6677533423795895e-07, + "loss": 0.8581, + "step": 11227 + }, + { + "epoch": 0.8100717867320804, + "grad_norm": 2.2509763506643066, + "learning_rate": 3.665056250416847e-07, + "loss": 0.9374, + "step": 11228 + }, + { + "epoch": 0.8101439342015079, + "grad_norm": 3.3297540680394833, + "learning_rate": 3.662360050444917e-07, + "loss": 0.8265, + "step": 11229 + }, + { + "epoch": 0.8102160816709354, + "grad_norm": 3.2652168566437734, + "learning_rate": 3.6596647426110305e-07, + "loss": 0.8617, + "step": 11230 + }, + { + "epoch": 0.8102882291403629, + "grad_norm": 4.221260510613141, + "learning_rate": 3.656970327062359e-07, + "loss": 0.9886, + "step": 11231 + }, + { + "epoch": 0.8103603766097904, + "grad_norm": 3.2232024868506532, + "learning_rate": 3.654276803946037e-07, + "loss": 0.933, + "step": 11232 + }, + { + "epoch": 0.810432524079218, + "grad_norm": 2.5493400205853005, + "learning_rate": 3.6515841734091435e-07, + "loss": 0.99, + "step": 11233 + }, + { + "epoch": 0.8105046715486455, + "grad_norm": 5.11663675620483, + "learning_rate": 3.648892435598727e-07, + "loss": 0.963, + "step": 11234 + }, + { + "epoch": 0.810576819018073, + "grad_norm": 2.8012447213119347, + "learning_rate": 3.646201590661762e-07, + "loss": 0.822, + "step": 11235 + }, + { + "epoch": 0.8106489664875004, + "grad_norm": 6.5718642499094795, + "learning_rate": 3.6435116387451894e-07, + "loss": 0.9532, + "step": 11236 + }, + { + "epoch": 0.8107211139569279, + "grad_norm": 3.0453338637955576, + "learning_rate": 3.6408225799958993e-07, + "loss": 0.9871, + "step": 11237 + }, + { + "epoch": 0.8107932614263554, + "grad_norm": 2.41707731463095, + "learning_rate": 3.638134414560725e-07, + "loss": 0.8939, + "step": 11238 + }, + { + "epoch": 0.810865408895783, + "grad_norm": 0.936048019074049, + "learning_rate": 3.635447142586461e-07, + "loss": 0.8381, + "step": 11239 + }, + { + "epoch": 0.8109375563652105, + "grad_norm": 2.228882982498276, + "learning_rate": 3.632760764219847e-07, + "loss": 0.9863, + "step": 11240 + }, + { + "epoch": 0.811009703834638, + "grad_norm": 2.4525122393536973, + "learning_rate": 3.630075279607581e-07, + "loss": 0.909, + "step": 11241 + }, + { + "epoch": 0.8110818513040655, + "grad_norm": 3.3523265616821503, + "learning_rate": 3.627390688896301e-07, + "loss": 0.8478, + "step": 11242 + }, + { + "epoch": 0.811153998773493, + "grad_norm": 2.58148002800317, + "learning_rate": 3.624706992232609e-07, + "loss": 0.9423, + "step": 11243 + }, + { + "epoch": 0.8112261462429206, + "grad_norm": 1.662722840826788, + "learning_rate": 3.622024189763053e-07, + "loss": 0.9004, + "step": 11244 + }, + { + "epoch": 0.8112982937123481, + "grad_norm": 2.6714138949306534, + "learning_rate": 3.6193422816341235e-07, + "loss": 1.0038, + "step": 11245 + }, + { + "epoch": 0.8113704411817756, + "grad_norm": 2.965792176305047, + "learning_rate": 3.6166612679922647e-07, + "loss": 0.9387, + "step": 11246 + }, + { + "epoch": 0.8114425886512031, + "grad_norm": 4.545363342063751, + "learning_rate": 3.613981148983898e-07, + "loss": 0.9576, + "step": 11247 + }, + { + "epoch": 0.8115147361206305, + "grad_norm": 3.022507004394493, + "learning_rate": 3.6113019247553543e-07, + "loss": 0.7893, + "step": 11248 + }, + { + "epoch": 0.811586883590058, + "grad_norm": 2.85507996883064, + "learning_rate": 3.6086235954529445e-07, + "loss": 0.8656, + "step": 11249 + }, + { + "epoch": 0.8116590310594856, + "grad_norm": 2.3198430777786627, + "learning_rate": 3.605946161222921e-07, + "loss": 0.8961, + "step": 11250 + }, + { + "epoch": 0.8117311785289131, + "grad_norm": 4.863222616677171, + "learning_rate": 3.6032696222114954e-07, + "loss": 0.9819, + "step": 11251 + }, + { + "epoch": 0.8118033259983406, + "grad_norm": 3.020149434163034, + "learning_rate": 3.6005939785648075e-07, + "loss": 0.8495, + "step": 11252 + }, + { + "epoch": 0.8118754734677681, + "grad_norm": 2.9422607801714578, + "learning_rate": 3.597919230428976e-07, + "loss": 0.927, + "step": 11253 + }, + { + "epoch": 0.8119476209371956, + "grad_norm": 2.1660945454865472, + "learning_rate": 3.5952453779500646e-07, + "loss": 0.9924, + "step": 11254 + }, + { + "epoch": 0.8120197684066232, + "grad_norm": 0.8367186844627148, + "learning_rate": 3.5925724212740694e-07, + "loss": 0.8474, + "step": 11255 + }, + { + "epoch": 0.8120919158760507, + "grad_norm": 3.312539154397131, + "learning_rate": 3.589900360546956e-07, + "loss": 0.968, + "step": 11256 + }, + { + "epoch": 0.8121640633454782, + "grad_norm": 0.7433113022463194, + "learning_rate": 3.587229195914638e-07, + "loss": 0.7526, + "step": 11257 + }, + { + "epoch": 0.8122362108149057, + "grad_norm": 0.7690213833156225, + "learning_rate": 3.5845589275229694e-07, + "loss": 0.7728, + "step": 11258 + }, + { + "epoch": 0.8123083582843332, + "grad_norm": 2.210389409689673, + "learning_rate": 3.581889555517772e-07, + "loss": 0.9407, + "step": 11259 + }, + { + "epoch": 0.8123805057537606, + "grad_norm": 1.8533714889582644, + "learning_rate": 3.5792210800448095e-07, + "loss": 0.9209, + "step": 11260 + }, + { + "epoch": 0.8124526532231882, + "grad_norm": 2.7975404256809395, + "learning_rate": 3.5765535012498014e-07, + "loss": 0.9498, + "step": 11261 + }, + { + "epoch": 0.8125248006926157, + "grad_norm": 0.6927363215554964, + "learning_rate": 3.5738868192784045e-07, + "loss": 0.7608, + "step": 11262 + }, + { + "epoch": 0.8125969481620432, + "grad_norm": 2.550634406216623, + "learning_rate": 3.571221034276235e-07, + "loss": 0.9839, + "step": 11263 + }, + { + "epoch": 0.8126690956314707, + "grad_norm": 2.359511819876562, + "learning_rate": 3.568556146388879e-07, + "loss": 0.959, + "step": 11264 + }, + { + "epoch": 0.8127412431008982, + "grad_norm": 12.485897195803739, + "learning_rate": 3.5658921557618384e-07, + "loss": 0.954, + "step": 11265 + }, + { + "epoch": 0.8128133905703258, + "grad_norm": 3.5759225255267117, + "learning_rate": 3.563229062540589e-07, + "loss": 0.9559, + "step": 11266 + }, + { + "epoch": 0.8128855380397533, + "grad_norm": 4.810617846740979, + "learning_rate": 3.5605668668705555e-07, + "loss": 0.8514, + "step": 11267 + }, + { + "epoch": 0.8129576855091808, + "grad_norm": 3.3812419954723443, + "learning_rate": 3.557905568897115e-07, + "loss": 0.9352, + "step": 11268 + }, + { + "epoch": 0.8130298329786083, + "grad_norm": 2.4801569224517532, + "learning_rate": 3.5552451687655773e-07, + "loss": 0.9014, + "step": 11269 + }, + { + "epoch": 0.8131019804480358, + "grad_norm": 0.7009966958195283, + "learning_rate": 3.5525856666212195e-07, + "loss": 0.7734, + "step": 11270 + }, + { + "epoch": 0.8131741279174632, + "grad_norm": 1.886821041138011, + "learning_rate": 3.5499270626092834e-07, + "loss": 0.9284, + "step": 11271 + }, + { + "epoch": 0.8132462753868908, + "grad_norm": 2.7090361245733074, + "learning_rate": 3.547269356874929e-07, + "loss": 0.8126, + "step": 11272 + }, + { + "epoch": 0.8133184228563183, + "grad_norm": 3.6209104423667573, + "learning_rate": 3.5446125495632885e-07, + "loss": 0.9032, + "step": 11273 + }, + { + "epoch": 0.8133905703257458, + "grad_norm": 2.3601589289993794, + "learning_rate": 3.541956640819448e-07, + "loss": 0.9691, + "step": 11274 + }, + { + "epoch": 0.8134627177951733, + "grad_norm": 2.667469519715425, + "learning_rate": 3.5393016307884225e-07, + "loss": 0.9505, + "step": 11275 + }, + { + "epoch": 0.8135348652646008, + "grad_norm": 2.9811460592836148, + "learning_rate": 3.5366475196151944e-07, + "loss": 0.9589, + "step": 11276 + }, + { + "epoch": 0.8136070127340284, + "grad_norm": 25.531502207674126, + "learning_rate": 3.533994307444708e-07, + "loss": 0.9151, + "step": 11277 + }, + { + "epoch": 0.8136791602034559, + "grad_norm": 5.616543325698054, + "learning_rate": 3.5313419944218394e-07, + "loss": 1.0285, + "step": 11278 + }, + { + "epoch": 0.8137513076728834, + "grad_norm": 4.093893965890553, + "learning_rate": 3.528690580691416e-07, + "loss": 0.8557, + "step": 11279 + }, + { + "epoch": 0.8138234551423109, + "grad_norm": 1.9677742856310207, + "learning_rate": 3.5260400663982237e-07, + "loss": 0.8167, + "step": 11280 + }, + { + "epoch": 0.8138956026117384, + "grad_norm": 8.139550429167747, + "learning_rate": 3.523390451687005e-07, + "loss": 1.0161, + "step": 11281 + }, + { + "epoch": 0.813967750081166, + "grad_norm": 2.8604147432954736, + "learning_rate": 3.5207417367024304e-07, + "loss": 0.9131, + "step": 11282 + }, + { + "epoch": 0.8140398975505934, + "grad_norm": 3.1062258007557597, + "learning_rate": 3.518093921589149e-07, + "loss": 0.9314, + "step": 11283 + }, + { + "epoch": 0.8141120450200209, + "grad_norm": 2.3173256090179577, + "learning_rate": 3.51544700649175e-07, + "loss": 0.9101, + "step": 11284 + }, + { + "epoch": 0.8141841924894484, + "grad_norm": 3.2862758837725625, + "learning_rate": 3.5128009915547607e-07, + "loss": 0.868, + "step": 11285 + }, + { + "epoch": 0.8142563399588759, + "grad_norm": 2.709657658282971, + "learning_rate": 3.510155876922676e-07, + "loss": 0.8984, + "step": 11286 + }, + { + "epoch": 0.8143284874283034, + "grad_norm": 2.2720675716090124, + "learning_rate": 3.50751166273993e-07, + "loss": 0.9292, + "step": 11287 + }, + { + "epoch": 0.814400634897731, + "grad_norm": 1.8761821834702752, + "learning_rate": 3.504868349150931e-07, + "loss": 0.8324, + "step": 11288 + }, + { + "epoch": 0.8144727823671585, + "grad_norm": 1.7325032762328598, + "learning_rate": 3.502225936300003e-07, + "loss": 0.9675, + "step": 11289 + }, + { + "epoch": 0.814544929836586, + "grad_norm": 2.2496824040377046, + "learning_rate": 3.4995844243314433e-07, + "loss": 0.9421, + "step": 11290 + }, + { + "epoch": 0.8146170773060135, + "grad_norm": 2.619030204112295, + "learning_rate": 3.4969438133895015e-07, + "loss": 1.0544, + "step": 11291 + }, + { + "epoch": 0.814689224775441, + "grad_norm": 2.640362246831792, + "learning_rate": 3.4943041036183594e-07, + "loss": 1.0458, + "step": 11292 + }, + { + "epoch": 0.8147613722448686, + "grad_norm": 0.7901826593633517, + "learning_rate": 3.4916652951621693e-07, + "loss": 0.8255, + "step": 11293 + }, + { + "epoch": 0.8148335197142961, + "grad_norm": 2.546207644784951, + "learning_rate": 3.48902738816502e-07, + "loss": 0.8665, + "step": 11294 + }, + { + "epoch": 0.8149056671837235, + "grad_norm": 3.9555924144794425, + "learning_rate": 3.486390382770974e-07, + "loss": 0.8613, + "step": 11295 + }, + { + "epoch": 0.814977814653151, + "grad_norm": 2.2055010596951448, + "learning_rate": 3.4837542791240116e-07, + "loss": 0.9084, + "step": 11296 + }, + { + "epoch": 0.8150499621225785, + "grad_norm": 4.532243290915314, + "learning_rate": 3.4811190773680886e-07, + "loss": 0.7818, + "step": 11297 + }, + { + "epoch": 0.815122109592006, + "grad_norm": 2.750063895436903, + "learning_rate": 3.478484777647106e-07, + "loss": 0.7924, + "step": 11298 + }, + { + "epoch": 0.8151942570614336, + "grad_norm": 2.5950270117381034, + "learning_rate": 3.475851380104904e-07, + "loss": 1.0578, + "step": 11299 + }, + { + "epoch": 0.8152664045308611, + "grad_norm": 0.6888017035461033, + "learning_rate": 3.473218884885285e-07, + "loss": 0.7373, + "step": 11300 + }, + { + "epoch": 0.8153385520002886, + "grad_norm": 2.8252787730985833, + "learning_rate": 3.4705872921320103e-07, + "loss": 0.8715, + "step": 11301 + }, + { + "epoch": 0.8154106994697161, + "grad_norm": 2.212974984428826, + "learning_rate": 3.467956601988771e-07, + "loss": 0.9522, + "step": 11302 + }, + { + "epoch": 0.8154828469391436, + "grad_norm": 1.7762163883602187, + "learning_rate": 3.4653268145992185e-07, + "loss": 0.9355, + "step": 11303 + }, + { + "epoch": 0.8155549944085712, + "grad_norm": 2.4925675535852165, + "learning_rate": 3.4626979301069615e-07, + "loss": 1.0144, + "step": 11304 + }, + { + "epoch": 0.8156271418779987, + "grad_norm": 2.2916230862259463, + "learning_rate": 3.460069948655557e-07, + "loss": 0.9011, + "step": 11305 + }, + { + "epoch": 0.8156992893474262, + "grad_norm": 3.6516354123868333, + "learning_rate": 3.457442870388494e-07, + "loss": 0.9957, + "step": 11306 + }, + { + "epoch": 0.8157714368168536, + "grad_norm": 2.4230823091766154, + "learning_rate": 3.454816695449241e-07, + "loss": 0.8472, + "step": 11307 + }, + { + "epoch": 0.8158435842862811, + "grad_norm": 2.2217739076861536, + "learning_rate": 3.4521914239812076e-07, + "loss": 0.9916, + "step": 11308 + }, + { + "epoch": 0.8159157317557086, + "grad_norm": 3.1710062575505593, + "learning_rate": 3.4495670561277354e-07, + "loss": 0.9355, + "step": 11309 + }, + { + "epoch": 0.8159878792251362, + "grad_norm": 1.789223876071643, + "learning_rate": 3.446943592032139e-07, + "loss": 0.9494, + "step": 11310 + }, + { + "epoch": 0.8160600266945637, + "grad_norm": 4.353099832236111, + "learning_rate": 3.4443210318376716e-07, + "loss": 0.8731, + "step": 11311 + }, + { + "epoch": 0.8161321741639912, + "grad_norm": 2.683741314202775, + "learning_rate": 3.4416993756875544e-07, + "loss": 0.9432, + "step": 11312 + }, + { + "epoch": 0.8162043216334187, + "grad_norm": 3.2994305437214155, + "learning_rate": 3.4390786237249337e-07, + "loss": 0.9543, + "step": 11313 + }, + { + "epoch": 0.8162764691028462, + "grad_norm": 4.067635913681254, + "learning_rate": 3.436458776092923e-07, + "loss": 0.944, + "step": 11314 + }, + { + "epoch": 0.8163486165722738, + "grad_norm": 2.4802174838902777, + "learning_rate": 3.4338398329345863e-07, + "loss": 0.9369, + "step": 11315 + }, + { + "epoch": 0.8164207640417013, + "grad_norm": 4.971439809113127, + "learning_rate": 3.4312217943929266e-07, + "loss": 0.8953, + "step": 11316 + }, + { + "epoch": 0.8164929115111288, + "grad_norm": 0.7888364184717065, + "learning_rate": 3.4286046606109075e-07, + "loss": 0.8713, + "step": 11317 + }, + { + "epoch": 0.8165650589805562, + "grad_norm": 1.8261796430942736, + "learning_rate": 3.425988431731448e-07, + "loss": 0.8645, + "step": 11318 + }, + { + "epoch": 0.8166372064499837, + "grad_norm": 2.0979695858637295, + "learning_rate": 3.423373107897396e-07, + "loss": 0.9245, + "step": 11319 + }, + { + "epoch": 0.8167093539194112, + "grad_norm": 4.6887122049132985, + "learning_rate": 3.420758689251579e-07, + "loss": 0.9462, + "step": 11320 + }, + { + "epoch": 0.8167815013888388, + "grad_norm": 2.9232737411230496, + "learning_rate": 3.4181451759367575e-07, + "loss": 0.8641, + "step": 11321 + }, + { + "epoch": 0.8168536488582663, + "grad_norm": 3.8766068849248283, + "learning_rate": 3.415532568095647e-07, + "loss": 0.9789, + "step": 11322 + }, + { + "epoch": 0.8169257963276938, + "grad_norm": 2.0908157512502794, + "learning_rate": 3.412920865870905e-07, + "loss": 0.8882, + "step": 11323 + }, + { + "epoch": 0.8169979437971213, + "grad_norm": 2.596135341002327, + "learning_rate": 3.4103100694051465e-07, + "loss": 0.9771, + "step": 11324 + }, + { + "epoch": 0.8170700912665488, + "grad_norm": 2.9636816830847983, + "learning_rate": 3.407700178840951e-07, + "loss": 0.8962, + "step": 11325 + }, + { + "epoch": 0.8171422387359764, + "grad_norm": 4.766484217497662, + "learning_rate": 3.4050911943208215e-07, + "loss": 0.9659, + "step": 11326 + }, + { + "epoch": 0.8172143862054039, + "grad_norm": 2.008172266451662, + "learning_rate": 3.402483115987229e-07, + "loss": 0.9677, + "step": 11327 + }, + { + "epoch": 0.8172865336748314, + "grad_norm": 2.4268593964528242, + "learning_rate": 3.3998759439826e-07, + "loss": 0.9764, + "step": 11328 + }, + { + "epoch": 0.8173586811442589, + "grad_norm": 2.337252141693861, + "learning_rate": 3.397269678449286e-07, + "loss": 0.9358, + "step": 11329 + }, + { + "epoch": 0.8174308286136863, + "grad_norm": 3.4277246371834615, + "learning_rate": 3.3946643195296096e-07, + "loss": 0.9404, + "step": 11330 + }, + { + "epoch": 0.8175029760831138, + "grad_norm": 3.8663895217185225, + "learning_rate": 3.3920598673658486e-07, + "loss": 0.9349, + "step": 11331 + }, + { + "epoch": 0.8175751235525414, + "grad_norm": 0.7979715320127091, + "learning_rate": 3.389456322100221e-07, + "loss": 0.8155, + "step": 11332 + }, + { + "epoch": 0.8176472710219689, + "grad_norm": 1.8126255518080685, + "learning_rate": 3.3868536838748904e-07, + "loss": 0.9694, + "step": 11333 + }, + { + "epoch": 0.8177194184913964, + "grad_norm": 2.9229757102246805, + "learning_rate": 3.3842519528319825e-07, + "loss": 0.8855, + "step": 11334 + }, + { + "epoch": 0.8177915659608239, + "grad_norm": 2.8957283129047524, + "learning_rate": 3.3816511291135676e-07, + "loss": 1.0375, + "step": 11335 + }, + { + "epoch": 0.8178637134302514, + "grad_norm": 3.4259260053510534, + "learning_rate": 3.3790512128616566e-07, + "loss": 0.9373, + "step": 11336 + }, + { + "epoch": 0.817935860899679, + "grad_norm": 3.026865984225665, + "learning_rate": 3.3764522042182366e-07, + "loss": 0.9326, + "step": 11337 + }, + { + "epoch": 0.8180080083691065, + "grad_norm": 2.005809096139877, + "learning_rate": 3.373854103325222e-07, + "loss": 0.9064, + "step": 11338 + }, + { + "epoch": 0.818080155838534, + "grad_norm": 0.818479761268425, + "learning_rate": 3.3712569103244913e-07, + "loss": 0.8289, + "step": 11339 + }, + { + "epoch": 0.8181523033079615, + "grad_norm": 2.644780607877741, + "learning_rate": 3.3686606253578596e-07, + "loss": 0.8554, + "step": 11340 + }, + { + "epoch": 0.818224450777389, + "grad_norm": 2.2488696119845994, + "learning_rate": 3.366065248567103e-07, + "loss": 0.9806, + "step": 11341 + }, + { + "epoch": 0.8182965982468164, + "grad_norm": 1.9187487474866127, + "learning_rate": 3.3634707800939535e-07, + "loss": 0.9505, + "step": 11342 + }, + { + "epoch": 0.818368745716244, + "grad_norm": 3.863775053951964, + "learning_rate": 3.360877220080065e-07, + "loss": 0.8338, + "step": 11343 + }, + { + "epoch": 0.8184408931856715, + "grad_norm": 2.7766187019220046, + "learning_rate": 3.358284568667082e-07, + "loss": 0.9891, + "step": 11344 + }, + { + "epoch": 0.818513040655099, + "grad_norm": 0.8965850972957392, + "learning_rate": 3.355692825996579e-07, + "loss": 0.8412, + "step": 11345 + }, + { + "epoch": 0.8185851881245265, + "grad_norm": 2.5260739563651184, + "learning_rate": 3.35310199221007e-07, + "loss": 0.9062, + "step": 11346 + }, + { + "epoch": 0.818657335593954, + "grad_norm": 1.861751224108135, + "learning_rate": 3.3505120674490386e-07, + "loss": 0.9973, + "step": 11347 + }, + { + "epoch": 0.8187294830633816, + "grad_norm": 2.6144566194951806, + "learning_rate": 3.347923051854902e-07, + "loss": 0.8731, + "step": 11348 + }, + { + "epoch": 0.8188016305328091, + "grad_norm": 3.1968656090442775, + "learning_rate": 3.3453349455690536e-07, + "loss": 0.8924, + "step": 11349 + }, + { + "epoch": 0.8188737780022366, + "grad_norm": 0.7519578413381922, + "learning_rate": 3.342747748732806e-07, + "loss": 0.8229, + "step": 11350 + }, + { + "epoch": 0.8189459254716641, + "grad_norm": 3.2339885683535896, + "learning_rate": 3.340161461487439e-07, + "loss": 0.8819, + "step": 11351 + }, + { + "epoch": 0.8190180729410916, + "grad_norm": 3.2709126715949166, + "learning_rate": 3.3375760839741896e-07, + "loss": 0.9301, + "step": 11352 + }, + { + "epoch": 0.8190902204105192, + "grad_norm": 2.4105033439101877, + "learning_rate": 3.334991616334222e-07, + "loss": 0.9768, + "step": 11353 + }, + { + "epoch": 0.8191623678799466, + "grad_norm": 3.066822038391066, + "learning_rate": 3.3324080587086644e-07, + "loss": 0.9743, + "step": 11354 + }, + { + "epoch": 0.8192345153493741, + "grad_norm": 0.759504572484791, + "learning_rate": 3.329825411238607e-07, + "loss": 0.7873, + "step": 11355 + }, + { + "epoch": 0.8193066628188016, + "grad_norm": 4.75279133746018, + "learning_rate": 3.327243674065079e-07, + "loss": 0.9148, + "step": 11356 + }, + { + "epoch": 0.8193788102882291, + "grad_norm": 2.4217903368518394, + "learning_rate": 3.324662847329047e-07, + "loss": 0.8874, + "step": 11357 + }, + { + "epoch": 0.8194509577576566, + "grad_norm": 3.5569355657498893, + "learning_rate": 3.322082931171448e-07, + "loss": 0.9762, + "step": 11358 + }, + { + "epoch": 0.8195231052270842, + "grad_norm": 2.049916693692776, + "learning_rate": 3.3195039257331647e-07, + "loss": 1.0057, + "step": 11359 + }, + { + "epoch": 0.8195952526965117, + "grad_norm": 0.6765654759859174, + "learning_rate": 3.316925831155013e-07, + "loss": 0.7108, + "step": 11360 + }, + { + "epoch": 0.8196674001659392, + "grad_norm": 2.7152206488602486, + "learning_rate": 3.3143486475777893e-07, + "loss": 0.8081, + "step": 11361 + }, + { + "epoch": 0.8197395476353667, + "grad_norm": 2.3600370981242444, + "learning_rate": 3.3117723751422233e-07, + "loss": 0.9132, + "step": 11362 + }, + { + "epoch": 0.8198116951047942, + "grad_norm": 2.623382796807203, + "learning_rate": 3.3091970139889823e-07, + "loss": 0.8959, + "step": 11363 + }, + { + "epoch": 0.8198838425742218, + "grad_norm": 2.8844763443522865, + "learning_rate": 3.3066225642587075e-07, + "loss": 0.9268, + "step": 11364 + }, + { + "epoch": 0.8199559900436493, + "grad_norm": 3.2062261688358076, + "learning_rate": 3.3040490260919773e-07, + "loss": 0.9277, + "step": 11365 + }, + { + "epoch": 0.8200281375130767, + "grad_norm": 1.704061058459362, + "learning_rate": 3.301476399629328e-07, + "loss": 0.834, + "step": 11366 + }, + { + "epoch": 0.8201002849825042, + "grad_norm": 0.7802539579516601, + "learning_rate": 3.298904685011228e-07, + "loss": 0.7874, + "step": 11367 + }, + { + "epoch": 0.8201724324519317, + "grad_norm": 2.450761762255163, + "learning_rate": 3.2963338823781194e-07, + "loss": 1.0071, + "step": 11368 + }, + { + "epoch": 0.8202445799213592, + "grad_norm": 3.3716487493964493, + "learning_rate": 3.293763991870391e-07, + "loss": 0.9055, + "step": 11369 + }, + { + "epoch": 0.8203167273907868, + "grad_norm": 2.452431161519064, + "learning_rate": 3.2911950136283605e-07, + "loss": 0.864, + "step": 11370 + }, + { + "epoch": 0.8203888748602143, + "grad_norm": 5.487556947312711, + "learning_rate": 3.288626947792317e-07, + "loss": 0.8787, + "step": 11371 + }, + { + "epoch": 0.8204610223296418, + "grad_norm": 2.8571922434217023, + "learning_rate": 3.286059794502496e-07, + "loss": 0.9657, + "step": 11372 + }, + { + "epoch": 0.8205331697990693, + "grad_norm": 2.5920026477576497, + "learning_rate": 3.2834935538990683e-07, + "loss": 0.8691, + "step": 11373 + }, + { + "epoch": 0.8206053172684968, + "grad_norm": 3.197275617300742, + "learning_rate": 3.2809282261221793e-07, + "loss": 0.9807, + "step": 11374 + }, + { + "epoch": 0.8206774647379244, + "grad_norm": 4.385200597857214, + "learning_rate": 3.2783638113119085e-07, + "loss": 0.9785, + "step": 11375 + }, + { + "epoch": 0.8207496122073519, + "grad_norm": 2.747133755349654, + "learning_rate": 3.275800309608294e-07, + "loss": 0.9631, + "step": 11376 + }, + { + "epoch": 0.8208217596767793, + "grad_norm": 2.0243940412209427, + "learning_rate": 3.2732377211513075e-07, + "loss": 0.9244, + "step": 11377 + }, + { + "epoch": 0.8208939071462068, + "grad_norm": 2.844435053882728, + "learning_rate": 3.2706760460808847e-07, + "loss": 0.9258, + "step": 11378 + }, + { + "epoch": 0.8209660546156343, + "grad_norm": 13.562006084396332, + "learning_rate": 3.268115284536923e-07, + "loss": 0.972, + "step": 11379 + }, + { + "epoch": 0.8210382020850618, + "grad_norm": 3.2512936584990886, + "learning_rate": 3.26555543665924e-07, + "loss": 0.8548, + "step": 11380 + }, + { + "epoch": 0.8211103495544894, + "grad_norm": 2.363606882911844, + "learning_rate": 3.262996502587627e-07, + "loss": 0.9039, + "step": 11381 + }, + { + "epoch": 0.8211824970239169, + "grad_norm": 2.230673758844174, + "learning_rate": 3.260438482461818e-07, + "loss": 0.8352, + "step": 11382 + }, + { + "epoch": 0.8212546444933444, + "grad_norm": 2.133583348743699, + "learning_rate": 3.2578813764214984e-07, + "loss": 0.9343, + "step": 11383 + }, + { + "epoch": 0.8213267919627719, + "grad_norm": 3.1540716226654273, + "learning_rate": 3.25532518460629e-07, + "loss": 0.8482, + "step": 11384 + }, + { + "epoch": 0.8213989394321994, + "grad_norm": 2.782380377982051, + "learning_rate": 3.2527699071557926e-07, + "loss": 0.9172, + "step": 11385 + }, + { + "epoch": 0.821471086901627, + "grad_norm": 0.6234878844741324, + "learning_rate": 3.2502155442095405e-07, + "loss": 0.7207, + "step": 11386 + }, + { + "epoch": 0.8215432343710545, + "grad_norm": 3.2366867281144382, + "learning_rate": 3.247662095907004e-07, + "loss": 0.9489, + "step": 11387 + }, + { + "epoch": 0.821615381840482, + "grad_norm": 2.9451476842769955, + "learning_rate": 3.2451095623876267e-07, + "loss": 0.9076, + "step": 11388 + }, + { + "epoch": 0.8216875293099094, + "grad_norm": 2.4724655198334187, + "learning_rate": 3.242557943790798e-07, + "loss": 1.034, + "step": 11389 + }, + { + "epoch": 0.8217596767793369, + "grad_norm": 5.009980159575567, + "learning_rate": 3.2400072402558396e-07, + "loss": 0.949, + "step": 11390 + }, + { + "epoch": 0.8218318242487644, + "grad_norm": 3.1066793901021943, + "learning_rate": 3.237457451922039e-07, + "loss": 0.8975, + "step": 11391 + }, + { + "epoch": 0.821903971718192, + "grad_norm": 2.442146079309501, + "learning_rate": 3.2349085789286366e-07, + "loss": 0.9026, + "step": 11392 + }, + { + "epoch": 0.8219761191876195, + "grad_norm": 3.409138985497148, + "learning_rate": 3.232360621414822e-07, + "loss": 0.8167, + "step": 11393 + }, + { + "epoch": 0.822048266657047, + "grad_norm": 2.7834846862788343, + "learning_rate": 3.229813579519716e-07, + "loss": 0.8029, + "step": 11394 + }, + { + "epoch": 0.8221204141264745, + "grad_norm": 0.709637595823716, + "learning_rate": 3.2272674533824093e-07, + "loss": 0.7916, + "step": 11395 + }, + { + "epoch": 0.822192561595902, + "grad_norm": 0.8183017246087846, + "learning_rate": 3.224722243141942e-07, + "loss": 0.8514, + "step": 11396 + }, + { + "epoch": 0.8222647090653296, + "grad_norm": 2.402764806057039, + "learning_rate": 3.2221779489372815e-07, + "loss": 0.8308, + "step": 11397 + }, + { + "epoch": 0.8223368565347571, + "grad_norm": 5.820373452910703, + "learning_rate": 3.2196345709073793e-07, + "loss": 0.8707, + "step": 11398 + }, + { + "epoch": 0.8224090040041846, + "grad_norm": 5.9908170363839135, + "learning_rate": 3.217092109191115e-07, + "loss": 0.9752, + "step": 11399 + }, + { + "epoch": 0.8224811514736121, + "grad_norm": 2.5001969259903687, + "learning_rate": 3.2145505639273274e-07, + "loss": 0.8749, + "step": 11400 + }, + { + "epoch": 0.8225532989430395, + "grad_norm": 2.921318939662187, + "learning_rate": 3.212009935254791e-07, + "loss": 0.8635, + "step": 11401 + }, + { + "epoch": 0.822625446412467, + "grad_norm": 2.559371992954821, + "learning_rate": 3.2094702233122405e-07, + "loss": 0.8349, + "step": 11402 + }, + { + "epoch": 0.8226975938818946, + "grad_norm": 2.4317119051738847, + "learning_rate": 3.2069314282383754e-07, + "loss": 0.8736, + "step": 11403 + }, + { + "epoch": 0.8227697413513221, + "grad_norm": 2.5716795836656012, + "learning_rate": 3.2043935501718113e-07, + "loss": 0.7912, + "step": 11404 + }, + { + "epoch": 0.8228418888207496, + "grad_norm": 2.0034005343082795, + "learning_rate": 3.201856589251142e-07, + "loss": 0.9301, + "step": 11405 + }, + { + "epoch": 0.8229140362901771, + "grad_norm": 2.4168665134625216, + "learning_rate": 3.1993205456149054e-07, + "loss": 0.9591, + "step": 11406 + }, + { + "epoch": 0.8229861837596046, + "grad_norm": 4.4471440355167084, + "learning_rate": 3.196785419401573e-07, + "loss": 0.9019, + "step": 11407 + }, + { + "epoch": 0.8230583312290322, + "grad_norm": 4.560703289608918, + "learning_rate": 3.194251210749581e-07, + "loss": 0.9019, + "step": 11408 + }, + { + "epoch": 0.8231304786984597, + "grad_norm": 4.169251949625737, + "learning_rate": 3.191717919797323e-07, + "loss": 0.8743, + "step": 11409 + }, + { + "epoch": 0.8232026261678872, + "grad_norm": 1.7863315578071515, + "learning_rate": 3.189185546683133e-07, + "loss": 0.896, + "step": 11410 + }, + { + "epoch": 0.8232747736373147, + "grad_norm": 6.490849436230099, + "learning_rate": 3.1866540915452846e-07, + "loss": 0.8731, + "step": 11411 + }, + { + "epoch": 0.8233469211067422, + "grad_norm": 2.02532905919116, + "learning_rate": 3.1841235545220137e-07, + "loss": 0.8689, + "step": 11412 + }, + { + "epoch": 0.8234190685761696, + "grad_norm": 1.743212000938341, + "learning_rate": 3.18159393575151e-07, + "loss": 0.8226, + "step": 11413 + }, + { + "epoch": 0.8234912160455972, + "grad_norm": 1.9779998016511608, + "learning_rate": 3.179065235371896e-07, + "loss": 0.8161, + "step": 11414 + }, + { + "epoch": 0.8235633635150247, + "grad_norm": 3.032614961034798, + "learning_rate": 3.1765374535212575e-07, + "loss": 0.9197, + "step": 11415 + }, + { + "epoch": 0.8236355109844522, + "grad_norm": 1.9096492414304833, + "learning_rate": 3.174010590337641e-07, + "loss": 0.9256, + "step": 11416 + }, + { + "epoch": 0.8237076584538797, + "grad_norm": 2.1218278433007813, + "learning_rate": 3.1714846459590106e-07, + "loss": 0.8937, + "step": 11417 + }, + { + "epoch": 0.8237798059233072, + "grad_norm": 2.31666849076819, + "learning_rate": 3.168959620523308e-07, + "loss": 0.8968, + "step": 11418 + }, + { + "epoch": 0.8238519533927348, + "grad_norm": 3.6386264013949305, + "learning_rate": 3.166435514168415e-07, + "loss": 0.8799, + "step": 11419 + }, + { + "epoch": 0.8239241008621623, + "grad_norm": 3.757961087848138, + "learning_rate": 3.1639123270321677e-07, + "loss": 0.965, + "step": 11420 + }, + { + "epoch": 0.8239962483315898, + "grad_norm": 2.7073090205725565, + "learning_rate": 3.1613900592523337e-07, + "loss": 1.0061, + "step": 11421 + }, + { + "epoch": 0.8240683958010173, + "grad_norm": 2.4221365510545323, + "learning_rate": 3.1588687109666577e-07, + "loss": 1.0212, + "step": 11422 + }, + { + "epoch": 0.8241405432704448, + "grad_norm": 8.058291260833998, + "learning_rate": 3.156348282312822e-07, + "loss": 0.9553, + "step": 11423 + }, + { + "epoch": 0.8242126907398722, + "grad_norm": 2.0263450676503543, + "learning_rate": 3.1538287734284507e-07, + "loss": 0.9034, + "step": 11424 + }, + { + "epoch": 0.8242848382092998, + "grad_norm": 2.161605155891239, + "learning_rate": 3.1513101844511256e-07, + "loss": 0.9515, + "step": 11425 + }, + { + "epoch": 0.8243569856787273, + "grad_norm": 2.163366763655519, + "learning_rate": 3.148792515518377e-07, + "loss": 0.8939, + "step": 11426 + }, + { + "epoch": 0.8244291331481548, + "grad_norm": 2.3754384489409595, + "learning_rate": 3.1462757667676964e-07, + "loss": 0.9924, + "step": 11427 + }, + { + "epoch": 0.8245012806175823, + "grad_norm": 2.4124610304773375, + "learning_rate": 3.143759938336501e-07, + "loss": 0.9454, + "step": 11428 + }, + { + "epoch": 0.8245734280870098, + "grad_norm": 3.648868380145746, + "learning_rate": 3.1412450303621763e-07, + "loss": 0.8531, + "step": 11429 + }, + { + "epoch": 0.8246455755564374, + "grad_norm": 0.7526756085517048, + "learning_rate": 3.1387310429820544e-07, + "loss": 0.783, + "step": 11430 + }, + { + "epoch": 0.8247177230258649, + "grad_norm": 9.108581403708719, + "learning_rate": 3.1362179763334107e-07, + "loss": 0.8554, + "step": 11431 + }, + { + "epoch": 0.8247898704952924, + "grad_norm": 2.1984779641576395, + "learning_rate": 3.1337058305534683e-07, + "loss": 0.9118, + "step": 11432 + }, + { + "epoch": 0.8248620179647199, + "grad_norm": 2.0966429580577133, + "learning_rate": 3.1311946057794237e-07, + "loss": 1.017, + "step": 11433 + }, + { + "epoch": 0.8249341654341474, + "grad_norm": 6.059810080998697, + "learning_rate": 3.12868430214839e-07, + "loss": 0.9093, + "step": 11434 + }, + { + "epoch": 0.825006312903575, + "grad_norm": 1.9071698080233292, + "learning_rate": 3.126174919797453e-07, + "loss": 0.9081, + "step": 11435 + }, + { + "epoch": 0.8250784603730024, + "grad_norm": 2.0375566439565658, + "learning_rate": 3.1236664588636385e-07, + "loss": 0.9586, + "step": 11436 + }, + { + "epoch": 0.8251506078424299, + "grad_norm": 2.535680026124206, + "learning_rate": 3.121158919483928e-07, + "loss": 0.9491, + "step": 11437 + }, + { + "epoch": 0.8252227553118574, + "grad_norm": 3.174956079614789, + "learning_rate": 3.1186523017952414e-07, + "loss": 0.8104, + "step": 11438 + }, + { + "epoch": 0.8252949027812849, + "grad_norm": 2.250964911507266, + "learning_rate": 3.1161466059344556e-07, + "loss": 0.9244, + "step": 11439 + }, + { + "epoch": 0.8253670502507124, + "grad_norm": 2.8912624996457708, + "learning_rate": 3.1136418320384116e-07, + "loss": 0.9121, + "step": 11440 + }, + { + "epoch": 0.82543919772014, + "grad_norm": 2.067013743184636, + "learning_rate": 3.11113798024387e-07, + "loss": 0.9907, + "step": 11441 + }, + { + "epoch": 0.8255113451895675, + "grad_norm": 3.572097031842083, + "learning_rate": 3.1086350506875623e-07, + "loss": 1.0039, + "step": 11442 + }, + { + "epoch": 0.825583492658995, + "grad_norm": 2.727773927382449, + "learning_rate": 3.106133043506167e-07, + "loss": 0.8644, + "step": 11443 + }, + { + "epoch": 0.8256556401284225, + "grad_norm": 2.3977713886954044, + "learning_rate": 3.1036319588363105e-07, + "loss": 0.9255, + "step": 11444 + }, + { + "epoch": 0.82572778759785, + "grad_norm": 2.1778514060832435, + "learning_rate": 3.1011317968145554e-07, + "loss": 0.9961, + "step": 11445 + }, + { + "epoch": 0.8257999350672776, + "grad_norm": 2.556929604770859, + "learning_rate": 3.098632557577441e-07, + "loss": 0.8075, + "step": 11446 + }, + { + "epoch": 0.8258720825367051, + "grad_norm": 2.309264935876216, + "learning_rate": 3.0961342412614386e-07, + "loss": 0.9109, + "step": 11447 + }, + { + "epoch": 0.8259442300061325, + "grad_norm": 2.096637954616399, + "learning_rate": 3.093636848002967e-07, + "loss": 0.9236, + "step": 11448 + }, + { + "epoch": 0.82601637747556, + "grad_norm": 3.775207104591237, + "learning_rate": 3.0911403779384015e-07, + "loss": 0.835, + "step": 11449 + }, + { + "epoch": 0.8260885249449875, + "grad_norm": 8.741960592452932, + "learning_rate": 3.0886448312040704e-07, + "loss": 1.0107, + "step": 11450 + }, + { + "epoch": 0.826160672414415, + "grad_norm": 2.46435309845153, + "learning_rate": 3.0861502079362287e-07, + "loss": 0.9035, + "step": 11451 + }, + { + "epoch": 0.8262328198838426, + "grad_norm": 3.931150857860932, + "learning_rate": 3.0836565082711195e-07, + "loss": 0.9318, + "step": 11452 + }, + { + "epoch": 0.8263049673532701, + "grad_norm": 2.6102495384044184, + "learning_rate": 3.081163732344905e-07, + "loss": 1.0801, + "step": 11453 + }, + { + "epoch": 0.8263771148226976, + "grad_norm": 3.098359652266024, + "learning_rate": 3.0786718802937127e-07, + "loss": 0.9107, + "step": 11454 + }, + { + "epoch": 0.8264492622921251, + "grad_norm": 3.4071465108645653, + "learning_rate": 3.0761809522536044e-07, + "loss": 0.961, + "step": 11455 + }, + { + "epoch": 0.8265214097615526, + "grad_norm": 2.689264295011796, + "learning_rate": 3.0736909483605966e-07, + "loss": 0.8181, + "step": 11456 + }, + { + "epoch": 0.8265935572309802, + "grad_norm": 2.0406434226263306, + "learning_rate": 3.071201868750679e-07, + "loss": 0.9503, + "step": 11457 + }, + { + "epoch": 0.8266657047004077, + "grad_norm": 2.3471091040667074, + "learning_rate": 3.068713713559754e-07, + "loss": 0.9416, + "step": 11458 + }, + { + "epoch": 0.8267378521698352, + "grad_norm": 3.0976497847308004, + "learning_rate": 3.066226482923697e-07, + "loss": 0.8577, + "step": 11459 + }, + { + "epoch": 0.8268099996392626, + "grad_norm": 0.7688357700789189, + "learning_rate": 3.063740176978329e-07, + "loss": 0.7818, + "step": 11460 + }, + { + "epoch": 0.8268821471086901, + "grad_norm": 0.8213998037240959, + "learning_rate": 3.061254795859407e-07, + "loss": 0.8408, + "step": 11461 + }, + { + "epoch": 0.8269542945781176, + "grad_norm": 2.0190377139630833, + "learning_rate": 3.058770339702659e-07, + "loss": 0.9561, + "step": 11462 + }, + { + "epoch": 0.8270264420475452, + "grad_norm": 2.117779300649566, + "learning_rate": 3.056286808643742e-07, + "loss": 0.8596, + "step": 11463 + }, + { + "epoch": 0.8270985895169727, + "grad_norm": 2.319736704684141, + "learning_rate": 3.0538042028182865e-07, + "loss": 0.9443, + "step": 11464 + }, + { + "epoch": 0.8271707369864002, + "grad_norm": 1.9385397182525186, + "learning_rate": 3.0513225223618497e-07, + "loss": 1.0861, + "step": 11465 + }, + { + "epoch": 0.8272428844558277, + "grad_norm": 2.183235698681504, + "learning_rate": 3.0488417674099466e-07, + "loss": 0.9391, + "step": 11466 + }, + { + "epoch": 0.8273150319252552, + "grad_norm": 2.7036717032895656, + "learning_rate": 3.0463619380980475e-07, + "loss": 0.9185, + "step": 11467 + }, + { + "epoch": 0.8273871793946828, + "grad_norm": 2.530446337052689, + "learning_rate": 3.0438830345615586e-07, + "loss": 0.9395, + "step": 11468 + }, + { + "epoch": 0.8274593268641103, + "grad_norm": 2.466295494901243, + "learning_rate": 3.041405056935842e-07, + "loss": 1.0796, + "step": 11469 + }, + { + "epoch": 0.8275314743335378, + "grad_norm": 4.131106855766354, + "learning_rate": 3.038928005356223e-07, + "loss": 0.8681, + "step": 11470 + }, + { + "epoch": 0.8276036218029652, + "grad_norm": 1.9327130402344612, + "learning_rate": 3.036451879957962e-07, + "loss": 1.0009, + "step": 11471 + }, + { + "epoch": 0.8276757692723927, + "grad_norm": 2.477287306754612, + "learning_rate": 3.0339766808762623e-07, + "loss": 0.8606, + "step": 11472 + }, + { + "epoch": 0.8277479167418202, + "grad_norm": 3.293856861907238, + "learning_rate": 3.031502408246289e-07, + "loss": 0.8119, + "step": 11473 + }, + { + "epoch": 0.8278200642112478, + "grad_norm": 2.5966644458818386, + "learning_rate": 3.0290290622031613e-07, + "loss": 0.9725, + "step": 11474 + }, + { + "epoch": 0.8278922116806753, + "grad_norm": 2.4226927330421724, + "learning_rate": 3.0265566428819214e-07, + "loss": 0.9306, + "step": 11475 + }, + { + "epoch": 0.8279643591501028, + "grad_norm": 4.751579825041376, + "learning_rate": 3.0240851504175923e-07, + "loss": 0.8655, + "step": 11476 + }, + { + "epoch": 0.8280365066195303, + "grad_norm": 1.7046065579646392, + "learning_rate": 3.021614584945138e-07, + "loss": 0.756, + "step": 11477 + }, + { + "epoch": 0.8281086540889578, + "grad_norm": 2.353711638459619, + "learning_rate": 3.019144946599452e-07, + "loss": 1.0143, + "step": 11478 + }, + { + "epoch": 0.8281808015583854, + "grad_norm": 2.888912208984673, + "learning_rate": 3.016676235515403e-07, + "loss": 0.99, + "step": 11479 + }, + { + "epoch": 0.8282529490278129, + "grad_norm": 4.838930550323037, + "learning_rate": 3.0142084518277865e-07, + "loss": 0.9511, + "step": 11480 + }, + { + "epoch": 0.8283250964972404, + "grad_norm": 3.7175257615079933, + "learning_rate": 3.011741595671378e-07, + "loss": 0.9719, + "step": 11481 + }, + { + "epoch": 0.8283972439666679, + "grad_norm": 2.3213722557531082, + "learning_rate": 3.0092756671808684e-07, + "loss": 0.9828, + "step": 11482 + }, + { + "epoch": 0.8284693914360953, + "grad_norm": 3.6321971084952063, + "learning_rate": 3.0068106664909174e-07, + "loss": 1.0285, + "step": 11483 + }, + { + "epoch": 0.8285415389055228, + "grad_norm": 2.275810026228503, + "learning_rate": 3.004346593736133e-07, + "loss": 1.0371, + "step": 11484 + }, + { + "epoch": 0.8286136863749504, + "grad_norm": 2.3579258794807356, + "learning_rate": 3.0018834490510613e-07, + "loss": 0.9045, + "step": 11485 + }, + { + "epoch": 0.8286858338443779, + "grad_norm": 2.2063786714979403, + "learning_rate": 2.99942123257021e-07, + "loss": 0.7946, + "step": 11486 + }, + { + "epoch": 0.8287579813138054, + "grad_norm": 4.501560788218826, + "learning_rate": 2.9969599444280235e-07, + "loss": 0.9306, + "step": 11487 + }, + { + "epoch": 0.8288301287832329, + "grad_norm": 2.3829193591561615, + "learning_rate": 2.994499584758925e-07, + "loss": 0.9024, + "step": 11488 + }, + { + "epoch": 0.8289022762526604, + "grad_norm": 2.7143743095423423, + "learning_rate": 2.992040153697244e-07, + "loss": 0.9554, + "step": 11489 + }, + { + "epoch": 0.828974423722088, + "grad_norm": 2.033081051022825, + "learning_rate": 2.989581651377291e-07, + "loss": 0.8428, + "step": 11490 + }, + { + "epoch": 0.8290465711915155, + "grad_norm": 4.082062789576122, + "learning_rate": 2.9871240779333164e-07, + "loss": 0.8684, + "step": 11491 + }, + { + "epoch": 0.829118718660943, + "grad_norm": 0.6924812236195932, + "learning_rate": 2.984667433499513e-07, + "loss": 0.7332, + "step": 11492 + }, + { + "epoch": 0.8291908661303705, + "grad_norm": 3.7234266350604046, + "learning_rate": 2.982211718210028e-07, + "loss": 0.9312, + "step": 11493 + }, + { + "epoch": 0.829263013599798, + "grad_norm": 2.482914717378431, + "learning_rate": 2.9797569321989713e-07, + "loss": 1.0054, + "step": 11494 + }, + { + "epoch": 0.8293351610692254, + "grad_norm": 2.60271787905858, + "learning_rate": 2.977303075600379e-07, + "loss": 0.9896, + "step": 11495 + }, + { + "epoch": 0.829407308538653, + "grad_norm": 2.3034700960702725, + "learning_rate": 2.9748501485482493e-07, + "loss": 0.9212, + "step": 11496 + }, + { + "epoch": 0.8294794560080805, + "grad_norm": 2.4047663685021163, + "learning_rate": 2.972398151176527e-07, + "loss": 0.9035, + "step": 11497 + }, + { + "epoch": 0.829551603477508, + "grad_norm": 2.4326278704665567, + "learning_rate": 2.9699470836191134e-07, + "loss": 0.9449, + "step": 11498 + }, + { + "epoch": 0.8296237509469355, + "grad_norm": 2.6205017967749296, + "learning_rate": 2.9674969460098354e-07, + "loss": 0.7836, + "step": 11499 + }, + { + "epoch": 0.829695898416363, + "grad_norm": 12.732150280170115, + "learning_rate": 2.9650477384825023e-07, + "loss": 0.8844, + "step": 11500 + }, + { + "epoch": 0.8297680458857906, + "grad_norm": 2.4803585959177883, + "learning_rate": 2.962599461170856e-07, + "loss": 0.8887, + "step": 11501 + }, + { + "epoch": 0.8298401933552181, + "grad_norm": 2.9205275095679317, + "learning_rate": 2.960152114208578e-07, + "loss": 0.8682, + "step": 11502 + }, + { + "epoch": 0.8299123408246456, + "grad_norm": 2.1084001337056053, + "learning_rate": 2.957705697729316e-07, + "loss": 0.9539, + "step": 11503 + }, + { + "epoch": 0.8299844882940731, + "grad_norm": 0.7561292536157578, + "learning_rate": 2.955260211866659e-07, + "loss": 0.8615, + "step": 11504 + }, + { + "epoch": 0.8300566357635006, + "grad_norm": 0.6494744523730539, + "learning_rate": 2.9528156567541374e-07, + "loss": 0.7861, + "step": 11505 + }, + { + "epoch": 0.8301287832329282, + "grad_norm": 0.700174562793197, + "learning_rate": 2.95037203252525e-07, + "loss": 0.8049, + "step": 11506 + }, + { + "epoch": 0.8302009307023556, + "grad_norm": 3.180892678449168, + "learning_rate": 2.9479293393134285e-07, + "loss": 0.8349, + "step": 11507 + }, + { + "epoch": 0.8302730781717831, + "grad_norm": 3.247854992124144, + "learning_rate": 2.9454875772520684e-07, + "loss": 0.8672, + "step": 11508 + }, + { + "epoch": 0.8303452256412106, + "grad_norm": 2.213544838098984, + "learning_rate": 2.943046746474491e-07, + "loss": 0.9528, + "step": 11509 + }, + { + "epoch": 0.8304173731106381, + "grad_norm": 5.494220818491349, + "learning_rate": 2.940606847113991e-07, + "loss": 0.8083, + "step": 11510 + }, + { + "epoch": 0.8304895205800656, + "grad_norm": 2.962018703847246, + "learning_rate": 2.938167879303797e-07, + "loss": 0.8966, + "step": 11511 + }, + { + "epoch": 0.8305616680494932, + "grad_norm": 3.2987892415096636, + "learning_rate": 2.935729843177095e-07, + "loss": 0.8831, + "step": 11512 + }, + { + "epoch": 0.8306338155189207, + "grad_norm": 2.232538922729883, + "learning_rate": 2.933292738867017e-07, + "loss": 0.9558, + "step": 11513 + }, + { + "epoch": 0.8307059629883482, + "grad_norm": 2.881501020969817, + "learning_rate": 2.9308565665066456e-07, + "loss": 0.9328, + "step": 11514 + }, + { + "epoch": 0.8307781104577757, + "grad_norm": 2.032714550523189, + "learning_rate": 2.928421326229011e-07, + "loss": 0.9787, + "step": 11515 + }, + { + "epoch": 0.8308502579272032, + "grad_norm": 3.7418133701940937, + "learning_rate": 2.9259870181670865e-07, + "loss": 0.9223, + "step": 11516 + }, + { + "epoch": 0.8309224053966308, + "grad_norm": 2.0317954871520234, + "learning_rate": 2.9235536424538023e-07, + "loss": 0.9513, + "step": 11517 + }, + { + "epoch": 0.8309945528660583, + "grad_norm": 12.41843505828633, + "learning_rate": 2.921121199222045e-07, + "loss": 0.8951, + "step": 11518 + }, + { + "epoch": 0.8310667003354857, + "grad_norm": 3.588945577375177, + "learning_rate": 2.918689688604634e-07, + "loss": 0.9901, + "step": 11519 + }, + { + "epoch": 0.8311388478049132, + "grad_norm": 3.4520053342864196, + "learning_rate": 2.916259110734345e-07, + "loss": 0.9008, + "step": 11520 + }, + { + "epoch": 0.8312109952743407, + "grad_norm": 3.9025713879973454, + "learning_rate": 2.9138294657439067e-07, + "loss": 0.8274, + "step": 11521 + }, + { + "epoch": 0.8312831427437682, + "grad_norm": 2.910648422097823, + "learning_rate": 2.911400753765989e-07, + "loss": 0.9922, + "step": 11522 + }, + { + "epoch": 0.8313552902131958, + "grad_norm": 3.3184464540332925, + "learning_rate": 2.90897297493321e-07, + "loss": 0.9163, + "step": 11523 + }, + { + "epoch": 0.8314274376826233, + "grad_norm": 9.289569658017122, + "learning_rate": 2.9065461293781537e-07, + "loss": 0.8791, + "step": 11524 + }, + { + "epoch": 0.8314995851520508, + "grad_norm": 2.346336565082223, + "learning_rate": 2.904120217233339e-07, + "loss": 0.9493, + "step": 11525 + }, + { + "epoch": 0.8315717326214783, + "grad_norm": 7.484830392509818, + "learning_rate": 2.901695238631228e-07, + "loss": 0.9597, + "step": 11526 + }, + { + "epoch": 0.8316438800909058, + "grad_norm": 14.03382221255331, + "learning_rate": 2.899271193704245e-07, + "loss": 0.8572, + "step": 11527 + }, + { + "epoch": 0.8317160275603334, + "grad_norm": 1.846415516453275, + "learning_rate": 2.89684808258476e-07, + "loss": 0.9002, + "step": 11528 + }, + { + "epoch": 0.8317881750297609, + "grad_norm": 2.1975083416291143, + "learning_rate": 2.8944259054050803e-07, + "loss": 0.8681, + "step": 11529 + }, + { + "epoch": 0.8318603224991883, + "grad_norm": 2.861198469225414, + "learning_rate": 2.8920046622974823e-07, + "loss": 0.9223, + "step": 11530 + }, + { + "epoch": 0.8319324699686158, + "grad_norm": 2.312606293580926, + "learning_rate": 2.8895843533941786e-07, + "loss": 0.9132, + "step": 11531 + }, + { + "epoch": 0.8320046174380433, + "grad_norm": 1.8386796966120549, + "learning_rate": 2.887164978827339e-07, + "loss": 0.8967, + "step": 11532 + }, + { + "epoch": 0.8320767649074708, + "grad_norm": 3.8483133907029927, + "learning_rate": 2.884746538729064e-07, + "loss": 1.0006, + "step": 11533 + }, + { + "epoch": 0.8321489123768984, + "grad_norm": 2.647339359701511, + "learning_rate": 2.882329033231423e-07, + "loss": 0.8515, + "step": 11534 + }, + { + "epoch": 0.8322210598463259, + "grad_norm": 2.6855127838745276, + "learning_rate": 2.879912462466425e-07, + "loss": 0.9341, + "step": 11535 + }, + { + "epoch": 0.8322932073157534, + "grad_norm": 4.08408642006957, + "learning_rate": 2.8774968265660324e-07, + "loss": 0.8593, + "step": 11536 + }, + { + "epoch": 0.8323653547851809, + "grad_norm": 0.8202626167752456, + "learning_rate": 2.8750821256621515e-07, + "loss": 0.7942, + "step": 11537 + }, + { + "epoch": 0.8324375022546084, + "grad_norm": 2.32009342071283, + "learning_rate": 2.8726683598866476e-07, + "loss": 0.8942, + "step": 11538 + }, + { + "epoch": 0.832509649724036, + "grad_norm": 2.104106398622668, + "learning_rate": 2.870255529371315e-07, + "loss": 0.9105, + "step": 11539 + }, + { + "epoch": 0.8325817971934635, + "grad_norm": 2.3899810522118456, + "learning_rate": 2.867843634247917e-07, + "loss": 0.9123, + "step": 11540 + }, + { + "epoch": 0.832653944662891, + "grad_norm": 2.5296111276182778, + "learning_rate": 2.8654326746481517e-07, + "loss": 0.9352, + "step": 11541 + }, + { + "epoch": 0.8327260921323184, + "grad_norm": 5.314916083749105, + "learning_rate": 2.863022650703688e-07, + "loss": 0.8783, + "step": 11542 + }, + { + "epoch": 0.8327982396017459, + "grad_norm": 2.5616228416663485, + "learning_rate": 2.860613562546113e-07, + "loss": 0.8825, + "step": 11543 + }, + { + "epoch": 0.8328703870711734, + "grad_norm": 2.480565058723331, + "learning_rate": 2.8582054103069844e-07, + "loss": 0.8705, + "step": 11544 + }, + { + "epoch": 0.832942534540601, + "grad_norm": 3.32056352451829, + "learning_rate": 2.8557981941178043e-07, + "loss": 0.9123, + "step": 11545 + }, + { + "epoch": 0.8330146820100285, + "grad_norm": 2.3893938073114813, + "learning_rate": 2.8533919141100174e-07, + "loss": 0.9198, + "step": 11546 + }, + { + "epoch": 0.833086829479456, + "grad_norm": 2.806533588446975, + "learning_rate": 2.8509865704150173e-07, + "loss": 0.888, + "step": 11547 + }, + { + "epoch": 0.8331589769488835, + "grad_norm": 1.9186171544711332, + "learning_rate": 2.848582163164166e-07, + "loss": 0.8673, + "step": 11548 + }, + { + "epoch": 0.833231124418311, + "grad_norm": 3.1127917049629787, + "learning_rate": 2.8461786924887455e-07, + "loss": 0.8385, + "step": 11549 + }, + { + "epoch": 0.8333032718877386, + "grad_norm": 6.381173230343641, + "learning_rate": 2.8437761585200057e-07, + "loss": 0.8208, + "step": 11550 + }, + { + "epoch": 0.8333754193571661, + "grad_norm": 3.6027472609854394, + "learning_rate": 2.84137456138914e-07, + "loss": 0.8711, + "step": 11551 + }, + { + "epoch": 0.8334475668265936, + "grad_norm": 3.3169314414556257, + "learning_rate": 2.838973901227295e-07, + "loss": 0.9003, + "step": 11552 + }, + { + "epoch": 0.8335197142960211, + "grad_norm": 5.047605479377271, + "learning_rate": 2.8365741781655494e-07, + "loss": 0.9965, + "step": 11553 + }, + { + "epoch": 0.8335918617654485, + "grad_norm": 3.369976968205473, + "learning_rate": 2.8341753923349543e-07, + "loss": 0.9471, + "step": 11554 + }, + { + "epoch": 0.833664009234876, + "grad_norm": 8.049260113756185, + "learning_rate": 2.8317775438665003e-07, + "loss": 0.9579, + "step": 11555 + }, + { + "epoch": 0.8337361567043036, + "grad_norm": 2.9533566984482325, + "learning_rate": 2.829380632891114e-07, + "loss": 0.8951, + "step": 11556 + }, + { + "epoch": 0.8338083041737311, + "grad_norm": 2.3391115192381, + "learning_rate": 2.8269846595396906e-07, + "loss": 0.9037, + "step": 11557 + }, + { + "epoch": 0.8338804516431586, + "grad_norm": 1.6875218284042854, + "learning_rate": 2.824589623943061e-07, + "loss": 0.8811, + "step": 11558 + }, + { + "epoch": 0.8339525991125861, + "grad_norm": 2.44991923997081, + "learning_rate": 2.8221955262320096e-07, + "loss": 0.9374, + "step": 11559 + }, + { + "epoch": 0.8340247465820136, + "grad_norm": 2.603957439918608, + "learning_rate": 2.8198023665372716e-07, + "loss": 0.8712, + "step": 11560 + }, + { + "epoch": 0.8340968940514412, + "grad_norm": 2.868042031154401, + "learning_rate": 2.817410144989529e-07, + "loss": 0.7403, + "step": 11561 + }, + { + "epoch": 0.8341690415208687, + "grad_norm": 3.4723492069336257, + "learning_rate": 2.815018861719414e-07, + "loss": 0.9744, + "step": 11562 + }, + { + "epoch": 0.8342411889902962, + "grad_norm": 13.092625389875378, + "learning_rate": 2.812628516857498e-07, + "loss": 0.8439, + "step": 11563 + }, + { + "epoch": 0.8343133364597237, + "grad_norm": 2.2050991006669536, + "learning_rate": 2.8102391105343117e-07, + "loss": 0.9428, + "step": 11564 + }, + { + "epoch": 0.8343854839291512, + "grad_norm": 2.701545739108106, + "learning_rate": 2.807850642880338e-07, + "loss": 0.9294, + "step": 11565 + }, + { + "epoch": 0.8344576313985786, + "grad_norm": 3.434421339755046, + "learning_rate": 2.80546311402599e-07, + "loss": 0.9636, + "step": 11566 + }, + { + "epoch": 0.8345297788680062, + "grad_norm": 4.406648225577392, + "learning_rate": 2.803076524101655e-07, + "loss": 0.9865, + "step": 11567 + }, + { + "epoch": 0.8346019263374337, + "grad_norm": 2.709095969785999, + "learning_rate": 2.8006908732376475e-07, + "loss": 0.8321, + "step": 11568 + }, + { + "epoch": 0.8346740738068612, + "grad_norm": 2.1251466363914213, + "learning_rate": 2.798306161564248e-07, + "loss": 0.7808, + "step": 11569 + }, + { + "epoch": 0.8347462212762887, + "grad_norm": 2.1954205578040535, + "learning_rate": 2.7959223892116643e-07, + "loss": 1.0206, + "step": 11570 + }, + { + "epoch": 0.8348183687457162, + "grad_norm": 5.559935589424119, + "learning_rate": 2.7935395563100696e-07, + "loss": 0.8484, + "step": 11571 + }, + { + "epoch": 0.8348905162151438, + "grad_norm": 2.5416894088311333, + "learning_rate": 2.791157662989589e-07, + "loss": 0.9671, + "step": 11572 + }, + { + "epoch": 0.8349626636845713, + "grad_norm": 2.9856446446440668, + "learning_rate": 2.7887767093802825e-07, + "loss": 0.9142, + "step": 11573 + }, + { + "epoch": 0.8350348111539988, + "grad_norm": 2.370513191352192, + "learning_rate": 2.7863966956121653e-07, + "loss": 0.9789, + "step": 11574 + }, + { + "epoch": 0.8351069586234263, + "grad_norm": 2.571896514246379, + "learning_rate": 2.7840176218152e-07, + "loss": 0.9943, + "step": 11575 + }, + { + "epoch": 0.8351791060928538, + "grad_norm": 2.9888658537640787, + "learning_rate": 2.781639488119307e-07, + "loss": 0.9545, + "step": 11576 + }, + { + "epoch": 0.8352512535622812, + "grad_norm": 3.2524840325352824, + "learning_rate": 2.779262294654332e-07, + "loss": 0.9263, + "step": 11577 + }, + { + "epoch": 0.8353234010317088, + "grad_norm": 2.457198824118386, + "learning_rate": 2.7768860415500995e-07, + "loss": 0.8947, + "step": 11578 + }, + { + "epoch": 0.8353955485011363, + "grad_norm": 2.7743875944707197, + "learning_rate": 2.7745107289363655e-07, + "loss": 0.9275, + "step": 11579 + }, + { + "epoch": 0.8354676959705638, + "grad_norm": 2.8096699357228916, + "learning_rate": 2.7721363569428314e-07, + "loss": 0.9273, + "step": 11580 + }, + { + "epoch": 0.8355398434399913, + "grad_norm": 2.4506776106191905, + "learning_rate": 2.769762925699155e-07, + "loss": 0.8509, + "step": 11581 + }, + { + "epoch": 0.8356119909094188, + "grad_norm": 2.386705663866795, + "learning_rate": 2.7673904353349444e-07, + "loss": 0.8697, + "step": 11582 + }, + { + "epoch": 0.8356841383788464, + "grad_norm": 4.142800274350573, + "learning_rate": 2.7650188859797396e-07, + "loss": 0.8951, + "step": 11583 + }, + { + "epoch": 0.8357562858482739, + "grad_norm": 2.156827876842732, + "learning_rate": 2.7626482777630575e-07, + "loss": 0.8248, + "step": 11584 + }, + { + "epoch": 0.8358284333177014, + "grad_norm": 3.8503463093948453, + "learning_rate": 2.7602786108143437e-07, + "loss": 1.017, + "step": 11585 + }, + { + "epoch": 0.8359005807871289, + "grad_norm": 2.6637494303193865, + "learning_rate": 2.757909885262999e-07, + "loss": 0.9473, + "step": 11586 + }, + { + "epoch": 0.8359727282565564, + "grad_norm": 1.9311233923438242, + "learning_rate": 2.755542101238362e-07, + "loss": 0.8618, + "step": 11587 + }, + { + "epoch": 0.836044875725984, + "grad_norm": 1.8922481662591675, + "learning_rate": 2.7531752588697354e-07, + "loss": 0.9342, + "step": 11588 + }, + { + "epoch": 0.8361170231954114, + "grad_norm": 3.2450985260804837, + "learning_rate": 2.7508093582863656e-07, + "loss": 0.8857, + "step": 11589 + }, + { + "epoch": 0.8361891706648389, + "grad_norm": 2.6995610904694782, + "learning_rate": 2.7484443996174334e-07, + "loss": 0.9034, + "step": 11590 + }, + { + "epoch": 0.8362613181342664, + "grad_norm": 6.98760188101808, + "learning_rate": 2.7460803829920954e-07, + "loss": 0.8442, + "step": 11591 + }, + { + "epoch": 0.8363334656036939, + "grad_norm": 2.2165270132754604, + "learning_rate": 2.7437173085394396e-07, + "loss": 0.9054, + "step": 11592 + }, + { + "epoch": 0.8364056130731214, + "grad_norm": 2.392079826936697, + "learning_rate": 2.7413551763884954e-07, + "loss": 0.8779, + "step": 11593 + }, + { + "epoch": 0.836477760542549, + "grad_norm": 1.9977889355935, + "learning_rate": 2.7389939866682586e-07, + "loss": 0.8059, + "step": 11594 + }, + { + "epoch": 0.8365499080119765, + "grad_norm": 2.2464971826885054, + "learning_rate": 2.7366337395076546e-07, + "loss": 0.8888, + "step": 11595 + }, + { + "epoch": 0.836622055481404, + "grad_norm": 3.560237902503752, + "learning_rate": 2.734274435035586e-07, + "loss": 0.8592, + "step": 11596 + }, + { + "epoch": 0.8366942029508315, + "grad_norm": 2.49580575060582, + "learning_rate": 2.731916073380871e-07, + "loss": 0.8961, + "step": 11597 + }, + { + "epoch": 0.836766350420259, + "grad_norm": 2.221161619713309, + "learning_rate": 2.7295586546722926e-07, + "loss": 0.8869, + "step": 11598 + }, + { + "epoch": 0.8368384978896866, + "grad_norm": 1.7703298488753358, + "learning_rate": 2.7272021790385903e-07, + "loss": 0.9372, + "step": 11599 + }, + { + "epoch": 0.8369106453591141, + "grad_norm": 1.943250674174133, + "learning_rate": 2.72484664660843e-07, + "loss": 0.9629, + "step": 11600 + }, + { + "epoch": 0.8369827928285415, + "grad_norm": 2.225788396821845, + "learning_rate": 2.7224920575104373e-07, + "loss": 0.9248, + "step": 11601 + }, + { + "epoch": 0.837054940297969, + "grad_norm": 2.7874863389026845, + "learning_rate": 2.720138411873201e-07, + "loss": 0.8921, + "step": 11602 + }, + { + "epoch": 0.8371270877673965, + "grad_norm": 2.9396805481997306, + "learning_rate": 2.717785709825242e-07, + "loss": 0.7539, + "step": 11603 + }, + { + "epoch": 0.837199235236824, + "grad_norm": 2.3088894056067297, + "learning_rate": 2.715433951495023e-07, + "loss": 1.0064, + "step": 11604 + }, + { + "epoch": 0.8372713827062516, + "grad_norm": 2.044352950534384, + "learning_rate": 2.713083137010974e-07, + "loss": 0.8834, + "step": 11605 + }, + { + "epoch": 0.8373435301756791, + "grad_norm": 0.6438111646538571, + "learning_rate": 2.7107332665014617e-07, + "loss": 0.7879, + "step": 11606 + }, + { + "epoch": 0.8374156776451066, + "grad_norm": 0.6689703498108764, + "learning_rate": 2.708384340094796e-07, + "loss": 0.7309, + "step": 11607 + }, + { + "epoch": 0.8374878251145341, + "grad_norm": 2.5917358851209893, + "learning_rate": 2.7060363579192547e-07, + "loss": 0.8818, + "step": 11608 + }, + { + "epoch": 0.8375599725839616, + "grad_norm": 4.289705711176581, + "learning_rate": 2.703689320103051e-07, + "loss": 0.8962, + "step": 11609 + }, + { + "epoch": 0.8376321200533892, + "grad_norm": 3.2322995810986046, + "learning_rate": 2.701343226774342e-07, + "loss": 0.8524, + "step": 11610 + }, + { + "epoch": 0.8377042675228167, + "grad_norm": 2.11838455424397, + "learning_rate": 2.698998078061241e-07, + "loss": 0.9322, + "step": 11611 + }, + { + "epoch": 0.8377764149922442, + "grad_norm": 1.081270577119161, + "learning_rate": 2.6966538740918097e-07, + "loss": 0.8692, + "step": 11612 + }, + { + "epoch": 0.8378485624616716, + "grad_norm": 2.2529985791848244, + "learning_rate": 2.69431061499406e-07, + "loss": 0.9035, + "step": 11613 + }, + { + "epoch": 0.8379207099310991, + "grad_norm": 3.338555267036496, + "learning_rate": 2.6919683008959326e-07, + "loss": 0.8637, + "step": 11614 + }, + { + "epoch": 0.8379928574005266, + "grad_norm": 2.056425570613224, + "learning_rate": 2.6896269319253506e-07, + "loss": 0.8926, + "step": 11615 + }, + { + "epoch": 0.8380650048699542, + "grad_norm": 7.999483568688835, + "learning_rate": 2.687286508210167e-07, + "loss": 0.8076, + "step": 11616 + }, + { + "epoch": 0.8381371523393817, + "grad_norm": 2.9264653504922684, + "learning_rate": 2.684947029878171e-07, + "loss": 0.8244, + "step": 11617 + }, + { + "epoch": 0.8382092998088092, + "grad_norm": 2.3169648653213946, + "learning_rate": 2.6826084970571196e-07, + "loss": 0.9291, + "step": 11618 + }, + { + "epoch": 0.8382814472782367, + "grad_norm": 3.7022997745462844, + "learning_rate": 2.680270909874709e-07, + "loss": 0.9102, + "step": 11619 + }, + { + "epoch": 0.8383535947476642, + "grad_norm": 1.9027040990160493, + "learning_rate": 2.6779342684585947e-07, + "loss": 0.9336, + "step": 11620 + }, + { + "epoch": 0.8384257422170918, + "grad_norm": 0.7143094092934799, + "learning_rate": 2.675598572936364e-07, + "loss": 0.801, + "step": 11621 + }, + { + "epoch": 0.8384978896865193, + "grad_norm": 0.7647121592896239, + "learning_rate": 2.673263823435561e-07, + "loss": 0.7913, + "step": 11622 + }, + { + "epoch": 0.8385700371559468, + "grad_norm": 0.7251293034181698, + "learning_rate": 2.670930020083684e-07, + "loss": 0.7978, + "step": 11623 + }, + { + "epoch": 0.8386421846253742, + "grad_norm": 2.6639676342851004, + "learning_rate": 2.6685971630081636e-07, + "loss": 0.97, + "step": 11624 + }, + { + "epoch": 0.8387143320948017, + "grad_norm": 1.7772995492658052, + "learning_rate": 2.6662652523363905e-07, + "loss": 1.0239, + "step": 11625 + }, + { + "epoch": 0.8387864795642292, + "grad_norm": 2.8394256242004667, + "learning_rate": 2.663934288195713e-07, + "loss": 0.9704, + "step": 11626 + }, + { + "epoch": 0.8388586270336568, + "grad_norm": 1.9859026938088744, + "learning_rate": 2.6616042707134025e-07, + "loss": 1.0092, + "step": 11627 + }, + { + "epoch": 0.8389307745030843, + "grad_norm": 2.313452447175389, + "learning_rate": 2.659275200016697e-07, + "loss": 0.8618, + "step": 11628 + }, + { + "epoch": 0.8390029219725118, + "grad_norm": 2.1977961599088878, + "learning_rate": 2.6569470762327826e-07, + "loss": 0.944, + "step": 11629 + }, + { + "epoch": 0.8390750694419393, + "grad_norm": 3.019948439505302, + "learning_rate": 2.6546198994887904e-07, + "loss": 0.9203, + "step": 11630 + }, + { + "epoch": 0.8391472169113668, + "grad_norm": 2.1009130218818424, + "learning_rate": 2.652293669911787e-07, + "loss": 0.9659, + "step": 11631 + }, + { + "epoch": 0.8392193643807944, + "grad_norm": 2.6522268907200224, + "learning_rate": 2.6499683876288093e-07, + "loss": 0.8361, + "step": 11632 + }, + { + "epoch": 0.8392915118502219, + "grad_norm": 1.7446507031637608, + "learning_rate": 2.647644052766838e-07, + "loss": 0.9991, + "step": 11633 + }, + { + "epoch": 0.8393636593196494, + "grad_norm": 0.9547852299141931, + "learning_rate": 2.645320665452782e-07, + "loss": 0.9034, + "step": 11634 + }, + { + "epoch": 0.8394358067890769, + "grad_norm": 2.1130328076438376, + "learning_rate": 2.6429982258135195e-07, + "loss": 0.9248, + "step": 11635 + }, + { + "epoch": 0.8395079542585043, + "grad_norm": 2.021480836726976, + "learning_rate": 2.640676733975875e-07, + "loss": 0.7973, + "step": 11636 + }, + { + "epoch": 0.8395801017279318, + "grad_norm": 1.8720853403967768, + "learning_rate": 2.6383561900666086e-07, + "loss": 0.9731, + "step": 11637 + }, + { + "epoch": 0.8396522491973594, + "grad_norm": 4.27573880955721, + "learning_rate": 2.636036594212434e-07, + "loss": 0.9022, + "step": 11638 + }, + { + "epoch": 0.8397243966667869, + "grad_norm": 2.796807314943485, + "learning_rate": 2.633717946540028e-07, + "loss": 0.9037, + "step": 11639 + }, + { + "epoch": 0.8397965441362144, + "grad_norm": 2.606306544726508, + "learning_rate": 2.631400247176001e-07, + "loss": 0.9114, + "step": 11640 + }, + { + "epoch": 0.8398686916056419, + "grad_norm": 2.877299011528973, + "learning_rate": 2.6290834962469046e-07, + "loss": 0.9375, + "step": 11641 + }, + { + "epoch": 0.8399408390750694, + "grad_norm": 11.998992877659566, + "learning_rate": 2.6267676938792547e-07, + "loss": 0.8921, + "step": 11642 + }, + { + "epoch": 0.840012986544497, + "grad_norm": 9.151846077218178, + "learning_rate": 2.6244528401995114e-07, + "loss": 0.8606, + "step": 11643 + }, + { + "epoch": 0.8400851340139245, + "grad_norm": 3.0882286354869364, + "learning_rate": 2.622138935334068e-07, + "loss": 0.8984, + "step": 11644 + }, + { + "epoch": 0.840157281483352, + "grad_norm": 5.1252355754022805, + "learning_rate": 2.6198259794092916e-07, + "loss": 0.8814, + "step": 11645 + }, + { + "epoch": 0.8402294289527795, + "grad_norm": 2.6963150592394873, + "learning_rate": 2.6175139725514793e-07, + "loss": 0.8786, + "step": 11646 + }, + { + "epoch": 0.840301576422207, + "grad_norm": 3.3705346979953528, + "learning_rate": 2.615202914886885e-07, + "loss": 1.0139, + "step": 11647 + }, + { + "epoch": 0.8403737238916344, + "grad_norm": 2.905665349091434, + "learning_rate": 2.6128928065416975e-07, + "loss": 0.9376, + "step": 11648 + }, + { + "epoch": 0.840445871361062, + "grad_norm": 2.3334109316138782, + "learning_rate": 2.610583647642066e-07, + "loss": 0.8874, + "step": 11649 + }, + { + "epoch": 0.8405180188304895, + "grad_norm": 4.113026668136585, + "learning_rate": 2.608275438314098e-07, + "loss": 0.9246, + "step": 11650 + }, + { + "epoch": 0.840590166299917, + "grad_norm": 3.3001244608210127, + "learning_rate": 2.605968178683822e-07, + "loss": 0.8873, + "step": 11651 + }, + { + "epoch": 0.8406623137693445, + "grad_norm": 2.554299327979257, + "learning_rate": 2.6036618688772317e-07, + "loss": 0.9008, + "step": 11652 + }, + { + "epoch": 0.840734461238772, + "grad_norm": 2.483741727526269, + "learning_rate": 2.6013565090202714e-07, + "loss": 0.881, + "step": 11653 + }, + { + "epoch": 0.8408066087081996, + "grad_norm": 3.10080261915114, + "learning_rate": 2.599052099238821e-07, + "loss": 0.9273, + "step": 11654 + }, + { + "epoch": 0.8408787561776271, + "grad_norm": 0.6894590901665193, + "learning_rate": 2.5967486396587146e-07, + "loss": 0.7918, + "step": 11655 + }, + { + "epoch": 0.8409509036470546, + "grad_norm": 9.603675901461507, + "learning_rate": 2.594446130405745e-07, + "loss": 1.0096, + "step": 11656 + }, + { + "epoch": 0.8410230511164821, + "grad_norm": 0.687156309570965, + "learning_rate": 2.5921445716056454e-07, + "loss": 0.8095, + "step": 11657 + }, + { + "epoch": 0.8410951985859096, + "grad_norm": 2.329205563060649, + "learning_rate": 2.589843963384082e-07, + "loss": 0.981, + "step": 11658 + }, + { + "epoch": 0.8411673460553372, + "grad_norm": 8.520382845161654, + "learning_rate": 2.5875443058666914e-07, + "loss": 0.9519, + "step": 11659 + }, + { + "epoch": 0.8412394935247646, + "grad_norm": 2.998522235573551, + "learning_rate": 2.5852455991790513e-07, + "loss": 0.8486, + "step": 11660 + }, + { + "epoch": 0.8413116409941921, + "grad_norm": 2.0217609308484343, + "learning_rate": 2.5829478434466767e-07, + "loss": 1.0005, + "step": 11661 + }, + { + "epoch": 0.8413837884636196, + "grad_norm": 2.2256402498240733, + "learning_rate": 2.5806510387950407e-07, + "loss": 0.7586, + "step": 11662 + }, + { + "epoch": 0.8414559359330471, + "grad_norm": 2.6163425277190053, + "learning_rate": 2.5783551853495724e-07, + "loss": 0.9655, + "step": 11663 + }, + { + "epoch": 0.8415280834024746, + "grad_norm": 2.912035034786368, + "learning_rate": 2.576060283235639e-07, + "loss": 0.9249, + "step": 11664 + }, + { + "epoch": 0.8416002308719022, + "grad_norm": 2.6689805978675105, + "learning_rate": 2.573766332578546e-07, + "loss": 0.9344, + "step": 11665 + }, + { + "epoch": 0.8416723783413297, + "grad_norm": 2.4880952630832867, + "learning_rate": 2.5714733335035666e-07, + "loss": 0.8522, + "step": 11666 + }, + { + "epoch": 0.8417445258107572, + "grad_norm": 3.125255116539613, + "learning_rate": 2.569181286135913e-07, + "loss": 0.9237, + "step": 11667 + }, + { + "epoch": 0.8418166732801847, + "grad_norm": 2.4855062919971402, + "learning_rate": 2.566890190600735e-07, + "loss": 0.9905, + "step": 11668 + }, + { + "epoch": 0.8418888207496122, + "grad_norm": 7.622633519890908, + "learning_rate": 2.564600047023151e-07, + "loss": 0.9213, + "step": 11669 + }, + { + "epoch": 0.8419609682190398, + "grad_norm": 3.3114184647270353, + "learning_rate": 2.56231085552822e-07, + "loss": 0.9285, + "step": 11670 + }, + { + "epoch": 0.8420331156884673, + "grad_norm": 1.642631321248623, + "learning_rate": 2.560022616240938e-07, + "loss": 0.9799, + "step": 11671 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 2.4892120297256044, + "learning_rate": 2.557735329286259e-07, + "loss": 0.9128, + "step": 11672 + }, + { + "epoch": 0.8421774106273222, + "grad_norm": 2.2060400973745615, + "learning_rate": 2.55544899478908e-07, + "loss": 0.9531, + "step": 11673 + }, + { + "epoch": 0.8422495580967497, + "grad_norm": 2.0588614997247454, + "learning_rate": 2.553163612874263e-07, + "loss": 0.8182, + "step": 11674 + }, + { + "epoch": 0.8423217055661772, + "grad_norm": 2.2331146740507837, + "learning_rate": 2.5508791836665924e-07, + "loss": 0.9043, + "step": 11675 + }, + { + "epoch": 0.8423938530356048, + "grad_norm": 2.629224284689855, + "learning_rate": 2.5485957072908125e-07, + "loss": 0.962, + "step": 11676 + }, + { + "epoch": 0.8424660005050323, + "grad_norm": 2.0081310687432006, + "learning_rate": 2.5463131838716247e-07, + "loss": 0.8972, + "step": 11677 + }, + { + "epoch": 0.8425381479744598, + "grad_norm": 3.103787256048737, + "learning_rate": 2.544031613533657e-07, + "loss": 0.9525, + "step": 11678 + }, + { + "epoch": 0.8426102954438873, + "grad_norm": 4.175961896452136, + "learning_rate": 2.5417509964014994e-07, + "loss": 0.8643, + "step": 11679 + }, + { + "epoch": 0.8426824429133148, + "grad_norm": 5.834090301808689, + "learning_rate": 2.5394713325996987e-07, + "loss": 0.9412, + "step": 11680 + }, + { + "epoch": 0.8427545903827424, + "grad_norm": 3.906157957899991, + "learning_rate": 2.5371926222527285e-07, + "loss": 1.0321, + "step": 11681 + }, + { + "epoch": 0.8428267378521699, + "grad_norm": 2.0927450700582653, + "learning_rate": 2.534914865485025e-07, + "loss": 0.9132, + "step": 11682 + }, + { + "epoch": 0.8428988853215973, + "grad_norm": 3.7938878746054323, + "learning_rate": 2.5326380624209665e-07, + "loss": 0.9069, + "step": 11683 + }, + { + "epoch": 0.8429710327910248, + "grad_norm": 2.128752032652516, + "learning_rate": 2.530362213184887e-07, + "loss": 0.9622, + "step": 11684 + }, + { + "epoch": 0.8430431802604523, + "grad_norm": 2.5804188089738087, + "learning_rate": 2.528087317901051e-07, + "loss": 0.9455, + "step": 11685 + }, + { + "epoch": 0.8431153277298798, + "grad_norm": 2.783378815041395, + "learning_rate": 2.525813376693682e-07, + "loss": 0.9267, + "step": 11686 + }, + { + "epoch": 0.8431874751993074, + "grad_norm": 2.468002592496577, + "learning_rate": 2.523540389686969e-07, + "loss": 1.0608, + "step": 11687 + }, + { + "epoch": 0.8432596226687349, + "grad_norm": 2.086462290609806, + "learning_rate": 2.521268357005013e-07, + "loss": 0.9265, + "step": 11688 + }, + { + "epoch": 0.8433317701381624, + "grad_norm": 0.7455061750866221, + "learning_rate": 2.5189972787718884e-07, + "loss": 0.8732, + "step": 11689 + }, + { + "epoch": 0.8434039176075899, + "grad_norm": 2.1270912202644654, + "learning_rate": 2.5167271551116107e-07, + "loss": 0.8254, + "step": 11690 + }, + { + "epoch": 0.8434760650770174, + "grad_norm": 4.03305249983566, + "learning_rate": 2.514457986148146e-07, + "loss": 0.9526, + "step": 11691 + }, + { + "epoch": 0.843548212546445, + "grad_norm": 2.11940050069845, + "learning_rate": 2.5121897720053933e-07, + "loss": 0.8443, + "step": 11692 + }, + { + "epoch": 0.8436203600158725, + "grad_norm": 3.3831544487427485, + "learning_rate": 2.5099225128072235e-07, + "loss": 0.7951, + "step": 11693 + }, + { + "epoch": 0.8436925074853, + "grad_norm": 3.127739735295379, + "learning_rate": 2.5076562086774445e-07, + "loss": 0.9541, + "step": 11694 + }, + { + "epoch": 0.8437646549547274, + "grad_norm": 2.383325540312006, + "learning_rate": 2.5053908597398e-07, + "loss": 0.9439, + "step": 11695 + }, + { + "epoch": 0.8438368024241549, + "grad_norm": 2.382954077344989, + "learning_rate": 2.5031264661179976e-07, + "loss": 0.8999, + "step": 11696 + }, + { + "epoch": 0.8439089498935824, + "grad_norm": 2.078300181871131, + "learning_rate": 2.5008630279356914e-07, + "loss": 0.9322, + "step": 11697 + }, + { + "epoch": 0.84398109736301, + "grad_norm": 2.3026761863804754, + "learning_rate": 2.498600545316469e-07, + "loss": 0.8907, + "step": 11698 + }, + { + "epoch": 0.8440532448324375, + "grad_norm": 1.9532469444353535, + "learning_rate": 2.496339018383886e-07, + "loss": 0.9853, + "step": 11699 + }, + { + "epoch": 0.844125392301865, + "grad_norm": 2.375429114676192, + "learning_rate": 2.494078447261434e-07, + "loss": 0.8859, + "step": 11700 + }, + { + "epoch": 0.8441975397712925, + "grad_norm": 3.2613577820003674, + "learning_rate": 2.4918188320725563e-07, + "loss": 0.8732, + "step": 11701 + }, + { + "epoch": 0.84426968724072, + "grad_norm": 2.36936875366901, + "learning_rate": 2.4895601729406345e-07, + "loss": 0.9835, + "step": 11702 + }, + { + "epoch": 0.8443418347101476, + "grad_norm": 2.1848363735639444, + "learning_rate": 2.487302469989008e-07, + "loss": 0.7681, + "step": 11703 + }, + { + "epoch": 0.8444139821795751, + "grad_norm": 2.1583294999764866, + "learning_rate": 2.4850457233409705e-07, + "loss": 0.8594, + "step": 11704 + }, + { + "epoch": 0.8444861296490026, + "grad_norm": 2.190547237566953, + "learning_rate": 2.4827899331197446e-07, + "loss": 0.9748, + "step": 11705 + }, + { + "epoch": 0.8445582771184301, + "grad_norm": 2.9503863518086857, + "learning_rate": 2.480535099448513e-07, + "loss": 0.925, + "step": 11706 + }, + { + "epoch": 0.8446304245878575, + "grad_norm": 2.6941449718819146, + "learning_rate": 2.4782812224504047e-07, + "loss": 0.8847, + "step": 11707 + }, + { + "epoch": 0.844702572057285, + "grad_norm": 2.8153560124660455, + "learning_rate": 2.4760283022485005e-07, + "loss": 0.8848, + "step": 11708 + }, + { + "epoch": 0.8447747195267126, + "grad_norm": 2.4125015495967337, + "learning_rate": 2.473776338965816e-07, + "loss": 0.8991, + "step": 11709 + }, + { + "epoch": 0.8448468669961401, + "grad_norm": 4.615787920081353, + "learning_rate": 2.471525332725319e-07, + "loss": 0.8591, + "step": 11710 + }, + { + "epoch": 0.8449190144655676, + "grad_norm": 2.1053902747652105, + "learning_rate": 2.469275283649945e-07, + "loss": 0.9937, + "step": 11711 + }, + { + "epoch": 0.8449911619349951, + "grad_norm": 6.990727959575108, + "learning_rate": 2.467026191862549e-07, + "loss": 0.8648, + "step": 11712 + }, + { + "epoch": 0.8450633094044226, + "grad_norm": 2.637957337127745, + "learning_rate": 2.4647780574859454e-07, + "loss": 0.8624, + "step": 11713 + }, + { + "epoch": 0.8451354568738502, + "grad_norm": 1.5855553077234794, + "learning_rate": 2.4625308806429033e-07, + "loss": 0.8817, + "step": 11714 + }, + { + "epoch": 0.8452076043432777, + "grad_norm": 2.181761271283604, + "learning_rate": 2.4602846614561246e-07, + "loss": 1.0889, + "step": 11715 + }, + { + "epoch": 0.8452797518127052, + "grad_norm": 3.138858279263695, + "learning_rate": 2.4580394000482686e-07, + "loss": 0.8765, + "step": 11716 + }, + { + "epoch": 0.8453518992821327, + "grad_norm": 2.5098848425904294, + "learning_rate": 2.455795096541944e-07, + "loss": 1.0093, + "step": 11717 + }, + { + "epoch": 0.8454240467515602, + "grad_norm": 3.016844033634952, + "learning_rate": 2.453551751059708e-07, + "loss": 0.8371, + "step": 11718 + }, + { + "epoch": 0.8454961942209877, + "grad_norm": 6.847582714950287, + "learning_rate": 2.451309363724052e-07, + "loss": 0.9623, + "step": 11719 + }, + { + "epoch": 0.8455683416904152, + "grad_norm": 2.654188186182988, + "learning_rate": 2.449067934657429e-07, + "loss": 0.9408, + "step": 11720 + }, + { + "epoch": 0.8456404891598427, + "grad_norm": 2.1709906884076755, + "learning_rate": 2.4468274639822375e-07, + "loss": 0.8376, + "step": 11721 + }, + { + "epoch": 0.8457126366292702, + "grad_norm": 2.498336524664444, + "learning_rate": 2.4445879518208114e-07, + "loss": 0.9539, + "step": 11722 + }, + { + "epoch": 0.8457847840986977, + "grad_norm": 2.5395214428973714, + "learning_rate": 2.4423493982954535e-07, + "loss": 0.9277, + "step": 11723 + }, + { + "epoch": 0.8458569315681252, + "grad_norm": 2.118578689536295, + "learning_rate": 2.4401118035283996e-07, + "loss": 0.9263, + "step": 11724 + }, + { + "epoch": 0.8459290790375528, + "grad_norm": 2.287070074484874, + "learning_rate": 2.4378751676418384e-07, + "loss": 0.9669, + "step": 11725 + }, + { + "epoch": 0.8460012265069803, + "grad_norm": 3.836893556869678, + "learning_rate": 2.4356394907578967e-07, + "loss": 0.8455, + "step": 11726 + }, + { + "epoch": 0.8460733739764078, + "grad_norm": 2.802576572348769, + "learning_rate": 2.4334047729986596e-07, + "loss": 0.9493, + "step": 11727 + }, + { + "epoch": 0.8461455214458353, + "grad_norm": 5.056237196935563, + "learning_rate": 2.4311710144861664e-07, + "loss": 0.9873, + "step": 11728 + }, + { + "epoch": 0.8462176689152628, + "grad_norm": 0.7101538814425444, + "learning_rate": 2.42893821534238e-07, + "loss": 0.8142, + "step": 11729 + }, + { + "epoch": 0.8462898163846903, + "grad_norm": 2.4351274730363754, + "learning_rate": 2.4267063756892336e-07, + "loss": 0.9416, + "step": 11730 + }, + { + "epoch": 0.8463619638541178, + "grad_norm": 2.884535359959882, + "learning_rate": 2.4244754956486013e-07, + "loss": 0.8821, + "step": 11731 + }, + { + "epoch": 0.8464341113235453, + "grad_norm": 2.7838899748699473, + "learning_rate": 2.4222455753422963e-07, + "loss": 0.821, + "step": 11732 + }, + { + "epoch": 0.8465062587929728, + "grad_norm": 2.2056598554858375, + "learning_rate": 2.420016614892091e-07, + "loss": 0.8947, + "step": 11733 + }, + { + "epoch": 0.8465784062624003, + "grad_norm": 5.624231413148597, + "learning_rate": 2.4177886144196933e-07, + "loss": 0.8661, + "step": 11734 + }, + { + "epoch": 0.8466505537318278, + "grad_norm": 2.0095448423661075, + "learning_rate": 2.4155615740467807e-07, + "loss": 1.0095, + "step": 11735 + }, + { + "epoch": 0.8467227012012554, + "grad_norm": 2.279949183148712, + "learning_rate": 2.4133354938949525e-07, + "loss": 0.9032, + "step": 11736 + }, + { + "epoch": 0.8467948486706829, + "grad_norm": 2.766253470010559, + "learning_rate": 2.4111103740857674e-07, + "loss": 1.0254, + "step": 11737 + }, + { + "epoch": 0.8468669961401104, + "grad_norm": 3.8811511778114047, + "learning_rate": 2.4088862147407397e-07, + "loss": 0.9565, + "step": 11738 + }, + { + "epoch": 0.8469391436095379, + "grad_norm": 0.7050241888238886, + "learning_rate": 2.4066630159813096e-07, + "loss": 0.7507, + "step": 11739 + }, + { + "epoch": 0.8470112910789654, + "grad_norm": 2.5410227559229304, + "learning_rate": 2.40444077792888e-07, + "loss": 0.9425, + "step": 11740 + }, + { + "epoch": 0.847083438548393, + "grad_norm": 2.3337237735330985, + "learning_rate": 2.402219500704812e-07, + "loss": 0.8823, + "step": 11741 + }, + { + "epoch": 0.8471555860178204, + "grad_norm": 2.227701210498853, + "learning_rate": 2.399999184430388e-07, + "loss": 0.9357, + "step": 11742 + }, + { + "epoch": 0.8472277334872479, + "grad_norm": 2.2251458195255105, + "learning_rate": 2.3977798292268536e-07, + "loss": 0.9149, + "step": 11743 + }, + { + "epoch": 0.8472998809566754, + "grad_norm": 4.852483738710336, + "learning_rate": 2.395561435215405e-07, + "loss": 0.9968, + "step": 11744 + }, + { + "epoch": 0.8473720284261029, + "grad_norm": 1.439775076360128, + "learning_rate": 2.393344002517179e-07, + "loss": 0.7664, + "step": 11745 + }, + { + "epoch": 0.8474441758955304, + "grad_norm": 0.7155810446753871, + "learning_rate": 2.391127531253252e-07, + "loss": 0.7439, + "step": 11746 + }, + { + "epoch": 0.847516323364958, + "grad_norm": 2.899680804082402, + "learning_rate": 2.388912021544667e-07, + "loss": 0.9455, + "step": 11747 + }, + { + "epoch": 0.8475884708343855, + "grad_norm": 2.1203206095192537, + "learning_rate": 2.386697473512409e-07, + "loss": 0.8732, + "step": 11748 + }, + { + "epoch": 0.847660618303813, + "grad_norm": 2.3649320527330566, + "learning_rate": 2.384483887277393e-07, + "loss": 0.983, + "step": 11749 + }, + { + "epoch": 0.8477327657732405, + "grad_norm": 2.1916756149902232, + "learning_rate": 2.3822712629605047e-07, + "loss": 0.9101, + "step": 11750 + }, + { + "epoch": 0.847804913242668, + "grad_norm": 2.3039619984936666, + "learning_rate": 2.380059600682558e-07, + "loss": 0.9723, + "step": 11751 + }, + { + "epoch": 0.8478770607120956, + "grad_norm": 0.644446904992562, + "learning_rate": 2.3778489005643387e-07, + "loss": 0.7624, + "step": 11752 + }, + { + "epoch": 0.8479492081815231, + "grad_norm": 2.758805741791394, + "learning_rate": 2.3756391627265504e-07, + "loss": 0.8818, + "step": 11753 + }, + { + "epoch": 0.8480213556509505, + "grad_norm": 3.2026968320918257, + "learning_rate": 2.3734303872898653e-07, + "loss": 0.9538, + "step": 11754 + }, + { + "epoch": 0.848093503120378, + "grad_norm": 2.341952537472666, + "learning_rate": 2.3712225743749003e-07, + "loss": 0.9448, + "step": 11755 + }, + { + "epoch": 0.8481656505898055, + "grad_norm": 2.8749230415990596, + "learning_rate": 2.3690157241022057e-07, + "loss": 0.7209, + "step": 11756 + }, + { + "epoch": 0.848237798059233, + "grad_norm": 3.589287549786346, + "learning_rate": 2.3668098365922961e-07, + "loss": 0.8706, + "step": 11757 + }, + { + "epoch": 0.8483099455286606, + "grad_norm": 3.84211137780746, + "learning_rate": 2.3646049119656286e-07, + "loss": 0.8307, + "step": 11758 + }, + { + "epoch": 0.8483820929980881, + "grad_norm": 7.1918035477231275, + "learning_rate": 2.3624009503425934e-07, + "loss": 0.9174, + "step": 11759 + }, + { + "epoch": 0.8484542404675156, + "grad_norm": 3.2753099717075025, + "learning_rate": 2.3601979518435565e-07, + "loss": 0.9279, + "step": 11760 + }, + { + "epoch": 0.8485263879369431, + "grad_norm": 1.7111740993834843, + "learning_rate": 2.3579959165888086e-07, + "loss": 0.9225, + "step": 11761 + }, + { + "epoch": 0.8485985354063706, + "grad_norm": 2.0289037211640717, + "learning_rate": 2.3557948446985977e-07, + "loss": 0.859, + "step": 11762 + }, + { + "epoch": 0.8486706828757982, + "grad_norm": 5.1655420904781115, + "learning_rate": 2.3535947362931118e-07, + "loss": 0.872, + "step": 11763 + }, + { + "epoch": 0.8487428303452257, + "grad_norm": 2.313776746334586, + "learning_rate": 2.3513955914924865e-07, + "loss": 0.9408, + "step": 11764 + }, + { + "epoch": 0.8488149778146532, + "grad_norm": 2.6624958880039036, + "learning_rate": 2.3491974104168255e-07, + "loss": 1.0622, + "step": 11765 + }, + { + "epoch": 0.8488871252840806, + "grad_norm": 2.7656853448351173, + "learning_rate": 2.3470001931861482e-07, + "loss": 0.798, + "step": 11766 + }, + { + "epoch": 0.8489592727535081, + "grad_norm": 2.796731018111108, + "learning_rate": 2.344803939920439e-07, + "loss": 1.0423, + "step": 11767 + }, + { + "epoch": 0.8490314202229357, + "grad_norm": 5.497544347316173, + "learning_rate": 2.3426086507396326e-07, + "loss": 0.9209, + "step": 11768 + }, + { + "epoch": 0.8491035676923632, + "grad_norm": 2.5195436462570533, + "learning_rate": 2.3404143257636045e-07, + "loss": 0.8194, + "step": 11769 + }, + { + "epoch": 0.8491757151617907, + "grad_norm": 2.0263934250936906, + "learning_rate": 2.3382209651121675e-07, + "loss": 0.8813, + "step": 11770 + }, + { + "epoch": 0.8492478626312182, + "grad_norm": 6.7880973876189215, + "learning_rate": 2.3360285689051084e-07, + "loss": 0.8131, + "step": 11771 + }, + { + "epoch": 0.8493200101006457, + "grad_norm": 0.6971693640333075, + "learning_rate": 2.3338371372621424e-07, + "loss": 0.7826, + "step": 11772 + }, + { + "epoch": 0.8493921575700732, + "grad_norm": 2.231795308985234, + "learning_rate": 2.331646670302927e-07, + "loss": 0.9362, + "step": 11773 + }, + { + "epoch": 0.8494643050395008, + "grad_norm": 2.806493746020799, + "learning_rate": 2.3294571681470821e-07, + "loss": 1.0569, + "step": 11774 + }, + { + "epoch": 0.8495364525089283, + "grad_norm": 2.2312669812151977, + "learning_rate": 2.327268630914172e-07, + "loss": 0.7474, + "step": 11775 + }, + { + "epoch": 0.8496085999783558, + "grad_norm": 2.272015313560296, + "learning_rate": 2.3250810587236902e-07, + "loss": 0.9, + "step": 11776 + }, + { + "epoch": 0.8496807474477833, + "grad_norm": 2.1350975232330303, + "learning_rate": 2.3228944516951077e-07, + "loss": 0.9209, + "step": 11777 + }, + { + "epoch": 0.8497528949172107, + "grad_norm": 3.532341442396789, + "learning_rate": 2.3207088099478178e-07, + "loss": 0.8956, + "step": 11778 + }, + { + "epoch": 0.8498250423866383, + "grad_norm": 0.7600633680126778, + "learning_rate": 2.3185241336011785e-07, + "loss": 0.8027, + "step": 11779 + }, + { + "epoch": 0.8498971898560658, + "grad_norm": 2.1819605853107453, + "learning_rate": 2.3163404227744788e-07, + "loss": 0.8615, + "step": 11780 + }, + { + "epoch": 0.8499693373254933, + "grad_norm": 1.718007360755686, + "learning_rate": 2.3141576775869653e-07, + "loss": 0.9457, + "step": 11781 + }, + { + "epoch": 0.8500414847949208, + "grad_norm": 2.1823122908195027, + "learning_rate": 2.311975898157834e-07, + "loss": 0.9824, + "step": 11782 + }, + { + "epoch": 0.8501136322643483, + "grad_norm": 4.555640302557279, + "learning_rate": 2.3097950846062119e-07, + "loss": 0.8756, + "step": 11783 + }, + { + "epoch": 0.8501857797337758, + "grad_norm": 2.179624276286812, + "learning_rate": 2.307615237051197e-07, + "loss": 0.9485, + "step": 11784 + }, + { + "epoch": 0.8502579272032034, + "grad_norm": 2.6673459936157893, + "learning_rate": 2.305436355611825e-07, + "loss": 0.8296, + "step": 11785 + }, + { + "epoch": 0.8503300746726309, + "grad_norm": 2.656833898656683, + "learning_rate": 2.3032584404070653e-07, + "loss": 0.9526, + "step": 11786 + }, + { + "epoch": 0.8504022221420584, + "grad_norm": 2.632163038433367, + "learning_rate": 2.3010814915558497e-07, + "loss": 0.9532, + "step": 11787 + }, + { + "epoch": 0.8504743696114859, + "grad_norm": 2.83941789920886, + "learning_rate": 2.2989055091770515e-07, + "loss": 0.8565, + "step": 11788 + }, + { + "epoch": 0.8505465170809133, + "grad_norm": 2.420981851597186, + "learning_rate": 2.296730493389505e-07, + "loss": 1.0778, + "step": 11789 + }, + { + "epoch": 0.8506186645503409, + "grad_norm": 2.106863943210718, + "learning_rate": 2.294556444311968e-07, + "loss": 0.9922, + "step": 11790 + }, + { + "epoch": 0.8506908120197684, + "grad_norm": 2.140845517125942, + "learning_rate": 2.292383362063157e-07, + "loss": 0.839, + "step": 11791 + }, + { + "epoch": 0.8507629594891959, + "grad_norm": 2.0643655836010124, + "learning_rate": 2.2902112467617462e-07, + "loss": 0.9977, + "step": 11792 + }, + { + "epoch": 0.8508351069586234, + "grad_norm": 2.6447994484974124, + "learning_rate": 2.288040098526336e-07, + "loss": 0.8544, + "step": 11793 + }, + { + "epoch": 0.8509072544280509, + "grad_norm": 1.477662176535993, + "learning_rate": 2.2858699174754824e-07, + "loss": 0.8575, + "step": 11794 + }, + { + "epoch": 0.8509794018974784, + "grad_norm": 0.6439310920957063, + "learning_rate": 2.2837007037277024e-07, + "loss": 0.7966, + "step": 11795 + }, + { + "epoch": 0.851051549366906, + "grad_norm": 4.69814627594499, + "learning_rate": 2.2815324574014473e-07, + "loss": 0.8914, + "step": 11796 + }, + { + "epoch": 0.8511236968363335, + "grad_norm": 2.345405095945456, + "learning_rate": 2.2793651786151092e-07, + "loss": 0.9336, + "step": 11797 + }, + { + "epoch": 0.851195844305761, + "grad_norm": 2.3849775640364066, + "learning_rate": 2.277198867487038e-07, + "loss": 0.8869, + "step": 11798 + }, + { + "epoch": 0.8512679917751885, + "grad_norm": 5.268486897878293, + "learning_rate": 2.2750335241355346e-07, + "loss": 0.927, + "step": 11799 + }, + { + "epoch": 0.851340139244616, + "grad_norm": 3.6791749504114186, + "learning_rate": 2.2728691486788266e-07, + "loss": 0.8018, + "step": 11800 + }, + { + "epoch": 0.8514122867140435, + "grad_norm": 2.217711944601028, + "learning_rate": 2.2707057412351148e-07, + "loss": 0.9692, + "step": 11801 + }, + { + "epoch": 0.851484434183471, + "grad_norm": 3.0717520020736786, + "learning_rate": 2.2685433019225343e-07, + "loss": 0.8426, + "step": 11802 + }, + { + "epoch": 0.8515565816528985, + "grad_norm": 2.5265307290575008, + "learning_rate": 2.2663818308591608e-07, + "loss": 0.9065, + "step": 11803 + }, + { + "epoch": 0.851628729122326, + "grad_norm": 3.3590828702016027, + "learning_rate": 2.2642213281630295e-07, + "loss": 0.9275, + "step": 11804 + }, + { + "epoch": 0.8517008765917535, + "grad_norm": 2.324848916672127, + "learning_rate": 2.2620617939521125e-07, + "loss": 0.9174, + "step": 11805 + }, + { + "epoch": 0.851773024061181, + "grad_norm": 2.3725051826766137, + "learning_rate": 2.2599032283443397e-07, + "loss": 0.9087, + "step": 11806 + }, + { + "epoch": 0.8518451715306086, + "grad_norm": 1.676027648879806, + "learning_rate": 2.2577456314575793e-07, + "loss": 0.9217, + "step": 11807 + }, + { + "epoch": 0.8519173190000361, + "grad_norm": 2.677817089492225, + "learning_rate": 2.2555890034096504e-07, + "loss": 0.8792, + "step": 11808 + }, + { + "epoch": 0.8519894664694636, + "grad_norm": 4.321495956381932, + "learning_rate": 2.253433344318323e-07, + "loss": 1.0053, + "step": 11809 + }, + { + "epoch": 0.8520616139388911, + "grad_norm": 3.38535218788757, + "learning_rate": 2.2512786543013007e-07, + "loss": 0.9335, + "step": 11810 + }, + { + "epoch": 0.8521337614083186, + "grad_norm": 1.9469661807633576, + "learning_rate": 2.2491249334762474e-07, + "loss": 0.9649, + "step": 11811 + }, + { + "epoch": 0.8522059088777462, + "grad_norm": 2.9688495820058267, + "learning_rate": 2.2469721819607668e-07, + "loss": 0.9255, + "step": 11812 + }, + { + "epoch": 0.8522780563471736, + "grad_norm": 1.890996344802366, + "learning_rate": 2.2448203998724225e-07, + "loss": 0.8536, + "step": 11813 + }, + { + "epoch": 0.8523502038166011, + "grad_norm": 2.157851688545698, + "learning_rate": 2.242669587328707e-07, + "loss": 0.9399, + "step": 11814 + }, + { + "epoch": 0.8524223512860286, + "grad_norm": 4.516633551406989, + "learning_rate": 2.2405197444470668e-07, + "loss": 0.9186, + "step": 11815 + }, + { + "epoch": 0.8524944987554561, + "grad_norm": 2.7228696457631982, + "learning_rate": 2.238370871344908e-07, + "loss": 0.9349, + "step": 11816 + }, + { + "epoch": 0.8525666462248837, + "grad_norm": 3.553408348575158, + "learning_rate": 2.2362229681395563e-07, + "loss": 0.837, + "step": 11817 + }, + { + "epoch": 0.8526387936943112, + "grad_norm": 1.8632204697650292, + "learning_rate": 2.234076034948309e-07, + "loss": 0.9422, + "step": 11818 + }, + { + "epoch": 0.8527109411637387, + "grad_norm": 2.3572395209932053, + "learning_rate": 2.231930071888408e-07, + "loss": 0.877, + "step": 11819 + }, + { + "epoch": 0.8527830886331662, + "grad_norm": 2.2695731653226527, + "learning_rate": 2.2297850790770267e-07, + "loss": 0.8592, + "step": 11820 + }, + { + "epoch": 0.8528552361025937, + "grad_norm": 2.5564806883045437, + "learning_rate": 2.2276410566312998e-07, + "loss": 0.8468, + "step": 11821 + }, + { + "epoch": 0.8529273835720212, + "grad_norm": 1.9986588034183548, + "learning_rate": 2.2254980046683025e-07, + "loss": 0.8854, + "step": 11822 + }, + { + "epoch": 0.8529995310414488, + "grad_norm": 21.417514571488493, + "learning_rate": 2.2233559233050658e-07, + "loss": 0.9663, + "step": 11823 + }, + { + "epoch": 0.8530716785108763, + "grad_norm": 5.617311945824631, + "learning_rate": 2.221214812658545e-07, + "loss": 0.9203, + "step": 11824 + }, + { + "epoch": 0.8531438259803037, + "grad_norm": 5.694771279153799, + "learning_rate": 2.2190746728456755e-07, + "loss": 0.8768, + "step": 11825 + }, + { + "epoch": 0.8532159734497312, + "grad_norm": 1.8423967082497348, + "learning_rate": 2.216935503983317e-07, + "loss": 0.8375, + "step": 11826 + }, + { + "epoch": 0.8532881209191587, + "grad_norm": 1.9109409959188752, + "learning_rate": 2.2147973061882763e-07, + "loss": 0.8941, + "step": 11827 + }, + { + "epoch": 0.8533602683885863, + "grad_norm": 2.218658660298048, + "learning_rate": 2.2126600795773175e-07, + "loss": 0.9522, + "step": 11828 + }, + { + "epoch": 0.8534324158580138, + "grad_norm": 2.29461340596424, + "learning_rate": 2.2105238242671476e-07, + "loss": 0.8956, + "step": 11829 + }, + { + "epoch": 0.8535045633274413, + "grad_norm": 2.630518879198935, + "learning_rate": 2.2083885403744106e-07, + "loss": 0.9982, + "step": 11830 + }, + { + "epoch": 0.8535767107968688, + "grad_norm": 2.4891138525273417, + "learning_rate": 2.206254228015716e-07, + "loss": 0.9409, + "step": 11831 + }, + { + "epoch": 0.8536488582662963, + "grad_norm": 3.517866852479278, + "learning_rate": 2.20412088730761e-07, + "loss": 0.8603, + "step": 11832 + }, + { + "epoch": 0.8537210057357238, + "grad_norm": 3.1221873023123092, + "learning_rate": 2.2019885183665866e-07, + "loss": 1.0446, + "step": 11833 + }, + { + "epoch": 0.8537931532051514, + "grad_norm": 2.313013792954224, + "learning_rate": 2.199857121309079e-07, + "loss": 0.9551, + "step": 11834 + }, + { + "epoch": 0.8538653006745789, + "grad_norm": 2.3514666870469902, + "learning_rate": 2.1977266962514828e-07, + "loss": 0.9497, + "step": 11835 + }, + { + "epoch": 0.8539374481440063, + "grad_norm": 3.5742111456117875, + "learning_rate": 2.1955972433101322e-07, + "loss": 0.9988, + "step": 11836 + }, + { + "epoch": 0.8540095956134338, + "grad_norm": 2.8767628448191607, + "learning_rate": 2.193468762601298e-07, + "loss": 0.9462, + "step": 11837 + }, + { + "epoch": 0.8540817430828613, + "grad_norm": 2.1059110102676475, + "learning_rate": 2.1913412542412235e-07, + "loss": 1.001, + "step": 11838 + }, + { + "epoch": 0.8541538905522889, + "grad_norm": 2.2638363848359555, + "learning_rate": 2.1892147183460752e-07, + "loss": 0.8365, + "step": 11839 + }, + { + "epoch": 0.8542260380217164, + "grad_norm": 3.3028673430599738, + "learning_rate": 2.1870891550319846e-07, + "loss": 0.8775, + "step": 11840 + }, + { + "epoch": 0.8542981854911439, + "grad_norm": 2.9919953047975634, + "learning_rate": 2.1849645644150082e-07, + "loss": 0.9061, + "step": 11841 + }, + { + "epoch": 0.8543703329605714, + "grad_norm": 1.5763176870674018, + "learning_rate": 2.182840946611164e-07, + "loss": 1.0051, + "step": 11842 + }, + { + "epoch": 0.8544424804299989, + "grad_norm": 2.2428445418249443, + "learning_rate": 2.1807183017364262e-07, + "loss": 0.991, + "step": 11843 + }, + { + "epoch": 0.8545146278994264, + "grad_norm": 2.2019412059505337, + "learning_rate": 2.178596629906695e-07, + "loss": 0.8577, + "step": 11844 + }, + { + "epoch": 0.854586775368854, + "grad_norm": 5.195108005256416, + "learning_rate": 2.176475931237829e-07, + "loss": 0.8111, + "step": 11845 + }, + { + "epoch": 0.8546589228382815, + "grad_norm": 2.2312321467179537, + "learning_rate": 2.1743562058456334e-07, + "loss": 0.9293, + "step": 11846 + }, + { + "epoch": 0.854731070307709, + "grad_norm": 2.1260087479979286, + "learning_rate": 2.1722374538458553e-07, + "loss": 0.9634, + "step": 11847 + }, + { + "epoch": 0.8548032177771364, + "grad_norm": 2.556922704698747, + "learning_rate": 2.1701196753541895e-07, + "loss": 0.8897, + "step": 11848 + }, + { + "epoch": 0.8548753652465639, + "grad_norm": 3.725700102690498, + "learning_rate": 2.1680028704862897e-07, + "loss": 1.0084, + "step": 11849 + }, + { + "epoch": 0.8549475127159915, + "grad_norm": 2.697134712241, + "learning_rate": 2.1658870393577433e-07, + "loss": 0.9815, + "step": 11850 + }, + { + "epoch": 0.855019660185419, + "grad_norm": 2.1154224879202457, + "learning_rate": 2.1637721820840825e-07, + "loss": 0.8469, + "step": 11851 + }, + { + "epoch": 0.8550918076548465, + "grad_norm": 0.8146870595025795, + "learning_rate": 2.1616582987807953e-07, + "loss": 0.7997, + "step": 11852 + }, + { + "epoch": 0.855163955124274, + "grad_norm": 4.225054741679737, + "learning_rate": 2.1595453895633174e-07, + "loss": 0.8633, + "step": 11853 + }, + { + "epoch": 0.8552361025937015, + "grad_norm": 3.12240248491959, + "learning_rate": 2.157433454547015e-07, + "loss": 1.0015, + "step": 11854 + }, + { + "epoch": 0.855308250063129, + "grad_norm": 1.758600083979042, + "learning_rate": 2.1553224938472247e-07, + "loss": 0.8454, + "step": 11855 + }, + { + "epoch": 0.8553803975325566, + "grad_norm": 1.9419844014045173, + "learning_rate": 2.153212507579214e-07, + "loss": 0.7868, + "step": 11856 + }, + { + "epoch": 0.8554525450019841, + "grad_norm": 2.220225528568005, + "learning_rate": 2.1511034958582042e-07, + "loss": 1.0055, + "step": 11857 + }, + { + "epoch": 0.8555246924714116, + "grad_norm": 0.7789561738722861, + "learning_rate": 2.1489954587993564e-07, + "loss": 0.8273, + "step": 11858 + }, + { + "epoch": 0.8555968399408391, + "grad_norm": 11.45336614045258, + "learning_rate": 2.146888396517783e-07, + "loss": 0.9873, + "step": 11859 + }, + { + "epoch": 0.8556689874102665, + "grad_norm": 2.839140961171161, + "learning_rate": 2.14478230912855e-07, + "loss": 0.8445, + "step": 11860 + }, + { + "epoch": 0.855741134879694, + "grad_norm": 0.8037055607519688, + "learning_rate": 2.1426771967466472e-07, + "loss": 0.8104, + "step": 11861 + }, + { + "epoch": 0.8558132823491216, + "grad_norm": 2.6502788073144767, + "learning_rate": 2.1405730594870431e-07, + "loss": 0.868, + "step": 11862 + }, + { + "epoch": 0.8558854298185491, + "grad_norm": 2.1927378479578774, + "learning_rate": 2.1384698974646344e-07, + "loss": 1.0064, + "step": 11863 + }, + { + "epoch": 0.8559575772879766, + "grad_norm": 2.6724991989491387, + "learning_rate": 2.136367710794258e-07, + "loss": 1.0085, + "step": 11864 + }, + { + "epoch": 0.8560297247574041, + "grad_norm": 2.5116887543164204, + "learning_rate": 2.1342664995907134e-07, + "loss": 0.9353, + "step": 11865 + }, + { + "epoch": 0.8561018722268317, + "grad_norm": 0.8170333352192721, + "learning_rate": 2.1321662639687333e-07, + "loss": 0.8282, + "step": 11866 + }, + { + "epoch": 0.8561740196962592, + "grad_norm": 2.215523428801446, + "learning_rate": 2.1300670040430192e-07, + "loss": 0.9346, + "step": 11867 + }, + { + "epoch": 0.8562461671656867, + "grad_norm": 11.837693248488597, + "learning_rate": 2.127968719928188e-07, + "loss": 1.0057, + "step": 11868 + }, + { + "epoch": 0.8563183146351142, + "grad_norm": 2.5429205589591755, + "learning_rate": 2.1258714117388266e-07, + "loss": 0.8525, + "step": 11869 + }, + { + "epoch": 0.8563904621045417, + "grad_norm": 4.837055132614991, + "learning_rate": 2.1237750795894605e-07, + "loss": 0.8296, + "step": 11870 + }, + { + "epoch": 0.8564626095739692, + "grad_norm": 2.3605606434730015, + "learning_rate": 2.1216797235945606e-07, + "loss": 0.8261, + "step": 11871 + }, + { + "epoch": 0.8565347570433967, + "grad_norm": 3.048295535991163, + "learning_rate": 2.119585343868544e-07, + "loss": 0.9257, + "step": 11872 + }, + { + "epoch": 0.8566069045128242, + "grad_norm": 2.4606482926231075, + "learning_rate": 2.1174919405257863e-07, + "loss": 0.8983, + "step": 11873 + }, + { + "epoch": 0.8566790519822517, + "grad_norm": 2.24848961468199, + "learning_rate": 2.1153995136805936e-07, + "loss": 0.8629, + "step": 11874 + }, + { + "epoch": 0.8567511994516792, + "grad_norm": 2.4540245204965836, + "learning_rate": 2.1133080634472254e-07, + "loss": 0.8746, + "step": 11875 + }, + { + "epoch": 0.8568233469211067, + "grad_norm": 3.4425685621427657, + "learning_rate": 2.1112175899398909e-07, + "loss": 0.9998, + "step": 11876 + }, + { + "epoch": 0.8568954943905343, + "grad_norm": 0.7533863627145108, + "learning_rate": 2.109128093272745e-07, + "loss": 0.835, + "step": 11877 + }, + { + "epoch": 0.8569676418599618, + "grad_norm": 3.479184651280745, + "learning_rate": 2.1070395735598744e-07, + "loss": 0.8665, + "step": 11878 + }, + { + "epoch": 0.8570397893293893, + "grad_norm": 2.3567934375894293, + "learning_rate": 2.1049520309153411e-07, + "loss": 0.837, + "step": 11879 + }, + { + "epoch": 0.8571119367988168, + "grad_norm": 1.8934240507613678, + "learning_rate": 2.1028654654531365e-07, + "loss": 0.9226, + "step": 11880 + }, + { + "epoch": 0.8571840842682443, + "grad_norm": 4.077375905892674, + "learning_rate": 2.100779877287191e-07, + "loss": 0.9121, + "step": 11881 + }, + { + "epoch": 0.8572562317376718, + "grad_norm": 2.532806200682086, + "learning_rate": 2.0986952665313963e-07, + "loss": 0.8701, + "step": 11882 + }, + { + "epoch": 0.8573283792070993, + "grad_norm": 2.6790013449659456, + "learning_rate": 2.0966116332995853e-07, + "loss": 0.8364, + "step": 11883 + }, + { + "epoch": 0.8574005266765268, + "grad_norm": 2.7069070630672742, + "learning_rate": 2.0945289777055385e-07, + "loss": 0.85, + "step": 11884 + }, + { + "epoch": 0.8574726741459543, + "grad_norm": 2.366194009818732, + "learning_rate": 2.0924472998629762e-07, + "loss": 0.8947, + "step": 11885 + }, + { + "epoch": 0.8575448216153818, + "grad_norm": 1.8799246647126893, + "learning_rate": 2.0903665998855757e-07, + "loss": 0.9209, + "step": 11886 + }, + { + "epoch": 0.8576169690848093, + "grad_norm": 2.8403039530506824, + "learning_rate": 2.0882868778869623e-07, + "loss": 1.0034, + "step": 11887 + }, + { + "epoch": 0.8576891165542369, + "grad_norm": 2.2864703711769945, + "learning_rate": 2.0862081339806915e-07, + "loss": 0.9679, + "step": 11888 + }, + { + "epoch": 0.8577612640236644, + "grad_norm": 2.4672534124497867, + "learning_rate": 2.084130368280279e-07, + "loss": 0.9107, + "step": 11889 + }, + { + "epoch": 0.8578334114930919, + "grad_norm": 2.999781918546395, + "learning_rate": 2.082053580899188e-07, + "loss": 0.8541, + "step": 11890 + }, + { + "epoch": 0.8579055589625194, + "grad_norm": 2.7037170291041956, + "learning_rate": 2.079977771950816e-07, + "loss": 0.8373, + "step": 11891 + }, + { + "epoch": 0.8579777064319469, + "grad_norm": 2.6382053281858506, + "learning_rate": 2.0779029415485217e-07, + "loss": 0.9693, + "step": 11892 + }, + { + "epoch": 0.8580498539013744, + "grad_norm": 2.348968494559652, + "learning_rate": 2.075829089805603e-07, + "loss": 0.8378, + "step": 11893 + }, + { + "epoch": 0.858122001370802, + "grad_norm": 0.6721045301618157, + "learning_rate": 2.0737562168353073e-07, + "loss": 0.7959, + "step": 11894 + }, + { + "epoch": 0.8581941488402294, + "grad_norm": 7.394231775086657, + "learning_rate": 2.0716843227508218e-07, + "loss": 0.9327, + "step": 11895 + }, + { + "epoch": 0.8582662963096569, + "grad_norm": 2.754503031248598, + "learning_rate": 2.0696134076652827e-07, + "loss": 0.8076, + "step": 11896 + }, + { + "epoch": 0.8583384437790844, + "grad_norm": 2.9370223935409707, + "learning_rate": 2.0675434716917883e-07, + "loss": 0.8555, + "step": 11897 + }, + { + "epoch": 0.8584105912485119, + "grad_norm": 2.263585297160673, + "learning_rate": 2.065474514943355e-07, + "loss": 0.8804, + "step": 11898 + }, + { + "epoch": 0.8584827387179395, + "grad_norm": 1.9450449931964227, + "learning_rate": 2.0634065375329702e-07, + "loss": 0.8756, + "step": 11899 + }, + { + "epoch": 0.858554886187367, + "grad_norm": 2.5529189214566235, + "learning_rate": 2.0613395395735523e-07, + "loss": 0.9203, + "step": 11900 + }, + { + "epoch": 0.8586270336567945, + "grad_norm": 2.3715579538456333, + "learning_rate": 2.05927352117798e-07, + "loss": 0.9128, + "step": 11901 + }, + { + "epoch": 0.858699181126222, + "grad_norm": 3.1412761259984876, + "learning_rate": 2.0572084824590608e-07, + "loss": 0.9396, + "step": 11902 + }, + { + "epoch": 0.8587713285956495, + "grad_norm": 3.4232147241812867, + "learning_rate": 2.0551444235295668e-07, + "loss": 0.9216, + "step": 11903 + }, + { + "epoch": 0.858843476065077, + "grad_norm": 2.2291135930193184, + "learning_rate": 2.05308134450221e-07, + "loss": 0.9187, + "step": 11904 + }, + { + "epoch": 0.8589156235345046, + "grad_norm": 4.600504781144457, + "learning_rate": 2.0510192454896403e-07, + "loss": 0.9475, + "step": 11905 + }, + { + "epoch": 0.8589877710039321, + "grad_norm": 2.627731536913604, + "learning_rate": 2.0489581266044632e-07, + "loss": 0.9783, + "step": 11906 + }, + { + "epoch": 0.8590599184733595, + "grad_norm": 3.288432806269591, + "learning_rate": 2.046897987959235e-07, + "loss": 0.7692, + "step": 11907 + }, + { + "epoch": 0.859132065942787, + "grad_norm": 2.428576936234712, + "learning_rate": 2.0448388296664442e-07, + "loss": 0.8452, + "step": 11908 + }, + { + "epoch": 0.8592042134122145, + "grad_norm": 3.7761902808398538, + "learning_rate": 2.0427806518385337e-07, + "loss": 0.9173, + "step": 11909 + }, + { + "epoch": 0.859276360881642, + "grad_norm": 2.4122920287625313, + "learning_rate": 2.0407234545879003e-07, + "loss": 0.8047, + "step": 11910 + }, + { + "epoch": 0.8593485083510696, + "grad_norm": 2.12359662557978, + "learning_rate": 2.0386672380268787e-07, + "loss": 0.8798, + "step": 11911 + }, + { + "epoch": 0.8594206558204971, + "grad_norm": 2.8609414734492478, + "learning_rate": 2.0366120022677457e-07, + "loss": 1.0108, + "step": 11912 + }, + { + "epoch": 0.8594928032899246, + "grad_norm": 2.9743501546502995, + "learning_rate": 2.034557747422734e-07, + "loss": 0.8796, + "step": 11913 + }, + { + "epoch": 0.8595649507593521, + "grad_norm": 2.5823690911909605, + "learning_rate": 2.03250447360402e-07, + "loss": 0.986, + "step": 11914 + }, + { + "epoch": 0.8596370982287797, + "grad_norm": 2.4997136905755974, + "learning_rate": 2.0304521809237162e-07, + "loss": 0.8942, + "step": 11915 + }, + { + "epoch": 0.8597092456982072, + "grad_norm": 3.1539701782732523, + "learning_rate": 2.0284008694939026e-07, + "loss": 0.8983, + "step": 11916 + }, + { + "epoch": 0.8597813931676347, + "grad_norm": 3.786133820044359, + "learning_rate": 2.0263505394265912e-07, + "loss": 0.8962, + "step": 11917 + }, + { + "epoch": 0.8598535406370622, + "grad_norm": 2.9329610617257793, + "learning_rate": 2.0243011908337393e-07, + "loss": 0.8852, + "step": 11918 + }, + { + "epoch": 0.8599256881064896, + "grad_norm": 9.744872089640635, + "learning_rate": 2.022252823827253e-07, + "loss": 0.867, + "step": 11919 + }, + { + "epoch": 0.8599978355759171, + "grad_norm": 1.886864066204425, + "learning_rate": 2.0202054385189848e-07, + "loss": 0.9805, + "step": 11920 + }, + { + "epoch": 0.8600699830453447, + "grad_norm": 1.903263817345548, + "learning_rate": 2.0181590350207479e-07, + "loss": 1.0078, + "step": 11921 + }, + { + "epoch": 0.8601421305147722, + "grad_norm": 2.5727040953347315, + "learning_rate": 2.0161136134442748e-07, + "loss": 0.944, + "step": 11922 + }, + { + "epoch": 0.8602142779841997, + "grad_norm": 2.8222167119734163, + "learning_rate": 2.014069173901265e-07, + "loss": 0.8643, + "step": 11923 + }, + { + "epoch": 0.8602864254536272, + "grad_norm": 3.1866989625174766, + "learning_rate": 2.0120257165033584e-07, + "loss": 0.9034, + "step": 11924 + }, + { + "epoch": 0.8603585729230547, + "grad_norm": 2.543610053934335, + "learning_rate": 2.0099832413621343e-07, + "loss": 0.8768, + "step": 11925 + }, + { + "epoch": 0.8604307203924823, + "grad_norm": 2.499559649786453, + "learning_rate": 2.0079417485891236e-07, + "loss": 0.9369, + "step": 11926 + }, + { + "epoch": 0.8605028678619098, + "grad_norm": 3.9883148460953373, + "learning_rate": 2.0059012382958152e-07, + "loss": 0.8454, + "step": 11927 + }, + { + "epoch": 0.8605750153313373, + "grad_norm": 0.770566099256392, + "learning_rate": 2.003861710593633e-07, + "loss": 0.7731, + "step": 11928 + }, + { + "epoch": 0.8606471628007648, + "grad_norm": 2.348853594579865, + "learning_rate": 2.0018231655939365e-07, + "loss": 0.8021, + "step": 11929 + }, + { + "epoch": 0.8607193102701923, + "grad_norm": 1.9336371522906644, + "learning_rate": 1.9997856034080528e-07, + "loss": 0.9352, + "step": 11930 + }, + { + "epoch": 0.8607914577396197, + "grad_norm": 1.9526126647368138, + "learning_rate": 1.997749024147246e-07, + "loss": 1.0043, + "step": 11931 + }, + { + "epoch": 0.8608636052090473, + "grad_norm": 2.087095018037954, + "learning_rate": 1.9957134279227185e-07, + "loss": 0.9408, + "step": 11932 + }, + { + "epoch": 0.8609357526784748, + "grad_norm": 3.31297068580672, + "learning_rate": 1.9936788148456273e-07, + "loss": 0.9198, + "step": 11933 + }, + { + "epoch": 0.8610079001479023, + "grad_norm": 2.3781779259545655, + "learning_rate": 1.9916451850270865e-07, + "loss": 1.0405, + "step": 11934 + }, + { + "epoch": 0.8610800476173298, + "grad_norm": 2.365661128988992, + "learning_rate": 1.9896125385781337e-07, + "loss": 0.9598, + "step": 11935 + }, + { + "epoch": 0.8611521950867573, + "grad_norm": 2.37690447692488, + "learning_rate": 1.987580875609769e-07, + "loss": 0.9413, + "step": 11936 + }, + { + "epoch": 0.8612243425561849, + "grad_norm": 3.685242948145078, + "learning_rate": 1.9855501962329302e-07, + "loss": 0.9736, + "step": 11937 + }, + { + "epoch": 0.8612964900256124, + "grad_norm": 2.157559425711393, + "learning_rate": 1.9835205005585155e-07, + "loss": 0.939, + "step": 11938 + }, + { + "epoch": 0.8613686374950399, + "grad_norm": 2.957036571422309, + "learning_rate": 1.9814917886973403e-07, + "loss": 0.8626, + "step": 11939 + }, + { + "epoch": 0.8614407849644674, + "grad_norm": 1.9438074870056723, + "learning_rate": 1.9794640607602032e-07, + "loss": 0.9879, + "step": 11940 + }, + { + "epoch": 0.8615129324338949, + "grad_norm": 0.7752148407359941, + "learning_rate": 1.9774373168578263e-07, + "loss": 0.8198, + "step": 11941 + }, + { + "epoch": 0.8615850799033223, + "grad_norm": 2.3465907873363157, + "learning_rate": 1.9754115571008767e-07, + "loss": 0.9132, + "step": 11942 + }, + { + "epoch": 0.8616572273727499, + "grad_norm": 2.9141369521215394, + "learning_rate": 1.973386781599977e-07, + "loss": 1.0138, + "step": 11943 + }, + { + "epoch": 0.8617293748421774, + "grad_norm": 4.412099864420914, + "learning_rate": 1.9713629904656902e-07, + "loss": 0.8381, + "step": 11944 + }, + { + "epoch": 0.8618015223116049, + "grad_norm": 2.7743491810050354, + "learning_rate": 1.9693401838085367e-07, + "loss": 1.0818, + "step": 11945 + }, + { + "epoch": 0.8618736697810324, + "grad_norm": 3.56259984997474, + "learning_rate": 1.9673183617389656e-07, + "loss": 0.8864, + "step": 11946 + }, + { + "epoch": 0.8619458172504599, + "grad_norm": 3.705399728028032, + "learning_rate": 1.9652975243673842e-07, + "loss": 0.9019, + "step": 11947 + }, + { + "epoch": 0.8620179647198875, + "grad_norm": 2.3871624373423566, + "learning_rate": 1.9632776718041467e-07, + "loss": 0.9011, + "step": 11948 + }, + { + "epoch": 0.862090112189315, + "grad_norm": 2.580545941779665, + "learning_rate": 1.9612588041595425e-07, + "loss": 0.9537, + "step": 11949 + }, + { + "epoch": 0.8621622596587425, + "grad_norm": 2.002300012820035, + "learning_rate": 1.9592409215438122e-07, + "loss": 0.8024, + "step": 11950 + }, + { + "epoch": 0.86223440712817, + "grad_norm": 6.683295809037166, + "learning_rate": 1.9572240240671588e-07, + "loss": 0.8301, + "step": 11951 + }, + { + "epoch": 0.8623065545975975, + "grad_norm": 6.4678379466292615, + "learning_rate": 1.955208111839708e-07, + "loss": 0.8843, + "step": 11952 + }, + { + "epoch": 0.862378702067025, + "grad_norm": 2.5378675260427666, + "learning_rate": 1.95319318497154e-07, + "loss": 0.8973, + "step": 11953 + }, + { + "epoch": 0.8624508495364525, + "grad_norm": 1.9616235391655987, + "learning_rate": 1.951179243572687e-07, + "loss": 0.9005, + "step": 11954 + }, + { + "epoch": 0.86252299700588, + "grad_norm": 3.2035873451572625, + "learning_rate": 1.9491662877531234e-07, + "loss": 0.9311, + "step": 11955 + }, + { + "epoch": 0.8625951444753075, + "grad_norm": 3.0873352748156613, + "learning_rate": 1.9471543176227635e-07, + "loss": 0.8999, + "step": 11956 + }, + { + "epoch": 0.862667291944735, + "grad_norm": 2.5756275449322295, + "learning_rate": 1.9451433332914724e-07, + "loss": 0.8976, + "step": 11957 + }, + { + "epoch": 0.8627394394141625, + "grad_norm": 2.7139306158812135, + "learning_rate": 1.943133334869076e-07, + "loss": 0.8716, + "step": 11958 + }, + { + "epoch": 0.86281158688359, + "grad_norm": 0.8599509649935271, + "learning_rate": 1.9411243224653195e-07, + "loss": 0.8377, + "step": 11959 + }, + { + "epoch": 0.8628837343530176, + "grad_norm": 2.693646167426778, + "learning_rate": 1.9391162961899132e-07, + "loss": 0.8779, + "step": 11960 + }, + { + "epoch": 0.8629558818224451, + "grad_norm": 2.281608344373652, + "learning_rate": 1.9371092561525093e-07, + "loss": 0.895, + "step": 11961 + }, + { + "epoch": 0.8630280292918726, + "grad_norm": 3.8544960224914417, + "learning_rate": 1.9351032024626979e-07, + "loss": 0.8461, + "step": 11962 + }, + { + "epoch": 0.8631001767613001, + "grad_norm": 2.5079824800016284, + "learning_rate": 1.9330981352300246e-07, + "loss": 0.8994, + "step": 11963 + }, + { + "epoch": 0.8631723242307277, + "grad_norm": 2.2415762144461873, + "learning_rate": 1.9310940545639842e-07, + "loss": 0.9861, + "step": 11964 + }, + { + "epoch": 0.8632444717001552, + "grad_norm": 2.9204795075285226, + "learning_rate": 1.9290909605740112e-07, + "loss": 0.868, + "step": 11965 + }, + { + "epoch": 0.8633166191695826, + "grad_norm": 3.9458854636938376, + "learning_rate": 1.927088853369483e-07, + "loss": 0.95, + "step": 11966 + }, + { + "epoch": 0.8633887666390101, + "grad_norm": 0.7622451137337318, + "learning_rate": 1.925087733059727e-07, + "loss": 0.8566, + "step": 11967 + }, + { + "epoch": 0.8634609141084376, + "grad_norm": 4.687825102658506, + "learning_rate": 1.923087599754023e-07, + "loss": 0.9853, + "step": 11968 + }, + { + "epoch": 0.8635330615778651, + "grad_norm": 2.8220448760716503, + "learning_rate": 1.9210884535615812e-07, + "loss": 1.0251, + "step": 11969 + }, + { + "epoch": 0.8636052090472927, + "grad_norm": 3.402136602566748, + "learning_rate": 1.9190902945915742e-07, + "loss": 0.8385, + "step": 11970 + }, + { + "epoch": 0.8636773565167202, + "grad_norm": 0.721863017457537, + "learning_rate": 1.917093122953115e-07, + "loss": 0.7396, + "step": 11971 + }, + { + "epoch": 0.8637495039861477, + "grad_norm": 4.381593803379371, + "learning_rate": 1.9150969387552629e-07, + "loss": 0.8566, + "step": 11972 + }, + { + "epoch": 0.8638216514555752, + "grad_norm": 4.489519630682355, + "learning_rate": 1.9131017421070128e-07, + "loss": 0.8901, + "step": 11973 + }, + { + "epoch": 0.8638937989250027, + "grad_norm": 2.978956646396661, + "learning_rate": 1.9111075331173197e-07, + "loss": 0.9094, + "step": 11974 + }, + { + "epoch": 0.8639659463944303, + "grad_norm": 4.191565518583961, + "learning_rate": 1.9091143118950902e-07, + "loss": 0.9931, + "step": 11975 + }, + { + "epoch": 0.8640380938638578, + "grad_norm": 2.278413760038412, + "learning_rate": 1.9071220785491525e-07, + "loss": 0.9699, + "step": 11976 + }, + { + "epoch": 0.8641102413332853, + "grad_norm": 2.628243667517985, + "learning_rate": 1.9051308331883e-07, + "loss": 0.9923, + "step": 11977 + }, + { + "epoch": 0.8641823888027127, + "grad_norm": 3.4737754365663025, + "learning_rate": 1.9031405759212716e-07, + "loss": 0.916, + "step": 11978 + }, + { + "epoch": 0.8642545362721402, + "grad_norm": 9.157556434101293, + "learning_rate": 1.901151306856741e-07, + "loss": 0.9144, + "step": 11979 + }, + { + "epoch": 0.8643266837415677, + "grad_norm": 2.3728704441872632, + "learning_rate": 1.8991630261033388e-07, + "loss": 0.9851, + "step": 11980 + }, + { + "epoch": 0.8643988312109953, + "grad_norm": 2.5676398484161123, + "learning_rate": 1.8971757337696314e-07, + "loss": 0.849, + "step": 11981 + }, + { + "epoch": 0.8644709786804228, + "grad_norm": 2.443877948035916, + "learning_rate": 1.89518942996415e-07, + "loss": 0.9581, + "step": 11982 + }, + { + "epoch": 0.8645431261498503, + "grad_norm": 1.8404456447916344, + "learning_rate": 1.8932041147953504e-07, + "loss": 0.8968, + "step": 11983 + }, + { + "epoch": 0.8646152736192778, + "grad_norm": 1.8803633593388063, + "learning_rate": 1.891219788371643e-07, + "loss": 0.8161, + "step": 11984 + }, + { + "epoch": 0.8646874210887053, + "grad_norm": 2.184250625322097, + "learning_rate": 1.8892364508013926e-07, + "loss": 0.9458, + "step": 11985 + }, + { + "epoch": 0.8647595685581329, + "grad_norm": 7.063848856171952, + "learning_rate": 1.8872541021928902e-07, + "loss": 0.9136, + "step": 11986 + }, + { + "epoch": 0.8648317160275604, + "grad_norm": 3.287220201080333, + "learning_rate": 1.8852727426543868e-07, + "loss": 0.903, + "step": 11987 + }, + { + "epoch": 0.8649038634969879, + "grad_norm": 2.8718249161994884, + "learning_rate": 1.8832923722940853e-07, + "loss": 0.9343, + "step": 11988 + }, + { + "epoch": 0.8649760109664153, + "grad_norm": 2.55105304292318, + "learning_rate": 1.8813129912201232e-07, + "loss": 0.8579, + "step": 11989 + }, + { + "epoch": 0.8650481584358428, + "grad_norm": 2.869158574016355, + "learning_rate": 1.879334599540583e-07, + "loss": 0.932, + "step": 11990 + }, + { + "epoch": 0.8651203059052703, + "grad_norm": 2.4320812147910185, + "learning_rate": 1.8773571973635006e-07, + "loss": 0.8225, + "step": 11991 + }, + { + "epoch": 0.8651924533746979, + "grad_norm": 3.7081537667564644, + "learning_rate": 1.8753807847968584e-07, + "loss": 0.9042, + "step": 11992 + }, + { + "epoch": 0.8652646008441254, + "grad_norm": 3.5571405351667327, + "learning_rate": 1.873405361948568e-07, + "loss": 0.8984, + "step": 11993 + }, + { + "epoch": 0.8653367483135529, + "grad_norm": 2.429802404305442, + "learning_rate": 1.871430928926512e-07, + "loss": 0.8595, + "step": 11994 + }, + { + "epoch": 0.8654088957829804, + "grad_norm": 2.7043647348743804, + "learning_rate": 1.869457485838506e-07, + "loss": 0.9338, + "step": 11995 + }, + { + "epoch": 0.8654810432524079, + "grad_norm": 5.341793125855632, + "learning_rate": 1.8674850327923063e-07, + "loss": 0.9241, + "step": 11996 + }, + { + "epoch": 0.8655531907218355, + "grad_norm": 2.850744511583721, + "learning_rate": 1.8655135698956247e-07, + "loss": 0.845, + "step": 11997 + }, + { + "epoch": 0.865625338191263, + "grad_norm": 2.821756854282442, + "learning_rate": 1.863543097256113e-07, + "loss": 0.9424, + "step": 11998 + }, + { + "epoch": 0.8656974856606905, + "grad_norm": 2.918081503531881, + "learning_rate": 1.86157361498138e-07, + "loss": 0.8994, + "step": 11999 + }, + { + "epoch": 0.865769633130118, + "grad_norm": 2.014684176624954, + "learning_rate": 1.8596051231789645e-07, + "loss": 0.8711, + "step": 12000 + }, + { + "epoch": 0.8658417805995454, + "grad_norm": 2.3955964897403934, + "learning_rate": 1.857637621956356e-07, + "loss": 0.9029, + "step": 12001 + }, + { + "epoch": 0.8659139280689729, + "grad_norm": 2.526592254935286, + "learning_rate": 1.8556711114210022e-07, + "loss": 0.8429, + "step": 12002 + }, + { + "epoch": 0.8659860755384005, + "grad_norm": 2.2921038008414425, + "learning_rate": 1.853705591680277e-07, + "loss": 0.9644, + "step": 12003 + }, + { + "epoch": 0.866058223007828, + "grad_norm": 2.4855904155240696, + "learning_rate": 1.8517410628415142e-07, + "loss": 0.9262, + "step": 12004 + }, + { + "epoch": 0.8661303704772555, + "grad_norm": 0.7045571259471552, + "learning_rate": 1.849777525011993e-07, + "loss": 0.7808, + "step": 12005 + }, + { + "epoch": 0.866202517946683, + "grad_norm": 1.7898625080219654, + "learning_rate": 1.847814978298925e-07, + "loss": 1.0122, + "step": 12006 + }, + { + "epoch": 0.8662746654161105, + "grad_norm": 2.356802441014452, + "learning_rate": 1.8458534228094868e-07, + "loss": 0.9148, + "step": 12007 + }, + { + "epoch": 0.866346812885538, + "grad_norm": 2.1293258393493586, + "learning_rate": 1.8438928586507885e-07, + "loss": 0.9422, + "step": 12008 + }, + { + "epoch": 0.8664189603549656, + "grad_norm": 1.7893026931975236, + "learning_rate": 1.841933285929893e-07, + "loss": 0.8591, + "step": 12009 + }, + { + "epoch": 0.8664911078243931, + "grad_norm": 2.4096496148458812, + "learning_rate": 1.8399747047537994e-07, + "loss": 0.8543, + "step": 12010 + }, + { + "epoch": 0.8665632552938206, + "grad_norm": 2.6568994962199675, + "learning_rate": 1.8380171152294555e-07, + "loss": 1.0379, + "step": 12011 + }, + { + "epoch": 0.8666354027632481, + "grad_norm": 2.488993161104339, + "learning_rate": 1.8360605174637733e-07, + "loss": 0.8874, + "step": 12012 + }, + { + "epoch": 0.8667075502326755, + "grad_norm": 2.9284408154752266, + "learning_rate": 1.834104911563581e-07, + "loss": 0.9066, + "step": 12013 + }, + { + "epoch": 0.8667796977021031, + "grad_norm": 5.846023199034737, + "learning_rate": 1.8321502976356705e-07, + "loss": 0.9324, + "step": 12014 + }, + { + "epoch": 0.8668518451715306, + "grad_norm": 3.2944205309160948, + "learning_rate": 1.830196675786777e-07, + "loss": 0.9558, + "step": 12015 + }, + { + "epoch": 0.8669239926409581, + "grad_norm": 3.196848157544399, + "learning_rate": 1.8282440461235858e-07, + "loss": 0.8011, + "step": 12016 + }, + { + "epoch": 0.8669961401103856, + "grad_norm": 2.391049814813348, + "learning_rate": 1.8262924087527097e-07, + "loss": 0.8519, + "step": 12017 + }, + { + "epoch": 0.8670682875798131, + "grad_norm": 2.5252075601003843, + "learning_rate": 1.8243417637807345e-07, + "loss": 0.9015, + "step": 12018 + }, + { + "epoch": 0.8671404350492407, + "grad_norm": 2.3789255927541313, + "learning_rate": 1.8223921113141726e-07, + "loss": 0.9236, + "step": 12019 + }, + { + "epoch": 0.8672125825186682, + "grad_norm": 2.086673449585507, + "learning_rate": 1.8204434514594835e-07, + "loss": 0.8785, + "step": 12020 + }, + { + "epoch": 0.8672847299880957, + "grad_norm": 3.436530097766351, + "learning_rate": 1.8184957843230776e-07, + "loss": 0.9022, + "step": 12021 + }, + { + "epoch": 0.8673568774575232, + "grad_norm": 4.552817744437229, + "learning_rate": 1.8165491100113163e-07, + "loss": 0.9157, + "step": 12022 + }, + { + "epoch": 0.8674290249269507, + "grad_norm": 0.6240719580852901, + "learning_rate": 1.8146034286304879e-07, + "loss": 0.7883, + "step": 12023 + }, + { + "epoch": 0.8675011723963783, + "grad_norm": 2.183762545535948, + "learning_rate": 1.8126587402868498e-07, + "loss": 0.9548, + "step": 12024 + }, + { + "epoch": 0.8675733198658057, + "grad_norm": 2.421221047988022, + "learning_rate": 1.8107150450865906e-07, + "loss": 0.7825, + "step": 12025 + }, + { + "epoch": 0.8676454673352332, + "grad_norm": 3.0642903602608067, + "learning_rate": 1.8087723431358514e-07, + "loss": 0.9363, + "step": 12026 + }, + { + "epoch": 0.8677176148046607, + "grad_norm": 5.724984527966777, + "learning_rate": 1.8068306345407104e-07, + "loss": 0.8799, + "step": 12027 + }, + { + "epoch": 0.8677897622740882, + "grad_norm": 2.630623108039986, + "learning_rate": 1.8048899194071997e-07, + "loss": 0.902, + "step": 12028 + }, + { + "epoch": 0.8678619097435157, + "grad_norm": 2.010484513108099, + "learning_rate": 1.8029501978412974e-07, + "loss": 0.9329, + "step": 12029 + }, + { + "epoch": 0.8679340572129433, + "grad_norm": 2.213570795802516, + "learning_rate": 1.8010114699489142e-07, + "loss": 0.8599, + "step": 12030 + }, + { + "epoch": 0.8680062046823708, + "grad_norm": 2.6341020725548385, + "learning_rate": 1.7990737358359276e-07, + "loss": 0.946, + "step": 12031 + }, + { + "epoch": 0.8680783521517983, + "grad_norm": 2.861265464330957, + "learning_rate": 1.7971369956081483e-07, + "loss": 0.9398, + "step": 12032 + }, + { + "epoch": 0.8681504996212258, + "grad_norm": 5.649881392052642, + "learning_rate": 1.7952012493713365e-07, + "loss": 0.944, + "step": 12033 + }, + { + "epoch": 0.8682226470906533, + "grad_norm": 2.63411999392492, + "learning_rate": 1.7932664972311872e-07, + "loss": 0.9645, + "step": 12034 + }, + { + "epoch": 0.8682947945600809, + "grad_norm": 2.3177358789985893, + "learning_rate": 1.791332739293354e-07, + "loss": 0.9152, + "step": 12035 + }, + { + "epoch": 0.8683669420295083, + "grad_norm": 2.4056155867081475, + "learning_rate": 1.789399975663439e-07, + "loss": 0.9163, + "step": 12036 + }, + { + "epoch": 0.8684390894989358, + "grad_norm": 2.521031605723273, + "learning_rate": 1.787468206446976e-07, + "loss": 0.939, + "step": 12037 + }, + { + "epoch": 0.8685112369683633, + "grad_norm": 3.1380683779247467, + "learning_rate": 1.785537431749453e-07, + "loss": 0.8804, + "step": 12038 + }, + { + "epoch": 0.8685833844377908, + "grad_norm": 3.191693856809914, + "learning_rate": 1.7836076516763067e-07, + "loss": 0.8745, + "step": 12039 + }, + { + "epoch": 0.8686555319072183, + "grad_norm": 2.2335870593925784, + "learning_rate": 1.7816788663329074e-07, + "loss": 0.8341, + "step": 12040 + }, + { + "epoch": 0.8687276793766459, + "grad_norm": 4.124481630988946, + "learning_rate": 1.7797510758245805e-07, + "loss": 0.9674, + "step": 12041 + }, + { + "epoch": 0.8687998268460734, + "grad_norm": 2.069540549257727, + "learning_rate": 1.7778242802566014e-07, + "loss": 0.964, + "step": 12042 + }, + { + "epoch": 0.8688719743155009, + "grad_norm": 2.742616451437354, + "learning_rate": 1.775898479734188e-07, + "loss": 0.8355, + "step": 12043 + }, + { + "epoch": 0.8689441217849284, + "grad_norm": 3.0279419141915147, + "learning_rate": 1.77397367436249e-07, + "loss": 0.8594, + "step": 12044 + }, + { + "epoch": 0.8690162692543559, + "grad_norm": 3.7316573082035736, + "learning_rate": 1.7720498642466185e-07, + "loss": 0.8462, + "step": 12045 + }, + { + "epoch": 0.8690884167237835, + "grad_norm": 3.4969372291243714, + "learning_rate": 1.7701270494916298e-07, + "loss": 0.9819, + "step": 12046 + }, + { + "epoch": 0.869160564193211, + "grad_norm": 2.256252079590238, + "learning_rate": 1.768205230202513e-07, + "loss": 0.91, + "step": 12047 + }, + { + "epoch": 0.8692327116626384, + "grad_norm": 4.120344598302992, + "learning_rate": 1.7662844064842198e-07, + "loss": 0.9268, + "step": 12048 + }, + { + "epoch": 0.8693048591320659, + "grad_norm": 2.953647143509911, + "learning_rate": 1.7643645784416394e-07, + "loss": 0.8974, + "step": 12049 + }, + { + "epoch": 0.8693770066014934, + "grad_norm": 4.372764670299147, + "learning_rate": 1.7624457461795994e-07, + "loss": 0.8516, + "step": 12050 + }, + { + "epoch": 0.8694491540709209, + "grad_norm": 2.022241305108677, + "learning_rate": 1.7605279098028846e-07, + "loss": 0.913, + "step": 12051 + }, + { + "epoch": 0.8695213015403485, + "grad_norm": 2.3667678712867417, + "learning_rate": 1.7586110694162205e-07, + "loss": 0.9397, + "step": 12052 + }, + { + "epoch": 0.869593449009776, + "grad_norm": 2.866130082973705, + "learning_rate": 1.7566952251242806e-07, + "loss": 0.9595, + "step": 12053 + }, + { + "epoch": 0.8696655964792035, + "grad_norm": 2.230946077222643, + "learning_rate": 1.754780377031675e-07, + "loss": 0.8564, + "step": 12054 + }, + { + "epoch": 0.869737743948631, + "grad_norm": 0.8162769781097057, + "learning_rate": 1.752866525242973e-07, + "loss": 0.8532, + "step": 12055 + }, + { + "epoch": 0.8698098914180585, + "grad_norm": 1.8062286402089465, + "learning_rate": 1.750953669862687e-07, + "loss": 1.0298, + "step": 12056 + }, + { + "epoch": 0.869882038887486, + "grad_norm": 2.224207653450762, + "learning_rate": 1.749041810995262e-07, + "loss": 0.9056, + "step": 12057 + }, + { + "epoch": 0.8699541863569136, + "grad_norm": 2.2532388899082845, + "learning_rate": 1.747130948745099e-07, + "loss": 0.7799, + "step": 12058 + }, + { + "epoch": 0.8700263338263411, + "grad_norm": 2.362363123082282, + "learning_rate": 1.7452210832165438e-07, + "loss": 0.8946, + "step": 12059 + }, + { + "epoch": 0.8700984812957685, + "grad_norm": 2.310549970610247, + "learning_rate": 1.7433122145138945e-07, + "loss": 0.8913, + "step": 12060 + }, + { + "epoch": 0.870170628765196, + "grad_norm": 3.4333904323335913, + "learning_rate": 1.7414043427413772e-07, + "loss": 0.9056, + "step": 12061 + }, + { + "epoch": 0.8702427762346235, + "grad_norm": 1.9562737679789588, + "learning_rate": 1.739497468003177e-07, + "loss": 0.8612, + "step": 12062 + }, + { + "epoch": 0.8703149237040511, + "grad_norm": 3.342369008318584, + "learning_rate": 1.7375915904034265e-07, + "loss": 0.932, + "step": 12063 + }, + { + "epoch": 0.8703870711734786, + "grad_norm": 3.6851431863572244, + "learning_rate": 1.7356867100461914e-07, + "loss": 0.8964, + "step": 12064 + }, + { + "epoch": 0.8704592186429061, + "grad_norm": 10.55463039515455, + "learning_rate": 1.7337828270354882e-07, + "loss": 0.8795, + "step": 12065 + }, + { + "epoch": 0.8705313661123336, + "grad_norm": 2.3303299261799504, + "learning_rate": 1.7318799414752938e-07, + "loss": 0.8822, + "step": 12066 + }, + { + "epoch": 0.8706035135817611, + "grad_norm": 2.0525951102333737, + "learning_rate": 1.7299780534695052e-07, + "loss": 0.8157, + "step": 12067 + }, + { + "epoch": 0.8706756610511887, + "grad_norm": 2.221098932507749, + "learning_rate": 1.7280771631219816e-07, + "loss": 0.9473, + "step": 12068 + }, + { + "epoch": 0.8707478085206162, + "grad_norm": 3.036884851528031, + "learning_rate": 1.726177270536524e-07, + "loss": 0.9491, + "step": 12069 + }, + { + "epoch": 0.8708199559900437, + "grad_norm": 1.9682913957543136, + "learning_rate": 1.7242783758168834e-07, + "loss": 0.9626, + "step": 12070 + }, + { + "epoch": 0.8708921034594712, + "grad_norm": 2.2049560513084847, + "learning_rate": 1.7223804790667362e-07, + "loss": 1.0022, + "step": 12071 + }, + { + "epoch": 0.8709642509288986, + "grad_norm": 3.681234565015989, + "learning_rate": 1.7204835803897356e-07, + "loss": 0.8982, + "step": 12072 + }, + { + "epoch": 0.8710363983983261, + "grad_norm": 3.5852708383828, + "learning_rate": 1.7185876798894604e-07, + "loss": 0.8419, + "step": 12073 + }, + { + "epoch": 0.8711085458677537, + "grad_norm": 2.4846205679968203, + "learning_rate": 1.7166927776694328e-07, + "loss": 0.8549, + "step": 12074 + }, + { + "epoch": 0.8711806933371812, + "grad_norm": 2.6826026119007014, + "learning_rate": 1.714798873833132e-07, + "loss": 0.8699, + "step": 12075 + }, + { + "epoch": 0.8712528408066087, + "grad_norm": 2.2961175409196173, + "learning_rate": 1.7129059684839732e-07, + "loss": 1.018, + "step": 12076 + }, + { + "epoch": 0.8713249882760362, + "grad_norm": 2.264301307988978, + "learning_rate": 1.711014061725329e-07, + "loss": 1.0703, + "step": 12077 + }, + { + "epoch": 0.8713971357454637, + "grad_norm": 1.7591886431852644, + "learning_rate": 1.7091231536604923e-07, + "loss": 0.9532, + "step": 12078 + }, + { + "epoch": 0.8714692832148913, + "grad_norm": 2.205547759167493, + "learning_rate": 1.707233244392736e-07, + "loss": 0.867, + "step": 12079 + }, + { + "epoch": 0.8715414306843188, + "grad_norm": 0.8651222518019617, + "learning_rate": 1.7053443340252583e-07, + "loss": 0.8187, + "step": 12080 + }, + { + "epoch": 0.8716135781537463, + "grad_norm": 2.694265145522512, + "learning_rate": 1.703456422661198e-07, + "loss": 0.8348, + "step": 12081 + }, + { + "epoch": 0.8716857256231738, + "grad_norm": 2.4131641876982908, + "learning_rate": 1.701569510403651e-07, + "loss": 0.955, + "step": 12082 + }, + { + "epoch": 0.8717578730926013, + "grad_norm": 2.885729133589341, + "learning_rate": 1.6996835973556588e-07, + "loss": 0.9427, + "step": 12083 + }, + { + "epoch": 0.8718300205620287, + "grad_norm": 2.6317650949997016, + "learning_rate": 1.6977986836201908e-07, + "loss": 0.9106, + "step": 12084 + }, + { + "epoch": 0.8719021680314563, + "grad_norm": 2.485228769682721, + "learning_rate": 1.6959147693001886e-07, + "loss": 0.8394, + "step": 12085 + }, + { + "epoch": 0.8719743155008838, + "grad_norm": 2.535612326861373, + "learning_rate": 1.6940318544985232e-07, + "loss": 0.8964, + "step": 12086 + }, + { + "epoch": 0.8720464629703113, + "grad_norm": 2.0829582512490576, + "learning_rate": 1.6921499393180128e-07, + "loss": 0.9607, + "step": 12087 + }, + { + "epoch": 0.8721186104397388, + "grad_norm": 2.091044143604852, + "learning_rate": 1.6902690238614192e-07, + "loss": 0.8961, + "step": 12088 + }, + { + "epoch": 0.8721907579091663, + "grad_norm": 3.012227254210057, + "learning_rate": 1.6883891082314494e-07, + "loss": 0.8494, + "step": 12089 + }, + { + "epoch": 0.8722629053785939, + "grad_norm": 2.7995696486407216, + "learning_rate": 1.6865101925307702e-07, + "loss": 1.0063, + "step": 12090 + }, + { + "epoch": 0.8723350528480214, + "grad_norm": 4.134626311422523, + "learning_rate": 1.6846322768619724e-07, + "loss": 0.9009, + "step": 12091 + }, + { + "epoch": 0.8724072003174489, + "grad_norm": 2.550634406216623, + "learning_rate": 1.6827553613276035e-07, + "loss": 0.9922, + "step": 12092 + }, + { + "epoch": 0.8724793477868764, + "grad_norm": 33.697365641424774, + "learning_rate": 1.6808794460301612e-07, + "loss": 0.8429, + "step": 12093 + }, + { + "epoch": 0.8725514952563039, + "grad_norm": 1.981336234349009, + "learning_rate": 1.6790045310720747e-07, + "loss": 0.8533, + "step": 12094 + }, + { + "epoch": 0.8726236427257313, + "grad_norm": 2.1774675556485565, + "learning_rate": 1.6771306165557218e-07, + "loss": 0.9454, + "step": 12095 + }, + { + "epoch": 0.8726957901951589, + "grad_norm": 3.6780045663827052, + "learning_rate": 1.6752577025834436e-07, + "loss": 0.899, + "step": 12096 + }, + { + "epoch": 0.8727679376645864, + "grad_norm": 2.8091280751240144, + "learning_rate": 1.6733857892575086e-07, + "loss": 0.9135, + "step": 12097 + }, + { + "epoch": 0.8728400851340139, + "grad_norm": 2.2922997611651583, + "learning_rate": 1.671514876680129e-07, + "loss": 0.9093, + "step": 12098 + }, + { + "epoch": 0.8729122326034414, + "grad_norm": 2.4065549582905748, + "learning_rate": 1.669644964953474e-07, + "loss": 0.9392, + "step": 12099 + }, + { + "epoch": 0.8729843800728689, + "grad_norm": 8.790059839246776, + "learning_rate": 1.6677760541796527e-07, + "loss": 0.8173, + "step": 12100 + }, + { + "epoch": 0.8730565275422965, + "grad_norm": 4.774515632708468, + "learning_rate": 1.6659081444607127e-07, + "loss": 0.9998, + "step": 12101 + }, + { + "epoch": 0.873128675011724, + "grad_norm": 2.2626414681565508, + "learning_rate": 1.6640412358986565e-07, + "loss": 0.8629, + "step": 12102 + }, + { + "epoch": 0.8732008224811515, + "grad_norm": 2.551338450399862, + "learning_rate": 1.6621753285954366e-07, + "loss": 0.9384, + "step": 12103 + }, + { + "epoch": 0.873272969950579, + "grad_norm": 10.106479613385725, + "learning_rate": 1.6603104226529374e-07, + "loss": 0.9394, + "step": 12104 + }, + { + "epoch": 0.8733451174200065, + "grad_norm": 3.181603007038586, + "learning_rate": 1.6584465181729933e-07, + "loss": 0.9116, + "step": 12105 + }, + { + "epoch": 0.873417264889434, + "grad_norm": 2.375217227036372, + "learning_rate": 1.6565836152573876e-07, + "loss": 0.9006, + "step": 12106 + }, + { + "epoch": 0.8734894123588615, + "grad_norm": 2.4876103477347455, + "learning_rate": 1.6547217140078473e-07, + "loss": 0.9076, + "step": 12107 + }, + { + "epoch": 0.873561559828289, + "grad_norm": 3.2828564299122958, + "learning_rate": 1.6528608145260358e-07, + "loss": 0.937, + "step": 12108 + }, + { + "epoch": 0.8736337072977165, + "grad_norm": 2.8231387361872575, + "learning_rate": 1.651000916913581e-07, + "loss": 0.9353, + "step": 12109 + }, + { + "epoch": 0.873705854767144, + "grad_norm": 2.122095591521372, + "learning_rate": 1.649142021272043e-07, + "loss": 0.8304, + "step": 12110 + }, + { + "epoch": 0.8737780022365715, + "grad_norm": 2.932145128038495, + "learning_rate": 1.6472841277029237e-07, + "loss": 0.9353, + "step": 12111 + }, + { + "epoch": 0.8738501497059991, + "grad_norm": 4.138318610675345, + "learning_rate": 1.6454272363076772e-07, + "loss": 0.8996, + "step": 12112 + }, + { + "epoch": 0.8739222971754266, + "grad_norm": 2.3718784297068933, + "learning_rate": 1.6435713471877022e-07, + "loss": 0.9429, + "step": 12113 + }, + { + "epoch": 0.8739944446448541, + "grad_norm": 2.5059380582139745, + "learning_rate": 1.6417164604443467e-07, + "loss": 0.8718, + "step": 12114 + }, + { + "epoch": 0.8740665921142816, + "grad_norm": 2.0816226293437756, + "learning_rate": 1.6398625761788941e-07, + "loss": 0.8163, + "step": 12115 + }, + { + "epoch": 0.8741387395837091, + "grad_norm": 1.975735336741642, + "learning_rate": 1.6380096944925793e-07, + "loss": 0.8853, + "step": 12116 + }, + { + "epoch": 0.8742108870531367, + "grad_norm": 2.3993685527455546, + "learning_rate": 1.6361578154865829e-07, + "loss": 0.8495, + "step": 12117 + }, + { + "epoch": 0.8742830345225642, + "grad_norm": 2.2959959462957453, + "learning_rate": 1.6343069392620268e-07, + "loss": 0.8702, + "step": 12118 + }, + { + "epoch": 0.8743551819919916, + "grad_norm": 2.5779731474573646, + "learning_rate": 1.6324570659199765e-07, + "loss": 0.9762, + "step": 12119 + }, + { + "epoch": 0.8744273294614191, + "grad_norm": 1.7573322911777987, + "learning_rate": 1.6306081955614538e-07, + "loss": 0.9525, + "step": 12120 + }, + { + "epoch": 0.8744994769308466, + "grad_norm": 2.393895623381636, + "learning_rate": 1.628760328287422e-07, + "loss": 0.8324, + "step": 12121 + }, + { + "epoch": 0.8745716244002741, + "grad_norm": 2.410003706743231, + "learning_rate": 1.626913464198776e-07, + "loss": 0.9645, + "step": 12122 + }, + { + "epoch": 0.8746437718697017, + "grad_norm": 0.7213763475737662, + "learning_rate": 1.6250676033963707e-07, + "loss": 0.8332, + "step": 12123 + }, + { + "epoch": 0.8747159193391292, + "grad_norm": 1.031029706602832, + "learning_rate": 1.6232227459810056e-07, + "loss": 0.8281, + "step": 12124 + }, + { + "epoch": 0.8747880668085567, + "grad_norm": 2.4510054452027092, + "learning_rate": 1.621378892053409e-07, + "loss": 0.8698, + "step": 12125 + }, + { + "epoch": 0.8748602142779842, + "grad_norm": 5.429466188160573, + "learning_rate": 1.6195360417142823e-07, + "loss": 0.8955, + "step": 12126 + }, + { + "epoch": 0.8749323617474117, + "grad_norm": 3.0191598049348722, + "learning_rate": 1.617694195064252e-07, + "loss": 0.9395, + "step": 12127 + }, + { + "epoch": 0.8750045092168393, + "grad_norm": 2.722459301604941, + "learning_rate": 1.6158533522038908e-07, + "loss": 0.8491, + "step": 12128 + }, + { + "epoch": 0.8750766566862668, + "grad_norm": 2.0558350118200623, + "learning_rate": 1.6140135132337207e-07, + "loss": 0.8469, + "step": 12129 + }, + { + "epoch": 0.8751488041556943, + "grad_norm": 2.2929556487194924, + "learning_rate": 1.6121746782542101e-07, + "loss": 0.8919, + "step": 12130 + }, + { + "epoch": 0.8752209516251217, + "grad_norm": 3.8478464766983294, + "learning_rate": 1.6103368473657742e-07, + "loss": 0.9143, + "step": 12131 + }, + { + "epoch": 0.8752930990945492, + "grad_norm": 1.7076666236553277, + "learning_rate": 1.6085000206687616e-07, + "loss": 0.8873, + "step": 12132 + }, + { + "epoch": 0.8753652465639767, + "grad_norm": 4.244300386359726, + "learning_rate": 1.6066641982634833e-07, + "loss": 0.9283, + "step": 12133 + }, + { + "epoch": 0.8754373940334043, + "grad_norm": 2.390249684112244, + "learning_rate": 1.6048293802501856e-07, + "loss": 0.8408, + "step": 12134 + }, + { + "epoch": 0.8755095415028318, + "grad_norm": 4.252153860600958, + "learning_rate": 1.602995566729055e-07, + "loss": 0.8898, + "step": 12135 + }, + { + "epoch": 0.8755816889722593, + "grad_norm": 3.029116322039127, + "learning_rate": 1.601162757800234e-07, + "loss": 0.8503, + "step": 12136 + }, + { + "epoch": 0.8756538364416868, + "grad_norm": 3.204203056720771, + "learning_rate": 1.5993309535638068e-07, + "loss": 0.8516, + "step": 12137 + }, + { + "epoch": 0.8757259839111143, + "grad_norm": 2.9350252481183676, + "learning_rate": 1.597500154119793e-07, + "loss": 0.8462, + "step": 12138 + }, + { + "epoch": 0.8757981313805419, + "grad_norm": 2.335894325903535, + "learning_rate": 1.5956703595681774e-07, + "loss": 0.9429, + "step": 12139 + }, + { + "epoch": 0.8758702788499694, + "grad_norm": 2.5905084231736146, + "learning_rate": 1.5938415700088714e-07, + "loss": 0.896, + "step": 12140 + }, + { + "epoch": 0.8759424263193969, + "grad_norm": 3.3032576982735713, + "learning_rate": 1.5920137855417437e-07, + "loss": 0.9067, + "step": 12141 + }, + { + "epoch": 0.8760145737888243, + "grad_norm": 2.3983152277591993, + "learning_rate": 1.5901870062665946e-07, + "loss": 0.8363, + "step": 12142 + }, + { + "epoch": 0.8760867212582518, + "grad_norm": 0.7010008196859968, + "learning_rate": 1.5883612322831797e-07, + "loss": 0.7529, + "step": 12143 + }, + { + "epoch": 0.8761588687276793, + "grad_norm": 2.9070773023911207, + "learning_rate": 1.586536463691208e-07, + "loss": 0.882, + "step": 12144 + }, + { + "epoch": 0.8762310161971069, + "grad_norm": 2.9761184000642804, + "learning_rate": 1.5847127005903117e-07, + "loss": 0.8947, + "step": 12145 + }, + { + "epoch": 0.8763031636665344, + "grad_norm": 2.362088897298371, + "learning_rate": 1.5828899430800834e-07, + "loss": 0.9489, + "step": 12146 + }, + { + "epoch": 0.8763753111359619, + "grad_norm": 3.5641918515116893, + "learning_rate": 1.5810681912600577e-07, + "loss": 0.9303, + "step": 12147 + }, + { + "epoch": 0.8764474586053894, + "grad_norm": 2.6523255922035864, + "learning_rate": 1.5792474452297167e-07, + "loss": 0.9714, + "step": 12148 + }, + { + "epoch": 0.8765196060748169, + "grad_norm": 2.1495768906975616, + "learning_rate": 1.5774277050884744e-07, + "loss": 0.8346, + "step": 12149 + }, + { + "epoch": 0.8765917535442445, + "grad_norm": 2.008492440006006, + "learning_rate": 1.575608970935711e-07, + "loss": 0.9793, + "step": 12150 + }, + { + "epoch": 0.876663901013672, + "grad_norm": 2.5366363671791143, + "learning_rate": 1.5737912428707413e-07, + "loss": 0.8893, + "step": 12151 + }, + { + "epoch": 0.8767360484830995, + "grad_norm": 3.8302545346373345, + "learning_rate": 1.5719745209928158e-07, + "loss": 0.8439, + "step": 12152 + }, + { + "epoch": 0.876808195952527, + "grad_norm": 9.95066565965878, + "learning_rate": 1.570158805401143e-07, + "loss": 0.841, + "step": 12153 + }, + { + "epoch": 0.8768803434219544, + "grad_norm": 2.430097343002141, + "learning_rate": 1.568344096194878e-07, + "loss": 0.8039, + "step": 12154 + }, + { + "epoch": 0.8769524908913819, + "grad_norm": 3.057109838187719, + "learning_rate": 1.5665303934731046e-07, + "loss": 0.967, + "step": 12155 + }, + { + "epoch": 0.8770246383608095, + "grad_norm": 2.675728055327767, + "learning_rate": 1.5647176973348676e-07, + "loss": 0.9179, + "step": 12156 + }, + { + "epoch": 0.877096785830237, + "grad_norm": 2.8231871265384867, + "learning_rate": 1.5629060078791522e-07, + "loss": 0.841, + "step": 12157 + }, + { + "epoch": 0.8771689332996645, + "grad_norm": 0.6808793590806005, + "learning_rate": 1.5610953252048931e-07, + "loss": 0.7395, + "step": 12158 + }, + { + "epoch": 0.877241080769092, + "grad_norm": 3.2194813017938304, + "learning_rate": 1.5592856494109574e-07, + "loss": 0.793, + "step": 12159 + }, + { + "epoch": 0.8773132282385195, + "grad_norm": 2.59283569031315, + "learning_rate": 1.5574769805961642e-07, + "loss": 0.9199, + "step": 12160 + }, + { + "epoch": 0.8773853757079471, + "grad_norm": 3.26471708535363, + "learning_rate": 1.5556693188592872e-07, + "loss": 0.8674, + "step": 12161 + }, + { + "epoch": 0.8774575231773746, + "grad_norm": 2.7774345715745388, + "learning_rate": 1.5538626642990215e-07, + "loss": 0.9573, + "step": 12162 + }, + { + "epoch": 0.8775296706468021, + "grad_norm": 5.842024767788234, + "learning_rate": 1.5520570170140368e-07, + "loss": 1.0058, + "step": 12163 + }, + { + "epoch": 0.8776018181162296, + "grad_norm": 3.4627387394866935, + "learning_rate": 1.550252377102923e-07, + "loss": 0.8398, + "step": 12164 + }, + { + "epoch": 0.8776739655856571, + "grad_norm": 2.6105046368323768, + "learning_rate": 1.5484487446642346e-07, + "loss": 0.8985, + "step": 12165 + }, + { + "epoch": 0.8777461130550845, + "grad_norm": 1.895658743508013, + "learning_rate": 1.5466461197964486e-07, + "loss": 0.9264, + "step": 12166 + }, + { + "epoch": 0.8778182605245121, + "grad_norm": 2.117352243467936, + "learning_rate": 1.544844502598004e-07, + "loss": 0.966, + "step": 12167 + }, + { + "epoch": 0.8778904079939396, + "grad_norm": 2.964633697989827, + "learning_rate": 1.5430438931672906e-07, + "loss": 0.9513, + "step": 12168 + }, + { + "epoch": 0.8779625554633671, + "grad_norm": 2.3922665015492766, + "learning_rate": 1.5412442916026192e-07, + "loss": 0.9007, + "step": 12169 + }, + { + "epoch": 0.8780347029327946, + "grad_norm": 2.5976367003199976, + "learning_rate": 1.5394456980022664e-07, + "loss": 0.9725, + "step": 12170 + }, + { + "epoch": 0.8781068504022221, + "grad_norm": 2.032039899561981, + "learning_rate": 1.537648112464449e-07, + "loss": 0.8651, + "step": 12171 + }, + { + "epoch": 0.8781789978716497, + "grad_norm": 5.0764211760035, + "learning_rate": 1.535851535087318e-07, + "loss": 0.9005, + "step": 12172 + }, + { + "epoch": 0.8782511453410772, + "grad_norm": 3.2065836771842053, + "learning_rate": 1.5340559659689812e-07, + "loss": 0.8314, + "step": 12173 + }, + { + "epoch": 0.8783232928105047, + "grad_norm": 2.0792298785776357, + "learning_rate": 1.532261405207491e-07, + "loss": 0.8768, + "step": 12174 + }, + { + "epoch": 0.8783954402799322, + "grad_norm": 5.939874756388558, + "learning_rate": 1.5304678529008453e-07, + "loss": 0.9866, + "step": 12175 + }, + { + "epoch": 0.8784675877493597, + "grad_norm": 2.320939823723191, + "learning_rate": 1.5286753091469739e-07, + "loss": 0.8376, + "step": 12176 + }, + { + "epoch": 0.8785397352187873, + "grad_norm": 4.402199689510148, + "learning_rate": 1.5268837740437656e-07, + "loss": 0.8768, + "step": 12177 + }, + { + "epoch": 0.8786118826882147, + "grad_norm": 3.0319285468989827, + "learning_rate": 1.5250932476890532e-07, + "loss": 0.9421, + "step": 12178 + }, + { + "epoch": 0.8786840301576422, + "grad_norm": 2.053111817478727, + "learning_rate": 1.523303730180603e-07, + "loss": 0.897, + "step": 12179 + }, + { + "epoch": 0.8787561776270697, + "grad_norm": 0.7009854294627177, + "learning_rate": 1.5215152216161342e-07, + "loss": 0.7876, + "step": 12180 + }, + { + "epoch": 0.8788283250964972, + "grad_norm": 2.742547948899738, + "learning_rate": 1.5197277220933226e-07, + "loss": 0.8793, + "step": 12181 + }, + { + "epoch": 0.8789004725659247, + "grad_norm": 2.1997019912712794, + "learning_rate": 1.5179412317097629e-07, + "loss": 0.8737, + "step": 12182 + }, + { + "epoch": 0.8789726200353523, + "grad_norm": 2.784631966015837, + "learning_rate": 1.516155750563015e-07, + "loss": 0.9642, + "step": 12183 + }, + { + "epoch": 0.8790447675047798, + "grad_norm": 2.7680575777250507, + "learning_rate": 1.5143712787505769e-07, + "loss": 0.8293, + "step": 12184 + }, + { + "epoch": 0.8791169149742073, + "grad_norm": 3.6909945465270844, + "learning_rate": 1.5125878163698947e-07, + "loss": 0.8977, + "step": 12185 + }, + { + "epoch": 0.8791890624436348, + "grad_norm": 2.9857165132567727, + "learning_rate": 1.510805363518346e-07, + "loss": 0.8089, + "step": 12186 + }, + { + "epoch": 0.8792612099130623, + "grad_norm": 2.4867000616283996, + "learning_rate": 1.5090239202932775e-07, + "loss": 0.9021, + "step": 12187 + }, + { + "epoch": 0.8793333573824899, + "grad_norm": 1.8947977094954114, + "learning_rate": 1.5072434867919627e-07, + "loss": 0.933, + "step": 12188 + }, + { + "epoch": 0.8794055048519174, + "grad_norm": 2.285357917950899, + "learning_rate": 1.505464063111621e-07, + "loss": 0.9723, + "step": 12189 + }, + { + "epoch": 0.8794776523213448, + "grad_norm": 3.1465940923996247, + "learning_rate": 1.503685649349422e-07, + "loss": 0.948, + "step": 12190 + }, + { + "epoch": 0.8795497997907723, + "grad_norm": 1.8816611540887915, + "learning_rate": 1.5019082456024768e-07, + "loss": 0.9229, + "step": 12191 + }, + { + "epoch": 0.8796219472601998, + "grad_norm": 0.7525711096309218, + "learning_rate": 1.5001318519678496e-07, + "loss": 0.8457, + "step": 12192 + }, + { + "epoch": 0.8796940947296273, + "grad_norm": 2.747659901347268, + "learning_rate": 1.4983564685425343e-07, + "loss": 0.9046, + "step": 12193 + }, + { + "epoch": 0.8797662421990549, + "grad_norm": 3.1354784954138397, + "learning_rate": 1.496582095423482e-07, + "loss": 0.9578, + "step": 12194 + }, + { + "epoch": 0.8798383896684824, + "grad_norm": 2.35553980478183, + "learning_rate": 1.4948087327075888e-07, + "loss": 0.892, + "step": 12195 + }, + { + "epoch": 0.8799105371379099, + "grad_norm": 2.8427102000974847, + "learning_rate": 1.4930363804916813e-07, + "loss": 0.8795, + "step": 12196 + }, + { + "epoch": 0.8799826846073374, + "grad_norm": 2.4768156770604315, + "learning_rate": 1.4912650388725445e-07, + "loss": 0.93, + "step": 12197 + }, + { + "epoch": 0.8800548320767649, + "grad_norm": 2.408539512634854, + "learning_rate": 1.4894947079469145e-07, + "loss": 0.7917, + "step": 12198 + }, + { + "epoch": 0.8801269795461925, + "grad_norm": 1.921340123621007, + "learning_rate": 1.4877253878114516e-07, + "loss": 0.9459, + "step": 12199 + }, + { + "epoch": 0.88019912701562, + "grad_norm": 3.5982022777141998, + "learning_rate": 1.485957078562774e-07, + "loss": 0.8515, + "step": 12200 + }, + { + "epoch": 0.8802712744850474, + "grad_norm": 2.6444189140768604, + "learning_rate": 1.4841897802974422e-07, + "loss": 0.9275, + "step": 12201 + }, + { + "epoch": 0.8803434219544749, + "grad_norm": 2.553021929188378, + "learning_rate": 1.4824234931119684e-07, + "loss": 0.8711, + "step": 12202 + }, + { + "epoch": 0.8804155694239024, + "grad_norm": 2.780754816785223, + "learning_rate": 1.480658217102795e-07, + "loss": 1.0061, + "step": 12203 + }, + { + "epoch": 0.8804877168933299, + "grad_norm": 2.6852386186722743, + "learning_rate": 1.4788939523663158e-07, + "loss": 0.8896, + "step": 12204 + }, + { + "epoch": 0.8805598643627575, + "grad_norm": 6.513321942905447, + "learning_rate": 1.4771306989988829e-07, + "loss": 0.7916, + "step": 12205 + }, + { + "epoch": 0.880632011832185, + "grad_norm": 3.2909237630910777, + "learning_rate": 1.4753684570967705e-07, + "loss": 0.938, + "step": 12206 + }, + { + "epoch": 0.8807041593016125, + "grad_norm": 4.204680541396717, + "learning_rate": 1.4736072267562084e-07, + "loss": 0.8548, + "step": 12207 + }, + { + "epoch": 0.88077630677104, + "grad_norm": 5.962562429553116, + "learning_rate": 1.4718470080733746e-07, + "loss": 0.9243, + "step": 12208 + }, + { + "epoch": 0.8808484542404675, + "grad_norm": 4.351989834689442, + "learning_rate": 1.4700878011443906e-07, + "loss": 0.8059, + "step": 12209 + }, + { + "epoch": 0.8809206017098951, + "grad_norm": 3.3862297294632393, + "learning_rate": 1.4683296060653082e-07, + "loss": 0.8757, + "step": 12210 + }, + { + "epoch": 0.8809927491793226, + "grad_norm": 3.1724603422916235, + "learning_rate": 1.4665724229321485e-07, + "loss": 0.9445, + "step": 12211 + }, + { + "epoch": 0.8810648966487501, + "grad_norm": 2.6840843851607605, + "learning_rate": 1.4648162518408635e-07, + "loss": 0.962, + "step": 12212 + }, + { + "epoch": 0.8811370441181775, + "grad_norm": 2.360894932543023, + "learning_rate": 1.4630610928873454e-07, + "loss": 0.8806, + "step": 12213 + }, + { + "epoch": 0.881209191587605, + "grad_norm": 2.776450913676651, + "learning_rate": 1.46130694616744e-07, + "loss": 1.0081, + "step": 12214 + }, + { + "epoch": 0.8812813390570325, + "grad_norm": 2.434739138751232, + "learning_rate": 1.4595538117769369e-07, + "loss": 0.876, + "step": 12215 + }, + { + "epoch": 0.8813534865264601, + "grad_norm": 2.4723291647803367, + "learning_rate": 1.4578016898115574e-07, + "loss": 0.9846, + "step": 12216 + }, + { + "epoch": 0.8814256339958876, + "grad_norm": 1.9184626236715383, + "learning_rate": 1.456050580366992e-07, + "loss": 0.8332, + "step": 12217 + }, + { + "epoch": 0.8814977814653151, + "grad_norm": 3.002552218399998, + "learning_rate": 1.4543004835388572e-07, + "loss": 0.9019, + "step": 12218 + }, + { + "epoch": 0.8815699289347426, + "grad_norm": 2.7789311547030713, + "learning_rate": 1.4525513994227213e-07, + "loss": 0.9054, + "step": 12219 + }, + { + "epoch": 0.8816420764041701, + "grad_norm": 2.830263456880218, + "learning_rate": 1.450803328114092e-07, + "loss": 0.9869, + "step": 12220 + }, + { + "epoch": 0.8817142238735977, + "grad_norm": 3.0232162197468333, + "learning_rate": 1.4490562697084196e-07, + "loss": 0.8349, + "step": 12221 + }, + { + "epoch": 0.8817863713430252, + "grad_norm": 3.891710980072517, + "learning_rate": 1.4473102243011192e-07, + "loss": 0.9425, + "step": 12222 + }, + { + "epoch": 0.8818585188124527, + "grad_norm": 2.551409002990938, + "learning_rate": 1.4455651919875257e-07, + "loss": 0.9038, + "step": 12223 + }, + { + "epoch": 0.8819306662818802, + "grad_norm": 1.622910476627061, + "learning_rate": 1.443821172862929e-07, + "loss": 0.9396, + "step": 12224 + }, + { + "epoch": 0.8820028137513076, + "grad_norm": 3.3624544445482973, + "learning_rate": 1.4420781670225668e-07, + "loss": 0.8493, + "step": 12225 + }, + { + "epoch": 0.8820749612207351, + "grad_norm": 2.4000783271723654, + "learning_rate": 1.4403361745616204e-07, + "loss": 0.9096, + "step": 12226 + }, + { + "epoch": 0.8821471086901627, + "grad_norm": 2.0749189843597495, + "learning_rate": 1.4385951955752052e-07, + "loss": 0.943, + "step": 12227 + }, + { + "epoch": 0.8822192561595902, + "grad_norm": 1.939430259731151, + "learning_rate": 1.4368552301583935e-07, + "loss": 0.8888, + "step": 12228 + }, + { + "epoch": 0.8822914036290177, + "grad_norm": 8.26474946257896, + "learning_rate": 1.4351162784062032e-07, + "loss": 0.8354, + "step": 12229 + }, + { + "epoch": 0.8823635510984452, + "grad_norm": 1.865173722324565, + "learning_rate": 1.4333783404135868e-07, + "loss": 0.9194, + "step": 12230 + }, + { + "epoch": 0.8824356985678727, + "grad_norm": 2.8418450315399983, + "learning_rate": 1.4316414162754464e-07, + "loss": 0.9476, + "step": 12231 + }, + { + "epoch": 0.8825078460373003, + "grad_norm": 3.007854035326041, + "learning_rate": 1.429905506086635e-07, + "loss": 0.8302, + "step": 12232 + }, + { + "epoch": 0.8825799935067278, + "grad_norm": 4.506745263341644, + "learning_rate": 1.4281706099419344e-07, + "loss": 0.9683, + "step": 12233 + }, + { + "epoch": 0.8826521409761553, + "grad_norm": 2.8628918641907877, + "learning_rate": 1.4264367279360845e-07, + "loss": 0.7928, + "step": 12234 + }, + { + "epoch": 0.8827242884455828, + "grad_norm": 2.6238372589038668, + "learning_rate": 1.4247038601637717e-07, + "loss": 0.8054, + "step": 12235 + }, + { + "epoch": 0.8827964359150103, + "grad_norm": 2.4764798027214665, + "learning_rate": 1.4229720067196204e-07, + "loss": 0.9006, + "step": 12236 + }, + { + "epoch": 0.8828685833844377, + "grad_norm": 2.577128177171451, + "learning_rate": 1.421241167698195e-07, + "loss": 0.88, + "step": 12237 + }, + { + "epoch": 0.8829407308538653, + "grad_norm": 2.4148371666454764, + "learning_rate": 1.419511343194013e-07, + "loss": 0.9761, + "step": 12238 + }, + { + "epoch": 0.8830128783232928, + "grad_norm": 5.130499331617112, + "learning_rate": 1.417782533301537e-07, + "loss": 0.8743, + "step": 12239 + }, + { + "epoch": 0.8830850257927203, + "grad_norm": 2.130034261615986, + "learning_rate": 1.4160547381151622e-07, + "loss": 0.9204, + "step": 12240 + }, + { + "epoch": 0.8831571732621478, + "grad_norm": 2.716678400150852, + "learning_rate": 1.4143279577292444e-07, + "loss": 0.8287, + "step": 12241 + }, + { + "epoch": 0.8832293207315753, + "grad_norm": 2.300315615518826, + "learning_rate": 1.4126021922380793e-07, + "loss": 0.8769, + "step": 12242 + }, + { + "epoch": 0.8833014682010029, + "grad_norm": 3.419137774413655, + "learning_rate": 1.4108774417358982e-07, + "loss": 0.9246, + "step": 12243 + }, + { + "epoch": 0.8833736156704304, + "grad_norm": 2.484588134068375, + "learning_rate": 1.4091537063168835e-07, + "loss": 1.0218, + "step": 12244 + }, + { + "epoch": 0.8834457631398579, + "grad_norm": 1.9668083304229764, + "learning_rate": 1.407430986075162e-07, + "loss": 0.9011, + "step": 12245 + }, + { + "epoch": 0.8835179106092854, + "grad_norm": 7.536022146415161, + "learning_rate": 1.405709281104812e-07, + "loss": 0.8458, + "step": 12246 + }, + { + "epoch": 0.8835900580787129, + "grad_norm": 3.199537994888841, + "learning_rate": 1.4039885914998429e-07, + "loss": 1.0061, + "step": 12247 + }, + { + "epoch": 0.8836622055481403, + "grad_norm": 2.428602951710737, + "learning_rate": 1.4022689173542146e-07, + "loss": 1.0152, + "step": 12248 + }, + { + "epoch": 0.8837343530175679, + "grad_norm": 4.028609719717673, + "learning_rate": 1.400550258761839e-07, + "loss": 0.8918, + "step": 12249 + }, + { + "epoch": 0.8838065004869954, + "grad_norm": 5.037776907227253, + "learning_rate": 1.3988326158165586e-07, + "loss": 1.0001, + "step": 12250 + }, + { + "epoch": 0.8838786479564229, + "grad_norm": 3.34437013383717, + "learning_rate": 1.3971159886121675e-07, + "loss": 0.925, + "step": 12251 + }, + { + "epoch": 0.8839507954258504, + "grad_norm": 2.4390013545972153, + "learning_rate": 1.395400377242406e-07, + "loss": 0.8583, + "step": 12252 + }, + { + "epoch": 0.8840229428952779, + "grad_norm": 2.5438020105691854, + "learning_rate": 1.393685781800964e-07, + "loss": 0.9839, + "step": 12253 + }, + { + "epoch": 0.8840950903647055, + "grad_norm": 2.9197089178195355, + "learning_rate": 1.3919722023814572e-07, + "loss": 0.9028, + "step": 12254 + }, + { + "epoch": 0.884167237834133, + "grad_norm": 2.9350054273928836, + "learning_rate": 1.3902596390774667e-07, + "loss": 0.9471, + "step": 12255 + }, + { + "epoch": 0.8842393853035605, + "grad_norm": 3.377556574282004, + "learning_rate": 1.3885480919825087e-07, + "loss": 0.9848, + "step": 12256 + }, + { + "epoch": 0.884311532772988, + "grad_norm": 3.6004476745727643, + "learning_rate": 1.3868375611900396e-07, + "loss": 0.9765, + "step": 12257 + }, + { + "epoch": 0.8843836802424155, + "grad_norm": 3.200128111659012, + "learning_rate": 1.3851280467934644e-07, + "loss": 0.9868, + "step": 12258 + }, + { + "epoch": 0.8844558277118431, + "grad_norm": 2.5842986200524467, + "learning_rate": 1.3834195488861422e-07, + "loss": 0.9247, + "step": 12259 + }, + { + "epoch": 0.8845279751812705, + "grad_norm": 2.1352924838049807, + "learning_rate": 1.3817120675613602e-07, + "loss": 0.8475, + "step": 12260 + }, + { + "epoch": 0.884600122650698, + "grad_norm": 2.7083239530743257, + "learning_rate": 1.3800056029123596e-07, + "loss": 0.9644, + "step": 12261 + }, + { + "epoch": 0.8846722701201255, + "grad_norm": 2.228328927700773, + "learning_rate": 1.3783001550323237e-07, + "loss": 0.897, + "step": 12262 + }, + { + "epoch": 0.884744417589553, + "grad_norm": 3.0886894992659877, + "learning_rate": 1.3765957240143844e-07, + "loss": 0.9999, + "step": 12263 + }, + { + "epoch": 0.8848165650589805, + "grad_norm": 1.943427463278008, + "learning_rate": 1.374892309951603e-07, + "loss": 0.8576, + "step": 12264 + }, + { + "epoch": 0.8848887125284081, + "grad_norm": 2.2111053942228924, + "learning_rate": 1.3731899129370095e-07, + "loss": 0.8843, + "step": 12265 + }, + { + "epoch": 0.8849608599978356, + "grad_norm": 3.198168087166391, + "learning_rate": 1.371488533063565e-07, + "loss": 0.9117, + "step": 12266 + }, + { + "epoch": 0.8850330074672631, + "grad_norm": 3.448723106154979, + "learning_rate": 1.3697881704241665e-07, + "loss": 0.8846, + "step": 12267 + }, + { + "epoch": 0.8851051549366906, + "grad_norm": 1.9987385467143892, + "learning_rate": 1.3680888251116706e-07, + "loss": 0.9044, + "step": 12268 + }, + { + "epoch": 0.8851773024061181, + "grad_norm": 4.519104986110444, + "learning_rate": 1.3663904972188677e-07, + "loss": 1.0219, + "step": 12269 + }, + { + "epoch": 0.8852494498755457, + "grad_norm": 2.326400322830671, + "learning_rate": 1.3646931868385059e-07, + "loss": 0.9503, + "step": 12270 + }, + { + "epoch": 0.8853215973449732, + "grad_norm": 3.249039141127138, + "learning_rate": 1.3629968940632597e-07, + "loss": 0.8693, + "step": 12271 + }, + { + "epoch": 0.8853937448144006, + "grad_norm": 3.3116271110467177, + "learning_rate": 1.361301618985764e-07, + "loss": 0.87, + "step": 12272 + }, + { + "epoch": 0.8854658922838281, + "grad_norm": 3.069695408277044, + "learning_rate": 1.3596073616985894e-07, + "loss": 0.9108, + "step": 12273 + }, + { + "epoch": 0.8855380397532556, + "grad_norm": 0.7928863632925054, + "learning_rate": 1.3579141222942503e-07, + "loss": 0.8452, + "step": 12274 + }, + { + "epoch": 0.8856101872226831, + "grad_norm": 1.730076799286683, + "learning_rate": 1.356221900865211e-07, + "loss": 0.8832, + "step": 12275 + }, + { + "epoch": 0.8856823346921107, + "grad_norm": 3.1941605335803374, + "learning_rate": 1.3545306975038794e-07, + "loss": 0.8403, + "step": 12276 + }, + { + "epoch": 0.8857544821615382, + "grad_norm": 2.5653230886743854, + "learning_rate": 1.3528405123025954e-07, + "loss": 0.8305, + "step": 12277 + }, + { + "epoch": 0.8858266296309657, + "grad_norm": 3.8168018022848402, + "learning_rate": 1.351151345353667e-07, + "loss": 0.9606, + "step": 12278 + }, + { + "epoch": 0.8858987771003932, + "grad_norm": 1.9737681059825258, + "learning_rate": 1.3494631967493276e-07, + "loss": 0.8622, + "step": 12279 + }, + { + "epoch": 0.8859709245698207, + "grad_norm": 4.333430264684862, + "learning_rate": 1.3477760665817628e-07, + "loss": 0.9084, + "step": 12280 + }, + { + "epoch": 0.8860430720392483, + "grad_norm": 2.445856295926788, + "learning_rate": 1.346089954943097e-07, + "loss": 0.9207, + "step": 12281 + }, + { + "epoch": 0.8861152195086758, + "grad_norm": 2.5741038086950248, + "learning_rate": 1.344404861925401e-07, + "loss": 0.8264, + "step": 12282 + }, + { + "epoch": 0.8861873669781033, + "grad_norm": 2.362385023437628, + "learning_rate": 1.3427207876207013e-07, + "loss": 0.8753, + "step": 12283 + }, + { + "epoch": 0.8862595144475307, + "grad_norm": 2.908264938322685, + "learning_rate": 1.3410377321209487e-07, + "loss": 0.9183, + "step": 12284 + }, + { + "epoch": 0.8863316619169582, + "grad_norm": 2.969181230245057, + "learning_rate": 1.3393556955180519e-07, + "loss": 1.042, + "step": 12285 + }, + { + "epoch": 0.8864038093863857, + "grad_norm": 2.15203622604567, + "learning_rate": 1.3376746779038662e-07, + "loss": 0.9279, + "step": 12286 + }, + { + "epoch": 0.8864759568558133, + "grad_norm": 0.8705240619145064, + "learning_rate": 1.335994679370176e-07, + "loss": 0.9499, + "step": 12287 + }, + { + "epoch": 0.8865481043252408, + "grad_norm": 2.677134017365592, + "learning_rate": 1.3343157000087234e-07, + "loss": 0.903, + "step": 12288 + }, + { + "epoch": 0.8866202517946683, + "grad_norm": 3.1774674146948785, + "learning_rate": 1.3326377399111933e-07, + "loss": 0.9117, + "step": 12289 + }, + { + "epoch": 0.8866923992640958, + "grad_norm": 2.3792829544143648, + "learning_rate": 1.3309607991692162e-07, + "loss": 0.883, + "step": 12290 + }, + { + "epoch": 0.8867645467335233, + "grad_norm": 2.288788554493441, + "learning_rate": 1.329284877874357e-07, + "loss": 1.0329, + "step": 12291 + }, + { + "epoch": 0.8868366942029509, + "grad_norm": 2.673903024654018, + "learning_rate": 1.3276099761181314e-07, + "loss": 0.8747, + "step": 12292 + }, + { + "epoch": 0.8869088416723784, + "grad_norm": 5.133187801395399, + "learning_rate": 1.3259360939920084e-07, + "loss": 0.9773, + "step": 12293 + }, + { + "epoch": 0.8869809891418059, + "grad_norm": 0.7025624461944753, + "learning_rate": 1.3242632315873792e-07, + "loss": 0.8292, + "step": 12294 + }, + { + "epoch": 0.8870531366112333, + "grad_norm": 0.762528248560706, + "learning_rate": 1.322591388995602e-07, + "loss": 0.7911, + "step": 12295 + }, + { + "epoch": 0.8871252840806608, + "grad_norm": 2.8334623943180626, + "learning_rate": 1.320920566307968e-07, + "loss": 0.9289, + "step": 12296 + }, + { + "epoch": 0.8871974315500883, + "grad_norm": 10.068812315527214, + "learning_rate": 1.3192507636157179e-07, + "loss": 0.9235, + "step": 12297 + }, + { + "epoch": 0.8872695790195159, + "grad_norm": 2.4261843827035308, + "learning_rate": 1.3175819810100252e-07, + "loss": 0.874, + "step": 12298 + }, + { + "epoch": 0.8873417264889434, + "grad_norm": 3.278024741386062, + "learning_rate": 1.3159142185820216e-07, + "loss": 0.9732, + "step": 12299 + }, + { + "epoch": 0.8874138739583709, + "grad_norm": 4.393916189596835, + "learning_rate": 1.3142474764227807e-07, + "loss": 0.8508, + "step": 12300 + }, + { + "epoch": 0.8874860214277984, + "grad_norm": 4.087975076020886, + "learning_rate": 1.3125817546233054e-07, + "loss": 0.9944, + "step": 12301 + }, + { + "epoch": 0.8875581688972259, + "grad_norm": 2.1374814975227734, + "learning_rate": 1.3109170532745651e-07, + "loss": 0.8956, + "step": 12302 + }, + { + "epoch": 0.8876303163666535, + "grad_norm": 2.7885653696535964, + "learning_rate": 1.3092533724674625e-07, + "loss": 0.8652, + "step": 12303 + }, + { + "epoch": 0.887702463836081, + "grad_norm": 2.2032859148279527, + "learning_rate": 1.307590712292841e-07, + "loss": 0.835, + "step": 12304 + }, + { + "epoch": 0.8877746113055085, + "grad_norm": 2.7374258405839114, + "learning_rate": 1.3059290728414918e-07, + "loss": 0.8765, + "step": 12305 + }, + { + "epoch": 0.887846758774936, + "grad_norm": 4.770358976855558, + "learning_rate": 1.3042684542041514e-07, + "loss": 0.923, + "step": 12306 + }, + { + "epoch": 0.8879189062443634, + "grad_norm": 2.307589808046154, + "learning_rate": 1.3026088564715055e-07, + "loss": 0.8575, + "step": 12307 + }, + { + "epoch": 0.887991053713791, + "grad_norm": 1.9997773046488143, + "learning_rate": 1.3009502797341743e-07, + "loss": 0.9413, + "step": 12308 + }, + { + "epoch": 0.8880632011832185, + "grad_norm": 2.686553965997936, + "learning_rate": 1.2992927240827256e-07, + "loss": 0.9203, + "step": 12309 + }, + { + "epoch": 0.888135348652646, + "grad_norm": 2.492440237784694, + "learning_rate": 1.297636189607676e-07, + "loss": 0.895, + "step": 12310 + }, + { + "epoch": 0.8882074961220735, + "grad_norm": 2.472967577193017, + "learning_rate": 1.2959806763994774e-07, + "loss": 0.8807, + "step": 12311 + }, + { + "epoch": 0.888279643591501, + "grad_norm": 2.445370901952097, + "learning_rate": 1.294326184548531e-07, + "loss": 0.9406, + "step": 12312 + }, + { + "epoch": 0.8883517910609285, + "grad_norm": 2.1598162806050882, + "learning_rate": 1.2926727141451889e-07, + "loss": 0.9287, + "step": 12313 + }, + { + "epoch": 0.8884239385303561, + "grad_norm": 2.4307928484175125, + "learning_rate": 1.291020265279741e-07, + "loss": 0.9221, + "step": 12314 + }, + { + "epoch": 0.8884960859997836, + "grad_norm": 2.603254804546143, + "learning_rate": 1.2893688380424152e-07, + "loss": 1.0441, + "step": 12315 + }, + { + "epoch": 0.8885682334692111, + "grad_norm": 3.4114153133718, + "learning_rate": 1.2877184325233926e-07, + "loss": 0.8994, + "step": 12316 + }, + { + "epoch": 0.8886403809386386, + "grad_norm": 2.41996621005846, + "learning_rate": 1.286069048812799e-07, + "loss": 1.047, + "step": 12317 + }, + { + "epoch": 0.8887125284080661, + "grad_norm": 3.0400426701010557, + "learning_rate": 1.2844206870006936e-07, + "loss": 0.8736, + "step": 12318 + }, + { + "epoch": 0.8887846758774935, + "grad_norm": 1.8292582333295924, + "learning_rate": 1.282773347177093e-07, + "loss": 0.8208, + "step": 12319 + }, + { + "epoch": 0.8888568233469211, + "grad_norm": 2.1874706811302507, + "learning_rate": 1.2811270294319541e-07, + "loss": 0.9636, + "step": 12320 + }, + { + "epoch": 0.8889289708163486, + "grad_norm": 2.954813640933879, + "learning_rate": 1.27948173385517e-07, + "loss": 0.9433, + "step": 12321 + }, + { + "epoch": 0.8890011182857761, + "grad_norm": 2.6503799202125844, + "learning_rate": 1.2778374605365883e-07, + "loss": 0.999, + "step": 12322 + }, + { + "epoch": 0.8890732657552036, + "grad_norm": 2.3842794936664737, + "learning_rate": 1.276194209565995e-07, + "loss": 0.9526, + "step": 12323 + }, + { + "epoch": 0.8891454132246311, + "grad_norm": 3.5840602107950827, + "learning_rate": 1.274551981033125e-07, + "loss": 0.9029, + "step": 12324 + }, + { + "epoch": 0.8892175606940587, + "grad_norm": 2.828176002674818, + "learning_rate": 1.2729107750276492e-07, + "loss": 0.8293, + "step": 12325 + }, + { + "epoch": 0.8892897081634862, + "grad_norm": 1.5187672555202227, + "learning_rate": 1.2712705916391909e-07, + "loss": 0.9692, + "step": 12326 + }, + { + "epoch": 0.8893618556329137, + "grad_norm": 2.986845900942016, + "learning_rate": 1.2696314309573186e-07, + "loss": 0.9668, + "step": 12327 + }, + { + "epoch": 0.8894340031023412, + "grad_norm": 2.557055201478013, + "learning_rate": 1.267993293071532e-07, + "loss": 0.9345, + "step": 12328 + }, + { + "epoch": 0.8895061505717687, + "grad_norm": 3.487398213094096, + "learning_rate": 1.2663561780712884e-07, + "loss": 0.9167, + "step": 12329 + }, + { + "epoch": 0.8895782980411963, + "grad_norm": 2.583636501940185, + "learning_rate": 1.2647200860459893e-07, + "loss": 0.8927, + "step": 12330 + }, + { + "epoch": 0.8896504455106237, + "grad_norm": 2.1162107121740767, + "learning_rate": 1.2630850170849617e-07, + "loss": 0.8689, + "step": 12331 + }, + { + "epoch": 0.8897225929800512, + "grad_norm": 2.431174556081371, + "learning_rate": 1.2614509712775047e-07, + "loss": 0.9109, + "step": 12332 + }, + { + "epoch": 0.8897947404494787, + "grad_norm": 2.6832790495631844, + "learning_rate": 1.2598179487128423e-07, + "loss": 0.898, + "step": 12333 + }, + { + "epoch": 0.8898668879189062, + "grad_norm": 2.8081755558899366, + "learning_rate": 1.2581859494801506e-07, + "loss": 0.8957, + "step": 12334 + }, + { + "epoch": 0.8899390353883337, + "grad_norm": 4.863894014170035, + "learning_rate": 1.256554973668542e-07, + "loss": 0.8054, + "step": 12335 + }, + { + "epoch": 0.8900111828577613, + "grad_norm": 1.8834587032934167, + "learning_rate": 1.2549250213670748e-07, + "loss": 0.8991, + "step": 12336 + }, + { + "epoch": 0.8900833303271888, + "grad_norm": 2.0646159559868105, + "learning_rate": 1.2532960926647684e-07, + "loss": 0.9867, + "step": 12337 + }, + { + "epoch": 0.8901554777966163, + "grad_norm": 1.9504250136616983, + "learning_rate": 1.251668187650563e-07, + "loss": 0.9412, + "step": 12338 + }, + { + "epoch": 0.8902276252660438, + "grad_norm": 2.1841125417923966, + "learning_rate": 1.250041306413352e-07, + "loss": 0.9435, + "step": 12339 + }, + { + "epoch": 0.8902997727354713, + "grad_norm": 2.715321099558219, + "learning_rate": 1.2484154490419752e-07, + "loss": 1.0101, + "step": 12340 + }, + { + "epoch": 0.8903719202048989, + "grad_norm": 2.9106256503251267, + "learning_rate": 1.2467906156252174e-07, + "loss": 0.9074, + "step": 12341 + }, + { + "epoch": 0.8904440676743264, + "grad_norm": 2.9146876751153354, + "learning_rate": 1.2451668062517962e-07, + "loss": 0.8821, + "step": 12342 + }, + { + "epoch": 0.8905162151437538, + "grad_norm": 2.5421381236921805, + "learning_rate": 1.243544021010392e-07, + "loss": 0.8997, + "step": 12343 + }, + { + "epoch": 0.8905883626131813, + "grad_norm": 2.3929436104774395, + "learning_rate": 1.2419222599896162e-07, + "loss": 0.9449, + "step": 12344 + }, + { + "epoch": 0.8906605100826088, + "grad_norm": 0.8131619837777831, + "learning_rate": 1.2403015232780244e-07, + "loss": 0.8217, + "step": 12345 + }, + { + "epoch": 0.8907326575520363, + "grad_norm": 2.463330948724827, + "learning_rate": 1.2386818109641194e-07, + "loss": 0.9156, + "step": 12346 + }, + { + "epoch": 0.8908048050214639, + "grad_norm": 2.508540823160429, + "learning_rate": 1.2370631231363526e-07, + "loss": 0.8483, + "step": 12347 + }, + { + "epoch": 0.8908769524908914, + "grad_norm": 2.3812811261242848, + "learning_rate": 1.2354454598831066e-07, + "loss": 0.8624, + "step": 12348 + }, + { + "epoch": 0.8909490999603189, + "grad_norm": 2.7786636328134553, + "learning_rate": 1.2338288212927172e-07, + "loss": 0.8836, + "step": 12349 + }, + { + "epoch": 0.8910212474297464, + "grad_norm": 3.3889221978418007, + "learning_rate": 1.2322132074534697e-07, + "loss": 0.9384, + "step": 12350 + }, + { + "epoch": 0.8910933948991739, + "grad_norm": 1.948707834684391, + "learning_rate": 1.2305986184535843e-07, + "loss": 0.8986, + "step": 12351 + }, + { + "epoch": 0.8911655423686015, + "grad_norm": 2.5077688623357286, + "learning_rate": 1.2289850543812242e-07, + "loss": 0.986, + "step": 12352 + }, + { + "epoch": 0.891237689838029, + "grad_norm": 2.1050863113286615, + "learning_rate": 1.227372515324503e-07, + "loss": 0.9218, + "step": 12353 + }, + { + "epoch": 0.8913098373074564, + "grad_norm": 2.4114823364205695, + "learning_rate": 1.225761001371477e-07, + "loss": 0.8641, + "step": 12354 + }, + { + "epoch": 0.8913819847768839, + "grad_norm": 1.9780025739585552, + "learning_rate": 1.224150512610138e-07, + "loss": 0.919, + "step": 12355 + }, + { + "epoch": 0.8914541322463114, + "grad_norm": 2.462284555063151, + "learning_rate": 1.222541049128436e-07, + "loss": 0.9479, + "step": 12356 + }, + { + "epoch": 0.891526279715739, + "grad_norm": 3.6608345437317418, + "learning_rate": 1.2209326110142537e-07, + "loss": 0.8846, + "step": 12357 + }, + { + "epoch": 0.8915984271851665, + "grad_norm": 4.239209107824628, + "learning_rate": 1.2193251983554298e-07, + "loss": 0.882, + "step": 12358 + }, + { + "epoch": 0.891670574654594, + "grad_norm": 2.6667787905803895, + "learning_rate": 1.2177188112397296e-07, + "loss": 0.9341, + "step": 12359 + }, + { + "epoch": 0.8917427221240215, + "grad_norm": 2.9440646576415443, + "learning_rate": 1.2161134497548697e-07, + "loss": 0.9406, + "step": 12360 + }, + { + "epoch": 0.891814869593449, + "grad_norm": 0.7138073605204678, + "learning_rate": 1.2145091139885287e-07, + "loss": 0.8144, + "step": 12361 + }, + { + "epoch": 0.8918870170628765, + "grad_norm": 5.45433472601992, + "learning_rate": 1.212905804028297e-07, + "loss": 0.9258, + "step": 12362 + }, + { + "epoch": 0.8919591645323041, + "grad_norm": 2.645423637169059, + "learning_rate": 1.211303519961735e-07, + "loss": 0.8203, + "step": 12363 + }, + { + "epoch": 0.8920313120017316, + "grad_norm": 2.2089585792661053, + "learning_rate": 1.2097022618763352e-07, + "loss": 0.88, + "step": 12364 + }, + { + "epoch": 0.8921034594711591, + "grad_norm": 2.0095193339431225, + "learning_rate": 1.2081020298595323e-07, + "loss": 0.9036, + "step": 12365 + }, + { + "epoch": 0.8921756069405865, + "grad_norm": 10.366687799986204, + "learning_rate": 1.2065028239987118e-07, + "loss": 1.0275, + "step": 12366 + }, + { + "epoch": 0.892247754410014, + "grad_norm": 0.977467659125004, + "learning_rate": 1.2049046443812017e-07, + "loss": 0.8813, + "step": 12367 + }, + { + "epoch": 0.8923199018794415, + "grad_norm": 2.359428657780524, + "learning_rate": 1.2033074910942764e-07, + "loss": 0.9445, + "step": 12368 + }, + { + "epoch": 0.8923920493488691, + "grad_norm": 3.002085278890078, + "learning_rate": 1.2017113642251418e-07, + "loss": 0.9, + "step": 12369 + }, + { + "epoch": 0.8924641968182966, + "grad_norm": 2.915621833758664, + "learning_rate": 1.200116263860962e-07, + "loss": 0.9039, + "step": 12370 + }, + { + "epoch": 0.8925363442877241, + "grad_norm": 2.884296975780169, + "learning_rate": 1.1985221900888398e-07, + "loss": 0.8907, + "step": 12371 + }, + { + "epoch": 0.8926084917571516, + "grad_norm": 6.8576213011267955, + "learning_rate": 1.1969291429958173e-07, + "loss": 0.8759, + "step": 12372 + }, + { + "epoch": 0.8926806392265791, + "grad_norm": 2.61598492223145, + "learning_rate": 1.1953371226688847e-07, + "loss": 0.8977, + "step": 12373 + }, + { + "epoch": 0.8927527866960067, + "grad_norm": 2.3722348429006406, + "learning_rate": 1.1937461291949857e-07, + "loss": 0.887, + "step": 12374 + }, + { + "epoch": 0.8928249341654342, + "grad_norm": 2.7744383819276877, + "learning_rate": 1.1921561626609888e-07, + "loss": 0.8373, + "step": 12375 + }, + { + "epoch": 0.8928970816348617, + "grad_norm": 2.8510181482246564, + "learning_rate": 1.1905672231537223e-07, + "loss": 0.8675, + "step": 12376 + }, + { + "epoch": 0.8929692291042892, + "grad_norm": 2.6849512833876594, + "learning_rate": 1.1889793107599455e-07, + "loss": 0.886, + "step": 12377 + }, + { + "epoch": 0.8930413765737166, + "grad_norm": 4.501894022456405, + "learning_rate": 1.187392425566378e-07, + "loss": 0.9985, + "step": 12378 + }, + { + "epoch": 0.8931135240431441, + "grad_norm": 2.2667457702334723, + "learning_rate": 1.1858065676596618e-07, + "loss": 0.995, + "step": 12379 + }, + { + "epoch": 0.8931856715125717, + "grad_norm": 2.9306949765113943, + "learning_rate": 1.1842217371264052e-07, + "loss": 0.955, + "step": 12380 + }, + { + "epoch": 0.8932578189819992, + "grad_norm": 2.4973139638412674, + "learning_rate": 1.1826379340531478e-07, + "loss": 0.8352, + "step": 12381 + }, + { + "epoch": 0.8933299664514267, + "grad_norm": 2.786411767493258, + "learning_rate": 1.1810551585263694e-07, + "loss": 1.0075, + "step": 12382 + }, + { + "epoch": 0.8934021139208542, + "grad_norm": 0.6440258703499954, + "learning_rate": 1.1794734106325055e-07, + "loss": 0.7226, + "step": 12383 + }, + { + "epoch": 0.8934742613902817, + "grad_norm": 0.7504194993793992, + "learning_rate": 1.1778926904579222e-07, + "loss": 0.8215, + "step": 12384 + }, + { + "epoch": 0.8935464088597093, + "grad_norm": 2.0890865487174595, + "learning_rate": 1.1763129980889485e-07, + "loss": 0.9059, + "step": 12385 + }, + { + "epoch": 0.8936185563291368, + "grad_norm": 2.1809954256956887, + "learning_rate": 1.1747343336118376e-07, + "loss": 0.7942, + "step": 12386 + }, + { + "epoch": 0.8936907037985643, + "grad_norm": 6.159496709897512, + "learning_rate": 1.1731566971127959e-07, + "loss": 0.8861, + "step": 12387 + }, + { + "epoch": 0.8937628512679918, + "grad_norm": 3.7167818647473445, + "learning_rate": 1.1715800886779726e-07, + "loss": 0.753, + "step": 12388 + }, + { + "epoch": 0.8938349987374193, + "grad_norm": 2.1857463892191653, + "learning_rate": 1.1700045083934606e-07, + "loss": 0.9132, + "step": 12389 + }, + { + "epoch": 0.8939071462068467, + "grad_norm": 4.61156078173409, + "learning_rate": 1.1684299563452915e-07, + "loss": 0.949, + "step": 12390 + }, + { + "epoch": 0.8939792936762743, + "grad_norm": 2.615916475852483, + "learning_rate": 1.1668564326194563e-07, + "loss": 0.9118, + "step": 12391 + }, + { + "epoch": 0.8940514411457018, + "grad_norm": 2.58756354802182, + "learning_rate": 1.1652839373018707e-07, + "loss": 0.9638, + "step": 12392 + }, + { + "epoch": 0.8941235886151293, + "grad_norm": 4.467538736150901, + "learning_rate": 1.1637124704784063e-07, + "loss": 0.982, + "step": 12393 + }, + { + "epoch": 0.8941957360845568, + "grad_norm": 2.197895742982702, + "learning_rate": 1.1621420322348741e-07, + "loss": 0.8714, + "step": 12394 + }, + { + "epoch": 0.8942678835539843, + "grad_norm": 10.267644071086016, + "learning_rate": 1.1605726226570323e-07, + "loss": 0.9334, + "step": 12395 + }, + { + "epoch": 0.8943400310234119, + "grad_norm": 2.6026573283107437, + "learning_rate": 1.1590042418305745e-07, + "loss": 0.967, + "step": 12396 + }, + { + "epoch": 0.8944121784928394, + "grad_norm": 3.859577621515586, + "learning_rate": 1.157436889841148e-07, + "loss": 0.8835, + "step": 12397 + }, + { + "epoch": 0.8944843259622669, + "grad_norm": 2.2719633691279397, + "learning_rate": 1.155870566774344e-07, + "loss": 0.8671, + "step": 12398 + }, + { + "epoch": 0.8945564734316944, + "grad_norm": 4.762101668873472, + "learning_rate": 1.1543052727156877e-07, + "loss": 0.9293, + "step": 12399 + }, + { + "epoch": 0.8946286209011219, + "grad_norm": 2.5005833898780274, + "learning_rate": 1.152741007750655e-07, + "loss": 0.9256, + "step": 12400 + }, + { + "epoch": 0.8947007683705493, + "grad_norm": 2.242940636660935, + "learning_rate": 1.1511777719646664e-07, + "loss": 0.9349, + "step": 12401 + }, + { + "epoch": 0.8947729158399769, + "grad_norm": 0.908872886545082, + "learning_rate": 1.1496155654430873e-07, + "loss": 0.8067, + "step": 12402 + }, + { + "epoch": 0.8948450633094044, + "grad_norm": 2.5180160816275987, + "learning_rate": 1.1480543882712135e-07, + "loss": 0.8018, + "step": 12403 + }, + { + "epoch": 0.8949172107788319, + "grad_norm": 1.909418817944496, + "learning_rate": 1.1464942405343059e-07, + "loss": 0.9158, + "step": 12404 + }, + { + "epoch": 0.8949893582482594, + "grad_norm": 4.085802123414596, + "learning_rate": 1.1449351223175564e-07, + "loss": 0.8208, + "step": 12405 + }, + { + "epoch": 0.895061505717687, + "grad_norm": 2.116639576635945, + "learning_rate": 1.1433770337060988e-07, + "loss": 0.9633, + "step": 12406 + }, + { + "epoch": 0.8951336531871145, + "grad_norm": 2.6900973521488187, + "learning_rate": 1.1418199747850166e-07, + "loss": 0.9884, + "step": 12407 + }, + { + "epoch": 0.895205800656542, + "grad_norm": 2.6859619707609457, + "learning_rate": 1.1402639456393371e-07, + "loss": 0.9666, + "step": 12408 + }, + { + "epoch": 0.8952779481259695, + "grad_norm": 4.234870909757989, + "learning_rate": 1.1387089463540211e-07, + "loss": 0.9037, + "step": 12409 + }, + { + "epoch": 0.895350095595397, + "grad_norm": 2.2589714481327565, + "learning_rate": 1.1371549770139921e-07, + "loss": 0.9911, + "step": 12410 + }, + { + "epoch": 0.8954222430648245, + "grad_norm": 3.204532667536652, + "learning_rate": 1.1356020377040998e-07, + "loss": 0.8319, + "step": 12411 + }, + { + "epoch": 0.8954943905342521, + "grad_norm": 2.4487464851015344, + "learning_rate": 1.1340501285091497e-07, + "loss": 0.8436, + "step": 12412 + }, + { + "epoch": 0.8955665380036795, + "grad_norm": 2.947036683712798, + "learning_rate": 1.1324992495138808e-07, + "loss": 0.9016, + "step": 12413 + }, + { + "epoch": 0.895638685473107, + "grad_norm": 2.3788818959459967, + "learning_rate": 1.1309494008029808e-07, + "loss": 0.8659, + "step": 12414 + }, + { + "epoch": 0.8957108329425345, + "grad_norm": 2.7667927835820114, + "learning_rate": 1.1294005824610887e-07, + "loss": 0.9072, + "step": 12415 + }, + { + "epoch": 0.895782980411962, + "grad_norm": 3.055743647094005, + "learning_rate": 1.12785279457277e-07, + "loss": 0.8416, + "step": 12416 + }, + { + "epoch": 0.8958551278813895, + "grad_norm": 6.576963054760277, + "learning_rate": 1.1263060372225486e-07, + "loss": 0.9249, + "step": 12417 + }, + { + "epoch": 0.8959272753508171, + "grad_norm": 2.433262393736035, + "learning_rate": 1.12476031049489e-07, + "loss": 0.9466, + "step": 12418 + }, + { + "epoch": 0.8959994228202446, + "grad_norm": 0.7213337524683728, + "learning_rate": 1.1232156144741933e-07, + "loss": 0.8553, + "step": 12419 + }, + { + "epoch": 0.8960715702896721, + "grad_norm": 6.609024074608142, + "learning_rate": 1.1216719492448112e-07, + "loss": 0.8541, + "step": 12420 + }, + { + "epoch": 0.8961437177590996, + "grad_norm": 2.49541119002334, + "learning_rate": 1.1201293148910385e-07, + "loss": 0.8993, + "step": 12421 + }, + { + "epoch": 0.8962158652285271, + "grad_norm": 3.652996996226983, + "learning_rate": 1.1185877114971166e-07, + "loss": 0.9324, + "step": 12422 + }, + { + "epoch": 0.8962880126979547, + "grad_norm": 3.9933497459655993, + "learning_rate": 1.1170471391472225e-07, + "loss": 0.9499, + "step": 12423 + }, + { + "epoch": 0.8963601601673822, + "grad_norm": 3.2677259485858103, + "learning_rate": 1.1155075979254802e-07, + "loss": 0.9139, + "step": 12424 + }, + { + "epoch": 0.8964323076368096, + "grad_norm": 1.8018272159100779, + "learning_rate": 1.1139690879159624e-07, + "loss": 0.9543, + "step": 12425 + }, + { + "epoch": 0.8965044551062371, + "grad_norm": 2.2411711512672254, + "learning_rate": 1.1124316092026776e-07, + "loss": 0.9254, + "step": 12426 + }, + { + "epoch": 0.8965766025756646, + "grad_norm": 2.422786024468799, + "learning_rate": 1.1108951618695783e-07, + "loss": 0.9683, + "step": 12427 + }, + { + "epoch": 0.8966487500450921, + "grad_norm": 1.911939478993736, + "learning_rate": 1.1093597460005755e-07, + "loss": 0.9272, + "step": 12428 + }, + { + "epoch": 0.8967208975145197, + "grad_norm": 0.8642338463860151, + "learning_rate": 1.1078253616795064e-07, + "loss": 0.8929, + "step": 12429 + }, + { + "epoch": 0.8967930449839472, + "grad_norm": 3.413683139142075, + "learning_rate": 1.106292008990155e-07, + "loss": 0.9465, + "step": 12430 + }, + { + "epoch": 0.8968651924533747, + "grad_norm": 2.291464316942403, + "learning_rate": 1.1047596880162569e-07, + "loss": 0.8985, + "step": 12431 + }, + { + "epoch": 0.8969373399228022, + "grad_norm": 4.632624141472868, + "learning_rate": 1.1032283988414847e-07, + "loss": 0.8377, + "step": 12432 + }, + { + "epoch": 0.8970094873922297, + "grad_norm": 3.296251717177498, + "learning_rate": 1.101698141549452e-07, + "loss": 0.9558, + "step": 12433 + }, + { + "epoch": 0.8970816348616573, + "grad_norm": 2.080747856530448, + "learning_rate": 1.100168916223727e-07, + "loss": 0.7982, + "step": 12434 + }, + { + "epoch": 0.8971537823310848, + "grad_norm": 1.8976097028982781, + "learning_rate": 1.0986407229478167e-07, + "loss": 1.0083, + "step": 12435 + }, + { + "epoch": 0.8972259298005123, + "grad_norm": 2.8356449196956417, + "learning_rate": 1.0971135618051608e-07, + "loss": 0.8462, + "step": 12436 + }, + { + "epoch": 0.8972980772699397, + "grad_norm": 0.7935706424071578, + "learning_rate": 1.0955874328791592e-07, + "loss": 0.9162, + "step": 12437 + }, + { + "epoch": 0.8973702247393672, + "grad_norm": 3.8198613873643055, + "learning_rate": 1.0940623362531431e-07, + "loss": 0.9769, + "step": 12438 + }, + { + "epoch": 0.8974423722087947, + "grad_norm": 2.9653866638949884, + "learning_rate": 1.0925382720104015e-07, + "loss": 0.8668, + "step": 12439 + }, + { + "epoch": 0.8975145196782223, + "grad_norm": 3.255681280747033, + "learning_rate": 1.0910152402341477e-07, + "loss": 0.8271, + "step": 12440 + }, + { + "epoch": 0.8975866671476498, + "grad_norm": 2.2164263310563146, + "learning_rate": 1.089493241007553e-07, + "loss": 1.0201, + "step": 12441 + }, + { + "epoch": 0.8976588146170773, + "grad_norm": 2.396734272830542, + "learning_rate": 1.0879722744137331e-07, + "loss": 0.8829, + "step": 12442 + }, + { + "epoch": 0.8977309620865048, + "grad_norm": 1.639057609616267, + "learning_rate": 1.0864523405357329e-07, + "loss": 0.9359, + "step": 12443 + }, + { + "epoch": 0.8978031095559323, + "grad_norm": 3.6845136284733377, + "learning_rate": 1.0849334394565501e-07, + "loss": 0.8922, + "step": 12444 + }, + { + "epoch": 0.8978752570253599, + "grad_norm": 2.042441422218958, + "learning_rate": 1.0834155712591364e-07, + "loss": 0.9814, + "step": 12445 + }, + { + "epoch": 0.8979474044947874, + "grad_norm": 2.224495661293372, + "learning_rate": 1.0818987360263743e-07, + "loss": 0.8975, + "step": 12446 + }, + { + "epoch": 0.8980195519642149, + "grad_norm": 1.766941288830282, + "learning_rate": 1.0803829338410863e-07, + "loss": 0.9401, + "step": 12447 + }, + { + "epoch": 0.8980916994336423, + "grad_norm": 6.070959108298203, + "learning_rate": 1.0788681647860488e-07, + "loss": 0.8063, + "step": 12448 + }, + { + "epoch": 0.8981638469030698, + "grad_norm": 2.83598541975914, + "learning_rate": 1.0773544289439795e-07, + "loss": 0.8901, + "step": 12449 + }, + { + "epoch": 0.8982359943724973, + "grad_norm": 3.0079207760036426, + "learning_rate": 1.0758417263975327e-07, + "loss": 0.9626, + "step": 12450 + }, + { + "epoch": 0.8983081418419249, + "grad_norm": 2.192683264316469, + "learning_rate": 1.0743300572293113e-07, + "loss": 0.8457, + "step": 12451 + }, + { + "epoch": 0.8983802893113524, + "grad_norm": 2.811714401729324, + "learning_rate": 1.072819421521871e-07, + "loss": 0.8732, + "step": 12452 + }, + { + "epoch": 0.8984524367807799, + "grad_norm": 3.946970736564979, + "learning_rate": 1.0713098193576953e-07, + "loss": 0.8264, + "step": 12453 + }, + { + "epoch": 0.8985245842502074, + "grad_norm": 2.3784670369508625, + "learning_rate": 1.0698012508192156e-07, + "loss": 1.0298, + "step": 12454 + }, + { + "epoch": 0.898596731719635, + "grad_norm": 2.7857675617139974, + "learning_rate": 1.0682937159888128e-07, + "loss": 1.031, + "step": 12455 + }, + { + "epoch": 0.8986688791890625, + "grad_norm": 4.290498642474348, + "learning_rate": 1.0667872149488122e-07, + "loss": 0.9382, + "step": 12456 + }, + { + "epoch": 0.89874102665849, + "grad_norm": 2.1698157338605255, + "learning_rate": 1.0652817477814657e-07, + "loss": 0.9576, + "step": 12457 + }, + { + "epoch": 0.8988131741279175, + "grad_norm": 0.786153094107067, + "learning_rate": 1.0637773145689944e-07, + "loss": 0.8125, + "step": 12458 + }, + { + "epoch": 0.898885321597345, + "grad_norm": 2.5372182868210706, + "learning_rate": 1.0622739153935456e-07, + "loss": 0.9897, + "step": 12459 + }, + { + "epoch": 0.8989574690667724, + "grad_norm": 1.8161553322365236, + "learning_rate": 1.0607715503372095e-07, + "loss": 0.9423, + "step": 12460 + }, + { + "epoch": 0.8990296165362, + "grad_norm": 1.9309550458237783, + "learning_rate": 1.0592702194820291e-07, + "loss": 0.9811, + "step": 12461 + }, + { + "epoch": 0.8991017640056275, + "grad_norm": 5.3379502975305275, + "learning_rate": 1.0577699229099901e-07, + "loss": 0.9343, + "step": 12462 + }, + { + "epoch": 0.899173911475055, + "grad_norm": 4.069867539933276, + "learning_rate": 1.056270660703007e-07, + "loss": 0.9407, + "step": 12463 + }, + { + "epoch": 0.8992460589444825, + "grad_norm": 2.4311409187852906, + "learning_rate": 1.0547724329429586e-07, + "loss": 0.9242, + "step": 12464 + }, + { + "epoch": 0.89931820641391, + "grad_norm": 2.8662658374214076, + "learning_rate": 1.0532752397116551e-07, + "loss": 0.8815, + "step": 12465 + }, + { + "epoch": 0.8993903538833375, + "grad_norm": 3.144485302701969, + "learning_rate": 1.0517790810908555e-07, + "loss": 1.0606, + "step": 12466 + }, + { + "epoch": 0.8994625013527651, + "grad_norm": 0.6275216967794582, + "learning_rate": 1.0502839571622524e-07, + "loss": 0.7828, + "step": 12467 + }, + { + "epoch": 0.8995346488221926, + "grad_norm": 2.4879467318618933, + "learning_rate": 1.0487898680074913e-07, + "loss": 0.9291, + "step": 12468 + }, + { + "epoch": 0.8996067962916201, + "grad_norm": 6.185337189570318, + "learning_rate": 1.047296813708165e-07, + "loss": 1.0039, + "step": 12469 + }, + { + "epoch": 0.8996789437610476, + "grad_norm": 3.0167367101585065, + "learning_rate": 1.0458047943457971e-07, + "loss": 0.9443, + "step": 12470 + }, + { + "epoch": 0.8997510912304751, + "grad_norm": 2.5197088607758853, + "learning_rate": 1.0443138100018622e-07, + "loss": 0.8432, + "step": 12471 + }, + { + "epoch": 0.8998232386999026, + "grad_norm": 3.0526259702641356, + "learning_rate": 1.04282386075778e-07, + "loss": 1.0276, + "step": 12472 + }, + { + "epoch": 0.8998953861693301, + "grad_norm": 4.77032039278079, + "learning_rate": 1.0413349466949139e-07, + "loss": 0.8844, + "step": 12473 + }, + { + "epoch": 0.8999675336387576, + "grad_norm": 2.1886518443094976, + "learning_rate": 1.0398470678945593e-07, + "loss": 1.0211, + "step": 12474 + }, + { + "epoch": 0.9000396811081851, + "grad_norm": 3.668473246597082, + "learning_rate": 1.038360224437964e-07, + "loss": 0.9953, + "step": 12475 + }, + { + "epoch": 0.9001118285776126, + "grad_norm": 2.3959891779978397, + "learning_rate": 1.0368744164063303e-07, + "loss": 0.8878, + "step": 12476 + }, + { + "epoch": 0.9001839760470401, + "grad_norm": 10.777133852500256, + "learning_rate": 1.0353896438807842e-07, + "loss": 0.9568, + "step": 12477 + }, + { + "epoch": 0.9002561235164677, + "grad_norm": 2.166432869352496, + "learning_rate": 1.0339059069424027e-07, + "loss": 0.931, + "step": 12478 + }, + { + "epoch": 0.9003282709858952, + "grad_norm": 14.386216009068205, + "learning_rate": 1.0324232056722127e-07, + "loss": 0.8481, + "step": 12479 + }, + { + "epoch": 0.9004004184553227, + "grad_norm": 1.8351093778312306, + "learning_rate": 1.0309415401511757e-07, + "loss": 0.9478, + "step": 12480 + }, + { + "epoch": 0.9004725659247502, + "grad_norm": 2.4558753856333024, + "learning_rate": 1.0294609104601937e-07, + "loss": 0.9495, + "step": 12481 + }, + { + "epoch": 0.9005447133941777, + "grad_norm": 11.033778260202283, + "learning_rate": 1.0279813166801288e-07, + "loss": 0.9209, + "step": 12482 + }, + { + "epoch": 0.9006168608636053, + "grad_norm": 2.362294897632412, + "learning_rate": 1.0265027588917763e-07, + "loss": 0.937, + "step": 12483 + }, + { + "epoch": 0.9006890083330327, + "grad_norm": 2.07854990809915, + "learning_rate": 1.0250252371758672e-07, + "loss": 0.9349, + "step": 12484 + }, + { + "epoch": 0.9007611558024602, + "grad_norm": 2.320823947013169, + "learning_rate": 1.0235487516130859e-07, + "loss": 0.8801, + "step": 12485 + }, + { + "epoch": 0.9008333032718877, + "grad_norm": 3.5390901206582903, + "learning_rate": 1.0220733022840633e-07, + "loss": 0.8506, + "step": 12486 + }, + { + "epoch": 0.9009054507413152, + "grad_norm": 2.9007989769354654, + "learning_rate": 1.0205988892693573e-07, + "loss": 0.9369, + "step": 12487 + }, + { + "epoch": 0.9009775982107427, + "grad_norm": 2.3521531082584612, + "learning_rate": 1.0191255126494901e-07, + "loss": 0.9333, + "step": 12488 + }, + { + "epoch": 0.9010497456801703, + "grad_norm": 2.650653823231693, + "learning_rate": 1.0176531725049132e-07, + "loss": 0.9326, + "step": 12489 + }, + { + "epoch": 0.9011218931495978, + "grad_norm": 2.5745711367601642, + "learning_rate": 1.0161818689160285e-07, + "loss": 0.8636, + "step": 12490 + }, + { + "epoch": 0.9011940406190253, + "grad_norm": 7.038727848679987, + "learning_rate": 1.014711601963174e-07, + "loss": 0.951, + "step": 12491 + }, + { + "epoch": 0.9012661880884528, + "grad_norm": 3.6268555726144505, + "learning_rate": 1.013242371726637e-07, + "loss": 0.9332, + "step": 12492 + }, + { + "epoch": 0.9013383355578803, + "grad_norm": 2.969677428893119, + "learning_rate": 1.0117741782866506e-07, + "loss": 0.9696, + "step": 12493 + }, + { + "epoch": 0.9014104830273079, + "grad_norm": 2.4176335757256164, + "learning_rate": 1.0103070217233823e-07, + "loss": 0.9274, + "step": 12494 + }, + { + "epoch": 0.9014826304967354, + "grad_norm": 2.2607951279573526, + "learning_rate": 1.0088409021169497e-07, + "loss": 0.8795, + "step": 12495 + }, + { + "epoch": 0.9015547779661628, + "grad_norm": 3.6587958968150684, + "learning_rate": 1.0073758195474158e-07, + "loss": 0.9094, + "step": 12496 + }, + { + "epoch": 0.9016269254355903, + "grad_norm": 1.9763996524596628, + "learning_rate": 1.0059117740947764e-07, + "loss": 0.9039, + "step": 12497 + }, + { + "epoch": 0.9016990729050178, + "grad_norm": 1.767869585992528, + "learning_rate": 1.0044487658389833e-07, + "loss": 0.9218, + "step": 12498 + }, + { + "epoch": 0.9017712203744453, + "grad_norm": 2.32808618705194, + "learning_rate": 1.0029867948599191e-07, + "loss": 0.9187, + "step": 12499 + }, + { + "epoch": 0.9018433678438729, + "grad_norm": 4.130086363843102, + "learning_rate": 1.0015258612374289e-07, + "loss": 0.8728, + "step": 12500 + }, + { + "epoch": 0.9019155153133004, + "grad_norm": 2.3044807987948523, + "learning_rate": 1.0000659650512777e-07, + "loss": 0.7996, + "step": 12501 + }, + { + "epoch": 0.9019876627827279, + "grad_norm": 2.5035783907146523, + "learning_rate": 9.986071063811885e-08, + "loss": 1.0216, + "step": 12502 + }, + { + "epoch": 0.9020598102521554, + "grad_norm": 2.9914749133642418, + "learning_rate": 9.971492853068264e-08, + "loss": 0.9821, + "step": 12503 + }, + { + "epoch": 0.9021319577215829, + "grad_norm": 0.8819926345521981, + "learning_rate": 9.956925019077944e-08, + "loss": 0.8562, + "step": 12504 + }, + { + "epoch": 0.9022041051910105, + "grad_norm": 3.300670590323305, + "learning_rate": 9.9423675626364e-08, + "loss": 1.0454, + "step": 12505 + }, + { + "epoch": 0.902276252660438, + "grad_norm": 1.955353160189699, + "learning_rate": 9.927820484538663e-08, + "loss": 0.909, + "step": 12506 + }, + { + "epoch": 0.9023484001298654, + "grad_norm": 2.481704236017979, + "learning_rate": 9.913283785578985e-08, + "loss": 0.9175, + "step": 12507 + }, + { + "epoch": 0.9024205475992929, + "grad_norm": 3.2269303156064257, + "learning_rate": 9.8987574665512e-08, + "loss": 0.8101, + "step": 12508 + }, + { + "epoch": 0.9024926950687204, + "grad_norm": 3.74103262328745, + "learning_rate": 9.884241528248538e-08, + "loss": 0.8563, + "step": 12509 + }, + { + "epoch": 0.902564842538148, + "grad_norm": 2.07285046751302, + "learning_rate": 9.869735971463699e-08, + "loss": 0.9205, + "step": 12510 + }, + { + "epoch": 0.9026369900075755, + "grad_norm": 3.7176559505832083, + "learning_rate": 9.855240796988695e-08, + "loss": 0.9212, + "step": 12511 + }, + { + "epoch": 0.902709137477003, + "grad_norm": 3.049144193308095, + "learning_rate": 9.840756005615115e-08, + "loss": 0.8525, + "step": 12512 + }, + { + "epoch": 0.9027812849464305, + "grad_norm": 2.018198071312032, + "learning_rate": 9.826281598133923e-08, + "loss": 0.9319, + "step": 12513 + }, + { + "epoch": 0.902853432415858, + "grad_norm": 2.633558761830493, + "learning_rate": 9.811817575335468e-08, + "loss": 0.8708, + "step": 12514 + }, + { + "epoch": 0.9029255798852855, + "grad_norm": 2.2450084419346648, + "learning_rate": 9.797363938009606e-08, + "loss": 0.9114, + "step": 12515 + }, + { + "epoch": 0.9029977273547131, + "grad_norm": 2.620784735620735, + "learning_rate": 9.782920686945572e-08, + "loss": 1.0436, + "step": 12516 + }, + { + "epoch": 0.9030698748241406, + "grad_norm": 2.6287796420803726, + "learning_rate": 9.768487822932137e-08, + "loss": 0.7324, + "step": 12517 + }, + { + "epoch": 0.9031420222935681, + "grad_norm": 2.598301215161921, + "learning_rate": 9.754065346757334e-08, + "loss": 0.9368, + "step": 12518 + }, + { + "epoch": 0.9032141697629955, + "grad_norm": 2.278495798166941, + "learning_rate": 9.739653259208758e-08, + "loss": 0.9384, + "step": 12519 + }, + { + "epoch": 0.903286317232423, + "grad_norm": 3.160688395544241, + "learning_rate": 9.725251561073423e-08, + "loss": 0.8738, + "step": 12520 + }, + { + "epoch": 0.9033584647018505, + "grad_norm": 3.988617676417223, + "learning_rate": 9.710860253137699e-08, + "loss": 0.8803, + "step": 12521 + }, + { + "epoch": 0.9034306121712781, + "grad_norm": 1.2802780931098607, + "learning_rate": 9.696479336187468e-08, + "loss": 0.8918, + "step": 12522 + }, + { + "epoch": 0.9035027596407056, + "grad_norm": 2.5291962946036755, + "learning_rate": 9.68210881100806e-08, + "loss": 0.8708, + "step": 12523 + }, + { + "epoch": 0.9035749071101331, + "grad_norm": 2.2111849696903074, + "learning_rate": 9.667748678384092e-08, + "loss": 0.9436, + "step": 12524 + }, + { + "epoch": 0.9036470545795606, + "grad_norm": 2.377070177216249, + "learning_rate": 9.653398939099821e-08, + "loss": 0.8753, + "step": 12525 + }, + { + "epoch": 0.9037192020489881, + "grad_norm": 3.0030252779501083, + "learning_rate": 9.639059593938781e-08, + "loss": 0.9262, + "step": 12526 + }, + { + "epoch": 0.9037913495184157, + "grad_norm": 3.782044382433046, + "learning_rate": 9.624730643684031e-08, + "loss": 0.9329, + "step": 12527 + }, + { + "epoch": 0.9038634969878432, + "grad_norm": 3.669997234291469, + "learning_rate": 9.610412089117969e-08, + "loss": 0.8057, + "step": 12528 + }, + { + "epoch": 0.9039356444572707, + "grad_norm": 0.8155376431391834, + "learning_rate": 9.596103931022504e-08, + "loss": 0.8276, + "step": 12529 + }, + { + "epoch": 0.9040077919266982, + "grad_norm": 3.8227922937324896, + "learning_rate": 9.581806170178985e-08, + "loss": 0.9573, + "step": 12530 + }, + { + "epoch": 0.9040799393961256, + "grad_norm": 0.675647325420372, + "learning_rate": 9.567518807368102e-08, + "loss": 0.7664, + "step": 12531 + }, + { + "epoch": 0.9041520868655532, + "grad_norm": 3.741382870747454, + "learning_rate": 9.553241843370053e-08, + "loss": 0.8757, + "step": 12532 + }, + { + "epoch": 0.9042242343349807, + "grad_norm": 1.9849332489782043, + "learning_rate": 9.538975278964456e-08, + "loss": 0.8396, + "step": 12533 + }, + { + "epoch": 0.9042963818044082, + "grad_norm": 2.236600181322927, + "learning_rate": 9.524719114930402e-08, + "loss": 0.8924, + "step": 12534 + }, + { + "epoch": 0.9043685292738357, + "grad_norm": 2.1847162244870306, + "learning_rate": 9.510473352046244e-08, + "loss": 0.887, + "step": 12535 + }, + { + "epoch": 0.9044406767432632, + "grad_norm": 3.0684092594707697, + "learning_rate": 9.496237991090006e-08, + "loss": 0.9801, + "step": 12536 + }, + { + "epoch": 0.9045128242126907, + "grad_norm": 2.061172751881352, + "learning_rate": 9.482013032839021e-08, + "loss": 0.8294, + "step": 12537 + }, + { + "epoch": 0.9045849716821183, + "grad_norm": 2.6990256564774873, + "learning_rate": 9.467798478069999e-08, + "loss": 0.8443, + "step": 12538 + }, + { + "epoch": 0.9046571191515458, + "grad_norm": 17.033682273281645, + "learning_rate": 9.45359432755919e-08, + "loss": 0.9511, + "step": 12539 + }, + { + "epoch": 0.9047292666209733, + "grad_norm": 5.894869380730089, + "learning_rate": 9.439400582082236e-08, + "loss": 0.8119, + "step": 12540 + }, + { + "epoch": 0.9048014140904008, + "grad_norm": 1.9981145912546021, + "learning_rate": 9.425217242414119e-08, + "loss": 0.8863, + "step": 12541 + }, + { + "epoch": 0.9048735615598283, + "grad_norm": 2.9732930006476312, + "learning_rate": 9.411044309329441e-08, + "loss": 0.8633, + "step": 12542 + }, + { + "epoch": 0.9049457090292558, + "grad_norm": 3.121746964067718, + "learning_rate": 9.396881783602095e-08, + "loss": 0.9147, + "step": 12543 + }, + { + "epoch": 0.9050178564986833, + "grad_norm": 2.417771339061211, + "learning_rate": 9.382729666005484e-08, + "loss": 0.9864, + "step": 12544 + }, + { + "epoch": 0.9050900039681108, + "grad_norm": 6.237958064871212, + "learning_rate": 9.368587957312324e-08, + "loss": 0.8794, + "step": 12545 + }, + { + "epoch": 0.9051621514375383, + "grad_norm": 2.17457533450488, + "learning_rate": 9.354456658294885e-08, + "loss": 1.0352, + "step": 12546 + }, + { + "epoch": 0.9052342989069658, + "grad_norm": 3.7526779785238764, + "learning_rate": 9.340335769724861e-08, + "loss": 0.9297, + "step": 12547 + }, + { + "epoch": 0.9053064463763933, + "grad_norm": 0.75981081163828, + "learning_rate": 9.326225292373236e-08, + "loss": 0.7762, + "step": 12548 + }, + { + "epoch": 0.9053785938458209, + "grad_norm": 2.3402912304268653, + "learning_rate": 9.312125227010615e-08, + "loss": 0.8837, + "step": 12549 + }, + { + "epoch": 0.9054507413152484, + "grad_norm": 2.087171553853407, + "learning_rate": 9.298035574406981e-08, + "loss": 0.9383, + "step": 12550 + }, + { + "epoch": 0.9055228887846759, + "grad_norm": 1.9992054910879755, + "learning_rate": 9.283956335331633e-08, + "loss": 0.8686, + "step": 12551 + }, + { + "epoch": 0.9055950362541034, + "grad_norm": 2.1839039263221993, + "learning_rate": 9.26988751055342e-08, + "loss": 0.9228, + "step": 12552 + }, + { + "epoch": 0.9056671837235309, + "grad_norm": 1.8855210445729818, + "learning_rate": 9.255829100840573e-08, + "loss": 0.9113, + "step": 12553 + }, + { + "epoch": 0.9057393311929584, + "grad_norm": 1.6130333957135716, + "learning_rate": 9.241781106960834e-08, + "loss": 0.9265, + "step": 12554 + }, + { + "epoch": 0.9058114786623859, + "grad_norm": 3.061217993385532, + "learning_rate": 9.227743529681254e-08, + "loss": 0.9002, + "step": 12555 + }, + { + "epoch": 0.9058836261318134, + "grad_norm": 2.047007316214751, + "learning_rate": 9.213716369768354e-08, + "loss": 0.9115, + "step": 12556 + }, + { + "epoch": 0.9059557736012409, + "grad_norm": 2.0921001481615895, + "learning_rate": 9.199699627988189e-08, + "loss": 0.9406, + "step": 12557 + }, + { + "epoch": 0.9060279210706684, + "grad_norm": 2.6296140899656733, + "learning_rate": 9.18569330510608e-08, + "loss": 0.9691, + "step": 12558 + }, + { + "epoch": 0.906100068540096, + "grad_norm": 0.654064831936867, + "learning_rate": 9.171697401886857e-08, + "loss": 0.7684, + "step": 12559 + }, + { + "epoch": 0.9061722160095235, + "grad_norm": 2.014873749064792, + "learning_rate": 9.157711919094846e-08, + "loss": 0.8021, + "step": 12560 + }, + { + "epoch": 0.906244363478951, + "grad_norm": 7.316251656385293, + "learning_rate": 9.143736857493744e-08, + "loss": 0.8948, + "step": 12561 + }, + { + "epoch": 0.9063165109483785, + "grad_norm": 0.6971166756160125, + "learning_rate": 9.12977221784661e-08, + "loss": 0.7569, + "step": 12562 + }, + { + "epoch": 0.906388658417806, + "grad_norm": 3.0812020195558327, + "learning_rate": 9.115818000916031e-08, + "loss": 1.0343, + "step": 12563 + }, + { + "epoch": 0.9064608058872335, + "grad_norm": 2.8984970539072124, + "learning_rate": 9.101874207464044e-08, + "loss": 0.9034, + "step": 12564 + }, + { + "epoch": 0.9065329533566611, + "grad_norm": 2.2601743178530542, + "learning_rate": 9.087940838251974e-08, + "loss": 0.9322, + "step": 12565 + }, + { + "epoch": 0.9066051008260885, + "grad_norm": 2.986563633975127, + "learning_rate": 9.074017894040743e-08, + "loss": 0.8448, + "step": 12566 + }, + { + "epoch": 0.906677248295516, + "grad_norm": 2.438750508678389, + "learning_rate": 9.060105375590632e-08, + "loss": 0.9091, + "step": 12567 + }, + { + "epoch": 0.9067493957649435, + "grad_norm": 3.2093644570063256, + "learning_rate": 9.046203283661324e-08, + "loss": 0.8886, + "step": 12568 + }, + { + "epoch": 0.906821543234371, + "grad_norm": 2.08173590134425, + "learning_rate": 9.032311619011945e-08, + "loss": 0.9049, + "step": 12569 + }, + { + "epoch": 0.9068936907037985, + "grad_norm": 1.7904523756292325, + "learning_rate": 9.01843038240111e-08, + "loss": 0.897, + "step": 12570 + }, + { + "epoch": 0.9069658381732261, + "grad_norm": 4.448770528961946, + "learning_rate": 9.004559574586812e-08, + "loss": 0.887, + "step": 12571 + }, + { + "epoch": 0.9070379856426536, + "grad_norm": 2.0696914605078747, + "learning_rate": 8.990699196326423e-08, + "loss": 0.895, + "step": 12572 + }, + { + "epoch": 0.9071101331120811, + "grad_norm": 2.6067881296189315, + "learning_rate": 8.976849248376894e-08, + "loss": 0.7924, + "step": 12573 + }, + { + "epoch": 0.9071822805815086, + "grad_norm": 2.276134032260073, + "learning_rate": 8.963009731494508e-08, + "loss": 0.8012, + "step": 12574 + }, + { + "epoch": 0.9072544280509361, + "grad_norm": 2.218024014183872, + "learning_rate": 8.949180646434929e-08, + "loss": 0.8979, + "step": 12575 + }, + { + "epoch": 0.9073265755203637, + "grad_norm": 1.9063160368940748, + "learning_rate": 8.935361993953372e-08, + "loss": 0.9644, + "step": 12576 + }, + { + "epoch": 0.9073987229897912, + "grad_norm": 2.2718029110475864, + "learning_rate": 8.921553774804347e-08, + "loss": 0.9154, + "step": 12577 + }, + { + "epoch": 0.9074708704592186, + "grad_norm": 2.547724482685059, + "learning_rate": 8.907755989742005e-08, + "loss": 0.9423, + "step": 12578 + }, + { + "epoch": 0.9075430179286461, + "grad_norm": 3.474372362498507, + "learning_rate": 8.893968639519656e-08, + "loss": 0.9404, + "step": 12579 + }, + { + "epoch": 0.9076151653980736, + "grad_norm": 3.3004842229522975, + "learning_rate": 8.880191724890229e-08, + "loss": 0.8538, + "step": 12580 + }, + { + "epoch": 0.9076873128675012, + "grad_norm": 3.601747454281917, + "learning_rate": 8.86642524660608e-08, + "loss": 0.9081, + "step": 12581 + }, + { + "epoch": 0.9077594603369287, + "grad_norm": 2.2958219021364323, + "learning_rate": 8.85266920541885e-08, + "loss": 0.8916, + "step": 12582 + }, + { + "epoch": 0.9078316078063562, + "grad_norm": 3.4469752721649316, + "learning_rate": 8.838923602079739e-08, + "loss": 0.9281, + "step": 12583 + }, + { + "epoch": 0.9079037552757837, + "grad_norm": 3.416271900228239, + "learning_rate": 8.825188437339393e-08, + "loss": 0.8791, + "step": 12584 + }, + { + "epoch": 0.9079759027452112, + "grad_norm": 3.786838027967533, + "learning_rate": 8.811463711947786e-08, + "loss": 0.9239, + "step": 12585 + }, + { + "epoch": 0.9080480502146387, + "grad_norm": 2.4381499768786616, + "learning_rate": 8.797749426654388e-08, + "loss": 0.9189, + "step": 12586 + }, + { + "epoch": 0.9081201976840663, + "grad_norm": 1.8362189543860654, + "learning_rate": 8.784045582208089e-08, + "loss": 0.9242, + "step": 12587 + }, + { + "epoch": 0.9081923451534938, + "grad_norm": 1.9375349472339667, + "learning_rate": 8.770352179357221e-08, + "loss": 0.8267, + "step": 12588 + }, + { + "epoch": 0.9082644926229213, + "grad_norm": 2.678947680251137, + "learning_rate": 8.756669218849456e-08, + "loss": 1.0015, + "step": 12589 + }, + { + "epoch": 0.9083366400923487, + "grad_norm": 1.9687612018569276, + "learning_rate": 8.742996701432082e-08, + "loss": 0.9039, + "step": 12590 + }, + { + "epoch": 0.9084087875617762, + "grad_norm": 2.0443460697656617, + "learning_rate": 8.729334627851637e-08, + "loss": 0.8527, + "step": 12591 + }, + { + "epoch": 0.9084809350312038, + "grad_norm": 3.0423884297510253, + "learning_rate": 8.715682998854168e-08, + "loss": 0.7787, + "step": 12592 + }, + { + "epoch": 0.9085530825006313, + "grad_norm": 2.251367365411093, + "learning_rate": 8.702041815185124e-08, + "loss": 0.9921, + "step": 12593 + }, + { + "epoch": 0.9086252299700588, + "grad_norm": 2.3734038159087807, + "learning_rate": 8.688411077589463e-08, + "loss": 1.0164, + "step": 12594 + }, + { + "epoch": 0.9086973774394863, + "grad_norm": 3.7520075192894655, + "learning_rate": 8.674790786811414e-08, + "loss": 0.9716, + "step": 12595 + }, + { + "epoch": 0.9087695249089138, + "grad_norm": 2.7105195440598595, + "learning_rate": 8.661180943594759e-08, + "loss": 0.9055, + "step": 12596 + }, + { + "epoch": 0.9088416723783413, + "grad_norm": 2.2483775329015683, + "learning_rate": 8.647581548682748e-08, + "loss": 1.0407, + "step": 12597 + }, + { + "epoch": 0.9089138198477689, + "grad_norm": 4.470644462603881, + "learning_rate": 8.633992602817941e-08, + "loss": 0.8662, + "step": 12598 + }, + { + "epoch": 0.9089859673171964, + "grad_norm": 2.0328814483842423, + "learning_rate": 8.620414106742368e-08, + "loss": 0.9027, + "step": 12599 + }, + { + "epoch": 0.9090581147866239, + "grad_norm": 2.1711859399001545, + "learning_rate": 8.606846061197526e-08, + "loss": 0.9205, + "step": 12600 + }, + { + "epoch": 0.9091302622560513, + "grad_norm": 2.0510242136581924, + "learning_rate": 8.593288466924331e-08, + "loss": 1.1338, + "step": 12601 + }, + { + "epoch": 0.9092024097254788, + "grad_norm": 2.342234617526613, + "learning_rate": 8.579741324663037e-08, + "loss": 0.9918, + "step": 12602 + }, + { + "epoch": 0.9092745571949064, + "grad_norm": 2.424781581048032, + "learning_rate": 8.566204635153474e-08, + "loss": 0.9552, + "step": 12603 + }, + { + "epoch": 0.9093467046643339, + "grad_norm": 2.6822827286555926, + "learning_rate": 8.552678399134827e-08, + "loss": 0.8019, + "step": 12604 + }, + { + "epoch": 0.9094188521337614, + "grad_norm": 2.3699844002512087, + "learning_rate": 8.539162617345708e-08, + "loss": 0.9144, + "step": 12605 + }, + { + "epoch": 0.9094909996031889, + "grad_norm": 3.8654107838564973, + "learning_rate": 8.525657290524146e-08, + "loss": 0.8, + "step": 12606 + }, + { + "epoch": 0.9095631470726164, + "grad_norm": 2.3612437142429323, + "learning_rate": 8.512162419407599e-08, + "loss": 1.0218, + "step": 12607 + }, + { + "epoch": 0.909635294542044, + "grad_norm": 0.639185963948214, + "learning_rate": 8.498678004733028e-08, + "loss": 0.7943, + "step": 12608 + }, + { + "epoch": 0.9097074420114715, + "grad_norm": 3.2715234372667936, + "learning_rate": 8.485204047236738e-08, + "loss": 0.7879, + "step": 12609 + }, + { + "epoch": 0.909779589480899, + "grad_norm": 2.228190472725651, + "learning_rate": 8.471740547654471e-08, + "loss": 0.9516, + "step": 12610 + }, + { + "epoch": 0.9098517369503265, + "grad_norm": 1.9318706811309294, + "learning_rate": 8.458287506721463e-08, + "loss": 0.945, + "step": 12611 + }, + { + "epoch": 0.909923884419754, + "grad_norm": 6.7043637996098875, + "learning_rate": 8.4448449251723e-08, + "loss": 0.8754, + "step": 12612 + }, + { + "epoch": 0.9099960318891814, + "grad_norm": 2.9527691016115094, + "learning_rate": 8.431412803741023e-08, + "loss": 0.9736, + "step": 12613 + }, + { + "epoch": 0.910068179358609, + "grad_norm": 2.2176704466811583, + "learning_rate": 8.417991143161151e-08, + "loss": 0.7715, + "step": 12614 + }, + { + "epoch": 0.9101403268280365, + "grad_norm": 2.9092572148426923, + "learning_rate": 8.40457994416559e-08, + "loss": 0.9068, + "step": 12615 + }, + { + "epoch": 0.910212474297464, + "grad_norm": 2.0815973169621764, + "learning_rate": 8.39117920748662e-08, + "loss": 0.9403, + "step": 12616 + }, + { + "epoch": 0.9102846217668915, + "grad_norm": 2.5372278715892636, + "learning_rate": 8.377788933856056e-08, + "loss": 0.8836, + "step": 12617 + }, + { + "epoch": 0.910356769236319, + "grad_norm": 2.7129952923417155, + "learning_rate": 8.36440912400509e-08, + "loss": 0.8764, + "step": 12618 + }, + { + "epoch": 0.9104289167057465, + "grad_norm": 2.705769562748605, + "learning_rate": 8.351039778664315e-08, + "loss": 0.8645, + "step": 12619 + }, + { + "epoch": 0.9105010641751741, + "grad_norm": 0.7599460024850204, + "learning_rate": 8.337680898563792e-08, + "loss": 0.7947, + "step": 12620 + }, + { + "epoch": 0.9105732116446016, + "grad_norm": 2.835112985179411, + "learning_rate": 8.324332484433005e-08, + "loss": 0.9357, + "step": 12621 + }, + { + "epoch": 0.9106453591140291, + "grad_norm": 2.540322328675898, + "learning_rate": 8.310994537000881e-08, + "loss": 0.9862, + "step": 12622 + }, + { + "epoch": 0.9107175065834566, + "grad_norm": 1.908947583697383, + "learning_rate": 8.297667056995727e-08, + "loss": 0.8359, + "step": 12623 + }, + { + "epoch": 0.9107896540528841, + "grad_norm": 3.0789220717090626, + "learning_rate": 8.284350045145317e-08, + "loss": 0.912, + "step": 12624 + }, + { + "epoch": 0.9108618015223116, + "grad_norm": 2.3845286708523132, + "learning_rate": 8.271043502176866e-08, + "loss": 0.9217, + "step": 12625 + }, + { + "epoch": 0.9109339489917391, + "grad_norm": 3.4691619628414974, + "learning_rate": 8.25774742881693e-08, + "loss": 1.0087, + "step": 12626 + }, + { + "epoch": 0.9110060964611666, + "grad_norm": 0.7326434804723285, + "learning_rate": 8.244461825791616e-08, + "loss": 0.8346, + "step": 12627 + }, + { + "epoch": 0.9110782439305941, + "grad_norm": 2.4482038180703936, + "learning_rate": 8.231186693826408e-08, + "loss": 0.8929, + "step": 12628 + }, + { + "epoch": 0.9111503914000216, + "grad_norm": 2.638987558316025, + "learning_rate": 8.217922033646174e-08, + "loss": 0.9113, + "step": 12629 + }, + { + "epoch": 0.9112225388694492, + "grad_norm": 2.4657897584740684, + "learning_rate": 8.204667845975267e-08, + "loss": 0.8734, + "step": 12630 + }, + { + "epoch": 0.9112946863388767, + "grad_norm": 2.5005357168324522, + "learning_rate": 8.191424131537416e-08, + "loss": 0.9409, + "step": 12631 + }, + { + "epoch": 0.9113668338083042, + "grad_norm": 2.402810350710587, + "learning_rate": 8.178190891055914e-08, + "loss": 0.9578, + "step": 12632 + }, + { + "epoch": 0.9114389812777317, + "grad_norm": 3.9507031165029463, + "learning_rate": 8.164968125253247e-08, + "loss": 0.8534, + "step": 12633 + }, + { + "epoch": 0.9115111287471592, + "grad_norm": 2.6431725306464418, + "learning_rate": 8.151755834851548e-08, + "loss": 0.8886, + "step": 12634 + }, + { + "epoch": 0.9115832762165867, + "grad_norm": 2.6491463311842742, + "learning_rate": 8.138554020572286e-08, + "loss": 0.9067, + "step": 12635 + }, + { + "epoch": 0.9116554236860143, + "grad_norm": 3.2797346703279926, + "learning_rate": 8.125362683136305e-08, + "loss": 0.8943, + "step": 12636 + }, + { + "epoch": 0.9117275711554417, + "grad_norm": 2.848043365748421, + "learning_rate": 8.112181823263964e-08, + "loss": 0.8879, + "step": 12637 + }, + { + "epoch": 0.9117997186248692, + "grad_norm": 2.3899951179805674, + "learning_rate": 8.099011441675063e-08, + "loss": 1.0111, + "step": 12638 + }, + { + "epoch": 0.9118718660942967, + "grad_norm": 2.7013526354015713, + "learning_rate": 8.08585153908874e-08, + "loss": 0.8502, + "step": 12639 + }, + { + "epoch": 0.9119440135637242, + "grad_norm": 2.47373391723031, + "learning_rate": 8.072702116223618e-08, + "loss": 0.8924, + "step": 12640 + }, + { + "epoch": 0.9120161610331518, + "grad_norm": 3.7498443571216016, + "learning_rate": 8.059563173797746e-08, + "loss": 1.0475, + "step": 12641 + }, + { + "epoch": 0.9120883085025793, + "grad_norm": 2.549273993419336, + "learning_rate": 8.046434712528594e-08, + "loss": 0.9031, + "step": 12642 + }, + { + "epoch": 0.9121604559720068, + "grad_norm": 0.8598721193447514, + "learning_rate": 8.033316733133055e-08, + "loss": 0.8206, + "step": 12643 + }, + { + "epoch": 0.9122326034414343, + "grad_norm": 3.0914444472522513, + "learning_rate": 8.020209236327424e-08, + "loss": 0.9012, + "step": 12644 + }, + { + "epoch": 0.9123047509108618, + "grad_norm": 2.2082483767116217, + "learning_rate": 8.007112222827528e-08, + "loss": 0.9309, + "step": 12645 + }, + { + "epoch": 0.9123768983802893, + "grad_norm": 2.0961423273247903, + "learning_rate": 7.994025693348482e-08, + "loss": 1.0677, + "step": 12646 + }, + { + "epoch": 0.9124490458497169, + "grad_norm": 2.262118131330456, + "learning_rate": 7.980949648604896e-08, + "loss": 0.9461, + "step": 12647 + }, + { + "epoch": 0.9125211933191444, + "grad_norm": 39.13766518250035, + "learning_rate": 7.967884089310839e-08, + "loss": 0.8523, + "step": 12648 + }, + { + "epoch": 0.9125933407885718, + "grad_norm": 2.2758162071824937, + "learning_rate": 7.954829016179765e-08, + "loss": 0.91, + "step": 12649 + }, + { + "epoch": 0.9126654882579993, + "grad_norm": 3.143738678746855, + "learning_rate": 7.941784429924503e-08, + "loss": 0.7726, + "step": 12650 + }, + { + "epoch": 0.9127376357274268, + "grad_norm": 1.9487787334875928, + "learning_rate": 7.928750331257439e-08, + "loss": 0.9504, + "step": 12651 + }, + { + "epoch": 0.9128097831968544, + "grad_norm": 3.4350296854349507, + "learning_rate": 7.915726720890314e-08, + "loss": 0.8789, + "step": 12652 + }, + { + "epoch": 0.9128819306662819, + "grad_norm": 4.773462273616156, + "learning_rate": 7.902713599534272e-08, + "loss": 0.9116, + "step": 12653 + }, + { + "epoch": 0.9129540781357094, + "grad_norm": 1.9221498866549347, + "learning_rate": 7.88971096789992e-08, + "loss": 0.9553, + "step": 12654 + }, + { + "epoch": 0.9130262256051369, + "grad_norm": 1.799096368096834, + "learning_rate": 7.876718826697315e-08, + "loss": 0.9038, + "step": 12655 + }, + { + "epoch": 0.9130983730745644, + "grad_norm": 1.8394775674151504, + "learning_rate": 7.863737176635821e-08, + "loss": 0.8307, + "step": 12656 + }, + { + "epoch": 0.913170520543992, + "grad_norm": 2.9508493526195925, + "learning_rate": 7.850766018424382e-08, + "loss": 0.8815, + "step": 12657 + }, + { + "epoch": 0.9132426680134195, + "grad_norm": 1.983012174467957, + "learning_rate": 7.837805352771321e-08, + "loss": 0.9042, + "step": 12658 + }, + { + "epoch": 0.913314815482847, + "grad_norm": 3.568996729564157, + "learning_rate": 7.824855180384382e-08, + "loss": 0.8395, + "step": 12659 + }, + { + "epoch": 0.9133869629522744, + "grad_norm": 2.4021923978935997, + "learning_rate": 7.811915501970645e-08, + "loss": 0.9053, + "step": 12660 + }, + { + "epoch": 0.9134591104217019, + "grad_norm": 4.261378371704518, + "learning_rate": 7.798986318236744e-08, + "loss": 1.0202, + "step": 12661 + }, + { + "epoch": 0.9135312578911294, + "grad_norm": 1.8785677979748654, + "learning_rate": 7.786067629888716e-08, + "loss": 0.9052, + "step": 12662 + }, + { + "epoch": 0.913603405360557, + "grad_norm": 2.6669173619526756, + "learning_rate": 7.773159437631994e-08, + "loss": 0.8801, + "step": 12663 + }, + { + "epoch": 0.9136755528299845, + "grad_norm": 2.964664257783156, + "learning_rate": 7.760261742171414e-08, + "loss": 0.9295, + "step": 12664 + }, + { + "epoch": 0.913747700299412, + "grad_norm": 0.7840099412278079, + "learning_rate": 7.747374544211305e-08, + "loss": 0.8269, + "step": 12665 + }, + { + "epoch": 0.9138198477688395, + "grad_norm": 3.66319033525587, + "learning_rate": 7.73449784445539e-08, + "loss": 0.887, + "step": 12666 + }, + { + "epoch": 0.913891995238267, + "grad_norm": 1.8809649634903287, + "learning_rate": 7.721631643606774e-08, + "loss": 0.8936, + "step": 12667 + }, + { + "epoch": 0.9139641427076945, + "grad_norm": 2.3598191809485023, + "learning_rate": 7.708775942368051e-08, + "loss": 1.0074, + "step": 12668 + }, + { + "epoch": 0.9140362901771221, + "grad_norm": 2.945065272956628, + "learning_rate": 7.695930741441281e-08, + "loss": 0.9133, + "step": 12669 + }, + { + "epoch": 0.9141084376465496, + "grad_norm": 2.3065550398231123, + "learning_rate": 7.683096041527837e-08, + "loss": 0.8709, + "step": 12670 + }, + { + "epoch": 0.9141805851159771, + "grad_norm": 30.860260074332956, + "learning_rate": 7.67027184332858e-08, + "loss": 0.9413, + "step": 12671 + }, + { + "epoch": 0.9142527325854045, + "grad_norm": 4.074640297957509, + "learning_rate": 7.657458147543816e-08, + "loss": 0.89, + "step": 12672 + }, + { + "epoch": 0.914324880054832, + "grad_norm": 2.101781103400187, + "learning_rate": 7.644654954873209e-08, + "loss": 0.9428, + "step": 12673 + }, + { + "epoch": 0.9143970275242596, + "grad_norm": 0.6960796667316249, + "learning_rate": 7.631862266015887e-08, + "loss": 0.7838, + "step": 12674 + }, + { + "epoch": 0.9144691749936871, + "grad_norm": 2.3213341515590944, + "learning_rate": 7.619080081670492e-08, + "loss": 0.8839, + "step": 12675 + }, + { + "epoch": 0.9145413224631146, + "grad_norm": 2.326960329944675, + "learning_rate": 7.606308402534977e-08, + "loss": 0.918, + "step": 12676 + }, + { + "epoch": 0.9146134699325421, + "grad_norm": 2.8267770157573384, + "learning_rate": 7.593547229306718e-08, + "loss": 0.9138, + "step": 12677 + }, + { + "epoch": 0.9146856174019696, + "grad_norm": 1.9549640389766807, + "learning_rate": 7.580796562682579e-08, + "loss": 0.8815, + "step": 12678 + }, + { + "epoch": 0.9147577648713971, + "grad_norm": 2.584493366376851, + "learning_rate": 7.568056403358846e-08, + "loss": 0.9245, + "step": 12679 + }, + { + "epoch": 0.9148299123408247, + "grad_norm": 2.0830719084430642, + "learning_rate": 7.555326752031166e-08, + "loss": 0.9031, + "step": 12680 + }, + { + "epoch": 0.9149020598102522, + "grad_norm": 2.4156101043755145, + "learning_rate": 7.542607609394691e-08, + "loss": 0.8931, + "step": 12681 + }, + { + "epoch": 0.9149742072796797, + "grad_norm": 0.7168858444338156, + "learning_rate": 7.529898976143978e-08, + "loss": 0.8322, + "step": 12682 + }, + { + "epoch": 0.9150463547491072, + "grad_norm": 2.8462965745093176, + "learning_rate": 7.517200852972982e-08, + "loss": 0.8826, + "step": 12683 + }, + { + "epoch": 0.9151185022185346, + "grad_norm": 1.9094397950611417, + "learning_rate": 7.504513240575083e-08, + "loss": 0.9234, + "step": 12684 + }, + { + "epoch": 0.9151906496879622, + "grad_norm": 2.5970332495698507, + "learning_rate": 7.491836139643104e-08, + "loss": 0.9447, + "step": 12685 + }, + { + "epoch": 0.9152627971573897, + "grad_norm": 2.874334175299376, + "learning_rate": 7.47916955086938e-08, + "loss": 0.8605, + "step": 12686 + }, + { + "epoch": 0.9153349446268172, + "grad_norm": 3.6471167058531706, + "learning_rate": 7.466513474945468e-08, + "loss": 0.9451, + "step": 12687 + }, + { + "epoch": 0.9154070920962447, + "grad_norm": 3.596749820072071, + "learning_rate": 7.453867912562529e-08, + "loss": 0.9391, + "step": 12688 + }, + { + "epoch": 0.9154792395656722, + "grad_norm": 2.26826617754155, + "learning_rate": 7.441232864411117e-08, + "loss": 1.013, + "step": 12689 + }, + { + "epoch": 0.9155513870350998, + "grad_norm": 2.213951078961638, + "learning_rate": 7.428608331181107e-08, + "loss": 0.8927, + "step": 12690 + }, + { + "epoch": 0.9156235345045273, + "grad_norm": 5.552039608476052, + "learning_rate": 7.415994313561924e-08, + "loss": 0.9953, + "step": 12691 + }, + { + "epoch": 0.9156956819739548, + "grad_norm": 2.9142001122102696, + "learning_rate": 7.40339081224235e-08, + "loss": 0.9324, + "step": 12692 + }, + { + "epoch": 0.9157678294433823, + "grad_norm": 2.50159289158713, + "learning_rate": 7.390797827910699e-08, + "loss": 0.9016, + "step": 12693 + }, + { + "epoch": 0.9158399769128098, + "grad_norm": 2.689429544139088, + "learning_rate": 7.378215361254514e-08, + "loss": 0.9062, + "step": 12694 + }, + { + "epoch": 0.9159121243822373, + "grad_norm": 3.0223326722077006, + "learning_rate": 7.365643412960953e-08, + "loss": 0.9311, + "step": 12695 + }, + { + "epoch": 0.9159842718516648, + "grad_norm": 6.886022436259976, + "learning_rate": 7.353081983716491e-08, + "loss": 0.8695, + "step": 12696 + }, + { + "epoch": 0.9160564193210923, + "grad_norm": 2.497396257630525, + "learning_rate": 7.340531074207068e-08, + "loss": 0.974, + "step": 12697 + }, + { + "epoch": 0.9161285667905198, + "grad_norm": 2.466013876994272, + "learning_rate": 7.327990685118024e-08, + "loss": 0.97, + "step": 12698 + }, + { + "epoch": 0.9162007142599473, + "grad_norm": 2.04634716420777, + "learning_rate": 7.315460817134189e-08, + "loss": 1.0159, + "step": 12699 + }, + { + "epoch": 0.9162728617293748, + "grad_norm": 2.6721201109416066, + "learning_rate": 7.302941470939727e-08, + "loss": 0.909, + "step": 12700 + }, + { + "epoch": 0.9163450091988024, + "grad_norm": 2.696025809286172, + "learning_rate": 7.290432647218292e-08, + "loss": 1.0102, + "step": 12701 + }, + { + "epoch": 0.9164171566682299, + "grad_norm": 2.6401738791569422, + "learning_rate": 7.277934346652937e-08, + "loss": 0.8143, + "step": 12702 + }, + { + "epoch": 0.9164893041376574, + "grad_norm": 2.4768695821421276, + "learning_rate": 7.265446569926182e-08, + "loss": 0.9546, + "step": 12703 + }, + { + "epoch": 0.9165614516070849, + "grad_norm": 2.342767433200153, + "learning_rate": 7.252969317719859e-08, + "loss": 0.8235, + "step": 12704 + }, + { + "epoch": 0.9166335990765124, + "grad_norm": 2.7288205652902993, + "learning_rate": 7.240502590715403e-08, + "loss": 0.8382, + "step": 12705 + }, + { + "epoch": 0.91670574654594, + "grad_norm": 2.033795097427844, + "learning_rate": 7.228046389593534e-08, + "loss": 0.8711, + "step": 12706 + }, + { + "epoch": 0.9167778940153674, + "grad_norm": 2.144598509909786, + "learning_rate": 7.21560071503442e-08, + "loss": 0.8155, + "step": 12707 + }, + { + "epoch": 0.9168500414847949, + "grad_norm": 2.5653267132930764, + "learning_rate": 7.203165567717673e-08, + "loss": 0.8791, + "step": 12708 + }, + { + "epoch": 0.9169221889542224, + "grad_norm": 4.449423231829859, + "learning_rate": 7.190740948322327e-08, + "loss": 0.9692, + "step": 12709 + }, + { + "epoch": 0.9169943364236499, + "grad_norm": 0.7473918148203004, + "learning_rate": 7.178326857526906e-08, + "loss": 0.7871, + "step": 12710 + }, + { + "epoch": 0.9170664838930774, + "grad_norm": 2.0127668594755397, + "learning_rate": 7.165923296009246e-08, + "loss": 0.8662, + "step": 12711 + }, + { + "epoch": 0.917138631362505, + "grad_norm": 4.264660848569357, + "learning_rate": 7.15353026444665e-08, + "loss": 0.9489, + "step": 12712 + }, + { + "epoch": 0.9172107788319325, + "grad_norm": 2.829843241525305, + "learning_rate": 7.141147763515909e-08, + "loss": 0.9729, + "step": 12713 + }, + { + "epoch": 0.91728292630136, + "grad_norm": 2.384365088765955, + "learning_rate": 7.128775793893127e-08, + "loss": 0.8224, + "step": 12714 + }, + { + "epoch": 0.9173550737707875, + "grad_norm": 2.0499561212077086, + "learning_rate": 7.116414356253898e-08, + "loss": 0.9836, + "step": 12715 + }, + { + "epoch": 0.917427221240215, + "grad_norm": 2.4791416733280665, + "learning_rate": 7.104063451273279e-08, + "loss": 0.8834, + "step": 12716 + }, + { + "epoch": 0.9174993687096425, + "grad_norm": 5.008157179660163, + "learning_rate": 7.091723079625645e-08, + "loss": 0.9309, + "step": 12717 + }, + { + "epoch": 0.9175715161790701, + "grad_norm": 3.6331231753497133, + "learning_rate": 7.079393241984921e-08, + "loss": 1.0315, + "step": 12718 + }, + { + "epoch": 0.9176436636484975, + "grad_norm": 2.1322259899983202, + "learning_rate": 7.067073939024349e-08, + "loss": 1.0433, + "step": 12719 + }, + { + "epoch": 0.917715811117925, + "grad_norm": 3.782691995874993, + "learning_rate": 7.054765171416677e-08, + "loss": 0.9538, + "step": 12720 + }, + { + "epoch": 0.9177879585873525, + "grad_norm": 3.3577747880709308, + "learning_rate": 7.04246693983399e-08, + "loss": 0.9292, + "step": 12721 + }, + { + "epoch": 0.91786010605678, + "grad_norm": 0.7729930178486092, + "learning_rate": 7.030179244947887e-08, + "loss": 0.8251, + "step": 12722 + }, + { + "epoch": 0.9179322535262076, + "grad_norm": 5.13632346833436, + "learning_rate": 7.017902087429362e-08, + "loss": 1.0501, + "step": 12723 + }, + { + "epoch": 0.9180044009956351, + "grad_norm": 2.191912206764191, + "learning_rate": 7.00563546794879e-08, + "loss": 0.8949, + "step": 12724 + }, + { + "epoch": 0.9180765484650626, + "grad_norm": 1.6308333526179892, + "learning_rate": 6.993379387176035e-08, + "loss": 0.9319, + "step": 12725 + }, + { + "epoch": 0.9181486959344901, + "grad_norm": 2.7551648449605506, + "learning_rate": 6.98113384578034e-08, + "loss": 0.8762, + "step": 12726 + }, + { + "epoch": 0.9182208434039176, + "grad_norm": 2.11074682768037, + "learning_rate": 6.968898844430415e-08, + "loss": 0.8191, + "step": 12727 + }, + { + "epoch": 0.9182929908733451, + "grad_norm": 2.225623109983281, + "learning_rate": 6.956674383794304e-08, + "loss": 0.8772, + "step": 12728 + }, + { + "epoch": 0.9183651383427727, + "grad_norm": 2.579015774439375, + "learning_rate": 6.944460464539603e-08, + "loss": 0.8397, + "step": 12729 + }, + { + "epoch": 0.9184372858122002, + "grad_norm": 2.207293739575908, + "learning_rate": 6.93225708733327e-08, + "loss": 1.0013, + "step": 12730 + }, + { + "epoch": 0.9185094332816276, + "grad_norm": 2.19549755253196, + "learning_rate": 6.920064252841662e-08, + "loss": 0.9426, + "step": 12731 + }, + { + "epoch": 0.9185815807510551, + "grad_norm": 2.6269013920905158, + "learning_rate": 6.907881961730578e-08, + "loss": 0.9076, + "step": 12732 + }, + { + "epoch": 0.9186537282204826, + "grad_norm": 2.524244055566535, + "learning_rate": 6.895710214665262e-08, + "loss": 0.8312, + "step": 12733 + }, + { + "epoch": 0.9187258756899102, + "grad_norm": 2.6800978098998334, + "learning_rate": 6.883549012310341e-08, + "loss": 0.9876, + "step": 12734 + }, + { + "epoch": 0.9187980231593377, + "grad_norm": 0.6759949935961888, + "learning_rate": 6.871398355329927e-08, + "loss": 0.7496, + "step": 12735 + }, + { + "epoch": 0.9188701706287652, + "grad_norm": 0.7049116535158078, + "learning_rate": 6.859258244387511e-08, + "loss": 0.7984, + "step": 12736 + }, + { + "epoch": 0.9189423180981927, + "grad_norm": 3.433308629684707, + "learning_rate": 6.847128680146052e-08, + "loss": 0.9181, + "step": 12737 + }, + { + "epoch": 0.9190144655676202, + "grad_norm": 2.2104320150472287, + "learning_rate": 6.835009663267821e-08, + "loss": 0.9118, + "step": 12738 + }, + { + "epoch": 0.9190866130370478, + "grad_norm": 2.236568201496724, + "learning_rate": 6.822901194414665e-08, + "loss": 0.9592, + "step": 12739 + }, + { + "epoch": 0.9191587605064753, + "grad_norm": 0.700700989616918, + "learning_rate": 6.810803274247745e-08, + "loss": 0.838, + "step": 12740 + }, + { + "epoch": 0.9192309079759028, + "grad_norm": 2.2934713251530563, + "learning_rate": 6.79871590342771e-08, + "loss": 0.8845, + "step": 12741 + }, + { + "epoch": 0.9193030554453303, + "grad_norm": 2.436368239358309, + "learning_rate": 6.786639082614587e-08, + "loss": 1.0218, + "step": 12742 + }, + { + "epoch": 0.9193752029147577, + "grad_norm": 0.948061893502926, + "learning_rate": 6.774572812467872e-08, + "loss": 0.877, + "step": 12743 + }, + { + "epoch": 0.9194473503841852, + "grad_norm": 3.270202932331254, + "learning_rate": 6.762517093646413e-08, + "loss": 0.9485, + "step": 12744 + }, + { + "epoch": 0.9195194978536128, + "grad_norm": 4.091859803364557, + "learning_rate": 6.750471926808554e-08, + "loss": 0.912, + "step": 12745 + }, + { + "epoch": 0.9195916453230403, + "grad_norm": 2.481612679137865, + "learning_rate": 6.738437312612012e-08, + "loss": 0.98, + "step": 12746 + }, + { + "epoch": 0.9196637927924678, + "grad_norm": 3.152254654051874, + "learning_rate": 6.726413251714036e-08, + "loss": 0.9816, + "step": 12747 + }, + { + "epoch": 0.9197359402618953, + "grad_norm": 3.0225387144172973, + "learning_rate": 6.714399744771127e-08, + "loss": 0.936, + "step": 12748 + }, + { + "epoch": 0.9198080877313228, + "grad_norm": 2.1846671153326023, + "learning_rate": 6.702396792439336e-08, + "loss": 0.9064, + "step": 12749 + }, + { + "epoch": 0.9198802352007504, + "grad_norm": 6.92467488651529, + "learning_rate": 6.690404395374116e-08, + "loss": 0.9082, + "step": 12750 + }, + { + "epoch": 0.9199523826701779, + "grad_norm": 12.67491329848151, + "learning_rate": 6.678422554230278e-08, + "loss": 0.9339, + "step": 12751 + }, + { + "epoch": 0.9200245301396054, + "grad_norm": 0.7952133881392678, + "learning_rate": 6.666451269662122e-08, + "loss": 0.8612, + "step": 12752 + }, + { + "epoch": 0.9200966776090329, + "grad_norm": 4.539063970726418, + "learning_rate": 6.654490542323366e-08, + "loss": 0.9758, + "step": 12753 + }, + { + "epoch": 0.9201688250784604, + "grad_norm": 5.416596495345005, + "learning_rate": 6.642540372867179e-08, + "loss": 0.9584, + "step": 12754 + }, + { + "epoch": 0.9202409725478878, + "grad_norm": 1.917898390464161, + "learning_rate": 6.630600761946036e-08, + "loss": 1.0404, + "step": 12755 + }, + { + "epoch": 0.9203131200173154, + "grad_norm": 9.268120826773245, + "learning_rate": 6.618671710211976e-08, + "loss": 0.8369, + "step": 12756 + }, + { + "epoch": 0.9203852674867429, + "grad_norm": 2.31263113938761, + "learning_rate": 6.606753218316407e-08, + "loss": 0.8538, + "step": 12757 + }, + { + "epoch": 0.9204574149561704, + "grad_norm": 3.603250302520593, + "learning_rate": 6.594845286910078e-08, + "loss": 0.7994, + "step": 12758 + }, + { + "epoch": 0.9205295624255979, + "grad_norm": 3.8980293136676645, + "learning_rate": 6.582947916643289e-08, + "loss": 0.7872, + "step": 12759 + }, + { + "epoch": 0.9206017098950254, + "grad_norm": 2.612215243795854, + "learning_rate": 6.571061108165743e-08, + "loss": 0.8179, + "step": 12760 + }, + { + "epoch": 0.920673857364453, + "grad_norm": 3.379745255751447, + "learning_rate": 6.559184862126498e-08, + "loss": 0.995, + "step": 12761 + }, + { + "epoch": 0.9207460048338805, + "grad_norm": 2.6247451295190705, + "learning_rate": 6.54731917917406e-08, + "loss": 0.8029, + "step": 12762 + }, + { + "epoch": 0.920818152303308, + "grad_norm": 4.214756410558242, + "learning_rate": 6.535464059956352e-08, + "loss": 0.923, + "step": 12763 + }, + { + "epoch": 0.9208902997727355, + "grad_norm": 2.7321588745502505, + "learning_rate": 6.523619505120814e-08, + "loss": 0.9677, + "step": 12764 + }, + { + "epoch": 0.920962447242163, + "grad_norm": 2.815258538280209, + "learning_rate": 6.511785515314172e-08, + "loss": 0.8492, + "step": 12765 + }, + { + "epoch": 0.9210345947115904, + "grad_norm": 2.9762802190420077, + "learning_rate": 6.499962091182665e-08, + "loss": 0.9254, + "step": 12766 + }, + { + "epoch": 0.921106742181018, + "grad_norm": 5.088825302522649, + "learning_rate": 6.488149233371932e-08, + "loss": 0.9273, + "step": 12767 + }, + { + "epoch": 0.9211788896504455, + "grad_norm": 2.1194839691614153, + "learning_rate": 6.476346942526989e-08, + "loss": 0.9446, + "step": 12768 + }, + { + "epoch": 0.921251037119873, + "grad_norm": 12.590527294894228, + "learning_rate": 6.464555219292345e-08, + "loss": 0.9366, + "step": 12769 + }, + { + "epoch": 0.9213231845893005, + "grad_norm": 2.3251397367754536, + "learning_rate": 6.452774064311861e-08, + "loss": 0.9552, + "step": 12770 + }, + { + "epoch": 0.921395332058728, + "grad_norm": 3.115682673709631, + "learning_rate": 6.441003478228957e-08, + "loss": 0.915, + "step": 12771 + }, + { + "epoch": 0.9214674795281556, + "grad_norm": 2.3560473489002742, + "learning_rate": 6.429243461686296e-08, + "loss": 0.8865, + "step": 12772 + }, + { + "epoch": 0.9215396269975831, + "grad_norm": 2.61615179264869, + "learning_rate": 6.417494015326075e-08, + "loss": 0.9888, + "step": 12773 + }, + { + "epoch": 0.9216117744670106, + "grad_norm": 2.0946448392420796, + "learning_rate": 6.405755139789914e-08, + "loss": 0.9557, + "step": 12774 + }, + { + "epoch": 0.9216839219364381, + "grad_norm": 3.0680058091666917, + "learning_rate": 6.394026835718769e-08, + "loss": 0.9635, + "step": 12775 + }, + { + "epoch": 0.9217560694058656, + "grad_norm": 2.8183126782773877, + "learning_rate": 6.382309103753103e-08, + "loss": 0.9152, + "step": 12776 + }, + { + "epoch": 0.9218282168752931, + "grad_norm": 3.7870213624781313, + "learning_rate": 6.370601944532828e-08, + "loss": 0.8745, + "step": 12777 + }, + { + "epoch": 0.9219003643447206, + "grad_norm": 2.093937424199301, + "learning_rate": 6.358905358697165e-08, + "loss": 0.9362, + "step": 12778 + }, + { + "epoch": 0.9219725118141481, + "grad_norm": 2.3251910059415177, + "learning_rate": 6.347219346884847e-08, + "loss": 0.9283, + "step": 12779 + }, + { + "epoch": 0.9220446592835756, + "grad_norm": 2.5457942050189093, + "learning_rate": 6.335543909734009e-08, + "loss": 0.9098, + "step": 12780 + }, + { + "epoch": 0.9221168067530031, + "grad_norm": 2.5986296930763877, + "learning_rate": 6.323879047882208e-08, + "loss": 0.8833, + "step": 12781 + }, + { + "epoch": 0.9221889542224306, + "grad_norm": 2.464492990531403, + "learning_rate": 6.312224761966378e-08, + "loss": 0.9409, + "step": 12782 + }, + { + "epoch": 0.9222611016918582, + "grad_norm": 2.1236820621861505, + "learning_rate": 6.300581052622944e-08, + "loss": 0.8292, + "step": 12783 + }, + { + "epoch": 0.9223332491612857, + "grad_norm": 3.133434357603983, + "learning_rate": 6.288947920487775e-08, + "loss": 0.9649, + "step": 12784 + }, + { + "epoch": 0.9224053966307132, + "grad_norm": 4.071691122170289, + "learning_rate": 6.277325366196029e-08, + "loss": 0.9638, + "step": 12785 + }, + { + "epoch": 0.9224775441001407, + "grad_norm": 2.2914541203739622, + "learning_rate": 6.265713390382421e-08, + "loss": 0.9374, + "step": 12786 + }, + { + "epoch": 0.9225496915695682, + "grad_norm": 1.726614972866379, + "learning_rate": 6.254111993681044e-08, + "loss": 1.0079, + "step": 12787 + }, + { + "epoch": 0.9226218390389958, + "grad_norm": 2.324614778005032, + "learning_rate": 6.242521176725323e-08, + "loss": 0.9455, + "step": 12788 + }, + { + "epoch": 0.9226939865084233, + "grad_norm": 2.592781529530687, + "learning_rate": 6.23094094014831e-08, + "loss": 0.853, + "step": 12789 + }, + { + "epoch": 0.9227661339778507, + "grad_norm": 2.0032799766446625, + "learning_rate": 6.219371284582276e-08, + "loss": 0.9273, + "step": 12790 + }, + { + "epoch": 0.9228382814472782, + "grad_norm": 3.338704375509077, + "learning_rate": 6.207812210659069e-08, + "loss": 0.9929, + "step": 12791 + }, + { + "epoch": 0.9229104289167057, + "grad_norm": 2.177835751213837, + "learning_rate": 6.196263719009809e-08, + "loss": 0.9279, + "step": 12792 + }, + { + "epoch": 0.9229825763861332, + "grad_norm": 0.6936497040152855, + "learning_rate": 6.184725810265145e-08, + "loss": 0.7693, + "step": 12793 + }, + { + "epoch": 0.9230547238555608, + "grad_norm": 2.6691951585912967, + "learning_rate": 6.173198485055153e-08, + "loss": 0.8716, + "step": 12794 + }, + { + "epoch": 0.9231268713249883, + "grad_norm": 9.015504832929663, + "learning_rate": 6.161681744009239e-08, + "loss": 0.9166, + "step": 12795 + }, + { + "epoch": 0.9231990187944158, + "grad_norm": 1.7707264213451714, + "learning_rate": 6.150175587756323e-08, + "loss": 0.8787, + "step": 12796 + }, + { + "epoch": 0.9232711662638433, + "grad_norm": 2.4323279455012954, + "learning_rate": 6.138680016924725e-08, + "loss": 0.9089, + "step": 12797 + }, + { + "epoch": 0.9233433137332708, + "grad_norm": 3.6885909713441847, + "learning_rate": 6.127195032142163e-08, + "loss": 0.9509, + "step": 12798 + }, + { + "epoch": 0.9234154612026984, + "grad_norm": 2.866389108868072, + "learning_rate": 6.115720634035781e-08, + "loss": 0.8701, + "step": 12799 + }, + { + "epoch": 0.9234876086721259, + "grad_norm": 4.12323052572394, + "learning_rate": 6.104256823232124e-08, + "loss": 0.8517, + "step": 12800 + }, + { + "epoch": 0.9235597561415534, + "grad_norm": 12.684248183378433, + "learning_rate": 6.092803600357288e-08, + "loss": 0.9361, + "step": 12801 + }, + { + "epoch": 0.9236319036109808, + "grad_norm": 3.824243313610144, + "learning_rate": 6.081360966036597e-08, + "loss": 0.8797, + "step": 12802 + }, + { + "epoch": 0.9237040510804083, + "grad_norm": 2.9303485582315103, + "learning_rate": 6.069928920894929e-08, + "loss": 0.8634, + "step": 12803 + }, + { + "epoch": 0.9237761985498358, + "grad_norm": 2.463968789708034, + "learning_rate": 6.05850746555654e-08, + "loss": 0.8256, + "step": 12804 + }, + { + "epoch": 0.9238483460192634, + "grad_norm": 3.3107123679597743, + "learning_rate": 6.047096600645108e-08, + "loss": 0.9339, + "step": 12805 + }, + { + "epoch": 0.9239204934886909, + "grad_norm": 2.054175599590117, + "learning_rate": 6.035696326783713e-08, + "loss": 0.9235, + "step": 12806 + }, + { + "epoch": 0.9239926409581184, + "grad_norm": 2.504433610581113, + "learning_rate": 6.024306644594923e-08, + "loss": 1.0312, + "step": 12807 + }, + { + "epoch": 0.9240647884275459, + "grad_norm": 11.57431498863128, + "learning_rate": 6.012927554700709e-08, + "loss": 0.8885, + "step": 12808 + }, + { + "epoch": 0.9241369358969734, + "grad_norm": 3.0918853997710496, + "learning_rate": 6.001559057722394e-08, + "loss": 0.8498, + "step": 12809 + }, + { + "epoch": 0.924209083366401, + "grad_norm": 2.8453845681663625, + "learning_rate": 5.99020115428075e-08, + "loss": 0.9497, + "step": 12810 + }, + { + "epoch": 0.9242812308358285, + "grad_norm": 2.8177361807208046, + "learning_rate": 5.978853844996079e-08, + "loss": 0.9591, + "step": 12811 + }, + { + "epoch": 0.924353378305256, + "grad_norm": 2.215856572842636, + "learning_rate": 5.967517130487886e-08, + "loss": 0.8666, + "step": 12812 + }, + { + "epoch": 0.9244255257746834, + "grad_norm": 2.4003264602515406, + "learning_rate": 5.956191011375322e-08, + "loss": 0.8484, + "step": 12813 + }, + { + "epoch": 0.9244976732441109, + "grad_norm": 3.139034342888743, + "learning_rate": 5.9448754882768684e-08, + "loss": 0.9844, + "step": 12814 + }, + { + "epoch": 0.9245698207135384, + "grad_norm": 2.9268620815802517, + "learning_rate": 5.933570561810386e-08, + "loss": 0.8505, + "step": 12815 + }, + { + "epoch": 0.924641968182966, + "grad_norm": 2.8186437684064067, + "learning_rate": 5.922276232593204e-08, + "loss": 0.9216, + "step": 12816 + }, + { + "epoch": 0.9247141156523935, + "grad_norm": 2.1794674123037363, + "learning_rate": 5.910992501242051e-08, + "loss": 0.9392, + "step": 12817 + }, + { + "epoch": 0.924786263121821, + "grad_norm": 4.329627922581575, + "learning_rate": 5.899719368373146e-08, + "loss": 0.8529, + "step": 12818 + }, + { + "epoch": 0.9248584105912485, + "grad_norm": 3.299406264096109, + "learning_rate": 5.888456834601974e-08, + "loss": 0.8547, + "step": 12819 + }, + { + "epoch": 0.924930558060676, + "grad_norm": 2.5035314413312997, + "learning_rate": 5.8772049005436195e-08, + "loss": 0.9718, + "step": 12820 + }, + { + "epoch": 0.9250027055301036, + "grad_norm": 2.7937540724210477, + "learning_rate": 5.8659635668125265e-08, + "loss": 0.9679, + "step": 12821 + }, + { + "epoch": 0.9250748529995311, + "grad_norm": 4.685947415404023, + "learning_rate": 5.8547328340224687e-08, + "loss": 0.9251, + "step": 12822 + }, + { + "epoch": 0.9251470004689586, + "grad_norm": 2.144320229186931, + "learning_rate": 5.843512702786735e-08, + "loss": 0.8755, + "step": 12823 + }, + { + "epoch": 0.9252191479383861, + "grad_norm": 2.460495972761654, + "learning_rate": 5.8323031737180337e-08, + "loss": 0.9791, + "step": 12824 + }, + { + "epoch": 0.9252912954078135, + "grad_norm": 3.1323718286643834, + "learning_rate": 5.8211042474284986e-08, + "loss": 0.8777, + "step": 12825 + }, + { + "epoch": 0.925363442877241, + "grad_norm": 1.5500128160992803, + "learning_rate": 5.809915924529618e-08, + "loss": 0.9134, + "step": 12826 + }, + { + "epoch": 0.9254355903466686, + "grad_norm": 2.911947265042698, + "learning_rate": 5.79873820563237e-08, + "loss": 0.8132, + "step": 12827 + }, + { + "epoch": 0.9255077378160961, + "grad_norm": 3.858262121698245, + "learning_rate": 5.787571091347132e-08, + "loss": 0.8469, + "step": 12828 + }, + { + "epoch": 0.9255798852855236, + "grad_norm": 2.3537790997813763, + "learning_rate": 5.776414582283662e-08, + "loss": 0.8171, + "step": 12829 + }, + { + "epoch": 0.9256520327549511, + "grad_norm": 3.2878763773898263, + "learning_rate": 5.765268679051183e-08, + "loss": 0.9186, + "step": 12830 + }, + { + "epoch": 0.9257241802243786, + "grad_norm": 5.573769971696688, + "learning_rate": 5.754133382258386e-08, + "loss": 0.8996, + "step": 12831 + }, + { + "epoch": 0.9257963276938062, + "grad_norm": 0.8793982929867316, + "learning_rate": 5.743008692513274e-08, + "loss": 0.8435, + "step": 12832 + }, + { + "epoch": 0.9258684751632337, + "grad_norm": 2.0449247897618315, + "learning_rate": 5.73189461042336e-08, + "loss": 0.8695, + "step": 12833 + }, + { + "epoch": 0.9259406226326612, + "grad_norm": 2.917218419520544, + "learning_rate": 5.720791136595515e-08, + "loss": 0.9485, + "step": 12834 + }, + { + "epoch": 0.9260127701020887, + "grad_norm": 3.2503559211095023, + "learning_rate": 5.7096982716360763e-08, + "loss": 0.9211, + "step": 12835 + }, + { + "epoch": 0.9260849175715162, + "grad_norm": 3.2255321270678086, + "learning_rate": 5.698616016150737e-08, + "loss": 0.9222, + "step": 12836 + }, + { + "epoch": 0.9261570650409436, + "grad_norm": 14.653418423998854, + "learning_rate": 5.6875443707447235e-08, + "loss": 0.9461, + "step": 12837 + }, + { + "epoch": 0.9262292125103712, + "grad_norm": 3.8906159305083796, + "learning_rate": 5.6764833360226194e-08, + "loss": 0.8716, + "step": 12838 + }, + { + "epoch": 0.9263013599797987, + "grad_norm": 2.266049365058942, + "learning_rate": 5.665432912588386e-08, + "loss": 0.955, + "step": 12839 + }, + { + "epoch": 0.9263735074492262, + "grad_norm": 3.54819077608692, + "learning_rate": 5.654393101045452e-08, + "loss": 0.9195, + "step": 12840 + }, + { + "epoch": 0.9264456549186537, + "grad_norm": 2.3950676620577886, + "learning_rate": 5.643363901996667e-08, + "loss": 0.9067, + "step": 12841 + }, + { + "epoch": 0.9265178023880812, + "grad_norm": 3.9130425561452324, + "learning_rate": 5.632345316044307e-08, + "loss": 0.8895, + "step": 12842 + }, + { + "epoch": 0.9265899498575088, + "grad_norm": 0.6587645403145815, + "learning_rate": 5.6213373437900005e-08, + "loss": 0.7394, + "step": 12843 + }, + { + "epoch": 0.9266620973269363, + "grad_norm": 3.056248258812531, + "learning_rate": 5.6103399858349335e-08, + "loss": 0.9181, + "step": 12844 + }, + { + "epoch": 0.9267342447963638, + "grad_norm": 2.9066339054652874, + "learning_rate": 5.599353242779603e-08, + "loss": 0.9197, + "step": 12845 + }, + { + "epoch": 0.9268063922657913, + "grad_norm": 2.4380860235545327, + "learning_rate": 5.58837711522393e-08, + "loss": 0.7731, + "step": 12846 + }, + { + "epoch": 0.9268785397352188, + "grad_norm": 2.3642468205999645, + "learning_rate": 5.577411603767301e-08, + "loss": 1.012, + "step": 12847 + }, + { + "epoch": 0.9269506872046464, + "grad_norm": 2.08744740220009, + "learning_rate": 5.5664567090085044e-08, + "loss": 0.7941, + "step": 12848 + }, + { + "epoch": 0.9270228346740738, + "grad_norm": 3.0639436394311095, + "learning_rate": 5.5555124315457056e-08, + "loss": 0.7558, + "step": 12849 + }, + { + "epoch": 0.9270949821435013, + "grad_norm": 1.970279099788617, + "learning_rate": 5.5445787719765824e-08, + "loss": 0.8306, + "step": 12850 + }, + { + "epoch": 0.9271671296129288, + "grad_norm": 0.6972187143995775, + "learning_rate": 5.533655730898168e-08, + "loss": 0.7832, + "step": 12851 + }, + { + "epoch": 0.9272392770823563, + "grad_norm": 2.57332279433759, + "learning_rate": 5.522743308906941e-08, + "loss": 0.8636, + "step": 12852 + }, + { + "epoch": 0.9273114245517838, + "grad_norm": 2.3654317356113665, + "learning_rate": 5.511841506598758e-08, + "loss": 0.8396, + "step": 12853 + }, + { + "epoch": 0.9273835720212114, + "grad_norm": 3.223636214639601, + "learning_rate": 5.5009503245688985e-08, + "loss": 0.8743, + "step": 12854 + }, + { + "epoch": 0.9274557194906389, + "grad_norm": 3.9196384329356566, + "learning_rate": 5.4900697634121974e-08, + "loss": 0.9521, + "step": 12855 + }, + { + "epoch": 0.9275278669600664, + "grad_norm": 2.1259172367339767, + "learning_rate": 5.479199823722691e-08, + "loss": 0.9648, + "step": 12856 + }, + { + "epoch": 0.9276000144294939, + "grad_norm": 3.6751446390558504, + "learning_rate": 5.468340506094016e-08, + "loss": 1.0175, + "step": 12857 + }, + { + "epoch": 0.9276721618989214, + "grad_norm": 4.333731755255934, + "learning_rate": 5.457491811119119e-08, + "loss": 0.8039, + "step": 12858 + }, + { + "epoch": 0.927744309368349, + "grad_norm": 0.699784396380716, + "learning_rate": 5.446653739390439e-08, + "loss": 0.7982, + "step": 12859 + }, + { + "epoch": 0.9278164568377764, + "grad_norm": 2.075300893453853, + "learning_rate": 5.4358262914997455e-08, + "loss": 0.8336, + "step": 12860 + }, + { + "epoch": 0.9278886043072039, + "grad_norm": 2.5386052585589116, + "learning_rate": 5.4250094680383665e-08, + "loss": 0.9641, + "step": 12861 + }, + { + "epoch": 0.9279607517766314, + "grad_norm": 1.8218391140107508, + "learning_rate": 5.41420326959694e-08, + "loss": 0.8328, + "step": 12862 + }, + { + "epoch": 0.9280328992460589, + "grad_norm": 0.6726541437167022, + "learning_rate": 5.403407696765505e-08, + "loss": 0.8038, + "step": 12863 + }, + { + "epoch": 0.9281050467154864, + "grad_norm": 2.1822621442911596, + "learning_rate": 5.392622750133613e-08, + "loss": 0.8715, + "step": 12864 + }, + { + "epoch": 0.928177194184914, + "grad_norm": 4.862747248857832, + "learning_rate": 5.3818484302902146e-08, + "loss": 0.7908, + "step": 12865 + }, + { + "epoch": 0.9282493416543415, + "grad_norm": 3.239832158156484, + "learning_rate": 5.371084737823594e-08, + "loss": 0.835, + "step": 12866 + }, + { + "epoch": 0.928321489123769, + "grad_norm": 2.6361327473813616, + "learning_rate": 5.3603316733215054e-08, + "loss": 0.9157, + "step": 12867 + }, + { + "epoch": 0.9283936365931965, + "grad_norm": 2.788806465428792, + "learning_rate": 5.3495892373712104e-08, + "loss": 0.8509, + "step": 12868 + }, + { + "epoch": 0.928465784062624, + "grad_norm": 2.1936023064696513, + "learning_rate": 5.338857430559307e-08, + "loss": 1.035, + "step": 12869 + }, + { + "epoch": 0.9285379315320516, + "grad_norm": 5.465485563075393, + "learning_rate": 5.328136253471749e-08, + "loss": 0.9665, + "step": 12870 + }, + { + "epoch": 0.9286100790014791, + "grad_norm": 2.504523476449301, + "learning_rate": 5.317425706694023e-08, + "loss": 0.926, + "step": 12871 + }, + { + "epoch": 0.9286822264709065, + "grad_norm": 3.5644183431522176, + "learning_rate": 5.306725790811018e-08, + "loss": 0.9641, + "step": 12872 + }, + { + "epoch": 0.928754373940334, + "grad_norm": 2.738195921347945, + "learning_rate": 5.2960365064069314e-08, + "loss": 0.9369, + "step": 12873 + }, + { + "epoch": 0.9288265214097615, + "grad_norm": 2.9196393440862063, + "learning_rate": 5.285357854065542e-08, + "loss": 0.9835, + "step": 12874 + }, + { + "epoch": 0.928898668879189, + "grad_norm": 3.5557882428191636, + "learning_rate": 5.274689834369983e-08, + "loss": 0.8658, + "step": 12875 + }, + { + "epoch": 0.9289708163486166, + "grad_norm": 2.9919937110875585, + "learning_rate": 5.2640324479027444e-08, + "loss": 0.8964, + "step": 12876 + }, + { + "epoch": 0.9290429638180441, + "grad_norm": 2.6839528294723287, + "learning_rate": 5.253385695245804e-08, + "loss": 0.8437, + "step": 12877 + }, + { + "epoch": 0.9291151112874716, + "grad_norm": 3.035078953528292, + "learning_rate": 5.242749576980521e-08, + "loss": 0.9212, + "step": 12878 + }, + { + "epoch": 0.9291872587568991, + "grad_norm": 2.7600116552576734, + "learning_rate": 5.232124093687762e-08, + "loss": 0.808, + "step": 12879 + }, + { + "epoch": 0.9292594062263266, + "grad_norm": 2.6237313974419814, + "learning_rate": 5.2215092459476865e-08, + "loss": 0.8618, + "step": 12880 + }, + { + "epoch": 0.9293315536957542, + "grad_norm": 2.502834334624145, + "learning_rate": 5.2109050343399184e-08, + "loss": 0.8939, + "step": 12881 + }, + { + "epoch": 0.9294037011651817, + "grad_norm": 2.7204129625029276, + "learning_rate": 5.200311459443596e-08, + "loss": 0.8968, + "step": 12882 + }, + { + "epoch": 0.9294758486346092, + "grad_norm": 1.0303874020573434, + "learning_rate": 5.1897285218371e-08, + "loss": 0.8381, + "step": 12883 + }, + { + "epoch": 0.9295479961040366, + "grad_norm": 2.585639498501471, + "learning_rate": 5.179156222098346e-08, + "loss": 0.953, + "step": 12884 + }, + { + "epoch": 0.9296201435734641, + "grad_norm": 2.011081038941601, + "learning_rate": 5.168594560804695e-08, + "loss": 0.8952, + "step": 12885 + }, + { + "epoch": 0.9296922910428916, + "grad_norm": 1.7805201055628315, + "learning_rate": 5.1580435385328634e-08, + "loss": 0.9566, + "step": 12886 + }, + { + "epoch": 0.9297644385123192, + "grad_norm": 2.4713650144788484, + "learning_rate": 5.1475031558589895e-08, + "loss": 0.8859, + "step": 12887 + }, + { + "epoch": 0.9298365859817467, + "grad_norm": 2.3482572830300064, + "learning_rate": 5.1369734133586364e-08, + "loss": 0.939, + "step": 12888 + }, + { + "epoch": 0.9299087334511742, + "grad_norm": 5.137522216154824, + "learning_rate": 5.1264543116068315e-08, + "loss": 0.9609, + "step": 12889 + }, + { + "epoch": 0.9299808809206017, + "grad_norm": 2.7976794231512874, + "learning_rate": 5.115945851177939e-08, + "loss": 0.8013, + "step": 12890 + }, + { + "epoch": 0.9300530283900292, + "grad_norm": 10.459864611943852, + "learning_rate": 5.1054480326458094e-08, + "loss": 0.8985, + "step": 12891 + }, + { + "epoch": 0.9301251758594568, + "grad_norm": 2.438019525964543, + "learning_rate": 5.0949608565836965e-08, + "loss": 0.8788, + "step": 12892 + }, + { + "epoch": 0.9301973233288843, + "grad_norm": 2.863982443931518, + "learning_rate": 5.084484323564253e-08, + "loss": 0.8884, + "step": 12893 + }, + { + "epoch": 0.9302694707983118, + "grad_norm": 2.493111370409941, + "learning_rate": 5.074018434159555e-08, + "loss": 0.8981, + "step": 12894 + }, + { + "epoch": 0.9303416182677393, + "grad_norm": 2.0181475091637617, + "learning_rate": 5.0635631889411444e-08, + "loss": 0.8688, + "step": 12895 + }, + { + "epoch": 0.9304137657371667, + "grad_norm": 0.7886663518885126, + "learning_rate": 5.053118588479921e-08, + "loss": 0.816, + "step": 12896 + }, + { + "epoch": 0.9304859132065942, + "grad_norm": 0.6840378109174076, + "learning_rate": 5.042684633346184e-08, + "loss": 0.874, + "step": 12897 + }, + { + "epoch": 0.9305580606760218, + "grad_norm": 3.2565312416034122, + "learning_rate": 5.032261324109743e-08, + "loss": 0.9482, + "step": 12898 + }, + { + "epoch": 0.9306302081454493, + "grad_norm": 1.991398314695535, + "learning_rate": 5.021848661339789e-08, + "loss": 0.8951, + "step": 12899 + }, + { + "epoch": 0.9307023556148768, + "grad_norm": 3.690192710372327, + "learning_rate": 5.011446645604889e-08, + "loss": 0.865, + "step": 12900 + }, + { + "epoch": 0.9307745030843043, + "grad_norm": 2.570717823106534, + "learning_rate": 5.001055277473054e-08, + "loss": 0.985, + "step": 12901 + }, + { + "epoch": 0.9308466505537318, + "grad_norm": 0.7645102579465253, + "learning_rate": 4.990674557511698e-08, + "loss": 0.8069, + "step": 12902 + }, + { + "epoch": 0.9309187980231594, + "grad_norm": 2.696213458312762, + "learning_rate": 4.980304486287767e-08, + "loss": 0.9596, + "step": 12903 + }, + { + "epoch": 0.9309909454925869, + "grad_norm": 2.8113016754598066, + "learning_rate": 4.969945064367431e-08, + "loss": 0.8381, + "step": 12904 + }, + { + "epoch": 0.9310630929620144, + "grad_norm": 11.758683911852856, + "learning_rate": 4.959596292316437e-08, + "loss": 0.8828, + "step": 12905 + }, + { + "epoch": 0.9311352404314419, + "grad_norm": 2.0268895222875574, + "learning_rate": 4.949258170699888e-08, + "loss": 0.9324, + "step": 12906 + }, + { + "epoch": 0.9312073879008694, + "grad_norm": 2.130668149037297, + "learning_rate": 4.938930700082267e-08, + "loss": 0.9597, + "step": 12907 + }, + { + "epoch": 0.9312795353702968, + "grad_norm": 2.023782708019758, + "learning_rate": 4.9286138810275436e-08, + "loss": 1.0151, + "step": 12908 + }, + { + "epoch": 0.9313516828397244, + "grad_norm": 2.174192660220083, + "learning_rate": 4.918307714099113e-08, + "loss": 0.9863, + "step": 12909 + }, + { + "epoch": 0.9314238303091519, + "grad_norm": 3.3656592917222836, + "learning_rate": 4.908012199859723e-08, + "loss": 0.9529, + "step": 12910 + }, + { + "epoch": 0.9314959777785794, + "grad_norm": 2.1523694488597362, + "learning_rate": 4.8977273388715715e-08, + "loss": 0.9844, + "step": 12911 + }, + { + "epoch": 0.9315681252480069, + "grad_norm": 3.2174794179973887, + "learning_rate": 4.887453131696273e-08, + "loss": 0.8572, + "step": 12912 + }, + { + "epoch": 0.9316402727174344, + "grad_norm": 3.6416530221876973, + "learning_rate": 4.8771895788949136e-08, + "loss": 0.7947, + "step": 12913 + }, + { + "epoch": 0.931712420186862, + "grad_norm": 1.9904143576807922, + "learning_rate": 4.866936681027889e-08, + "loss": 0.8132, + "step": 12914 + }, + { + "epoch": 0.9317845676562895, + "grad_norm": 2.613479127790713, + "learning_rate": 4.856694438655085e-08, + "loss": 0.8477, + "step": 12915 + }, + { + "epoch": 0.931856715125717, + "grad_norm": 3.150803048843278, + "learning_rate": 4.846462852335831e-08, + "loss": 0.933, + "step": 12916 + }, + { + "epoch": 0.9319288625951445, + "grad_norm": 2.2984681086573495, + "learning_rate": 4.836241922628792e-08, + "loss": 0.8844, + "step": 12917 + }, + { + "epoch": 0.932001010064572, + "grad_norm": 2.3302941170256775, + "learning_rate": 4.8260316500920996e-08, + "loss": 0.9622, + "step": 12918 + }, + { + "epoch": 0.9320731575339994, + "grad_norm": 3.9313304947464465, + "learning_rate": 4.8158320352833516e-08, + "loss": 0.8502, + "step": 12919 + }, + { + "epoch": 0.932145305003427, + "grad_norm": 3.9208946674561576, + "learning_rate": 4.805643078759436e-08, + "loss": 0.9788, + "step": 12920 + }, + { + "epoch": 0.9322174524728545, + "grad_norm": 5.715429457233746, + "learning_rate": 4.79546478107673e-08, + "loss": 0.8846, + "step": 12921 + }, + { + "epoch": 0.932289599942282, + "grad_norm": 3.877956124193758, + "learning_rate": 4.7852971427911224e-08, + "loss": 0.9354, + "step": 12922 + }, + { + "epoch": 0.9323617474117095, + "grad_norm": 4.462598287744843, + "learning_rate": 4.7751401644577694e-08, + "loss": 0.9302, + "step": 12923 + }, + { + "epoch": 0.932433894881137, + "grad_norm": 3.146903826402541, + "learning_rate": 4.764993846631316e-08, + "loss": 0.7818, + "step": 12924 + }, + { + "epoch": 0.9325060423505646, + "grad_norm": 2.9322918112141023, + "learning_rate": 4.754858189865807e-08, + "loss": 0.9843, + "step": 12925 + }, + { + "epoch": 0.9325781898199921, + "grad_norm": 2.629055251366922, + "learning_rate": 4.744733194714734e-08, + "loss": 0.955, + "step": 12926 + }, + { + "epoch": 0.9326503372894196, + "grad_norm": 2.0733635067762934, + "learning_rate": 4.7346188617309214e-08, + "loss": 0.8956, + "step": 12927 + }, + { + "epoch": 0.9327224847588471, + "grad_norm": 3.3870146906995617, + "learning_rate": 4.724515191466749e-08, + "loss": 1.0032, + "step": 12928 + }, + { + "epoch": 0.9327946322282746, + "grad_norm": 2.2755620408876376, + "learning_rate": 4.714422184473932e-08, + "loss": 0.9332, + "step": 12929 + }, + { + "epoch": 0.9328667796977022, + "grad_norm": 1.8382546594385087, + "learning_rate": 4.704339841303606e-08, + "loss": 0.9276, + "step": 12930 + }, + { + "epoch": 0.9329389271671296, + "grad_norm": 2.938210259462143, + "learning_rate": 4.69426816250631e-08, + "loss": 0.8616, + "step": 12931 + }, + { + "epoch": 0.9330110746365571, + "grad_norm": 1.8780367102158075, + "learning_rate": 4.6842071486320026e-08, + "loss": 1.0307, + "step": 12932 + }, + { + "epoch": 0.9330832221059846, + "grad_norm": 3.0559156051586323, + "learning_rate": 4.674156800230156e-08, + "loss": 0.936, + "step": 12933 + }, + { + "epoch": 0.9331553695754121, + "grad_norm": 2.170733913651545, + "learning_rate": 4.6641171178495086e-08, + "loss": 0.9488, + "step": 12934 + }, + { + "epoch": 0.9332275170448396, + "grad_norm": 2.0227642338447245, + "learning_rate": 4.6540881020383336e-08, + "loss": 0.8364, + "step": 12935 + }, + { + "epoch": 0.9332996645142672, + "grad_norm": 4.423222959780426, + "learning_rate": 4.6440697533442594e-08, + "loss": 0.8922, + "step": 12936 + }, + { + "epoch": 0.9333718119836947, + "grad_norm": 2.211589056731216, + "learning_rate": 4.6340620723143376e-08, + "loss": 0.8759, + "step": 12937 + }, + { + "epoch": 0.9334439594531222, + "grad_norm": 2.2618319626376917, + "learning_rate": 4.624065059495086e-08, + "loss": 0.8929, + "step": 12938 + }, + { + "epoch": 0.9335161069225497, + "grad_norm": 2.0509252879104536, + "learning_rate": 4.6140787154323346e-08, + "loss": 0.9707, + "step": 12939 + }, + { + "epoch": 0.9335882543919772, + "grad_norm": 0.7088436691175829, + "learning_rate": 4.604103040671514e-08, + "loss": 0.7757, + "step": 12940 + }, + { + "epoch": 0.9336604018614048, + "grad_norm": 2.4053351905072664, + "learning_rate": 4.594138035757256e-08, + "loss": 0.8205, + "step": 12941 + }, + { + "epoch": 0.9337325493308323, + "grad_norm": 5.693513813835867, + "learning_rate": 4.584183701233768e-08, + "loss": 0.8854, + "step": 12942 + }, + { + "epoch": 0.9338046968002597, + "grad_norm": 0.7275128455846829, + "learning_rate": 4.5742400376446167e-08, + "loss": 0.8213, + "step": 12943 + }, + { + "epoch": 0.9338768442696872, + "grad_norm": 3.0428154922694732, + "learning_rate": 4.564307045532767e-08, + "loss": 0.8617, + "step": 12944 + }, + { + "epoch": 0.9339489917391147, + "grad_norm": 2.0883560161571415, + "learning_rate": 4.554384725440585e-08, + "loss": 0.9708, + "step": 12945 + }, + { + "epoch": 0.9340211392085422, + "grad_norm": 3.2356054941569066, + "learning_rate": 4.544473077909994e-08, + "loss": 0.8202, + "step": 12946 + }, + { + "epoch": 0.9340932866779698, + "grad_norm": 1.9089664427943636, + "learning_rate": 4.534572103482159e-08, + "loss": 0.9708, + "step": 12947 + }, + { + "epoch": 0.9341654341473973, + "grad_norm": 3.7524106224847418, + "learning_rate": 4.5246818026977605e-08, + "loss": 0.9185, + "step": 12948 + }, + { + "epoch": 0.9342375816168248, + "grad_norm": 2.2485199404852985, + "learning_rate": 4.514802176096855e-08, + "loss": 0.9651, + "step": 12949 + }, + { + "epoch": 0.9343097290862523, + "grad_norm": 2.790106740844367, + "learning_rate": 4.504933224218965e-08, + "loss": 0.8663, + "step": 12950 + }, + { + "epoch": 0.9343818765556798, + "grad_norm": 2.159192275446685, + "learning_rate": 4.495074947602928e-08, + "loss": 0.9358, + "step": 12951 + }, + { + "epoch": 0.9344540240251074, + "grad_norm": 2.1837196382050674, + "learning_rate": 4.485227346787135e-08, + "loss": 0.8416, + "step": 12952 + }, + { + "epoch": 0.9345261714945349, + "grad_norm": 2.9906282271444513, + "learning_rate": 4.475390422309333e-08, + "loss": 0.7349, + "step": 12953 + }, + { + "epoch": 0.9345983189639624, + "grad_norm": 2.559059904308915, + "learning_rate": 4.4655641747066265e-08, + "loss": 0.9352, + "step": 12954 + }, + { + "epoch": 0.9346704664333898, + "grad_norm": 3.153837131501653, + "learning_rate": 4.455748604515608e-08, + "loss": 0.9489, + "step": 12955 + }, + { + "epoch": 0.9347426139028173, + "grad_norm": 2.0063817013683347, + "learning_rate": 4.445943712272271e-08, + "loss": 0.9648, + "step": 12956 + }, + { + "epoch": 0.9348147613722448, + "grad_norm": 3.1042975754411253, + "learning_rate": 4.436149498512054e-08, + "loss": 0.9584, + "step": 12957 + }, + { + "epoch": 0.9348869088416724, + "grad_norm": 2.662206187625389, + "learning_rate": 4.4263659637697515e-08, + "loss": 0.8857, + "step": 12958 + }, + { + "epoch": 0.9349590563110999, + "grad_norm": 0.8522530651714408, + "learning_rate": 4.4165931085796246e-08, + "loss": 0.8213, + "step": 12959 + }, + { + "epoch": 0.9350312037805274, + "grad_norm": 2.770569772064483, + "learning_rate": 4.406830933475336e-08, + "loss": 0.8321, + "step": 12960 + }, + { + "epoch": 0.9351033512499549, + "grad_norm": 3.32416538863605, + "learning_rate": 4.3970794389899256e-08, + "loss": 0.9135, + "step": 12961 + }, + { + "epoch": 0.9351754987193824, + "grad_norm": 2.093628609397893, + "learning_rate": 4.387338625655923e-08, + "loss": 1.0073, + "step": 12962 + }, + { + "epoch": 0.93524764618881, + "grad_norm": 2.3619287072581523, + "learning_rate": 4.3776084940052135e-08, + "loss": 0.8065, + "step": 12963 + }, + { + "epoch": 0.9353197936582375, + "grad_norm": 0.8006211655798162, + "learning_rate": 4.367889044569129e-08, + "loss": 0.779, + "step": 12964 + }, + { + "epoch": 0.935391941127665, + "grad_norm": 0.5903260741079184, + "learning_rate": 4.35818027787842e-08, + "loss": 0.7382, + "step": 12965 + }, + { + "epoch": 0.9354640885970924, + "grad_norm": 2.0878329576855514, + "learning_rate": 4.348482194463243e-08, + "loss": 0.8714, + "step": 12966 + }, + { + "epoch": 0.9355362360665199, + "grad_norm": 3.0204296675451516, + "learning_rate": 4.3387947948532175e-08, + "loss": 0.9198, + "step": 12967 + }, + { + "epoch": 0.9356083835359474, + "grad_norm": 3.4938442046920404, + "learning_rate": 4.329118079577254e-08, + "loss": 0.851, + "step": 12968 + }, + { + "epoch": 0.935680531005375, + "grad_norm": 2.461547388464073, + "learning_rate": 4.3194520491637963e-08, + "loss": 1.0004, + "step": 12969 + }, + { + "epoch": 0.9357526784748025, + "grad_norm": 2.967753674816973, + "learning_rate": 4.309796704140711e-08, + "loss": 0.9472, + "step": 12970 + }, + { + "epoch": 0.93582482594423, + "grad_norm": 3.085516869572583, + "learning_rate": 4.300152045035177e-08, + "loss": 0.9314, + "step": 12971 + }, + { + "epoch": 0.9358969734136575, + "grad_norm": 2.4836633010730185, + "learning_rate": 4.2905180723739054e-08, + "loss": 0.9649, + "step": 12972 + }, + { + "epoch": 0.935969120883085, + "grad_norm": 3.323930272854903, + "learning_rate": 4.28089478668292e-08, + "loss": 0.9079, + "step": 12973 + }, + { + "epoch": 0.9360412683525126, + "grad_norm": 2.718746667618737, + "learning_rate": 4.2712821884878015e-08, + "loss": 0.9054, + "step": 12974 + }, + { + "epoch": 0.9361134158219401, + "grad_norm": 2.400746579079702, + "learning_rate": 4.2616802783133285e-08, + "loss": 0.8975, + "step": 12975 + }, + { + "epoch": 0.9361855632913676, + "grad_norm": 4.3416152209103505, + "learning_rate": 4.252089056683927e-08, + "loss": 0.8971, + "step": 12976 + }, + { + "epoch": 0.9362577107607951, + "grad_norm": 3.242733975970896, + "learning_rate": 4.242508524123334e-08, + "loss": 1.0072, + "step": 12977 + }, + { + "epoch": 0.9363298582302225, + "grad_norm": 2.722078062206447, + "learning_rate": 4.232938681154663e-08, + "loss": 0.9323, + "step": 12978 + }, + { + "epoch": 0.93640200569965, + "grad_norm": 5.938520484364173, + "learning_rate": 4.223379528300497e-08, + "loss": 0.9749, + "step": 12979 + }, + { + "epoch": 0.9364741531690776, + "grad_norm": 3.1014948532452915, + "learning_rate": 4.2138310660828625e-08, + "loss": 0.94, + "step": 12980 + }, + { + "epoch": 0.9365463006385051, + "grad_norm": 6.28887143052145, + "learning_rate": 4.2042932950230756e-08, + "loss": 0.9751, + "step": 12981 + }, + { + "epoch": 0.9366184481079326, + "grad_norm": 0.6392576698421945, + "learning_rate": 4.194766215642076e-08, + "loss": 0.739, + "step": 12982 + }, + { + "epoch": 0.9366905955773601, + "grad_norm": 2.8310754044279074, + "learning_rate": 4.185249828460025e-08, + "loss": 0.9642, + "step": 12983 + }, + { + "epoch": 0.9367627430467876, + "grad_norm": 2.84650212514407, + "learning_rate": 4.175744133996617e-08, + "loss": 0.9075, + "step": 12984 + }, + { + "epoch": 0.9368348905162152, + "grad_norm": 2.6148025805785777, + "learning_rate": 4.166249132770905e-08, + "loss": 0.9371, + "step": 12985 + }, + { + "epoch": 0.9369070379856427, + "grad_norm": 2.6528237179633454, + "learning_rate": 4.1567648253013624e-08, + "loss": 0.9024, + "step": 12986 + }, + { + "epoch": 0.9369791854550702, + "grad_norm": 2.8031132352278023, + "learning_rate": 4.1472912121059076e-08, + "loss": 0.9058, + "step": 12987 + }, + { + "epoch": 0.9370513329244977, + "grad_norm": 4.0628891978533215, + "learning_rate": 4.137828293701839e-08, + "loss": 0.9509, + "step": 12988 + }, + { + "epoch": 0.9371234803939252, + "grad_norm": 1.2386885974805453, + "learning_rate": 4.128376070605943e-08, + "loss": 0.7994, + "step": 12989 + }, + { + "epoch": 0.9371956278633526, + "grad_norm": 2.1162822519493405, + "learning_rate": 4.118934543334318e-08, + "loss": 0.8731, + "step": 12990 + }, + { + "epoch": 0.9372677753327802, + "grad_norm": 2.906145155461409, + "learning_rate": 4.1095037124025734e-08, + "loss": 0.9518, + "step": 12991 + }, + { + "epoch": 0.9373399228022077, + "grad_norm": 2.716832679715538, + "learning_rate": 4.100083578325653e-08, + "loss": 0.9576, + "step": 12992 + }, + { + "epoch": 0.9374120702716352, + "grad_norm": 9.187022086448607, + "learning_rate": 4.0906741416179445e-08, + "loss": 0.8777, + "step": 12993 + }, + { + "epoch": 0.9374842177410627, + "grad_norm": 8.69139659409576, + "learning_rate": 4.0812754027933496e-08, + "loss": 0.9188, + "step": 12994 + }, + { + "epoch": 0.9375563652104902, + "grad_norm": 2.2806420757295283, + "learning_rate": 4.0718873623649896e-08, + "loss": 0.7818, + "step": 12995 + }, + { + "epoch": 0.9376285126799178, + "grad_norm": 2.1495049061831244, + "learning_rate": 4.062510020845589e-08, + "loss": 0.9674, + "step": 12996 + }, + { + "epoch": 0.9377006601493453, + "grad_norm": 1.981801263790284, + "learning_rate": 4.053143378747182e-08, + "loss": 0.8653, + "step": 12997 + }, + { + "epoch": 0.9377728076187728, + "grad_norm": 2.986676990996409, + "learning_rate": 4.0437874365812256e-08, + "loss": 0.9763, + "step": 12998 + }, + { + "epoch": 0.9378449550882003, + "grad_norm": 1.9288305317945096, + "learning_rate": 4.034442194858623e-08, + "loss": 0.8282, + "step": 12999 + }, + { + "epoch": 0.9379171025576278, + "grad_norm": 2.9157922433067704, + "learning_rate": 4.025107654089699e-08, + "loss": 0.9277, + "step": 13000 + }, + { + "epoch": 0.9379892500270554, + "grad_norm": 2.9655842817463665, + "learning_rate": 4.0157838147842013e-08, + "loss": 0.9593, + "step": 13001 + }, + { + "epoch": 0.9380613974964828, + "grad_norm": 2.7274055817063223, + "learning_rate": 4.0064706774512124e-08, + "loss": 0.9054, + "step": 13002 + }, + { + "epoch": 0.9381335449659103, + "grad_norm": 8.809675047961962, + "learning_rate": 3.9971682425993244e-08, + "loss": 0.7714, + "step": 13003 + }, + { + "epoch": 0.9382056924353378, + "grad_norm": 2.277812196373314, + "learning_rate": 3.9878765107365096e-08, + "loss": 0.9092, + "step": 13004 + }, + { + "epoch": 0.9382778399047653, + "grad_norm": 2.403465244925175, + "learning_rate": 3.9785954823701175e-08, + "loss": 0.9562, + "step": 13005 + }, + { + "epoch": 0.9383499873741928, + "grad_norm": 2.253682831214485, + "learning_rate": 3.9693251580070087e-08, + "loss": 0.96, + "step": 13006 + }, + { + "epoch": 0.9384221348436204, + "grad_norm": 2.2765868092052655, + "learning_rate": 3.9600655381533786e-08, + "loss": 0.9261, + "step": 13007 + }, + { + "epoch": 0.9384942823130479, + "grad_norm": 2.5199148433163208, + "learning_rate": 3.950816623314823e-08, + "loss": 0.8532, + "step": 13008 + }, + { + "epoch": 0.9385664297824754, + "grad_norm": 2.810456275142402, + "learning_rate": 3.941578413996449e-08, + "loss": 0.8783, + "step": 13009 + }, + { + "epoch": 0.9386385772519029, + "grad_norm": 2.957762291467512, + "learning_rate": 3.9323509107026756e-08, + "loss": 0.8882, + "step": 13010 + }, + { + "epoch": 0.9387107247213304, + "grad_norm": 1.9478121098763246, + "learning_rate": 3.92313411393741e-08, + "loss": 0.9789, + "step": 13011 + }, + { + "epoch": 0.938782872190758, + "grad_norm": 2.198920492271002, + "learning_rate": 3.913928024203939e-08, + "loss": 0.8634, + "step": 13012 + }, + { + "epoch": 0.9388550196601854, + "grad_norm": 4.370694248736965, + "learning_rate": 3.904732642004971e-08, + "loss": 0.9923, + "step": 13013 + }, + { + "epoch": 0.9389271671296129, + "grad_norm": 4.266629614271769, + "learning_rate": 3.8955479678426385e-08, + "loss": 0.9992, + "step": 13014 + }, + { + "epoch": 0.9389993145990404, + "grad_norm": 2.4787697567828104, + "learning_rate": 3.886374002218473e-08, + "loss": 0.8883, + "step": 13015 + }, + { + "epoch": 0.9390714620684679, + "grad_norm": 3.3072505597941606, + "learning_rate": 3.8772107456334304e-08, + "loss": 0.948, + "step": 13016 + }, + { + "epoch": 0.9391436095378954, + "grad_norm": 2.997632204883447, + "learning_rate": 3.868058198587887e-08, + "loss": 0.9472, + "step": 13017 + }, + { + "epoch": 0.939215757007323, + "grad_norm": 2.2716219754579643, + "learning_rate": 3.8589163615816436e-08, + "loss": 0.8649, + "step": 13018 + }, + { + "epoch": 0.9392879044767505, + "grad_norm": 2.652043591061961, + "learning_rate": 3.849785235113878e-08, + "loss": 0.8152, + "step": 13019 + }, + { + "epoch": 0.939360051946178, + "grad_norm": 3.546185531487385, + "learning_rate": 3.840664819683215e-08, + "loss": 0.8625, + "step": 13020 + }, + { + "epoch": 0.9394321994156055, + "grad_norm": 5.10182196291762, + "learning_rate": 3.831555115787721e-08, + "loss": 0.885, + "step": 13021 + }, + { + "epoch": 0.939504346885033, + "grad_norm": 3.8688063806915043, + "learning_rate": 3.8224561239247773e-08, + "loss": 0.9253, + "step": 13022 + }, + { + "epoch": 0.9395764943544606, + "grad_norm": 2.5714521577298317, + "learning_rate": 3.813367844591275e-08, + "loss": 0.8561, + "step": 13023 + }, + { + "epoch": 0.9396486418238881, + "grad_norm": 2.2970840462772846, + "learning_rate": 3.8042902782835063e-08, + "loss": 0.934, + "step": 13024 + }, + { + "epoch": 0.9397207892933155, + "grad_norm": 2.947625099743469, + "learning_rate": 3.7952234254971624e-08, + "loss": 0.9411, + "step": 13025 + }, + { + "epoch": 0.939792936762743, + "grad_norm": 3.5411917573749094, + "learning_rate": 3.7861672867273375e-08, + "loss": 0.9404, + "step": 13026 + }, + { + "epoch": 0.9398650842321705, + "grad_norm": 3.2029791077674736, + "learning_rate": 3.7771218624685466e-08, + "loss": 0.9261, + "step": 13027 + }, + { + "epoch": 0.939937231701598, + "grad_norm": 15.017412887023381, + "learning_rate": 3.7680871532147715e-08, + "loss": 0.9269, + "step": 13028 + }, + { + "epoch": 0.9400093791710256, + "grad_norm": 2.56224514694119, + "learning_rate": 3.7590631594592857e-08, + "loss": 0.8742, + "step": 13029 + }, + { + "epoch": 0.9400815266404531, + "grad_norm": 2.423377081919084, + "learning_rate": 3.750049881694917e-08, + "loss": 0.9538, + "step": 13030 + }, + { + "epoch": 0.9401536741098806, + "grad_norm": 2.731545690247585, + "learning_rate": 3.74104732041387e-08, + "loss": 0.9332, + "step": 13031 + }, + { + "epoch": 0.9402258215793081, + "grad_norm": 3.162910911190656, + "learning_rate": 3.732055476107665e-08, + "loss": 0.9114, + "step": 13032 + }, + { + "epoch": 0.9402979690487356, + "grad_norm": 2.3146143346084362, + "learning_rate": 3.723074349267352e-08, + "loss": 0.9359, + "step": 13033 + }, + { + "epoch": 0.9403701165181632, + "grad_norm": 2.2896091472453675, + "learning_rate": 3.7141039403833616e-08, + "loss": 0.9118, + "step": 13034 + }, + { + "epoch": 0.9404422639875907, + "grad_norm": 2.273538384050826, + "learning_rate": 3.705144249945524e-08, + "loss": 0.9693, + "step": 13035 + }, + { + "epoch": 0.9405144114570182, + "grad_norm": 2.9428263322696115, + "learning_rate": 3.696195278443115e-08, + "loss": 0.9004, + "step": 13036 + }, + { + "epoch": 0.9405865589264456, + "grad_norm": 2.4712994123582965, + "learning_rate": 3.6872570263647876e-08, + "loss": 0.9249, + "step": 13037 + }, + { + "epoch": 0.9406587063958731, + "grad_norm": 3.5482046181079934, + "learning_rate": 3.67832949419864e-08, + "loss": 0.8801, + "step": 13038 + }, + { + "epoch": 0.9407308538653006, + "grad_norm": 2.5850698623218515, + "learning_rate": 3.669412682432127e-08, + "loss": 0.8696, + "step": 13039 + }, + { + "epoch": 0.9408030013347282, + "grad_norm": 2.529707450387475, + "learning_rate": 3.6605065915522145e-08, + "loss": 1.0559, + "step": 13040 + }, + { + "epoch": 0.9408751488041557, + "grad_norm": 2.5130522467407306, + "learning_rate": 3.651611222045225e-08, + "loss": 0.9097, + "step": 13041 + }, + { + "epoch": 0.9409472962735832, + "grad_norm": 2.435159488757822, + "learning_rate": 3.642726574396882e-08, + "loss": 0.8661, + "step": 13042 + }, + { + "epoch": 0.9410194437430107, + "grad_norm": 2.402259787976359, + "learning_rate": 3.633852649092328e-08, + "loss": 0.97, + "step": 13043 + }, + { + "epoch": 0.9410915912124382, + "grad_norm": 2.6743626307597936, + "learning_rate": 3.624989446616178e-08, + "loss": 0.9575, + "step": 13044 + }, + { + "epoch": 0.9411637386818658, + "grad_norm": 0.7825427612832713, + "learning_rate": 3.616136967452421e-08, + "loss": 0.8134, + "step": 13045 + }, + { + "epoch": 0.9412358861512933, + "grad_norm": 2.0263158879048655, + "learning_rate": 3.6072952120844046e-08, + "loss": 0.8993, + "step": 13046 + }, + { + "epoch": 0.9413080336207208, + "grad_norm": 3.3724557682244507, + "learning_rate": 3.598464180994964e-08, + "loss": 0.815, + "step": 13047 + }, + { + "epoch": 0.9413801810901483, + "grad_norm": 2.4003078859252844, + "learning_rate": 3.589643874666382e-08, + "loss": 0.8045, + "step": 13048 + }, + { + "epoch": 0.9414523285595757, + "grad_norm": 2.919287040363733, + "learning_rate": 3.580834293580226e-08, + "loss": 0.9275, + "step": 13049 + }, + { + "epoch": 0.9415244760290032, + "grad_norm": 0.7140574064845218, + "learning_rate": 3.5720354382176023e-08, + "loss": 0.7992, + "step": 13050 + }, + { + "epoch": 0.9415966234984308, + "grad_norm": 5.915875055073115, + "learning_rate": 3.563247309058992e-08, + "loss": 0.8652, + "step": 13051 + }, + { + "epoch": 0.9416687709678583, + "grad_norm": 2.8300061792636444, + "learning_rate": 3.554469906584256e-08, + "loss": 0.8762, + "step": 13052 + }, + { + "epoch": 0.9417409184372858, + "grad_norm": 0.7391609723095063, + "learning_rate": 3.545703231272679e-08, + "loss": 0.8041, + "step": 13053 + }, + { + "epoch": 0.9418130659067133, + "grad_norm": 2.188578421568379, + "learning_rate": 3.536947283603009e-08, + "loss": 0.8712, + "step": 13054 + }, + { + "epoch": 0.9418852133761408, + "grad_norm": 1.9108008172847797, + "learning_rate": 3.5282020640533764e-08, + "loss": 1.1224, + "step": 13055 + }, + { + "epoch": 0.9419573608455684, + "grad_norm": 3.657615471853405, + "learning_rate": 3.519467573101309e-08, + "loss": 1.0019, + "step": 13056 + }, + { + "epoch": 0.9420295083149959, + "grad_norm": 2.5742764502673605, + "learning_rate": 3.5107438112237595e-08, + "loss": 0.9957, + "step": 13057 + }, + { + "epoch": 0.9421016557844234, + "grad_norm": 2.7173731156445347, + "learning_rate": 3.502030778897147e-08, + "loss": 0.8625, + "step": 13058 + }, + { + "epoch": 0.9421738032538509, + "grad_norm": 3.364946579983852, + "learning_rate": 3.493328476597179e-08, + "loss": 0.9241, + "step": 13059 + }, + { + "epoch": 0.9422459507232784, + "grad_norm": 2.119760561503173, + "learning_rate": 3.484636904799121e-08, + "loss": 0.9674, + "step": 13060 + }, + { + "epoch": 0.9423180981927058, + "grad_norm": 3.3938238989440004, + "learning_rate": 3.475956063977547e-08, + "loss": 0.8889, + "step": 13061 + }, + { + "epoch": 0.9423902456621334, + "grad_norm": 2.09123397647718, + "learning_rate": 3.467285954606547e-08, + "loss": 0.8918, + "step": 13062 + }, + { + "epoch": 0.9424623931315609, + "grad_norm": 3.8199889625270025, + "learning_rate": 3.458626577159496e-08, + "loss": 1.009, + "step": 13063 + }, + { + "epoch": 0.9425345406009884, + "grad_norm": 2.538093358462362, + "learning_rate": 3.449977932109282e-08, + "loss": 0.9447, + "step": 13064 + }, + { + "epoch": 0.9426066880704159, + "grad_norm": 1.761495098119104, + "learning_rate": 3.441340019928174e-08, + "loss": 0.9664, + "step": 13065 + }, + { + "epoch": 0.9426788355398434, + "grad_norm": 0.7175776831013456, + "learning_rate": 3.432712841087837e-08, + "loss": 0.7955, + "step": 13066 + }, + { + "epoch": 0.942750983009271, + "grad_norm": 4.4170613862242, + "learning_rate": 3.424096396059384e-08, + "loss": 0.8563, + "step": 13067 + }, + { + "epoch": 0.9428231304786985, + "grad_norm": 3.1071758832256853, + "learning_rate": 3.415490685313349e-08, + "loss": 0.8669, + "step": 13068 + }, + { + "epoch": 0.942895277948126, + "grad_norm": 3.0826809854581887, + "learning_rate": 3.406895709319624e-08, + "loss": 0.925, + "step": 13069 + }, + { + "epoch": 0.9429674254175535, + "grad_norm": 2.5882713642386386, + "learning_rate": 3.398311468547566e-08, + "loss": 0.9486, + "step": 13070 + }, + { + "epoch": 0.943039572886981, + "grad_norm": 2.8462724502341503, + "learning_rate": 3.3897379634659104e-08, + "loss": 1.0341, + "step": 13071 + }, + { + "epoch": 0.9431117203564084, + "grad_norm": 2.079591047542355, + "learning_rate": 3.3811751945428624e-08, + "loss": 0.8706, + "step": 13072 + }, + { + "epoch": 0.943183867825836, + "grad_norm": 3.8083338252650862, + "learning_rate": 3.3726231622459576e-08, + "loss": 0.874, + "step": 13073 + }, + { + "epoch": 0.9432560152952635, + "grad_norm": 2.3743050210937344, + "learning_rate": 3.3640818670422235e-08, + "loss": 0.9371, + "step": 13074 + }, + { + "epoch": 0.943328162764691, + "grad_norm": 3.174756024215634, + "learning_rate": 3.3555513093980635e-08, + "loss": 0.8423, + "step": 13075 + }, + { + "epoch": 0.9434003102341185, + "grad_norm": 7.760472758728352, + "learning_rate": 3.347031489779284e-08, + "loss": 0.9382, + "step": 13076 + }, + { + "epoch": 0.943472457703546, + "grad_norm": 3.356294150745814, + "learning_rate": 3.3385224086511124e-08, + "loss": 0.8286, + "step": 13077 + }, + { + "epoch": 0.9435446051729736, + "grad_norm": 2.2851178549124866, + "learning_rate": 3.3300240664782206e-08, + "loss": 0.8355, + "step": 13078 + }, + { + "epoch": 0.9436167526424011, + "grad_norm": 0.760923860964258, + "learning_rate": 3.3215364637247053e-08, + "loss": 0.9101, + "step": 13079 + }, + { + "epoch": 0.9436889001118286, + "grad_norm": 2.388973189007493, + "learning_rate": 3.313059600853974e-08, + "loss": 0.8318, + "step": 13080 + }, + { + "epoch": 0.9437610475812561, + "grad_norm": 2.3712725249866047, + "learning_rate": 3.304593478328943e-08, + "loss": 0.913, + "step": 13081 + }, + { + "epoch": 0.9438331950506836, + "grad_norm": 2.3273036455205776, + "learning_rate": 3.2961380966119554e-08, + "loss": 0.863, + "step": 13082 + }, + { + "epoch": 0.9439053425201112, + "grad_norm": 2.211923332279141, + "learning_rate": 3.287693456164642e-08, + "loss": 0.8892, + "step": 13083 + }, + { + "epoch": 0.9439774899895386, + "grad_norm": 3.9203915164992336, + "learning_rate": 3.2792595574482105e-08, + "loss": 0.8945, + "step": 13084 + }, + { + "epoch": 0.9440496374589661, + "grad_norm": 2.4252009043107194, + "learning_rate": 3.2708364009232045e-08, + "loss": 0.8188, + "step": 13085 + }, + { + "epoch": 0.9441217849283936, + "grad_norm": 2.995525997634212, + "learning_rate": 3.262423987049523e-08, + "loss": 0.9515, + "step": 13086 + }, + { + "epoch": 0.9441939323978211, + "grad_norm": 3.103453091182846, + "learning_rate": 3.2540223162865755e-08, + "loss": 0.9336, + "step": 13087 + }, + { + "epoch": 0.9442660798672486, + "grad_norm": 0.7403026941824705, + "learning_rate": 3.245631389093151e-08, + "loss": 0.7503, + "step": 13088 + }, + { + "epoch": 0.9443382273366762, + "grad_norm": 2.52641154199806, + "learning_rate": 3.2372512059274384e-08, + "loss": 0.9307, + "step": 13089 + }, + { + "epoch": 0.9444103748061037, + "grad_norm": 4.684065310654054, + "learning_rate": 3.228881767247027e-08, + "loss": 0.9601, + "step": 13090 + }, + { + "epoch": 0.9444825222755312, + "grad_norm": 2.0726224859875924, + "learning_rate": 3.220523073508974e-08, + "loss": 0.9056, + "step": 13091 + }, + { + "epoch": 0.9445546697449587, + "grad_norm": 2.2699806175580783, + "learning_rate": 3.2121751251697135e-08, + "loss": 0.9011, + "step": 13092 + }, + { + "epoch": 0.9446268172143862, + "grad_norm": 2.435245547280462, + "learning_rate": 3.203837922685082e-08, + "loss": 0.9371, + "step": 13093 + }, + { + "epoch": 0.9446989646838138, + "grad_norm": 2.3790278160967273, + "learning_rate": 3.1955114665103367e-08, + "loss": 0.9172, + "step": 13094 + }, + { + "epoch": 0.9447711121532413, + "grad_norm": 2.2537970822240845, + "learning_rate": 3.187195757100181e-08, + "loss": 0.8998, + "step": 13095 + }, + { + "epoch": 0.9448432596226687, + "grad_norm": 2.684447306857081, + "learning_rate": 3.178890794908651e-08, + "loss": 0.8878, + "step": 13096 + }, + { + "epoch": 0.9449154070920962, + "grad_norm": 1.7575845528201166, + "learning_rate": 3.1705965803893174e-08, + "loss": 0.7832, + "step": 13097 + }, + { + "epoch": 0.9449875545615237, + "grad_norm": 2.162292748370808, + "learning_rate": 3.162313113995063e-08, + "loss": 0.8926, + "step": 13098 + }, + { + "epoch": 0.9450597020309512, + "grad_norm": 0.6830936019117573, + "learning_rate": 3.1540403961782144e-08, + "loss": 0.8091, + "step": 13099 + }, + { + "epoch": 0.9451318495003788, + "grad_norm": 3.0136962414966013, + "learning_rate": 3.145778427390544e-08, + "loss": 0.9045, + "step": 13100 + }, + { + "epoch": 0.9452039969698063, + "grad_norm": 3.4533283985009398, + "learning_rate": 3.137527208083135e-08, + "loss": 0.9171, + "step": 13101 + }, + { + "epoch": 0.9452761444392338, + "grad_norm": 2.4233062452807625, + "learning_rate": 3.129286738706649e-08, + "loss": 0.8619, + "step": 13102 + }, + { + "epoch": 0.9453482919086613, + "grad_norm": 2.0537507565311794, + "learning_rate": 3.1210570197110155e-08, + "loss": 0.8695, + "step": 13103 + }, + { + "epoch": 0.9454204393780888, + "grad_norm": 6.444883390071623, + "learning_rate": 3.1128380515456296e-08, + "loss": 0.8598, + "step": 13104 + }, + { + "epoch": 0.9454925868475164, + "grad_norm": 3.2890769976298655, + "learning_rate": 3.104629834659311e-08, + "loss": 0.8632, + "step": 13105 + }, + { + "epoch": 0.9455647343169439, + "grad_norm": 2.2278371274509667, + "learning_rate": 3.096432369500301e-08, + "loss": 0.8924, + "step": 13106 + }, + { + "epoch": 0.9456368817863714, + "grad_norm": 2.7278404410787784, + "learning_rate": 3.088245656516153e-08, + "loss": 0.8976, + "step": 13107 + }, + { + "epoch": 0.9457090292557988, + "grad_norm": 2.565649283848278, + "learning_rate": 3.080069696153997e-08, + "loss": 0.8962, + "step": 13108 + }, + { + "epoch": 0.9457811767252263, + "grad_norm": 2.9425511861155713, + "learning_rate": 3.0719044888602776e-08, + "loss": 0.9982, + "step": 13109 + }, + { + "epoch": 0.9458533241946538, + "grad_norm": 3.258080926535211, + "learning_rate": 3.063750035080836e-08, + "loss": 0.9577, + "step": 13110 + }, + { + "epoch": 0.9459254716640814, + "grad_norm": 3.6255108703526076, + "learning_rate": 3.055606335260963e-08, + "loss": 0.9478, + "step": 13111 + }, + { + "epoch": 0.9459976191335089, + "grad_norm": 1.850370192218666, + "learning_rate": 3.04747338984539e-08, + "loss": 0.9981, + "step": 13112 + }, + { + "epoch": 0.9460697666029364, + "grad_norm": 2.428367133323152, + "learning_rate": 3.039351199278184e-08, + "loss": 0.8639, + "step": 13113 + }, + { + "epoch": 0.9461419140723639, + "grad_norm": 2.5528457021918145, + "learning_rate": 3.0312397640028575e-08, + "loss": 0.9192, + "step": 13114 + }, + { + "epoch": 0.9462140615417914, + "grad_norm": 3.139104674432418, + "learning_rate": 3.023139084462412e-08, + "loss": 0.8584, + "step": 13115 + }, + { + "epoch": 0.946286209011219, + "grad_norm": 3.021507573793859, + "learning_rate": 3.015049161099159e-08, + "loss": 0.9398, + "step": 13116 + }, + { + "epoch": 0.9463583564806465, + "grad_norm": 2.2205949017164963, + "learning_rate": 3.006969994354835e-08, + "loss": 0.8955, + "step": 13117 + }, + { + "epoch": 0.946430503950074, + "grad_norm": 2.352327545525752, + "learning_rate": 2.9989015846706654e-08, + "loss": 0.8523, + "step": 13118 + }, + { + "epoch": 0.9465026514195014, + "grad_norm": 2.8276572499281607, + "learning_rate": 2.990843932487186e-08, + "loss": 0.9188, + "step": 13119 + }, + { + "epoch": 0.9465747988889289, + "grad_norm": 0.7640983385913461, + "learning_rate": 2.982797038244422e-08, + "loss": 0.7959, + "step": 13120 + }, + { + "epoch": 0.9466469463583564, + "grad_norm": 3.1070864128911593, + "learning_rate": 2.9747609023817564e-08, + "loss": 0.8959, + "step": 13121 + }, + { + "epoch": 0.946719093827784, + "grad_norm": 2.1961822415941388, + "learning_rate": 2.96673552533806e-08, + "loss": 0.966, + "step": 13122 + }, + { + "epoch": 0.9467912412972115, + "grad_norm": 2.5886096804693874, + "learning_rate": 2.9587209075515597e-08, + "loss": 0.8883, + "step": 13123 + }, + { + "epoch": 0.946863388766639, + "grad_norm": 0.675371828804994, + "learning_rate": 2.950717049459861e-08, + "loss": 0.7676, + "step": 13124 + }, + { + "epoch": 0.9469355362360665, + "grad_norm": 4.524695977382805, + "learning_rate": 2.942723951500059e-08, + "loss": 0.9277, + "step": 13125 + }, + { + "epoch": 0.947007683705494, + "grad_norm": 2.39932801057791, + "learning_rate": 2.9347416141086266e-08, + "loss": 1.0448, + "step": 13126 + }, + { + "epoch": 0.9470798311749216, + "grad_norm": 3.26327473879752, + "learning_rate": 2.926770037721438e-08, + "loss": 0.9733, + "step": 13127 + }, + { + "epoch": 0.9471519786443491, + "grad_norm": 4.17664010731827, + "learning_rate": 2.9188092227738105e-08, + "loss": 0.9659, + "step": 13128 + }, + { + "epoch": 0.9472241261137766, + "grad_norm": 2.9307446009530795, + "learning_rate": 2.9108591697004415e-08, + "loss": 0.8672, + "step": 13129 + }, + { + "epoch": 0.9472962735832041, + "grad_norm": 2.302836667442775, + "learning_rate": 2.9029198789354502e-08, + "loss": 0.9647, + "step": 13130 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 3.3592760648131414, + "learning_rate": 2.894991350912357e-08, + "loss": 0.8858, + "step": 13131 + }, + { + "epoch": 0.947440568522059, + "grad_norm": 2.6945607781831526, + "learning_rate": 2.887073586064126e-08, + "loss": 0.9992, + "step": 13132 + }, + { + "epoch": 0.9475127159914866, + "grad_norm": 2.504802192446524, + "learning_rate": 2.879166584823145e-08, + "loss": 0.7821, + "step": 13133 + }, + { + "epoch": 0.9475848634609141, + "grad_norm": 2.110519211873932, + "learning_rate": 2.8712703476211576e-08, + "loss": 0.9496, + "step": 13134 + }, + { + "epoch": 0.9476570109303416, + "grad_norm": 2.408100160004819, + "learning_rate": 2.8633848748893518e-08, + "loss": 0.9762, + "step": 13135 + }, + { + "epoch": 0.9477291583997691, + "grad_norm": 3.053880511761183, + "learning_rate": 2.8555101670583172e-08, + "loss": 0.9415, + "step": 13136 + }, + { + "epoch": 0.9478013058691966, + "grad_norm": 2.922533583240065, + "learning_rate": 2.8476462245580644e-08, + "loss": 0.8104, + "step": 13137 + }, + { + "epoch": 0.9478734533386242, + "grad_norm": 2.7773974020452923, + "learning_rate": 2.8397930478180064e-08, + "loss": 0.8708, + "step": 13138 + }, + { + "epoch": 0.9479456008080517, + "grad_norm": 3.1457876408439622, + "learning_rate": 2.831950637267022e-08, + "loss": 0.8437, + "step": 13139 + }, + { + "epoch": 0.9480177482774792, + "grad_norm": 4.866947664135057, + "learning_rate": 2.824118993333302e-08, + "loss": 0.9065, + "step": 13140 + }, + { + "epoch": 0.9480898957469067, + "grad_norm": 3.266755852811759, + "learning_rate": 2.8162981164445265e-08, + "loss": 0.9013, + "step": 13141 + }, + { + "epoch": 0.9481620432163342, + "grad_norm": 3.433753312605408, + "learning_rate": 2.808488007027754e-08, + "loss": 0.8583, + "step": 13142 + }, + { + "epoch": 0.9482341906857616, + "grad_norm": 3.1004367582222736, + "learning_rate": 2.8006886655094875e-08, + "loss": 0.9997, + "step": 13143 + }, + { + "epoch": 0.9483063381551892, + "grad_norm": 3.6586855088138943, + "learning_rate": 2.7929000923155866e-08, + "loss": 1.0326, + "step": 13144 + }, + { + "epoch": 0.9483784856246167, + "grad_norm": 2.9090479026671607, + "learning_rate": 2.7851222878713776e-08, + "loss": 0.9576, + "step": 13145 + }, + { + "epoch": 0.9484506330940442, + "grad_norm": 2.052822876972388, + "learning_rate": 2.7773552526015875e-08, + "loss": 1.03, + "step": 13146 + }, + { + "epoch": 0.9485227805634717, + "grad_norm": 3.223434890546311, + "learning_rate": 2.7695989869303437e-08, + "loss": 0.8233, + "step": 13147 + }, + { + "epoch": 0.9485949280328992, + "grad_norm": 3.140799294565037, + "learning_rate": 2.761853491281152e-08, + "loss": 0.965, + "step": 13148 + }, + { + "epoch": 0.9486670755023268, + "grad_norm": 3.2757813431612197, + "learning_rate": 2.7541187660769848e-08, + "loss": 0.8876, + "step": 13149 + }, + { + "epoch": 0.9487392229717543, + "grad_norm": 2.595802506151025, + "learning_rate": 2.74639481174026e-08, + "loss": 0.8816, + "step": 13150 + }, + { + "epoch": 0.9488113704411818, + "grad_norm": 2.667528599279193, + "learning_rate": 2.7386816286926627e-08, + "loss": 0.9648, + "step": 13151 + }, + { + "epoch": 0.9488835179106093, + "grad_norm": 3.9020564648459652, + "learning_rate": 2.7309792173554557e-08, + "loss": 0.8913, + "step": 13152 + }, + { + "epoch": 0.9489556653800368, + "grad_norm": 2.7186322241517944, + "learning_rate": 2.7232875781491916e-08, + "loss": 1.0754, + "step": 13153 + }, + { + "epoch": 0.9490278128494644, + "grad_norm": 7.898912002392051, + "learning_rate": 2.715606711493912e-08, + "loss": 0.9056, + "step": 13154 + }, + { + "epoch": 0.9490999603188918, + "grad_norm": 2.1237394296473995, + "learning_rate": 2.7079366178089923e-08, + "loss": 0.9716, + "step": 13155 + }, + { + "epoch": 0.9491721077883193, + "grad_norm": 2.234870908964616, + "learning_rate": 2.7002772975133426e-08, + "loss": 0.911, + "step": 13156 + }, + { + "epoch": 0.9492442552577468, + "grad_norm": 4.2150439903343955, + "learning_rate": 2.692628751025161e-08, + "loss": 0.8406, + "step": 13157 + }, + { + "epoch": 0.9493164027271743, + "grad_norm": 4.053588483041137, + "learning_rate": 2.684990978762114e-08, + "loss": 0.804, + "step": 13158 + }, + { + "epoch": 0.9493885501966018, + "grad_norm": 0.7548664678504227, + "learning_rate": 2.6773639811412674e-08, + "loss": 0.7914, + "step": 13159 + }, + { + "epoch": 0.9494606976660294, + "grad_norm": 3.8394318182130207, + "learning_rate": 2.6697477585791328e-08, + "loss": 0.9617, + "step": 13160 + }, + { + "epoch": 0.9495328451354569, + "grad_norm": 0.6430494209195587, + "learning_rate": 2.6621423114915554e-08, + "loss": 0.7188, + "step": 13161 + }, + { + "epoch": 0.9496049926048844, + "grad_norm": 2.121243466314932, + "learning_rate": 2.6545476402938695e-08, + "loss": 0.8355, + "step": 13162 + }, + { + "epoch": 0.9496771400743119, + "grad_norm": 6.294385545721946, + "learning_rate": 2.64696374540081e-08, + "loss": 0.9683, + "step": 13163 + }, + { + "epoch": 0.9497492875437394, + "grad_norm": 4.578106876897256, + "learning_rate": 2.6393906272264677e-08, + "loss": 0.8657, + "step": 13164 + }, + { + "epoch": 0.949821435013167, + "grad_norm": 2.035232513228907, + "learning_rate": 2.631828286184401e-08, + "loss": 0.8349, + "step": 13165 + }, + { + "epoch": 0.9498935824825945, + "grad_norm": 2.982703734583497, + "learning_rate": 2.6242767226875463e-08, + "loss": 0.9539, + "step": 13166 + }, + { + "epoch": 0.9499657299520219, + "grad_norm": 2.317994883678935, + "learning_rate": 2.6167359371483065e-08, + "loss": 0.7843, + "step": 13167 + }, + { + "epoch": 0.9500378774214494, + "grad_norm": 3.1474376065688183, + "learning_rate": 2.6092059299783974e-08, + "loss": 0.9185, + "step": 13168 + }, + { + "epoch": 0.9501100248908769, + "grad_norm": 2.8965398534913636, + "learning_rate": 2.6016867015890677e-08, + "loss": 0.8948, + "step": 13169 + }, + { + "epoch": 0.9501821723603044, + "grad_norm": 2.8513032978299275, + "learning_rate": 2.5941782523908772e-08, + "loss": 0.9204, + "step": 13170 + }, + { + "epoch": 0.950254319829732, + "grad_norm": 3.2700502628581494, + "learning_rate": 2.586680582793832e-08, + "loss": 0.819, + "step": 13171 + }, + { + "epoch": 0.9503264672991595, + "grad_norm": 1.7730126229066527, + "learning_rate": 2.579193693207382e-08, + "loss": 0.9013, + "step": 13172 + }, + { + "epoch": 0.950398614768587, + "grad_norm": 2.459720177281331, + "learning_rate": 2.5717175840403337e-08, + "loss": 0.9973, + "step": 13173 + }, + { + "epoch": 0.9504707622380145, + "grad_norm": 1.996086941761567, + "learning_rate": 2.564252255700894e-08, + "loss": 0.8484, + "step": 13174 + }, + { + "epoch": 0.950542909707442, + "grad_norm": 2.6796982656546224, + "learning_rate": 2.5567977085967806e-08, + "loss": 0.8257, + "step": 13175 + }, + { + "epoch": 0.9506150571768696, + "grad_norm": 2.4603946147536653, + "learning_rate": 2.5493539431350465e-08, + "loss": 0.8467, + "step": 13176 + }, + { + "epoch": 0.9506872046462971, + "grad_norm": 2.8405490508381095, + "learning_rate": 2.5419209597221436e-08, + "loss": 0.9326, + "step": 13177 + }, + { + "epoch": 0.9507593521157245, + "grad_norm": 2.8691808439361157, + "learning_rate": 2.53449875876397e-08, + "loss": 0.8533, + "step": 13178 + }, + { + "epoch": 0.950831499585152, + "grad_norm": 2.139008294387722, + "learning_rate": 2.5270873406658233e-08, + "loss": 0.9446, + "step": 13179 + }, + { + "epoch": 0.9509036470545795, + "grad_norm": 2.3341920385192747, + "learning_rate": 2.5196867058324466e-08, + "loss": 0.9023, + "step": 13180 + }, + { + "epoch": 0.950975794524007, + "grad_norm": 2.6480598292953252, + "learning_rate": 2.512296854667917e-08, + "loss": 0.8571, + "step": 13181 + }, + { + "epoch": 0.9510479419934346, + "grad_norm": 3.4206600732327797, + "learning_rate": 2.5049177875757553e-08, + "loss": 0.8739, + "step": 13182 + }, + { + "epoch": 0.9511200894628621, + "grad_norm": 6.291425216117569, + "learning_rate": 2.4975495049589735e-08, + "loss": 0.9098, + "step": 13183 + }, + { + "epoch": 0.9511922369322896, + "grad_norm": 3.380961486578998, + "learning_rate": 2.490192007219871e-08, + "loss": 0.956, + "step": 13184 + }, + { + "epoch": 0.9512643844017171, + "grad_norm": 2.2693881647962058, + "learning_rate": 2.4828452947602163e-08, + "loss": 0.8705, + "step": 13185 + }, + { + "epoch": 0.9513365318711446, + "grad_norm": 3.0359499960036995, + "learning_rate": 2.4755093679811768e-08, + "loss": 0.909, + "step": 13186 + }, + { + "epoch": 0.9514086793405722, + "grad_norm": 7.653585414035791, + "learning_rate": 2.4681842272833873e-08, + "loss": 0.9759, + "step": 13187 + }, + { + "epoch": 0.9514808268099997, + "grad_norm": 4.208666243777674, + "learning_rate": 2.4608698730668175e-08, + "loss": 0.8644, + "step": 13188 + }, + { + "epoch": 0.9515529742794272, + "grad_norm": 0.7694068072241752, + "learning_rate": 2.4535663057308807e-08, + "loss": 0.8655, + "step": 13189 + }, + { + "epoch": 0.9516251217488546, + "grad_norm": 2.9128782100446173, + "learning_rate": 2.4462735256743915e-08, + "loss": 0.8939, + "step": 13190 + }, + { + "epoch": 0.9516972692182821, + "grad_norm": 2.577603836558001, + "learning_rate": 2.4389915332955868e-08, + "loss": 0.8907, + "step": 13191 + }, + { + "epoch": 0.9517694166877096, + "grad_norm": 23.501823192241385, + "learning_rate": 2.431720328992104e-08, + "loss": 0.9417, + "step": 13192 + }, + { + "epoch": 0.9518415641571372, + "grad_norm": 3.8578007379215817, + "learning_rate": 2.424459913161003e-08, + "loss": 0.9242, + "step": 13193 + }, + { + "epoch": 0.9519137116265647, + "grad_norm": 2.8096580557966355, + "learning_rate": 2.4172102861987677e-08, + "loss": 1.023, + "step": 13194 + }, + { + "epoch": 0.9519858590959922, + "grad_norm": 2.425832259521806, + "learning_rate": 2.4099714485012133e-08, + "loss": 0.8609, + "step": 13195 + }, + { + "epoch": 0.9520580065654197, + "grad_norm": 2.3016154876845554, + "learning_rate": 2.4027434004636916e-08, + "loss": 0.8399, + "step": 13196 + }, + { + "epoch": 0.9521301540348472, + "grad_norm": 2.787697765365566, + "learning_rate": 2.395526142480886e-08, + "loss": 0.8447, + "step": 13197 + }, + { + "epoch": 0.9522023015042748, + "grad_norm": 3.018732239096383, + "learning_rate": 2.388319674946859e-08, + "loss": 0.9361, + "step": 13198 + }, + { + "epoch": 0.9522744489737023, + "grad_norm": 0.7898863089516005, + "learning_rate": 2.3811239982551635e-08, + "loss": 0.7859, + "step": 13199 + }, + { + "epoch": 0.9523465964431298, + "grad_norm": 8.16556178293674, + "learning_rate": 2.3739391127987506e-08, + "loss": 0.9875, + "step": 13200 + }, + { + "epoch": 0.9524187439125573, + "grad_norm": 3.3779837523228307, + "learning_rate": 2.3667650189699297e-08, + "loss": 0.9645, + "step": 13201 + }, + { + "epoch": 0.9524908913819847, + "grad_norm": 2.8197958517918402, + "learning_rate": 2.359601717160431e-08, + "loss": 1.0142, + "step": 13202 + }, + { + "epoch": 0.9525630388514122, + "grad_norm": 2.097590212785888, + "learning_rate": 2.3524492077614533e-08, + "loss": 0.85, + "step": 13203 + }, + { + "epoch": 0.9526351863208398, + "grad_norm": 1.8654517880898835, + "learning_rate": 2.345307491163573e-08, + "loss": 0.8986, + "step": 13204 + }, + { + "epoch": 0.9527073337902673, + "grad_norm": 4.181399784341859, + "learning_rate": 2.3381765677567666e-08, + "loss": 0.9168, + "step": 13205 + }, + { + "epoch": 0.9527794812596948, + "grad_norm": 4.418371747134234, + "learning_rate": 2.3310564379303898e-08, + "loss": 0.9172, + "step": 13206 + }, + { + "epoch": 0.9528516287291223, + "grad_norm": 4.886096551866342, + "learning_rate": 2.3239471020733092e-08, + "loss": 0.9428, + "step": 13207 + }, + { + "epoch": 0.9529237761985498, + "grad_norm": 2.6600301435383154, + "learning_rate": 2.316848560573681e-08, + "loss": 1.0121, + "step": 13208 + }, + { + "epoch": 0.9529959236679774, + "grad_norm": 2.0334301326202597, + "learning_rate": 2.3097608138191503e-08, + "loss": 0.958, + "step": 13209 + }, + { + "epoch": 0.9530680711374049, + "grad_norm": 2.6667094127884723, + "learning_rate": 2.3026838621967637e-08, + "loss": 1.0829, + "step": 13210 + }, + { + "epoch": 0.9531402186068324, + "grad_norm": 1.9204079215519034, + "learning_rate": 2.295617706092967e-08, + "loss": 0.9969, + "step": 13211 + }, + { + "epoch": 0.9532123660762599, + "grad_norm": 2.562837811044904, + "learning_rate": 2.2885623458936076e-08, + "loss": 0.9359, + "step": 13212 + }, + { + "epoch": 0.9532845135456874, + "grad_norm": 3.8395755088148253, + "learning_rate": 2.281517781983955e-08, + "loss": 0.9111, + "step": 13213 + }, + { + "epoch": 0.9533566610151148, + "grad_norm": 2.671209062718695, + "learning_rate": 2.274484014748701e-08, + "loss": 0.8766, + "step": 13214 + }, + { + "epoch": 0.9534288084845424, + "grad_norm": 5.736602766237821, + "learning_rate": 2.2674610445719167e-08, + "loss": 0.9226, + "step": 13215 + }, + { + "epoch": 0.9535009559539699, + "grad_norm": 2.4060619330295157, + "learning_rate": 2.260448871837073e-08, + "loss": 0.9051, + "step": 13216 + }, + { + "epoch": 0.9535731034233974, + "grad_norm": 0.7511824188047875, + "learning_rate": 2.2534474969271522e-08, + "loss": 0.8267, + "step": 13217 + }, + { + "epoch": 0.9536452508928249, + "grad_norm": 2.344518611763306, + "learning_rate": 2.2464569202244043e-08, + "loss": 1.0134, + "step": 13218 + }, + { + "epoch": 0.9537173983622524, + "grad_norm": 3.5781676443965353, + "learning_rate": 2.2394771421105907e-08, + "loss": 0.8907, + "step": 13219 + }, + { + "epoch": 0.95378954583168, + "grad_norm": 2.6483021029180778, + "learning_rate": 2.2325081629668506e-08, + "loss": 0.8251, + "step": 13220 + }, + { + "epoch": 0.9538616933011075, + "grad_norm": 3.9576434822603996, + "learning_rate": 2.2255499831737246e-08, + "loss": 0.9629, + "step": 13221 + }, + { + "epoch": 0.953933840770535, + "grad_norm": 4.476450441033095, + "learning_rate": 2.2186026031111748e-08, + "loss": 0.8836, + "step": 13222 + }, + { + "epoch": 0.9540059882399625, + "grad_norm": 22.332239304501364, + "learning_rate": 2.211666023158587e-08, + "loss": 0.8535, + "step": 13223 + }, + { + "epoch": 0.95407813570939, + "grad_norm": 2.24216176726033, + "learning_rate": 2.2047402436947248e-08, + "loss": 0.9181, + "step": 13224 + }, + { + "epoch": 0.9541502831788174, + "grad_norm": 2.1850516650553558, + "learning_rate": 2.1978252650977746e-08, + "loss": 0.855, + "step": 13225 + }, + { + "epoch": 0.954222430648245, + "grad_norm": 2.1301383556508924, + "learning_rate": 2.1909210877453678e-08, + "loss": 0.9637, + "step": 13226 + }, + { + "epoch": 0.9542945781176725, + "grad_norm": 0.7649710061307174, + "learning_rate": 2.184027712014469e-08, + "loss": 0.8476, + "step": 13227 + }, + { + "epoch": 0.9543667255871, + "grad_norm": 2.7410140175751487, + "learning_rate": 2.1771451382815553e-08, + "loss": 1.0854, + "step": 13228 + }, + { + "epoch": 0.9544388730565275, + "grad_norm": 2.313283220604003, + "learning_rate": 2.170273366922415e-08, + "loss": 0.9582, + "step": 13229 + }, + { + "epoch": 0.954511020525955, + "grad_norm": 3.3733034461535687, + "learning_rate": 2.163412398312303e-08, + "loss": 0.9738, + "step": 13230 + }, + { + "epoch": 0.9545831679953826, + "grad_norm": 3.447130203829104, + "learning_rate": 2.156562232825898e-08, + "loss": 0.9385, + "step": 13231 + }, + { + "epoch": 0.9546553154648101, + "grad_norm": 4.096197197637535, + "learning_rate": 2.1497228708372116e-08, + "loss": 0.8424, + "step": 13232 + }, + { + "epoch": 0.9547274629342376, + "grad_norm": 2.764544518396395, + "learning_rate": 2.1428943127197453e-08, + "loss": 0.8987, + "step": 13233 + }, + { + "epoch": 0.9547996104036651, + "grad_norm": 2.220664689160011, + "learning_rate": 2.1360765588464003e-08, + "loss": 0.988, + "step": 13234 + }, + { + "epoch": 0.9548717578730926, + "grad_norm": 2.423151479698885, + "learning_rate": 2.1292696095894125e-08, + "loss": 0.7892, + "step": 13235 + }, + { + "epoch": 0.9549439053425202, + "grad_norm": 2.132424567298287, + "learning_rate": 2.1224734653205512e-08, + "loss": 0.8589, + "step": 13236 + }, + { + "epoch": 0.9550160528119476, + "grad_norm": 2.4617819654168125, + "learning_rate": 2.115688126410875e-08, + "loss": 0.9416, + "step": 13237 + }, + { + "epoch": 0.9550882002813751, + "grad_norm": 5.227596802145864, + "learning_rate": 2.1089135932309542e-08, + "loss": 1.0134, + "step": 13238 + }, + { + "epoch": 0.9551603477508026, + "grad_norm": 3.407136154396705, + "learning_rate": 2.102149866150671e-08, + "loss": 0.9404, + "step": 13239 + }, + { + "epoch": 0.9552324952202301, + "grad_norm": 0.7951726494460577, + "learning_rate": 2.0953969455393738e-08, + "loss": 0.8515, + "step": 13240 + }, + { + "epoch": 0.9553046426896576, + "grad_norm": 2.1549349105457836, + "learning_rate": 2.088654831765879e-08, + "loss": 0.9211, + "step": 13241 + }, + { + "epoch": 0.9553767901590852, + "grad_norm": 5.6851815329415825, + "learning_rate": 2.0819235251982705e-08, + "loss": 1.073, + "step": 13242 + }, + { + "epoch": 0.9554489376285127, + "grad_norm": 2.343971038567835, + "learning_rate": 2.075203026204142e-08, + "loss": 0.9182, + "step": 13243 + }, + { + "epoch": 0.9555210850979402, + "grad_norm": 2.2111503579486724, + "learning_rate": 2.068493335150512e-08, + "loss": 0.8011, + "step": 13244 + }, + { + "epoch": 0.9555932325673677, + "grad_norm": 2.503722185579354, + "learning_rate": 2.0617944524037313e-08, + "loss": 0.8845, + "step": 13245 + }, + { + "epoch": 0.9556653800367952, + "grad_norm": 3.1870653940750633, + "learning_rate": 2.0551063783295965e-08, + "loss": 0.9007, + "step": 13246 + }, + { + "epoch": 0.9557375275062228, + "grad_norm": 2.459707964178465, + "learning_rate": 2.0484291132933483e-08, + "loss": 0.8311, + "step": 13247 + }, + { + "epoch": 0.9558096749756503, + "grad_norm": 2.7186546747442994, + "learning_rate": 2.041762657659607e-08, + "loss": 0.9405, + "step": 13248 + }, + { + "epoch": 0.9558818224450777, + "grad_norm": 2.3538141464916817, + "learning_rate": 2.0351070117923918e-08, + "loss": 0.9664, + "step": 13249 + }, + { + "epoch": 0.9559539699145052, + "grad_norm": 0.7397240965943562, + "learning_rate": 2.0284621760551235e-08, + "loss": 0.7755, + "step": 13250 + }, + { + "epoch": 0.9560261173839327, + "grad_norm": 2.276313038120168, + "learning_rate": 2.0218281508106894e-08, + "loss": 0.9655, + "step": 13251 + }, + { + "epoch": 0.9560982648533602, + "grad_norm": 2.2144930122835422, + "learning_rate": 2.0152049364212888e-08, + "loss": 1.027, + "step": 13252 + }, + { + "epoch": 0.9561704123227878, + "grad_norm": 4.709495108527674, + "learning_rate": 2.0085925332486764e-08, + "loss": 0.9301, + "step": 13253 + }, + { + "epoch": 0.9562425597922153, + "grad_norm": 2.4760184186764937, + "learning_rate": 2.0019909416538528e-08, + "loss": 0.8925, + "step": 13254 + }, + { + "epoch": 0.9563147072616428, + "grad_norm": 3.2056928064973578, + "learning_rate": 1.9954001619973737e-08, + "loss": 0.8503, + "step": 13255 + }, + { + "epoch": 0.9563868547310703, + "grad_norm": 2.1839240136604174, + "learning_rate": 1.988820194639085e-08, + "loss": 0.98, + "step": 13256 + }, + { + "epoch": 0.9564590022004978, + "grad_norm": 3.7671070269071003, + "learning_rate": 1.982251039938321e-08, + "loss": 1.0039, + "step": 13257 + }, + { + "epoch": 0.9565311496699254, + "grad_norm": 2.589727845050882, + "learning_rate": 1.9756926982537945e-08, + "loss": 0.8897, + "step": 13258 + }, + { + "epoch": 0.9566032971393529, + "grad_norm": 2.1947965728410423, + "learning_rate": 1.96914516994362e-08, + "loss": 0.9509, + "step": 13259 + }, + { + "epoch": 0.9566754446087804, + "grad_norm": 3.6211335181034983, + "learning_rate": 1.9626084553653332e-08, + "loss": 0.9198, + "step": 13260 + }, + { + "epoch": 0.9567475920782078, + "grad_norm": 2.121320680734404, + "learning_rate": 1.9560825548759152e-08, + "loss": 0.8194, + "step": 13261 + }, + { + "epoch": 0.9568197395476353, + "grad_norm": 2.3799009451568662, + "learning_rate": 1.949567468831681e-08, + "loss": 0.858, + "step": 13262 + }, + { + "epoch": 0.9568918870170628, + "grad_norm": 2.0210931224254316, + "learning_rate": 1.9430631975884127e-08, + "loss": 0.8531, + "step": 13263 + }, + { + "epoch": 0.9569640344864904, + "grad_norm": 2.9191470543887448, + "learning_rate": 1.9365697415012483e-08, + "loss": 0.876, + "step": 13264 + }, + { + "epoch": 0.9570361819559179, + "grad_norm": 3.678825131581397, + "learning_rate": 1.9300871009248598e-08, + "loss": 0.9536, + "step": 13265 + }, + { + "epoch": 0.9571083294253454, + "grad_norm": 4.486656751541435, + "learning_rate": 1.923615276213142e-08, + "loss": 0.9271, + "step": 13266 + }, + { + "epoch": 0.9571804768947729, + "grad_norm": 3.8117632544894975, + "learning_rate": 1.9171542677195672e-08, + "loss": 0.9356, + "step": 13267 + }, + { + "epoch": 0.9572526243642004, + "grad_norm": 3.4410055145866907, + "learning_rate": 1.9107040757969206e-08, + "loss": 0.9455, + "step": 13268 + }, + { + "epoch": 0.957324771833628, + "grad_norm": 4.042089512259974, + "learning_rate": 1.9042647007974088e-08, + "loss": 0.9274, + "step": 13269 + }, + { + "epoch": 0.9573969193030555, + "grad_norm": 2.799793446960345, + "learning_rate": 1.897836143072662e-08, + "loss": 0.963, + "step": 13270 + }, + { + "epoch": 0.957469066772483, + "grad_norm": 3.871684840522727, + "learning_rate": 1.8914184029737546e-08, + "loss": 0.9087, + "step": 13271 + }, + { + "epoch": 0.9575412142419104, + "grad_norm": 2.544246229653434, + "learning_rate": 1.8850114808511176e-08, + "loss": 0.8879, + "step": 13272 + }, + { + "epoch": 0.9576133617113379, + "grad_norm": 2.3463225424297516, + "learning_rate": 1.8786153770546044e-08, + "loss": 0.9094, + "step": 13273 + }, + { + "epoch": 0.9576855091807654, + "grad_norm": 5.5896458407487675, + "learning_rate": 1.872230091933491e-08, + "loss": 0.8068, + "step": 13274 + }, + { + "epoch": 0.957757656650193, + "grad_norm": 2.7441976195148867, + "learning_rate": 1.8658556258364543e-08, + "loss": 0.9083, + "step": 13275 + }, + { + "epoch": 0.9578298041196205, + "grad_norm": 2.4264419319378243, + "learning_rate": 1.8594919791115494e-08, + "loss": 0.8049, + "step": 13276 + }, + { + "epoch": 0.957901951589048, + "grad_norm": 2.939761812850896, + "learning_rate": 1.853139152106298e-08, + "loss": 0.8095, + "step": 13277 + }, + { + "epoch": 0.9579740990584755, + "grad_norm": 2.2311780773720318, + "learning_rate": 1.8467971451676224e-08, + "loss": 1.0037, + "step": 13278 + }, + { + "epoch": 0.958046246527903, + "grad_norm": 1.9374756965343474, + "learning_rate": 1.8404659586418014e-08, + "loss": 0.9314, + "step": 13279 + }, + { + "epoch": 0.9581183939973306, + "grad_norm": 3.359415595230738, + "learning_rate": 1.8341455928745807e-08, + "loss": 0.8747, + "step": 13280 + }, + { + "epoch": 0.9581905414667581, + "grad_norm": 2.873132181878498, + "learning_rate": 1.8278360482110844e-08, + "loss": 0.8532, + "step": 13281 + }, + { + "epoch": 0.9582626889361856, + "grad_norm": 2.642050273812561, + "learning_rate": 1.8215373249958587e-08, + "loss": 0.9056, + "step": 13282 + }, + { + "epoch": 0.9583348364056131, + "grad_norm": 2.456160592740288, + "learning_rate": 1.8152494235728288e-08, + "loss": 0.8961, + "step": 13283 + }, + { + "epoch": 0.9584069838750405, + "grad_norm": 2.3071286479001176, + "learning_rate": 1.8089723442853865e-08, + "loss": 0.9112, + "step": 13284 + }, + { + "epoch": 0.958479131344468, + "grad_norm": 3.025032982764645, + "learning_rate": 1.8027060874763022e-08, + "loss": 0.9228, + "step": 13285 + }, + { + "epoch": 0.9585512788138956, + "grad_norm": 5.865970246565674, + "learning_rate": 1.796450653487702e-08, + "loss": 0.9402, + "step": 13286 + }, + { + "epoch": 0.9586234262833231, + "grad_norm": 2.037544942722914, + "learning_rate": 1.7902060426612243e-08, + "loss": 1.0074, + "step": 13287 + }, + { + "epoch": 0.9586955737527506, + "grad_norm": 1.9944101538170527, + "learning_rate": 1.7839722553378622e-08, + "loss": 0.9428, + "step": 13288 + }, + { + "epoch": 0.9587677212221781, + "grad_norm": 4.6501588855793745, + "learning_rate": 1.7777492918579884e-08, + "loss": 0.9065, + "step": 13289 + }, + { + "epoch": 0.9588398686916056, + "grad_norm": 3.772276389722978, + "learning_rate": 1.7715371525614198e-08, + "loss": 0.9451, + "step": 13290 + }, + { + "epoch": 0.9589120161610332, + "grad_norm": 3.005342653513382, + "learning_rate": 1.7653358377874184e-08, + "loss": 0.7531, + "step": 13291 + }, + { + "epoch": 0.9589841636304607, + "grad_norm": 2.540720237082414, + "learning_rate": 1.75914534787458e-08, + "loss": 0.9122, + "step": 13292 + }, + { + "epoch": 0.9590563110998882, + "grad_norm": 3.671905777173209, + "learning_rate": 1.7529656831609452e-08, + "loss": 0.9786, + "step": 13293 + }, + { + "epoch": 0.9591284585693157, + "grad_norm": 3.1557113641082593, + "learning_rate": 1.7467968439839554e-08, + "loss": 0.8492, + "step": 13294 + }, + { + "epoch": 0.9592006060387432, + "grad_norm": 0.7097701245229858, + "learning_rate": 1.7406388306804964e-08, + "loss": 0.7896, + "step": 13295 + }, + { + "epoch": 0.9592727535081707, + "grad_norm": 2.5937295016662243, + "learning_rate": 1.734491643586833e-08, + "loss": 0.9601, + "step": 13296 + }, + { + "epoch": 0.9593449009775982, + "grad_norm": 2.5102705748115377, + "learning_rate": 1.7283552830386074e-08, + "loss": 0.9261, + "step": 13297 + }, + { + "epoch": 0.9594170484470257, + "grad_norm": 2.1346973851527475, + "learning_rate": 1.7222297493709292e-08, + "loss": 0.9419, + "step": 13298 + }, + { + "epoch": 0.9594891959164532, + "grad_norm": 2.1013715579280583, + "learning_rate": 1.7161150429182868e-08, + "loss": 0.9322, + "step": 13299 + }, + { + "epoch": 0.9595613433858807, + "grad_norm": 2.2953436281341015, + "learning_rate": 1.7100111640145686e-08, + "loss": 0.9602, + "step": 13300 + }, + { + "epoch": 0.9596334908553082, + "grad_norm": 2.969121648715014, + "learning_rate": 1.7039181129931078e-08, + "loss": 0.9741, + "step": 13301 + }, + { + "epoch": 0.9597056383247358, + "grad_norm": 3.197339298841921, + "learning_rate": 1.6978358901866164e-08, + "loss": 0.8898, + "step": 13302 + }, + { + "epoch": 0.9597777857941633, + "grad_norm": 2.05068277758374, + "learning_rate": 1.691764495927206e-08, + "loss": 0.8962, + "step": 13303 + }, + { + "epoch": 0.9598499332635908, + "grad_norm": 4.16549778119455, + "learning_rate": 1.6857039305464336e-08, + "loss": 0.8891, + "step": 13304 + }, + { + "epoch": 0.9599220807330183, + "grad_norm": 2.592616833341392, + "learning_rate": 1.679654194375235e-08, + "loss": 0.9279, + "step": 13305 + }, + { + "epoch": 0.9599942282024458, + "grad_norm": 4.219424723604978, + "learning_rate": 1.673615287743946e-08, + "loss": 0.8663, + "step": 13306 + }, + { + "epoch": 0.9600663756718734, + "grad_norm": 2.374203196929972, + "learning_rate": 1.6675872109823687e-08, + "loss": 0.9791, + "step": 13307 + }, + { + "epoch": 0.9601385231413008, + "grad_norm": 4.107007871506483, + "learning_rate": 1.66156996441964e-08, + "loss": 0.9787, + "step": 13308 + }, + { + "epoch": 0.9602106706107283, + "grad_norm": 4.342425247278097, + "learning_rate": 1.6555635483843645e-08, + "loss": 0.9036, + "step": 13309 + }, + { + "epoch": 0.9602828180801558, + "grad_norm": 6.562699378481709, + "learning_rate": 1.6495679632045233e-08, + "loss": 0.9153, + "step": 13310 + }, + { + "epoch": 0.9603549655495833, + "grad_norm": 2.6748841055604506, + "learning_rate": 1.6435832092074776e-08, + "loss": 0.9113, + "step": 13311 + }, + { + "epoch": 0.9604271130190108, + "grad_norm": 1.95168183987282, + "learning_rate": 1.6376092867200987e-08, + "loss": 0.8646, + "step": 13312 + }, + { + "epoch": 0.9604992604884384, + "grad_norm": 2.630589030195038, + "learning_rate": 1.6316461960685257e-08, + "loss": 0.9113, + "step": 13313 + }, + { + "epoch": 0.9605714079578659, + "grad_norm": 0.8672745806987029, + "learning_rate": 1.625693937578432e-08, + "loss": 0.8276, + "step": 13314 + }, + { + "epoch": 0.9606435554272934, + "grad_norm": 2.803764084990213, + "learning_rate": 1.6197525115748456e-08, + "loss": 0.8009, + "step": 13315 + }, + { + "epoch": 0.9607157028967209, + "grad_norm": 1.6208061238859497, + "learning_rate": 1.613821918382219e-08, + "loss": 0.9777, + "step": 13316 + }, + { + "epoch": 0.9607878503661484, + "grad_norm": 2.495273126854072, + "learning_rate": 1.6079021583243367e-08, + "loss": 0.8909, + "step": 13317 + }, + { + "epoch": 0.960859997835576, + "grad_norm": 1.871808514823341, + "learning_rate": 1.6019932317244966e-08, + "loss": 0.8829, + "step": 13318 + }, + { + "epoch": 0.9609321453050035, + "grad_norm": 3.168287080135852, + "learning_rate": 1.5960951389053956e-08, + "loss": 0.9916, + "step": 13319 + }, + { + "epoch": 0.9610042927744309, + "grad_norm": 1.9114393047941296, + "learning_rate": 1.590207880189043e-08, + "loss": 0.9819, + "step": 13320 + }, + { + "epoch": 0.9610764402438584, + "grad_norm": 3.609232846057531, + "learning_rate": 1.5843314558969588e-08, + "loss": 0.9515, + "step": 13321 + }, + { + "epoch": 0.9611485877132859, + "grad_norm": 2.437432508267837, + "learning_rate": 1.5784658663500206e-08, + "loss": 0.9341, + "step": 13322 + }, + { + "epoch": 0.9612207351827134, + "grad_norm": 2.409896366692863, + "learning_rate": 1.5726111118685493e-08, + "loss": 0.9567, + "step": 13323 + }, + { + "epoch": 0.961292882652141, + "grad_norm": 1.9124257465982388, + "learning_rate": 1.5667671927722003e-08, + "loss": 0.9746, + "step": 13324 + }, + { + "epoch": 0.9613650301215685, + "grad_norm": 3.567683414270107, + "learning_rate": 1.560934109380141e-08, + "loss": 0.8979, + "step": 13325 + }, + { + "epoch": 0.961437177590996, + "grad_norm": 5.30344777515844, + "learning_rate": 1.5551118620108717e-08, + "loss": 0.8956, + "step": 13326 + }, + { + "epoch": 0.9615093250604235, + "grad_norm": 2.2512097815056418, + "learning_rate": 1.549300450982316e-08, + "loss": 0.9057, + "step": 13327 + }, + { + "epoch": 0.961581472529851, + "grad_norm": 2.7870960302106944, + "learning_rate": 1.5434998766118423e-08, + "loss": 1.0557, + "step": 13328 + }, + { + "epoch": 0.9616536199992786, + "grad_norm": 2.7450740217779708, + "learning_rate": 1.537710139216175e-08, + "loss": 0.8391, + "step": 13329 + }, + { + "epoch": 0.9617257674687061, + "grad_norm": 1.9202434778806727, + "learning_rate": 1.5319312391114393e-08, + "loss": 1.0175, + "step": 13330 + }, + { + "epoch": 0.9617979149381335, + "grad_norm": 2.813184697200659, + "learning_rate": 1.526163176613271e-08, + "loss": 1.0305, + "step": 13331 + }, + { + "epoch": 0.961870062407561, + "grad_norm": 4.755178940993099, + "learning_rate": 1.5204059520366185e-08, + "loss": 0.932, + "step": 13332 + }, + { + "epoch": 0.9619422098769885, + "grad_norm": 2.320281468450428, + "learning_rate": 1.5146595656958304e-08, + "loss": 0.8814, + "step": 13333 + }, + { + "epoch": 0.962014357346416, + "grad_norm": 2.7186858947913333, + "learning_rate": 1.5089240179047003e-08, + "loss": 0.8675, + "step": 13334 + }, + { + "epoch": 0.9620865048158436, + "grad_norm": 3.5502386738236744, + "learning_rate": 1.503199308976466e-08, + "loss": 0.82, + "step": 13335 + }, + { + "epoch": 0.9621586522852711, + "grad_norm": 2.7319770107279444, + "learning_rate": 1.4974854392237003e-08, + "loss": 0.8828, + "step": 13336 + }, + { + "epoch": 0.9622307997546986, + "grad_norm": 0.8331532323450753, + "learning_rate": 1.4917824089584198e-08, + "loss": 0.8064, + "step": 13337 + }, + { + "epoch": 0.9623029472241261, + "grad_norm": 2.3660950610684885, + "learning_rate": 1.4860902184920421e-08, + "loss": 0.9199, + "step": 13338 + }, + { + "epoch": 0.9623750946935536, + "grad_norm": 3.098630504204545, + "learning_rate": 1.4804088681354298e-08, + "loss": 1.0363, + "step": 13339 + }, + { + "epoch": 0.9624472421629812, + "grad_norm": 2.8773300017277776, + "learning_rate": 1.4747383581987794e-08, + "loss": 0.9044, + "step": 13340 + }, + { + "epoch": 0.9625193896324087, + "grad_norm": 3.2607190821658176, + "learning_rate": 1.4690786889917317e-08, + "loss": 0.9151, + "step": 13341 + }, + { + "epoch": 0.9625915371018362, + "grad_norm": 2.3684095998676753, + "learning_rate": 1.463429860823373e-08, + "loss": 0.8041, + "step": 13342 + }, + { + "epoch": 0.9626636845712636, + "grad_norm": 2.1961240524106693, + "learning_rate": 1.4577918740021677e-08, + "loss": 0.8854, + "step": 13343 + }, + { + "epoch": 0.9627358320406911, + "grad_norm": 2.269134749478643, + "learning_rate": 1.4521647288359585e-08, + "loss": 0.9303, + "step": 13344 + }, + { + "epoch": 0.9628079795101186, + "grad_norm": 2.817428339915946, + "learning_rate": 1.4465484256320325e-08, + "loss": 0.9109, + "step": 13345 + }, + { + "epoch": 0.9628801269795462, + "grad_norm": 0.6824469620144509, + "learning_rate": 1.4409429646971006e-08, + "loss": 0.7547, + "step": 13346 + }, + { + "epoch": 0.9629522744489737, + "grad_norm": 3.128508162683072, + "learning_rate": 1.4353483463372062e-08, + "loss": 0.9326, + "step": 13347 + }, + { + "epoch": 0.9630244219184012, + "grad_norm": 2.590269580199696, + "learning_rate": 1.4297645708578832e-08, + "loss": 0.9124, + "step": 13348 + }, + { + "epoch": 0.9630965693878287, + "grad_norm": 2.286592680880623, + "learning_rate": 1.4241916385640428e-08, + "loss": 0.8509, + "step": 13349 + }, + { + "epoch": 0.9631687168572562, + "grad_norm": 2.1914863231805715, + "learning_rate": 1.4186295497599975e-08, + "loss": 0.9985, + "step": 13350 + }, + { + "epoch": 0.9632408643266838, + "grad_norm": 37.07550757196387, + "learning_rate": 1.41307830474946e-08, + "loss": 0.9037, + "step": 13351 + }, + { + "epoch": 0.9633130117961113, + "grad_norm": 2.9332653933662263, + "learning_rate": 1.407537903835565e-08, + "loss": 0.8708, + "step": 13352 + }, + { + "epoch": 0.9633851592655388, + "grad_norm": 1.6255534770034175, + "learning_rate": 1.4020083473208933e-08, + "loss": 0.9589, + "step": 13353 + }, + { + "epoch": 0.9634573067349663, + "grad_norm": 2.5415437282993314, + "learning_rate": 1.396489635507314e-08, + "loss": 0.8892, + "step": 13354 + }, + { + "epoch": 0.9635294542043937, + "grad_norm": 1.6696087459987672, + "learning_rate": 1.3909817686962533e-08, + "loss": 0.7922, + "step": 13355 + }, + { + "epoch": 0.9636016016738213, + "grad_norm": 2.452656306303055, + "learning_rate": 1.38548474718847e-08, + "loss": 0.9118, + "step": 13356 + }, + { + "epoch": 0.9636737491432488, + "grad_norm": 2.4579110079144746, + "learning_rate": 1.3799985712841023e-08, + "loss": 0.972, + "step": 13357 + }, + { + "epoch": 0.9637458966126763, + "grad_norm": 2.3166255750602414, + "learning_rate": 1.3745232412827323e-08, + "loss": 0.9708, + "step": 13358 + }, + { + "epoch": 0.9638180440821038, + "grad_norm": 2.4293863287706476, + "learning_rate": 1.3690587574833878e-08, + "loss": 0.9184, + "step": 13359 + }, + { + "epoch": 0.9638901915515313, + "grad_norm": 8.364406433388455, + "learning_rate": 1.36360512018443e-08, + "loss": 0.7286, + "step": 13360 + }, + { + "epoch": 0.9639623390209588, + "grad_norm": 0.7988932613314457, + "learning_rate": 1.3581623296836652e-08, + "loss": 0.8037, + "step": 13361 + }, + { + "epoch": 0.9640344864903864, + "grad_norm": 2.3387122371689277, + "learning_rate": 1.3527303862783002e-08, + "loss": 0.9505, + "step": 13362 + }, + { + "epoch": 0.9641066339598139, + "grad_norm": 4.086827372699165, + "learning_rate": 1.3473092902649862e-08, + "loss": 1.0041, + "step": 13363 + }, + { + "epoch": 0.9641787814292414, + "grad_norm": 2.844716337573883, + "learning_rate": 1.3418990419397092e-08, + "loss": 0.8423, + "step": 13364 + }, + { + "epoch": 0.9642509288986689, + "grad_norm": 2.488070732086407, + "learning_rate": 1.3364996415979213e-08, + "loss": 0.9077, + "step": 13365 + }, + { + "epoch": 0.9643230763680964, + "grad_norm": 2.800318979077655, + "learning_rate": 1.3311110895344535e-08, + "loss": 0.8709, + "step": 13366 + }, + { + "epoch": 0.9643952238375239, + "grad_norm": 2.2397091024407407, + "learning_rate": 1.3257333860435594e-08, + "loss": 0.9178, + "step": 13367 + }, + { + "epoch": 0.9644673713069514, + "grad_norm": 2.8623963116519167, + "learning_rate": 1.320366531418915e-08, + "loss": 0.9054, + "step": 13368 + }, + { + "epoch": 0.9645395187763789, + "grad_norm": 2.774120579805876, + "learning_rate": 1.3150105259535526e-08, + "loss": 1.035, + "step": 13369 + }, + { + "epoch": 0.9646116662458064, + "grad_norm": 0.6809987322239551, + "learning_rate": 1.3096653699399719e-08, + "loss": 0.7681, + "step": 13370 + }, + { + "epoch": 0.9646838137152339, + "grad_norm": 2.8803124480833646, + "learning_rate": 1.3043310636700278e-08, + "loss": 0.8469, + "step": 13371 + }, + { + "epoch": 0.9647559611846614, + "grad_norm": 2.2342743750902248, + "learning_rate": 1.299007607435021e-08, + "loss": 0.9659, + "step": 13372 + }, + { + "epoch": 0.964828108654089, + "grad_norm": 25.278451734042424, + "learning_rate": 1.293695001525652e-08, + "loss": 0.9123, + "step": 13373 + }, + { + "epoch": 0.9649002561235165, + "grad_norm": 2.359463014287318, + "learning_rate": 1.2883932462320002e-08, + "loss": 0.9681, + "step": 13374 + }, + { + "epoch": 0.964972403592944, + "grad_norm": 2.63804599841593, + "learning_rate": 1.2831023418435894e-08, + "loss": 0.921, + "step": 13375 + }, + { + "epoch": 0.9650445510623715, + "grad_norm": 0.7201958501865204, + "learning_rate": 1.277822288649344e-08, + "loss": 0.7876, + "step": 13376 + }, + { + "epoch": 0.965116698531799, + "grad_norm": 3.7064657473368943, + "learning_rate": 1.272553086937589e-08, + "loss": 0.996, + "step": 13377 + }, + { + "epoch": 0.9651888460012265, + "grad_norm": 2.719860595468628, + "learning_rate": 1.2672947369960053e-08, + "loss": 0.9678, + "step": 13378 + }, + { + "epoch": 0.965260993470654, + "grad_norm": 28.694154320828137, + "learning_rate": 1.2620472391118075e-08, + "loss": 0.9636, + "step": 13379 + }, + { + "epoch": 0.9653331409400815, + "grad_norm": 2.589611750779524, + "learning_rate": 1.2568105935715001e-08, + "loss": 0.9909, + "step": 13380 + }, + { + "epoch": 0.965405288409509, + "grad_norm": 2.531220235767328, + "learning_rate": 1.251584800661054e-08, + "loss": 0.9435, + "step": 13381 + }, + { + "epoch": 0.9654774358789365, + "grad_norm": 2.505703428415481, + "learning_rate": 1.2463698606657968e-08, + "loss": 0.9704, + "step": 13382 + }, + { + "epoch": 0.965549583348364, + "grad_norm": 2.5583384146210184, + "learning_rate": 1.241165773870545e-08, + "loss": 0.9545, + "step": 13383 + }, + { + "epoch": 0.9656217308177916, + "grad_norm": 3.2502815418072912, + "learning_rate": 1.2359725405594268e-08, + "loss": 0.9352, + "step": 13384 + }, + { + "epoch": 0.9656938782872191, + "grad_norm": 2.401300665952863, + "learning_rate": 1.2307901610160599e-08, + "loss": 0.8812, + "step": 13385 + }, + { + "epoch": 0.9657660257566466, + "grad_norm": 2.304724431146103, + "learning_rate": 1.2256186355234177e-08, + "loss": 0.8801, + "step": 13386 + }, + { + "epoch": 0.9658381732260741, + "grad_norm": 2.6920879148550467, + "learning_rate": 1.220457964363919e-08, + "loss": 0.9798, + "step": 13387 + }, + { + "epoch": 0.9659103206955016, + "grad_norm": 3.1745628656749827, + "learning_rate": 1.2153081478193606e-08, + "loss": 0.8563, + "step": 13388 + }, + { + "epoch": 0.9659824681649292, + "grad_norm": 2.239171036021668, + "learning_rate": 1.2101691861709173e-08, + "loss": 0.9012, + "step": 13389 + }, + { + "epoch": 0.9660546156343566, + "grad_norm": 2.2878549144133937, + "learning_rate": 1.205041079699276e-08, + "loss": 0.9904, + "step": 13390 + }, + { + "epoch": 0.9661267631037841, + "grad_norm": 3.2700991120323324, + "learning_rate": 1.1999238286843904e-08, + "loss": 0.9487, + "step": 13391 + }, + { + "epoch": 0.9661989105732116, + "grad_norm": 4.326699362946233, + "learning_rate": 1.194817433405748e-08, + "loss": 0.8696, + "step": 13392 + }, + { + "epoch": 0.9662710580426391, + "grad_norm": 2.5410096199967476, + "learning_rate": 1.1897218941421927e-08, + "loss": 0.9643, + "step": 13393 + }, + { + "epoch": 0.9663432055120666, + "grad_norm": 2.7610294933353443, + "learning_rate": 1.1846372111719239e-08, + "loss": 0.8496, + "step": 13394 + }, + { + "epoch": 0.9664153529814942, + "grad_norm": 2.9356579788106596, + "learning_rate": 1.1795633847726527e-08, + "loss": 0.9531, + "step": 13395 + }, + { + "epoch": 0.9664875004509217, + "grad_norm": 2.1907656680231047, + "learning_rate": 1.1745004152213799e-08, + "loss": 0.8369, + "step": 13396 + }, + { + "epoch": 0.9665596479203492, + "grad_norm": 3.071523956389214, + "learning_rate": 1.1694483027946622e-08, + "loss": 0.9594, + "step": 13397 + }, + { + "epoch": 0.9666317953897767, + "grad_norm": 4.098888870031984, + "learning_rate": 1.164407047768301e-08, + "loss": 0.9596, + "step": 13398 + }, + { + "epoch": 0.9667039428592042, + "grad_norm": 3.035067484575144, + "learning_rate": 1.1593766504176095e-08, + "loss": 0.9556, + "step": 13399 + }, + { + "epoch": 0.9667760903286318, + "grad_norm": 2.4519724476994726, + "learning_rate": 1.154357111017279e-08, + "loss": 0.8312, + "step": 13400 + }, + { + "epoch": 0.9668482377980593, + "grad_norm": 2.7574090851696407, + "learning_rate": 1.1493484298414013e-08, + "loss": 0.9395, + "step": 13401 + }, + { + "epoch": 0.9669203852674867, + "grad_norm": 2.4522121213044183, + "learning_rate": 1.144350607163469e-08, + "loss": 0.8126, + "step": 13402 + }, + { + "epoch": 0.9669925327369142, + "grad_norm": 2.7674314989056645, + "learning_rate": 1.139363643256419e-08, + "loss": 0.8162, + "step": 13403 + }, + { + "epoch": 0.9670646802063417, + "grad_norm": 4.372373173806246, + "learning_rate": 1.1343875383925894e-08, + "loss": 0.9361, + "step": 13404 + }, + { + "epoch": 0.9671368276757693, + "grad_norm": 3.1877269290103865, + "learning_rate": 1.1294222928436736e-08, + "loss": 0.9136, + "step": 13405 + }, + { + "epoch": 0.9672089751451968, + "grad_norm": 6.073928914407493, + "learning_rate": 1.1244679068807884e-08, + "loss": 0.9638, + "step": 13406 + }, + { + "epoch": 0.9672811226146243, + "grad_norm": 2.6648536022119345, + "learning_rate": 1.1195243807745392e-08, + "loss": 1.0248, + "step": 13407 + }, + { + "epoch": 0.9673532700840518, + "grad_norm": 2.622022666000992, + "learning_rate": 1.1145917147947992e-08, + "loss": 0.9485, + "step": 13408 + }, + { + "epoch": 0.9674254175534793, + "grad_norm": 3.4931908175345243, + "learning_rate": 1.109669909210953e-08, + "loss": 0.9435, + "step": 13409 + }, + { + "epoch": 0.9674975650229068, + "grad_norm": 3.1489269548207806, + "learning_rate": 1.1047589642917853e-08, + "loss": 0.9683, + "step": 13410 + }, + { + "epoch": 0.9675697124923344, + "grad_norm": 2.203686040771299, + "learning_rate": 1.0998588803054375e-08, + "loss": 0.9222, + "step": 13411 + }, + { + "epoch": 0.9676418599617619, + "grad_norm": 2.5477883976994447, + "learning_rate": 1.0949696575194956e-08, + "loss": 0.9684, + "step": 13412 + }, + { + "epoch": 0.9677140074311894, + "grad_norm": 4.0377563489663615, + "learning_rate": 1.0900912962009457e-08, + "loss": 0.954, + "step": 13413 + }, + { + "epoch": 0.9677861549006168, + "grad_norm": 5.493019182117192, + "learning_rate": 1.0852237966161748e-08, + "loss": 0.8182, + "step": 13414 + }, + { + "epoch": 0.9678583023700443, + "grad_norm": 3.8069921011255365, + "learning_rate": 1.0803671590309481e-08, + "loss": 0.8185, + "step": 13415 + }, + { + "epoch": 0.9679304498394719, + "grad_norm": 2.116926451633012, + "learning_rate": 1.0755213837105204e-08, + "loss": 1.0516, + "step": 13416 + }, + { + "epoch": 0.9680025973088994, + "grad_norm": 8.055235909737432, + "learning_rate": 1.0706864709194797e-08, + "loss": 0.901, + "step": 13417 + }, + { + "epoch": 0.9680747447783269, + "grad_norm": 2.3929689173833806, + "learning_rate": 1.0658624209218148e-08, + "loss": 0.9478, + "step": 13418 + }, + { + "epoch": 0.9681468922477544, + "grad_norm": 4.646944514705755, + "learning_rate": 1.0610492339810039e-08, + "loss": 0.8683, + "step": 13419 + }, + { + "epoch": 0.9682190397171819, + "grad_norm": 3.4982582935743913, + "learning_rate": 1.0562469103598371e-08, + "loss": 0.9705, + "step": 13420 + }, + { + "epoch": 0.9682911871866094, + "grad_norm": 2.372047697472218, + "learning_rate": 1.0514554503205486e-08, + "loss": 0.9643, + "step": 13421 + }, + { + "epoch": 0.968363334656037, + "grad_norm": 2.938551369675487, + "learning_rate": 1.0466748541247961e-08, + "loss": 0.8738, + "step": 13422 + }, + { + "epoch": 0.9684354821254645, + "grad_norm": 4.127616514875391, + "learning_rate": 1.0419051220336373e-08, + "loss": 0.8927, + "step": 13423 + }, + { + "epoch": 0.968507629594892, + "grad_norm": 2.344086686428992, + "learning_rate": 1.0371462543075304e-08, + "loss": 0.9772, + "step": 13424 + }, + { + "epoch": 0.9685797770643194, + "grad_norm": 2.7606673120870835, + "learning_rate": 1.0323982512063124e-08, + "loss": 0.9575, + "step": 13425 + }, + { + "epoch": 0.9686519245337469, + "grad_norm": 2.5061056918410243, + "learning_rate": 1.0276611129892865e-08, + "loss": 1.0153, + "step": 13426 + }, + { + "epoch": 0.9687240720031745, + "grad_norm": 2.774229984158646, + "learning_rate": 1.0229348399151128e-08, + "loss": 0.8435, + "step": 13427 + }, + { + "epoch": 0.968796219472602, + "grad_norm": 2.3814440190128634, + "learning_rate": 1.0182194322418736e-08, + "loss": 0.9443, + "step": 13428 + }, + { + "epoch": 0.9688683669420295, + "grad_norm": 2.8055846923053744, + "learning_rate": 1.013514890227074e-08, + "loss": 0.8453, + "step": 13429 + }, + { + "epoch": 0.968940514411457, + "grad_norm": 2.189145804214926, + "learning_rate": 1.0088212141276197e-08, + "loss": 0.8773, + "step": 13430 + }, + { + "epoch": 0.9690126618808845, + "grad_norm": 6.126425129664507, + "learning_rate": 1.0041384041997725e-08, + "loss": 0.9903, + "step": 13431 + }, + { + "epoch": 0.969084809350312, + "grad_norm": 2.2446038919199784, + "learning_rate": 9.994664606992831e-09, + "loss": 0.9029, + "step": 13432 + }, + { + "epoch": 0.9691569568197396, + "grad_norm": 3.221542184328828, + "learning_rate": 9.948053838812365e-09, + "loss": 0.8883, + "step": 13433 + }, + { + "epoch": 0.9692291042891671, + "grad_norm": 2.5010018248736325, + "learning_rate": 9.901551740002068e-09, + "loss": 1.0429, + "step": 13434 + }, + { + "epoch": 0.9693012517585946, + "grad_norm": 2.398940043035283, + "learning_rate": 9.855158313100798e-09, + "loss": 0.9744, + "step": 13435 + }, + { + "epoch": 0.9693733992280221, + "grad_norm": 3.120271691450409, + "learning_rate": 9.808873560642083e-09, + "loss": 0.8427, + "step": 13436 + }, + { + "epoch": 0.9694455466974495, + "grad_norm": 2.697053650986808, + "learning_rate": 9.762697485153237e-09, + "loss": 0.9337, + "step": 13437 + }, + { + "epoch": 0.969517694166877, + "grad_norm": 2.5738026770397364, + "learning_rate": 9.71663008915602e-09, + "loss": 0.9574, + "step": 13438 + }, + { + "epoch": 0.9695898416363046, + "grad_norm": 2.4323737207549816, + "learning_rate": 9.67067137516575e-09, + "loss": 0.9248, + "step": 13439 + }, + { + "epoch": 0.9696619891057321, + "grad_norm": 2.678319019959079, + "learning_rate": 9.624821345692202e-09, + "loss": 0.8681, + "step": 13440 + }, + { + "epoch": 0.9697341365751596, + "grad_norm": 3.2036595340477763, + "learning_rate": 9.579080003239148e-09, + "loss": 0.9187, + "step": 13441 + }, + { + "epoch": 0.9698062840445871, + "grad_norm": 2.2474055697044912, + "learning_rate": 9.533447350304147e-09, + "loss": 0.9462, + "step": 13442 + }, + { + "epoch": 0.9698784315140146, + "grad_norm": 1.9640869139107637, + "learning_rate": 9.487923389378982e-09, + "loss": 0.8552, + "step": 13443 + }, + { + "epoch": 0.9699505789834422, + "grad_norm": 2.5031535286295616, + "learning_rate": 9.442508122949889e-09, + "loss": 0.8497, + "step": 13444 + }, + { + "epoch": 0.9700227264528697, + "grad_norm": 2.7346515624647356, + "learning_rate": 9.397201553496658e-09, + "loss": 0.9412, + "step": 13445 + }, + { + "epoch": 0.9700948739222972, + "grad_norm": 3.283792727584123, + "learning_rate": 9.352003683493093e-09, + "loss": 0.7988, + "step": 13446 + }, + { + "epoch": 0.9701670213917247, + "grad_norm": 3.3575384753331248, + "learning_rate": 9.30691451540766e-09, + "loss": 0.8292, + "step": 13447 + }, + { + "epoch": 0.9702391688611522, + "grad_norm": 3.0151249762549686, + "learning_rate": 9.26193405170239e-09, + "loss": 0.8299, + "step": 13448 + }, + { + "epoch": 0.9703113163305797, + "grad_norm": 3.763626391339399, + "learning_rate": 9.217062294833321e-09, + "loss": 0.8986, + "step": 13449 + }, + { + "epoch": 0.9703834638000072, + "grad_norm": 2.4755475103341547, + "learning_rate": 9.172299247250714e-09, + "loss": 0.8914, + "step": 13450 + }, + { + "epoch": 0.9704556112694347, + "grad_norm": 2.567086650704043, + "learning_rate": 9.127644911399278e-09, + "loss": 0.8757, + "step": 13451 + }, + { + "epoch": 0.9705277587388622, + "grad_norm": 1.602966288839768, + "learning_rate": 9.08309928971729e-09, + "loss": 0.8838, + "step": 13452 + }, + { + "epoch": 0.9705999062082897, + "grad_norm": 2.5425739128299436, + "learning_rate": 9.038662384636797e-09, + "loss": 0.8829, + "step": 13453 + }, + { + "epoch": 0.9706720536777173, + "grad_norm": 2.285609012972399, + "learning_rate": 8.994334198584974e-09, + "loss": 0.8484, + "step": 13454 + }, + { + "epoch": 0.9707442011471448, + "grad_norm": 3.404868869581172, + "learning_rate": 8.950114733981884e-09, + "loss": 0.8651, + "step": 13455 + }, + { + "epoch": 0.9708163486165723, + "grad_norm": 2.896468406216517, + "learning_rate": 8.906003993242484e-09, + "loss": 0.8156, + "step": 13456 + }, + { + "epoch": 0.9708884960859998, + "grad_norm": 15.500833304370536, + "learning_rate": 8.862001978775512e-09, + "loss": 0.9914, + "step": 13457 + }, + { + "epoch": 0.9709606435554273, + "grad_norm": 4.3667568478966565, + "learning_rate": 8.818108692983495e-09, + "loss": 0.9678, + "step": 13458 + }, + { + "epoch": 0.9710327910248548, + "grad_norm": 2.509985911873115, + "learning_rate": 8.774324138263622e-09, + "loss": 0.9605, + "step": 13459 + }, + { + "epoch": 0.9711049384942824, + "grad_norm": 2.3808989298588212, + "learning_rate": 8.73064831700665e-09, + "loss": 0.8575, + "step": 13460 + }, + { + "epoch": 0.9711770859637098, + "grad_norm": 2.3578374033447624, + "learning_rate": 8.68708123159756e-09, + "loss": 0.8485, + "step": 13461 + }, + { + "epoch": 0.9712492334331373, + "grad_norm": 3.120812013058398, + "learning_rate": 8.643622884415335e-09, + "loss": 0.93, + "step": 13462 + }, + { + "epoch": 0.9713213809025648, + "grad_norm": 2.9882667291824485, + "learning_rate": 8.60027327783297e-09, + "loss": 0.8736, + "step": 13463 + }, + { + "epoch": 0.9713935283719923, + "grad_norm": 2.523290671219311, + "learning_rate": 8.557032414217902e-09, + "loss": 0.8498, + "step": 13464 + }, + { + "epoch": 0.9714656758414199, + "grad_norm": 2.1299388935833745, + "learning_rate": 8.51390029593113e-09, + "loss": 0.8632, + "step": 13465 + }, + { + "epoch": 0.9715378233108474, + "grad_norm": 2.119012137881877, + "learning_rate": 8.470876925328107e-09, + "loss": 0.9248, + "step": 13466 + }, + { + "epoch": 0.9716099707802749, + "grad_norm": 2.593266269663325, + "learning_rate": 8.427962304758063e-09, + "loss": 0.918, + "step": 13467 + }, + { + "epoch": 0.9716821182497024, + "grad_norm": 5.178688010420916, + "learning_rate": 8.385156436564677e-09, + "loss": 0.8722, + "step": 13468 + }, + { + "epoch": 0.9717542657191299, + "grad_norm": 2.403172295827966, + "learning_rate": 8.342459323084972e-09, + "loss": 0.966, + "step": 13469 + }, + { + "epoch": 0.9718264131885574, + "grad_norm": 2.9986858668591125, + "learning_rate": 8.299870966650635e-09, + "loss": 0.9314, + "step": 13470 + }, + { + "epoch": 0.971898560657985, + "grad_norm": 0.7916396705307718, + "learning_rate": 8.257391369587585e-09, + "loss": 0.7914, + "step": 13471 + }, + { + "epoch": 0.9719707081274125, + "grad_norm": 2.314832181512183, + "learning_rate": 8.215020534215078e-09, + "loss": 0.8328, + "step": 13472 + }, + { + "epoch": 0.9720428555968399, + "grad_norm": 5.216553734155158, + "learning_rate": 8.172758462846818e-09, + "loss": 0.9242, + "step": 13473 + }, + { + "epoch": 0.9721150030662674, + "grad_norm": 2.962386368129517, + "learning_rate": 8.130605157790737e-09, + "loss": 0.8156, + "step": 13474 + }, + { + "epoch": 0.9721871505356949, + "grad_norm": 2.4292506962981584, + "learning_rate": 8.088560621348995e-09, + "loss": 0.9287, + "step": 13475 + }, + { + "epoch": 0.9722592980051225, + "grad_norm": 3.319783965286277, + "learning_rate": 8.046624855816864e-09, + "loss": 0.8391, + "step": 13476 + }, + { + "epoch": 0.97233144547455, + "grad_norm": 3.809509730596866, + "learning_rate": 8.004797863484514e-09, + "loss": 0.901, + "step": 13477 + }, + { + "epoch": 0.9724035929439775, + "grad_norm": 2.945998863163021, + "learning_rate": 7.963079646636339e-09, + "loss": 0.8447, + "step": 13478 + }, + { + "epoch": 0.972475740413405, + "grad_norm": 3.2736018100916238, + "learning_rate": 7.921470207549852e-09, + "loss": 0.998, + "step": 13479 + }, + { + "epoch": 0.9725478878828325, + "grad_norm": 3.8947482034292324, + "learning_rate": 7.879969548497677e-09, + "loss": 1.0301, + "step": 13480 + }, + { + "epoch": 0.97262003535226, + "grad_norm": 3.230635688278457, + "learning_rate": 7.838577671745783e-09, + "loss": 0.9581, + "step": 13481 + }, + { + "epoch": 0.9726921828216876, + "grad_norm": 2.8161866503217934, + "learning_rate": 7.797294579554137e-09, + "loss": 0.8753, + "step": 13482 + }, + { + "epoch": 0.9727643302911151, + "grad_norm": 1.9880269605585315, + "learning_rate": 7.756120274177602e-09, + "loss": 0.9407, + "step": 13483 + }, + { + "epoch": 0.9728364777605425, + "grad_norm": 2.465196877407292, + "learning_rate": 7.715054757864158e-09, + "loss": 0.8298, + "step": 13484 + }, + { + "epoch": 0.97290862522997, + "grad_norm": 3.924530710726886, + "learning_rate": 7.674098032856679e-09, + "loss": 0.9168, + "step": 13485 + }, + { + "epoch": 0.9729807726993975, + "grad_norm": 2.9633378337000633, + "learning_rate": 7.633250101391154e-09, + "loss": 0.9299, + "step": 13486 + }, + { + "epoch": 0.973052920168825, + "grad_norm": 29.996257294205158, + "learning_rate": 7.59251096569824e-09, + "loss": 0.8746, + "step": 13487 + }, + { + "epoch": 0.9731250676382526, + "grad_norm": 3.282399003624352, + "learning_rate": 7.551880628002827e-09, + "loss": 0.9102, + "step": 13488 + }, + { + "epoch": 0.9731972151076801, + "grad_norm": 4.133435726948403, + "learning_rate": 7.511359090523361e-09, + "loss": 0.9411, + "step": 13489 + }, + { + "epoch": 0.9732693625771076, + "grad_norm": 3.386324216044145, + "learning_rate": 7.470946355472518e-09, + "loss": 0.9411, + "step": 13490 + }, + { + "epoch": 0.9733415100465351, + "grad_norm": 3.7150878826030302, + "learning_rate": 7.430642425057421e-09, + "loss": 0.8564, + "step": 13491 + }, + { + "epoch": 0.9734136575159626, + "grad_norm": 6.537469412329107, + "learning_rate": 7.390447301478753e-09, + "loss": 0.9073, + "step": 13492 + }, + { + "epoch": 0.9734858049853902, + "grad_norm": 2.153762820355676, + "learning_rate": 7.350360986931204e-09, + "loss": 0.885, + "step": 13493 + }, + { + "epoch": 0.9735579524548177, + "grad_norm": 2.4718322827377213, + "learning_rate": 7.31038348360391e-09, + "loss": 0.9319, + "step": 13494 + }, + { + "epoch": 0.9736300999242452, + "grad_norm": 2.407264990819031, + "learning_rate": 7.270514793679794e-09, + "loss": 0.9562, + "step": 13495 + }, + { + "epoch": 0.9737022473936726, + "grad_norm": 2.0133709983376193, + "learning_rate": 7.230754919336224e-09, + "loss": 0.9447, + "step": 13496 + }, + { + "epoch": 0.9737743948631001, + "grad_norm": 2.4060077297565345, + "learning_rate": 7.19110386274413e-09, + "loss": 0.944, + "step": 13497 + }, + { + "epoch": 0.9738465423325277, + "grad_norm": 2.5274622327810885, + "learning_rate": 7.151561626068669e-09, + "loss": 0.9139, + "step": 13498 + }, + { + "epoch": 0.9739186898019552, + "grad_norm": 2.397643315046365, + "learning_rate": 7.112128211469226e-09, + "loss": 0.8951, + "step": 13499 + }, + { + "epoch": 0.9739908372713827, + "grad_norm": 2.424072745103469, + "learning_rate": 7.072803621098966e-09, + "loss": 1.0059, + "step": 13500 + }, + { + "epoch": 0.9740629847408102, + "grad_norm": 2.214932448345191, + "learning_rate": 7.033587857105283e-09, + "loss": 0.8301, + "step": 13501 + }, + { + "epoch": 0.9741351322102377, + "grad_norm": 2.7359850693624352, + "learning_rate": 6.994480921629798e-09, + "loss": 0.9641, + "step": 13502 + }, + { + "epoch": 0.9742072796796653, + "grad_norm": 7.957411175039441, + "learning_rate": 6.95548281680769e-09, + "loss": 0.7552, + "step": 13503 + }, + { + "epoch": 0.9742794271490928, + "grad_norm": 2.086219339939686, + "learning_rate": 6.916593544768812e-09, + "loss": 0.7905, + "step": 13504 + }, + { + "epoch": 0.9743515746185203, + "grad_norm": 2.255639320638832, + "learning_rate": 6.8778131076365765e-09, + "loss": 0.8981, + "step": 13505 + }, + { + "epoch": 0.9744237220879478, + "grad_norm": 2.461415756014473, + "learning_rate": 6.839141507528623e-09, + "loss": 0.844, + "step": 13506 + }, + { + "epoch": 0.9744958695573753, + "grad_norm": 2.147917306519446, + "learning_rate": 6.800578746556817e-09, + "loss": 0.9279, + "step": 13507 + }, + { + "epoch": 0.9745680170268027, + "grad_norm": 3.4257186135771835, + "learning_rate": 6.7621248268268095e-09, + "loss": 0.8681, + "step": 13508 + }, + { + "epoch": 0.9746401644962303, + "grad_norm": 7.226246297980857, + "learning_rate": 6.723779750438474e-09, + "loss": 1.0034, + "step": 13509 + }, + { + "epoch": 0.9747123119656578, + "grad_norm": 6.191400396783511, + "learning_rate": 6.6855435194854705e-09, + "loss": 0.8671, + "step": 13510 + }, + { + "epoch": 0.9747844594350853, + "grad_norm": 2.868974258865818, + "learning_rate": 6.647416136056127e-09, + "loss": 0.7541, + "step": 13511 + }, + { + "epoch": 0.9748566069045128, + "grad_norm": 2.5776807922537945, + "learning_rate": 6.609397602232336e-09, + "loss": 0.7942, + "step": 13512 + }, + { + "epoch": 0.9749287543739403, + "grad_norm": 5.2100853173121715, + "learning_rate": 6.571487920089769e-09, + "loss": 0.9219, + "step": 13513 + }, + { + "epoch": 0.9750009018433679, + "grad_norm": 1.9415840052198519, + "learning_rate": 6.533687091698991e-09, + "loss": 0.9046, + "step": 13514 + }, + { + "epoch": 0.9750730493127954, + "grad_norm": 2.0430535906066134, + "learning_rate": 6.495995119124131e-09, + "loss": 0.9493, + "step": 13515 + }, + { + "epoch": 0.9751451967822229, + "grad_norm": 2.3140442305931312, + "learning_rate": 6.458412004423097e-09, + "loss": 0.8465, + "step": 13516 + }, + { + "epoch": 0.9752173442516504, + "grad_norm": 2.2778651587723697, + "learning_rate": 6.42093774964847e-09, + "loss": 0.8742, + "step": 13517 + }, + { + "epoch": 0.9752894917210779, + "grad_norm": 2.2044163159315695, + "learning_rate": 6.383572356846389e-09, + "loss": 0.8517, + "step": 13518 + }, + { + "epoch": 0.9753616391905054, + "grad_norm": 2.3441920562935006, + "learning_rate": 6.346315828057225e-09, + "loss": 1.0425, + "step": 13519 + }, + { + "epoch": 0.9754337866599329, + "grad_norm": 2.2989775707118123, + "learning_rate": 6.30916816531557e-09, + "loss": 0.8162, + "step": 13520 + }, + { + "epoch": 0.9755059341293604, + "grad_norm": 2.427214612125477, + "learning_rate": 6.272129370650025e-09, + "loss": 0.9257, + "step": 13521 + }, + { + "epoch": 0.9755780815987879, + "grad_norm": 2.5623700178721243, + "learning_rate": 6.2351994460827505e-09, + "loss": 0.9553, + "step": 13522 + }, + { + "epoch": 0.9756502290682154, + "grad_norm": 2.6215100476561517, + "learning_rate": 6.198378393630576e-09, + "loss": 0.9464, + "step": 13523 + }, + { + "epoch": 0.9757223765376429, + "grad_norm": 0.6635733878809069, + "learning_rate": 6.16166621530434e-09, + "loss": 0.7483, + "step": 13524 + }, + { + "epoch": 0.9757945240070705, + "grad_norm": 2.701666554288805, + "learning_rate": 6.125062913108437e-09, + "loss": 0.837, + "step": 13525 + }, + { + "epoch": 0.975866671476498, + "grad_norm": 2.841672368943505, + "learning_rate": 6.088568489041712e-09, + "loss": 0.7642, + "step": 13526 + }, + { + "epoch": 0.9759388189459255, + "grad_norm": 2.243175754010128, + "learning_rate": 6.052182945097239e-09, + "loss": 0.9034, + "step": 13527 + }, + { + "epoch": 0.976010966415353, + "grad_norm": 2.4598028564543237, + "learning_rate": 6.015906283261429e-09, + "loss": 0.8873, + "step": 13528 + }, + { + "epoch": 0.9760831138847805, + "grad_norm": 2.781978811719493, + "learning_rate": 5.979738505515808e-09, + "loss": 0.93, + "step": 13529 + }, + { + "epoch": 0.976155261354208, + "grad_norm": 3.1054114762308944, + "learning_rate": 5.943679613834796e-09, + "loss": 0.9269, + "step": 13530 + }, + { + "epoch": 0.9762274088236355, + "grad_norm": 4.597766806829741, + "learning_rate": 5.907729610187928e-09, + "loss": 0.8927, + "step": 13531 + }, + { + "epoch": 0.976299556293063, + "grad_norm": 2.8128850037590514, + "learning_rate": 5.8718884965378575e-09, + "loss": 0.8595, + "step": 13532 + }, + { + "epoch": 0.9763717037624905, + "grad_norm": 3.700439509501076, + "learning_rate": 5.83615627484213e-09, + "loss": 0.6813, + "step": 13533 + }, + { + "epoch": 0.976443851231918, + "grad_norm": 3.8412383022061958, + "learning_rate": 5.800532947051628e-09, + "loss": 0.8744, + "step": 13534 + }, + { + "epoch": 0.9765159987013455, + "grad_norm": 1.9550411742964853, + "learning_rate": 5.765018515111686e-09, + "loss": 0.919, + "step": 13535 + }, + { + "epoch": 0.976588146170773, + "grad_norm": 2.507494469557622, + "learning_rate": 5.729612980961862e-09, + "loss": 0.959, + "step": 13536 + }, + { + "epoch": 0.9766602936402006, + "grad_norm": 2.3439772432153485, + "learning_rate": 5.694316346535277e-09, + "loss": 0.8534, + "step": 13537 + }, + { + "epoch": 0.9767324411096281, + "grad_norm": 2.5930838591244325, + "learning_rate": 5.659128613759501e-09, + "loss": 0.7945, + "step": 13538 + }, + { + "epoch": 0.9768045885790556, + "grad_norm": 3.244277023396895, + "learning_rate": 5.624049784555884e-09, + "loss": 0.9053, + "step": 13539 + }, + { + "epoch": 0.9768767360484831, + "grad_norm": 2.976480798275857, + "learning_rate": 5.589079860839785e-09, + "loss": 0.9268, + "step": 13540 + }, + { + "epoch": 0.9769488835179106, + "grad_norm": 0.8428985043195515, + "learning_rate": 5.554218844521008e-09, + "loss": 0.8721, + "step": 13541 + }, + { + "epoch": 0.9770210309873382, + "grad_norm": 2.661835396788454, + "learning_rate": 5.519466737503364e-09, + "loss": 0.8853, + "step": 13542 + }, + { + "epoch": 0.9770931784567656, + "grad_norm": 2.222495986711721, + "learning_rate": 5.484823541684225e-09, + "loss": 0.899, + "step": 13543 + }, + { + "epoch": 0.9771653259261931, + "grad_norm": 3.1554006812396236, + "learning_rate": 5.450289258955409e-09, + "loss": 0.8563, + "step": 13544 + }, + { + "epoch": 0.9772374733956206, + "grad_norm": 3.1696161955972806, + "learning_rate": 5.415863891202743e-09, + "loss": 0.969, + "step": 13545 + }, + { + "epoch": 0.9773096208650481, + "grad_norm": 2.7350899769768864, + "learning_rate": 5.381547440306056e-09, + "loss": 0.8489, + "step": 13546 + }, + { + "epoch": 0.9773817683344757, + "grad_norm": 2.875364860934364, + "learning_rate": 5.347339908139182e-09, + "loss": 0.9043, + "step": 13547 + }, + { + "epoch": 0.9774539158039032, + "grad_norm": 2.4952514373435766, + "learning_rate": 5.313241296570181e-09, + "loss": 0.9017, + "step": 13548 + }, + { + "epoch": 0.9775260632733307, + "grad_norm": 2.0619761639835175, + "learning_rate": 5.279251607461122e-09, + "loss": 0.9571, + "step": 13549 + }, + { + "epoch": 0.9775982107427582, + "grad_norm": 0.7297844767200435, + "learning_rate": 5.245370842667851e-09, + "loss": 0.7758, + "step": 13550 + }, + { + "epoch": 0.9776703582121857, + "grad_norm": 2.901139719394389, + "learning_rate": 5.211599004040446e-09, + "loss": 0.8653, + "step": 13551 + }, + { + "epoch": 0.9777425056816132, + "grad_norm": 2.930508185832943, + "learning_rate": 5.177936093423207e-09, + "loss": 0.9154, + "step": 13552 + }, + { + "epoch": 0.9778146531510408, + "grad_norm": 2.4931742947731172, + "learning_rate": 5.144382112654222e-09, + "loss": 0.8917, + "step": 13553 + }, + { + "epoch": 0.9778868006204683, + "grad_norm": 3.527451489261651, + "learning_rate": 5.1109370635660235e-09, + "loss": 1.0095, + "step": 13554 + }, + { + "epoch": 0.9779589480898957, + "grad_norm": 2.9594645598413627, + "learning_rate": 5.077600947984484e-09, + "loss": 0.8979, + "step": 13555 + }, + { + "epoch": 0.9780310955593232, + "grad_norm": 2.2740838354487756, + "learning_rate": 5.04437376773037e-09, + "loss": 0.9324, + "step": 13556 + }, + { + "epoch": 0.9781032430287507, + "grad_norm": 2.6197555468719234, + "learning_rate": 5.011255524618007e-09, + "loss": 0.9828, + "step": 13557 + }, + { + "epoch": 0.9781753904981783, + "grad_norm": 2.370209932968549, + "learning_rate": 4.978246220455506e-09, + "loss": 0.8416, + "step": 13558 + }, + { + "epoch": 0.9782475379676058, + "grad_norm": 2.7153854598091627, + "learning_rate": 4.945345857046091e-09, + "loss": 0.8525, + "step": 13559 + }, + { + "epoch": 0.9783196854370333, + "grad_norm": 3.3008477018089017, + "learning_rate": 4.9125544361854346e-09, + "loss": 0.8891, + "step": 13560 + }, + { + "epoch": 0.9783918329064608, + "grad_norm": 0.8561556423015368, + "learning_rate": 4.879871959664772e-09, + "loss": 0.8128, + "step": 13561 + }, + { + "epoch": 0.9784639803758883, + "grad_norm": 13.221210223737971, + "learning_rate": 4.847298429268676e-09, + "loss": 0.901, + "step": 13562 + }, + { + "epoch": 0.9785361278453159, + "grad_norm": 2.0044858217068606, + "learning_rate": 4.814833846775946e-09, + "loss": 0.9348, + "step": 13563 + }, + { + "epoch": 0.9786082753147434, + "grad_norm": 2.6967973682462016, + "learning_rate": 4.7824782139589405e-09, + "loss": 0.9666, + "step": 13564 + }, + { + "epoch": 0.9786804227841709, + "grad_norm": 2.190614499490035, + "learning_rate": 4.750231532584914e-09, + "loss": 0.9891, + "step": 13565 + }, + { + "epoch": 0.9787525702535984, + "grad_norm": 2.9551873650186247, + "learning_rate": 4.71809380441468e-09, + "loss": 0.9034, + "step": 13566 + }, + { + "epoch": 0.9788247177230258, + "grad_norm": 3.710371436061689, + "learning_rate": 4.686065031202835e-09, + "loss": 0.9166, + "step": 13567 + }, + { + "epoch": 0.9788968651924533, + "grad_norm": 2.0853190558953734, + "learning_rate": 4.654145214698646e-09, + "loss": 0.9586, + "step": 13568 + }, + { + "epoch": 0.9789690126618809, + "grad_norm": 2.923519059577246, + "learning_rate": 4.6223343566451636e-09, + "loss": 0.9472, + "step": 13569 + }, + { + "epoch": 0.9790411601313084, + "grad_norm": 1.914038085781796, + "learning_rate": 4.590632458779442e-09, + "loss": 0.9546, + "step": 13570 + }, + { + "epoch": 0.9791133076007359, + "grad_norm": 2.0031148972754935, + "learning_rate": 4.55903952283232e-09, + "loss": 0.8164, + "step": 13571 + }, + { + "epoch": 0.9791854550701634, + "grad_norm": 0.8940197677560322, + "learning_rate": 4.527555550529305e-09, + "loss": 0.9204, + "step": 13572 + }, + { + "epoch": 0.9792576025395909, + "grad_norm": 3.7230282064307416, + "learning_rate": 4.4961805435894676e-09, + "loss": 0.8953, + "step": 13573 + }, + { + "epoch": 0.9793297500090185, + "grad_norm": 4.9341931435797495, + "learning_rate": 4.464914503726103e-09, + "loss": 0.8444, + "step": 13574 + }, + { + "epoch": 0.979401897478446, + "grad_norm": 1.8754791283377932, + "learning_rate": 4.433757432646512e-09, + "loss": 0.8559, + "step": 13575 + }, + { + "epoch": 0.9794740449478735, + "grad_norm": 3.8579724191774347, + "learning_rate": 4.402709332052224e-09, + "loss": 0.973, + "step": 13576 + }, + { + "epoch": 0.979546192417301, + "grad_norm": 3.791328177521752, + "learning_rate": 4.3717702036383255e-09, + "loss": 0.9675, + "step": 13577 + }, + { + "epoch": 0.9796183398867284, + "grad_norm": 2.3974158879105048, + "learning_rate": 4.340940049094577e-09, + "loss": 0.7974, + "step": 13578 + }, + { + "epoch": 0.9796904873561559, + "grad_norm": 2.117030852318688, + "learning_rate": 4.3102188701042985e-09, + "loss": 0.8946, + "step": 13579 + }, + { + "epoch": 0.9797626348255835, + "grad_norm": 2.750487804497451, + "learning_rate": 4.279606668345481e-09, + "loss": 0.8178, + "step": 13580 + }, + { + "epoch": 0.979834782295011, + "grad_norm": 3.1151454422552742, + "learning_rate": 4.249103445489232e-09, + "loss": 0.8604, + "step": 13581 + }, + { + "epoch": 0.9799069297644385, + "grad_norm": 0.7427037902807223, + "learning_rate": 4.218709203201332e-09, + "loss": 0.8196, + "step": 13582 + }, + { + "epoch": 0.979979077233866, + "grad_norm": 2.0919394564720744, + "learning_rate": 4.188423943141783e-09, + "loss": 0.9628, + "step": 13583 + }, + { + "epoch": 0.9800512247032935, + "grad_norm": 2.7484302375354854, + "learning_rate": 4.1582476669641545e-09, + "loss": 0.8842, + "step": 13584 + }, + { + "epoch": 0.980123372172721, + "grad_norm": 2.276802744654799, + "learning_rate": 4.128180376316015e-09, + "loss": 0.8996, + "step": 13585 + }, + { + "epoch": 0.9801955196421486, + "grad_norm": 3.2463641736880917, + "learning_rate": 4.09822207283983e-09, + "loss": 0.9413, + "step": 13586 + }, + { + "epoch": 0.9802676671115761, + "grad_norm": 2.393062570534189, + "learning_rate": 4.068372758170957e-09, + "loss": 0.956, + "step": 13587 + }, + { + "epoch": 0.9803398145810036, + "grad_norm": 2.8208473100038933, + "learning_rate": 4.038632433939648e-09, + "loss": 0.9456, + "step": 13588 + }, + { + "epoch": 0.9804119620504311, + "grad_norm": 1.946492767775216, + "learning_rate": 4.009001101769716e-09, + "loss": 0.8475, + "step": 13589 + }, + { + "epoch": 0.9804841095198585, + "grad_norm": 2.4150572273629436, + "learning_rate": 3.97947876327942e-09, + "loss": 0.9382, + "step": 13590 + }, + { + "epoch": 0.9805562569892861, + "grad_norm": 2.2283685152542905, + "learning_rate": 3.950065420080806e-09, + "loss": 0.9348, + "step": 13591 + }, + { + "epoch": 0.9806284044587136, + "grad_norm": 2.3361382541699958, + "learning_rate": 3.920761073779921e-09, + "loss": 0.6699, + "step": 13592 + }, + { + "epoch": 0.9807005519281411, + "grad_norm": 2.1095776495899976, + "learning_rate": 3.891565725977042e-09, + "loss": 0.9686, + "step": 13593 + }, + { + "epoch": 0.9807726993975686, + "grad_norm": 2.5500321741505303, + "learning_rate": 3.862479378266448e-09, + "loss": 0.951, + "step": 13594 + }, + { + "epoch": 0.9808448468669961, + "grad_norm": 1.9824099685987895, + "learning_rate": 3.8335020322364244e-09, + "loss": 0.9225, + "step": 13595 + }, + { + "epoch": 0.9809169943364237, + "grad_norm": 2.50524637956075, + "learning_rate": 3.804633689469261e-09, + "loss": 0.8703, + "step": 13596 + }, + { + "epoch": 0.9809891418058512, + "grad_norm": 10.132589250986108, + "learning_rate": 3.775874351541475e-09, + "loss": 0.8543, + "step": 13597 + }, + { + "epoch": 0.9810612892752787, + "grad_norm": 6.539248009241367, + "learning_rate": 3.747224020023365e-09, + "loss": 0.8787, + "step": 13598 + }, + { + "epoch": 0.9811334367447062, + "grad_norm": 3.114127815813387, + "learning_rate": 3.7186826964794584e-09, + "loss": 0.8085, + "step": 13599 + }, + { + "epoch": 0.9812055842141337, + "grad_norm": 2.4399938297193713, + "learning_rate": 3.6902503824682852e-09, + "loss": 0.8283, + "step": 13600 + }, + { + "epoch": 0.9812777316835612, + "grad_norm": 0.8215773866740484, + "learning_rate": 3.661927079542382e-09, + "loss": 0.823, + "step": 13601 + }, + { + "epoch": 0.9813498791529887, + "grad_norm": 2.0540602274072106, + "learning_rate": 3.633712789248511e-09, + "loss": 0.9001, + "step": 13602 + }, + { + "epoch": 0.9814220266224162, + "grad_norm": 2.2053599823668133, + "learning_rate": 3.60560751312744e-09, + "loss": 0.8855, + "step": 13603 + }, + { + "epoch": 0.9814941740918437, + "grad_norm": 3.063506291944126, + "learning_rate": 3.577611252713719e-09, + "loss": 1.0225, + "step": 13604 + }, + { + "epoch": 0.9815663215612712, + "grad_norm": 2.7603378188670273, + "learning_rate": 3.5497240095359036e-09, + "loss": 0.842, + "step": 13605 + }, + { + "epoch": 0.9816384690306987, + "grad_norm": 1.9694187905428686, + "learning_rate": 3.5219457851172197e-09, + "loss": 0.8069, + "step": 13606 + }, + { + "epoch": 0.9817106165001263, + "grad_norm": 2.1738841691380104, + "learning_rate": 3.4942765809744536e-09, + "loss": 0.9026, + "step": 13607 + }, + { + "epoch": 0.9817827639695538, + "grad_norm": 2.7953939085908828, + "learning_rate": 3.466716398618175e-09, + "loss": 0.8919, + "step": 13608 + }, + { + "epoch": 0.9818549114389813, + "grad_norm": 2.7240569836542172, + "learning_rate": 3.439265239553846e-09, + "loss": 0.7833, + "step": 13609 + }, + { + "epoch": 0.9819270589084088, + "grad_norm": 0.7604512158910545, + "learning_rate": 3.411923105280268e-09, + "loss": 0.8125, + "step": 13610 + }, + { + "epoch": 0.9819992063778363, + "grad_norm": 2.1093344225336117, + "learning_rate": 3.3846899972902467e-09, + "loss": 0.9187, + "step": 13611 + }, + { + "epoch": 0.9820713538472639, + "grad_norm": 2.4404571397328008, + "learning_rate": 3.357565917071037e-09, + "loss": 0.9094, + "step": 13612 + }, + { + "epoch": 0.9821435013166914, + "grad_norm": 3.709827393580614, + "learning_rate": 3.3305508661041205e-09, + "loss": 0.9483, + "step": 13613 + }, + { + "epoch": 0.9822156487861188, + "grad_norm": 2.8040244506172596, + "learning_rate": 3.303644845864095e-09, + "loss": 0.8638, + "step": 13614 + }, + { + "epoch": 0.9822877962555463, + "grad_norm": 2.139350455987788, + "learning_rate": 3.2768478578206747e-09, + "loss": 1.039, + "step": 13615 + }, + { + "epoch": 0.9823599437249738, + "grad_norm": 3.008930741552304, + "learning_rate": 3.250159903437133e-09, + "loss": 1.0056, + "step": 13616 + }, + { + "epoch": 0.9824320911944013, + "grad_norm": 4.476041380690129, + "learning_rate": 3.2235809841703042e-09, + "loss": 0.9053, + "step": 13617 + }, + { + "epoch": 0.9825042386638289, + "grad_norm": 3.610133632183734, + "learning_rate": 3.1971111014721386e-09, + "loss": 1.0175, + "step": 13618 + }, + { + "epoch": 0.9825763861332564, + "grad_norm": 0.7719059046068141, + "learning_rate": 3.170750256787702e-09, + "loss": 0.7927, + "step": 13619 + }, + { + "epoch": 0.9826485336026839, + "grad_norm": 5.592540247861449, + "learning_rate": 3.144498451556732e-09, + "loss": 0.8009, + "step": 13620 + }, + { + "epoch": 0.9827206810721114, + "grad_norm": 2.8052582651466484, + "learning_rate": 3.1183556872125262e-09, + "loss": 0.8966, + "step": 13621 + }, + { + "epoch": 0.9827928285415389, + "grad_norm": 2.0586252517751, + "learning_rate": 3.0923219651826095e-09, + "loss": 0.9776, + "step": 13622 + }, + { + "epoch": 0.9828649760109665, + "grad_norm": 20.659525139845453, + "learning_rate": 3.0663972868887332e-09, + "loss": 0.9402, + "step": 13623 + }, + { + "epoch": 0.982937123480394, + "grad_norm": 2.960615236162768, + "learning_rate": 3.0405816537466546e-09, + "loss": 0.8957, + "step": 13624 + }, + { + "epoch": 0.9830092709498215, + "grad_norm": 1.7450145142615636, + "learning_rate": 3.0148750671656896e-09, + "loss": 0.9595, + "step": 13625 + }, + { + "epoch": 0.9830814184192489, + "grad_norm": 2.6756482167830593, + "learning_rate": 2.9892775285500494e-09, + "loss": 0.9801, + "step": 13626 + }, + { + "epoch": 0.9831535658886764, + "grad_norm": 3.8992355233200264, + "learning_rate": 2.963789039297282e-09, + "loss": 0.9208, + "step": 13627 + }, + { + "epoch": 0.9832257133581039, + "grad_norm": 2.8459591511457494, + "learning_rate": 2.9384096007989414e-09, + "loss": 0.957, + "step": 13628 + }, + { + "epoch": 0.9832978608275315, + "grad_norm": 5.0751151687257385, + "learning_rate": 2.9131392144414736e-09, + "loss": 0.9022, + "step": 13629 + }, + { + "epoch": 0.983370008296959, + "grad_norm": 6.520240667383046, + "learning_rate": 2.887977881604442e-09, + "loss": 0.9084, + "step": 13630 + }, + { + "epoch": 0.9834421557663865, + "grad_norm": 2.440960603857871, + "learning_rate": 2.8629256036618587e-09, + "loss": 0.9199, + "step": 13631 + }, + { + "epoch": 0.983514303235814, + "grad_norm": 4.746226769612255, + "learning_rate": 2.8379823819815186e-09, + "loss": 0.8325, + "step": 13632 + }, + { + "epoch": 0.9835864507052415, + "grad_norm": 2.278075007556, + "learning_rate": 2.813148217925887e-09, + "loss": 0.8766, + "step": 13633 + }, + { + "epoch": 0.983658598174669, + "grad_norm": 2.659986045198477, + "learning_rate": 2.7884231128509907e-09, + "loss": 0.9245, + "step": 13634 + }, + { + "epoch": 0.9837307456440966, + "grad_norm": 2.2630679729601666, + "learning_rate": 2.7638070681068604e-09, + "loss": 0.9352, + "step": 13635 + }, + { + "epoch": 0.9838028931135241, + "grad_norm": 2.606541357060988, + "learning_rate": 2.7393000850377547e-09, + "loss": 0.8382, + "step": 13636 + }, + { + "epoch": 0.9838750405829515, + "grad_norm": 12.571111811325661, + "learning_rate": 2.7149021649817138e-09, + "loss": 0.9648, + "step": 13637 + }, + { + "epoch": 0.983947188052379, + "grad_norm": 2.744841592661575, + "learning_rate": 2.690613309271228e-09, + "loss": 0.9072, + "step": 13638 + }, + { + "epoch": 0.9840193355218065, + "grad_norm": 2.456963518479698, + "learning_rate": 2.6664335192325694e-09, + "loss": 0.8403, + "step": 13639 + }, + { + "epoch": 0.9840914829912341, + "grad_norm": 2.4843912184083923, + "learning_rate": 2.642362796186237e-09, + "loss": 0.8609, + "step": 13640 + }, + { + "epoch": 0.9841636304606616, + "grad_norm": 2.401281602722032, + "learning_rate": 2.6184011414462915e-09, + "loss": 0.9822, + "step": 13641 + }, + { + "epoch": 0.9842357779300891, + "grad_norm": 0.8921558207484446, + "learning_rate": 2.594548556321685e-09, + "loss": 0.7647, + "step": 13642 + }, + { + "epoch": 0.9843079253995166, + "grad_norm": 2.427300657657806, + "learning_rate": 2.5708050421142658e-09, + "loss": 0.9364, + "step": 13643 + }, + { + "epoch": 0.9843800728689441, + "grad_norm": 3.048437412404695, + "learning_rate": 2.5471706001212177e-09, + "loss": 0.8243, + "step": 13644 + }, + { + "epoch": 0.9844522203383717, + "grad_norm": 10.804476721439178, + "learning_rate": 2.5236452316328427e-09, + "loss": 0.9345, + "step": 13645 + }, + { + "epoch": 0.9845243678077992, + "grad_norm": 2.4678769076732743, + "learning_rate": 2.5002289379336685e-09, + "loss": 0.8935, + "step": 13646 + }, + { + "epoch": 0.9845965152772267, + "grad_norm": 3.6992614163573645, + "learning_rate": 2.4769217203026716e-09, + "loss": 0.9105, + "step": 13647 + }, + { + "epoch": 0.9846686627466542, + "grad_norm": 2.2971904304495046, + "learning_rate": 2.4537235800123902e-09, + "loss": 0.8555, + "step": 13648 + }, + { + "epoch": 0.9847408102160816, + "grad_norm": 2.1266241038058276, + "learning_rate": 2.4306345183293663e-09, + "loss": 0.7935, + "step": 13649 + }, + { + "epoch": 0.9848129576855091, + "grad_norm": 2.1671282814477624, + "learning_rate": 2.407654536514814e-09, + "loss": 0.9498, + "step": 13650 + }, + { + "epoch": 0.9848851051549367, + "grad_norm": 2.6652724575166102, + "learning_rate": 2.384783635823506e-09, + "loss": 1.0232, + "step": 13651 + }, + { + "epoch": 0.9849572526243642, + "grad_norm": 2.9177084197689336, + "learning_rate": 2.3620218175040006e-09, + "loss": 0.9771, + "step": 13652 + }, + { + "epoch": 0.9850294000937917, + "grad_norm": 2.0170304477343426, + "learning_rate": 2.339369082799747e-09, + "loss": 0.9513, + "step": 13653 + }, + { + "epoch": 0.9851015475632192, + "grad_norm": 2.4632248678145823, + "learning_rate": 2.3168254329473114e-09, + "loss": 1.0068, + "step": 13654 + }, + { + "epoch": 0.9851736950326467, + "grad_norm": 5.348068722964558, + "learning_rate": 2.2943908691779313e-09, + "loss": 0.9072, + "step": 13655 + }, + { + "epoch": 0.9852458425020743, + "grad_norm": 4.089550864035225, + "learning_rate": 2.272065392716405e-09, + "loss": 0.8869, + "step": 13656 + }, + { + "epoch": 0.9853179899715018, + "grad_norm": 2.9095353456229396, + "learning_rate": 2.2498490047824226e-09, + "loss": 0.8767, + "step": 13657 + }, + { + "epoch": 0.9853901374409293, + "grad_norm": 3.4012944281892805, + "learning_rate": 2.2277417065885706e-09, + "loss": 0.9471, + "step": 13658 + }, + { + "epoch": 0.9854622849103568, + "grad_norm": 5.534047599254143, + "learning_rate": 2.2057434993421054e-09, + "loss": 0.8518, + "step": 13659 + }, + { + "epoch": 0.9855344323797843, + "grad_norm": 3.125535079922738, + "learning_rate": 2.18385438424451e-09, + "loss": 0.9112, + "step": 13660 + }, + { + "epoch": 0.9856065798492117, + "grad_norm": 2.418395071085085, + "learning_rate": 2.162074362491051e-09, + "loss": 0.996, + "step": 13661 + }, + { + "epoch": 0.9856787273186393, + "grad_norm": 1.7317045806219717, + "learning_rate": 2.1404034352709987e-09, + "loss": 0.9483, + "step": 13662 + }, + { + "epoch": 0.9857508747880668, + "grad_norm": 2.6080567462106874, + "learning_rate": 2.118841603767407e-09, + "loss": 0.9151, + "step": 13663 + }, + { + "epoch": 0.9858230222574943, + "grad_norm": 2.790941476337478, + "learning_rate": 2.097388869158223e-09, + "loss": 0.8906, + "step": 13664 + }, + { + "epoch": 0.9858951697269218, + "grad_norm": 2.203170343027131, + "learning_rate": 2.0760452326147317e-09, + "loss": 0.8577, + "step": 13665 + }, + { + "epoch": 0.9859673171963493, + "grad_norm": 2.2760081226894466, + "learning_rate": 2.0548106953022226e-09, + "loss": 0.8428, + "step": 13666 + }, + { + "epoch": 0.9860394646657769, + "grad_norm": 11.940008888433013, + "learning_rate": 2.0336852583802135e-09, + "loss": 0.7583, + "step": 13667 + }, + { + "epoch": 0.9861116121352044, + "grad_norm": 2.3731984784714055, + "learning_rate": 2.0126689230026693e-09, + "loss": 0.9135, + "step": 13668 + }, + { + "epoch": 0.9861837596046319, + "grad_norm": 2.6282587260040917, + "learning_rate": 1.9917616903168954e-09, + "loss": 0.8555, + "step": 13669 + }, + { + "epoch": 0.9862559070740594, + "grad_norm": 3.5516901479154, + "learning_rate": 1.9709635614646446e-09, + "loss": 0.8843, + "step": 13670 + }, + { + "epoch": 0.9863280545434869, + "grad_norm": 0.7101524965629803, + "learning_rate": 1.9502745375814534e-09, + "loss": 0.7841, + "step": 13671 + }, + { + "epoch": 0.9864002020129145, + "grad_norm": 2.2844622218313995, + "learning_rate": 1.929694619797528e-09, + "loss": 1.0307, + "step": 13672 + }, + { + "epoch": 0.9864723494823419, + "grad_norm": 2.697248653549069, + "learning_rate": 1.9092238092361934e-09, + "loss": 0.9127, + "step": 13673 + }, + { + "epoch": 0.9865444969517694, + "grad_norm": 0.7009632575939915, + "learning_rate": 1.888862107015443e-09, + "loss": 0.7455, + "step": 13674 + }, + { + "epoch": 0.9866166444211969, + "grad_norm": 0.6894005601420837, + "learning_rate": 1.868609514247277e-09, + "loss": 0.7661, + "step": 13675 + }, + { + "epoch": 0.9866887918906244, + "grad_norm": 3.532999466171246, + "learning_rate": 1.8484660320374768e-09, + "loss": 0.8657, + "step": 13676 + }, + { + "epoch": 0.9867609393600519, + "grad_norm": 2.7314495895401354, + "learning_rate": 1.8284316614860517e-09, + "loss": 0.8951, + "step": 13677 + }, + { + "epoch": 0.9868330868294795, + "grad_norm": 2.382293244363996, + "learning_rate": 1.8085064036872377e-09, + "loss": 0.8765, + "step": 13678 + }, + { + "epoch": 0.986905234298907, + "grad_norm": 2.096249582908193, + "learning_rate": 1.788690259728387e-09, + "loss": 0.9255, + "step": 13679 + }, + { + "epoch": 0.9869773817683345, + "grad_norm": 3.575283338082564, + "learning_rate": 1.7689832306924113e-09, + "loss": 0.9021, + "step": 13680 + }, + { + "epoch": 0.987049529237762, + "grad_norm": 2.4994077934752297, + "learning_rate": 1.7493853176548945e-09, + "loss": 0.8322, + "step": 13681 + }, + { + "epoch": 0.9871216767071895, + "grad_norm": 3.0854057529899523, + "learning_rate": 1.7298965216860917e-09, + "loss": 1.0028, + "step": 13682 + }, + { + "epoch": 0.987193824176617, + "grad_norm": 3.434081024555963, + "learning_rate": 1.7105168438504846e-09, + "loss": 0.8225, + "step": 13683 + }, + { + "epoch": 0.9872659716460445, + "grad_norm": 2.324695083148381, + "learning_rate": 1.691246285206116e-09, + "loss": 0.7759, + "step": 13684 + }, + { + "epoch": 0.987338119115472, + "grad_norm": 2.32403091825709, + "learning_rate": 1.672084846805033e-09, + "loss": 0.9521, + "step": 13685 + }, + { + "epoch": 0.9874102665848995, + "grad_norm": 2.275599025629418, + "learning_rate": 1.6530325296941762e-09, + "loss": 0.9447, + "step": 13686 + }, + { + "epoch": 0.987482414054327, + "grad_norm": 3.5064575025114655, + "learning_rate": 1.63408933491338e-09, + "loss": 0.8604, + "step": 13687 + }, + { + "epoch": 0.9875545615237545, + "grad_norm": 2.4889745779367614, + "learning_rate": 1.6152552634973726e-09, + "loss": 0.913, + "step": 13688 + }, + { + "epoch": 0.9876267089931821, + "grad_norm": 2.036319919835282, + "learning_rate": 1.5965303164744425e-09, + "loss": 0.8732, + "step": 13689 + }, + { + "epoch": 0.9876988564626096, + "grad_norm": 3.2763078073046747, + "learning_rate": 1.5779144948671052e-09, + "loss": 0.891, + "step": 13690 + }, + { + "epoch": 0.9877710039320371, + "grad_norm": 2.386523249684142, + "learning_rate": 1.559407799692103e-09, + "loss": 0.9255, + "step": 13691 + }, + { + "epoch": 0.9878431514014646, + "grad_norm": 2.099511830037877, + "learning_rate": 1.5410102319597386e-09, + "loss": 0.9397, + "step": 13692 + }, + { + "epoch": 0.9879152988708921, + "grad_norm": 2.76219991029277, + "learning_rate": 1.5227217926749859e-09, + "loss": 0.8759, + "step": 13693 + }, + { + "epoch": 0.9879874463403197, + "grad_norm": 1.89072165754422, + "learning_rate": 1.5045424828359355e-09, + "loss": 0.938, + "step": 13694 + }, + { + "epoch": 0.9880595938097472, + "grad_norm": 3.05924160503842, + "learning_rate": 1.4864723034360148e-09, + "loss": 0.858, + "step": 13695 + }, + { + "epoch": 0.9881317412791746, + "grad_norm": 1.9281098870623334, + "learning_rate": 1.4685112554613245e-09, + "loss": 0.9096, + "step": 13696 + }, + { + "epoch": 0.9882038887486021, + "grad_norm": 2.7075550623936633, + "learning_rate": 1.4506593398930788e-09, + "loss": 0.9909, + "step": 13697 + }, + { + "epoch": 0.9882760362180296, + "grad_norm": 3.6848389673956183, + "learning_rate": 1.4329165577058322e-09, + "loss": 0.919, + "step": 13698 + }, + { + "epoch": 0.9883481836874571, + "grad_norm": 3.1294080543791325, + "learning_rate": 1.415282909868587e-09, + "loss": 0.9408, + "step": 13699 + }, + { + "epoch": 0.9884203311568847, + "grad_norm": 2.2472377351661064, + "learning_rate": 1.3977583973443508e-09, + "loss": 0.8845, + "step": 13700 + }, + { + "epoch": 0.9884924786263122, + "grad_norm": 2.4486319829241534, + "learning_rate": 1.3803430210899137e-09, + "loss": 0.8704, + "step": 13701 + }, + { + "epoch": 0.9885646260957397, + "grad_norm": 2.451076259495918, + "learning_rate": 1.3630367820562926e-09, + "loss": 0.9213, + "step": 13702 + }, + { + "epoch": 0.9886367735651672, + "grad_norm": 2.6913583250166586, + "learning_rate": 1.3458396811885099e-09, + "loss": 0.9559, + "step": 13703 + }, + { + "epoch": 0.9887089210345947, + "grad_norm": 2.9140710089101645, + "learning_rate": 1.3287517194255915e-09, + "loss": 0.9561, + "step": 13704 + }, + { + "epoch": 0.9887810685040223, + "grad_norm": 8.75848979525879, + "learning_rate": 1.3117728977007913e-09, + "loss": 0.8615, + "step": 13705 + }, + { + "epoch": 0.9888532159734498, + "grad_norm": 2.7972958210877605, + "learning_rate": 1.2949032169411457e-09, + "loss": 0.9215, + "step": 13706 + }, + { + "epoch": 0.9889253634428773, + "grad_norm": 3.3994126597957464, + "learning_rate": 1.2781426780679173e-09, + "loss": 0.8213, + "step": 13707 + }, + { + "epoch": 0.9889975109123047, + "grad_norm": 2.2837722630242996, + "learning_rate": 1.261491281996374e-09, + "loss": 0.801, + "step": 13708 + }, + { + "epoch": 0.9890696583817322, + "grad_norm": 3.469086364383603, + "learning_rate": 1.244949029635567e-09, + "loss": 1.024, + "step": 13709 + }, + { + "epoch": 0.9891418058511597, + "grad_norm": 2.482521757431956, + "learning_rate": 1.2285159218889952e-09, + "loss": 0.8763, + "step": 13710 + }, + { + "epoch": 0.9892139533205873, + "grad_norm": 3.4455072257099437, + "learning_rate": 1.212191959653941e-09, + "loss": 0.9274, + "step": 13711 + }, + { + "epoch": 0.9892861007900148, + "grad_norm": 2.3533146309402118, + "learning_rate": 1.1959771438219135e-09, + "loss": 0.9094, + "step": 13712 + }, + { + "epoch": 0.9893582482594423, + "grad_norm": 2.3519723731449025, + "learning_rate": 1.1798714752782047e-09, + "loss": 0.8662, + "step": 13713 + }, + { + "epoch": 0.9894303957288698, + "grad_norm": 2.912986577200485, + "learning_rate": 1.163874954902333e-09, + "loss": 0.9636, + "step": 13714 + }, + { + "epoch": 0.9895025431982973, + "grad_norm": 3.3909933004445274, + "learning_rate": 1.1479875835678222e-09, + "loss": 1.0221, + "step": 13715 + }, + { + "epoch": 0.9895746906677249, + "grad_norm": 2.394589894807197, + "learning_rate": 1.1322093621422e-09, + "loss": 0.8339, + "step": 13716 + }, + { + "epoch": 0.9896468381371524, + "grad_norm": 2.2787988113641275, + "learning_rate": 1.1165402914872224e-09, + "loss": 0.945, + "step": 13717 + }, + { + "epoch": 0.9897189856065799, + "grad_norm": 2.216386745341511, + "learning_rate": 1.1009803724582044e-09, + "loss": 0.8021, + "step": 13718 + }, + { + "epoch": 0.9897911330760074, + "grad_norm": 2.933263605184921, + "learning_rate": 1.0855296059049113e-09, + "loss": 0.9233, + "step": 13719 + }, + { + "epoch": 0.9898632805454348, + "grad_norm": 8.710044096562456, + "learning_rate": 1.0701879926713342e-09, + "loss": 0.9885, + "step": 13720 + }, + { + "epoch": 0.9899354280148623, + "grad_norm": 2.7217664975111067, + "learning_rate": 1.0549555335948035e-09, + "loss": 0.8764, + "step": 13721 + }, + { + "epoch": 0.9900075754842899, + "grad_norm": 2.2197549182911684, + "learning_rate": 1.0398322295073204e-09, + "loss": 0.9673, + "step": 13722 + }, + { + "epoch": 0.9900797229537174, + "grad_norm": 8.753929345825178, + "learning_rate": 1.0248180812348906e-09, + "loss": 0.9353, + "step": 13723 + }, + { + "epoch": 0.9901518704231449, + "grad_norm": 3.423319054647777, + "learning_rate": 1.009913089597081e-09, + "loss": 0.898, + "step": 13724 + }, + { + "epoch": 0.9902240178925724, + "grad_norm": 3.0887669979107866, + "learning_rate": 9.951172554079068e-10, + "loss": 0.8363, + "step": 13725 + }, + { + "epoch": 0.9902961653619999, + "grad_norm": 2.5507838671263374, + "learning_rate": 9.804305794753886e-10, + "loss": 0.9409, + "step": 13726 + }, + { + "epoch": 0.9903683128314275, + "grad_norm": 2.4094163943048574, + "learning_rate": 9.658530626013295e-10, + "loss": 0.8793, + "step": 13727 + }, + { + "epoch": 0.990440460300855, + "grad_norm": 2.623180848972952, + "learning_rate": 9.513847055817593e-10, + "loss": 0.8801, + "step": 13728 + }, + { + "epoch": 0.9905126077702825, + "grad_norm": 2.190931081679474, + "learning_rate": 9.370255092071566e-10, + "loss": 0.8943, + "step": 13729 + }, + { + "epoch": 0.99058475523971, + "grad_norm": 2.379379050088797, + "learning_rate": 9.227754742611171e-10, + "loss": 0.889, + "step": 13730 + }, + { + "epoch": 0.9906569027091375, + "grad_norm": 2.5222449540651053, + "learning_rate": 9.086346015219071e-10, + "loss": 0.8418, + "step": 13731 + }, + { + "epoch": 0.9907290501785649, + "grad_norm": 2.6153145080551403, + "learning_rate": 8.946028917620197e-10, + "loss": 1.0509, + "step": 13732 + }, + { + "epoch": 0.9908011976479925, + "grad_norm": 3.144262380537536, + "learning_rate": 8.806803457472866e-10, + "loss": 0.7659, + "step": 13733 + }, + { + "epoch": 0.99087334511742, + "grad_norm": 1.7049061971461963, + "learning_rate": 8.668669642379889e-10, + "loss": 1.0023, + "step": 13734 + }, + { + "epoch": 0.9909454925868475, + "grad_norm": 3.8817129746641212, + "learning_rate": 8.531627479886339e-10, + "loss": 0.8868, + "step": 13735 + }, + { + "epoch": 0.991017640056275, + "grad_norm": 4.213352169244247, + "learning_rate": 8.395676977475119e-10, + "loss": 0.9801, + "step": 13736 + }, + { + "epoch": 0.9910897875257025, + "grad_norm": 2.4757693010248296, + "learning_rate": 8.260818142569181e-10, + "loss": 0.9101, + "step": 13737 + }, + { + "epoch": 0.9911619349951301, + "grad_norm": 3.131205386759589, + "learning_rate": 8.127050982533745e-10, + "loss": 0.9196, + "step": 13738 + }, + { + "epoch": 0.9912340824645576, + "grad_norm": 0.7279654872515764, + "learning_rate": 7.994375504674078e-10, + "loss": 0.7725, + "step": 13739 + }, + { + "epoch": 0.9913062299339851, + "grad_norm": 2.5021088288908957, + "learning_rate": 7.862791716231054e-10, + "loss": 1.0602, + "step": 13740 + }, + { + "epoch": 0.9913783774034126, + "grad_norm": 2.4057612727857562, + "learning_rate": 7.732299624394478e-10, + "loss": 0.8586, + "step": 13741 + }, + { + "epoch": 0.9914505248728401, + "grad_norm": 2.2820132298265747, + "learning_rate": 7.602899236287541e-10, + "loss": 0.95, + "step": 13742 + }, + { + "epoch": 0.9915226723422675, + "grad_norm": 2.560615311674822, + "learning_rate": 7.474590558975702e-10, + "loss": 0.9022, + "step": 13743 + }, + { + "epoch": 0.9915948198116951, + "grad_norm": 3.2568902555908896, + "learning_rate": 7.347373599468909e-10, + "loss": 0.8704, + "step": 13744 + }, + { + "epoch": 0.9916669672811226, + "grad_norm": 3.91486001788771, + "learning_rate": 7.221248364710497e-10, + "loss": 0.8119, + "step": 13745 + }, + { + "epoch": 0.9917391147505501, + "grad_norm": 2.3271717962004357, + "learning_rate": 7.09621486158829e-10, + "loss": 0.9301, + "step": 13746 + }, + { + "epoch": 0.9918112622199776, + "grad_norm": 3.6272236810941823, + "learning_rate": 6.972273096930159e-10, + "loss": 0.9534, + "step": 13747 + }, + { + "epoch": 0.9918834096894051, + "grad_norm": 3.161298585445211, + "learning_rate": 6.849423077506245e-10, + "loss": 0.8589, + "step": 13748 + }, + { + "epoch": 0.9919555571588327, + "grad_norm": 2.782726281794026, + "learning_rate": 6.727664810022293e-10, + "loss": 0.9896, + "step": 13749 + }, + { + "epoch": 0.9920277046282602, + "grad_norm": 3.8836617414147327, + "learning_rate": 6.606998301128541e-10, + "loss": 0.9335, + "step": 13750 + }, + { + "epoch": 0.9920998520976877, + "grad_norm": 2.826461049584681, + "learning_rate": 6.487423557415272e-10, + "loss": 0.9725, + "step": 13751 + }, + { + "epoch": 0.9921719995671152, + "grad_norm": 2.538468135811829, + "learning_rate": 6.368940585408377e-10, + "loss": 0.8849, + "step": 13752 + }, + { + "epoch": 0.9922441470365427, + "grad_norm": 22.346624186861607, + "learning_rate": 6.251549391578237e-10, + "loss": 0.9513, + "step": 13753 + }, + { + "epoch": 0.9923162945059703, + "grad_norm": 2.940850315522384, + "learning_rate": 6.135249982339719e-10, + "loss": 0.8939, + "step": 13754 + }, + { + "epoch": 0.9923884419753977, + "grad_norm": 2.401624916273221, + "learning_rate": 6.020042364038858e-10, + "loss": 0.8818, + "step": 13755 + }, + { + "epoch": 0.9924605894448252, + "grad_norm": 1.595763710767486, + "learning_rate": 5.905926542968398e-10, + "loss": 0.9084, + "step": 13756 + }, + { + "epoch": 0.9925327369142527, + "grad_norm": 4.353821857920672, + "learning_rate": 5.792902525358912e-10, + "loss": 0.9415, + "step": 13757 + }, + { + "epoch": 0.9926048843836802, + "grad_norm": 2.111042973885551, + "learning_rate": 5.680970317385458e-10, + "loss": 0.948, + "step": 13758 + }, + { + "epoch": 0.9926770318531077, + "grad_norm": 2.215255994965363, + "learning_rate": 5.570129925156486e-10, + "loss": 0.8356, + "step": 13759 + }, + { + "epoch": 0.9927491793225353, + "grad_norm": 2.572007660903319, + "learning_rate": 5.46038135472493e-10, + "loss": 0.8709, + "step": 13760 + }, + { + "epoch": 0.9928213267919628, + "grad_norm": 2.723044675597125, + "learning_rate": 5.351724612085995e-10, + "loss": 0.8452, + "step": 13761 + }, + { + "epoch": 0.9928934742613903, + "grad_norm": 2.2281699284414973, + "learning_rate": 5.244159703170492e-10, + "loss": 0.8904, + "step": 13762 + }, + { + "epoch": 0.9929656217308178, + "grad_norm": 1.9913073820860911, + "learning_rate": 5.137686633853722e-10, + "loss": 0.8394, + "step": 13763 + }, + { + "epoch": 0.9930377692002453, + "grad_norm": 2.971931278887219, + "learning_rate": 5.032305409951032e-10, + "loss": 0.9771, + "step": 13764 + }, + { + "epoch": 0.9931099166696729, + "grad_norm": 0.7200911076176114, + "learning_rate": 4.928016037213378e-10, + "loss": 0.7911, + "step": 13765 + }, + { + "epoch": 0.9931820641391004, + "grad_norm": 0.7963003442679464, + "learning_rate": 4.824818521338425e-10, + "loss": 0.8132, + "step": 13766 + }, + { + "epoch": 0.9932542116085278, + "grad_norm": 2.2079491790990415, + "learning_rate": 4.722712867959445e-10, + "loss": 0.9475, + "step": 13767 + }, + { + "epoch": 0.9933263590779553, + "grad_norm": 2.3784935002794483, + "learning_rate": 4.6216990826541955e-10, + "loss": 0.9282, + "step": 13768 + }, + { + "epoch": 0.9933985065473828, + "grad_norm": 0.7879617397004829, + "learning_rate": 4.521777170938268e-10, + "loss": 0.8163, + "step": 13769 + }, + { + "epoch": 0.9934706540168103, + "grad_norm": 5.603753038006039, + "learning_rate": 4.422947138265076e-10, + "loss": 0.9077, + "step": 13770 + }, + { + "epoch": 0.9935428014862379, + "grad_norm": 5.077142144969893, + "learning_rate": 4.325208990034745e-10, + "loss": 0.8411, + "step": 13771 + }, + { + "epoch": 0.9936149489556654, + "grad_norm": 4.8609061328117456, + "learning_rate": 4.228562731585228e-10, + "loss": 0.9393, + "step": 13772 + }, + { + "epoch": 0.9936870964250929, + "grad_norm": 4.692468680113142, + "learning_rate": 4.133008368190083e-10, + "loss": 1.0415, + "step": 13773 + }, + { + "epoch": 0.9937592438945204, + "grad_norm": 2.2215559503660245, + "learning_rate": 4.0385459050695794e-10, + "loss": 0.8438, + "step": 13774 + }, + { + "epoch": 0.9938313913639479, + "grad_norm": 2.571484330496164, + "learning_rate": 3.945175347381813e-10, + "loss": 0.9108, + "step": 13775 + }, + { + "epoch": 0.9939035388333755, + "grad_norm": 3.2034142340355825, + "learning_rate": 3.852896700224928e-10, + "loss": 0.8533, + "step": 13776 + }, + { + "epoch": 0.993975686302803, + "grad_norm": 2.4638538337554294, + "learning_rate": 3.761709968637117e-10, + "loss": 0.9134, + "step": 13777 + }, + { + "epoch": 0.9940478337722305, + "grad_norm": 0.7228190470661628, + "learning_rate": 3.6716151575988397e-10, + "loss": 0.7935, + "step": 13778 + }, + { + "epoch": 0.9941199812416579, + "grad_norm": 7.768816593342021, + "learning_rate": 3.5826122720328256e-10, + "loss": 0.8854, + "step": 13779 + }, + { + "epoch": 0.9941921287110854, + "grad_norm": 2.9673648212755954, + "learning_rate": 3.4947013167929694e-10, + "loss": 0.9784, + "step": 13780 + }, + { + "epoch": 0.9942642761805129, + "grad_norm": 3.0558307196282857, + "learning_rate": 3.407882296684317e-10, + "loss": 0.8071, + "step": 13781 + }, + { + "epoch": 0.9943364236499405, + "grad_norm": 2.332580297300239, + "learning_rate": 3.3221552164452994e-10, + "loss": 0.9085, + "step": 13782 + }, + { + "epoch": 0.994408571119368, + "grad_norm": 2.6682980335608204, + "learning_rate": 3.2375200807588376e-10, + "loss": 0.9808, + "step": 13783 + }, + { + "epoch": 0.9944807185887955, + "grad_norm": 2.6325950645967953, + "learning_rate": 3.1539768942479004e-10, + "loss": 0.8586, + "step": 13784 + }, + { + "epoch": 0.994552866058223, + "grad_norm": 2.4337713626811546, + "learning_rate": 3.0715256614688434e-10, + "loss": 0.8372, + "step": 13785 + }, + { + "epoch": 0.9946250135276505, + "grad_norm": 3.4019468231440655, + "learning_rate": 2.990166386931392e-10, + "loss": 0.8518, + "step": 13786 + }, + { + "epoch": 0.9946971609970781, + "grad_norm": 2.5620398689818793, + "learning_rate": 2.909899075071998e-10, + "loss": 0.8316, + "step": 13787 + }, + { + "epoch": 0.9947693084665056, + "grad_norm": 2.218195456393253, + "learning_rate": 2.8307237302760413e-10, + "loss": 0.9655, + "step": 13788 + }, + { + "epoch": 0.9948414559359331, + "grad_norm": 2.627065390985151, + "learning_rate": 2.7526403568689515e-10, + "loss": 0.9819, + "step": 13789 + }, + { + "epoch": 0.9949136034053605, + "grad_norm": 3.536251745617983, + "learning_rate": 2.675648959111765e-10, + "loss": 0.9216, + "step": 13790 + }, + { + "epoch": 0.994985750874788, + "grad_norm": 3.2609703073281544, + "learning_rate": 2.599749541210006e-10, + "loss": 0.8777, + "step": 13791 + }, + { + "epoch": 0.9950578983442155, + "grad_norm": 0.8206622513438718, + "learning_rate": 2.524942107309247e-10, + "loss": 0.8504, + "step": 13792 + }, + { + "epoch": 0.9951300458136431, + "grad_norm": 3.1815523494794213, + "learning_rate": 2.451226661492889e-10, + "loss": 0.8782, + "step": 13793 + }, + { + "epoch": 0.9952021932830706, + "grad_norm": 4.049072847087405, + "learning_rate": 2.37860320778438e-10, + "loss": 0.9104, + "step": 13794 + }, + { + "epoch": 0.9952743407524981, + "grad_norm": 2.0419905528897093, + "learning_rate": 2.3070717501538773e-10, + "loss": 0.8757, + "step": 13795 + }, + { + "epoch": 0.9953464882219256, + "grad_norm": 2.0776474482867777, + "learning_rate": 2.2366322925049252e-10, + "loss": 1.0053, + "step": 13796 + }, + { + "epoch": 0.9954186356913531, + "grad_norm": 2.1041085515321827, + "learning_rate": 2.1672848386833364e-10, + "loss": 0.9371, + "step": 13797 + }, + { + "epoch": 0.9954907831607807, + "grad_norm": 4.418887149997064, + "learning_rate": 2.0990293924794123e-10, + "loss": 0.8095, + "step": 13798 + }, + { + "epoch": 0.9955629306302082, + "grad_norm": 2.325725060893572, + "learning_rate": 2.0318659576146202e-10, + "loss": 0.9356, + "step": 13799 + }, + { + "epoch": 0.9956350780996357, + "grad_norm": 5.0231409534116995, + "learning_rate": 1.965794537761578e-10, + "loss": 0.9143, + "step": 13800 + }, + { + "epoch": 0.9957072255690632, + "grad_norm": 1.869986507142699, + "learning_rate": 1.9008151365262903e-10, + "loss": 0.8669, + "step": 13801 + }, + { + "epoch": 0.9957793730384906, + "grad_norm": 2.3535277813482733, + "learning_rate": 1.8369277574570295e-10, + "loss": 0.9028, + "step": 13802 + }, + { + "epoch": 0.9958515205079181, + "grad_norm": 2.7083980748190157, + "learning_rate": 1.7741324040421167e-10, + "loss": 0.8757, + "step": 13803 + }, + { + "epoch": 0.9959236679773457, + "grad_norm": 2.621272574178826, + "learning_rate": 1.7124290797099206e-10, + "loss": 0.9332, + "step": 13804 + }, + { + "epoch": 0.9959958154467732, + "grad_norm": 8.695813928011884, + "learning_rate": 1.6518177878332983e-10, + "loss": 1.0106, + "step": 13805 + }, + { + "epoch": 0.9960679629162007, + "grad_norm": 3.590489907375229, + "learning_rate": 1.5922985317184945e-10, + "loss": 0.8112, + "step": 13806 + }, + { + "epoch": 0.9961401103856282, + "grad_norm": 1.7463193060761866, + "learning_rate": 1.5338713146140215e-10, + "loss": 0.9103, + "step": 13807 + }, + { + "epoch": 0.9962122578550557, + "grad_norm": 2.588266574254962, + "learning_rate": 1.4765361397173214e-10, + "loss": 0.8561, + "step": 13808 + }, + { + "epoch": 0.9962844053244833, + "grad_norm": 2.8894240075274307, + "learning_rate": 1.4202930101525624e-10, + "loss": 0.9392, + "step": 13809 + }, + { + "epoch": 0.9963565527939108, + "grad_norm": 0.7054397416513908, + "learning_rate": 1.3651419289928413e-10, + "loss": 0.7857, + "step": 13810 + }, + { + "epoch": 0.9964287002633383, + "grad_norm": 1.9891910532412833, + "learning_rate": 1.3110828992513034e-10, + "loss": 0.8795, + "step": 13811 + }, + { + "epoch": 0.9965008477327658, + "grad_norm": 3.0009570184748395, + "learning_rate": 1.2581159238789217e-10, + "loss": 0.8444, + "step": 13812 + }, + { + "epoch": 0.9965729952021933, + "grad_norm": 3.062372010339683, + "learning_rate": 1.2062410057689376e-10, + "loss": 0.8448, + "step": 13813 + }, + { + "epoch": 0.9966451426716207, + "grad_norm": 2.0743577123957744, + "learning_rate": 1.1554581477524195e-10, + "loss": 0.9839, + "step": 13814 + }, + { + "epoch": 0.9967172901410483, + "grad_norm": 2.4482502703234643, + "learning_rate": 1.1057673526027044e-10, + "loss": 0.9568, + "step": 13815 + }, + { + "epoch": 0.9967894376104758, + "grad_norm": 0.8274376373763452, + "learning_rate": 1.0571686230331778e-10, + "loss": 0.8502, + "step": 13816 + }, + { + "epoch": 0.9968615850799033, + "grad_norm": 2.895394509271379, + "learning_rate": 1.0096619616994927e-10, + "loss": 0.8542, + "step": 13817 + }, + { + "epoch": 0.9969337325493308, + "grad_norm": 3.406723689468625, + "learning_rate": 9.632473711929101e-11, + "loss": 0.929, + "step": 13818 + }, + { + "epoch": 0.9970058800187583, + "grad_norm": 1.7521885402207598, + "learning_rate": 9.179248540513995e-11, + "loss": 0.9654, + "step": 13819 + }, + { + "epoch": 0.9970780274881859, + "grad_norm": 2.6933498147265063, + "learning_rate": 8.736944127463176e-11, + "loss": 1.0222, + "step": 13820 + }, + { + "epoch": 0.9971501749576134, + "grad_norm": 2.4205846486962415, + "learning_rate": 8.305560496957297e-11, + "loss": 0.9199, + "step": 13821 + }, + { + "epoch": 0.9972223224270409, + "grad_norm": 7.339751758171528, + "learning_rate": 7.885097672533092e-11, + "loss": 0.8912, + "step": 13822 + }, + { + "epoch": 0.9972944698964684, + "grad_norm": 2.5438840188753216, + "learning_rate": 7.475555677149969e-11, + "loss": 0.8418, + "step": 13823 + }, + { + "epoch": 0.9973666173658959, + "grad_norm": 3.583322273651325, + "learning_rate": 7.076934533190027e-11, + "loss": 0.8533, + "step": 13824 + }, + { + "epoch": 0.9974387648353235, + "grad_norm": 2.1711740803553803, + "learning_rate": 6.689234262413634e-11, + "loss": 0.966, + "step": 13825 + }, + { + "epoch": 0.9975109123047509, + "grad_norm": 7.658466376420904, + "learning_rate": 6.312454885981644e-11, + "loss": 0.9277, + "step": 13826 + }, + { + "epoch": 0.9975830597741784, + "grad_norm": 4.312521892989605, + "learning_rate": 5.94659642447759e-11, + "loss": 0.8686, + "step": 13827 + }, + { + "epoch": 0.9976552072436059, + "grad_norm": 3.16478609829695, + "learning_rate": 5.5916588978854874e-11, + "loss": 0.9282, + "step": 13828 + }, + { + "epoch": 0.9977273547130334, + "grad_norm": 3.4106879764703417, + "learning_rate": 5.247642325567625e-11, + "loss": 1.0409, + "step": 13829 + }, + { + "epoch": 0.9977995021824609, + "grad_norm": 3.3245219754660136, + "learning_rate": 4.914546726331181e-11, + "loss": 0.9319, + "step": 13830 + }, + { + "epoch": 0.9978716496518885, + "grad_norm": 2.810852755190984, + "learning_rate": 4.592372118339405e-11, + "loss": 0.9173, + "step": 13831 + }, + { + "epoch": 0.997943797121316, + "grad_norm": 2.118896020229274, + "learning_rate": 4.2811185192226374e-11, + "loss": 0.9843, + "step": 13832 + }, + { + "epoch": 0.9980159445907435, + "grad_norm": 2.460138778648713, + "learning_rate": 3.980785945945087e-11, + "loss": 0.8391, + "step": 13833 + }, + { + "epoch": 0.998088092060171, + "grad_norm": 3.6483975067233874, + "learning_rate": 3.691374414893644e-11, + "loss": 0.9839, + "step": 13834 + }, + { + "epoch": 0.9981602395295985, + "grad_norm": 5.139244566742945, + "learning_rate": 3.412883941922295e-11, + "loss": 0.8653, + "step": 13835 + }, + { + "epoch": 0.9982323869990261, + "grad_norm": 3.8231501425701655, + "learning_rate": 3.1453145421966865e-11, + "loss": 0.9163, + "step": 13836 + }, + { + "epoch": 0.9983045344684535, + "grad_norm": 2.3659039029351097, + "learning_rate": 2.8886662303717614e-11, + "loss": 0.9147, + "step": 13837 + }, + { + "epoch": 0.998376681937881, + "grad_norm": 2.5240294526886076, + "learning_rate": 2.642939020391921e-11, + "loss": 1.0405, + "step": 13838 + }, + { + "epoch": 0.9984488294073085, + "grad_norm": 2.2745814640586968, + "learning_rate": 2.4081329257352733e-11, + "loss": 0.8316, + "step": 13839 + }, + { + "epoch": 0.998520976876736, + "grad_norm": 2.968210432309541, + "learning_rate": 2.184247959213792e-11, + "loss": 0.8893, + "step": 13840 + }, + { + "epoch": 0.9985931243461635, + "grad_norm": 2.7174076844426596, + "learning_rate": 1.97128413303993e-11, + "loss": 0.9221, + "step": 13841 + }, + { + "epoch": 0.9986652718155911, + "grad_norm": 2.2066521513116713, + "learning_rate": 1.76924145882662e-11, + "loss": 0.8904, + "step": 13842 + }, + { + "epoch": 0.9987374192850186, + "grad_norm": 2.627016110710983, + "learning_rate": 1.5781199476538887e-11, + "loss": 0.8799, + "step": 13843 + }, + { + "epoch": 0.9988095667544461, + "grad_norm": 2.946772772430692, + "learning_rate": 1.3979196099134227e-11, + "loss": 0.8684, + "step": 13844 + }, + { + "epoch": 0.9988817142238736, + "grad_norm": 3.4800545631165134, + "learning_rate": 1.2286404554862073e-11, + "loss": 0.8941, + "step": 13845 + }, + { + "epoch": 0.9989538616933011, + "grad_norm": 4.6697850480530185, + "learning_rate": 1.0702824935648891e-11, + "loss": 0.9272, + "step": 13846 + }, + { + "epoch": 0.9990260091627287, + "grad_norm": 2.42419529177263, + "learning_rate": 9.22845732831412e-12, + "loss": 1.0091, + "step": 13847 + }, + { + "epoch": 0.9990981566321562, + "grad_norm": 2.3400898136932957, + "learning_rate": 7.863301813459954e-12, + "loss": 0.8857, + "step": 13848 + }, + { + "epoch": 0.9991703041015836, + "grad_norm": 2.3561956950843315, + "learning_rate": 6.607358465249291e-12, + "loss": 0.8586, + "step": 13849 + }, + { + "epoch": 0.9992424515710111, + "grad_norm": 3.9445762627175838, + "learning_rate": 5.460627352738001e-12, + "loss": 0.8632, + "step": 13850 + }, + { + "epoch": 0.9993145990404386, + "grad_norm": 1.8499746475544572, + "learning_rate": 4.423108538098574e-12, + "loss": 0.9538, + "step": 13851 + }, + { + "epoch": 0.9993867465098661, + "grad_norm": 2.646115525783683, + "learning_rate": 3.4948020781744304e-12, + "loss": 0.9684, + "step": 13852 + }, + { + "epoch": 0.9994588939792937, + "grad_norm": 2.1768729246635083, + "learning_rate": 2.6757080238137832e-12, + "loss": 0.9481, + "step": 13853 + }, + { + "epoch": 0.9995310414487212, + "grad_norm": 2.1665622857210765, + "learning_rate": 1.9658264194255538e-12, + "loss": 0.9034, + "step": 13854 + }, + { + "epoch": 0.9996031889181487, + "grad_norm": 3.3938639416192915, + "learning_rate": 1.3651573040895925e-12, + "loss": 0.9665, + "step": 13855 + }, + { + "epoch": 0.9996753363875762, + "grad_norm": 0.7143438290047307, + "learning_rate": 8.737007104464567e-13, + "loss": 0.7875, + "step": 13856 + }, + { + "epoch": 0.9997474838570037, + "grad_norm": 2.2164065382872895, + "learning_rate": 4.914566651414986e-13, + "loss": 0.948, + "step": 13857 + }, + { + "epoch": 0.9998196313264313, + "grad_norm": 2.7205634372229937, + "learning_rate": 2.1842518949100052e-13, + "loss": 0.9593, + "step": 13858 + }, + { + "epoch": 0.9998917787958588, + "grad_norm": 2.320905924220547, + "learning_rate": 5.460629814990625e-14, + "loss": 0.9464, + "step": 13859 + }, + { + "epoch": 0.9999639262652863, + "grad_norm": 12.859198790370785, + "learning_rate": 0.0, + "loss": 0.7757, + "step": 13860 + }, + { + "epoch": 0.9999639262652863, + "step": 13860, + "total_flos": 1.0308343916742574e+18, + "train_loss": 0.9310774591578034, + "train_runtime": 439745.8749, + "train_samples_per_second": 1.513, + "train_steps_per_second": 0.032 + } + ], + "logging_steps": 1.0, + "max_steps": 13860, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0308343916742574e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}