diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8358 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9995792131285504, + "eval_steps": 500, + "global_step": 1188, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001683147485798443, + "grad_norm": 0.17560942471027374, + "learning_rate": 0.0, + "loss": 2.613, + "step": 1 + }, + { + "epoch": 0.003366294971596886, + "grad_norm": 0.15861666202545166, + "learning_rate": 2.7894294565112984e-06, + "loss": 2.6655, + "step": 2 + }, + { + "epoch": 0.005049442457395329, + "grad_norm": 0.1817302405834198, + "learning_rate": 4.421141086977404e-06, + "loss": 2.55, + "step": 3 + }, + { + "epoch": 0.006732589943193772, + "grad_norm": 0.17854492366313934, + "learning_rate": 5.578858913022597e-06, + "loss": 2.7908, + "step": 4 + }, + { + "epoch": 0.008415737428992216, + "grad_norm": 0.17169038951396942, + "learning_rate": 6.47685462377997e-06, + "loss": 2.6868, + "step": 5 + }, + { + "epoch": 0.010098884914790659, + "grad_norm": 0.18368647992610931, + "learning_rate": 7.210570543488702e-06, + "loss": 2.5874, + "step": 6 + }, + { + "epoch": 0.011782032400589101, + "grad_norm": 0.19648714363574982, + "learning_rate": 7.830918514469461e-06, + "loss": 2.6633, + "step": 7 + }, + { + "epoch": 0.013465179886387544, + "grad_norm": 0.18358571827411652, + "learning_rate": 8.368288369533896e-06, + "loss": 2.6355, + "step": 8 + }, + { + "epoch": 0.015148327372185988, + "grad_norm": 0.19153611361980438, + "learning_rate": 8.842282173954808e-06, + "loss": 2.6633, + "step": 9 + }, + { + "epoch": 0.016831474857984433, + "grad_norm": 0.20646820962429047, + "learning_rate": 9.26628408029127e-06, + "loss": 2.7268, + "step": 10 + }, + { + "epoch": 0.018514622343782875, + "grad_norm": 0.18688935041427612, + "learning_rate": 9.64984045981344e-06, + "loss": 2.7832, + "step": 11 + }, + { + "epoch": 0.020197769829581318, + "grad_norm": 0.1985747218132019, + "learning_rate": 1e-05, + "loss": 2.738, + "step": 12 + }, + { + "epoch": 0.02188091731537976, + "grad_norm": 0.19321100413799286, + "learning_rate": 1e-05, + "loss": 2.6206, + "step": 13 + }, + { + "epoch": 0.023564064801178203, + "grad_norm": 0.1875382661819458, + "learning_rate": 1e-05, + "loss": 2.7153, + "step": 14 + }, + { + "epoch": 0.025247212286976645, + "grad_norm": 0.18803201615810394, + "learning_rate": 1e-05, + "loss": 2.5359, + "step": 15 + }, + { + "epoch": 0.026930359772775088, + "grad_norm": 0.19693922996520996, + "learning_rate": 1e-05, + "loss": 2.6082, + "step": 16 + }, + { + "epoch": 0.028613507258573534, + "grad_norm": 0.20534300804138184, + "learning_rate": 1e-05, + "loss": 2.5317, + "step": 17 + }, + { + "epoch": 0.030296654744371977, + "grad_norm": 0.22174465656280518, + "learning_rate": 1e-05, + "loss": 2.6067, + "step": 18 + }, + { + "epoch": 0.03197980223017042, + "grad_norm": 0.1947612464427948, + "learning_rate": 1e-05, + "loss": 2.6824, + "step": 19 + }, + { + "epoch": 0.033662949715968865, + "grad_norm": 0.19715926051139832, + "learning_rate": 1e-05, + "loss": 2.6868, + "step": 20 + }, + { + "epoch": 0.035346097201767304, + "grad_norm": 0.19586338102817535, + "learning_rate": 1e-05, + "loss": 2.6206, + "step": 21 + }, + { + "epoch": 0.03702924468756575, + "grad_norm": 0.19280074536800385, + "learning_rate": 1e-05, + "loss": 2.6023, + "step": 22 + }, + { + "epoch": 0.03871239217336419, + "grad_norm": 0.19658198952674866, + "learning_rate": 1e-05, + "loss": 2.6384, + "step": 23 + }, + { + "epoch": 0.040395539659162635, + "grad_norm": 0.17433768510818481, + "learning_rate": 1e-05, + "loss": 2.5305, + "step": 24 + }, + { + "epoch": 0.042078687144961074, + "grad_norm": 0.18013380467891693, + "learning_rate": 1e-05, + "loss": 2.6519, + "step": 25 + }, + { + "epoch": 0.04376183463075952, + "grad_norm": 0.1933555006980896, + "learning_rate": 1e-05, + "loss": 2.5591, + "step": 26 + }, + { + "epoch": 0.045444982116557966, + "grad_norm": 0.18386027216911316, + "learning_rate": 1e-05, + "loss": 2.6169, + "step": 27 + }, + { + "epoch": 0.047128129602356406, + "grad_norm": 0.18173415958881378, + "learning_rate": 1e-05, + "loss": 2.623, + "step": 28 + }, + { + "epoch": 0.04881127708815485, + "grad_norm": 0.19154761731624603, + "learning_rate": 1e-05, + "loss": 2.5981, + "step": 29 + }, + { + "epoch": 0.05049442457395329, + "grad_norm": 0.2001664638519287, + "learning_rate": 1e-05, + "loss": 2.5066, + "step": 30 + }, + { + "epoch": 0.05217757205975174, + "grad_norm": 0.15573543310165405, + "learning_rate": 1e-05, + "loss": 2.6013, + "step": 31 + }, + { + "epoch": 0.053860719545550176, + "grad_norm": 0.16071979701519012, + "learning_rate": 1e-05, + "loss": 2.4634, + "step": 32 + }, + { + "epoch": 0.05554386703134862, + "grad_norm": 0.1769736260175705, + "learning_rate": 1e-05, + "loss": 2.5491, + "step": 33 + }, + { + "epoch": 0.05722701451714707, + "grad_norm": 0.17623937129974365, + "learning_rate": 1e-05, + "loss": 2.4399, + "step": 34 + }, + { + "epoch": 0.05891016200294551, + "grad_norm": 0.17367449402809143, + "learning_rate": 1e-05, + "loss": 2.5464, + "step": 35 + }, + { + "epoch": 0.06059330948874395, + "grad_norm": 0.14842955768108368, + "learning_rate": 1e-05, + "loss": 2.4174, + "step": 36 + }, + { + "epoch": 0.06227645697454239, + "grad_norm": 0.17405100166797638, + "learning_rate": 1e-05, + "loss": 2.5303, + "step": 37 + }, + { + "epoch": 0.06395960446034084, + "grad_norm": 0.145203098654747, + "learning_rate": 1e-05, + "loss": 2.6428, + "step": 38 + }, + { + "epoch": 0.06564275194613928, + "grad_norm": 0.1542726755142212, + "learning_rate": 1e-05, + "loss": 2.5618, + "step": 39 + }, + { + "epoch": 0.06732589943193773, + "grad_norm": 0.14489781856536865, + "learning_rate": 1e-05, + "loss": 2.6885, + "step": 40 + }, + { + "epoch": 0.06900904691773617, + "grad_norm": 0.14798486232757568, + "learning_rate": 1e-05, + "loss": 2.5322, + "step": 41 + }, + { + "epoch": 0.07069219440353461, + "grad_norm": 0.15226829051971436, + "learning_rate": 1e-05, + "loss": 2.6011, + "step": 42 + }, + { + "epoch": 0.07237534188933305, + "grad_norm": 0.14561522006988525, + "learning_rate": 1e-05, + "loss": 2.5657, + "step": 43 + }, + { + "epoch": 0.0740584893751315, + "grad_norm": 0.13787826895713806, + "learning_rate": 1e-05, + "loss": 2.6011, + "step": 44 + }, + { + "epoch": 0.07574163686092994, + "grad_norm": 0.14005698263645172, + "learning_rate": 1e-05, + "loss": 2.4673, + "step": 45 + }, + { + "epoch": 0.07742478434672838, + "grad_norm": 0.13822345435619354, + "learning_rate": 1e-05, + "loss": 2.512, + "step": 46 + }, + { + "epoch": 0.07910793183252683, + "grad_norm": 0.1284177154302597, + "learning_rate": 1e-05, + "loss": 2.5625, + "step": 47 + }, + { + "epoch": 0.08079107931832527, + "grad_norm": 0.1279960423707962, + "learning_rate": 1e-05, + "loss": 2.46, + "step": 48 + }, + { + "epoch": 0.08247422680412371, + "grad_norm": 0.12479826807975769, + "learning_rate": 1e-05, + "loss": 2.5706, + "step": 49 + }, + { + "epoch": 0.08415737428992215, + "grad_norm": 0.12982836365699768, + "learning_rate": 1e-05, + "loss": 2.5098, + "step": 50 + }, + { + "epoch": 0.0858405217757206, + "grad_norm": 0.13269256055355072, + "learning_rate": 1e-05, + "loss": 2.4688, + "step": 51 + }, + { + "epoch": 0.08752366926151904, + "grad_norm": 0.11713477969169617, + "learning_rate": 1e-05, + "loss": 2.6226, + "step": 52 + }, + { + "epoch": 0.08920681674731748, + "grad_norm": 0.11179152131080627, + "learning_rate": 1e-05, + "loss": 2.4224, + "step": 53 + }, + { + "epoch": 0.09088996423311593, + "grad_norm": 0.12146276980638504, + "learning_rate": 1e-05, + "loss": 2.4639, + "step": 54 + }, + { + "epoch": 0.09257311171891437, + "grad_norm": 0.12470445781946182, + "learning_rate": 1e-05, + "loss": 2.5195, + "step": 55 + }, + { + "epoch": 0.09425625920471281, + "grad_norm": 0.11872275173664093, + "learning_rate": 1e-05, + "loss": 2.5186, + "step": 56 + }, + { + "epoch": 0.09593940669051125, + "grad_norm": 0.11616484075784683, + "learning_rate": 1e-05, + "loss": 2.5581, + "step": 57 + }, + { + "epoch": 0.0976225541763097, + "grad_norm": 0.1075875386595726, + "learning_rate": 1e-05, + "loss": 2.5693, + "step": 58 + }, + { + "epoch": 0.09930570166210814, + "grad_norm": 0.10176095366477966, + "learning_rate": 1e-05, + "loss": 2.521, + "step": 59 + }, + { + "epoch": 0.10098884914790658, + "grad_norm": 0.1076890155673027, + "learning_rate": 1e-05, + "loss": 2.53, + "step": 60 + }, + { + "epoch": 0.10267199663370503, + "grad_norm": 0.09105601906776428, + "learning_rate": 1e-05, + "loss": 2.3733, + "step": 61 + }, + { + "epoch": 0.10435514411950347, + "grad_norm": 0.09733142703771591, + "learning_rate": 1e-05, + "loss": 2.416, + "step": 62 + }, + { + "epoch": 0.10603829160530191, + "grad_norm": 0.09099874645471573, + "learning_rate": 1e-05, + "loss": 2.3774, + "step": 63 + }, + { + "epoch": 0.10772143909110035, + "grad_norm": 0.0884426161646843, + "learning_rate": 1e-05, + "loss": 2.4136, + "step": 64 + }, + { + "epoch": 0.1094045865768988, + "grad_norm": 0.08939989656209946, + "learning_rate": 1e-05, + "loss": 2.4482, + "step": 65 + }, + { + "epoch": 0.11108773406269724, + "grad_norm": 0.09078355878591537, + "learning_rate": 1e-05, + "loss": 2.5256, + "step": 66 + }, + { + "epoch": 0.11277088154849568, + "grad_norm": 0.08570227026939392, + "learning_rate": 1e-05, + "loss": 2.4954, + "step": 67 + }, + { + "epoch": 0.11445402903429414, + "grad_norm": 0.0766797736287117, + "learning_rate": 1e-05, + "loss": 2.3694, + "step": 68 + }, + { + "epoch": 0.11613717652009257, + "grad_norm": 0.08015618473291397, + "learning_rate": 1e-05, + "loss": 2.4724, + "step": 69 + }, + { + "epoch": 0.11782032400589101, + "grad_norm": 0.08956343680620193, + "learning_rate": 1e-05, + "loss": 2.47, + "step": 70 + }, + { + "epoch": 0.11950347149168945, + "grad_norm": 0.08134786039590836, + "learning_rate": 1e-05, + "loss": 2.4482, + "step": 71 + }, + { + "epoch": 0.1211866189774879, + "grad_norm": 0.07923366874456406, + "learning_rate": 1e-05, + "loss": 2.4182, + "step": 72 + }, + { + "epoch": 0.12286976646328635, + "grad_norm": 0.07909434288740158, + "learning_rate": 1e-05, + "loss": 2.3711, + "step": 73 + }, + { + "epoch": 0.12455291394908478, + "grad_norm": 0.07540368288755417, + "learning_rate": 1e-05, + "loss": 2.3962, + "step": 74 + }, + { + "epoch": 0.12623606143488322, + "grad_norm": 0.06906846165657043, + "learning_rate": 1e-05, + "loss": 2.519, + "step": 75 + }, + { + "epoch": 0.12791920892068168, + "grad_norm": 0.07301697880029678, + "learning_rate": 1e-05, + "loss": 2.5537, + "step": 76 + }, + { + "epoch": 0.12960235640648013, + "grad_norm": 0.07182423770427704, + "learning_rate": 1e-05, + "loss": 2.4807, + "step": 77 + }, + { + "epoch": 0.13128550389227855, + "grad_norm": 0.06827539950609207, + "learning_rate": 1e-05, + "loss": 2.5796, + "step": 78 + }, + { + "epoch": 0.132968651378077, + "grad_norm": 0.07280007749795914, + "learning_rate": 1e-05, + "loss": 2.499, + "step": 79 + }, + { + "epoch": 0.13465179886387546, + "grad_norm": 0.07410164177417755, + "learning_rate": 1e-05, + "loss": 2.3418, + "step": 80 + }, + { + "epoch": 0.13633494634967389, + "grad_norm": 0.07245635986328125, + "learning_rate": 1e-05, + "loss": 2.4685, + "step": 81 + }, + { + "epoch": 0.13801809383547234, + "grad_norm": 0.06992876529693604, + "learning_rate": 1e-05, + "loss": 2.4634, + "step": 82 + }, + { + "epoch": 0.13970124132127076, + "grad_norm": 0.07322832196950912, + "learning_rate": 1e-05, + "loss": 2.4949, + "step": 83 + }, + { + "epoch": 0.14138438880706922, + "grad_norm": 0.06528163701295853, + "learning_rate": 1e-05, + "loss": 2.3982, + "step": 84 + }, + { + "epoch": 0.14306753629286767, + "grad_norm": 0.06972632557153702, + "learning_rate": 1e-05, + "loss": 2.4268, + "step": 85 + }, + { + "epoch": 0.1447506837786661, + "grad_norm": 0.062493499368429184, + "learning_rate": 1e-05, + "loss": 2.4309, + "step": 86 + }, + { + "epoch": 0.14643383126446455, + "grad_norm": 0.07086165249347687, + "learning_rate": 1e-05, + "loss": 2.4373, + "step": 87 + }, + { + "epoch": 0.148116978750263, + "grad_norm": 0.06631726026535034, + "learning_rate": 1e-05, + "loss": 2.4141, + "step": 88 + }, + { + "epoch": 0.14980012623606143, + "grad_norm": 0.07114582508802414, + "learning_rate": 1e-05, + "loss": 2.3546, + "step": 89 + }, + { + "epoch": 0.15148327372185988, + "grad_norm": 0.06932078301906586, + "learning_rate": 1e-05, + "loss": 2.4758, + "step": 90 + }, + { + "epoch": 0.15316642120765833, + "grad_norm": 0.06153389438986778, + "learning_rate": 1e-05, + "loss": 2.481, + "step": 91 + }, + { + "epoch": 0.15484956869345676, + "grad_norm": 0.06216192990541458, + "learning_rate": 1e-05, + "loss": 2.4421, + "step": 92 + }, + { + "epoch": 0.1565327161792552, + "grad_norm": 0.06554314494132996, + "learning_rate": 1e-05, + "loss": 2.3008, + "step": 93 + }, + { + "epoch": 0.15821586366505366, + "grad_norm": 0.06210967153310776, + "learning_rate": 1e-05, + "loss": 2.2554, + "step": 94 + }, + { + "epoch": 0.1598990111508521, + "grad_norm": 0.06851295381784439, + "learning_rate": 1e-05, + "loss": 2.5356, + "step": 95 + }, + { + "epoch": 0.16158215863665054, + "grad_norm": 0.06121644005179405, + "learning_rate": 1e-05, + "loss": 2.4299, + "step": 96 + }, + { + "epoch": 0.16326530612244897, + "grad_norm": 0.06593657284975052, + "learning_rate": 1e-05, + "loss": 2.3811, + "step": 97 + }, + { + "epoch": 0.16494845360824742, + "grad_norm": 0.06456276774406433, + "learning_rate": 1e-05, + "loss": 2.3574, + "step": 98 + }, + { + "epoch": 0.16663160109404587, + "grad_norm": 0.061866894364356995, + "learning_rate": 1e-05, + "loss": 2.4758, + "step": 99 + }, + { + "epoch": 0.1683147485798443, + "grad_norm": 0.058500371873378754, + "learning_rate": 1e-05, + "loss": 2.4133, + "step": 100 + }, + { + "epoch": 0.16999789606564275, + "grad_norm": 0.06366603821516037, + "learning_rate": 1e-05, + "loss": 2.3328, + "step": 101 + }, + { + "epoch": 0.1716810435514412, + "grad_norm": 0.061924271285533905, + "learning_rate": 1e-05, + "loss": 2.4047, + "step": 102 + }, + { + "epoch": 0.17336419103723963, + "grad_norm": 0.057471342384815216, + "learning_rate": 1e-05, + "loss": 2.4333, + "step": 103 + }, + { + "epoch": 0.17504733852303808, + "grad_norm": 0.05482906475663185, + "learning_rate": 1e-05, + "loss": 2.3499, + "step": 104 + }, + { + "epoch": 0.17673048600883653, + "grad_norm": 0.056116051971912384, + "learning_rate": 1e-05, + "loss": 2.4653, + "step": 105 + }, + { + "epoch": 0.17841363349463496, + "grad_norm": 0.052277661859989166, + "learning_rate": 1e-05, + "loss": 2.4653, + "step": 106 + }, + { + "epoch": 0.1800967809804334, + "grad_norm": 0.06346592307090759, + "learning_rate": 1e-05, + "loss": 2.3549, + "step": 107 + }, + { + "epoch": 0.18177992846623187, + "grad_norm": 0.06070290133357048, + "learning_rate": 1e-05, + "loss": 2.2886, + "step": 108 + }, + { + "epoch": 0.1834630759520303, + "grad_norm": 0.055994004011154175, + "learning_rate": 1e-05, + "loss": 2.4692, + "step": 109 + }, + { + "epoch": 0.18514622343782874, + "grad_norm": 0.05782800912857056, + "learning_rate": 1e-05, + "loss": 2.3303, + "step": 110 + }, + { + "epoch": 0.18682937092362717, + "grad_norm": 0.05491410568356514, + "learning_rate": 1e-05, + "loss": 2.47, + "step": 111 + }, + { + "epoch": 0.18851251840942562, + "grad_norm": 0.060252465307712555, + "learning_rate": 1e-05, + "loss": 2.5464, + "step": 112 + }, + { + "epoch": 0.19019566589522408, + "grad_norm": 0.05614893510937691, + "learning_rate": 1e-05, + "loss": 2.3457, + "step": 113 + }, + { + "epoch": 0.1918788133810225, + "grad_norm": 0.051146939396858215, + "learning_rate": 1e-05, + "loss": 2.3918, + "step": 114 + }, + { + "epoch": 0.19356196086682095, + "grad_norm": 0.05474052205681801, + "learning_rate": 1e-05, + "loss": 2.3689, + "step": 115 + }, + { + "epoch": 0.1952451083526194, + "grad_norm": 0.052064936608076096, + "learning_rate": 1e-05, + "loss": 2.5073, + "step": 116 + }, + { + "epoch": 0.19692825583841783, + "grad_norm": 0.06184034049510956, + "learning_rate": 1e-05, + "loss": 2.4248, + "step": 117 + }, + { + "epoch": 0.19861140332421628, + "grad_norm": 0.05613533779978752, + "learning_rate": 1e-05, + "loss": 2.5742, + "step": 118 + }, + { + "epoch": 0.20029455081001474, + "grad_norm": 0.05547456443309784, + "learning_rate": 1e-05, + "loss": 2.3884, + "step": 119 + }, + { + "epoch": 0.20197769829581316, + "grad_norm": 0.05933033674955368, + "learning_rate": 1e-05, + "loss": 2.45, + "step": 120 + }, + { + "epoch": 0.20366084578161162, + "grad_norm": 0.058600571006536484, + "learning_rate": 1e-05, + "loss": 2.3875, + "step": 121 + }, + { + "epoch": 0.20534399326741007, + "grad_norm": 0.0554657019674778, + "learning_rate": 1e-05, + "loss": 2.3215, + "step": 122 + }, + { + "epoch": 0.2070271407532085, + "grad_norm": 0.05604475364089012, + "learning_rate": 1e-05, + "loss": 2.3329, + "step": 123 + }, + { + "epoch": 0.20871028823900695, + "grad_norm": 0.06094202771782875, + "learning_rate": 1e-05, + "loss": 2.4177, + "step": 124 + }, + { + "epoch": 0.2103934357248054, + "grad_norm": 0.05517999082803726, + "learning_rate": 1e-05, + "loss": 2.3247, + "step": 125 + }, + { + "epoch": 0.21207658321060383, + "grad_norm": 0.05678452178835869, + "learning_rate": 1e-05, + "loss": 2.3481, + "step": 126 + }, + { + "epoch": 0.21375973069640228, + "grad_norm": 0.05295870825648308, + "learning_rate": 1e-05, + "loss": 2.3694, + "step": 127 + }, + { + "epoch": 0.2154428781822007, + "grad_norm": 0.05118125304579735, + "learning_rate": 1e-05, + "loss": 2.4102, + "step": 128 + }, + { + "epoch": 0.21712602566799916, + "grad_norm": 0.05659961327910423, + "learning_rate": 1e-05, + "loss": 2.3104, + "step": 129 + }, + { + "epoch": 0.2188091731537976, + "grad_norm": 0.05049075558781624, + "learning_rate": 1e-05, + "loss": 2.4949, + "step": 130 + }, + { + "epoch": 0.22049232063959603, + "grad_norm": 0.05323097109794617, + "learning_rate": 1e-05, + "loss": 2.323, + "step": 131 + }, + { + "epoch": 0.2221754681253945, + "grad_norm": 0.05309610068798065, + "learning_rate": 1e-05, + "loss": 2.5203, + "step": 132 + }, + { + "epoch": 0.22385861561119294, + "grad_norm": 0.05474167317152023, + "learning_rate": 1e-05, + "loss": 2.408, + "step": 133 + }, + { + "epoch": 0.22554176309699137, + "grad_norm": 0.056433092802762985, + "learning_rate": 1e-05, + "loss": 2.3779, + "step": 134 + }, + { + "epoch": 0.22722491058278982, + "grad_norm": 0.047424182295799255, + "learning_rate": 1e-05, + "loss": 2.45, + "step": 135 + }, + { + "epoch": 0.22890805806858827, + "grad_norm": 0.05422671511769295, + "learning_rate": 1e-05, + "loss": 2.3397, + "step": 136 + }, + { + "epoch": 0.2305912055543867, + "grad_norm": 0.05421329662203789, + "learning_rate": 1e-05, + "loss": 2.3779, + "step": 137 + }, + { + "epoch": 0.23227435304018515, + "grad_norm": 0.057494040578603745, + "learning_rate": 1e-05, + "loss": 2.4509, + "step": 138 + }, + { + "epoch": 0.2339575005259836, + "grad_norm": 0.0516960434615612, + "learning_rate": 1e-05, + "loss": 2.3647, + "step": 139 + }, + { + "epoch": 0.23564064801178203, + "grad_norm": 0.049899645149707794, + "learning_rate": 1e-05, + "loss": 2.4844, + "step": 140 + }, + { + "epoch": 0.23732379549758048, + "grad_norm": 0.05162065476179123, + "learning_rate": 1e-05, + "loss": 2.3613, + "step": 141 + }, + { + "epoch": 0.2390069429833789, + "grad_norm": 0.05812832713127136, + "learning_rate": 1e-05, + "loss": 2.4548, + "step": 142 + }, + { + "epoch": 0.24069009046917736, + "grad_norm": 0.04910556599497795, + "learning_rate": 1e-05, + "loss": 2.3274, + "step": 143 + }, + { + "epoch": 0.2423732379549758, + "grad_norm": 0.05346587672829628, + "learning_rate": 1e-05, + "loss": 2.325, + "step": 144 + }, + { + "epoch": 0.24405638544077424, + "grad_norm": 0.0495002381503582, + "learning_rate": 1e-05, + "loss": 2.4131, + "step": 145 + }, + { + "epoch": 0.2457395329265727, + "grad_norm": 0.05076875165104866, + "learning_rate": 1e-05, + "loss": 2.3887, + "step": 146 + }, + { + "epoch": 0.24742268041237114, + "grad_norm": 0.050955574959516525, + "learning_rate": 1e-05, + "loss": 2.4517, + "step": 147 + }, + { + "epoch": 0.24910582789816957, + "grad_norm": 0.05082906410098076, + "learning_rate": 1e-05, + "loss": 2.3401, + "step": 148 + }, + { + "epoch": 0.250788975383968, + "grad_norm": 0.052096717059612274, + "learning_rate": 1e-05, + "loss": 2.3218, + "step": 149 + }, + { + "epoch": 0.25247212286976645, + "grad_norm": 0.052378151565790176, + "learning_rate": 1e-05, + "loss": 2.4246, + "step": 150 + }, + { + "epoch": 0.2541552703555649, + "grad_norm": 0.04881056025624275, + "learning_rate": 1e-05, + "loss": 2.3435, + "step": 151 + }, + { + "epoch": 0.25583841784136335, + "grad_norm": 0.05233067274093628, + "learning_rate": 1e-05, + "loss": 2.4761, + "step": 152 + }, + { + "epoch": 0.2575215653271618, + "grad_norm": 0.05231297388672829, + "learning_rate": 1e-05, + "loss": 2.4065, + "step": 153 + }, + { + "epoch": 0.25920471281296026, + "grad_norm": 0.04649129509925842, + "learning_rate": 1e-05, + "loss": 2.4175, + "step": 154 + }, + { + "epoch": 0.26088786029875866, + "grad_norm": 0.05354660376906395, + "learning_rate": 1e-05, + "loss": 2.4731, + "step": 155 + }, + { + "epoch": 0.2625710077845571, + "grad_norm": 0.05071151629090309, + "learning_rate": 1e-05, + "loss": 2.4421, + "step": 156 + }, + { + "epoch": 0.26425415527035556, + "grad_norm": 0.04953297600150108, + "learning_rate": 1e-05, + "loss": 2.3134, + "step": 157 + }, + { + "epoch": 0.265937302756154, + "grad_norm": 0.051142722368240356, + "learning_rate": 1e-05, + "loss": 2.3335, + "step": 158 + }, + { + "epoch": 0.26762045024195247, + "grad_norm": 0.05187085270881653, + "learning_rate": 1e-05, + "loss": 2.4387, + "step": 159 + }, + { + "epoch": 0.2693035977277509, + "grad_norm": 0.04968629032373428, + "learning_rate": 1e-05, + "loss": 2.4905, + "step": 160 + }, + { + "epoch": 0.2709867452135493, + "grad_norm": 0.053009629249572754, + "learning_rate": 1e-05, + "loss": 2.4441, + "step": 161 + }, + { + "epoch": 0.27266989269934777, + "grad_norm": 0.04917874187231064, + "learning_rate": 1e-05, + "loss": 2.4763, + "step": 162 + }, + { + "epoch": 0.2743530401851462, + "grad_norm": 0.048884451389312744, + "learning_rate": 1e-05, + "loss": 2.4248, + "step": 163 + }, + { + "epoch": 0.2760361876709447, + "grad_norm": 0.049946676939725876, + "learning_rate": 1e-05, + "loss": 2.5173, + "step": 164 + }, + { + "epoch": 0.27771933515674313, + "grad_norm": 0.052534863352775574, + "learning_rate": 1e-05, + "loss": 2.4558, + "step": 165 + }, + { + "epoch": 0.2794024826425415, + "grad_norm": 0.05162844434380531, + "learning_rate": 1e-05, + "loss": 2.405, + "step": 166 + }, + { + "epoch": 0.28108563012834, + "grad_norm": 0.049985259771347046, + "learning_rate": 1e-05, + "loss": 2.3542, + "step": 167 + }, + { + "epoch": 0.28276877761413843, + "grad_norm": 0.05239354074001312, + "learning_rate": 1e-05, + "loss": 2.3721, + "step": 168 + }, + { + "epoch": 0.2844519250999369, + "grad_norm": 0.05592744052410126, + "learning_rate": 1e-05, + "loss": 2.2701, + "step": 169 + }, + { + "epoch": 0.28613507258573534, + "grad_norm": 0.052739113569259644, + "learning_rate": 1e-05, + "loss": 2.4216, + "step": 170 + }, + { + "epoch": 0.2878182200715338, + "grad_norm": 0.04806948080658913, + "learning_rate": 1e-05, + "loss": 2.3884, + "step": 171 + }, + { + "epoch": 0.2895013675573322, + "grad_norm": 0.04990949481725693, + "learning_rate": 1e-05, + "loss": 2.4419, + "step": 172 + }, + { + "epoch": 0.29118451504313064, + "grad_norm": 0.050067439675331116, + "learning_rate": 1e-05, + "loss": 2.4331, + "step": 173 + }, + { + "epoch": 0.2928676625289291, + "grad_norm": 0.0507354810833931, + "learning_rate": 1e-05, + "loss": 2.406, + "step": 174 + }, + { + "epoch": 0.29455081001472755, + "grad_norm": 0.0538686104118824, + "learning_rate": 1e-05, + "loss": 2.4182, + "step": 175 + }, + { + "epoch": 0.296233957500526, + "grad_norm": 0.05205219238996506, + "learning_rate": 1e-05, + "loss": 2.3401, + "step": 176 + }, + { + "epoch": 0.2979171049863244, + "grad_norm": 0.04672086611390114, + "learning_rate": 1e-05, + "loss": 2.3149, + "step": 177 + }, + { + "epoch": 0.29960025247212285, + "grad_norm": 0.051963068544864655, + "learning_rate": 1e-05, + "loss": 2.2537, + "step": 178 + }, + { + "epoch": 0.3012833999579213, + "grad_norm": 0.053639005869627, + "learning_rate": 1e-05, + "loss": 2.4353, + "step": 179 + }, + { + "epoch": 0.30296654744371976, + "grad_norm": 0.05326982960104942, + "learning_rate": 1e-05, + "loss": 2.334, + "step": 180 + }, + { + "epoch": 0.3046496949295182, + "grad_norm": 0.05361334979534149, + "learning_rate": 1e-05, + "loss": 2.4224, + "step": 181 + }, + { + "epoch": 0.30633284241531666, + "grad_norm": 0.05790587514638901, + "learning_rate": 1e-05, + "loss": 2.334, + "step": 182 + }, + { + "epoch": 0.30801598990111506, + "grad_norm": 0.04790763929486275, + "learning_rate": 1e-05, + "loss": 2.5073, + "step": 183 + }, + { + "epoch": 0.3096991373869135, + "grad_norm": 0.054103124886751175, + "learning_rate": 1e-05, + "loss": 2.3483, + "step": 184 + }, + { + "epoch": 0.31138228487271197, + "grad_norm": 0.05902162939310074, + "learning_rate": 1e-05, + "loss": 2.3301, + "step": 185 + }, + { + "epoch": 0.3130654323585104, + "grad_norm": 0.04853544384241104, + "learning_rate": 1e-05, + "loss": 2.5566, + "step": 186 + }, + { + "epoch": 0.3147485798443089, + "grad_norm": 0.055288165807724, + "learning_rate": 1e-05, + "loss": 2.2903, + "step": 187 + }, + { + "epoch": 0.3164317273301073, + "grad_norm": 0.05180734023451805, + "learning_rate": 1e-05, + "loss": 2.4285, + "step": 188 + }, + { + "epoch": 0.3181148748159057, + "grad_norm": 0.04889997839927673, + "learning_rate": 1e-05, + "loss": 2.2542, + "step": 189 + }, + { + "epoch": 0.3197980223017042, + "grad_norm": 0.051011502742767334, + "learning_rate": 1e-05, + "loss": 2.2893, + "step": 190 + }, + { + "epoch": 0.32148116978750263, + "grad_norm": 0.04864371567964554, + "learning_rate": 1e-05, + "loss": 2.5225, + "step": 191 + }, + { + "epoch": 0.3231643172733011, + "grad_norm": 0.05374041944742203, + "learning_rate": 1e-05, + "loss": 2.4504, + "step": 192 + }, + { + "epoch": 0.32484746475909954, + "grad_norm": 0.05158041790127754, + "learning_rate": 1e-05, + "loss": 2.4683, + "step": 193 + }, + { + "epoch": 0.32653061224489793, + "grad_norm": 0.05630083382129669, + "learning_rate": 1e-05, + "loss": 2.2415, + "step": 194 + }, + { + "epoch": 0.3282137597306964, + "grad_norm": 0.05439196154475212, + "learning_rate": 1e-05, + "loss": 2.3684, + "step": 195 + }, + { + "epoch": 0.32989690721649484, + "grad_norm": 0.05023415759205818, + "learning_rate": 1e-05, + "loss": 2.415, + "step": 196 + }, + { + "epoch": 0.3315800547022933, + "grad_norm": 0.05531445890665054, + "learning_rate": 1e-05, + "loss": 2.4626, + "step": 197 + }, + { + "epoch": 0.33326320218809175, + "grad_norm": 0.05087656155228615, + "learning_rate": 1e-05, + "loss": 2.3936, + "step": 198 + }, + { + "epoch": 0.3349463496738902, + "grad_norm": 0.05231088399887085, + "learning_rate": 1e-05, + "loss": 2.3779, + "step": 199 + }, + { + "epoch": 0.3366294971596886, + "grad_norm": 0.0514984093606472, + "learning_rate": 1e-05, + "loss": 2.3967, + "step": 200 + }, + { + "epoch": 0.33831264464548705, + "grad_norm": 0.05334719642996788, + "learning_rate": 1e-05, + "loss": 2.4604, + "step": 201 + }, + { + "epoch": 0.3399957921312855, + "grad_norm": 0.054843124002218246, + "learning_rate": 1e-05, + "loss": 2.3538, + "step": 202 + }, + { + "epoch": 0.34167893961708395, + "grad_norm": 0.04888272285461426, + "learning_rate": 1e-05, + "loss": 2.4844, + "step": 203 + }, + { + "epoch": 0.3433620871028824, + "grad_norm": 0.054122187197208405, + "learning_rate": 1e-05, + "loss": 2.3291, + "step": 204 + }, + { + "epoch": 0.34504523458868086, + "grad_norm": 0.054561201483011246, + "learning_rate": 1e-05, + "loss": 2.3218, + "step": 205 + }, + { + "epoch": 0.34672838207447926, + "grad_norm": 0.04919834062457085, + "learning_rate": 1e-05, + "loss": 2.4478, + "step": 206 + }, + { + "epoch": 0.3484115295602777, + "grad_norm": 0.050551943480968475, + "learning_rate": 1e-05, + "loss": 2.3755, + "step": 207 + }, + { + "epoch": 0.35009467704607616, + "grad_norm": 0.05242514982819557, + "learning_rate": 1e-05, + "loss": 2.3922, + "step": 208 + }, + { + "epoch": 0.3517778245318746, + "grad_norm": 0.06077054515480995, + "learning_rate": 1e-05, + "loss": 2.3218, + "step": 209 + }, + { + "epoch": 0.35346097201767307, + "grad_norm": 0.061367545276880264, + "learning_rate": 1e-05, + "loss": 2.2957, + "step": 210 + }, + { + "epoch": 0.35514411950347147, + "grad_norm": 0.0511772483587265, + "learning_rate": 1e-05, + "loss": 2.374, + "step": 211 + }, + { + "epoch": 0.3568272669892699, + "grad_norm": 0.0496203638613224, + "learning_rate": 1e-05, + "loss": 2.4182, + "step": 212 + }, + { + "epoch": 0.3585104144750684, + "grad_norm": 0.061339233070611954, + "learning_rate": 1e-05, + "loss": 2.406, + "step": 213 + }, + { + "epoch": 0.3601935619608668, + "grad_norm": 0.052460432052612305, + "learning_rate": 1e-05, + "loss": 2.4309, + "step": 214 + }, + { + "epoch": 0.3618767094466653, + "grad_norm": 0.055436089634895325, + "learning_rate": 1e-05, + "loss": 2.4141, + "step": 215 + }, + { + "epoch": 0.36355985693246373, + "grad_norm": 0.05396036058664322, + "learning_rate": 1e-05, + "loss": 2.2705, + "step": 216 + }, + { + "epoch": 0.36524300441826213, + "grad_norm": 0.04853086173534393, + "learning_rate": 1e-05, + "loss": 2.4473, + "step": 217 + }, + { + "epoch": 0.3669261519040606, + "grad_norm": 0.051015399396419525, + "learning_rate": 1e-05, + "loss": 2.5115, + "step": 218 + }, + { + "epoch": 0.36860929938985904, + "grad_norm": 0.05526035279035568, + "learning_rate": 1e-05, + "loss": 2.3123, + "step": 219 + }, + { + "epoch": 0.3702924468756575, + "grad_norm": 0.056169234216213226, + "learning_rate": 1e-05, + "loss": 2.3447, + "step": 220 + }, + { + "epoch": 0.37197559436145594, + "grad_norm": 0.05238133668899536, + "learning_rate": 1e-05, + "loss": 2.26, + "step": 221 + }, + { + "epoch": 0.37365874184725434, + "grad_norm": 0.05587685480713844, + "learning_rate": 1e-05, + "loss": 2.3083, + "step": 222 + }, + { + "epoch": 0.3753418893330528, + "grad_norm": 0.050364553928375244, + "learning_rate": 1e-05, + "loss": 2.3459, + "step": 223 + }, + { + "epoch": 0.37702503681885124, + "grad_norm": 0.0506574809551239, + "learning_rate": 1e-05, + "loss": 2.4246, + "step": 224 + }, + { + "epoch": 0.3787081843046497, + "grad_norm": 0.05842865630984306, + "learning_rate": 1e-05, + "loss": 2.2617, + "step": 225 + }, + { + "epoch": 0.38039133179044815, + "grad_norm": 0.05097496882081032, + "learning_rate": 1e-05, + "loss": 2.52, + "step": 226 + }, + { + "epoch": 0.3820744792762466, + "grad_norm": 0.05665278434753418, + "learning_rate": 1e-05, + "loss": 2.2715, + "step": 227 + }, + { + "epoch": 0.383757626762045, + "grad_norm": 0.053350359201431274, + "learning_rate": 1e-05, + "loss": 2.3101, + "step": 228 + }, + { + "epoch": 0.38544077424784345, + "grad_norm": 0.05481604114174843, + "learning_rate": 1e-05, + "loss": 2.3347, + "step": 229 + }, + { + "epoch": 0.3871239217336419, + "grad_norm": 0.06036606431007385, + "learning_rate": 1e-05, + "loss": 2.2991, + "step": 230 + }, + { + "epoch": 0.38880706921944036, + "grad_norm": 0.0606355145573616, + "learning_rate": 1e-05, + "loss": 2.4226, + "step": 231 + }, + { + "epoch": 0.3904902167052388, + "grad_norm": 0.052770137786865234, + "learning_rate": 1e-05, + "loss": 2.4539, + "step": 232 + }, + { + "epoch": 0.39217336419103727, + "grad_norm": 0.050006203353405, + "learning_rate": 1e-05, + "loss": 2.3477, + "step": 233 + }, + { + "epoch": 0.39385651167683566, + "grad_norm": 0.05640649050474167, + "learning_rate": 1e-05, + "loss": 2.3123, + "step": 234 + }, + { + "epoch": 0.3955396591626341, + "grad_norm": 0.050969429314136505, + "learning_rate": 1e-05, + "loss": 2.4534, + "step": 235 + }, + { + "epoch": 0.39722280664843257, + "grad_norm": 0.05676101893186569, + "learning_rate": 1e-05, + "loss": 2.3481, + "step": 236 + }, + { + "epoch": 0.398905954134231, + "grad_norm": 0.05844707787036896, + "learning_rate": 1e-05, + "loss": 2.3638, + "step": 237 + }, + { + "epoch": 0.4005891016200295, + "grad_norm": 0.053074926137924194, + "learning_rate": 1e-05, + "loss": 2.3904, + "step": 238 + }, + { + "epoch": 0.4022722491058279, + "grad_norm": 0.04979414492845535, + "learning_rate": 1e-05, + "loss": 2.3855, + "step": 239 + }, + { + "epoch": 0.4039553965916263, + "grad_norm": 0.05607665330171585, + "learning_rate": 1e-05, + "loss": 2.3569, + "step": 240 + }, + { + "epoch": 0.4056385440774248, + "grad_norm": 0.05964501202106476, + "learning_rate": 1e-05, + "loss": 2.3459, + "step": 241 + }, + { + "epoch": 0.40732169156322323, + "grad_norm": 0.05849093198776245, + "learning_rate": 1e-05, + "loss": 2.3213, + "step": 242 + }, + { + "epoch": 0.4090048390490217, + "grad_norm": 0.053846072405576706, + "learning_rate": 1e-05, + "loss": 2.4436, + "step": 243 + }, + { + "epoch": 0.41068798653482014, + "grad_norm": 0.054448988288640976, + "learning_rate": 1e-05, + "loss": 2.3716, + "step": 244 + }, + { + "epoch": 0.41237113402061853, + "grad_norm": 0.05229583755135536, + "learning_rate": 1e-05, + "loss": 2.4099, + "step": 245 + }, + { + "epoch": 0.414054281506417, + "grad_norm": 0.05479966476559639, + "learning_rate": 1e-05, + "loss": 2.4026, + "step": 246 + }, + { + "epoch": 0.41573742899221544, + "grad_norm": 0.061799049377441406, + "learning_rate": 1e-05, + "loss": 2.4072, + "step": 247 + }, + { + "epoch": 0.4174205764780139, + "grad_norm": 0.061452727764844894, + "learning_rate": 1e-05, + "loss": 2.2833, + "step": 248 + }, + { + "epoch": 0.41910372396381235, + "grad_norm": 0.05868072435259819, + "learning_rate": 1e-05, + "loss": 2.3833, + "step": 249 + }, + { + "epoch": 0.4207868714496108, + "grad_norm": 0.05926290899515152, + "learning_rate": 1e-05, + "loss": 2.3645, + "step": 250 + }, + { + "epoch": 0.4224700189354092, + "grad_norm": 0.058858342468738556, + "learning_rate": 1e-05, + "loss": 2.3152, + "step": 251 + }, + { + "epoch": 0.42415316642120765, + "grad_norm": 0.058599065989255905, + "learning_rate": 1e-05, + "loss": 2.2827, + "step": 252 + }, + { + "epoch": 0.4258363139070061, + "grad_norm": 0.060381706804037094, + "learning_rate": 1e-05, + "loss": 2.3024, + "step": 253 + }, + { + "epoch": 0.42751946139280456, + "grad_norm": 0.05441940575838089, + "learning_rate": 1e-05, + "loss": 2.446, + "step": 254 + }, + { + "epoch": 0.429202608878603, + "grad_norm": 0.05750846117734909, + "learning_rate": 1e-05, + "loss": 2.3958, + "step": 255 + }, + { + "epoch": 0.4308857563644014, + "grad_norm": 0.060346368700265884, + "learning_rate": 1e-05, + "loss": 2.2395, + "step": 256 + }, + { + "epoch": 0.43256890385019986, + "grad_norm": 0.056383710354566574, + "learning_rate": 1e-05, + "loss": 2.3518, + "step": 257 + }, + { + "epoch": 0.4342520513359983, + "grad_norm": 0.057746805250644684, + "learning_rate": 1e-05, + "loss": 2.2834, + "step": 258 + }, + { + "epoch": 0.43593519882179677, + "grad_norm": 0.051562029868364334, + "learning_rate": 1e-05, + "loss": 2.3677, + "step": 259 + }, + { + "epoch": 0.4376183463075952, + "grad_norm": 0.059988316148519516, + "learning_rate": 1e-05, + "loss": 2.3372, + "step": 260 + }, + { + "epoch": 0.43930149379339367, + "grad_norm": 0.05852155759930611, + "learning_rate": 1e-05, + "loss": 2.3875, + "step": 261 + }, + { + "epoch": 0.44098464127919207, + "grad_norm": 0.06629418581724167, + "learning_rate": 1e-05, + "loss": 2.4194, + "step": 262 + }, + { + "epoch": 0.4426677887649905, + "grad_norm": 0.061044465750455856, + "learning_rate": 1e-05, + "loss": 2.2466, + "step": 263 + }, + { + "epoch": 0.444350936250789, + "grad_norm": 0.056285977363586426, + "learning_rate": 1e-05, + "loss": 2.3105, + "step": 264 + }, + { + "epoch": 0.44603408373658743, + "grad_norm": 0.06135227158665657, + "learning_rate": 1e-05, + "loss": 2.3853, + "step": 265 + }, + { + "epoch": 0.4477172312223859, + "grad_norm": 0.05644640699028969, + "learning_rate": 1e-05, + "loss": 2.3888, + "step": 266 + }, + { + "epoch": 0.4494003787081843, + "grad_norm": 0.06326981633901596, + "learning_rate": 1e-05, + "loss": 2.3132, + "step": 267 + }, + { + "epoch": 0.45108352619398273, + "grad_norm": 0.05710430070757866, + "learning_rate": 1e-05, + "loss": 2.365, + "step": 268 + }, + { + "epoch": 0.4527666736797812, + "grad_norm": 0.05607946217060089, + "learning_rate": 1e-05, + "loss": 2.4648, + "step": 269 + }, + { + "epoch": 0.45444982116557964, + "grad_norm": 0.057825781404972076, + "learning_rate": 1e-05, + "loss": 2.4189, + "step": 270 + }, + { + "epoch": 0.4561329686513781, + "grad_norm": 0.06380680948495865, + "learning_rate": 1e-05, + "loss": 2.3188, + "step": 271 + }, + { + "epoch": 0.45781611613717654, + "grad_norm": 0.06377760320901871, + "learning_rate": 1e-05, + "loss": 2.2896, + "step": 272 + }, + { + "epoch": 0.45949926362297494, + "grad_norm": 0.06210333853960037, + "learning_rate": 1e-05, + "loss": 2.3663, + "step": 273 + }, + { + "epoch": 0.4611824111087734, + "grad_norm": 0.06039275974035263, + "learning_rate": 1e-05, + "loss": 2.408, + "step": 274 + }, + { + "epoch": 0.46286555859457185, + "grad_norm": 0.05442138388752937, + "learning_rate": 1e-05, + "loss": 2.3843, + "step": 275 + }, + { + "epoch": 0.4645487060803703, + "grad_norm": 0.06208937615156174, + "learning_rate": 1e-05, + "loss": 2.4355, + "step": 276 + }, + { + "epoch": 0.46623185356616875, + "grad_norm": 0.0619891993701458, + "learning_rate": 1e-05, + "loss": 2.3196, + "step": 277 + }, + { + "epoch": 0.4679150010519672, + "grad_norm": 0.059192296117544174, + "learning_rate": 1e-05, + "loss": 2.3237, + "step": 278 + }, + { + "epoch": 0.4695981485377656, + "grad_norm": 0.06284468621015549, + "learning_rate": 1e-05, + "loss": 2.3694, + "step": 279 + }, + { + "epoch": 0.47128129602356406, + "grad_norm": 0.06121189519762993, + "learning_rate": 1e-05, + "loss": 2.3606, + "step": 280 + }, + { + "epoch": 0.4729644435093625, + "grad_norm": 0.061919402331113815, + "learning_rate": 1e-05, + "loss": 2.3381, + "step": 281 + }, + { + "epoch": 0.47464759099516096, + "grad_norm": 0.0676443800330162, + "learning_rate": 1e-05, + "loss": 2.3624, + "step": 282 + }, + { + "epoch": 0.4763307384809594, + "grad_norm": 0.060140665620565414, + "learning_rate": 1e-05, + "loss": 2.4541, + "step": 283 + }, + { + "epoch": 0.4780138859667578, + "grad_norm": 0.062285441905260086, + "learning_rate": 1e-05, + "loss": 2.323, + "step": 284 + }, + { + "epoch": 0.47969703345255627, + "grad_norm": 0.06063227355480194, + "learning_rate": 1e-05, + "loss": 2.3596, + "step": 285 + }, + { + "epoch": 0.4813801809383547, + "grad_norm": 0.05906851589679718, + "learning_rate": 1e-05, + "loss": 2.458, + "step": 286 + }, + { + "epoch": 0.48306332842415317, + "grad_norm": 0.05862203240394592, + "learning_rate": 1e-05, + "loss": 2.291, + "step": 287 + }, + { + "epoch": 0.4847464759099516, + "grad_norm": 0.0629325732588768, + "learning_rate": 1e-05, + "loss": 2.2634, + "step": 288 + }, + { + "epoch": 0.4864296233957501, + "grad_norm": 0.06464157998561859, + "learning_rate": 1e-05, + "loss": 2.2531, + "step": 289 + }, + { + "epoch": 0.4881127708815485, + "grad_norm": 0.0547555610537529, + "learning_rate": 1e-05, + "loss": 2.5339, + "step": 290 + }, + { + "epoch": 0.4897959183673469, + "grad_norm": 0.0606168657541275, + "learning_rate": 1e-05, + "loss": 2.2886, + "step": 291 + }, + { + "epoch": 0.4914790658531454, + "grad_norm": 0.058814577758312225, + "learning_rate": 1e-05, + "loss": 2.3337, + "step": 292 + }, + { + "epoch": 0.49316221333894383, + "grad_norm": 0.0691385492682457, + "learning_rate": 1e-05, + "loss": 2.2904, + "step": 293 + }, + { + "epoch": 0.4948453608247423, + "grad_norm": 0.06522157788276672, + "learning_rate": 1e-05, + "loss": 2.469, + "step": 294 + }, + { + "epoch": 0.4965285083105407, + "grad_norm": 0.05957287177443504, + "learning_rate": 1e-05, + "loss": 2.4095, + "step": 295 + }, + { + "epoch": 0.49821165579633914, + "grad_norm": 0.06277060508728027, + "learning_rate": 1e-05, + "loss": 2.4697, + "step": 296 + }, + { + "epoch": 0.4998948032821376, + "grad_norm": 0.06802426278591156, + "learning_rate": 1e-05, + "loss": 2.2517, + "step": 297 + }, + { + "epoch": 0.501577950767936, + "grad_norm": 0.06365792453289032, + "learning_rate": 1e-05, + "loss": 2.2942, + "step": 298 + }, + { + "epoch": 0.5032610982537344, + "grad_norm": 0.06624794751405716, + "learning_rate": 1e-05, + "loss": 2.283, + "step": 299 + }, + { + "epoch": 0.5049442457395329, + "grad_norm": 0.05979595705866814, + "learning_rate": 1e-05, + "loss": 2.4387, + "step": 300 + }, + { + "epoch": 0.5066273932253313, + "grad_norm": 0.06187634915113449, + "learning_rate": 1e-05, + "loss": 2.4205, + "step": 301 + }, + { + "epoch": 0.5083105407111298, + "grad_norm": 0.06389462947845459, + "learning_rate": 1e-05, + "loss": 2.2775, + "step": 302 + }, + { + "epoch": 0.5099936881969283, + "grad_norm": 0.05831071361899376, + "learning_rate": 1e-05, + "loss": 2.3892, + "step": 303 + }, + { + "epoch": 0.5116768356827267, + "grad_norm": 0.06568494439125061, + "learning_rate": 1e-05, + "loss": 2.3087, + "step": 304 + }, + { + "epoch": 0.5133599831685252, + "grad_norm": 0.062109317630529404, + "learning_rate": 1e-05, + "loss": 2.3268, + "step": 305 + }, + { + "epoch": 0.5150431306543236, + "grad_norm": 0.061168327927589417, + "learning_rate": 1e-05, + "loss": 2.3093, + "step": 306 + }, + { + "epoch": 0.5167262781401221, + "grad_norm": 0.061159648001194, + "learning_rate": 1e-05, + "loss": 2.3315, + "step": 307 + }, + { + "epoch": 0.5184094256259205, + "grad_norm": 0.06269169598817825, + "learning_rate": 1e-05, + "loss": 2.3442, + "step": 308 + }, + { + "epoch": 0.520092573111719, + "grad_norm": 0.06711502373218536, + "learning_rate": 1e-05, + "loss": 2.2008, + "step": 309 + }, + { + "epoch": 0.5217757205975173, + "grad_norm": 0.0663105845451355, + "learning_rate": 1e-05, + "loss": 2.3502, + "step": 310 + }, + { + "epoch": 0.5234588680833158, + "grad_norm": 0.06040646880865097, + "learning_rate": 1e-05, + "loss": 2.3414, + "step": 311 + }, + { + "epoch": 0.5251420155691142, + "grad_norm": 0.06823603063821793, + "learning_rate": 1e-05, + "loss": 2.3392, + "step": 312 + }, + { + "epoch": 0.5268251630549127, + "grad_norm": 0.05944176763296127, + "learning_rate": 1e-05, + "loss": 2.3193, + "step": 313 + }, + { + "epoch": 0.5285083105407111, + "grad_norm": 0.06610157340765, + "learning_rate": 1e-05, + "loss": 2.2288, + "step": 314 + }, + { + "epoch": 0.5301914580265096, + "grad_norm": 0.06880299746990204, + "learning_rate": 1e-05, + "loss": 2.3529, + "step": 315 + }, + { + "epoch": 0.531874605512308, + "grad_norm": 0.06061836704611778, + "learning_rate": 1e-05, + "loss": 2.3533, + "step": 316 + }, + { + "epoch": 0.5335577529981065, + "grad_norm": 0.06552371382713318, + "learning_rate": 1e-05, + "loss": 2.3579, + "step": 317 + }, + { + "epoch": 0.5352409004839049, + "grad_norm": 0.06967922300100327, + "learning_rate": 1e-05, + "loss": 2.2983, + "step": 318 + }, + { + "epoch": 0.5369240479697034, + "grad_norm": 0.06997574120759964, + "learning_rate": 1e-05, + "loss": 2.355, + "step": 319 + }, + { + "epoch": 0.5386071954555018, + "grad_norm": 0.0654403418302536, + "learning_rate": 1e-05, + "loss": 2.4258, + "step": 320 + }, + { + "epoch": 0.5402903429413002, + "grad_norm": 0.06031208485364914, + "learning_rate": 1e-05, + "loss": 2.4011, + "step": 321 + }, + { + "epoch": 0.5419734904270986, + "grad_norm": 0.06496379524469376, + "learning_rate": 1e-05, + "loss": 2.2429, + "step": 322 + }, + { + "epoch": 0.5436566379128971, + "grad_norm": 0.06525281816720963, + "learning_rate": 1e-05, + "loss": 2.3254, + "step": 323 + }, + { + "epoch": 0.5453397853986955, + "grad_norm": 0.07553514093160629, + "learning_rate": 1e-05, + "loss": 2.2953, + "step": 324 + }, + { + "epoch": 0.547022932884494, + "grad_norm": 0.06429509073495865, + "learning_rate": 1e-05, + "loss": 2.3319, + "step": 325 + }, + { + "epoch": 0.5487060803702924, + "grad_norm": 0.0657946914434433, + "learning_rate": 1e-05, + "loss": 2.3501, + "step": 326 + }, + { + "epoch": 0.5503892278560909, + "grad_norm": 0.06548567861318588, + "learning_rate": 1e-05, + "loss": 2.2781, + "step": 327 + }, + { + "epoch": 0.5520723753418894, + "grad_norm": 0.06299672275781631, + "learning_rate": 1e-05, + "loss": 2.377, + "step": 328 + }, + { + "epoch": 0.5537555228276878, + "grad_norm": 0.06381850689649582, + "learning_rate": 1e-05, + "loss": 2.3945, + "step": 329 + }, + { + "epoch": 0.5554386703134863, + "grad_norm": 0.06497140228748322, + "learning_rate": 1e-05, + "loss": 2.3496, + "step": 330 + }, + { + "epoch": 0.5571218177992847, + "grad_norm": 0.06588133424520493, + "learning_rate": 1e-05, + "loss": 2.3955, + "step": 331 + }, + { + "epoch": 0.558804965285083, + "grad_norm": 0.06468643248081207, + "learning_rate": 1e-05, + "loss": 2.2893, + "step": 332 + }, + { + "epoch": 0.5604881127708815, + "grad_norm": 0.07278285920619965, + "learning_rate": 1e-05, + "loss": 2.3179, + "step": 333 + }, + { + "epoch": 0.56217126025668, + "grad_norm": 0.06992325931787491, + "learning_rate": 1e-05, + "loss": 2.3588, + "step": 334 + }, + { + "epoch": 0.5638544077424784, + "grad_norm": 0.06566626578569412, + "learning_rate": 1e-05, + "loss": 2.4763, + "step": 335 + }, + { + "epoch": 0.5655375552282769, + "grad_norm": 0.0633927658200264, + "learning_rate": 1e-05, + "loss": 2.4685, + "step": 336 + }, + { + "epoch": 0.5672207027140753, + "grad_norm": 0.06903122365474701, + "learning_rate": 1e-05, + "loss": 2.311, + "step": 337 + }, + { + "epoch": 0.5689038501998738, + "grad_norm": 0.06421441584825516, + "learning_rate": 1e-05, + "loss": 2.3589, + "step": 338 + }, + { + "epoch": 0.5705869976856722, + "grad_norm": 0.07122648507356644, + "learning_rate": 1e-05, + "loss": 2.3798, + "step": 339 + }, + { + "epoch": 0.5722701451714707, + "grad_norm": 0.06518077105283737, + "learning_rate": 1e-05, + "loss": 2.4546, + "step": 340 + }, + { + "epoch": 0.5739532926572691, + "grad_norm": 0.07509720325469971, + "learning_rate": 1e-05, + "loss": 2.3341, + "step": 341 + }, + { + "epoch": 0.5756364401430676, + "grad_norm": 0.06559302657842636, + "learning_rate": 1e-05, + "loss": 2.3127, + "step": 342 + }, + { + "epoch": 0.5773195876288659, + "grad_norm": 0.06652245670557022, + "learning_rate": 1e-05, + "loss": 2.3997, + "step": 343 + }, + { + "epoch": 0.5790027351146644, + "grad_norm": 0.07472145557403564, + "learning_rate": 1e-05, + "loss": 2.3237, + "step": 344 + }, + { + "epoch": 0.5806858826004628, + "grad_norm": 0.07624109089374542, + "learning_rate": 1e-05, + "loss": 2.186, + "step": 345 + }, + { + "epoch": 0.5823690300862613, + "grad_norm": 0.06387084722518921, + "learning_rate": 1e-05, + "loss": 2.2717, + "step": 346 + }, + { + "epoch": 0.5840521775720597, + "grad_norm": 0.06857839971780777, + "learning_rate": 1e-05, + "loss": 2.3726, + "step": 347 + }, + { + "epoch": 0.5857353250578582, + "grad_norm": 0.06429892778396606, + "learning_rate": 1e-05, + "loss": 2.4109, + "step": 348 + }, + { + "epoch": 0.5874184725436566, + "grad_norm": 0.0720372200012207, + "learning_rate": 1e-05, + "loss": 2.3291, + "step": 349 + }, + { + "epoch": 0.5891016200294551, + "grad_norm": 0.0749678909778595, + "learning_rate": 1e-05, + "loss": 2.3369, + "step": 350 + }, + { + "epoch": 0.5907847675152536, + "grad_norm": 0.0645705908536911, + "learning_rate": 1e-05, + "loss": 2.3894, + "step": 351 + }, + { + "epoch": 0.592467915001052, + "grad_norm": 0.06680341064929962, + "learning_rate": 1e-05, + "loss": 2.3335, + "step": 352 + }, + { + "epoch": 0.5941510624868505, + "grad_norm": 0.07383781671524048, + "learning_rate": 1e-05, + "loss": 2.2733, + "step": 353 + }, + { + "epoch": 0.5958342099726488, + "grad_norm": 0.07338624447584152, + "learning_rate": 1e-05, + "loss": 2.2236, + "step": 354 + }, + { + "epoch": 0.5975173574584473, + "grad_norm": 0.06998410820960999, + "learning_rate": 1e-05, + "loss": 2.2552, + "step": 355 + }, + { + "epoch": 0.5992005049442457, + "grad_norm": 0.06697436422109604, + "learning_rate": 1e-05, + "loss": 2.4231, + "step": 356 + }, + { + "epoch": 0.6008836524300442, + "grad_norm": 0.06693920493125916, + "learning_rate": 1e-05, + "loss": 2.3296, + "step": 357 + }, + { + "epoch": 0.6025667999158426, + "grad_norm": 0.06306028366088867, + "learning_rate": 1e-05, + "loss": 2.4009, + "step": 358 + }, + { + "epoch": 0.6042499474016411, + "grad_norm": 0.0724472776055336, + "learning_rate": 1e-05, + "loss": 2.2986, + "step": 359 + }, + { + "epoch": 0.6059330948874395, + "grad_norm": 0.06711563467979431, + "learning_rate": 1e-05, + "loss": 2.3755, + "step": 360 + }, + { + "epoch": 0.607616242373238, + "grad_norm": 0.07287666201591492, + "learning_rate": 1e-05, + "loss": 2.325, + "step": 361 + }, + { + "epoch": 0.6092993898590364, + "grad_norm": 0.07494334876537323, + "learning_rate": 1e-05, + "loss": 2.2673, + "step": 362 + }, + { + "epoch": 0.6109825373448349, + "grad_norm": 0.07399529218673706, + "learning_rate": 1e-05, + "loss": 2.3134, + "step": 363 + }, + { + "epoch": 0.6126656848306333, + "grad_norm": 0.06705833226442337, + "learning_rate": 1e-05, + "loss": 2.3772, + "step": 364 + }, + { + "epoch": 0.6143488323164318, + "grad_norm": 0.07528689503669739, + "learning_rate": 1e-05, + "loss": 2.3872, + "step": 365 + }, + { + "epoch": 0.6160319798022301, + "grad_norm": 0.06814612448215485, + "learning_rate": 1e-05, + "loss": 2.2527, + "step": 366 + }, + { + "epoch": 0.6177151272880286, + "grad_norm": 0.06929857283830643, + "learning_rate": 1e-05, + "loss": 2.4138, + "step": 367 + }, + { + "epoch": 0.619398274773827, + "grad_norm": 0.07336314767599106, + "learning_rate": 1e-05, + "loss": 2.4197, + "step": 368 + }, + { + "epoch": 0.6210814222596255, + "grad_norm": 0.07009201496839523, + "learning_rate": 1e-05, + "loss": 2.3943, + "step": 369 + }, + { + "epoch": 0.6227645697454239, + "grad_norm": 0.07367721945047379, + "learning_rate": 1e-05, + "loss": 2.3044, + "step": 370 + }, + { + "epoch": 0.6244477172312224, + "grad_norm": 0.07029354572296143, + "learning_rate": 1e-05, + "loss": 2.3018, + "step": 371 + }, + { + "epoch": 0.6261308647170208, + "grad_norm": 0.07852700352668762, + "learning_rate": 1e-05, + "loss": 2.3727, + "step": 372 + }, + { + "epoch": 0.6278140122028193, + "grad_norm": 0.0764508917927742, + "learning_rate": 1e-05, + "loss": 2.1992, + "step": 373 + }, + { + "epoch": 0.6294971596886177, + "grad_norm": 0.0799420177936554, + "learning_rate": 1e-05, + "loss": 2.2693, + "step": 374 + }, + { + "epoch": 0.6311803071744162, + "grad_norm": 0.06878554075956345, + "learning_rate": 1e-05, + "loss": 2.4749, + "step": 375 + }, + { + "epoch": 0.6328634546602147, + "grad_norm": 0.07085944712162018, + "learning_rate": 1e-05, + "loss": 2.3435, + "step": 376 + }, + { + "epoch": 0.634546602146013, + "grad_norm": 0.06489285826683044, + "learning_rate": 1e-05, + "loss": 2.3257, + "step": 377 + }, + { + "epoch": 0.6362297496318114, + "grad_norm": 0.06664973497390747, + "learning_rate": 1e-05, + "loss": 2.5022, + "step": 378 + }, + { + "epoch": 0.6379128971176099, + "grad_norm": 0.07660377770662308, + "learning_rate": 1e-05, + "loss": 2.3269, + "step": 379 + }, + { + "epoch": 0.6395960446034084, + "grad_norm": 0.06934674084186554, + "learning_rate": 1e-05, + "loss": 2.4021, + "step": 380 + }, + { + "epoch": 0.6412791920892068, + "grad_norm": 0.07515530288219452, + "learning_rate": 1e-05, + "loss": 2.3157, + "step": 381 + }, + { + "epoch": 0.6429623395750053, + "grad_norm": 0.07302498072385788, + "learning_rate": 1e-05, + "loss": 2.3892, + "step": 382 + }, + { + "epoch": 0.6446454870608037, + "grad_norm": 0.07303425669670105, + "learning_rate": 1e-05, + "loss": 2.3765, + "step": 383 + }, + { + "epoch": 0.6463286345466022, + "grad_norm": 0.07705460488796234, + "learning_rate": 1e-05, + "loss": 2.2684, + "step": 384 + }, + { + "epoch": 0.6480117820324006, + "grad_norm": 0.07487067580223083, + "learning_rate": 1e-05, + "loss": 2.3733, + "step": 385 + }, + { + "epoch": 0.6496949295181991, + "grad_norm": 0.06538619101047516, + "learning_rate": 1e-05, + "loss": 2.3789, + "step": 386 + }, + { + "epoch": 0.6513780770039975, + "grad_norm": 0.07406684756278992, + "learning_rate": 1e-05, + "loss": 2.332, + "step": 387 + }, + { + "epoch": 0.6530612244897959, + "grad_norm": 0.07246539741754532, + "learning_rate": 1e-05, + "loss": 2.2302, + "step": 388 + }, + { + "epoch": 0.6547443719755943, + "grad_norm": 0.07304323464632034, + "learning_rate": 1e-05, + "loss": 2.3708, + "step": 389 + }, + { + "epoch": 0.6564275194613928, + "grad_norm": 0.07457181811332703, + "learning_rate": 1e-05, + "loss": 2.2991, + "step": 390 + }, + { + "epoch": 0.6581106669471912, + "grad_norm": 0.07300930470228195, + "learning_rate": 1e-05, + "loss": 2.2423, + "step": 391 + }, + { + "epoch": 0.6597938144329897, + "grad_norm": 0.07508236914873123, + "learning_rate": 1e-05, + "loss": 2.2642, + "step": 392 + }, + { + "epoch": 0.6614769619187881, + "grad_norm": 0.07481173425912857, + "learning_rate": 1e-05, + "loss": 2.3, + "step": 393 + }, + { + "epoch": 0.6631601094045866, + "grad_norm": 0.06851742416620255, + "learning_rate": 1e-05, + "loss": 2.4534, + "step": 394 + }, + { + "epoch": 0.664843256890385, + "grad_norm": 0.07536716759204865, + "learning_rate": 1e-05, + "loss": 2.3264, + "step": 395 + }, + { + "epoch": 0.6665264043761835, + "grad_norm": 0.07752048969268799, + "learning_rate": 1e-05, + "loss": 2.4158, + "step": 396 + }, + { + "epoch": 0.6682095518619819, + "grad_norm": 0.06357281655073166, + "learning_rate": 1e-05, + "loss": 2.4956, + "step": 397 + }, + { + "epoch": 0.6698926993477804, + "grad_norm": 0.08333004266023636, + "learning_rate": 1e-05, + "loss": 2.3921, + "step": 398 + }, + { + "epoch": 0.6715758468335787, + "grad_norm": 0.06873282790184021, + "learning_rate": 1e-05, + "loss": 2.3611, + "step": 399 + }, + { + "epoch": 0.6732589943193772, + "grad_norm": 0.07533644139766693, + "learning_rate": 1e-05, + "loss": 2.3708, + "step": 400 + }, + { + "epoch": 0.6749421418051756, + "grad_norm": 0.07756076753139496, + "learning_rate": 1e-05, + "loss": 2.3003, + "step": 401 + }, + { + "epoch": 0.6766252892909741, + "grad_norm": 0.06644177436828613, + "learning_rate": 1e-05, + "loss": 2.4331, + "step": 402 + }, + { + "epoch": 0.6783084367767725, + "grad_norm": 0.07512148469686508, + "learning_rate": 1e-05, + "loss": 2.2881, + "step": 403 + }, + { + "epoch": 0.679991584262571, + "grad_norm": 0.08939874172210693, + "learning_rate": 1e-05, + "loss": 2.1564, + "step": 404 + }, + { + "epoch": 0.6816747317483695, + "grad_norm": 0.07984601706266403, + "learning_rate": 1e-05, + "loss": 2.3967, + "step": 405 + }, + { + "epoch": 0.6833578792341679, + "grad_norm": 0.0724392980337143, + "learning_rate": 1e-05, + "loss": 2.2859, + "step": 406 + }, + { + "epoch": 0.6850410267199664, + "grad_norm": 0.07025589793920517, + "learning_rate": 1e-05, + "loss": 2.3027, + "step": 407 + }, + { + "epoch": 0.6867241742057648, + "grad_norm": 0.07863828539848328, + "learning_rate": 1e-05, + "loss": 2.3286, + "step": 408 + }, + { + "epoch": 0.6884073216915633, + "grad_norm": 0.07466793060302734, + "learning_rate": 1e-05, + "loss": 2.2849, + "step": 409 + }, + { + "epoch": 0.6900904691773617, + "grad_norm": 0.07291209697723389, + "learning_rate": 1e-05, + "loss": 2.3931, + "step": 410 + }, + { + "epoch": 0.6917736166631601, + "grad_norm": 0.072298564016819, + "learning_rate": 1e-05, + "loss": 2.377, + "step": 411 + }, + { + "epoch": 0.6934567641489585, + "grad_norm": 0.06996294856071472, + "learning_rate": 1e-05, + "loss": 2.3503, + "step": 412 + }, + { + "epoch": 0.695139911634757, + "grad_norm": 0.07319701462984085, + "learning_rate": 1e-05, + "loss": 2.345, + "step": 413 + }, + { + "epoch": 0.6968230591205554, + "grad_norm": 0.0768033117055893, + "learning_rate": 1e-05, + "loss": 2.3679, + "step": 414 + }, + { + "epoch": 0.6985062066063539, + "grad_norm": 0.07401002943515778, + "learning_rate": 1e-05, + "loss": 2.3435, + "step": 415 + }, + { + "epoch": 0.7001893540921523, + "grad_norm": 0.07700485736131668, + "learning_rate": 1e-05, + "loss": 2.3428, + "step": 416 + }, + { + "epoch": 0.7018725015779508, + "grad_norm": 0.07446201890707016, + "learning_rate": 1e-05, + "loss": 2.4133, + "step": 417 + }, + { + "epoch": 0.7035556490637492, + "grad_norm": 0.06801878660917282, + "learning_rate": 1e-05, + "loss": 2.3665, + "step": 418 + }, + { + "epoch": 0.7052387965495477, + "grad_norm": 0.07989214360713959, + "learning_rate": 1e-05, + "loss": 2.3303, + "step": 419 + }, + { + "epoch": 0.7069219440353461, + "grad_norm": 0.07385462522506714, + "learning_rate": 1e-05, + "loss": 2.3608, + "step": 420 + }, + { + "epoch": 0.7086050915211446, + "grad_norm": 0.06808451563119888, + "learning_rate": 1e-05, + "loss": 2.4851, + "step": 421 + }, + { + "epoch": 0.7102882390069429, + "grad_norm": 0.07354162633419037, + "learning_rate": 1e-05, + "loss": 2.3005, + "step": 422 + }, + { + "epoch": 0.7119713864927414, + "grad_norm": 0.07730504870414734, + "learning_rate": 1e-05, + "loss": 2.2815, + "step": 423 + }, + { + "epoch": 0.7136545339785398, + "grad_norm": 0.08045239001512527, + "learning_rate": 1e-05, + "loss": 2.2695, + "step": 424 + }, + { + "epoch": 0.7153376814643383, + "grad_norm": 0.07997512817382812, + "learning_rate": 1e-05, + "loss": 2.3608, + "step": 425 + }, + { + "epoch": 0.7170208289501367, + "grad_norm": 0.07076172530651093, + "learning_rate": 1e-05, + "loss": 2.3411, + "step": 426 + }, + { + "epoch": 0.7187039764359352, + "grad_norm": 0.07223929464817047, + "learning_rate": 1e-05, + "loss": 2.3452, + "step": 427 + }, + { + "epoch": 0.7203871239217337, + "grad_norm": 0.07667456567287445, + "learning_rate": 1e-05, + "loss": 2.333, + "step": 428 + }, + { + "epoch": 0.7220702714075321, + "grad_norm": 0.07509643584489822, + "learning_rate": 1e-05, + "loss": 2.3701, + "step": 429 + }, + { + "epoch": 0.7237534188933306, + "grad_norm": 0.08230644464492798, + "learning_rate": 1e-05, + "loss": 2.3577, + "step": 430 + }, + { + "epoch": 0.725436566379129, + "grad_norm": 0.06938886642456055, + "learning_rate": 1e-05, + "loss": 2.4573, + "step": 431 + }, + { + "epoch": 0.7271197138649275, + "grad_norm": 0.07415178418159485, + "learning_rate": 1e-05, + "loss": 2.2834, + "step": 432 + }, + { + "epoch": 0.7288028613507258, + "grad_norm": 0.0821278989315033, + "learning_rate": 1e-05, + "loss": 2.2744, + "step": 433 + }, + { + "epoch": 0.7304860088365243, + "grad_norm": 0.07293502986431122, + "learning_rate": 1e-05, + "loss": 2.313, + "step": 434 + }, + { + "epoch": 0.7321691563223227, + "grad_norm": 0.07829819619655609, + "learning_rate": 1e-05, + "loss": 2.3849, + "step": 435 + }, + { + "epoch": 0.7338523038081212, + "grad_norm": 0.07795297354459763, + "learning_rate": 1e-05, + "loss": 2.2466, + "step": 436 + }, + { + "epoch": 0.7355354512939196, + "grad_norm": 0.06956803798675537, + "learning_rate": 1e-05, + "loss": 2.4038, + "step": 437 + }, + { + "epoch": 0.7372185987797181, + "grad_norm": 0.07948347926139832, + "learning_rate": 1e-05, + "loss": 2.3042, + "step": 438 + }, + { + "epoch": 0.7389017462655165, + "grad_norm": 0.08074218034744263, + "learning_rate": 1e-05, + "loss": 2.3314, + "step": 439 + }, + { + "epoch": 0.740584893751315, + "grad_norm": 0.08029188960790634, + "learning_rate": 1e-05, + "loss": 2.312, + "step": 440 + }, + { + "epoch": 0.7422680412371134, + "grad_norm": 0.0783049538731575, + "learning_rate": 1e-05, + "loss": 2.307, + "step": 441 + }, + { + "epoch": 0.7439511887229119, + "grad_norm": 0.08203115314245224, + "learning_rate": 1e-05, + "loss": 2.3081, + "step": 442 + }, + { + "epoch": 0.7456343362087103, + "grad_norm": 0.08666986972093582, + "learning_rate": 1e-05, + "loss": 2.3721, + "step": 443 + }, + { + "epoch": 0.7473174836945087, + "grad_norm": 0.08097022771835327, + "learning_rate": 1e-05, + "loss": 2.1912, + "step": 444 + }, + { + "epoch": 0.7490006311803071, + "grad_norm": 0.08272138237953186, + "learning_rate": 1e-05, + "loss": 2.3562, + "step": 445 + }, + { + "epoch": 0.7506837786661056, + "grad_norm": 0.08114828914403915, + "learning_rate": 1e-05, + "loss": 2.3569, + "step": 446 + }, + { + "epoch": 0.752366926151904, + "grad_norm": 0.07786712795495987, + "learning_rate": 1e-05, + "loss": 2.3772, + "step": 447 + }, + { + "epoch": 0.7540500736377025, + "grad_norm": 0.07603191584348679, + "learning_rate": 1e-05, + "loss": 2.2748, + "step": 448 + }, + { + "epoch": 0.7557332211235009, + "grad_norm": 0.08364319056272507, + "learning_rate": 1e-05, + "loss": 2.334, + "step": 449 + }, + { + "epoch": 0.7574163686092994, + "grad_norm": 0.07968125492334366, + "learning_rate": 1e-05, + "loss": 2.3225, + "step": 450 + }, + { + "epoch": 0.7590995160950978, + "grad_norm": 0.08204993605613708, + "learning_rate": 1e-05, + "loss": 2.3107, + "step": 451 + }, + { + "epoch": 0.7607826635808963, + "grad_norm": 0.08319111168384552, + "learning_rate": 1e-05, + "loss": 2.3994, + "step": 452 + }, + { + "epoch": 0.7624658110666948, + "grad_norm": 0.07812530547380447, + "learning_rate": 1e-05, + "loss": 2.2771, + "step": 453 + }, + { + "epoch": 0.7641489585524932, + "grad_norm": 0.07962696999311447, + "learning_rate": 1e-05, + "loss": 2.3094, + "step": 454 + }, + { + "epoch": 0.7658321060382917, + "grad_norm": 0.0815802663564682, + "learning_rate": 1e-05, + "loss": 2.3169, + "step": 455 + }, + { + "epoch": 0.76751525352409, + "grad_norm": 0.08460783958435059, + "learning_rate": 1e-05, + "loss": 2.2443, + "step": 456 + }, + { + "epoch": 0.7691984010098885, + "grad_norm": 0.07976390421390533, + "learning_rate": 1e-05, + "loss": 2.26, + "step": 457 + }, + { + "epoch": 0.7708815484956869, + "grad_norm": 0.08143635839223862, + "learning_rate": 1e-05, + "loss": 2.2517, + "step": 458 + }, + { + "epoch": 0.7725646959814854, + "grad_norm": 0.08004558831453323, + "learning_rate": 1e-05, + "loss": 2.3276, + "step": 459 + }, + { + "epoch": 0.7742478434672838, + "grad_norm": 0.0831751599907875, + "learning_rate": 1e-05, + "loss": 2.2842, + "step": 460 + }, + { + "epoch": 0.7759309909530823, + "grad_norm": 0.07613930851221085, + "learning_rate": 1e-05, + "loss": 2.3958, + "step": 461 + }, + { + "epoch": 0.7776141384388807, + "grad_norm": 0.08161590993404388, + "learning_rate": 1e-05, + "loss": 2.3287, + "step": 462 + }, + { + "epoch": 0.7792972859246792, + "grad_norm": 0.08616164326667786, + "learning_rate": 1e-05, + "loss": 2.3098, + "step": 463 + }, + { + "epoch": 0.7809804334104776, + "grad_norm": 0.08720822632312775, + "learning_rate": 1e-05, + "loss": 2.1388, + "step": 464 + }, + { + "epoch": 0.7826635808962761, + "grad_norm": 0.08598899841308594, + "learning_rate": 1e-05, + "loss": 2.3005, + "step": 465 + }, + { + "epoch": 0.7843467283820745, + "grad_norm": 0.07982167601585388, + "learning_rate": 1e-05, + "loss": 2.3049, + "step": 466 + }, + { + "epoch": 0.7860298758678729, + "grad_norm": 0.08733374625444412, + "learning_rate": 1e-05, + "loss": 2.2747, + "step": 467 + }, + { + "epoch": 0.7877130233536713, + "grad_norm": 0.08848235011100769, + "learning_rate": 1e-05, + "loss": 2.4331, + "step": 468 + }, + { + "epoch": 0.7893961708394698, + "grad_norm": 0.08619164675474167, + "learning_rate": 1e-05, + "loss": 2.2881, + "step": 469 + }, + { + "epoch": 0.7910793183252682, + "grad_norm": 0.08046075701713562, + "learning_rate": 1e-05, + "loss": 2.397, + "step": 470 + }, + { + "epoch": 0.7927624658110667, + "grad_norm": 0.08469874411821365, + "learning_rate": 1e-05, + "loss": 2.3225, + "step": 471 + }, + { + "epoch": 0.7944456132968651, + "grad_norm": 0.08878640830516815, + "learning_rate": 1e-05, + "loss": 2.2832, + "step": 472 + }, + { + "epoch": 0.7961287607826636, + "grad_norm": 0.08530005067586899, + "learning_rate": 1e-05, + "loss": 2.28, + "step": 473 + }, + { + "epoch": 0.797811908268462, + "grad_norm": 0.08089161664247513, + "learning_rate": 1e-05, + "loss": 2.2822, + "step": 474 + }, + { + "epoch": 0.7994950557542605, + "grad_norm": 0.0770372822880745, + "learning_rate": 1e-05, + "loss": 2.4031, + "step": 475 + }, + { + "epoch": 0.801178203240059, + "grad_norm": 0.08313820511102676, + "learning_rate": 1e-05, + "loss": 2.4009, + "step": 476 + }, + { + "epoch": 0.8028613507258574, + "grad_norm": 0.08684401214122772, + "learning_rate": 1e-05, + "loss": 2.4563, + "step": 477 + }, + { + "epoch": 0.8045444982116557, + "grad_norm": 0.08352997899055481, + "learning_rate": 1e-05, + "loss": 2.3242, + "step": 478 + }, + { + "epoch": 0.8062276456974542, + "grad_norm": 0.08148252218961716, + "learning_rate": 1e-05, + "loss": 2.3096, + "step": 479 + }, + { + "epoch": 0.8079107931832527, + "grad_norm": 0.08157838881015778, + "learning_rate": 1e-05, + "loss": 2.3108, + "step": 480 + }, + { + "epoch": 0.8095939406690511, + "grad_norm": 0.08561182022094727, + "learning_rate": 1e-05, + "loss": 2.2327, + "step": 481 + }, + { + "epoch": 0.8112770881548496, + "grad_norm": 0.09177689999341965, + "learning_rate": 1e-05, + "loss": 2.2129, + "step": 482 + }, + { + "epoch": 0.812960235640648, + "grad_norm": 0.08262176811695099, + "learning_rate": 1e-05, + "loss": 2.397, + "step": 483 + }, + { + "epoch": 0.8146433831264465, + "grad_norm": 0.08541447669267654, + "learning_rate": 1e-05, + "loss": 2.2419, + "step": 484 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 0.08732729405164719, + "learning_rate": 1e-05, + "loss": 2.3328, + "step": 485 + }, + { + "epoch": 0.8180096780980434, + "grad_norm": 0.08658833056688309, + "learning_rate": 1e-05, + "loss": 2.2793, + "step": 486 + }, + { + "epoch": 0.8196928255838418, + "grad_norm": 0.0789208933711052, + "learning_rate": 1e-05, + "loss": 2.4072, + "step": 487 + }, + { + "epoch": 0.8213759730696403, + "grad_norm": 0.07870952039957047, + "learning_rate": 1e-05, + "loss": 2.4082, + "step": 488 + }, + { + "epoch": 0.8230591205554386, + "grad_norm": 0.07583601027727127, + "learning_rate": 1e-05, + "loss": 2.3833, + "step": 489 + }, + { + "epoch": 0.8247422680412371, + "grad_norm": 0.08982661366462708, + "learning_rate": 1e-05, + "loss": 2.2766, + "step": 490 + }, + { + "epoch": 0.8264254155270355, + "grad_norm": 0.08841705322265625, + "learning_rate": 1e-05, + "loss": 2.2581, + "step": 491 + }, + { + "epoch": 0.828108563012834, + "grad_norm": 0.08784886449575424, + "learning_rate": 1e-05, + "loss": 2.2352, + "step": 492 + }, + { + "epoch": 0.8297917104986324, + "grad_norm": 0.08765432238578796, + "learning_rate": 1e-05, + "loss": 2.1957, + "step": 493 + }, + { + "epoch": 0.8314748579844309, + "grad_norm": 0.09070983529090881, + "learning_rate": 1e-05, + "loss": 2.2451, + "step": 494 + }, + { + "epoch": 0.8331580054702293, + "grad_norm": 0.08307146281003952, + "learning_rate": 1e-05, + "loss": 2.3645, + "step": 495 + }, + { + "epoch": 0.8348411529560278, + "grad_norm": 0.07774417847394943, + "learning_rate": 1e-05, + "loss": 2.3921, + "step": 496 + }, + { + "epoch": 0.8365243004418262, + "grad_norm": 0.08441779762506485, + "learning_rate": 1e-05, + "loss": 2.2974, + "step": 497 + }, + { + "epoch": 0.8382074479276247, + "grad_norm": 0.08773106336593628, + "learning_rate": 1e-05, + "loss": 2.3984, + "step": 498 + }, + { + "epoch": 0.8398905954134231, + "grad_norm": 0.08157604187726974, + "learning_rate": 1e-05, + "loss": 2.2946, + "step": 499 + }, + { + "epoch": 0.8415737428992216, + "grad_norm": 0.09280236810445786, + "learning_rate": 1e-05, + "loss": 2.3628, + "step": 500 + }, + { + "epoch": 0.8432568903850199, + "grad_norm": 0.08737549185752869, + "learning_rate": 1e-05, + "loss": 2.2593, + "step": 501 + }, + { + "epoch": 0.8449400378708184, + "grad_norm": 0.08917705714702606, + "learning_rate": 1e-05, + "loss": 2.2435, + "step": 502 + }, + { + "epoch": 0.8466231853566168, + "grad_norm": 0.08589258790016174, + "learning_rate": 1e-05, + "loss": 2.2869, + "step": 503 + }, + { + "epoch": 0.8483063328424153, + "grad_norm": 0.08363740891218185, + "learning_rate": 1e-05, + "loss": 2.1512, + "step": 504 + }, + { + "epoch": 0.8499894803282138, + "grad_norm": 0.09710842370986938, + "learning_rate": 1e-05, + "loss": 2.3042, + "step": 505 + }, + { + "epoch": 0.8516726278140122, + "grad_norm": 0.09031599014997482, + "learning_rate": 1e-05, + "loss": 2.2406, + "step": 506 + }, + { + "epoch": 0.8533557752998107, + "grad_norm": 0.08941849321126938, + "learning_rate": 1e-05, + "loss": 2.2725, + "step": 507 + }, + { + "epoch": 0.8550389227856091, + "grad_norm": 0.08926845341920853, + "learning_rate": 1e-05, + "loss": 2.323, + "step": 508 + }, + { + "epoch": 0.8567220702714076, + "grad_norm": 0.08846578001976013, + "learning_rate": 1e-05, + "loss": 2.3394, + "step": 509 + }, + { + "epoch": 0.858405217757206, + "grad_norm": 0.08452317863702774, + "learning_rate": 1e-05, + "loss": 2.4158, + "step": 510 + }, + { + "epoch": 0.8600883652430045, + "grad_norm": 0.08531490713357925, + "learning_rate": 1e-05, + "loss": 2.3113, + "step": 511 + }, + { + "epoch": 0.8617715127288028, + "grad_norm": 0.08221501857042313, + "learning_rate": 1e-05, + "loss": 2.3826, + "step": 512 + }, + { + "epoch": 0.8634546602146013, + "grad_norm": 0.08809410035610199, + "learning_rate": 1e-05, + "loss": 2.2666, + "step": 513 + }, + { + "epoch": 0.8651378077003997, + "grad_norm": 0.0881451964378357, + "learning_rate": 1e-05, + "loss": 2.4678, + "step": 514 + }, + { + "epoch": 0.8668209551861982, + "grad_norm": 0.0958879366517067, + "learning_rate": 1e-05, + "loss": 2.17, + "step": 515 + }, + { + "epoch": 0.8685041026719966, + "grad_norm": 0.08498766273260117, + "learning_rate": 1e-05, + "loss": 2.4021, + "step": 516 + }, + { + "epoch": 0.8701872501577951, + "grad_norm": 0.09182509779930115, + "learning_rate": 1e-05, + "loss": 2.2476, + "step": 517 + }, + { + "epoch": 0.8718703976435935, + "grad_norm": 0.08831535279750824, + "learning_rate": 1e-05, + "loss": 2.3013, + "step": 518 + }, + { + "epoch": 0.873553545129392, + "grad_norm": 0.08792266249656677, + "learning_rate": 1e-05, + "loss": 2.2463, + "step": 519 + }, + { + "epoch": 0.8752366926151904, + "grad_norm": 0.0804978460073471, + "learning_rate": 1e-05, + "loss": 2.5151, + "step": 520 + }, + { + "epoch": 0.8769198401009889, + "grad_norm": 0.09397967159748077, + "learning_rate": 1e-05, + "loss": 2.2487, + "step": 521 + }, + { + "epoch": 0.8786029875867873, + "grad_norm": 0.08882005512714386, + "learning_rate": 1e-05, + "loss": 2.225, + "step": 522 + }, + { + "epoch": 0.8802861350725857, + "grad_norm": 0.08365931361913681, + "learning_rate": 1e-05, + "loss": 2.4277, + "step": 523 + }, + { + "epoch": 0.8819692825583841, + "grad_norm": 0.08842651546001434, + "learning_rate": 1e-05, + "loss": 2.3884, + "step": 524 + }, + { + "epoch": 0.8836524300441826, + "grad_norm": 0.08760154247283936, + "learning_rate": 1e-05, + "loss": 2.2576, + "step": 525 + }, + { + "epoch": 0.885335577529981, + "grad_norm": 0.07843348383903503, + "learning_rate": 1e-05, + "loss": 2.4143, + "step": 526 + }, + { + "epoch": 0.8870187250157795, + "grad_norm": 0.09312726557254791, + "learning_rate": 1e-05, + "loss": 2.2472, + "step": 527 + }, + { + "epoch": 0.888701872501578, + "grad_norm": 0.09460542351007462, + "learning_rate": 1e-05, + "loss": 2.2043, + "step": 528 + }, + { + "epoch": 0.8903850199873764, + "grad_norm": 0.09200920909643173, + "learning_rate": 1e-05, + "loss": 2.3562, + "step": 529 + }, + { + "epoch": 0.8920681674731749, + "grad_norm": 0.08051000535488129, + "learning_rate": 1e-05, + "loss": 2.4146, + "step": 530 + }, + { + "epoch": 0.8937513149589733, + "grad_norm": 0.09969057142734528, + "learning_rate": 1e-05, + "loss": 2.3342, + "step": 531 + }, + { + "epoch": 0.8954344624447718, + "grad_norm": 0.08616895228624344, + "learning_rate": 1e-05, + "loss": 2.3381, + "step": 532 + }, + { + "epoch": 0.8971176099305702, + "grad_norm": 0.09115055203437805, + "learning_rate": 1e-05, + "loss": 2.2377, + "step": 533 + }, + { + "epoch": 0.8988007574163686, + "grad_norm": 0.10309138149023056, + "learning_rate": 1e-05, + "loss": 2.1418, + "step": 534 + }, + { + "epoch": 0.900483904902167, + "grad_norm": 0.09327155351638794, + "learning_rate": 1e-05, + "loss": 2.312, + "step": 535 + }, + { + "epoch": 0.9021670523879655, + "grad_norm": 0.09104789048433304, + "learning_rate": 1e-05, + "loss": 2.2759, + "step": 536 + }, + { + "epoch": 0.9038501998737639, + "grad_norm": 0.08858876675367355, + "learning_rate": 1e-05, + "loss": 2.4138, + "step": 537 + }, + { + "epoch": 0.9055333473595624, + "grad_norm": 0.08850864320993423, + "learning_rate": 1e-05, + "loss": 2.3915, + "step": 538 + }, + { + "epoch": 0.9072164948453608, + "grad_norm": 0.09071122854948044, + "learning_rate": 1e-05, + "loss": 2.4199, + "step": 539 + }, + { + "epoch": 0.9088996423311593, + "grad_norm": 0.08702193200588226, + "learning_rate": 1e-05, + "loss": 2.3079, + "step": 540 + }, + { + "epoch": 0.9105827898169577, + "grad_norm": 0.09564194083213806, + "learning_rate": 1e-05, + "loss": 2.2996, + "step": 541 + }, + { + "epoch": 0.9122659373027562, + "grad_norm": 0.08906988054513931, + "learning_rate": 1e-05, + "loss": 2.3958, + "step": 542 + }, + { + "epoch": 0.9139490847885546, + "grad_norm": 0.08117242157459259, + "learning_rate": 1e-05, + "loss": 2.5557, + "step": 543 + }, + { + "epoch": 0.9156322322743531, + "grad_norm": 0.09870729595422745, + "learning_rate": 1e-05, + "loss": 2.3542, + "step": 544 + }, + { + "epoch": 0.9173153797601514, + "grad_norm": 0.0906287208199501, + "learning_rate": 1e-05, + "loss": 2.2866, + "step": 545 + }, + { + "epoch": 0.9189985272459499, + "grad_norm": 0.08649491518735886, + "learning_rate": 1e-05, + "loss": 2.3547, + "step": 546 + }, + { + "epoch": 0.9206816747317483, + "grad_norm": 0.09572413563728333, + "learning_rate": 1e-05, + "loss": 2.377, + "step": 547 + }, + { + "epoch": 0.9223648222175468, + "grad_norm": 0.08862059563398361, + "learning_rate": 1e-05, + "loss": 2.3452, + "step": 548 + }, + { + "epoch": 0.9240479697033452, + "grad_norm": 0.09061957150697708, + "learning_rate": 1e-05, + "loss": 2.264, + "step": 549 + }, + { + "epoch": 0.9257311171891437, + "grad_norm": 0.10327678918838501, + "learning_rate": 1e-05, + "loss": 2.3362, + "step": 550 + }, + { + "epoch": 0.9274142646749421, + "grad_norm": 0.10101998597383499, + "learning_rate": 1e-05, + "loss": 2.2091, + "step": 551 + }, + { + "epoch": 0.9290974121607406, + "grad_norm": 0.08099676668643951, + "learning_rate": 1e-05, + "loss": 2.3779, + "step": 552 + }, + { + "epoch": 0.930780559646539, + "grad_norm": 0.09572342783212662, + "learning_rate": 1e-05, + "loss": 2.2186, + "step": 553 + }, + { + "epoch": 0.9324637071323375, + "grad_norm": 0.10440348833799362, + "learning_rate": 1e-05, + "loss": 2.2717, + "step": 554 + }, + { + "epoch": 0.934146854618136, + "grad_norm": 0.09859239310026169, + "learning_rate": 1e-05, + "loss": 2.2964, + "step": 555 + }, + { + "epoch": 0.9358300021039344, + "grad_norm": 0.08539914339780807, + "learning_rate": 1e-05, + "loss": 2.3541, + "step": 556 + }, + { + "epoch": 0.9375131495897328, + "grad_norm": 0.09667155891656876, + "learning_rate": 1e-05, + "loss": 2.2412, + "step": 557 + }, + { + "epoch": 0.9391962970755312, + "grad_norm": 0.09381328523159027, + "learning_rate": 1e-05, + "loss": 2.1632, + "step": 558 + }, + { + "epoch": 0.9408794445613297, + "grad_norm": 0.10293637216091156, + "learning_rate": 1e-05, + "loss": 2.2969, + "step": 559 + }, + { + "epoch": 0.9425625920471281, + "grad_norm": 0.08901844918727875, + "learning_rate": 1e-05, + "loss": 2.2806, + "step": 560 + }, + { + "epoch": 0.9442457395329266, + "grad_norm": 0.09931071847677231, + "learning_rate": 1e-05, + "loss": 2.2671, + "step": 561 + }, + { + "epoch": 0.945928887018725, + "grad_norm": 0.08619210124015808, + "learning_rate": 1e-05, + "loss": 2.428, + "step": 562 + }, + { + "epoch": 0.9476120345045235, + "grad_norm": 0.08460855484008789, + "learning_rate": 1e-05, + "loss": 2.2412, + "step": 563 + }, + { + "epoch": 0.9492951819903219, + "grad_norm": 0.09682973474264145, + "learning_rate": 1e-05, + "loss": 2.3339, + "step": 564 + }, + { + "epoch": 0.9509783294761204, + "grad_norm": 0.10189709812402725, + "learning_rate": 1e-05, + "loss": 2.2268, + "step": 565 + }, + { + "epoch": 0.9526614769619188, + "grad_norm": 0.10271991789340973, + "learning_rate": 1e-05, + "loss": 2.1819, + "step": 566 + }, + { + "epoch": 0.9543446244477173, + "grad_norm": 0.0901963859796524, + "learning_rate": 1e-05, + "loss": 2.3029, + "step": 567 + }, + { + "epoch": 0.9560277719335156, + "grad_norm": 0.09148905426263809, + "learning_rate": 1e-05, + "loss": 2.3362, + "step": 568 + }, + { + "epoch": 0.9577109194193141, + "grad_norm": 0.10434332489967346, + "learning_rate": 1e-05, + "loss": 2.3037, + "step": 569 + }, + { + "epoch": 0.9593940669051125, + "grad_norm": 0.0956675261259079, + "learning_rate": 1e-05, + "loss": 2.3442, + "step": 570 + }, + { + "epoch": 0.961077214390911, + "grad_norm": 0.09394146502017975, + "learning_rate": 1e-05, + "loss": 2.2913, + "step": 571 + }, + { + "epoch": 0.9627603618767094, + "grad_norm": 0.09179794043302536, + "learning_rate": 1e-05, + "loss": 2.21, + "step": 572 + }, + { + "epoch": 0.9644435093625079, + "grad_norm": 0.09866604208946228, + "learning_rate": 1e-05, + "loss": 2.2721, + "step": 573 + }, + { + "epoch": 0.9661266568483063, + "grad_norm": 0.10069537162780762, + "learning_rate": 1e-05, + "loss": 2.1637, + "step": 574 + }, + { + "epoch": 0.9678098043341048, + "grad_norm": 0.0923682376742363, + "learning_rate": 1e-05, + "loss": 2.2343, + "step": 575 + }, + { + "epoch": 0.9694929518199032, + "grad_norm": 0.08836492151021957, + "learning_rate": 1e-05, + "loss": 2.3794, + "step": 576 + }, + { + "epoch": 0.9711760993057017, + "grad_norm": 0.0894513726234436, + "learning_rate": 1e-05, + "loss": 2.2378, + "step": 577 + }, + { + "epoch": 0.9728592467915002, + "grad_norm": 0.08647426962852478, + "learning_rate": 1e-05, + "loss": 2.3589, + "step": 578 + }, + { + "epoch": 0.9745423942772985, + "grad_norm": 0.11035202443599701, + "learning_rate": 1e-05, + "loss": 2.2371, + "step": 579 + }, + { + "epoch": 0.976225541763097, + "grad_norm": 0.09551876783370972, + "learning_rate": 1e-05, + "loss": 2.3353, + "step": 580 + }, + { + "epoch": 0.9779086892488954, + "grad_norm": 0.0911082923412323, + "learning_rate": 1e-05, + "loss": 2.3264, + "step": 581 + }, + { + "epoch": 0.9795918367346939, + "grad_norm": 0.10280529409646988, + "learning_rate": 1e-05, + "loss": 2.2351, + "step": 582 + }, + { + "epoch": 0.9812749842204923, + "grad_norm": 0.09424940496683121, + "learning_rate": 1e-05, + "loss": 2.3464, + "step": 583 + }, + { + "epoch": 0.9829581317062908, + "grad_norm": 0.092115618288517, + "learning_rate": 1e-05, + "loss": 2.2799, + "step": 584 + }, + { + "epoch": 0.9846412791920892, + "grad_norm": 0.09771659225225449, + "learning_rate": 1e-05, + "loss": 2.3777, + "step": 585 + }, + { + "epoch": 0.9863244266778877, + "grad_norm": 0.09877105802297592, + "learning_rate": 1e-05, + "loss": 2.3613, + "step": 586 + }, + { + "epoch": 0.9880075741636861, + "grad_norm": 0.09816967695951462, + "learning_rate": 1e-05, + "loss": 2.2925, + "step": 587 + }, + { + "epoch": 0.9896907216494846, + "grad_norm": 0.0874725803732872, + "learning_rate": 1e-05, + "loss": 2.3154, + "step": 588 + }, + { + "epoch": 0.991373869135283, + "grad_norm": 0.09336823225021362, + "learning_rate": 1e-05, + "loss": 2.3933, + "step": 589 + }, + { + "epoch": 0.9930570166210814, + "grad_norm": 0.10439187288284302, + "learning_rate": 1e-05, + "loss": 2.3655, + "step": 590 + }, + { + "epoch": 0.9947401641068798, + "grad_norm": 0.09005751460790634, + "learning_rate": 1e-05, + "loss": 2.2971, + "step": 591 + }, + { + "epoch": 0.9964233115926783, + "grad_norm": 0.10612068325281143, + "learning_rate": 1e-05, + "loss": 2.3584, + "step": 592 + }, + { + "epoch": 0.9981064590784767, + "grad_norm": 0.09101177752017975, + "learning_rate": 1e-05, + "loss": 2.4402, + "step": 593 + }, + { + "epoch": 0.9997896065642752, + "grad_norm": 0.09874800592660904, + "learning_rate": 1e-05, + "loss": 2.326, + "step": 594 + }, + { + "epoch": 1.0014727540500736, + "grad_norm": 0.1025647521018982, + "learning_rate": 1e-05, + "loss": 2.4041, + "step": 595 + }, + { + "epoch": 1.003155901535872, + "grad_norm": 0.11109832674264908, + "learning_rate": 1e-05, + "loss": 2.2881, + "step": 596 + }, + { + "epoch": 1.0048390490216705, + "grad_norm": 0.09670565277338028, + "learning_rate": 1e-05, + "loss": 2.2003, + "step": 597 + }, + { + "epoch": 1.0065221965074689, + "grad_norm": 0.09513822942972183, + "learning_rate": 1e-05, + "loss": 2.3225, + "step": 598 + }, + { + "epoch": 1.0082053439932674, + "grad_norm": 0.11121483892202377, + "learning_rate": 1e-05, + "loss": 2.4143, + "step": 599 + }, + { + "epoch": 1.0098884914790658, + "grad_norm": 0.09941378980875015, + "learning_rate": 1e-05, + "loss": 2.333, + "step": 600 + }, + { + "epoch": 1.0115716389648644, + "grad_norm": 0.09730757772922516, + "learning_rate": 1e-05, + "loss": 2.3638, + "step": 601 + }, + { + "epoch": 1.0132547864506627, + "grad_norm": 0.10626422613859177, + "learning_rate": 1e-05, + "loss": 2.2303, + "step": 602 + }, + { + "epoch": 1.0149379339364613, + "grad_norm": 0.0958971306681633, + "learning_rate": 1e-05, + "loss": 2.3906, + "step": 603 + }, + { + "epoch": 1.0166210814222596, + "grad_norm": 0.10065159201622009, + "learning_rate": 1e-05, + "loss": 2.3425, + "step": 604 + }, + { + "epoch": 1.0183042289080582, + "grad_norm": 0.08671624213457108, + "learning_rate": 1e-05, + "loss": 2.2742, + "step": 605 + }, + { + "epoch": 1.0199873763938565, + "grad_norm": 0.09528376907110214, + "learning_rate": 1e-05, + "loss": 2.3765, + "step": 606 + }, + { + "epoch": 1.0216705238796548, + "grad_norm": 0.09153752028942108, + "learning_rate": 1e-05, + "loss": 2.2983, + "step": 607 + }, + { + "epoch": 1.0233536713654534, + "grad_norm": 0.10145740956068039, + "learning_rate": 1e-05, + "loss": 2.1774, + "step": 608 + }, + { + "epoch": 1.0250368188512518, + "grad_norm": 0.09908965229988098, + "learning_rate": 1e-05, + "loss": 2.3479, + "step": 609 + }, + { + "epoch": 1.0267199663370503, + "grad_norm": 0.09253786504268646, + "learning_rate": 1e-05, + "loss": 2.3228, + "step": 610 + }, + { + "epoch": 1.0284031138228487, + "grad_norm": 0.094690702855587, + "learning_rate": 1e-05, + "loss": 2.2864, + "step": 611 + }, + { + "epoch": 1.0300862613086472, + "grad_norm": 0.09160283952951431, + "learning_rate": 1e-05, + "loss": 2.4285, + "step": 612 + }, + { + "epoch": 1.0317694087944456, + "grad_norm": 0.10157333314418793, + "learning_rate": 1e-05, + "loss": 2.1316, + "step": 613 + }, + { + "epoch": 1.0334525562802441, + "grad_norm": 0.10498999804258347, + "learning_rate": 1e-05, + "loss": 2.373, + "step": 614 + }, + { + "epoch": 1.0351357037660425, + "grad_norm": 0.09599211066961288, + "learning_rate": 1e-05, + "loss": 2.3511, + "step": 615 + }, + { + "epoch": 1.036818851251841, + "grad_norm": 0.1121436059474945, + "learning_rate": 1e-05, + "loss": 2.127, + "step": 616 + }, + { + "epoch": 1.0385019987376394, + "grad_norm": 0.10269173234701157, + "learning_rate": 1e-05, + "loss": 2.2659, + "step": 617 + }, + { + "epoch": 1.040185146223438, + "grad_norm": 0.0945139229297638, + "learning_rate": 1e-05, + "loss": 2.3281, + "step": 618 + }, + { + "epoch": 1.0418682937092363, + "grad_norm": 0.09318878501653671, + "learning_rate": 1e-05, + "loss": 2.3247, + "step": 619 + }, + { + "epoch": 1.0435514411950346, + "grad_norm": 0.10471779108047485, + "learning_rate": 1e-05, + "loss": 2.3098, + "step": 620 + }, + { + "epoch": 1.0452345886808332, + "grad_norm": 0.10514305531978607, + "learning_rate": 1e-05, + "loss": 2.3647, + "step": 621 + }, + { + "epoch": 1.0469177361666315, + "grad_norm": 0.09875541925430298, + "learning_rate": 1e-05, + "loss": 2.4204, + "step": 622 + }, + { + "epoch": 1.04860088365243, + "grad_norm": 0.10112539678812027, + "learning_rate": 1e-05, + "loss": 2.3269, + "step": 623 + }, + { + "epoch": 1.0502840311382284, + "grad_norm": 0.09719318896532059, + "learning_rate": 1e-05, + "loss": 2.3223, + "step": 624 + }, + { + "epoch": 1.051967178624027, + "grad_norm": 0.09615301340818405, + "learning_rate": 1e-05, + "loss": 2.2798, + "step": 625 + }, + { + "epoch": 1.0536503261098253, + "grad_norm": 0.09600812941789627, + "learning_rate": 1e-05, + "loss": 2.2738, + "step": 626 + }, + { + "epoch": 1.055333473595624, + "grad_norm": 0.09326303005218506, + "learning_rate": 1e-05, + "loss": 2.23, + "step": 627 + }, + { + "epoch": 1.0570166210814222, + "grad_norm": 0.09689430892467499, + "learning_rate": 1e-05, + "loss": 2.2582, + "step": 628 + }, + { + "epoch": 1.0586997685672208, + "grad_norm": 0.10389314591884613, + "learning_rate": 1e-05, + "loss": 2.3733, + "step": 629 + }, + { + "epoch": 1.0603829160530192, + "grad_norm": 0.09320785105228424, + "learning_rate": 1e-05, + "loss": 2.3315, + "step": 630 + }, + { + "epoch": 1.0620660635388175, + "grad_norm": 0.10638166218996048, + "learning_rate": 1e-05, + "loss": 2.4058, + "step": 631 + }, + { + "epoch": 1.063749211024616, + "grad_norm": 0.09525519609451294, + "learning_rate": 1e-05, + "loss": 2.2803, + "step": 632 + }, + { + "epoch": 1.0654323585104144, + "grad_norm": 0.09904535114765167, + "learning_rate": 1e-05, + "loss": 2.3613, + "step": 633 + }, + { + "epoch": 1.067115505996213, + "grad_norm": 0.10914106667041779, + "learning_rate": 1e-05, + "loss": 2.3955, + "step": 634 + }, + { + "epoch": 1.0687986534820113, + "grad_norm": 0.10424593091011047, + "learning_rate": 1e-05, + "loss": 2.2332, + "step": 635 + }, + { + "epoch": 1.0704818009678099, + "grad_norm": 0.10360780358314514, + "learning_rate": 1e-05, + "loss": 2.3127, + "step": 636 + }, + { + "epoch": 1.0721649484536082, + "grad_norm": 0.11223631352186203, + "learning_rate": 1e-05, + "loss": 2.201, + "step": 637 + }, + { + "epoch": 1.0738480959394068, + "grad_norm": 0.09491337090730667, + "learning_rate": 1e-05, + "loss": 2.3129, + "step": 638 + }, + { + "epoch": 1.0755312434252051, + "grad_norm": 0.09244826436042786, + "learning_rate": 1e-05, + "loss": 2.3728, + "step": 639 + }, + { + "epoch": 1.0772143909110037, + "grad_norm": 0.0922231450676918, + "learning_rate": 1e-05, + "loss": 2.3225, + "step": 640 + }, + { + "epoch": 1.078897538396802, + "grad_norm": 0.10818596929311752, + "learning_rate": 1e-05, + "loss": 2.3104, + "step": 641 + }, + { + "epoch": 1.0805806858826004, + "grad_norm": 0.09497258812189102, + "learning_rate": 1e-05, + "loss": 2.3176, + "step": 642 + }, + { + "epoch": 1.082263833368399, + "grad_norm": 0.10034379363059998, + "learning_rate": 1e-05, + "loss": 2.3943, + "step": 643 + }, + { + "epoch": 1.0839469808541973, + "grad_norm": 0.10024038702249527, + "learning_rate": 1e-05, + "loss": 2.3127, + "step": 644 + }, + { + "epoch": 1.0856301283399958, + "grad_norm": 0.10074039548635483, + "learning_rate": 1e-05, + "loss": 2.2351, + "step": 645 + }, + { + "epoch": 1.0873132758257942, + "grad_norm": 0.09631813317537308, + "learning_rate": 1e-05, + "loss": 2.3101, + "step": 646 + }, + { + "epoch": 1.0889964233115927, + "grad_norm": 0.10632781684398651, + "learning_rate": 1e-05, + "loss": 2.3669, + "step": 647 + }, + { + "epoch": 1.090679570797391, + "grad_norm": 0.10795175284147263, + "learning_rate": 1e-05, + "loss": 2.3064, + "step": 648 + }, + { + "epoch": 1.0923627182831896, + "grad_norm": 0.11120691895484924, + "learning_rate": 1e-05, + "loss": 2.2911, + "step": 649 + }, + { + "epoch": 1.094045865768988, + "grad_norm": 0.10034749656915665, + "learning_rate": 1e-05, + "loss": 2.3696, + "step": 650 + }, + { + "epoch": 1.0957290132547866, + "grad_norm": 0.10955310612916946, + "learning_rate": 1e-05, + "loss": 2.3464, + "step": 651 + }, + { + "epoch": 1.097412160740585, + "grad_norm": 0.09739572554826736, + "learning_rate": 1e-05, + "loss": 2.325, + "step": 652 + }, + { + "epoch": 1.0990953082263832, + "grad_norm": 0.10152111947536469, + "learning_rate": 1e-05, + "loss": 2.3745, + "step": 653 + }, + { + "epoch": 1.1007784557121818, + "grad_norm": 0.10103686153888702, + "learning_rate": 1e-05, + "loss": 2.3303, + "step": 654 + }, + { + "epoch": 1.1024616031979801, + "grad_norm": 0.1003558412194252, + "learning_rate": 1e-05, + "loss": 2.312, + "step": 655 + }, + { + "epoch": 1.1041447506837787, + "grad_norm": 0.10518987476825714, + "learning_rate": 1e-05, + "loss": 2.3444, + "step": 656 + }, + { + "epoch": 1.105827898169577, + "grad_norm": 0.09896016865968704, + "learning_rate": 1e-05, + "loss": 2.2532, + "step": 657 + }, + { + "epoch": 1.1075110456553756, + "grad_norm": 0.09725090116262436, + "learning_rate": 1e-05, + "loss": 2.3625, + "step": 658 + }, + { + "epoch": 1.109194193141174, + "grad_norm": 0.09022284299135208, + "learning_rate": 1e-05, + "loss": 2.3743, + "step": 659 + }, + { + "epoch": 1.1108773406269725, + "grad_norm": 0.10471490770578384, + "learning_rate": 1e-05, + "loss": 2.3416, + "step": 660 + }, + { + "epoch": 1.1125604881127709, + "grad_norm": 0.10991263389587402, + "learning_rate": 1e-05, + "loss": 2.3214, + "step": 661 + }, + { + "epoch": 1.1142436355985694, + "grad_norm": 0.10231148451566696, + "learning_rate": 1e-05, + "loss": 2.2832, + "step": 662 + }, + { + "epoch": 1.1159267830843678, + "grad_norm": 0.09433937072753906, + "learning_rate": 1e-05, + "loss": 2.2645, + "step": 663 + }, + { + "epoch": 1.117609930570166, + "grad_norm": 0.13238666951656342, + "learning_rate": 1e-05, + "loss": 2.2483, + "step": 664 + }, + { + "epoch": 1.1192930780559647, + "grad_norm": 0.10956214368343353, + "learning_rate": 1e-05, + "loss": 2.2321, + "step": 665 + }, + { + "epoch": 1.120976225541763, + "grad_norm": 0.11065597832202911, + "learning_rate": 1e-05, + "loss": 2.1869, + "step": 666 + }, + { + "epoch": 1.1226593730275616, + "grad_norm": 0.10971678793430328, + "learning_rate": 1e-05, + "loss": 2.1855, + "step": 667 + }, + { + "epoch": 1.12434252051336, + "grad_norm": 0.11080143600702286, + "learning_rate": 1e-05, + "loss": 2.198, + "step": 668 + }, + { + "epoch": 1.1260256679991585, + "grad_norm": 0.10381001979112625, + "learning_rate": 1e-05, + "loss": 2.3384, + "step": 669 + }, + { + "epoch": 1.1277088154849568, + "grad_norm": 0.1026921421289444, + "learning_rate": 1e-05, + "loss": 2.2458, + "step": 670 + }, + { + "epoch": 1.1293919629707554, + "grad_norm": 0.10585295408964157, + "learning_rate": 1e-05, + "loss": 2.1859, + "step": 671 + }, + { + "epoch": 1.1310751104565537, + "grad_norm": 0.10650487244129181, + "learning_rate": 1e-05, + "loss": 2.2662, + "step": 672 + }, + { + "epoch": 1.1327582579423523, + "grad_norm": 0.10717649012804031, + "learning_rate": 1e-05, + "loss": 2.3088, + "step": 673 + }, + { + "epoch": 1.1344414054281506, + "grad_norm": 0.10479724407196045, + "learning_rate": 1e-05, + "loss": 2.3042, + "step": 674 + }, + { + "epoch": 1.136124552913949, + "grad_norm": 0.10629065334796906, + "learning_rate": 1e-05, + "loss": 2.3481, + "step": 675 + }, + { + "epoch": 1.1378077003997475, + "grad_norm": 0.10375174880027771, + "learning_rate": 1e-05, + "loss": 2.3845, + "step": 676 + }, + { + "epoch": 1.1394908478855459, + "grad_norm": 0.10122872143983841, + "learning_rate": 1e-05, + "loss": 2.335, + "step": 677 + }, + { + "epoch": 1.1411739953713445, + "grad_norm": 0.09846247732639313, + "learning_rate": 1e-05, + "loss": 2.4028, + "step": 678 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.11501342058181763, + "learning_rate": 1e-05, + "loss": 2.2419, + "step": 679 + }, + { + "epoch": 1.1445402903429414, + "grad_norm": 0.11248493194580078, + "learning_rate": 1e-05, + "loss": 2.1294, + "step": 680 + }, + { + "epoch": 1.1462234378287397, + "grad_norm": 0.1141652762889862, + "learning_rate": 1e-05, + "loss": 2.2842, + "step": 681 + }, + { + "epoch": 1.1479065853145383, + "grad_norm": 0.10232444107532501, + "learning_rate": 1e-05, + "loss": 2.1798, + "step": 682 + }, + { + "epoch": 1.1495897328003366, + "grad_norm": 0.10624698549509048, + "learning_rate": 1e-05, + "loss": 2.2474, + "step": 683 + }, + { + "epoch": 1.1512728802861352, + "grad_norm": 0.10583934187889099, + "learning_rate": 1e-05, + "loss": 2.2917, + "step": 684 + }, + { + "epoch": 1.1529560277719335, + "grad_norm": 0.10667344182729721, + "learning_rate": 1e-05, + "loss": 2.2581, + "step": 685 + }, + { + "epoch": 1.1546391752577319, + "grad_norm": 0.10415381193161011, + "learning_rate": 1e-05, + "loss": 2.3325, + "step": 686 + }, + { + "epoch": 1.1563223227435304, + "grad_norm": 0.109574094414711, + "learning_rate": 1e-05, + "loss": 2.3306, + "step": 687 + }, + { + "epoch": 1.1580054702293288, + "grad_norm": 0.10537154227495193, + "learning_rate": 1e-05, + "loss": 2.3396, + "step": 688 + }, + { + "epoch": 1.1596886177151273, + "grad_norm": 0.10670781880617142, + "learning_rate": 1e-05, + "loss": 2.2518, + "step": 689 + }, + { + "epoch": 1.1613717652009257, + "grad_norm": 0.10296822339296341, + "learning_rate": 1e-05, + "loss": 2.3911, + "step": 690 + }, + { + "epoch": 1.1630549126867242, + "grad_norm": 0.10323610156774521, + "learning_rate": 1e-05, + "loss": 2.415, + "step": 691 + }, + { + "epoch": 1.1647380601725226, + "grad_norm": 0.09952528029680252, + "learning_rate": 1e-05, + "loss": 2.3674, + "step": 692 + }, + { + "epoch": 1.1664212076583211, + "grad_norm": 0.10683920234441757, + "learning_rate": 1e-05, + "loss": 2.1606, + "step": 693 + }, + { + "epoch": 1.1681043551441195, + "grad_norm": 0.10594907402992249, + "learning_rate": 1e-05, + "loss": 2.3633, + "step": 694 + }, + { + "epoch": 1.169787502629918, + "grad_norm": 0.1164483055472374, + "learning_rate": 1e-05, + "loss": 2.272, + "step": 695 + }, + { + "epoch": 1.1714706501157164, + "grad_norm": 0.1053275316953659, + "learning_rate": 1e-05, + "loss": 2.3361, + "step": 696 + }, + { + "epoch": 1.1731537976015147, + "grad_norm": 0.11722961068153381, + "learning_rate": 1e-05, + "loss": 2.1008, + "step": 697 + }, + { + "epoch": 1.1748369450873133, + "grad_norm": 0.11388476192951202, + "learning_rate": 1e-05, + "loss": 2.3129, + "step": 698 + }, + { + "epoch": 1.1765200925731116, + "grad_norm": 0.1149948239326477, + "learning_rate": 1e-05, + "loss": 2.3503, + "step": 699 + }, + { + "epoch": 1.1782032400589102, + "grad_norm": 0.09305736422538757, + "learning_rate": 1e-05, + "loss": 2.3811, + "step": 700 + }, + { + "epoch": 1.1798863875447085, + "grad_norm": 0.1027708575129509, + "learning_rate": 1e-05, + "loss": 2.3262, + "step": 701 + }, + { + "epoch": 1.181569535030507, + "grad_norm": 0.1058826595544815, + "learning_rate": 1e-05, + "loss": 2.2576, + "step": 702 + }, + { + "epoch": 1.1832526825163054, + "grad_norm": 0.1003696396946907, + "learning_rate": 1e-05, + "loss": 2.2759, + "step": 703 + }, + { + "epoch": 1.184935830002104, + "grad_norm": 0.11113473027944565, + "learning_rate": 1e-05, + "loss": 2.4163, + "step": 704 + }, + { + "epoch": 1.1866189774879023, + "grad_norm": 0.10945228487253189, + "learning_rate": 1e-05, + "loss": 2.2725, + "step": 705 + }, + { + "epoch": 1.188302124973701, + "grad_norm": 0.1079326868057251, + "learning_rate": 1e-05, + "loss": 2.3048, + "step": 706 + }, + { + "epoch": 1.1899852724594993, + "grad_norm": 0.10752802342176437, + "learning_rate": 1e-05, + "loss": 2.2145, + "step": 707 + }, + { + "epoch": 1.1916684199452976, + "grad_norm": 0.10588284581899643, + "learning_rate": 1e-05, + "loss": 2.3025, + "step": 708 + }, + { + "epoch": 1.1933515674310962, + "grad_norm": 0.1051083654165268, + "learning_rate": 1e-05, + "loss": 2.3198, + "step": 709 + }, + { + "epoch": 1.1950347149168945, + "grad_norm": 0.11915988475084305, + "learning_rate": 1e-05, + "loss": 2.2456, + "step": 710 + }, + { + "epoch": 1.196717862402693, + "grad_norm": 0.10947719216346741, + "learning_rate": 1e-05, + "loss": 2.3479, + "step": 711 + }, + { + "epoch": 1.1984010098884914, + "grad_norm": 0.11522776633501053, + "learning_rate": 1e-05, + "loss": 2.2898, + "step": 712 + }, + { + "epoch": 1.20008415737429, + "grad_norm": 0.10741020739078522, + "learning_rate": 1e-05, + "loss": 2.3198, + "step": 713 + }, + { + "epoch": 1.2017673048600883, + "grad_norm": 0.10589215159416199, + "learning_rate": 1e-05, + "loss": 2.2812, + "step": 714 + }, + { + "epoch": 1.2034504523458869, + "grad_norm": 0.10151232033967972, + "learning_rate": 1e-05, + "loss": 2.429, + "step": 715 + }, + { + "epoch": 1.2051335998316852, + "grad_norm": 0.11951622366905212, + "learning_rate": 1e-05, + "loss": 2.1932, + "step": 716 + }, + { + "epoch": 1.2068167473174838, + "grad_norm": 0.11722715198993683, + "learning_rate": 1e-05, + "loss": 2.2356, + "step": 717 + }, + { + "epoch": 1.2084998948032821, + "grad_norm": 0.11441315710544586, + "learning_rate": 1e-05, + "loss": 2.2891, + "step": 718 + }, + { + "epoch": 1.2101830422890805, + "grad_norm": 0.10936987400054932, + "learning_rate": 1e-05, + "loss": 2.2843, + "step": 719 + }, + { + "epoch": 1.211866189774879, + "grad_norm": 0.12374020367860794, + "learning_rate": 1e-05, + "loss": 2.2944, + "step": 720 + }, + { + "epoch": 1.2135493372606774, + "grad_norm": 0.11024117469787598, + "learning_rate": 1e-05, + "loss": 2.2595, + "step": 721 + }, + { + "epoch": 1.215232484746476, + "grad_norm": 0.09707245975732803, + "learning_rate": 1e-05, + "loss": 2.3867, + "step": 722 + }, + { + "epoch": 1.2169156322322743, + "grad_norm": 0.11022404581308365, + "learning_rate": 1e-05, + "loss": 2.375, + "step": 723 + }, + { + "epoch": 1.2185987797180728, + "grad_norm": 0.10732002556324005, + "learning_rate": 1e-05, + "loss": 2.3674, + "step": 724 + }, + { + "epoch": 1.2202819272038712, + "grad_norm": 0.11548677086830139, + "learning_rate": 1e-05, + "loss": 2.3284, + "step": 725 + }, + { + "epoch": 1.2219650746896698, + "grad_norm": 0.10313412547111511, + "learning_rate": 1e-05, + "loss": 2.4128, + "step": 726 + }, + { + "epoch": 1.223648222175468, + "grad_norm": 0.12717945873737335, + "learning_rate": 1e-05, + "loss": 2.2847, + "step": 727 + }, + { + "epoch": 1.2253313696612667, + "grad_norm": 0.11565182358026505, + "learning_rate": 1e-05, + "loss": 2.2695, + "step": 728 + }, + { + "epoch": 1.227014517147065, + "grad_norm": 0.10489466041326523, + "learning_rate": 1e-05, + "loss": 2.3394, + "step": 729 + }, + { + "epoch": 1.2286976646328633, + "grad_norm": 0.11056289076805115, + "learning_rate": 1e-05, + "loss": 2.4165, + "step": 730 + }, + { + "epoch": 1.230380812118662, + "grad_norm": 0.12048956751823425, + "learning_rate": 1e-05, + "loss": 2.2289, + "step": 731 + }, + { + "epoch": 1.2320639596044602, + "grad_norm": 0.10263136774301529, + "learning_rate": 1e-05, + "loss": 2.3306, + "step": 732 + }, + { + "epoch": 1.2337471070902588, + "grad_norm": 0.11179950088262558, + "learning_rate": 1e-05, + "loss": 2.3481, + "step": 733 + }, + { + "epoch": 1.2354302545760572, + "grad_norm": 0.10484311729669571, + "learning_rate": 1e-05, + "loss": 2.2703, + "step": 734 + }, + { + "epoch": 1.2371134020618557, + "grad_norm": 0.1182483434677124, + "learning_rate": 1e-05, + "loss": 2.2328, + "step": 735 + }, + { + "epoch": 1.238796549547654, + "grad_norm": 0.11377429217100143, + "learning_rate": 1e-05, + "loss": 2.3657, + "step": 736 + }, + { + "epoch": 1.2404796970334526, + "grad_norm": 0.11151503771543503, + "learning_rate": 1e-05, + "loss": 2.3542, + "step": 737 + }, + { + "epoch": 1.242162844519251, + "grad_norm": 0.12628555297851562, + "learning_rate": 1e-05, + "loss": 2.2634, + "step": 738 + }, + { + "epoch": 1.2438459920050495, + "grad_norm": 0.10311713814735413, + "learning_rate": 1e-05, + "loss": 2.2717, + "step": 739 + }, + { + "epoch": 1.2455291394908479, + "grad_norm": 0.12768767774105072, + "learning_rate": 1e-05, + "loss": 2.1725, + "step": 740 + }, + { + "epoch": 1.2472122869766462, + "grad_norm": 0.12390502542257309, + "learning_rate": 1e-05, + "loss": 2.1708, + "step": 741 + }, + { + "epoch": 1.2488954344624448, + "grad_norm": 0.10566207021474838, + "learning_rate": 1e-05, + "loss": 2.3469, + "step": 742 + }, + { + "epoch": 1.2505785819482433, + "grad_norm": 0.10176009684801102, + "learning_rate": 1e-05, + "loss": 2.3159, + "step": 743 + }, + { + "epoch": 1.2522617294340417, + "grad_norm": 0.10881732404232025, + "learning_rate": 1e-05, + "loss": 2.2966, + "step": 744 + }, + { + "epoch": 1.25394487691984, + "grad_norm": 0.11917608976364136, + "learning_rate": 1e-05, + "loss": 2.395, + "step": 745 + }, + { + "epoch": 1.2556280244056386, + "grad_norm": 0.09600858390331268, + "learning_rate": 1e-05, + "loss": 2.3479, + "step": 746 + }, + { + "epoch": 1.257311171891437, + "grad_norm": 0.11550504714250565, + "learning_rate": 1e-05, + "loss": 2.301, + "step": 747 + }, + { + "epoch": 1.2589943193772355, + "grad_norm": 0.10588584840297699, + "learning_rate": 1e-05, + "loss": 2.4163, + "step": 748 + }, + { + "epoch": 1.2606774668630338, + "grad_norm": 0.10998673737049103, + "learning_rate": 1e-05, + "loss": 2.3379, + "step": 749 + }, + { + "epoch": 1.2623606143488324, + "grad_norm": 0.10513128340244293, + "learning_rate": 1e-05, + "loss": 2.3795, + "step": 750 + }, + { + "epoch": 1.2640437618346307, + "grad_norm": 0.11185754835605621, + "learning_rate": 1e-05, + "loss": 2.2583, + "step": 751 + }, + { + "epoch": 1.265726909320429, + "grad_norm": 0.10794227570295334, + "learning_rate": 1e-05, + "loss": 2.285, + "step": 752 + }, + { + "epoch": 1.2674100568062276, + "grad_norm": 0.12522459030151367, + "learning_rate": 1e-05, + "loss": 2.2292, + "step": 753 + }, + { + "epoch": 1.2690932042920262, + "grad_norm": 0.11628364026546478, + "learning_rate": 1e-05, + "loss": 2.3342, + "step": 754 + }, + { + "epoch": 1.2707763517778246, + "grad_norm": 0.12842795252799988, + "learning_rate": 1e-05, + "loss": 2.1455, + "step": 755 + }, + { + "epoch": 1.272459499263623, + "grad_norm": 0.11268262565135956, + "learning_rate": 1e-05, + "loss": 2.2241, + "step": 756 + }, + { + "epoch": 1.2741426467494215, + "grad_norm": 0.11674508452415466, + "learning_rate": 1e-05, + "loss": 2.2677, + "step": 757 + }, + { + "epoch": 1.2758257942352198, + "grad_norm": 0.11475373059511185, + "learning_rate": 1e-05, + "loss": 2.4075, + "step": 758 + }, + { + "epoch": 1.2775089417210184, + "grad_norm": 0.11378497630357742, + "learning_rate": 1e-05, + "loss": 2.3032, + "step": 759 + }, + { + "epoch": 1.2791920892068167, + "grad_norm": 0.10426255315542221, + "learning_rate": 1e-05, + "loss": 2.2488, + "step": 760 + }, + { + "epoch": 1.2808752366926153, + "grad_norm": 0.11820263415575027, + "learning_rate": 1e-05, + "loss": 2.197, + "step": 761 + }, + { + "epoch": 1.2825583841784136, + "grad_norm": 0.10741489380598068, + "learning_rate": 1e-05, + "loss": 2.2811, + "step": 762 + }, + { + "epoch": 1.284241531664212, + "grad_norm": 0.115534208714962, + "learning_rate": 1e-05, + "loss": 2.3105, + "step": 763 + }, + { + "epoch": 1.2859246791500105, + "grad_norm": 0.1159248948097229, + "learning_rate": 1e-05, + "loss": 2.2963, + "step": 764 + }, + { + "epoch": 1.287607826635809, + "grad_norm": 0.11940732598304749, + "learning_rate": 1e-05, + "loss": 2.3274, + "step": 765 + }, + { + "epoch": 1.2892909741216074, + "grad_norm": 0.11882008612155914, + "learning_rate": 1e-05, + "loss": 2.2405, + "step": 766 + }, + { + "epoch": 1.2909741216074058, + "grad_norm": 0.10939499735832214, + "learning_rate": 1e-05, + "loss": 2.3008, + "step": 767 + }, + { + "epoch": 1.2926572690932043, + "grad_norm": 0.11414020508527756, + "learning_rate": 1e-05, + "loss": 2.3164, + "step": 768 + }, + { + "epoch": 1.2943404165790027, + "grad_norm": 0.11446741968393326, + "learning_rate": 1e-05, + "loss": 2.2524, + "step": 769 + }, + { + "epoch": 1.2960235640648012, + "grad_norm": 0.12233757227659225, + "learning_rate": 1e-05, + "loss": 2.3997, + "step": 770 + }, + { + "epoch": 1.2977067115505996, + "grad_norm": 0.11746780574321747, + "learning_rate": 1e-05, + "loss": 2.2241, + "step": 771 + }, + { + "epoch": 1.2993898590363981, + "grad_norm": 0.12653754651546478, + "learning_rate": 1e-05, + "loss": 2.2181, + "step": 772 + }, + { + "epoch": 1.3010730065221965, + "grad_norm": 0.11092430353164673, + "learning_rate": 1e-05, + "loss": 2.194, + "step": 773 + }, + { + "epoch": 1.3027561540079948, + "grad_norm": 0.11273445188999176, + "learning_rate": 1e-05, + "loss": 2.2821, + "step": 774 + }, + { + "epoch": 1.3044393014937934, + "grad_norm": 0.10755831003189087, + "learning_rate": 1e-05, + "loss": 2.3381, + "step": 775 + }, + { + "epoch": 1.306122448979592, + "grad_norm": 0.10324183851480484, + "learning_rate": 1e-05, + "loss": 2.4531, + "step": 776 + }, + { + "epoch": 1.3078055964653903, + "grad_norm": 0.1238187626004219, + "learning_rate": 1e-05, + "loss": 2.2378, + "step": 777 + }, + { + "epoch": 1.3094887439511886, + "grad_norm": 0.10919329524040222, + "learning_rate": 1e-05, + "loss": 2.3157, + "step": 778 + }, + { + "epoch": 1.3111718914369872, + "grad_norm": 0.11661651730537415, + "learning_rate": 1e-05, + "loss": 2.3889, + "step": 779 + }, + { + "epoch": 1.3128550389227855, + "grad_norm": 0.11324804276227951, + "learning_rate": 1e-05, + "loss": 2.366, + "step": 780 + }, + { + "epoch": 1.314538186408584, + "grad_norm": 0.11539211124181747, + "learning_rate": 1e-05, + "loss": 2.2661, + "step": 781 + }, + { + "epoch": 1.3162213338943825, + "grad_norm": 0.12013803422451019, + "learning_rate": 1e-05, + "loss": 2.2388, + "step": 782 + }, + { + "epoch": 1.317904481380181, + "grad_norm": 0.1297876238822937, + "learning_rate": 1e-05, + "loss": 2.338, + "step": 783 + }, + { + "epoch": 1.3195876288659794, + "grad_norm": 0.11792443692684174, + "learning_rate": 1e-05, + "loss": 2.3162, + "step": 784 + }, + { + "epoch": 1.3212707763517777, + "grad_norm": 0.11543410271406174, + "learning_rate": 1e-05, + "loss": 2.325, + "step": 785 + }, + { + "epoch": 1.3229539238375763, + "grad_norm": 0.11507069319486618, + "learning_rate": 1e-05, + "loss": 2.3389, + "step": 786 + }, + { + "epoch": 1.3246370713233748, + "grad_norm": 0.11883421987295151, + "learning_rate": 1e-05, + "loss": 2.3784, + "step": 787 + }, + { + "epoch": 1.3263202188091732, + "grad_norm": 0.11997753381729126, + "learning_rate": 1e-05, + "loss": 2.2183, + "step": 788 + }, + { + "epoch": 1.3280033662949715, + "grad_norm": 0.12312667816877365, + "learning_rate": 1e-05, + "loss": 2.2661, + "step": 789 + }, + { + "epoch": 1.32968651378077, + "grad_norm": 0.1280994415283203, + "learning_rate": 1e-05, + "loss": 2.235, + "step": 790 + }, + { + "epoch": 1.3313696612665684, + "grad_norm": 0.12460897862911224, + "learning_rate": 1e-05, + "loss": 2.2775, + "step": 791 + }, + { + "epoch": 1.333052808752367, + "grad_norm": 0.11441405862569809, + "learning_rate": 1e-05, + "loss": 2.2642, + "step": 792 + }, + { + "epoch": 1.3347359562381653, + "grad_norm": 0.1078685000538826, + "learning_rate": 1e-05, + "loss": 2.3174, + "step": 793 + }, + { + "epoch": 1.3364191037239639, + "grad_norm": 0.11945922672748566, + "learning_rate": 1e-05, + "loss": 2.3101, + "step": 794 + }, + { + "epoch": 1.3381022512097622, + "grad_norm": 0.11506087332963943, + "learning_rate": 1e-05, + "loss": 2.3167, + "step": 795 + }, + { + "epoch": 1.3397853986955606, + "grad_norm": 0.12365138530731201, + "learning_rate": 1e-05, + "loss": 2.3044, + "step": 796 + }, + { + "epoch": 1.3414685461813591, + "grad_norm": 0.12331211566925049, + "learning_rate": 1e-05, + "loss": 2.2058, + "step": 797 + }, + { + "epoch": 1.3431516936671577, + "grad_norm": 0.12298640608787537, + "learning_rate": 1e-05, + "loss": 2.21, + "step": 798 + }, + { + "epoch": 1.344834841152956, + "grad_norm": 0.12047012150287628, + "learning_rate": 1e-05, + "loss": 2.2781, + "step": 799 + }, + { + "epoch": 1.3465179886387544, + "grad_norm": 0.12428031861782074, + "learning_rate": 1e-05, + "loss": 2.3032, + "step": 800 + }, + { + "epoch": 1.348201136124553, + "grad_norm": 0.1128249540925026, + "learning_rate": 1e-05, + "loss": 2.3135, + "step": 801 + }, + { + "epoch": 1.3498842836103513, + "grad_norm": 0.12616464495658875, + "learning_rate": 1e-05, + "loss": 2.1487, + "step": 802 + }, + { + "epoch": 1.3515674310961499, + "grad_norm": 0.11388704925775528, + "learning_rate": 1e-05, + "loss": 2.2346, + "step": 803 + }, + { + "epoch": 1.3532505785819482, + "grad_norm": 0.10213828831911087, + "learning_rate": 1e-05, + "loss": 2.2859, + "step": 804 + }, + { + "epoch": 1.3549337260677468, + "grad_norm": 0.1226121038198471, + "learning_rate": 1e-05, + "loss": 2.2183, + "step": 805 + }, + { + "epoch": 1.356616873553545, + "grad_norm": 0.11445735394954681, + "learning_rate": 1e-05, + "loss": 2.3784, + "step": 806 + }, + { + "epoch": 1.3583000210393434, + "grad_norm": 0.11648505181074142, + "learning_rate": 1e-05, + "loss": 2.3442, + "step": 807 + }, + { + "epoch": 1.359983168525142, + "grad_norm": 0.1296563744544983, + "learning_rate": 1e-05, + "loss": 2.2469, + "step": 808 + }, + { + "epoch": 1.3616663160109406, + "grad_norm": 0.12322400510311127, + "learning_rate": 1e-05, + "loss": 2.2915, + "step": 809 + }, + { + "epoch": 1.363349463496739, + "grad_norm": 0.11419309675693512, + "learning_rate": 1e-05, + "loss": 2.3024, + "step": 810 + }, + { + "epoch": 1.3650326109825373, + "grad_norm": 0.12253374606370926, + "learning_rate": 1e-05, + "loss": 2.2969, + "step": 811 + }, + { + "epoch": 1.3667157584683358, + "grad_norm": 0.1254422962665558, + "learning_rate": 1e-05, + "loss": 2.364, + "step": 812 + }, + { + "epoch": 1.3683989059541342, + "grad_norm": 0.12984994053840637, + "learning_rate": 1e-05, + "loss": 2.2936, + "step": 813 + }, + { + "epoch": 1.3700820534399327, + "grad_norm": 0.1182006224989891, + "learning_rate": 1e-05, + "loss": 2.2673, + "step": 814 + }, + { + "epoch": 1.371765200925731, + "grad_norm": 0.12920832633972168, + "learning_rate": 1e-05, + "loss": 2.1582, + "step": 815 + }, + { + "epoch": 1.3734483484115296, + "grad_norm": 0.1216689869761467, + "learning_rate": 1e-05, + "loss": 2.3479, + "step": 816 + }, + { + "epoch": 1.375131495897328, + "grad_norm": 0.12459319084882736, + "learning_rate": 1e-05, + "loss": 2.1868, + "step": 817 + }, + { + "epoch": 1.3768146433831263, + "grad_norm": 0.11144936084747314, + "learning_rate": 1e-05, + "loss": 2.3663, + "step": 818 + }, + { + "epoch": 1.3784977908689249, + "grad_norm": 0.1110294982790947, + "learning_rate": 1e-05, + "loss": 2.3164, + "step": 819 + }, + { + "epoch": 1.3801809383547234, + "grad_norm": 0.11903022974729538, + "learning_rate": 1e-05, + "loss": 2.2589, + "step": 820 + }, + { + "epoch": 1.3818640858405218, + "grad_norm": 0.10610275715589523, + "learning_rate": 1e-05, + "loss": 2.4153, + "step": 821 + }, + { + "epoch": 1.3835472333263201, + "grad_norm": 0.11972808837890625, + "learning_rate": 1e-05, + "loss": 2.3901, + "step": 822 + }, + { + "epoch": 1.3852303808121187, + "grad_norm": 0.10772975534200668, + "learning_rate": 1e-05, + "loss": 2.3555, + "step": 823 + }, + { + "epoch": 1.386913528297917, + "grad_norm": 0.11757270246744156, + "learning_rate": 1e-05, + "loss": 2.2677, + "step": 824 + }, + { + "epoch": 1.3885966757837156, + "grad_norm": 0.1217508539557457, + "learning_rate": 1e-05, + "loss": 2.2267, + "step": 825 + }, + { + "epoch": 1.390279823269514, + "grad_norm": 0.10996967554092407, + "learning_rate": 1e-05, + "loss": 2.3965, + "step": 826 + }, + { + "epoch": 1.3919629707553125, + "grad_norm": 0.13068005442619324, + "learning_rate": 1e-05, + "loss": 2.1991, + "step": 827 + }, + { + "epoch": 1.3936461182411108, + "grad_norm": 0.12149260193109512, + "learning_rate": 1e-05, + "loss": 2.2775, + "step": 828 + }, + { + "epoch": 1.3953292657269092, + "grad_norm": 0.1100870743393898, + "learning_rate": 1e-05, + "loss": 2.2571, + "step": 829 + }, + { + "epoch": 1.3970124132127077, + "grad_norm": 0.10005280375480652, + "learning_rate": 1e-05, + "loss": 2.2808, + "step": 830 + }, + { + "epoch": 1.3986955606985063, + "grad_norm": 0.11633820086717606, + "learning_rate": 1e-05, + "loss": 2.3215, + "step": 831 + }, + { + "epoch": 1.4003787081843047, + "grad_norm": 0.11901983618736267, + "learning_rate": 1e-05, + "loss": 2.4236, + "step": 832 + }, + { + "epoch": 1.402061855670103, + "grad_norm": 0.11173246055841446, + "learning_rate": 1e-05, + "loss": 2.3457, + "step": 833 + }, + { + "epoch": 1.4037450031559016, + "grad_norm": 0.10333243012428284, + "learning_rate": 1e-05, + "loss": 2.2659, + "step": 834 + }, + { + "epoch": 1.4054281506417, + "grad_norm": 0.13903972506523132, + "learning_rate": 1e-05, + "loss": 2.1946, + "step": 835 + }, + { + "epoch": 1.4071112981274985, + "grad_norm": 0.11832322925329208, + "learning_rate": 1e-05, + "loss": 2.3223, + "step": 836 + }, + { + "epoch": 1.4087944456132968, + "grad_norm": 0.10906493663787842, + "learning_rate": 1e-05, + "loss": 2.4316, + "step": 837 + }, + { + "epoch": 1.4104775930990954, + "grad_norm": 0.10980133712291718, + "learning_rate": 1e-05, + "loss": 2.3525, + "step": 838 + }, + { + "epoch": 1.4121607405848937, + "grad_norm": 0.12958386540412903, + "learning_rate": 1e-05, + "loss": 2.3081, + "step": 839 + }, + { + "epoch": 1.413843888070692, + "grad_norm": 0.1342059075832367, + "learning_rate": 1e-05, + "loss": 2.3564, + "step": 840 + }, + { + "epoch": 1.4155270355564906, + "grad_norm": 0.1362716406583786, + "learning_rate": 1e-05, + "loss": 2.2435, + "step": 841 + }, + { + "epoch": 1.4172101830422892, + "grad_norm": 0.10814797878265381, + "learning_rate": 1e-05, + "loss": 2.3373, + "step": 842 + }, + { + "epoch": 1.4188933305280875, + "grad_norm": 0.111182801425457, + "learning_rate": 1e-05, + "loss": 2.2921, + "step": 843 + }, + { + "epoch": 1.4205764780138859, + "grad_norm": 0.11161399632692337, + "learning_rate": 1e-05, + "loss": 2.3816, + "step": 844 + }, + { + "epoch": 1.4222596254996844, + "grad_norm": 0.1261526495218277, + "learning_rate": 1e-05, + "loss": 2.4082, + "step": 845 + }, + { + "epoch": 1.4239427729854828, + "grad_norm": 0.10805182158946991, + "learning_rate": 1e-05, + "loss": 2.3622, + "step": 846 + }, + { + "epoch": 1.4256259204712813, + "grad_norm": 0.12294517457485199, + "learning_rate": 1e-05, + "loss": 2.3638, + "step": 847 + }, + { + "epoch": 1.4273090679570797, + "grad_norm": 0.10903607308864594, + "learning_rate": 1e-05, + "loss": 2.3484, + "step": 848 + }, + { + "epoch": 1.4289922154428782, + "grad_norm": 0.12460491806268692, + "learning_rate": 1e-05, + "loss": 2.2046, + "step": 849 + }, + { + "epoch": 1.4306753629286766, + "grad_norm": 0.13793089985847473, + "learning_rate": 1e-05, + "loss": 2.2437, + "step": 850 + }, + { + "epoch": 1.4323585104144752, + "grad_norm": 0.11700379103422165, + "learning_rate": 1e-05, + "loss": 2.2288, + "step": 851 + }, + { + "epoch": 1.4340416579002735, + "grad_norm": 0.11343109607696533, + "learning_rate": 1e-05, + "loss": 2.2501, + "step": 852 + }, + { + "epoch": 1.435724805386072, + "grad_norm": 0.10918331891298294, + "learning_rate": 1e-05, + "loss": 2.47, + "step": 853 + }, + { + "epoch": 1.4374079528718704, + "grad_norm": 0.12782573699951172, + "learning_rate": 1e-05, + "loss": 2.2281, + "step": 854 + }, + { + "epoch": 1.4390911003576687, + "grad_norm": 0.12039442360401154, + "learning_rate": 1e-05, + "loss": 2.2766, + "step": 855 + }, + { + "epoch": 1.4407742478434673, + "grad_norm": 0.13949096202850342, + "learning_rate": 1e-05, + "loss": 2.198, + "step": 856 + }, + { + "epoch": 1.4424573953292656, + "grad_norm": 0.13327306509017944, + "learning_rate": 1e-05, + "loss": 2.2253, + "step": 857 + }, + { + "epoch": 1.4441405428150642, + "grad_norm": 0.1229238212108612, + "learning_rate": 1e-05, + "loss": 2.3147, + "step": 858 + }, + { + "epoch": 1.4458236903008626, + "grad_norm": 0.13407859206199646, + "learning_rate": 1e-05, + "loss": 2.2532, + "step": 859 + }, + { + "epoch": 1.4475068377866611, + "grad_norm": 0.1280384659767151, + "learning_rate": 1e-05, + "loss": 2.3174, + "step": 860 + }, + { + "epoch": 1.4491899852724595, + "grad_norm": 0.1532362997531891, + "learning_rate": 1e-05, + "loss": 2.1671, + "step": 861 + }, + { + "epoch": 1.450873132758258, + "grad_norm": 0.1134854182600975, + "learning_rate": 1e-05, + "loss": 2.3607, + "step": 862 + }, + { + "epoch": 1.4525562802440564, + "grad_norm": 0.11682198196649551, + "learning_rate": 1e-05, + "loss": 2.4041, + "step": 863 + }, + { + "epoch": 1.454239427729855, + "grad_norm": 0.11356412619352341, + "learning_rate": 1e-05, + "loss": 2.2756, + "step": 864 + }, + { + "epoch": 1.4559225752156533, + "grad_norm": 0.11278104037046432, + "learning_rate": 1e-05, + "loss": 2.2983, + "step": 865 + }, + { + "epoch": 1.4576057227014516, + "grad_norm": 0.13442599773406982, + "learning_rate": 1e-05, + "loss": 2.2593, + "step": 866 + }, + { + "epoch": 1.4592888701872502, + "grad_norm": 0.1254800707101822, + "learning_rate": 1e-05, + "loss": 2.3213, + "step": 867 + }, + { + "epoch": 1.4609720176730487, + "grad_norm": 0.12374315410852432, + "learning_rate": 1e-05, + "loss": 2.4221, + "step": 868 + }, + { + "epoch": 1.462655165158847, + "grad_norm": 0.13577024638652802, + "learning_rate": 1e-05, + "loss": 2.2473, + "step": 869 + }, + { + "epoch": 1.4643383126446454, + "grad_norm": 0.12822799384593964, + "learning_rate": 1e-05, + "loss": 2.3057, + "step": 870 + }, + { + "epoch": 1.466021460130444, + "grad_norm": 0.1283286213874817, + "learning_rate": 1e-05, + "loss": 2.374, + "step": 871 + }, + { + "epoch": 1.4677046076162423, + "grad_norm": 0.12054271996021271, + "learning_rate": 1e-05, + "loss": 2.3369, + "step": 872 + }, + { + "epoch": 1.469387755102041, + "grad_norm": 0.127189502120018, + "learning_rate": 1e-05, + "loss": 2.3167, + "step": 873 + }, + { + "epoch": 1.4710709025878392, + "grad_norm": 0.12767814099788666, + "learning_rate": 1e-05, + "loss": 2.2695, + "step": 874 + }, + { + "epoch": 1.4727540500736378, + "grad_norm": 0.12026406079530716, + "learning_rate": 1e-05, + "loss": 2.3313, + "step": 875 + }, + { + "epoch": 1.4744371975594361, + "grad_norm": 0.13317981362342834, + "learning_rate": 1e-05, + "loss": 2.209, + "step": 876 + }, + { + "epoch": 1.4761203450452345, + "grad_norm": 0.12904947996139526, + "learning_rate": 1e-05, + "loss": 2.2344, + "step": 877 + }, + { + "epoch": 1.477803492531033, + "grad_norm": 0.13126946985721588, + "learning_rate": 1e-05, + "loss": 2.2888, + "step": 878 + }, + { + "epoch": 1.4794866400168316, + "grad_norm": 0.128869891166687, + "learning_rate": 1e-05, + "loss": 2.1996, + "step": 879 + }, + { + "epoch": 1.48116978750263, + "grad_norm": 0.1279861181974411, + "learning_rate": 1e-05, + "loss": 2.1873, + "step": 880 + }, + { + "epoch": 1.4828529349884283, + "grad_norm": 0.11732237040996552, + "learning_rate": 1e-05, + "loss": 2.3259, + "step": 881 + }, + { + "epoch": 1.4845360824742269, + "grad_norm": 0.1279248595237732, + "learning_rate": 1e-05, + "loss": 2.386, + "step": 882 + }, + { + "epoch": 1.4862192299600252, + "grad_norm": 0.13578535616397858, + "learning_rate": 1e-05, + "loss": 2.2937, + "step": 883 + }, + { + "epoch": 1.4879023774458238, + "grad_norm": 0.13534606993198395, + "learning_rate": 1e-05, + "loss": 2.239, + "step": 884 + }, + { + "epoch": 1.489585524931622, + "grad_norm": 0.12359879165887833, + "learning_rate": 1e-05, + "loss": 2.3572, + "step": 885 + }, + { + "epoch": 1.4912686724174207, + "grad_norm": 0.1236250028014183, + "learning_rate": 1e-05, + "loss": 2.188, + "step": 886 + }, + { + "epoch": 1.492951819903219, + "grad_norm": 0.12695659697055817, + "learning_rate": 1e-05, + "loss": 2.2637, + "step": 887 + }, + { + "epoch": 1.4946349673890174, + "grad_norm": 0.1281343400478363, + "learning_rate": 1e-05, + "loss": 2.2961, + "step": 888 + }, + { + "epoch": 1.496318114874816, + "grad_norm": 0.12446150928735733, + "learning_rate": 1e-05, + "loss": 2.3362, + "step": 889 + }, + { + "epoch": 1.4980012623606145, + "grad_norm": 0.12564988434314728, + "learning_rate": 1e-05, + "loss": 2.288, + "step": 890 + }, + { + "epoch": 1.4996844098464128, + "grad_norm": 0.14049400389194489, + "learning_rate": 1e-05, + "loss": 2.2867, + "step": 891 + }, + { + "epoch": 1.5013675573322112, + "grad_norm": 0.12252961844205856, + "learning_rate": 1e-05, + "loss": 2.3511, + "step": 892 + }, + { + "epoch": 1.5030507048180097, + "grad_norm": 0.15993735194206238, + "learning_rate": 1e-05, + "loss": 2.0931, + "step": 893 + }, + { + "epoch": 1.504733852303808, + "grad_norm": 0.13673749566078186, + "learning_rate": 1e-05, + "loss": 2.2998, + "step": 894 + }, + { + "epoch": 1.5064169997896064, + "grad_norm": 0.11770147830247879, + "learning_rate": 1e-05, + "loss": 2.2883, + "step": 895 + }, + { + "epoch": 1.508100147275405, + "grad_norm": 0.11792504787445068, + "learning_rate": 1e-05, + "loss": 2.1893, + "step": 896 + }, + { + "epoch": 1.5097832947612035, + "grad_norm": 0.1405222862958908, + "learning_rate": 1e-05, + "loss": 2.2645, + "step": 897 + }, + { + "epoch": 1.5114664422470019, + "grad_norm": 0.1401311457157135, + "learning_rate": 1e-05, + "loss": 2.2085, + "step": 898 + }, + { + "epoch": 1.5131495897328002, + "grad_norm": 0.14068666100502014, + "learning_rate": 1e-05, + "loss": 2.2711, + "step": 899 + }, + { + "epoch": 1.5148327372185988, + "grad_norm": 0.12995976209640503, + "learning_rate": 1e-05, + "loss": 2.2883, + "step": 900 + }, + { + "epoch": 1.5165158847043974, + "grad_norm": 0.12454178184270859, + "learning_rate": 1e-05, + "loss": 2.2515, + "step": 901 + }, + { + "epoch": 1.5181990321901957, + "grad_norm": 0.12165191769599915, + "learning_rate": 1e-05, + "loss": 2.3621, + "step": 902 + }, + { + "epoch": 1.519882179675994, + "grad_norm": 0.1413601189851761, + "learning_rate": 1e-05, + "loss": 2.27, + "step": 903 + }, + { + "epoch": 1.5215653271617926, + "grad_norm": 0.13545894622802734, + "learning_rate": 1e-05, + "loss": 2.3008, + "step": 904 + }, + { + "epoch": 1.523248474647591, + "grad_norm": 0.12211872637271881, + "learning_rate": 1e-05, + "loss": 2.3921, + "step": 905 + }, + { + "epoch": 1.5249316221333893, + "grad_norm": 0.13053253293037415, + "learning_rate": 1e-05, + "loss": 2.2434, + "step": 906 + }, + { + "epoch": 1.5266147696191879, + "grad_norm": 0.12977124750614166, + "learning_rate": 1e-05, + "loss": 2.2366, + "step": 907 + }, + { + "epoch": 1.5282979171049864, + "grad_norm": 0.13451719284057617, + "learning_rate": 1e-05, + "loss": 2.3154, + "step": 908 + }, + { + "epoch": 1.5299810645907848, + "grad_norm": 0.11067184805870056, + "learning_rate": 1e-05, + "loss": 2.3296, + "step": 909 + }, + { + "epoch": 1.531664212076583, + "grad_norm": 0.12281223386526108, + "learning_rate": 1e-05, + "loss": 2.2479, + "step": 910 + }, + { + "epoch": 1.5333473595623817, + "grad_norm": 0.12240397185087204, + "learning_rate": 1e-05, + "loss": 2.3416, + "step": 911 + }, + { + "epoch": 1.5350305070481802, + "grad_norm": 0.14465166628360748, + "learning_rate": 1e-05, + "loss": 2.1801, + "step": 912 + }, + { + "epoch": 1.5367136545339786, + "grad_norm": 0.1263197958469391, + "learning_rate": 1e-05, + "loss": 2.2583, + "step": 913 + }, + { + "epoch": 1.538396802019777, + "grad_norm": 0.14653970301151276, + "learning_rate": 1e-05, + "loss": 2.2939, + "step": 914 + }, + { + "epoch": 1.5400799495055755, + "grad_norm": 0.1311267763376236, + "learning_rate": 1e-05, + "loss": 2.2517, + "step": 915 + }, + { + "epoch": 1.5417630969913738, + "grad_norm": 0.13173674046993256, + "learning_rate": 1e-05, + "loss": 2.309, + "step": 916 + }, + { + "epoch": 1.5434462444771722, + "grad_norm": 0.13140322268009186, + "learning_rate": 1e-05, + "loss": 2.1447, + "step": 917 + }, + { + "epoch": 1.5451293919629707, + "grad_norm": 0.12431302666664124, + "learning_rate": 1e-05, + "loss": 2.3315, + "step": 918 + }, + { + "epoch": 1.5468125394487693, + "grad_norm": 0.14358630776405334, + "learning_rate": 1e-05, + "loss": 2.2634, + "step": 919 + }, + { + "epoch": 1.5484956869345676, + "grad_norm": 0.1297353357076645, + "learning_rate": 1e-05, + "loss": 2.2489, + "step": 920 + }, + { + "epoch": 1.550178834420366, + "grad_norm": 0.12963449954986572, + "learning_rate": 1e-05, + "loss": 2.1533, + "step": 921 + }, + { + "epoch": 1.5518619819061645, + "grad_norm": 0.11558603495359421, + "learning_rate": 1e-05, + "loss": 2.2688, + "step": 922 + }, + { + "epoch": 1.553545129391963, + "grad_norm": 0.14222054183483124, + "learning_rate": 1e-05, + "loss": 2.2385, + "step": 923 + }, + { + "epoch": 1.5552282768777614, + "grad_norm": 0.1376868486404419, + "learning_rate": 1e-05, + "loss": 2.2051, + "step": 924 + }, + { + "epoch": 1.5569114243635598, + "grad_norm": 0.12993879616260529, + "learning_rate": 1e-05, + "loss": 2.3445, + "step": 925 + }, + { + "epoch": 1.5585945718493583, + "grad_norm": 0.14503213763237, + "learning_rate": 1e-05, + "loss": 2.215, + "step": 926 + }, + { + "epoch": 1.560277719335157, + "grad_norm": 0.1302722692489624, + "learning_rate": 1e-05, + "loss": 2.1945, + "step": 927 + }, + { + "epoch": 1.561960866820955, + "grad_norm": 0.13545845448970795, + "learning_rate": 1e-05, + "loss": 2.3059, + "step": 928 + }, + { + "epoch": 1.5636440143067536, + "grad_norm": 0.12279404699802399, + "learning_rate": 1e-05, + "loss": 2.3511, + "step": 929 + }, + { + "epoch": 1.5653271617925522, + "grad_norm": 0.13220550119876862, + "learning_rate": 1e-05, + "loss": 2.2837, + "step": 930 + }, + { + "epoch": 1.5670103092783505, + "grad_norm": 0.1407599151134491, + "learning_rate": 1e-05, + "loss": 2.2905, + "step": 931 + }, + { + "epoch": 1.5686934567641488, + "grad_norm": 0.12597431242465973, + "learning_rate": 1e-05, + "loss": 2.366, + "step": 932 + }, + { + "epoch": 1.5703766042499474, + "grad_norm": 0.12998835742473602, + "learning_rate": 1e-05, + "loss": 2.1067, + "step": 933 + }, + { + "epoch": 1.572059751735746, + "grad_norm": 0.14708921313285828, + "learning_rate": 1e-05, + "loss": 2.2687, + "step": 934 + }, + { + "epoch": 1.5737428992215443, + "grad_norm": 0.13333402574062347, + "learning_rate": 1e-05, + "loss": 2.3381, + "step": 935 + }, + { + "epoch": 1.5754260467073427, + "grad_norm": 0.14774633944034576, + "learning_rate": 1e-05, + "loss": 2.163, + "step": 936 + }, + { + "epoch": 1.5771091941931412, + "grad_norm": 0.1283462792634964, + "learning_rate": 1e-05, + "loss": 2.3892, + "step": 937 + }, + { + "epoch": 1.5787923416789398, + "grad_norm": 0.12011823058128357, + "learning_rate": 1e-05, + "loss": 2.2758, + "step": 938 + }, + { + "epoch": 1.580475489164738, + "grad_norm": 0.11618427187204361, + "learning_rate": 1e-05, + "loss": 2.2545, + "step": 939 + }, + { + "epoch": 1.5821586366505365, + "grad_norm": 0.12683863937854767, + "learning_rate": 1e-05, + "loss": 2.291, + "step": 940 + }, + { + "epoch": 1.583841784136335, + "grad_norm": 0.13158243894577026, + "learning_rate": 1e-05, + "loss": 2.3066, + "step": 941 + }, + { + "epoch": 1.5855249316221334, + "grad_norm": 0.13269281387329102, + "learning_rate": 1e-05, + "loss": 2.3442, + "step": 942 + }, + { + "epoch": 1.5872080791079317, + "grad_norm": 0.14047692716121674, + "learning_rate": 1e-05, + "loss": 2.3092, + "step": 943 + }, + { + "epoch": 1.5888912265937303, + "grad_norm": 0.1387140154838562, + "learning_rate": 1e-05, + "loss": 2.1482, + "step": 944 + }, + { + "epoch": 1.5905743740795288, + "grad_norm": 0.13907848298549652, + "learning_rate": 1e-05, + "loss": 2.3484, + "step": 945 + }, + { + "epoch": 1.5922575215653272, + "grad_norm": 0.13114407658576965, + "learning_rate": 1e-05, + "loss": 2.2195, + "step": 946 + }, + { + "epoch": 1.5939406690511255, + "grad_norm": 0.1368924379348755, + "learning_rate": 1e-05, + "loss": 2.322, + "step": 947 + }, + { + "epoch": 1.595623816536924, + "grad_norm": 0.141913041472435, + "learning_rate": 1e-05, + "loss": 2.2336, + "step": 948 + }, + { + "epoch": 1.5973069640227227, + "grad_norm": 0.13295848667621613, + "learning_rate": 1e-05, + "loss": 2.3081, + "step": 949 + }, + { + "epoch": 1.5989901115085208, + "grad_norm": 0.12306110560894012, + "learning_rate": 1e-05, + "loss": 2.3354, + "step": 950 + }, + { + "epoch": 1.6006732589943193, + "grad_norm": 0.12122649699449539, + "learning_rate": 1e-05, + "loss": 2.2839, + "step": 951 + }, + { + "epoch": 1.602356406480118, + "grad_norm": 0.13046576082706451, + "learning_rate": 1e-05, + "loss": 2.385, + "step": 952 + }, + { + "epoch": 1.6040395539659162, + "grad_norm": 0.1272476315498352, + "learning_rate": 1e-05, + "loss": 2.4153, + "step": 953 + }, + { + "epoch": 1.6057227014517146, + "grad_norm": 0.13073799014091492, + "learning_rate": 1e-05, + "loss": 2.2854, + "step": 954 + }, + { + "epoch": 1.6074058489375131, + "grad_norm": 0.12583526968955994, + "learning_rate": 1e-05, + "loss": 2.3318, + "step": 955 + }, + { + "epoch": 1.6090889964233117, + "grad_norm": 0.1474972665309906, + "learning_rate": 1e-05, + "loss": 2.2542, + "step": 956 + }, + { + "epoch": 1.61077214390911, + "grad_norm": 0.13445797562599182, + "learning_rate": 1e-05, + "loss": 2.3645, + "step": 957 + }, + { + "epoch": 1.6124552913949084, + "grad_norm": 0.13466110825538635, + "learning_rate": 1e-05, + "loss": 2.3394, + "step": 958 + }, + { + "epoch": 1.614138438880707, + "grad_norm": 0.13525816798210144, + "learning_rate": 1e-05, + "loss": 2.2471, + "step": 959 + }, + { + "epoch": 1.6158215863665055, + "grad_norm": 0.1377459019422531, + "learning_rate": 1e-05, + "loss": 2.2478, + "step": 960 + }, + { + "epoch": 1.6175047338523036, + "grad_norm": 0.1405583918094635, + "learning_rate": 1e-05, + "loss": 2.2146, + "step": 961 + }, + { + "epoch": 1.6191878813381022, + "grad_norm": 0.11743167042732239, + "learning_rate": 1e-05, + "loss": 2.3555, + "step": 962 + }, + { + "epoch": 1.6208710288239008, + "grad_norm": 0.13644517958164215, + "learning_rate": 1e-05, + "loss": 2.2155, + "step": 963 + }, + { + "epoch": 1.6225541763096991, + "grad_norm": 0.12609997391700745, + "learning_rate": 1e-05, + "loss": 2.2593, + "step": 964 + }, + { + "epoch": 1.6242373237954975, + "grad_norm": 0.13276560604572296, + "learning_rate": 1e-05, + "loss": 2.1737, + "step": 965 + }, + { + "epoch": 1.625920471281296, + "grad_norm": 0.13567714393138885, + "learning_rate": 1e-05, + "loss": 2.3336, + "step": 966 + }, + { + "epoch": 1.6276036187670946, + "grad_norm": 0.12559200823307037, + "learning_rate": 1e-05, + "loss": 2.3494, + "step": 967 + }, + { + "epoch": 1.629286766252893, + "grad_norm": 0.13090649247169495, + "learning_rate": 1e-05, + "loss": 2.1851, + "step": 968 + }, + { + "epoch": 1.6309699137386913, + "grad_norm": 0.15777987241744995, + "learning_rate": 1e-05, + "loss": 2.2205, + "step": 969 + }, + { + "epoch": 1.6326530612244898, + "grad_norm": 0.1433715522289276, + "learning_rate": 1e-05, + "loss": 2.2295, + "step": 970 + }, + { + "epoch": 1.6343362087102884, + "grad_norm": 0.1218508929014206, + "learning_rate": 1e-05, + "loss": 2.3762, + "step": 971 + }, + { + "epoch": 1.6360193561960865, + "grad_norm": 0.14540942013263702, + "learning_rate": 1e-05, + "loss": 2.2139, + "step": 972 + }, + { + "epoch": 1.637702503681885, + "grad_norm": 0.14829136431217194, + "learning_rate": 1e-05, + "loss": 2.2871, + "step": 973 + }, + { + "epoch": 1.6393856511676836, + "grad_norm": 0.12728969752788544, + "learning_rate": 1e-05, + "loss": 2.2917, + "step": 974 + }, + { + "epoch": 1.641068798653482, + "grad_norm": 0.1471221148967743, + "learning_rate": 1e-05, + "loss": 2.2012, + "step": 975 + }, + { + "epoch": 1.6427519461392803, + "grad_norm": 0.13320200145244598, + "learning_rate": 1e-05, + "loss": 2.2771, + "step": 976 + }, + { + "epoch": 1.644435093625079, + "grad_norm": 0.1363966464996338, + "learning_rate": 1e-05, + "loss": 2.3086, + "step": 977 + }, + { + "epoch": 1.6461182411108775, + "grad_norm": 0.13870568573474884, + "learning_rate": 1e-05, + "loss": 2.2898, + "step": 978 + }, + { + "epoch": 1.6478013885966758, + "grad_norm": 0.15152350068092346, + "learning_rate": 1e-05, + "loss": 2.2994, + "step": 979 + }, + { + "epoch": 1.6494845360824741, + "grad_norm": 0.13830937445163727, + "learning_rate": 1e-05, + "loss": 2.2108, + "step": 980 + }, + { + "epoch": 1.6511676835682727, + "grad_norm": 0.15544220805168152, + "learning_rate": 1e-05, + "loss": 2.4043, + "step": 981 + }, + { + "epoch": 1.6528508310540713, + "grad_norm": 0.13135483860969543, + "learning_rate": 1e-05, + "loss": 2.2373, + "step": 982 + }, + { + "epoch": 1.6545339785398696, + "grad_norm": 0.12355194985866547, + "learning_rate": 1e-05, + "loss": 2.4163, + "step": 983 + }, + { + "epoch": 1.656217126025668, + "grad_norm": 0.14110660552978516, + "learning_rate": 1e-05, + "loss": 2.2031, + "step": 984 + }, + { + "epoch": 1.6579002735114665, + "grad_norm": 0.13077346980571747, + "learning_rate": 1e-05, + "loss": 2.3601, + "step": 985 + }, + { + "epoch": 1.6595834209972649, + "grad_norm": 0.14212660491466522, + "learning_rate": 1e-05, + "loss": 2.197, + "step": 986 + }, + { + "epoch": 1.6612665684830632, + "grad_norm": 0.12336140871047974, + "learning_rate": 1e-05, + "loss": 2.4146, + "step": 987 + }, + { + "epoch": 1.6629497159688618, + "grad_norm": 0.15291054546833038, + "learning_rate": 1e-05, + "loss": 2.2764, + "step": 988 + }, + { + "epoch": 1.6646328634546603, + "grad_norm": 0.1272605061531067, + "learning_rate": 1e-05, + "loss": 2.2703, + "step": 989 + }, + { + "epoch": 1.6663160109404587, + "grad_norm": 0.13462689518928528, + "learning_rate": 1e-05, + "loss": 2.3188, + "step": 990 + }, + { + "epoch": 1.667999158426257, + "grad_norm": 0.13290910422801971, + "learning_rate": 1e-05, + "loss": 2.2172, + "step": 991 + }, + { + "epoch": 1.6696823059120556, + "grad_norm": 0.15105758607387543, + "learning_rate": 1e-05, + "loss": 2.2156, + "step": 992 + }, + { + "epoch": 1.6713654533978541, + "grad_norm": 0.13150456547737122, + "learning_rate": 1e-05, + "loss": 2.3362, + "step": 993 + }, + { + "epoch": 1.6730486008836525, + "grad_norm": 0.13139204680919647, + "learning_rate": 1e-05, + "loss": 2.3833, + "step": 994 + }, + { + "epoch": 1.6747317483694508, + "grad_norm": 0.14886420965194702, + "learning_rate": 1e-05, + "loss": 2.1893, + "step": 995 + }, + { + "epoch": 1.6764148958552494, + "grad_norm": 0.13227102160453796, + "learning_rate": 1e-05, + "loss": 2.4055, + "step": 996 + }, + { + "epoch": 1.6780980433410477, + "grad_norm": 0.12545333802700043, + "learning_rate": 1e-05, + "loss": 2.3311, + "step": 997 + }, + { + "epoch": 1.679781190826846, + "grad_norm": 0.13391169905662537, + "learning_rate": 1e-05, + "loss": 2.3022, + "step": 998 + }, + { + "epoch": 1.6814643383126446, + "grad_norm": 0.13013269007205963, + "learning_rate": 1e-05, + "loss": 2.2318, + "step": 999 + }, + { + "epoch": 1.6831474857984432, + "grad_norm": 0.1331031173467636, + "learning_rate": 1e-05, + "loss": 2.3022, + "step": 1000 + }, + { + "epoch": 1.6848306332842415, + "grad_norm": 0.14438873529434204, + "learning_rate": 1e-05, + "loss": 2.2388, + "step": 1001 + }, + { + "epoch": 1.6865137807700399, + "grad_norm": 0.1422380954027176, + "learning_rate": 1e-05, + "loss": 2.3145, + "step": 1002 + }, + { + "epoch": 1.6881969282558384, + "grad_norm": 0.13909044861793518, + "learning_rate": 1e-05, + "loss": 2.2249, + "step": 1003 + }, + { + "epoch": 1.689880075741637, + "grad_norm": 0.14147858321666718, + "learning_rate": 1e-05, + "loss": 2.3179, + "step": 1004 + }, + { + "epoch": 1.6915632232274354, + "grad_norm": 0.13203288614749908, + "learning_rate": 1e-05, + "loss": 2.1912, + "step": 1005 + }, + { + "epoch": 1.6932463707132337, + "grad_norm": 0.14461839199066162, + "learning_rate": 1e-05, + "loss": 2.1982, + "step": 1006 + }, + { + "epoch": 1.6949295181990323, + "grad_norm": 0.14539021253585815, + "learning_rate": 1e-05, + "loss": 2.2917, + "step": 1007 + }, + { + "epoch": 1.6966126656848306, + "grad_norm": 0.14774973690509796, + "learning_rate": 1e-05, + "loss": 2.2639, + "step": 1008 + }, + { + "epoch": 1.698295813170629, + "grad_norm": 0.14927157759666443, + "learning_rate": 1e-05, + "loss": 2.1956, + "step": 1009 + }, + { + "epoch": 1.6999789606564275, + "grad_norm": 0.1286613643169403, + "learning_rate": 1e-05, + "loss": 2.292, + "step": 1010 + }, + { + "epoch": 1.701662108142226, + "grad_norm": 0.12883049249649048, + "learning_rate": 1e-05, + "loss": 2.2573, + "step": 1011 + }, + { + "epoch": 1.7033452556280244, + "grad_norm": 0.14129754900932312, + "learning_rate": 1e-05, + "loss": 2.334, + "step": 1012 + }, + { + "epoch": 1.7050284031138228, + "grad_norm": 0.13216479122638702, + "learning_rate": 1e-05, + "loss": 2.2664, + "step": 1013 + }, + { + "epoch": 1.7067115505996213, + "grad_norm": 0.12611788511276245, + "learning_rate": 1e-05, + "loss": 2.3159, + "step": 1014 + }, + { + "epoch": 1.7083946980854199, + "grad_norm": 0.14012207090854645, + "learning_rate": 1e-05, + "loss": 2.4026, + "step": 1015 + }, + { + "epoch": 1.7100778455712182, + "grad_norm": 0.14449255168437958, + "learning_rate": 1e-05, + "loss": 2.3313, + "step": 1016 + }, + { + "epoch": 1.7117609930570166, + "grad_norm": 0.15093393623828888, + "learning_rate": 1e-05, + "loss": 2.2075, + "step": 1017 + }, + { + "epoch": 1.7134441405428151, + "grad_norm": 0.15169350802898407, + "learning_rate": 1e-05, + "loss": 2.1926, + "step": 1018 + }, + { + "epoch": 1.7151272880286135, + "grad_norm": 0.13613849878311157, + "learning_rate": 1e-05, + "loss": 2.3394, + "step": 1019 + }, + { + "epoch": 1.7168104355144118, + "grad_norm": 0.13525283336639404, + "learning_rate": 1e-05, + "loss": 2.2234, + "step": 1020 + }, + { + "epoch": 1.7184935830002104, + "grad_norm": 0.1529736965894699, + "learning_rate": 1e-05, + "loss": 2.1866, + "step": 1021 + }, + { + "epoch": 1.720176730486009, + "grad_norm": 0.13723863661289215, + "learning_rate": 1e-05, + "loss": 2.3027, + "step": 1022 + }, + { + "epoch": 1.7218598779718073, + "grad_norm": 0.16251115500926971, + "learning_rate": 1e-05, + "loss": 2.3428, + "step": 1023 + }, + { + "epoch": 1.7235430254576056, + "grad_norm": 0.1440790742635727, + "learning_rate": 1e-05, + "loss": 2.3298, + "step": 1024 + }, + { + "epoch": 1.7252261729434042, + "grad_norm": 0.13486018776893616, + "learning_rate": 1e-05, + "loss": 2.3826, + "step": 1025 + }, + { + "epoch": 1.7269093204292028, + "grad_norm": 0.15616028010845184, + "learning_rate": 1e-05, + "loss": 2.0817, + "step": 1026 + }, + { + "epoch": 1.728592467915001, + "grad_norm": 0.15306299924850464, + "learning_rate": 1e-05, + "loss": 2.2601, + "step": 1027 + }, + { + "epoch": 1.7302756154007994, + "grad_norm": 0.14421014487743378, + "learning_rate": 1e-05, + "loss": 2.1998, + "step": 1028 + }, + { + "epoch": 1.731958762886598, + "grad_norm": 0.14438478648662567, + "learning_rate": 1e-05, + "loss": 2.262, + "step": 1029 + }, + { + "epoch": 1.7336419103723963, + "grad_norm": 0.13325351476669312, + "learning_rate": 1e-05, + "loss": 2.2852, + "step": 1030 + }, + { + "epoch": 1.7353250578581947, + "grad_norm": 0.14232920110225677, + "learning_rate": 1e-05, + "loss": 2.3147, + "step": 1031 + }, + { + "epoch": 1.7370082053439933, + "grad_norm": 0.1394515186548233, + "learning_rate": 1e-05, + "loss": 2.2781, + "step": 1032 + }, + { + "epoch": 1.7386913528297918, + "grad_norm": 0.12838682532310486, + "learning_rate": 1e-05, + "loss": 2.2827, + "step": 1033 + }, + { + "epoch": 1.7403745003155902, + "grad_norm": 0.15612417459487915, + "learning_rate": 1e-05, + "loss": 2.3108, + "step": 1034 + }, + { + "epoch": 1.7420576478013885, + "grad_norm": 0.14740139245986938, + "learning_rate": 1e-05, + "loss": 2.2412, + "step": 1035 + }, + { + "epoch": 1.743740795287187, + "grad_norm": 0.1541980355978012, + "learning_rate": 1e-05, + "loss": 2.3156, + "step": 1036 + }, + { + "epoch": 1.7454239427729856, + "grad_norm": 0.14056488871574402, + "learning_rate": 1e-05, + "loss": 2.1829, + "step": 1037 + }, + { + "epoch": 1.747107090258784, + "grad_norm": 0.143393874168396, + "learning_rate": 1e-05, + "loss": 2.2717, + "step": 1038 + }, + { + "epoch": 1.7487902377445823, + "grad_norm": 0.14296631515026093, + "learning_rate": 1e-05, + "loss": 2.342, + "step": 1039 + }, + { + "epoch": 1.7504733852303809, + "grad_norm": 0.13753627240657806, + "learning_rate": 1e-05, + "loss": 2.324, + "step": 1040 + }, + { + "epoch": 1.7521565327161792, + "grad_norm": 0.13361461460590363, + "learning_rate": 1e-05, + "loss": 2.3549, + "step": 1041 + }, + { + "epoch": 1.7538396802019776, + "grad_norm": 0.16176526248455048, + "learning_rate": 1e-05, + "loss": 2.0996, + "step": 1042 + }, + { + "epoch": 1.7555228276877761, + "grad_norm": 0.14512574672698975, + "learning_rate": 1e-05, + "loss": 2.3289, + "step": 1043 + }, + { + "epoch": 1.7572059751735747, + "grad_norm": 0.14329467713832855, + "learning_rate": 1e-05, + "loss": 2.2429, + "step": 1044 + }, + { + "epoch": 1.758889122659373, + "grad_norm": 0.1415308713912964, + "learning_rate": 1e-05, + "loss": 2.2976, + "step": 1045 + }, + { + "epoch": 1.7605722701451714, + "grad_norm": 0.13017630577087402, + "learning_rate": 1e-05, + "loss": 2.3142, + "step": 1046 + }, + { + "epoch": 1.76225541763097, + "grad_norm": 0.14865103363990784, + "learning_rate": 1e-05, + "loss": 2.2659, + "step": 1047 + }, + { + "epoch": 1.7639385651167685, + "grad_norm": 0.13973674178123474, + "learning_rate": 1e-05, + "loss": 2.1975, + "step": 1048 + }, + { + "epoch": 1.7656217126025668, + "grad_norm": 0.12378077954053879, + "learning_rate": 1e-05, + "loss": 2.4469, + "step": 1049 + }, + { + "epoch": 1.7673048600883652, + "grad_norm": 0.13462629914283752, + "learning_rate": 1e-05, + "loss": 2.332, + "step": 1050 + }, + { + "epoch": 1.7689880075741637, + "grad_norm": 0.14375431835651398, + "learning_rate": 1e-05, + "loss": 2.2834, + "step": 1051 + }, + { + "epoch": 1.770671155059962, + "grad_norm": 0.1413864940404892, + "learning_rate": 1e-05, + "loss": 2.2769, + "step": 1052 + }, + { + "epoch": 1.7723543025457604, + "grad_norm": 0.15052342414855957, + "learning_rate": 1e-05, + "loss": 2.2522, + "step": 1053 + }, + { + "epoch": 1.774037450031559, + "grad_norm": 0.15616975724697113, + "learning_rate": 1e-05, + "loss": 2.1501, + "step": 1054 + }, + { + "epoch": 1.7757205975173576, + "grad_norm": 0.16257071495056152, + "learning_rate": 1e-05, + "loss": 2.1545, + "step": 1055 + }, + { + "epoch": 1.777403745003156, + "grad_norm": 0.13512100279331207, + "learning_rate": 1e-05, + "loss": 2.2218, + "step": 1056 + }, + { + "epoch": 1.7790868924889542, + "grad_norm": 0.1581428200006485, + "learning_rate": 1e-05, + "loss": 2.1865, + "step": 1057 + }, + { + "epoch": 1.7807700399747528, + "grad_norm": 0.13829343020915985, + "learning_rate": 1e-05, + "loss": 2.3337, + "step": 1058 + }, + { + "epoch": 1.7824531874605514, + "grad_norm": 0.16639141738414764, + "learning_rate": 1e-05, + "loss": 2.2325, + "step": 1059 + }, + { + "epoch": 1.7841363349463497, + "grad_norm": 0.1412006914615631, + "learning_rate": 1e-05, + "loss": 2.3389, + "step": 1060 + }, + { + "epoch": 1.785819482432148, + "grad_norm": 0.13130658864974976, + "learning_rate": 1e-05, + "loss": 2.3376, + "step": 1061 + }, + { + "epoch": 1.7875026299179466, + "grad_norm": 0.1495353728532791, + "learning_rate": 1e-05, + "loss": 2.2666, + "step": 1062 + }, + { + "epoch": 1.789185777403745, + "grad_norm": 0.15077506005764008, + "learning_rate": 1e-05, + "loss": 2.228, + "step": 1063 + }, + { + "epoch": 1.7908689248895433, + "grad_norm": 0.1426386535167694, + "learning_rate": 1e-05, + "loss": 2.2727, + "step": 1064 + }, + { + "epoch": 1.7925520723753419, + "grad_norm": 0.14268244802951813, + "learning_rate": 1e-05, + "loss": 2.3643, + "step": 1065 + }, + { + "epoch": 1.7942352198611404, + "grad_norm": 0.14923584461212158, + "learning_rate": 1e-05, + "loss": 2.333, + "step": 1066 + }, + { + "epoch": 1.7959183673469388, + "grad_norm": 0.15571311116218567, + "learning_rate": 1e-05, + "loss": 2.3171, + "step": 1067 + }, + { + "epoch": 1.7976015148327371, + "grad_norm": 0.13931907713413239, + "learning_rate": 1e-05, + "loss": 2.2164, + "step": 1068 + }, + { + "epoch": 1.7992846623185357, + "grad_norm": 0.1513443887233734, + "learning_rate": 1e-05, + "loss": 2.2885, + "step": 1069 + }, + { + "epoch": 1.8009678098043342, + "grad_norm": 0.14123128354549408, + "learning_rate": 1e-05, + "loss": 2.3517, + "step": 1070 + }, + { + "epoch": 1.8026509572901326, + "grad_norm": 0.16668306291103363, + "learning_rate": 1e-05, + "loss": 2.1907, + "step": 1071 + }, + { + "epoch": 1.804334104775931, + "grad_norm": 0.14049063622951508, + "learning_rate": 1e-05, + "loss": 2.4216, + "step": 1072 + }, + { + "epoch": 1.8060172522617295, + "grad_norm": 0.13806495070457458, + "learning_rate": 1e-05, + "loss": 2.3367, + "step": 1073 + }, + { + "epoch": 1.8077003997475278, + "grad_norm": 0.14562048017978668, + "learning_rate": 1e-05, + "loss": 2.2303, + "step": 1074 + }, + { + "epoch": 1.8093835472333262, + "grad_norm": 0.16803675889968872, + "learning_rate": 1e-05, + "loss": 2.2404, + "step": 1075 + }, + { + "epoch": 1.8110666947191247, + "grad_norm": 0.14971864223480225, + "learning_rate": 1e-05, + "loss": 2.1941, + "step": 1076 + }, + { + "epoch": 1.8127498422049233, + "grad_norm": 0.162116140127182, + "learning_rate": 1e-05, + "loss": 2.2034, + "step": 1077 + }, + { + "epoch": 1.8144329896907216, + "grad_norm": 0.1417408138513565, + "learning_rate": 1e-05, + "loss": 2.2991, + "step": 1078 + }, + { + "epoch": 1.81611613717652, + "grad_norm": 0.14334024488925934, + "learning_rate": 1e-05, + "loss": 2.3796, + "step": 1079 + }, + { + "epoch": 1.8177992846623185, + "grad_norm": 0.13600003719329834, + "learning_rate": 1e-05, + "loss": 2.2322, + "step": 1080 + }, + { + "epoch": 1.8194824321481171, + "grad_norm": 0.1557435244321823, + "learning_rate": 1e-05, + "loss": 2.2151, + "step": 1081 + }, + { + "epoch": 1.8211655796339155, + "grad_norm": 0.14444471895694733, + "learning_rate": 1e-05, + "loss": 2.2778, + "step": 1082 + }, + { + "epoch": 1.8228487271197138, + "grad_norm": 0.15237338840961456, + "learning_rate": 1e-05, + "loss": 2.1863, + "step": 1083 + }, + { + "epoch": 1.8245318746055124, + "grad_norm": 0.1488647758960724, + "learning_rate": 1e-05, + "loss": 2.1194, + "step": 1084 + }, + { + "epoch": 1.8262150220913107, + "grad_norm": 0.14532509446144104, + "learning_rate": 1e-05, + "loss": 2.3018, + "step": 1085 + }, + { + "epoch": 1.827898169577109, + "grad_norm": 0.1438300609588623, + "learning_rate": 1e-05, + "loss": 2.3542, + "step": 1086 + }, + { + "epoch": 1.8295813170629076, + "grad_norm": 0.13162897527217865, + "learning_rate": 1e-05, + "loss": 2.3762, + "step": 1087 + }, + { + "epoch": 1.8312644645487062, + "grad_norm": 0.14388734102249146, + "learning_rate": 1e-05, + "loss": 2.3097, + "step": 1088 + }, + { + "epoch": 1.8329476120345045, + "grad_norm": 0.1633898764848709, + "learning_rate": 1e-05, + "loss": 2.1975, + "step": 1089 + }, + { + "epoch": 1.8346307595203029, + "grad_norm": 0.14513400197029114, + "learning_rate": 1e-05, + "loss": 2.3562, + "step": 1090 + }, + { + "epoch": 1.8363139070061014, + "grad_norm": 0.1562061607837677, + "learning_rate": 1e-05, + "loss": 2.2384, + "step": 1091 + }, + { + "epoch": 1.8379970544919, + "grad_norm": 0.14833082258701324, + "learning_rate": 1e-05, + "loss": 2.199, + "step": 1092 + }, + { + "epoch": 1.8396802019776983, + "grad_norm": 0.14182843267917633, + "learning_rate": 1e-05, + "loss": 2.2632, + "step": 1093 + }, + { + "epoch": 1.8413633494634967, + "grad_norm": 0.16517210006713867, + "learning_rate": 1e-05, + "loss": 2.2719, + "step": 1094 + }, + { + "epoch": 1.8430464969492952, + "grad_norm": 0.1563366949558258, + "learning_rate": 1e-05, + "loss": 2.2285, + "step": 1095 + }, + { + "epoch": 1.8447296444350936, + "grad_norm": 0.1349581480026245, + "learning_rate": 1e-05, + "loss": 2.2998, + "step": 1096 + }, + { + "epoch": 1.846412791920892, + "grad_norm": 0.14647842943668365, + "learning_rate": 1e-05, + "loss": 2.2588, + "step": 1097 + }, + { + "epoch": 1.8480959394066905, + "grad_norm": 0.1527308076620102, + "learning_rate": 1e-05, + "loss": 2.1945, + "step": 1098 + }, + { + "epoch": 1.849779086892489, + "grad_norm": 0.16208425164222717, + "learning_rate": 1e-05, + "loss": 2.1692, + "step": 1099 + }, + { + "epoch": 1.8514622343782874, + "grad_norm": 0.15897248685359955, + "learning_rate": 1e-05, + "loss": 2.3582, + "step": 1100 + }, + { + "epoch": 1.8531453818640857, + "grad_norm": 0.14687612652778625, + "learning_rate": 1e-05, + "loss": 2.3057, + "step": 1101 + }, + { + "epoch": 1.8548285293498843, + "grad_norm": 0.1631488800048828, + "learning_rate": 1e-05, + "loss": 2.2521, + "step": 1102 + }, + { + "epoch": 1.8565116768356829, + "grad_norm": 0.14686156809329987, + "learning_rate": 1e-05, + "loss": 2.313, + "step": 1103 + }, + { + "epoch": 1.8581948243214812, + "grad_norm": 0.162966787815094, + "learning_rate": 1e-05, + "loss": 2.1968, + "step": 1104 + }, + { + "epoch": 1.8598779718072795, + "grad_norm": 0.15387648344039917, + "learning_rate": 1e-05, + "loss": 2.3059, + "step": 1105 + }, + { + "epoch": 1.861561119293078, + "grad_norm": 0.1489906907081604, + "learning_rate": 1e-05, + "loss": 2.2195, + "step": 1106 + }, + { + "epoch": 1.8632442667788764, + "grad_norm": 0.14351260662078857, + "learning_rate": 1e-05, + "loss": 2.2656, + "step": 1107 + }, + { + "epoch": 1.8649274142646748, + "grad_norm": 0.16010256111621857, + "learning_rate": 1e-05, + "loss": 2.3252, + "step": 1108 + }, + { + "epoch": 1.8666105617504734, + "grad_norm": 0.14475148916244507, + "learning_rate": 1e-05, + "loss": 2.2878, + "step": 1109 + }, + { + "epoch": 1.868293709236272, + "grad_norm": 0.14097367227077484, + "learning_rate": 1e-05, + "loss": 2.3716, + "step": 1110 + }, + { + "epoch": 1.8699768567220703, + "grad_norm": 0.15699978172779083, + "learning_rate": 1e-05, + "loss": 2.1678, + "step": 1111 + }, + { + "epoch": 1.8716600042078686, + "grad_norm": 0.1370065063238144, + "learning_rate": 1e-05, + "loss": 2.3315, + "step": 1112 + }, + { + "epoch": 1.8733431516936672, + "grad_norm": 0.1498231291770935, + "learning_rate": 1e-05, + "loss": 2.2949, + "step": 1113 + }, + { + "epoch": 1.8750262991794657, + "grad_norm": 0.13267523050308228, + "learning_rate": 1e-05, + "loss": 2.3535, + "step": 1114 + }, + { + "epoch": 1.876709446665264, + "grad_norm": 0.1453379988670349, + "learning_rate": 1e-05, + "loss": 2.2791, + "step": 1115 + }, + { + "epoch": 1.8783925941510624, + "grad_norm": 0.15499484539031982, + "learning_rate": 1e-05, + "loss": 2.2085, + "step": 1116 + }, + { + "epoch": 1.880075741636861, + "grad_norm": 0.14418251812458038, + "learning_rate": 1e-05, + "loss": 2.2793, + "step": 1117 + }, + { + "epoch": 1.8817588891226595, + "grad_norm": 0.13686548173427582, + "learning_rate": 1e-05, + "loss": 2.4175, + "step": 1118 + }, + { + "epoch": 1.8834420366084577, + "grad_norm": 0.17202888429164886, + "learning_rate": 1e-05, + "loss": 2.2196, + "step": 1119 + }, + { + "epoch": 1.8851251840942562, + "grad_norm": 0.1437048763036728, + "learning_rate": 1e-05, + "loss": 2.2688, + "step": 1120 + }, + { + "epoch": 1.8868083315800548, + "grad_norm": 0.13868288695812225, + "learning_rate": 1e-05, + "loss": 2.2971, + "step": 1121 + }, + { + "epoch": 1.8884914790658531, + "grad_norm": 0.133874773979187, + "learning_rate": 1e-05, + "loss": 2.3228, + "step": 1122 + }, + { + "epoch": 1.8901746265516515, + "grad_norm": 0.15967018902301788, + "learning_rate": 1e-05, + "loss": 2.2346, + "step": 1123 + }, + { + "epoch": 1.89185777403745, + "grad_norm": 0.15074019134044647, + "learning_rate": 1e-05, + "loss": 2.3577, + "step": 1124 + }, + { + "epoch": 1.8935409215232486, + "grad_norm": 0.13931475579738617, + "learning_rate": 1e-05, + "loss": 2.3789, + "step": 1125 + }, + { + "epoch": 1.895224069009047, + "grad_norm": 0.15354882180690765, + "learning_rate": 1e-05, + "loss": 2.184, + "step": 1126 + }, + { + "epoch": 1.8969072164948453, + "grad_norm": 0.15907764434814453, + "learning_rate": 1e-05, + "loss": 2.3638, + "step": 1127 + }, + { + "epoch": 1.8985903639806438, + "grad_norm": 0.13138049840927124, + "learning_rate": 1e-05, + "loss": 2.4543, + "step": 1128 + }, + { + "epoch": 1.9002735114664424, + "grad_norm": 0.14568856358528137, + "learning_rate": 1e-05, + "loss": 2.3064, + "step": 1129 + }, + { + "epoch": 1.9019566589522405, + "grad_norm": 0.1426182985305786, + "learning_rate": 1e-05, + "loss": 2.3223, + "step": 1130 + }, + { + "epoch": 1.903639806438039, + "grad_norm": 0.13313454389572144, + "learning_rate": 1e-05, + "loss": 2.3953, + "step": 1131 + }, + { + "epoch": 1.9053229539238377, + "grad_norm": 0.16987952589988708, + "learning_rate": 1e-05, + "loss": 2.1274, + "step": 1132 + }, + { + "epoch": 1.907006101409636, + "grad_norm": 0.1408863216638565, + "learning_rate": 1e-05, + "loss": 2.3242, + "step": 1133 + }, + { + "epoch": 1.9086892488954343, + "grad_norm": 0.14704225957393646, + "learning_rate": 1e-05, + "loss": 2.3687, + "step": 1134 + }, + { + "epoch": 1.910372396381233, + "grad_norm": 0.18410103023052216, + "learning_rate": 1e-05, + "loss": 2.1222, + "step": 1135 + }, + { + "epoch": 1.9120555438670315, + "grad_norm": 0.13889069855213165, + "learning_rate": 1e-05, + "loss": 2.3165, + "step": 1136 + }, + { + "epoch": 1.9137386913528298, + "grad_norm": 0.1532329022884369, + "learning_rate": 1e-05, + "loss": 2.2913, + "step": 1137 + }, + { + "epoch": 1.9154218388386282, + "grad_norm": 0.14806988835334778, + "learning_rate": 1e-05, + "loss": 2.2239, + "step": 1138 + }, + { + "epoch": 1.9171049863244267, + "grad_norm": 0.14964371919631958, + "learning_rate": 1e-05, + "loss": 2.2639, + "step": 1139 + }, + { + "epoch": 1.9187881338102253, + "grad_norm": 0.15137715637683868, + "learning_rate": 1e-05, + "loss": 2.3096, + "step": 1140 + }, + { + "epoch": 1.9204712812960234, + "grad_norm": 0.15892736613750458, + "learning_rate": 1e-05, + "loss": 2.3163, + "step": 1141 + }, + { + "epoch": 1.922154428781822, + "grad_norm": 0.15544387698173523, + "learning_rate": 1e-05, + "loss": 2.1825, + "step": 1142 + }, + { + "epoch": 1.9238375762676205, + "grad_norm": 0.14712852239608765, + "learning_rate": 1e-05, + "loss": 2.2659, + "step": 1143 + }, + { + "epoch": 1.9255207237534189, + "grad_norm": 0.1436305195093155, + "learning_rate": 1e-05, + "loss": 2.3101, + "step": 1144 + }, + { + "epoch": 1.9272038712392172, + "grad_norm": 0.16642406582832336, + "learning_rate": 1e-05, + "loss": 2.2156, + "step": 1145 + }, + { + "epoch": 1.9288870187250158, + "grad_norm": 0.16517338156700134, + "learning_rate": 1e-05, + "loss": 2.2561, + "step": 1146 + }, + { + "epoch": 1.9305701662108143, + "grad_norm": 0.1337500959634781, + "learning_rate": 1e-05, + "loss": 2.3818, + "step": 1147 + }, + { + "epoch": 1.9322533136966127, + "grad_norm": 0.15977586805820465, + "learning_rate": 1e-05, + "loss": 2.2377, + "step": 1148 + }, + { + "epoch": 1.933936461182411, + "grad_norm": 0.14951424300670624, + "learning_rate": 1e-05, + "loss": 2.2269, + "step": 1149 + }, + { + "epoch": 1.9356196086682096, + "grad_norm": 0.13450993597507477, + "learning_rate": 1e-05, + "loss": 2.3442, + "step": 1150 + }, + { + "epoch": 1.9373027561540082, + "grad_norm": 0.16469308733940125, + "learning_rate": 1e-05, + "loss": 2.3123, + "step": 1151 + }, + { + "epoch": 1.9389859036398063, + "grad_norm": 0.14135532081127167, + "learning_rate": 1e-05, + "loss": 2.387, + "step": 1152 + }, + { + "epoch": 1.9406690511256048, + "grad_norm": 0.13864876329898834, + "learning_rate": 1e-05, + "loss": 2.2661, + "step": 1153 + }, + { + "epoch": 1.9423521986114034, + "grad_norm": 0.16291983425617218, + "learning_rate": 1e-05, + "loss": 2.2617, + "step": 1154 + }, + { + "epoch": 1.9440353460972017, + "grad_norm": 0.13341820240020752, + "learning_rate": 1e-05, + "loss": 2.4299, + "step": 1155 + }, + { + "epoch": 1.945718493583, + "grad_norm": 0.15701517462730408, + "learning_rate": 1e-05, + "loss": 2.2211, + "step": 1156 + }, + { + "epoch": 1.9474016410687987, + "grad_norm": 0.16075365245342255, + "learning_rate": 1e-05, + "loss": 2.1801, + "step": 1157 + }, + { + "epoch": 1.9490847885545972, + "grad_norm": 0.15631234645843506, + "learning_rate": 1e-05, + "loss": 2.2152, + "step": 1158 + }, + { + "epoch": 1.9507679360403956, + "grad_norm": 0.16927126049995422, + "learning_rate": 1e-05, + "loss": 2.1776, + "step": 1159 + }, + { + "epoch": 1.952451083526194, + "grad_norm": 0.15192179381847382, + "learning_rate": 1e-05, + "loss": 2.2812, + "step": 1160 + }, + { + "epoch": 1.9541342310119925, + "grad_norm": 0.145833820104599, + "learning_rate": 1e-05, + "loss": 2.3124, + "step": 1161 + }, + { + "epoch": 1.955817378497791, + "grad_norm": 0.16952313482761383, + "learning_rate": 1e-05, + "loss": 2.1085, + "step": 1162 + }, + { + "epoch": 1.9575005259835891, + "grad_norm": 0.1629469394683838, + "learning_rate": 1e-05, + "loss": 2.2267, + "step": 1163 + }, + { + "epoch": 1.9591836734693877, + "grad_norm": 0.16672489047050476, + "learning_rate": 1e-05, + "loss": 2.3783, + "step": 1164 + }, + { + "epoch": 1.9608668209551863, + "grad_norm": 0.14810308814048767, + "learning_rate": 1e-05, + "loss": 2.3723, + "step": 1165 + }, + { + "epoch": 1.9625499684409846, + "grad_norm": 0.1435479074716568, + "learning_rate": 1e-05, + "loss": 2.2615, + "step": 1166 + }, + { + "epoch": 1.964233115926783, + "grad_norm": 0.149140864610672, + "learning_rate": 1e-05, + "loss": 2.2134, + "step": 1167 + }, + { + "epoch": 1.9659162634125815, + "grad_norm": 0.17785809934139252, + "learning_rate": 1e-05, + "loss": 2.1993, + "step": 1168 + }, + { + "epoch": 1.96759941089838, + "grad_norm": 0.15931861102581024, + "learning_rate": 1e-05, + "loss": 2.1807, + "step": 1169 + }, + { + "epoch": 1.9692825583841784, + "grad_norm": 0.16015268862247467, + "learning_rate": 1e-05, + "loss": 2.2737, + "step": 1170 + }, + { + "epoch": 1.9709657058699768, + "grad_norm": 0.14189362525939941, + "learning_rate": 1e-05, + "loss": 2.3416, + "step": 1171 + }, + { + "epoch": 1.9726488533557753, + "grad_norm": 0.1655077338218689, + "learning_rate": 1e-05, + "loss": 2.184, + "step": 1172 + }, + { + "epoch": 1.974332000841574, + "grad_norm": 0.17838408052921295, + "learning_rate": 1e-05, + "loss": 2.2466, + "step": 1173 + }, + { + "epoch": 1.9760151483273722, + "grad_norm": 0.16605247557163239, + "learning_rate": 1e-05, + "loss": 2.2019, + "step": 1174 + }, + { + "epoch": 1.9776982958131706, + "grad_norm": 0.15444627404212952, + "learning_rate": 1e-05, + "loss": 2.2382, + "step": 1175 + }, + { + "epoch": 1.9793814432989691, + "grad_norm": 0.15730591118335724, + "learning_rate": 1e-05, + "loss": 2.3335, + "step": 1176 + }, + { + "epoch": 1.9810645907847675, + "grad_norm": 0.17332051694393158, + "learning_rate": 1e-05, + "loss": 2.17, + "step": 1177 + }, + { + "epoch": 1.9827477382705658, + "grad_norm": 0.15129022300243378, + "learning_rate": 1e-05, + "loss": 2.2584, + "step": 1178 + }, + { + "epoch": 1.9844308857563644, + "grad_norm": 0.16302135586738586, + "learning_rate": 1e-05, + "loss": 2.1904, + "step": 1179 + }, + { + "epoch": 1.986114033242163, + "grad_norm": 0.14117322862148285, + "learning_rate": 1e-05, + "loss": 2.3611, + "step": 1180 + }, + { + "epoch": 1.9877971807279613, + "grad_norm": 0.14415599405765533, + "learning_rate": 1e-05, + "loss": 2.3503, + "step": 1181 + }, + { + "epoch": 1.9894803282137596, + "grad_norm": 0.15894141793251038, + "learning_rate": 1e-05, + "loss": 2.2253, + "step": 1182 + }, + { + "epoch": 1.9911634756995582, + "grad_norm": 0.15063215792179108, + "learning_rate": 1e-05, + "loss": 2.303, + "step": 1183 + }, + { + "epoch": 1.9928466231853568, + "grad_norm": 0.15843670070171356, + "learning_rate": 1e-05, + "loss": 2.2959, + "step": 1184 + }, + { + "epoch": 1.9945297706711551, + "grad_norm": 0.1457902193069458, + "learning_rate": 1e-05, + "loss": 2.3396, + "step": 1185 + }, + { + "epoch": 1.9962129181569535, + "grad_norm": 0.1694038361310959, + "learning_rate": 1e-05, + "loss": 2.3169, + "step": 1186 + }, + { + "epoch": 1.997896065642752, + "grad_norm": 0.16121593117713928, + "learning_rate": 1e-05, + "loss": 2.2754, + "step": 1187 + }, + { + "epoch": 1.9995792131285504, + "grad_norm": 0.16226674616336823, + "learning_rate": 1e-05, + "loss": 2.2498, + "step": 1188 + }, + { + "epoch": 1.9995792131285504, + "step": 1188, + "total_flos": 2.494777289795961e+18, + "train_loss": 2.3332799018834174, + "train_runtime": 81586.2049, + "train_samples_per_second": 0.932, + "train_steps_per_second": 0.015 + } + ], + "logging_steps": 1.0, + "max_steps": 1188, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.494777289795961e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}