{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9995792131285504, "eval_steps": 500, "global_step": 1188, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001683147485798443, "grad_norm": 0.17560942471027374, "learning_rate": 0.0, "loss": 2.613, "step": 1 }, { "epoch": 0.003366294971596886, "grad_norm": 0.15861666202545166, "learning_rate": 2.7894294565112984e-06, "loss": 2.6655, "step": 2 }, { "epoch": 0.005049442457395329, "grad_norm": 0.1817302405834198, "learning_rate": 4.421141086977404e-06, "loss": 2.55, "step": 3 }, { "epoch": 0.006732589943193772, "grad_norm": 0.17854492366313934, "learning_rate": 5.578858913022597e-06, "loss": 2.7908, "step": 4 }, { "epoch": 0.008415737428992216, "grad_norm": 0.17169038951396942, "learning_rate": 6.47685462377997e-06, "loss": 2.6868, "step": 5 }, { "epoch": 0.010098884914790659, "grad_norm": 0.18368647992610931, "learning_rate": 7.210570543488702e-06, "loss": 2.5874, "step": 6 }, { "epoch": 0.011782032400589101, "grad_norm": 0.19648714363574982, "learning_rate": 7.830918514469461e-06, "loss": 2.6633, "step": 7 }, { "epoch": 0.013465179886387544, "grad_norm": 0.18358571827411652, "learning_rate": 8.368288369533896e-06, "loss": 2.6355, "step": 8 }, { "epoch": 0.015148327372185988, "grad_norm": 0.19153611361980438, "learning_rate": 8.842282173954808e-06, "loss": 2.6633, "step": 9 }, { "epoch": 0.016831474857984433, "grad_norm": 0.20646820962429047, "learning_rate": 9.26628408029127e-06, "loss": 2.7268, "step": 10 }, { "epoch": 0.018514622343782875, "grad_norm": 0.18688935041427612, "learning_rate": 9.64984045981344e-06, "loss": 2.7832, "step": 11 }, { "epoch": 0.020197769829581318, "grad_norm": 0.1985747218132019, "learning_rate": 1e-05, "loss": 2.738, "step": 12 }, { "epoch": 0.02188091731537976, "grad_norm": 0.19321100413799286, "learning_rate": 1e-05, "loss": 2.6206, "step": 13 }, { "epoch": 0.023564064801178203, "grad_norm": 0.1875382661819458, "learning_rate": 1e-05, "loss": 2.7153, "step": 14 }, { "epoch": 0.025247212286976645, "grad_norm": 0.18803201615810394, "learning_rate": 1e-05, "loss": 2.5359, "step": 15 }, { "epoch": 0.026930359772775088, "grad_norm": 0.19693922996520996, "learning_rate": 1e-05, "loss": 2.6082, "step": 16 }, { "epoch": 0.028613507258573534, "grad_norm": 0.20534300804138184, "learning_rate": 1e-05, "loss": 2.5317, "step": 17 }, { "epoch": 0.030296654744371977, "grad_norm": 0.22174465656280518, "learning_rate": 1e-05, "loss": 2.6067, "step": 18 }, { "epoch": 0.03197980223017042, "grad_norm": 0.1947612464427948, "learning_rate": 1e-05, "loss": 2.6824, "step": 19 }, { "epoch": 0.033662949715968865, "grad_norm": 0.19715926051139832, "learning_rate": 1e-05, "loss": 2.6868, "step": 20 }, { "epoch": 0.035346097201767304, "grad_norm": 0.19586338102817535, "learning_rate": 1e-05, "loss": 2.6206, "step": 21 }, { "epoch": 0.03702924468756575, "grad_norm": 0.19280074536800385, "learning_rate": 1e-05, "loss": 2.6023, "step": 22 }, { "epoch": 0.03871239217336419, "grad_norm": 0.19658198952674866, "learning_rate": 1e-05, "loss": 2.6384, "step": 23 }, { "epoch": 0.040395539659162635, "grad_norm": 0.17433768510818481, "learning_rate": 1e-05, "loss": 2.5305, "step": 24 }, { "epoch": 0.042078687144961074, "grad_norm": 0.18013380467891693, "learning_rate": 1e-05, "loss": 2.6519, "step": 25 }, { "epoch": 0.04376183463075952, "grad_norm": 0.1933555006980896, "learning_rate": 1e-05, "loss": 2.5591, "step": 26 }, { "epoch": 0.045444982116557966, "grad_norm": 0.18386027216911316, "learning_rate": 1e-05, "loss": 2.6169, "step": 27 }, { "epoch": 0.047128129602356406, "grad_norm": 0.18173415958881378, "learning_rate": 1e-05, "loss": 2.623, "step": 28 }, { "epoch": 0.04881127708815485, "grad_norm": 0.19154761731624603, "learning_rate": 1e-05, "loss": 2.5981, "step": 29 }, { "epoch": 0.05049442457395329, "grad_norm": 0.2001664638519287, "learning_rate": 1e-05, "loss": 2.5066, "step": 30 }, { "epoch": 0.05217757205975174, "grad_norm": 0.15573543310165405, "learning_rate": 1e-05, "loss": 2.6013, "step": 31 }, { "epoch": 0.053860719545550176, "grad_norm": 0.16071979701519012, "learning_rate": 1e-05, "loss": 2.4634, "step": 32 }, { "epoch": 0.05554386703134862, "grad_norm": 0.1769736260175705, "learning_rate": 1e-05, "loss": 2.5491, "step": 33 }, { "epoch": 0.05722701451714707, "grad_norm": 0.17623937129974365, "learning_rate": 1e-05, "loss": 2.4399, "step": 34 }, { "epoch": 0.05891016200294551, "grad_norm": 0.17367449402809143, "learning_rate": 1e-05, "loss": 2.5464, "step": 35 }, { "epoch": 0.06059330948874395, "grad_norm": 0.14842955768108368, "learning_rate": 1e-05, "loss": 2.4174, "step": 36 }, { "epoch": 0.06227645697454239, "grad_norm": 0.17405100166797638, "learning_rate": 1e-05, "loss": 2.5303, "step": 37 }, { "epoch": 0.06395960446034084, "grad_norm": 0.145203098654747, "learning_rate": 1e-05, "loss": 2.6428, "step": 38 }, { "epoch": 0.06564275194613928, "grad_norm": 0.1542726755142212, "learning_rate": 1e-05, "loss": 2.5618, "step": 39 }, { "epoch": 0.06732589943193773, "grad_norm": 0.14489781856536865, "learning_rate": 1e-05, "loss": 2.6885, "step": 40 }, { "epoch": 0.06900904691773617, "grad_norm": 0.14798486232757568, "learning_rate": 1e-05, "loss": 2.5322, "step": 41 }, { "epoch": 0.07069219440353461, "grad_norm": 0.15226829051971436, "learning_rate": 1e-05, "loss": 2.6011, "step": 42 }, { "epoch": 0.07237534188933305, "grad_norm": 0.14561522006988525, "learning_rate": 1e-05, "loss": 2.5657, "step": 43 }, { "epoch": 0.0740584893751315, "grad_norm": 0.13787826895713806, "learning_rate": 1e-05, "loss": 2.6011, "step": 44 }, { "epoch": 0.07574163686092994, "grad_norm": 0.14005698263645172, "learning_rate": 1e-05, "loss": 2.4673, "step": 45 }, { "epoch": 0.07742478434672838, "grad_norm": 0.13822345435619354, "learning_rate": 1e-05, "loss": 2.512, "step": 46 }, { "epoch": 0.07910793183252683, "grad_norm": 0.1284177154302597, "learning_rate": 1e-05, "loss": 2.5625, "step": 47 }, { "epoch": 0.08079107931832527, "grad_norm": 0.1279960423707962, "learning_rate": 1e-05, "loss": 2.46, "step": 48 }, { "epoch": 0.08247422680412371, "grad_norm": 0.12479826807975769, "learning_rate": 1e-05, "loss": 2.5706, "step": 49 }, { "epoch": 0.08415737428992215, "grad_norm": 0.12982836365699768, "learning_rate": 1e-05, "loss": 2.5098, "step": 50 }, { "epoch": 0.0858405217757206, "grad_norm": 0.13269256055355072, "learning_rate": 1e-05, "loss": 2.4688, "step": 51 }, { "epoch": 0.08752366926151904, "grad_norm": 0.11713477969169617, "learning_rate": 1e-05, "loss": 2.6226, "step": 52 }, { "epoch": 0.08920681674731748, "grad_norm": 0.11179152131080627, "learning_rate": 1e-05, "loss": 2.4224, "step": 53 }, { "epoch": 0.09088996423311593, "grad_norm": 0.12146276980638504, "learning_rate": 1e-05, "loss": 2.4639, "step": 54 }, { "epoch": 0.09257311171891437, "grad_norm": 0.12470445781946182, "learning_rate": 1e-05, "loss": 2.5195, "step": 55 }, { "epoch": 0.09425625920471281, "grad_norm": 0.11872275173664093, "learning_rate": 1e-05, "loss": 2.5186, "step": 56 }, { "epoch": 0.09593940669051125, "grad_norm": 0.11616484075784683, "learning_rate": 1e-05, "loss": 2.5581, "step": 57 }, { "epoch": 0.0976225541763097, "grad_norm": 0.1075875386595726, "learning_rate": 1e-05, "loss": 2.5693, "step": 58 }, { "epoch": 0.09930570166210814, "grad_norm": 0.10176095366477966, "learning_rate": 1e-05, "loss": 2.521, "step": 59 }, { "epoch": 0.10098884914790658, "grad_norm": 0.1076890155673027, "learning_rate": 1e-05, "loss": 2.53, "step": 60 }, { "epoch": 0.10267199663370503, "grad_norm": 0.09105601906776428, "learning_rate": 1e-05, "loss": 2.3733, "step": 61 }, { "epoch": 0.10435514411950347, "grad_norm": 0.09733142703771591, "learning_rate": 1e-05, "loss": 2.416, "step": 62 }, { "epoch": 0.10603829160530191, "grad_norm": 0.09099874645471573, "learning_rate": 1e-05, "loss": 2.3774, "step": 63 }, { "epoch": 0.10772143909110035, "grad_norm": 0.0884426161646843, "learning_rate": 1e-05, "loss": 2.4136, "step": 64 }, { "epoch": 0.1094045865768988, "grad_norm": 0.08939989656209946, "learning_rate": 1e-05, "loss": 2.4482, "step": 65 }, { "epoch": 0.11108773406269724, "grad_norm": 0.09078355878591537, "learning_rate": 1e-05, "loss": 2.5256, "step": 66 }, { "epoch": 0.11277088154849568, "grad_norm": 0.08570227026939392, "learning_rate": 1e-05, "loss": 2.4954, "step": 67 }, { "epoch": 0.11445402903429414, "grad_norm": 0.0766797736287117, "learning_rate": 1e-05, "loss": 2.3694, "step": 68 }, { "epoch": 0.11613717652009257, "grad_norm": 0.08015618473291397, "learning_rate": 1e-05, "loss": 2.4724, "step": 69 }, { "epoch": 0.11782032400589101, "grad_norm": 0.08956343680620193, "learning_rate": 1e-05, "loss": 2.47, "step": 70 }, { "epoch": 0.11950347149168945, "grad_norm": 0.08134786039590836, "learning_rate": 1e-05, "loss": 2.4482, "step": 71 }, { "epoch": 0.1211866189774879, "grad_norm": 0.07923366874456406, "learning_rate": 1e-05, "loss": 2.4182, "step": 72 }, { "epoch": 0.12286976646328635, "grad_norm": 0.07909434288740158, "learning_rate": 1e-05, "loss": 2.3711, "step": 73 }, { "epoch": 0.12455291394908478, "grad_norm": 0.07540368288755417, "learning_rate": 1e-05, "loss": 2.3962, "step": 74 }, { "epoch": 0.12623606143488322, "grad_norm": 0.06906846165657043, "learning_rate": 1e-05, "loss": 2.519, "step": 75 }, { "epoch": 0.12791920892068168, "grad_norm": 0.07301697880029678, "learning_rate": 1e-05, "loss": 2.5537, "step": 76 }, { "epoch": 0.12960235640648013, "grad_norm": 0.07182423770427704, "learning_rate": 1e-05, "loss": 2.4807, "step": 77 }, { "epoch": 0.13128550389227855, "grad_norm": 0.06827539950609207, "learning_rate": 1e-05, "loss": 2.5796, "step": 78 }, { "epoch": 0.132968651378077, "grad_norm": 0.07280007749795914, "learning_rate": 1e-05, "loss": 2.499, "step": 79 }, { "epoch": 0.13465179886387546, "grad_norm": 0.07410164177417755, "learning_rate": 1e-05, "loss": 2.3418, "step": 80 }, { "epoch": 0.13633494634967389, "grad_norm": 0.07245635986328125, "learning_rate": 1e-05, "loss": 2.4685, "step": 81 }, { "epoch": 0.13801809383547234, "grad_norm": 0.06992876529693604, "learning_rate": 1e-05, "loss": 2.4634, "step": 82 }, { "epoch": 0.13970124132127076, "grad_norm": 0.07322832196950912, "learning_rate": 1e-05, "loss": 2.4949, "step": 83 }, { "epoch": 0.14138438880706922, "grad_norm": 0.06528163701295853, "learning_rate": 1e-05, "loss": 2.3982, "step": 84 }, { "epoch": 0.14306753629286767, "grad_norm": 0.06972632557153702, "learning_rate": 1e-05, "loss": 2.4268, "step": 85 }, { "epoch": 0.1447506837786661, "grad_norm": 0.062493499368429184, "learning_rate": 1e-05, "loss": 2.4309, "step": 86 }, { "epoch": 0.14643383126446455, "grad_norm": 0.07086165249347687, "learning_rate": 1e-05, "loss": 2.4373, "step": 87 }, { "epoch": 0.148116978750263, "grad_norm": 0.06631726026535034, "learning_rate": 1e-05, "loss": 2.4141, "step": 88 }, { "epoch": 0.14980012623606143, "grad_norm": 0.07114582508802414, "learning_rate": 1e-05, "loss": 2.3546, "step": 89 }, { "epoch": 0.15148327372185988, "grad_norm": 0.06932078301906586, "learning_rate": 1e-05, "loss": 2.4758, "step": 90 }, { "epoch": 0.15316642120765833, "grad_norm": 0.06153389438986778, "learning_rate": 1e-05, "loss": 2.481, "step": 91 }, { "epoch": 0.15484956869345676, "grad_norm": 0.06216192990541458, "learning_rate": 1e-05, "loss": 2.4421, "step": 92 }, { "epoch": 0.1565327161792552, "grad_norm": 0.06554314494132996, "learning_rate": 1e-05, "loss": 2.3008, "step": 93 }, { "epoch": 0.15821586366505366, "grad_norm": 0.06210967153310776, "learning_rate": 1e-05, "loss": 2.2554, "step": 94 }, { "epoch": 0.1598990111508521, "grad_norm": 0.06851295381784439, "learning_rate": 1e-05, "loss": 2.5356, "step": 95 }, { "epoch": 0.16158215863665054, "grad_norm": 0.06121644005179405, "learning_rate": 1e-05, "loss": 2.4299, "step": 96 }, { "epoch": 0.16326530612244897, "grad_norm": 0.06593657284975052, "learning_rate": 1e-05, "loss": 2.3811, "step": 97 }, { "epoch": 0.16494845360824742, "grad_norm": 0.06456276774406433, "learning_rate": 1e-05, "loss": 2.3574, "step": 98 }, { "epoch": 0.16663160109404587, "grad_norm": 0.061866894364356995, "learning_rate": 1e-05, "loss": 2.4758, "step": 99 }, { "epoch": 0.1683147485798443, "grad_norm": 0.058500371873378754, "learning_rate": 1e-05, "loss": 2.4133, "step": 100 }, { "epoch": 0.16999789606564275, "grad_norm": 0.06366603821516037, "learning_rate": 1e-05, "loss": 2.3328, "step": 101 }, { "epoch": 0.1716810435514412, "grad_norm": 0.061924271285533905, "learning_rate": 1e-05, "loss": 2.4047, "step": 102 }, { "epoch": 0.17336419103723963, "grad_norm": 0.057471342384815216, "learning_rate": 1e-05, "loss": 2.4333, "step": 103 }, { "epoch": 0.17504733852303808, "grad_norm": 0.05482906475663185, "learning_rate": 1e-05, "loss": 2.3499, "step": 104 }, { "epoch": 0.17673048600883653, "grad_norm": 0.056116051971912384, "learning_rate": 1e-05, "loss": 2.4653, "step": 105 }, { "epoch": 0.17841363349463496, "grad_norm": 0.052277661859989166, "learning_rate": 1e-05, "loss": 2.4653, "step": 106 }, { "epoch": 0.1800967809804334, "grad_norm": 0.06346592307090759, "learning_rate": 1e-05, "loss": 2.3549, "step": 107 }, { "epoch": 0.18177992846623187, "grad_norm": 0.06070290133357048, "learning_rate": 1e-05, "loss": 2.2886, "step": 108 }, { "epoch": 0.1834630759520303, "grad_norm": 0.055994004011154175, "learning_rate": 1e-05, "loss": 2.4692, "step": 109 }, { "epoch": 0.18514622343782874, "grad_norm": 0.05782800912857056, "learning_rate": 1e-05, "loss": 2.3303, "step": 110 }, { "epoch": 0.18682937092362717, "grad_norm": 0.05491410568356514, "learning_rate": 1e-05, "loss": 2.47, "step": 111 }, { "epoch": 0.18851251840942562, "grad_norm": 0.060252465307712555, "learning_rate": 1e-05, "loss": 2.5464, "step": 112 }, { "epoch": 0.19019566589522408, "grad_norm": 0.05614893510937691, "learning_rate": 1e-05, "loss": 2.3457, "step": 113 }, { "epoch": 0.1918788133810225, "grad_norm": 0.051146939396858215, "learning_rate": 1e-05, "loss": 2.3918, "step": 114 }, { "epoch": 0.19356196086682095, "grad_norm": 0.05474052205681801, "learning_rate": 1e-05, "loss": 2.3689, "step": 115 }, { "epoch": 0.1952451083526194, "grad_norm": 0.052064936608076096, "learning_rate": 1e-05, "loss": 2.5073, "step": 116 }, { "epoch": 0.19692825583841783, "grad_norm": 0.06184034049510956, "learning_rate": 1e-05, "loss": 2.4248, "step": 117 }, { "epoch": 0.19861140332421628, "grad_norm": 0.05613533779978752, "learning_rate": 1e-05, "loss": 2.5742, "step": 118 }, { "epoch": 0.20029455081001474, "grad_norm": 0.05547456443309784, "learning_rate": 1e-05, "loss": 2.3884, "step": 119 }, { "epoch": 0.20197769829581316, "grad_norm": 0.05933033674955368, "learning_rate": 1e-05, "loss": 2.45, "step": 120 }, { "epoch": 0.20366084578161162, "grad_norm": 0.058600571006536484, "learning_rate": 1e-05, "loss": 2.3875, "step": 121 }, { "epoch": 0.20534399326741007, "grad_norm": 0.0554657019674778, "learning_rate": 1e-05, "loss": 2.3215, "step": 122 }, { "epoch": 0.2070271407532085, "grad_norm": 0.05604475364089012, "learning_rate": 1e-05, "loss": 2.3329, "step": 123 }, { "epoch": 0.20871028823900695, "grad_norm": 0.06094202771782875, "learning_rate": 1e-05, "loss": 2.4177, "step": 124 }, { "epoch": 0.2103934357248054, "grad_norm": 0.05517999082803726, "learning_rate": 1e-05, "loss": 2.3247, "step": 125 }, { "epoch": 0.21207658321060383, "grad_norm": 0.05678452178835869, "learning_rate": 1e-05, "loss": 2.3481, "step": 126 }, { "epoch": 0.21375973069640228, "grad_norm": 0.05295870825648308, "learning_rate": 1e-05, "loss": 2.3694, "step": 127 }, { "epoch": 0.2154428781822007, "grad_norm": 0.05118125304579735, "learning_rate": 1e-05, "loss": 2.4102, "step": 128 }, { "epoch": 0.21712602566799916, "grad_norm": 0.05659961327910423, "learning_rate": 1e-05, "loss": 2.3104, "step": 129 }, { "epoch": 0.2188091731537976, "grad_norm": 0.05049075558781624, "learning_rate": 1e-05, "loss": 2.4949, "step": 130 }, { "epoch": 0.22049232063959603, "grad_norm": 0.05323097109794617, "learning_rate": 1e-05, "loss": 2.323, "step": 131 }, { "epoch": 0.2221754681253945, "grad_norm": 0.05309610068798065, "learning_rate": 1e-05, "loss": 2.5203, "step": 132 }, { "epoch": 0.22385861561119294, "grad_norm": 0.05474167317152023, "learning_rate": 1e-05, "loss": 2.408, "step": 133 }, { "epoch": 0.22554176309699137, "grad_norm": 0.056433092802762985, "learning_rate": 1e-05, "loss": 2.3779, "step": 134 }, { "epoch": 0.22722491058278982, "grad_norm": 0.047424182295799255, "learning_rate": 1e-05, "loss": 2.45, "step": 135 }, { "epoch": 0.22890805806858827, "grad_norm": 0.05422671511769295, "learning_rate": 1e-05, "loss": 2.3397, "step": 136 }, { "epoch": 0.2305912055543867, "grad_norm": 0.05421329662203789, "learning_rate": 1e-05, "loss": 2.3779, "step": 137 }, { "epoch": 0.23227435304018515, "grad_norm": 0.057494040578603745, "learning_rate": 1e-05, "loss": 2.4509, "step": 138 }, { "epoch": 0.2339575005259836, "grad_norm": 0.0516960434615612, "learning_rate": 1e-05, "loss": 2.3647, "step": 139 }, { "epoch": 0.23564064801178203, "grad_norm": 0.049899645149707794, "learning_rate": 1e-05, "loss": 2.4844, "step": 140 }, { "epoch": 0.23732379549758048, "grad_norm": 0.05162065476179123, "learning_rate": 1e-05, "loss": 2.3613, "step": 141 }, { "epoch": 0.2390069429833789, "grad_norm": 0.05812832713127136, "learning_rate": 1e-05, "loss": 2.4548, "step": 142 }, { "epoch": 0.24069009046917736, "grad_norm": 0.04910556599497795, "learning_rate": 1e-05, "loss": 2.3274, "step": 143 }, { "epoch": 0.2423732379549758, "grad_norm": 0.05346587672829628, "learning_rate": 1e-05, "loss": 2.325, "step": 144 }, { "epoch": 0.24405638544077424, "grad_norm": 0.0495002381503582, "learning_rate": 1e-05, "loss": 2.4131, "step": 145 }, { "epoch": 0.2457395329265727, "grad_norm": 0.05076875165104866, "learning_rate": 1e-05, "loss": 2.3887, "step": 146 }, { "epoch": 0.24742268041237114, "grad_norm": 0.050955574959516525, "learning_rate": 1e-05, "loss": 2.4517, "step": 147 }, { "epoch": 0.24910582789816957, "grad_norm": 0.05082906410098076, "learning_rate": 1e-05, "loss": 2.3401, "step": 148 }, { "epoch": 0.250788975383968, "grad_norm": 0.052096717059612274, "learning_rate": 1e-05, "loss": 2.3218, "step": 149 }, { "epoch": 0.25247212286976645, "grad_norm": 0.052378151565790176, "learning_rate": 1e-05, "loss": 2.4246, "step": 150 }, { "epoch": 0.2541552703555649, "grad_norm": 0.04881056025624275, "learning_rate": 1e-05, "loss": 2.3435, "step": 151 }, { "epoch": 0.25583841784136335, "grad_norm": 0.05233067274093628, "learning_rate": 1e-05, "loss": 2.4761, "step": 152 }, { "epoch": 0.2575215653271618, "grad_norm": 0.05231297388672829, "learning_rate": 1e-05, "loss": 2.4065, "step": 153 }, { "epoch": 0.25920471281296026, "grad_norm": 0.04649129509925842, "learning_rate": 1e-05, "loss": 2.4175, "step": 154 }, { "epoch": 0.26088786029875866, "grad_norm": 0.05354660376906395, "learning_rate": 1e-05, "loss": 2.4731, "step": 155 }, { "epoch": 0.2625710077845571, "grad_norm": 0.05071151629090309, "learning_rate": 1e-05, "loss": 2.4421, "step": 156 }, { "epoch": 0.26425415527035556, "grad_norm": 0.04953297600150108, "learning_rate": 1e-05, "loss": 2.3134, "step": 157 }, { "epoch": 0.265937302756154, "grad_norm": 0.051142722368240356, "learning_rate": 1e-05, "loss": 2.3335, "step": 158 }, { "epoch": 0.26762045024195247, "grad_norm": 0.05187085270881653, "learning_rate": 1e-05, "loss": 2.4387, "step": 159 }, { "epoch": 0.2693035977277509, "grad_norm": 0.04968629032373428, "learning_rate": 1e-05, "loss": 2.4905, "step": 160 }, { "epoch": 0.2709867452135493, "grad_norm": 0.053009629249572754, "learning_rate": 1e-05, "loss": 2.4441, "step": 161 }, { "epoch": 0.27266989269934777, "grad_norm": 0.04917874187231064, "learning_rate": 1e-05, "loss": 2.4763, "step": 162 }, { "epoch": 0.2743530401851462, "grad_norm": 0.048884451389312744, "learning_rate": 1e-05, "loss": 2.4248, "step": 163 }, { "epoch": 0.2760361876709447, "grad_norm": 0.049946676939725876, "learning_rate": 1e-05, "loss": 2.5173, "step": 164 }, { "epoch": 0.27771933515674313, "grad_norm": 0.052534863352775574, "learning_rate": 1e-05, "loss": 2.4558, "step": 165 }, { "epoch": 0.2794024826425415, "grad_norm": 0.05162844434380531, "learning_rate": 1e-05, "loss": 2.405, "step": 166 }, { "epoch": 0.28108563012834, "grad_norm": 0.049985259771347046, "learning_rate": 1e-05, "loss": 2.3542, "step": 167 }, { "epoch": 0.28276877761413843, "grad_norm": 0.05239354074001312, "learning_rate": 1e-05, "loss": 2.3721, "step": 168 }, { "epoch": 0.2844519250999369, "grad_norm": 0.05592744052410126, "learning_rate": 1e-05, "loss": 2.2701, "step": 169 }, { "epoch": 0.28613507258573534, "grad_norm": 0.052739113569259644, "learning_rate": 1e-05, "loss": 2.4216, "step": 170 }, { "epoch": 0.2878182200715338, "grad_norm": 0.04806948080658913, "learning_rate": 1e-05, "loss": 2.3884, "step": 171 }, { "epoch": 0.2895013675573322, "grad_norm": 0.04990949481725693, "learning_rate": 1e-05, "loss": 2.4419, "step": 172 }, { "epoch": 0.29118451504313064, "grad_norm": 0.050067439675331116, "learning_rate": 1e-05, "loss": 2.4331, "step": 173 }, { "epoch": 0.2928676625289291, "grad_norm": 0.0507354810833931, "learning_rate": 1e-05, "loss": 2.406, "step": 174 }, { "epoch": 0.29455081001472755, "grad_norm": 0.0538686104118824, "learning_rate": 1e-05, "loss": 2.4182, "step": 175 }, { "epoch": 0.296233957500526, "grad_norm": 0.05205219238996506, "learning_rate": 1e-05, "loss": 2.3401, "step": 176 }, { "epoch": 0.2979171049863244, "grad_norm": 0.04672086611390114, "learning_rate": 1e-05, "loss": 2.3149, "step": 177 }, { "epoch": 0.29960025247212285, "grad_norm": 0.051963068544864655, "learning_rate": 1e-05, "loss": 2.2537, "step": 178 }, { "epoch": 0.3012833999579213, "grad_norm": 0.053639005869627, "learning_rate": 1e-05, "loss": 2.4353, "step": 179 }, { "epoch": 0.30296654744371976, "grad_norm": 0.05326982960104942, "learning_rate": 1e-05, "loss": 2.334, "step": 180 }, { "epoch": 0.3046496949295182, "grad_norm": 0.05361334979534149, "learning_rate": 1e-05, "loss": 2.4224, "step": 181 }, { "epoch": 0.30633284241531666, "grad_norm": 0.05790587514638901, "learning_rate": 1e-05, "loss": 2.334, "step": 182 }, { "epoch": 0.30801598990111506, "grad_norm": 0.04790763929486275, "learning_rate": 1e-05, "loss": 2.5073, "step": 183 }, { "epoch": 0.3096991373869135, "grad_norm": 0.054103124886751175, "learning_rate": 1e-05, "loss": 2.3483, "step": 184 }, { "epoch": 0.31138228487271197, "grad_norm": 0.05902162939310074, "learning_rate": 1e-05, "loss": 2.3301, "step": 185 }, { "epoch": 0.3130654323585104, "grad_norm": 0.04853544384241104, "learning_rate": 1e-05, "loss": 2.5566, "step": 186 }, { "epoch": 0.3147485798443089, "grad_norm": 0.055288165807724, "learning_rate": 1e-05, "loss": 2.2903, "step": 187 }, { "epoch": 0.3164317273301073, "grad_norm": 0.05180734023451805, "learning_rate": 1e-05, "loss": 2.4285, "step": 188 }, { "epoch": 0.3181148748159057, "grad_norm": 0.04889997839927673, "learning_rate": 1e-05, "loss": 2.2542, "step": 189 }, { "epoch": 0.3197980223017042, "grad_norm": 0.051011502742767334, "learning_rate": 1e-05, "loss": 2.2893, "step": 190 }, { "epoch": 0.32148116978750263, "grad_norm": 0.04864371567964554, "learning_rate": 1e-05, "loss": 2.5225, "step": 191 }, { "epoch": 0.3231643172733011, "grad_norm": 0.05374041944742203, "learning_rate": 1e-05, "loss": 2.4504, "step": 192 }, { "epoch": 0.32484746475909954, "grad_norm": 0.05158041790127754, "learning_rate": 1e-05, "loss": 2.4683, "step": 193 }, { "epoch": 0.32653061224489793, "grad_norm": 0.05630083382129669, "learning_rate": 1e-05, "loss": 2.2415, "step": 194 }, { "epoch": 0.3282137597306964, "grad_norm": 0.05439196154475212, "learning_rate": 1e-05, "loss": 2.3684, "step": 195 }, { "epoch": 0.32989690721649484, "grad_norm": 0.05023415759205818, "learning_rate": 1e-05, "loss": 2.415, "step": 196 }, { "epoch": 0.3315800547022933, "grad_norm": 0.05531445890665054, "learning_rate": 1e-05, "loss": 2.4626, "step": 197 }, { "epoch": 0.33326320218809175, "grad_norm": 0.05087656155228615, "learning_rate": 1e-05, "loss": 2.3936, "step": 198 }, { "epoch": 0.3349463496738902, "grad_norm": 0.05231088399887085, "learning_rate": 1e-05, "loss": 2.3779, "step": 199 }, { "epoch": 0.3366294971596886, "grad_norm": 0.0514984093606472, "learning_rate": 1e-05, "loss": 2.3967, "step": 200 }, { "epoch": 0.33831264464548705, "grad_norm": 0.05334719642996788, "learning_rate": 1e-05, "loss": 2.4604, "step": 201 }, { "epoch": 0.3399957921312855, "grad_norm": 0.054843124002218246, "learning_rate": 1e-05, "loss": 2.3538, "step": 202 }, { "epoch": 0.34167893961708395, "grad_norm": 0.04888272285461426, "learning_rate": 1e-05, "loss": 2.4844, "step": 203 }, { "epoch": 0.3433620871028824, "grad_norm": 0.054122187197208405, "learning_rate": 1e-05, "loss": 2.3291, "step": 204 }, { "epoch": 0.34504523458868086, "grad_norm": 0.054561201483011246, "learning_rate": 1e-05, "loss": 2.3218, "step": 205 }, { "epoch": 0.34672838207447926, "grad_norm": 0.04919834062457085, "learning_rate": 1e-05, "loss": 2.4478, "step": 206 }, { "epoch": 0.3484115295602777, "grad_norm": 0.050551943480968475, "learning_rate": 1e-05, "loss": 2.3755, "step": 207 }, { "epoch": 0.35009467704607616, "grad_norm": 0.05242514982819557, "learning_rate": 1e-05, "loss": 2.3922, "step": 208 }, { "epoch": 0.3517778245318746, "grad_norm": 0.06077054515480995, "learning_rate": 1e-05, "loss": 2.3218, "step": 209 }, { "epoch": 0.35346097201767307, "grad_norm": 0.061367545276880264, "learning_rate": 1e-05, "loss": 2.2957, "step": 210 }, { "epoch": 0.35514411950347147, "grad_norm": 0.0511772483587265, "learning_rate": 1e-05, "loss": 2.374, "step": 211 }, { "epoch": 0.3568272669892699, "grad_norm": 0.0496203638613224, "learning_rate": 1e-05, "loss": 2.4182, "step": 212 }, { "epoch": 0.3585104144750684, "grad_norm": 0.061339233070611954, "learning_rate": 1e-05, "loss": 2.406, "step": 213 }, { "epoch": 0.3601935619608668, "grad_norm": 0.052460432052612305, "learning_rate": 1e-05, "loss": 2.4309, "step": 214 }, { "epoch": 0.3618767094466653, "grad_norm": 0.055436089634895325, "learning_rate": 1e-05, "loss": 2.4141, "step": 215 }, { "epoch": 0.36355985693246373, "grad_norm": 0.05396036058664322, "learning_rate": 1e-05, "loss": 2.2705, "step": 216 }, { "epoch": 0.36524300441826213, "grad_norm": 0.04853086173534393, "learning_rate": 1e-05, "loss": 2.4473, "step": 217 }, { "epoch": 0.3669261519040606, "grad_norm": 0.051015399396419525, "learning_rate": 1e-05, "loss": 2.5115, "step": 218 }, { "epoch": 0.36860929938985904, "grad_norm": 0.05526035279035568, "learning_rate": 1e-05, "loss": 2.3123, "step": 219 }, { "epoch": 0.3702924468756575, "grad_norm": 0.056169234216213226, "learning_rate": 1e-05, "loss": 2.3447, "step": 220 }, { "epoch": 0.37197559436145594, "grad_norm": 0.05238133668899536, "learning_rate": 1e-05, "loss": 2.26, "step": 221 }, { "epoch": 0.37365874184725434, "grad_norm": 0.05587685480713844, "learning_rate": 1e-05, "loss": 2.3083, "step": 222 }, { "epoch": 0.3753418893330528, "grad_norm": 0.050364553928375244, "learning_rate": 1e-05, "loss": 2.3459, "step": 223 }, { "epoch": 0.37702503681885124, "grad_norm": 0.0506574809551239, "learning_rate": 1e-05, "loss": 2.4246, "step": 224 }, { "epoch": 0.3787081843046497, "grad_norm": 0.05842865630984306, "learning_rate": 1e-05, "loss": 2.2617, "step": 225 }, { "epoch": 0.38039133179044815, "grad_norm": 0.05097496882081032, "learning_rate": 1e-05, "loss": 2.52, "step": 226 }, { "epoch": 0.3820744792762466, "grad_norm": 0.05665278434753418, "learning_rate": 1e-05, "loss": 2.2715, "step": 227 }, { "epoch": 0.383757626762045, "grad_norm": 0.053350359201431274, "learning_rate": 1e-05, "loss": 2.3101, "step": 228 }, { "epoch": 0.38544077424784345, "grad_norm": 0.05481604114174843, "learning_rate": 1e-05, "loss": 2.3347, "step": 229 }, { "epoch": 0.3871239217336419, "grad_norm": 0.06036606431007385, "learning_rate": 1e-05, "loss": 2.2991, "step": 230 }, { "epoch": 0.38880706921944036, "grad_norm": 0.0606355145573616, "learning_rate": 1e-05, "loss": 2.4226, "step": 231 }, { "epoch": 0.3904902167052388, "grad_norm": 0.052770137786865234, "learning_rate": 1e-05, "loss": 2.4539, "step": 232 }, { "epoch": 0.39217336419103727, "grad_norm": 0.050006203353405, "learning_rate": 1e-05, "loss": 2.3477, "step": 233 }, { "epoch": 0.39385651167683566, "grad_norm": 0.05640649050474167, "learning_rate": 1e-05, "loss": 2.3123, "step": 234 }, { "epoch": 0.3955396591626341, "grad_norm": 0.050969429314136505, "learning_rate": 1e-05, "loss": 2.4534, "step": 235 }, { "epoch": 0.39722280664843257, "grad_norm": 0.05676101893186569, "learning_rate": 1e-05, "loss": 2.3481, "step": 236 }, { "epoch": 0.398905954134231, "grad_norm": 0.05844707787036896, "learning_rate": 1e-05, "loss": 2.3638, "step": 237 }, { "epoch": 0.4005891016200295, "grad_norm": 0.053074926137924194, "learning_rate": 1e-05, "loss": 2.3904, "step": 238 }, { "epoch": 0.4022722491058279, "grad_norm": 0.04979414492845535, "learning_rate": 1e-05, "loss": 2.3855, "step": 239 }, { "epoch": 0.4039553965916263, "grad_norm": 0.05607665330171585, "learning_rate": 1e-05, "loss": 2.3569, "step": 240 }, { "epoch": 0.4056385440774248, "grad_norm": 0.05964501202106476, "learning_rate": 1e-05, "loss": 2.3459, "step": 241 }, { "epoch": 0.40732169156322323, "grad_norm": 0.05849093198776245, "learning_rate": 1e-05, "loss": 2.3213, "step": 242 }, { "epoch": 0.4090048390490217, "grad_norm": 0.053846072405576706, "learning_rate": 1e-05, "loss": 2.4436, "step": 243 }, { "epoch": 0.41068798653482014, "grad_norm": 0.054448988288640976, "learning_rate": 1e-05, "loss": 2.3716, "step": 244 }, { "epoch": 0.41237113402061853, "grad_norm": 0.05229583755135536, "learning_rate": 1e-05, "loss": 2.4099, "step": 245 }, { "epoch": 0.414054281506417, "grad_norm": 0.05479966476559639, "learning_rate": 1e-05, "loss": 2.4026, "step": 246 }, { "epoch": 0.41573742899221544, "grad_norm": 0.061799049377441406, "learning_rate": 1e-05, "loss": 2.4072, "step": 247 }, { "epoch": 0.4174205764780139, "grad_norm": 0.061452727764844894, "learning_rate": 1e-05, "loss": 2.2833, "step": 248 }, { "epoch": 0.41910372396381235, "grad_norm": 0.05868072435259819, "learning_rate": 1e-05, "loss": 2.3833, "step": 249 }, { "epoch": 0.4207868714496108, "grad_norm": 0.05926290899515152, "learning_rate": 1e-05, "loss": 2.3645, "step": 250 }, { "epoch": 0.4224700189354092, "grad_norm": 0.058858342468738556, "learning_rate": 1e-05, "loss": 2.3152, "step": 251 }, { "epoch": 0.42415316642120765, "grad_norm": 0.058599065989255905, "learning_rate": 1e-05, "loss": 2.2827, "step": 252 }, { "epoch": 0.4258363139070061, "grad_norm": 0.060381706804037094, "learning_rate": 1e-05, "loss": 2.3024, "step": 253 }, { "epoch": 0.42751946139280456, "grad_norm": 0.05441940575838089, "learning_rate": 1e-05, "loss": 2.446, "step": 254 }, { "epoch": 0.429202608878603, "grad_norm": 0.05750846117734909, "learning_rate": 1e-05, "loss": 2.3958, "step": 255 }, { "epoch": 0.4308857563644014, "grad_norm": 0.060346368700265884, "learning_rate": 1e-05, "loss": 2.2395, "step": 256 }, { "epoch": 0.43256890385019986, "grad_norm": 0.056383710354566574, "learning_rate": 1e-05, "loss": 2.3518, "step": 257 }, { "epoch": 0.4342520513359983, "grad_norm": 0.057746805250644684, "learning_rate": 1e-05, "loss": 2.2834, "step": 258 }, { "epoch": 0.43593519882179677, "grad_norm": 0.051562029868364334, "learning_rate": 1e-05, "loss": 2.3677, "step": 259 }, { "epoch": 0.4376183463075952, "grad_norm": 0.059988316148519516, "learning_rate": 1e-05, "loss": 2.3372, "step": 260 }, { "epoch": 0.43930149379339367, "grad_norm": 0.05852155759930611, "learning_rate": 1e-05, "loss": 2.3875, "step": 261 }, { "epoch": 0.44098464127919207, "grad_norm": 0.06629418581724167, "learning_rate": 1e-05, "loss": 2.4194, "step": 262 }, { "epoch": 0.4426677887649905, "grad_norm": 0.061044465750455856, "learning_rate": 1e-05, "loss": 2.2466, "step": 263 }, { "epoch": 0.444350936250789, "grad_norm": 0.056285977363586426, "learning_rate": 1e-05, "loss": 2.3105, "step": 264 }, { "epoch": 0.44603408373658743, "grad_norm": 0.06135227158665657, "learning_rate": 1e-05, "loss": 2.3853, "step": 265 }, { "epoch": 0.4477172312223859, "grad_norm": 0.05644640699028969, "learning_rate": 1e-05, "loss": 2.3888, "step": 266 }, { "epoch": 0.4494003787081843, "grad_norm": 0.06326981633901596, "learning_rate": 1e-05, "loss": 2.3132, "step": 267 }, { "epoch": 0.45108352619398273, "grad_norm": 0.05710430070757866, "learning_rate": 1e-05, "loss": 2.365, "step": 268 }, { "epoch": 0.4527666736797812, "grad_norm": 0.05607946217060089, "learning_rate": 1e-05, "loss": 2.4648, "step": 269 }, { "epoch": 0.45444982116557964, "grad_norm": 0.057825781404972076, "learning_rate": 1e-05, "loss": 2.4189, "step": 270 }, { "epoch": 0.4561329686513781, "grad_norm": 0.06380680948495865, "learning_rate": 1e-05, "loss": 2.3188, "step": 271 }, { "epoch": 0.45781611613717654, "grad_norm": 0.06377760320901871, "learning_rate": 1e-05, "loss": 2.2896, "step": 272 }, { "epoch": 0.45949926362297494, "grad_norm": 0.06210333853960037, "learning_rate": 1e-05, "loss": 2.3663, "step": 273 }, { "epoch": 0.4611824111087734, "grad_norm": 0.06039275974035263, "learning_rate": 1e-05, "loss": 2.408, "step": 274 }, { "epoch": 0.46286555859457185, "grad_norm": 0.05442138388752937, "learning_rate": 1e-05, "loss": 2.3843, "step": 275 }, { "epoch": 0.4645487060803703, "grad_norm": 0.06208937615156174, "learning_rate": 1e-05, "loss": 2.4355, "step": 276 }, { "epoch": 0.46623185356616875, "grad_norm": 0.0619891993701458, "learning_rate": 1e-05, "loss": 2.3196, "step": 277 }, { "epoch": 0.4679150010519672, "grad_norm": 0.059192296117544174, "learning_rate": 1e-05, "loss": 2.3237, "step": 278 }, { "epoch": 0.4695981485377656, "grad_norm": 0.06284468621015549, "learning_rate": 1e-05, "loss": 2.3694, "step": 279 }, { "epoch": 0.47128129602356406, "grad_norm": 0.06121189519762993, "learning_rate": 1e-05, "loss": 2.3606, "step": 280 }, { "epoch": 0.4729644435093625, "grad_norm": 0.061919402331113815, "learning_rate": 1e-05, "loss": 2.3381, "step": 281 }, { "epoch": 0.47464759099516096, "grad_norm": 0.0676443800330162, "learning_rate": 1e-05, "loss": 2.3624, "step": 282 }, { "epoch": 0.4763307384809594, "grad_norm": 0.060140665620565414, "learning_rate": 1e-05, "loss": 2.4541, "step": 283 }, { "epoch": 0.4780138859667578, "grad_norm": 0.062285441905260086, "learning_rate": 1e-05, "loss": 2.323, "step": 284 }, { "epoch": 0.47969703345255627, "grad_norm": 0.06063227355480194, "learning_rate": 1e-05, "loss": 2.3596, "step": 285 }, { "epoch": 0.4813801809383547, "grad_norm": 0.05906851589679718, "learning_rate": 1e-05, "loss": 2.458, "step": 286 }, { "epoch": 0.48306332842415317, "grad_norm": 0.05862203240394592, "learning_rate": 1e-05, "loss": 2.291, "step": 287 }, { "epoch": 0.4847464759099516, "grad_norm": 0.0629325732588768, "learning_rate": 1e-05, "loss": 2.2634, "step": 288 }, { "epoch": 0.4864296233957501, "grad_norm": 0.06464157998561859, "learning_rate": 1e-05, "loss": 2.2531, "step": 289 }, { "epoch": 0.4881127708815485, "grad_norm": 0.0547555610537529, "learning_rate": 1e-05, "loss": 2.5339, "step": 290 }, { "epoch": 0.4897959183673469, "grad_norm": 0.0606168657541275, "learning_rate": 1e-05, "loss": 2.2886, "step": 291 }, { "epoch": 0.4914790658531454, "grad_norm": 0.058814577758312225, "learning_rate": 1e-05, "loss": 2.3337, "step": 292 }, { "epoch": 0.49316221333894383, "grad_norm": 0.0691385492682457, "learning_rate": 1e-05, "loss": 2.2904, "step": 293 }, { "epoch": 0.4948453608247423, "grad_norm": 0.06522157788276672, "learning_rate": 1e-05, "loss": 2.469, "step": 294 }, { "epoch": 0.4965285083105407, "grad_norm": 0.05957287177443504, "learning_rate": 1e-05, "loss": 2.4095, "step": 295 }, { "epoch": 0.49821165579633914, "grad_norm": 0.06277060508728027, "learning_rate": 1e-05, "loss": 2.4697, "step": 296 }, { "epoch": 0.4998948032821376, "grad_norm": 0.06802426278591156, "learning_rate": 1e-05, "loss": 2.2517, "step": 297 }, { "epoch": 0.501577950767936, "grad_norm": 0.06365792453289032, "learning_rate": 1e-05, "loss": 2.2942, "step": 298 }, { "epoch": 0.5032610982537344, "grad_norm": 0.06624794751405716, "learning_rate": 1e-05, "loss": 2.283, "step": 299 }, { "epoch": 0.5049442457395329, "grad_norm": 0.05979595705866814, "learning_rate": 1e-05, "loss": 2.4387, "step": 300 }, { "epoch": 0.5066273932253313, "grad_norm": 0.06187634915113449, "learning_rate": 1e-05, "loss": 2.4205, "step": 301 }, { "epoch": 0.5083105407111298, "grad_norm": 0.06389462947845459, "learning_rate": 1e-05, "loss": 2.2775, "step": 302 }, { "epoch": 0.5099936881969283, "grad_norm": 0.05831071361899376, "learning_rate": 1e-05, "loss": 2.3892, "step": 303 }, { "epoch": 0.5116768356827267, "grad_norm": 0.06568494439125061, "learning_rate": 1e-05, "loss": 2.3087, "step": 304 }, { "epoch": 0.5133599831685252, "grad_norm": 0.062109317630529404, "learning_rate": 1e-05, "loss": 2.3268, "step": 305 }, { "epoch": 0.5150431306543236, "grad_norm": 0.061168327927589417, "learning_rate": 1e-05, "loss": 2.3093, "step": 306 }, { "epoch": 0.5167262781401221, "grad_norm": 0.061159648001194, "learning_rate": 1e-05, "loss": 2.3315, "step": 307 }, { "epoch": 0.5184094256259205, "grad_norm": 0.06269169598817825, "learning_rate": 1e-05, "loss": 2.3442, "step": 308 }, { "epoch": 0.520092573111719, "grad_norm": 0.06711502373218536, "learning_rate": 1e-05, "loss": 2.2008, "step": 309 }, { "epoch": 0.5217757205975173, "grad_norm": 0.0663105845451355, "learning_rate": 1e-05, "loss": 2.3502, "step": 310 }, { "epoch": 0.5234588680833158, "grad_norm": 0.06040646880865097, "learning_rate": 1e-05, "loss": 2.3414, "step": 311 }, { "epoch": 0.5251420155691142, "grad_norm": 0.06823603063821793, "learning_rate": 1e-05, "loss": 2.3392, "step": 312 }, { "epoch": 0.5268251630549127, "grad_norm": 0.05944176763296127, "learning_rate": 1e-05, "loss": 2.3193, "step": 313 }, { "epoch": 0.5285083105407111, "grad_norm": 0.06610157340765, "learning_rate": 1e-05, "loss": 2.2288, "step": 314 }, { "epoch": 0.5301914580265096, "grad_norm": 0.06880299746990204, "learning_rate": 1e-05, "loss": 2.3529, "step": 315 }, { "epoch": 0.531874605512308, "grad_norm": 0.06061836704611778, "learning_rate": 1e-05, "loss": 2.3533, "step": 316 }, { "epoch": 0.5335577529981065, "grad_norm": 0.06552371382713318, "learning_rate": 1e-05, "loss": 2.3579, "step": 317 }, { "epoch": 0.5352409004839049, "grad_norm": 0.06967922300100327, "learning_rate": 1e-05, "loss": 2.2983, "step": 318 }, { "epoch": 0.5369240479697034, "grad_norm": 0.06997574120759964, "learning_rate": 1e-05, "loss": 2.355, "step": 319 }, { "epoch": 0.5386071954555018, "grad_norm": 0.0654403418302536, "learning_rate": 1e-05, "loss": 2.4258, "step": 320 }, { "epoch": 0.5402903429413002, "grad_norm": 0.06031208485364914, "learning_rate": 1e-05, "loss": 2.4011, "step": 321 }, { "epoch": 0.5419734904270986, "grad_norm": 0.06496379524469376, "learning_rate": 1e-05, "loss": 2.2429, "step": 322 }, { "epoch": 0.5436566379128971, "grad_norm": 0.06525281816720963, "learning_rate": 1e-05, "loss": 2.3254, "step": 323 }, { "epoch": 0.5453397853986955, "grad_norm": 0.07553514093160629, "learning_rate": 1e-05, "loss": 2.2953, "step": 324 }, { "epoch": 0.547022932884494, "grad_norm": 0.06429509073495865, "learning_rate": 1e-05, "loss": 2.3319, "step": 325 }, { "epoch": 0.5487060803702924, "grad_norm": 0.0657946914434433, "learning_rate": 1e-05, "loss": 2.3501, "step": 326 }, { "epoch": 0.5503892278560909, "grad_norm": 0.06548567861318588, "learning_rate": 1e-05, "loss": 2.2781, "step": 327 }, { "epoch": 0.5520723753418894, "grad_norm": 0.06299672275781631, "learning_rate": 1e-05, "loss": 2.377, "step": 328 }, { "epoch": 0.5537555228276878, "grad_norm": 0.06381850689649582, "learning_rate": 1e-05, "loss": 2.3945, "step": 329 }, { "epoch": 0.5554386703134863, "grad_norm": 0.06497140228748322, "learning_rate": 1e-05, "loss": 2.3496, "step": 330 }, { "epoch": 0.5571218177992847, "grad_norm": 0.06588133424520493, "learning_rate": 1e-05, "loss": 2.3955, "step": 331 }, { "epoch": 0.558804965285083, "grad_norm": 0.06468643248081207, "learning_rate": 1e-05, "loss": 2.2893, "step": 332 }, { "epoch": 0.5604881127708815, "grad_norm": 0.07278285920619965, "learning_rate": 1e-05, "loss": 2.3179, "step": 333 }, { "epoch": 0.56217126025668, "grad_norm": 0.06992325931787491, "learning_rate": 1e-05, "loss": 2.3588, "step": 334 }, { "epoch": 0.5638544077424784, "grad_norm": 0.06566626578569412, "learning_rate": 1e-05, "loss": 2.4763, "step": 335 }, { "epoch": 0.5655375552282769, "grad_norm": 0.0633927658200264, "learning_rate": 1e-05, "loss": 2.4685, "step": 336 }, { "epoch": 0.5672207027140753, "grad_norm": 0.06903122365474701, "learning_rate": 1e-05, "loss": 2.311, "step": 337 }, { "epoch": 0.5689038501998738, "grad_norm": 0.06421441584825516, "learning_rate": 1e-05, "loss": 2.3589, "step": 338 }, { "epoch": 0.5705869976856722, "grad_norm": 0.07122648507356644, "learning_rate": 1e-05, "loss": 2.3798, "step": 339 }, { "epoch": 0.5722701451714707, "grad_norm": 0.06518077105283737, "learning_rate": 1e-05, "loss": 2.4546, "step": 340 }, { "epoch": 0.5739532926572691, "grad_norm": 0.07509720325469971, "learning_rate": 1e-05, "loss": 2.3341, "step": 341 }, { "epoch": 0.5756364401430676, "grad_norm": 0.06559302657842636, "learning_rate": 1e-05, "loss": 2.3127, "step": 342 }, { "epoch": 0.5773195876288659, "grad_norm": 0.06652245670557022, "learning_rate": 1e-05, "loss": 2.3997, "step": 343 }, { "epoch": 0.5790027351146644, "grad_norm": 0.07472145557403564, "learning_rate": 1e-05, "loss": 2.3237, "step": 344 }, { "epoch": 0.5806858826004628, "grad_norm": 0.07624109089374542, "learning_rate": 1e-05, "loss": 2.186, "step": 345 }, { "epoch": 0.5823690300862613, "grad_norm": 0.06387084722518921, "learning_rate": 1e-05, "loss": 2.2717, "step": 346 }, { "epoch": 0.5840521775720597, "grad_norm": 0.06857839971780777, "learning_rate": 1e-05, "loss": 2.3726, "step": 347 }, { "epoch": 0.5857353250578582, "grad_norm": 0.06429892778396606, "learning_rate": 1e-05, "loss": 2.4109, "step": 348 }, { "epoch": 0.5874184725436566, "grad_norm": 0.0720372200012207, "learning_rate": 1e-05, "loss": 2.3291, "step": 349 }, { "epoch": 0.5891016200294551, "grad_norm": 0.0749678909778595, "learning_rate": 1e-05, "loss": 2.3369, "step": 350 }, { "epoch": 0.5907847675152536, "grad_norm": 0.0645705908536911, "learning_rate": 1e-05, "loss": 2.3894, "step": 351 }, { "epoch": 0.592467915001052, "grad_norm": 0.06680341064929962, "learning_rate": 1e-05, "loss": 2.3335, "step": 352 }, { "epoch": 0.5941510624868505, "grad_norm": 0.07383781671524048, "learning_rate": 1e-05, "loss": 2.2733, "step": 353 }, { "epoch": 0.5958342099726488, "grad_norm": 0.07338624447584152, "learning_rate": 1e-05, "loss": 2.2236, "step": 354 }, { "epoch": 0.5975173574584473, "grad_norm": 0.06998410820960999, "learning_rate": 1e-05, "loss": 2.2552, "step": 355 }, { "epoch": 0.5992005049442457, "grad_norm": 0.06697436422109604, "learning_rate": 1e-05, "loss": 2.4231, "step": 356 }, { "epoch": 0.6008836524300442, "grad_norm": 0.06693920493125916, "learning_rate": 1e-05, "loss": 2.3296, "step": 357 }, { "epoch": 0.6025667999158426, "grad_norm": 0.06306028366088867, "learning_rate": 1e-05, "loss": 2.4009, "step": 358 }, { "epoch": 0.6042499474016411, "grad_norm": 0.0724472776055336, "learning_rate": 1e-05, "loss": 2.2986, "step": 359 }, { "epoch": 0.6059330948874395, "grad_norm": 0.06711563467979431, "learning_rate": 1e-05, "loss": 2.3755, "step": 360 }, { "epoch": 0.607616242373238, "grad_norm": 0.07287666201591492, "learning_rate": 1e-05, "loss": 2.325, "step": 361 }, { "epoch": 0.6092993898590364, "grad_norm": 0.07494334876537323, "learning_rate": 1e-05, "loss": 2.2673, "step": 362 }, { "epoch": 0.6109825373448349, "grad_norm": 0.07399529218673706, "learning_rate": 1e-05, "loss": 2.3134, "step": 363 }, { "epoch": 0.6126656848306333, "grad_norm": 0.06705833226442337, "learning_rate": 1e-05, "loss": 2.3772, "step": 364 }, { "epoch": 0.6143488323164318, "grad_norm": 0.07528689503669739, "learning_rate": 1e-05, "loss": 2.3872, "step": 365 }, { "epoch": 0.6160319798022301, "grad_norm": 0.06814612448215485, "learning_rate": 1e-05, "loss": 2.2527, "step": 366 }, { "epoch": 0.6177151272880286, "grad_norm": 0.06929857283830643, "learning_rate": 1e-05, "loss": 2.4138, "step": 367 }, { "epoch": 0.619398274773827, "grad_norm": 0.07336314767599106, "learning_rate": 1e-05, "loss": 2.4197, "step": 368 }, { "epoch": 0.6210814222596255, "grad_norm": 0.07009201496839523, "learning_rate": 1e-05, "loss": 2.3943, "step": 369 }, { "epoch": 0.6227645697454239, "grad_norm": 0.07367721945047379, "learning_rate": 1e-05, "loss": 2.3044, "step": 370 }, { "epoch": 0.6244477172312224, "grad_norm": 0.07029354572296143, "learning_rate": 1e-05, "loss": 2.3018, "step": 371 }, { "epoch": 0.6261308647170208, "grad_norm": 0.07852700352668762, "learning_rate": 1e-05, "loss": 2.3727, "step": 372 }, { "epoch": 0.6278140122028193, "grad_norm": 0.0764508917927742, "learning_rate": 1e-05, "loss": 2.1992, "step": 373 }, { "epoch": 0.6294971596886177, "grad_norm": 0.0799420177936554, "learning_rate": 1e-05, "loss": 2.2693, "step": 374 }, { "epoch": 0.6311803071744162, "grad_norm": 0.06878554075956345, "learning_rate": 1e-05, "loss": 2.4749, "step": 375 }, { "epoch": 0.6328634546602147, "grad_norm": 0.07085944712162018, "learning_rate": 1e-05, "loss": 2.3435, "step": 376 }, { "epoch": 0.634546602146013, "grad_norm": 0.06489285826683044, "learning_rate": 1e-05, "loss": 2.3257, "step": 377 }, { "epoch": 0.6362297496318114, "grad_norm": 0.06664973497390747, "learning_rate": 1e-05, "loss": 2.5022, "step": 378 }, { "epoch": 0.6379128971176099, "grad_norm": 0.07660377770662308, "learning_rate": 1e-05, "loss": 2.3269, "step": 379 }, { "epoch": 0.6395960446034084, "grad_norm": 0.06934674084186554, "learning_rate": 1e-05, "loss": 2.4021, "step": 380 }, { "epoch": 0.6412791920892068, "grad_norm": 0.07515530288219452, "learning_rate": 1e-05, "loss": 2.3157, "step": 381 }, { "epoch": 0.6429623395750053, "grad_norm": 0.07302498072385788, "learning_rate": 1e-05, "loss": 2.3892, "step": 382 }, { "epoch": 0.6446454870608037, "grad_norm": 0.07303425669670105, "learning_rate": 1e-05, "loss": 2.3765, "step": 383 }, { "epoch": 0.6463286345466022, "grad_norm": 0.07705460488796234, "learning_rate": 1e-05, "loss": 2.2684, "step": 384 }, { "epoch": 0.6480117820324006, "grad_norm": 0.07487067580223083, "learning_rate": 1e-05, "loss": 2.3733, "step": 385 }, { "epoch": 0.6496949295181991, "grad_norm": 0.06538619101047516, "learning_rate": 1e-05, "loss": 2.3789, "step": 386 }, { "epoch": 0.6513780770039975, "grad_norm": 0.07406684756278992, "learning_rate": 1e-05, "loss": 2.332, "step": 387 }, { "epoch": 0.6530612244897959, "grad_norm": 0.07246539741754532, "learning_rate": 1e-05, "loss": 2.2302, "step": 388 }, { "epoch": 0.6547443719755943, "grad_norm": 0.07304323464632034, "learning_rate": 1e-05, "loss": 2.3708, "step": 389 }, { "epoch": 0.6564275194613928, "grad_norm": 0.07457181811332703, "learning_rate": 1e-05, "loss": 2.2991, "step": 390 }, { "epoch": 0.6581106669471912, "grad_norm": 0.07300930470228195, "learning_rate": 1e-05, "loss": 2.2423, "step": 391 }, { "epoch": 0.6597938144329897, "grad_norm": 0.07508236914873123, "learning_rate": 1e-05, "loss": 2.2642, "step": 392 }, { "epoch": 0.6614769619187881, "grad_norm": 0.07481173425912857, "learning_rate": 1e-05, "loss": 2.3, "step": 393 }, { "epoch": 0.6631601094045866, "grad_norm": 0.06851742416620255, "learning_rate": 1e-05, "loss": 2.4534, "step": 394 }, { "epoch": 0.664843256890385, "grad_norm": 0.07536716759204865, "learning_rate": 1e-05, "loss": 2.3264, "step": 395 }, { "epoch": 0.6665264043761835, "grad_norm": 0.07752048969268799, "learning_rate": 1e-05, "loss": 2.4158, "step": 396 }, { "epoch": 0.6682095518619819, "grad_norm": 0.06357281655073166, "learning_rate": 1e-05, "loss": 2.4956, "step": 397 }, { "epoch": 0.6698926993477804, "grad_norm": 0.08333004266023636, "learning_rate": 1e-05, "loss": 2.3921, "step": 398 }, { "epoch": 0.6715758468335787, "grad_norm": 0.06873282790184021, "learning_rate": 1e-05, "loss": 2.3611, "step": 399 }, { "epoch": 0.6732589943193772, "grad_norm": 0.07533644139766693, "learning_rate": 1e-05, "loss": 2.3708, "step": 400 }, { "epoch": 0.6749421418051756, "grad_norm": 0.07756076753139496, "learning_rate": 1e-05, "loss": 2.3003, "step": 401 }, { "epoch": 0.6766252892909741, "grad_norm": 0.06644177436828613, "learning_rate": 1e-05, "loss": 2.4331, "step": 402 }, { "epoch": 0.6783084367767725, "grad_norm": 0.07512148469686508, "learning_rate": 1e-05, "loss": 2.2881, "step": 403 }, { "epoch": 0.679991584262571, "grad_norm": 0.08939874172210693, "learning_rate": 1e-05, "loss": 2.1564, "step": 404 }, { "epoch": 0.6816747317483695, "grad_norm": 0.07984601706266403, "learning_rate": 1e-05, "loss": 2.3967, "step": 405 }, { "epoch": 0.6833578792341679, "grad_norm": 0.0724392980337143, "learning_rate": 1e-05, "loss": 2.2859, "step": 406 }, { "epoch": 0.6850410267199664, "grad_norm": 0.07025589793920517, "learning_rate": 1e-05, "loss": 2.3027, "step": 407 }, { "epoch": 0.6867241742057648, "grad_norm": 0.07863828539848328, "learning_rate": 1e-05, "loss": 2.3286, "step": 408 }, { "epoch": 0.6884073216915633, "grad_norm": 0.07466793060302734, "learning_rate": 1e-05, "loss": 2.2849, "step": 409 }, { "epoch": 0.6900904691773617, "grad_norm": 0.07291209697723389, "learning_rate": 1e-05, "loss": 2.3931, "step": 410 }, { "epoch": 0.6917736166631601, "grad_norm": 0.072298564016819, "learning_rate": 1e-05, "loss": 2.377, "step": 411 }, { "epoch": 0.6934567641489585, "grad_norm": 0.06996294856071472, "learning_rate": 1e-05, "loss": 2.3503, "step": 412 }, { "epoch": 0.695139911634757, "grad_norm": 0.07319701462984085, "learning_rate": 1e-05, "loss": 2.345, "step": 413 }, { "epoch": 0.6968230591205554, "grad_norm": 0.0768033117055893, "learning_rate": 1e-05, "loss": 2.3679, "step": 414 }, { "epoch": 0.6985062066063539, "grad_norm": 0.07401002943515778, "learning_rate": 1e-05, "loss": 2.3435, "step": 415 }, { "epoch": 0.7001893540921523, "grad_norm": 0.07700485736131668, "learning_rate": 1e-05, "loss": 2.3428, "step": 416 }, { "epoch": 0.7018725015779508, "grad_norm": 0.07446201890707016, "learning_rate": 1e-05, "loss": 2.4133, "step": 417 }, { "epoch": 0.7035556490637492, "grad_norm": 0.06801878660917282, "learning_rate": 1e-05, "loss": 2.3665, "step": 418 }, { "epoch": 0.7052387965495477, "grad_norm": 0.07989214360713959, "learning_rate": 1e-05, "loss": 2.3303, "step": 419 }, { "epoch": 0.7069219440353461, "grad_norm": 0.07385462522506714, "learning_rate": 1e-05, "loss": 2.3608, "step": 420 }, { "epoch": 0.7086050915211446, "grad_norm": 0.06808451563119888, "learning_rate": 1e-05, "loss": 2.4851, "step": 421 }, { "epoch": 0.7102882390069429, "grad_norm": 0.07354162633419037, "learning_rate": 1e-05, "loss": 2.3005, "step": 422 }, { "epoch": 0.7119713864927414, "grad_norm": 0.07730504870414734, "learning_rate": 1e-05, "loss": 2.2815, "step": 423 }, { "epoch": 0.7136545339785398, "grad_norm": 0.08045239001512527, "learning_rate": 1e-05, "loss": 2.2695, "step": 424 }, { "epoch": 0.7153376814643383, "grad_norm": 0.07997512817382812, "learning_rate": 1e-05, "loss": 2.3608, "step": 425 }, { "epoch": 0.7170208289501367, "grad_norm": 0.07076172530651093, "learning_rate": 1e-05, "loss": 2.3411, "step": 426 }, { "epoch": 0.7187039764359352, "grad_norm": 0.07223929464817047, "learning_rate": 1e-05, "loss": 2.3452, "step": 427 }, { "epoch": 0.7203871239217337, "grad_norm": 0.07667456567287445, "learning_rate": 1e-05, "loss": 2.333, "step": 428 }, { "epoch": 0.7220702714075321, "grad_norm": 0.07509643584489822, "learning_rate": 1e-05, "loss": 2.3701, "step": 429 }, { "epoch": 0.7237534188933306, "grad_norm": 0.08230644464492798, "learning_rate": 1e-05, "loss": 2.3577, "step": 430 }, { "epoch": 0.725436566379129, "grad_norm": 0.06938886642456055, "learning_rate": 1e-05, "loss": 2.4573, "step": 431 }, { "epoch": 0.7271197138649275, "grad_norm": 0.07415178418159485, "learning_rate": 1e-05, "loss": 2.2834, "step": 432 }, { "epoch": 0.7288028613507258, "grad_norm": 0.0821278989315033, "learning_rate": 1e-05, "loss": 2.2744, "step": 433 }, { "epoch": 0.7304860088365243, "grad_norm": 0.07293502986431122, "learning_rate": 1e-05, "loss": 2.313, "step": 434 }, { "epoch": 0.7321691563223227, "grad_norm": 0.07829819619655609, "learning_rate": 1e-05, "loss": 2.3849, "step": 435 }, { "epoch": 0.7338523038081212, "grad_norm": 0.07795297354459763, "learning_rate": 1e-05, "loss": 2.2466, "step": 436 }, { "epoch": 0.7355354512939196, "grad_norm": 0.06956803798675537, "learning_rate": 1e-05, "loss": 2.4038, "step": 437 }, { "epoch": 0.7372185987797181, "grad_norm": 0.07948347926139832, "learning_rate": 1e-05, "loss": 2.3042, "step": 438 }, { "epoch": 0.7389017462655165, "grad_norm": 0.08074218034744263, "learning_rate": 1e-05, "loss": 2.3314, "step": 439 }, { "epoch": 0.740584893751315, "grad_norm": 0.08029188960790634, "learning_rate": 1e-05, "loss": 2.312, "step": 440 }, { "epoch": 0.7422680412371134, "grad_norm": 0.0783049538731575, "learning_rate": 1e-05, "loss": 2.307, "step": 441 }, { "epoch": 0.7439511887229119, "grad_norm": 0.08203115314245224, "learning_rate": 1e-05, "loss": 2.3081, "step": 442 }, { "epoch": 0.7456343362087103, "grad_norm": 0.08666986972093582, "learning_rate": 1e-05, "loss": 2.3721, "step": 443 }, { "epoch": 0.7473174836945087, "grad_norm": 0.08097022771835327, "learning_rate": 1e-05, "loss": 2.1912, "step": 444 }, { "epoch": 0.7490006311803071, "grad_norm": 0.08272138237953186, "learning_rate": 1e-05, "loss": 2.3562, "step": 445 }, { "epoch": 0.7506837786661056, "grad_norm": 0.08114828914403915, "learning_rate": 1e-05, "loss": 2.3569, "step": 446 }, { "epoch": 0.752366926151904, "grad_norm": 0.07786712795495987, "learning_rate": 1e-05, "loss": 2.3772, "step": 447 }, { "epoch": 0.7540500736377025, "grad_norm": 0.07603191584348679, "learning_rate": 1e-05, "loss": 2.2748, "step": 448 }, { "epoch": 0.7557332211235009, "grad_norm": 0.08364319056272507, "learning_rate": 1e-05, "loss": 2.334, "step": 449 }, { "epoch": 0.7574163686092994, "grad_norm": 0.07968125492334366, "learning_rate": 1e-05, "loss": 2.3225, "step": 450 }, { "epoch": 0.7590995160950978, "grad_norm": 0.08204993605613708, "learning_rate": 1e-05, "loss": 2.3107, "step": 451 }, { "epoch": 0.7607826635808963, "grad_norm": 0.08319111168384552, "learning_rate": 1e-05, "loss": 2.3994, "step": 452 }, { "epoch": 0.7624658110666948, "grad_norm": 0.07812530547380447, "learning_rate": 1e-05, "loss": 2.2771, "step": 453 }, { "epoch": 0.7641489585524932, "grad_norm": 0.07962696999311447, "learning_rate": 1e-05, "loss": 2.3094, "step": 454 }, { "epoch": 0.7658321060382917, "grad_norm": 0.0815802663564682, "learning_rate": 1e-05, "loss": 2.3169, "step": 455 }, { "epoch": 0.76751525352409, "grad_norm": 0.08460783958435059, "learning_rate": 1e-05, "loss": 2.2443, "step": 456 }, { "epoch": 0.7691984010098885, "grad_norm": 0.07976390421390533, "learning_rate": 1e-05, "loss": 2.26, "step": 457 }, { "epoch": 0.7708815484956869, "grad_norm": 0.08143635839223862, "learning_rate": 1e-05, "loss": 2.2517, "step": 458 }, { "epoch": 0.7725646959814854, "grad_norm": 0.08004558831453323, "learning_rate": 1e-05, "loss": 2.3276, "step": 459 }, { "epoch": 0.7742478434672838, "grad_norm": 0.0831751599907875, "learning_rate": 1e-05, "loss": 2.2842, "step": 460 }, { "epoch": 0.7759309909530823, "grad_norm": 0.07613930851221085, "learning_rate": 1e-05, "loss": 2.3958, "step": 461 }, { "epoch": 0.7776141384388807, "grad_norm": 0.08161590993404388, "learning_rate": 1e-05, "loss": 2.3287, "step": 462 }, { "epoch": 0.7792972859246792, "grad_norm": 0.08616164326667786, "learning_rate": 1e-05, "loss": 2.3098, "step": 463 }, { "epoch": 0.7809804334104776, "grad_norm": 0.08720822632312775, "learning_rate": 1e-05, "loss": 2.1388, "step": 464 }, { "epoch": 0.7826635808962761, "grad_norm": 0.08598899841308594, "learning_rate": 1e-05, "loss": 2.3005, "step": 465 }, { "epoch": 0.7843467283820745, "grad_norm": 0.07982167601585388, "learning_rate": 1e-05, "loss": 2.3049, "step": 466 }, { "epoch": 0.7860298758678729, "grad_norm": 0.08733374625444412, "learning_rate": 1e-05, "loss": 2.2747, "step": 467 }, { "epoch": 0.7877130233536713, "grad_norm": 0.08848235011100769, "learning_rate": 1e-05, "loss": 2.4331, "step": 468 }, { "epoch": 0.7893961708394698, "grad_norm": 0.08619164675474167, "learning_rate": 1e-05, "loss": 2.2881, "step": 469 }, { "epoch": 0.7910793183252682, "grad_norm": 0.08046075701713562, "learning_rate": 1e-05, "loss": 2.397, "step": 470 }, { "epoch": 0.7927624658110667, "grad_norm": 0.08469874411821365, "learning_rate": 1e-05, "loss": 2.3225, "step": 471 }, { "epoch": 0.7944456132968651, "grad_norm": 0.08878640830516815, "learning_rate": 1e-05, "loss": 2.2832, "step": 472 }, { "epoch": 0.7961287607826636, "grad_norm": 0.08530005067586899, "learning_rate": 1e-05, "loss": 2.28, "step": 473 }, { "epoch": 0.797811908268462, "grad_norm": 0.08089161664247513, "learning_rate": 1e-05, "loss": 2.2822, "step": 474 }, { "epoch": 0.7994950557542605, "grad_norm": 0.0770372822880745, "learning_rate": 1e-05, "loss": 2.4031, "step": 475 }, { "epoch": 0.801178203240059, "grad_norm": 0.08313820511102676, "learning_rate": 1e-05, "loss": 2.4009, "step": 476 }, { "epoch": 0.8028613507258574, "grad_norm": 0.08684401214122772, "learning_rate": 1e-05, "loss": 2.4563, "step": 477 }, { "epoch": 0.8045444982116557, "grad_norm": 0.08352997899055481, "learning_rate": 1e-05, "loss": 2.3242, "step": 478 }, { "epoch": 0.8062276456974542, "grad_norm": 0.08148252218961716, "learning_rate": 1e-05, "loss": 2.3096, "step": 479 }, { "epoch": 0.8079107931832527, "grad_norm": 0.08157838881015778, "learning_rate": 1e-05, "loss": 2.3108, "step": 480 }, { "epoch": 0.8095939406690511, "grad_norm": 0.08561182022094727, "learning_rate": 1e-05, "loss": 2.2327, "step": 481 }, { "epoch": 0.8112770881548496, "grad_norm": 0.09177689999341965, "learning_rate": 1e-05, "loss": 2.2129, "step": 482 }, { "epoch": 0.812960235640648, "grad_norm": 0.08262176811695099, "learning_rate": 1e-05, "loss": 2.397, "step": 483 }, { "epoch": 0.8146433831264465, "grad_norm": 0.08541447669267654, "learning_rate": 1e-05, "loss": 2.2419, "step": 484 }, { "epoch": 0.8163265306122449, "grad_norm": 0.08732729405164719, "learning_rate": 1e-05, "loss": 2.3328, "step": 485 }, { "epoch": 0.8180096780980434, "grad_norm": 0.08658833056688309, "learning_rate": 1e-05, "loss": 2.2793, "step": 486 }, { "epoch": 0.8196928255838418, "grad_norm": 0.0789208933711052, "learning_rate": 1e-05, "loss": 2.4072, "step": 487 }, { "epoch": 0.8213759730696403, "grad_norm": 0.07870952039957047, "learning_rate": 1e-05, "loss": 2.4082, "step": 488 }, { "epoch": 0.8230591205554386, "grad_norm": 0.07583601027727127, "learning_rate": 1e-05, "loss": 2.3833, "step": 489 }, { "epoch": 0.8247422680412371, "grad_norm": 0.08982661366462708, "learning_rate": 1e-05, "loss": 2.2766, "step": 490 }, { "epoch": 0.8264254155270355, "grad_norm": 0.08841705322265625, "learning_rate": 1e-05, "loss": 2.2581, "step": 491 }, { "epoch": 0.828108563012834, "grad_norm": 0.08784886449575424, "learning_rate": 1e-05, "loss": 2.2352, "step": 492 }, { "epoch": 0.8297917104986324, "grad_norm": 0.08765432238578796, "learning_rate": 1e-05, "loss": 2.1957, "step": 493 }, { "epoch": 0.8314748579844309, "grad_norm": 0.09070983529090881, "learning_rate": 1e-05, "loss": 2.2451, "step": 494 }, { "epoch": 0.8331580054702293, "grad_norm": 0.08307146281003952, "learning_rate": 1e-05, "loss": 2.3645, "step": 495 }, { "epoch": 0.8348411529560278, "grad_norm": 0.07774417847394943, "learning_rate": 1e-05, "loss": 2.3921, "step": 496 }, { "epoch": 0.8365243004418262, "grad_norm": 0.08441779762506485, "learning_rate": 1e-05, "loss": 2.2974, "step": 497 }, { "epoch": 0.8382074479276247, "grad_norm": 0.08773106336593628, "learning_rate": 1e-05, "loss": 2.3984, "step": 498 }, { "epoch": 0.8398905954134231, "grad_norm": 0.08157604187726974, "learning_rate": 1e-05, "loss": 2.2946, "step": 499 }, { "epoch": 0.8415737428992216, "grad_norm": 0.09280236810445786, "learning_rate": 1e-05, "loss": 2.3628, "step": 500 }, { "epoch": 0.8432568903850199, "grad_norm": 0.08737549185752869, "learning_rate": 1e-05, "loss": 2.2593, "step": 501 }, { "epoch": 0.8449400378708184, "grad_norm": 0.08917705714702606, "learning_rate": 1e-05, "loss": 2.2435, "step": 502 }, { "epoch": 0.8466231853566168, "grad_norm": 0.08589258790016174, "learning_rate": 1e-05, "loss": 2.2869, "step": 503 }, { "epoch": 0.8483063328424153, "grad_norm": 0.08363740891218185, "learning_rate": 1e-05, "loss": 2.1512, "step": 504 }, { "epoch": 0.8499894803282138, "grad_norm": 0.09710842370986938, "learning_rate": 1e-05, "loss": 2.3042, "step": 505 }, { "epoch": 0.8516726278140122, "grad_norm": 0.09031599014997482, "learning_rate": 1e-05, "loss": 2.2406, "step": 506 }, { "epoch": 0.8533557752998107, "grad_norm": 0.08941849321126938, "learning_rate": 1e-05, "loss": 2.2725, "step": 507 }, { "epoch": 0.8550389227856091, "grad_norm": 0.08926845341920853, "learning_rate": 1e-05, "loss": 2.323, "step": 508 }, { "epoch": 0.8567220702714076, "grad_norm": 0.08846578001976013, "learning_rate": 1e-05, "loss": 2.3394, "step": 509 }, { "epoch": 0.858405217757206, "grad_norm": 0.08452317863702774, "learning_rate": 1e-05, "loss": 2.4158, "step": 510 }, { "epoch": 0.8600883652430045, "grad_norm": 0.08531490713357925, "learning_rate": 1e-05, "loss": 2.3113, "step": 511 }, { "epoch": 0.8617715127288028, "grad_norm": 0.08221501857042313, "learning_rate": 1e-05, "loss": 2.3826, "step": 512 }, { "epoch": 0.8634546602146013, "grad_norm": 0.08809410035610199, "learning_rate": 1e-05, "loss": 2.2666, "step": 513 }, { "epoch": 0.8651378077003997, "grad_norm": 0.0881451964378357, "learning_rate": 1e-05, "loss": 2.4678, "step": 514 }, { "epoch": 0.8668209551861982, "grad_norm": 0.0958879366517067, "learning_rate": 1e-05, "loss": 2.17, "step": 515 }, { "epoch": 0.8685041026719966, "grad_norm": 0.08498766273260117, "learning_rate": 1e-05, "loss": 2.4021, "step": 516 }, { "epoch": 0.8701872501577951, "grad_norm": 0.09182509779930115, "learning_rate": 1e-05, "loss": 2.2476, "step": 517 }, { "epoch": 0.8718703976435935, "grad_norm": 0.08831535279750824, "learning_rate": 1e-05, "loss": 2.3013, "step": 518 }, { "epoch": 0.873553545129392, "grad_norm": 0.08792266249656677, "learning_rate": 1e-05, "loss": 2.2463, "step": 519 }, { "epoch": 0.8752366926151904, "grad_norm": 0.0804978460073471, "learning_rate": 1e-05, "loss": 2.5151, "step": 520 }, { "epoch": 0.8769198401009889, "grad_norm": 0.09397967159748077, "learning_rate": 1e-05, "loss": 2.2487, "step": 521 }, { "epoch": 0.8786029875867873, "grad_norm": 0.08882005512714386, "learning_rate": 1e-05, "loss": 2.225, "step": 522 }, { "epoch": 0.8802861350725857, "grad_norm": 0.08365931361913681, "learning_rate": 1e-05, "loss": 2.4277, "step": 523 }, { "epoch": 0.8819692825583841, "grad_norm": 0.08842651546001434, "learning_rate": 1e-05, "loss": 2.3884, "step": 524 }, { "epoch": 0.8836524300441826, "grad_norm": 0.08760154247283936, "learning_rate": 1e-05, "loss": 2.2576, "step": 525 }, { "epoch": 0.885335577529981, "grad_norm": 0.07843348383903503, "learning_rate": 1e-05, "loss": 2.4143, "step": 526 }, { "epoch": 0.8870187250157795, "grad_norm": 0.09312726557254791, "learning_rate": 1e-05, "loss": 2.2472, "step": 527 }, { "epoch": 0.888701872501578, "grad_norm": 0.09460542351007462, "learning_rate": 1e-05, "loss": 2.2043, "step": 528 }, { "epoch": 0.8903850199873764, "grad_norm": 0.09200920909643173, "learning_rate": 1e-05, "loss": 2.3562, "step": 529 }, { "epoch": 0.8920681674731749, "grad_norm": 0.08051000535488129, "learning_rate": 1e-05, "loss": 2.4146, "step": 530 }, { "epoch": 0.8937513149589733, "grad_norm": 0.09969057142734528, "learning_rate": 1e-05, "loss": 2.3342, "step": 531 }, { "epoch": 0.8954344624447718, "grad_norm": 0.08616895228624344, "learning_rate": 1e-05, "loss": 2.3381, "step": 532 }, { "epoch": 0.8971176099305702, "grad_norm": 0.09115055203437805, "learning_rate": 1e-05, "loss": 2.2377, "step": 533 }, { "epoch": 0.8988007574163686, "grad_norm": 0.10309138149023056, "learning_rate": 1e-05, "loss": 2.1418, "step": 534 }, { "epoch": 0.900483904902167, "grad_norm": 0.09327155351638794, "learning_rate": 1e-05, "loss": 2.312, "step": 535 }, { "epoch": 0.9021670523879655, "grad_norm": 0.09104789048433304, "learning_rate": 1e-05, "loss": 2.2759, "step": 536 }, { "epoch": 0.9038501998737639, "grad_norm": 0.08858876675367355, "learning_rate": 1e-05, "loss": 2.4138, "step": 537 }, { "epoch": 0.9055333473595624, "grad_norm": 0.08850864320993423, "learning_rate": 1e-05, "loss": 2.3915, "step": 538 }, { "epoch": 0.9072164948453608, "grad_norm": 0.09071122854948044, "learning_rate": 1e-05, "loss": 2.4199, "step": 539 }, { "epoch": 0.9088996423311593, "grad_norm": 0.08702193200588226, "learning_rate": 1e-05, "loss": 2.3079, "step": 540 }, { "epoch": 0.9105827898169577, "grad_norm": 0.09564194083213806, "learning_rate": 1e-05, "loss": 2.2996, "step": 541 }, { "epoch": 0.9122659373027562, "grad_norm": 0.08906988054513931, "learning_rate": 1e-05, "loss": 2.3958, "step": 542 }, { "epoch": 0.9139490847885546, "grad_norm": 0.08117242157459259, "learning_rate": 1e-05, "loss": 2.5557, "step": 543 }, { "epoch": 0.9156322322743531, "grad_norm": 0.09870729595422745, "learning_rate": 1e-05, "loss": 2.3542, "step": 544 }, { "epoch": 0.9173153797601514, "grad_norm": 0.0906287208199501, "learning_rate": 1e-05, "loss": 2.2866, "step": 545 }, { "epoch": 0.9189985272459499, "grad_norm": 0.08649491518735886, "learning_rate": 1e-05, "loss": 2.3547, "step": 546 }, { "epoch": 0.9206816747317483, "grad_norm": 0.09572413563728333, "learning_rate": 1e-05, "loss": 2.377, "step": 547 }, { "epoch": 0.9223648222175468, "grad_norm": 0.08862059563398361, "learning_rate": 1e-05, "loss": 2.3452, "step": 548 }, { "epoch": 0.9240479697033452, "grad_norm": 0.09061957150697708, "learning_rate": 1e-05, "loss": 2.264, "step": 549 }, { "epoch": 0.9257311171891437, "grad_norm": 0.10327678918838501, "learning_rate": 1e-05, "loss": 2.3362, "step": 550 }, { "epoch": 0.9274142646749421, "grad_norm": 0.10101998597383499, "learning_rate": 1e-05, "loss": 2.2091, "step": 551 }, { "epoch": 0.9290974121607406, "grad_norm": 0.08099676668643951, "learning_rate": 1e-05, "loss": 2.3779, "step": 552 }, { "epoch": 0.930780559646539, "grad_norm": 0.09572342783212662, "learning_rate": 1e-05, "loss": 2.2186, "step": 553 }, { "epoch": 0.9324637071323375, "grad_norm": 0.10440348833799362, "learning_rate": 1e-05, "loss": 2.2717, "step": 554 }, { "epoch": 0.934146854618136, "grad_norm": 0.09859239310026169, "learning_rate": 1e-05, "loss": 2.2964, "step": 555 }, { "epoch": 0.9358300021039344, "grad_norm": 0.08539914339780807, "learning_rate": 1e-05, "loss": 2.3541, "step": 556 }, { "epoch": 0.9375131495897328, "grad_norm": 0.09667155891656876, "learning_rate": 1e-05, "loss": 2.2412, "step": 557 }, { "epoch": 0.9391962970755312, "grad_norm": 0.09381328523159027, "learning_rate": 1e-05, "loss": 2.1632, "step": 558 }, { "epoch": 0.9408794445613297, "grad_norm": 0.10293637216091156, "learning_rate": 1e-05, "loss": 2.2969, "step": 559 }, { "epoch": 0.9425625920471281, "grad_norm": 0.08901844918727875, "learning_rate": 1e-05, "loss": 2.2806, "step": 560 }, { "epoch": 0.9442457395329266, "grad_norm": 0.09931071847677231, "learning_rate": 1e-05, "loss": 2.2671, "step": 561 }, { "epoch": 0.945928887018725, "grad_norm": 0.08619210124015808, "learning_rate": 1e-05, "loss": 2.428, "step": 562 }, { "epoch": 0.9476120345045235, "grad_norm": 0.08460855484008789, "learning_rate": 1e-05, "loss": 2.2412, "step": 563 }, { "epoch": 0.9492951819903219, "grad_norm": 0.09682973474264145, "learning_rate": 1e-05, "loss": 2.3339, "step": 564 }, { "epoch": 0.9509783294761204, "grad_norm": 0.10189709812402725, "learning_rate": 1e-05, "loss": 2.2268, "step": 565 }, { "epoch": 0.9526614769619188, "grad_norm": 0.10271991789340973, "learning_rate": 1e-05, "loss": 2.1819, "step": 566 }, { "epoch": 0.9543446244477173, "grad_norm": 0.0901963859796524, "learning_rate": 1e-05, "loss": 2.3029, "step": 567 }, { "epoch": 0.9560277719335156, "grad_norm": 0.09148905426263809, "learning_rate": 1e-05, "loss": 2.3362, "step": 568 }, { "epoch": 0.9577109194193141, "grad_norm": 0.10434332489967346, "learning_rate": 1e-05, "loss": 2.3037, "step": 569 }, { "epoch": 0.9593940669051125, "grad_norm": 0.0956675261259079, "learning_rate": 1e-05, "loss": 2.3442, "step": 570 }, { "epoch": 0.961077214390911, "grad_norm": 0.09394146502017975, "learning_rate": 1e-05, "loss": 2.2913, "step": 571 }, { "epoch": 0.9627603618767094, "grad_norm": 0.09179794043302536, "learning_rate": 1e-05, "loss": 2.21, "step": 572 }, { "epoch": 0.9644435093625079, "grad_norm": 0.09866604208946228, "learning_rate": 1e-05, "loss": 2.2721, "step": 573 }, { "epoch": 0.9661266568483063, "grad_norm": 0.10069537162780762, "learning_rate": 1e-05, "loss": 2.1637, "step": 574 }, { "epoch": 0.9678098043341048, "grad_norm": 0.0923682376742363, "learning_rate": 1e-05, "loss": 2.2343, "step": 575 }, { "epoch": 0.9694929518199032, "grad_norm": 0.08836492151021957, "learning_rate": 1e-05, "loss": 2.3794, "step": 576 }, { "epoch": 0.9711760993057017, "grad_norm": 0.0894513726234436, "learning_rate": 1e-05, "loss": 2.2378, "step": 577 }, { "epoch": 0.9728592467915002, "grad_norm": 0.08647426962852478, "learning_rate": 1e-05, "loss": 2.3589, "step": 578 }, { "epoch": 0.9745423942772985, "grad_norm": 0.11035202443599701, "learning_rate": 1e-05, "loss": 2.2371, "step": 579 }, { "epoch": 0.976225541763097, "grad_norm": 0.09551876783370972, "learning_rate": 1e-05, "loss": 2.3353, "step": 580 }, { "epoch": 0.9779086892488954, "grad_norm": 0.0911082923412323, "learning_rate": 1e-05, "loss": 2.3264, "step": 581 }, { "epoch": 0.9795918367346939, "grad_norm": 0.10280529409646988, "learning_rate": 1e-05, "loss": 2.2351, "step": 582 }, { "epoch": 0.9812749842204923, "grad_norm": 0.09424940496683121, "learning_rate": 1e-05, "loss": 2.3464, "step": 583 }, { "epoch": 0.9829581317062908, "grad_norm": 0.092115618288517, "learning_rate": 1e-05, "loss": 2.2799, "step": 584 }, { "epoch": 0.9846412791920892, "grad_norm": 0.09771659225225449, "learning_rate": 1e-05, "loss": 2.3777, "step": 585 }, { "epoch": 0.9863244266778877, "grad_norm": 0.09877105802297592, "learning_rate": 1e-05, "loss": 2.3613, "step": 586 }, { "epoch": 0.9880075741636861, "grad_norm": 0.09816967695951462, "learning_rate": 1e-05, "loss": 2.2925, "step": 587 }, { "epoch": 0.9896907216494846, "grad_norm": 0.0874725803732872, "learning_rate": 1e-05, "loss": 2.3154, "step": 588 }, { "epoch": 0.991373869135283, "grad_norm": 0.09336823225021362, "learning_rate": 1e-05, "loss": 2.3933, "step": 589 }, { "epoch": 0.9930570166210814, "grad_norm": 0.10439187288284302, "learning_rate": 1e-05, "loss": 2.3655, "step": 590 }, { "epoch": 0.9947401641068798, "grad_norm": 0.09005751460790634, "learning_rate": 1e-05, "loss": 2.2971, "step": 591 }, { "epoch": 0.9964233115926783, "grad_norm": 0.10612068325281143, "learning_rate": 1e-05, "loss": 2.3584, "step": 592 }, { "epoch": 0.9981064590784767, "grad_norm": 0.09101177752017975, "learning_rate": 1e-05, "loss": 2.4402, "step": 593 }, { "epoch": 0.9997896065642752, "grad_norm": 0.09874800592660904, "learning_rate": 1e-05, "loss": 2.326, "step": 594 }, { "epoch": 1.0014727540500736, "grad_norm": 0.1025647521018982, "learning_rate": 1e-05, "loss": 2.4041, "step": 595 }, { "epoch": 1.003155901535872, "grad_norm": 0.11109832674264908, "learning_rate": 1e-05, "loss": 2.2881, "step": 596 }, { "epoch": 1.0048390490216705, "grad_norm": 0.09670565277338028, "learning_rate": 1e-05, "loss": 2.2003, "step": 597 }, { "epoch": 1.0065221965074689, "grad_norm": 0.09513822942972183, "learning_rate": 1e-05, "loss": 2.3225, "step": 598 }, { "epoch": 1.0082053439932674, "grad_norm": 0.11121483892202377, "learning_rate": 1e-05, "loss": 2.4143, "step": 599 }, { "epoch": 1.0098884914790658, "grad_norm": 0.09941378980875015, "learning_rate": 1e-05, "loss": 2.333, "step": 600 }, { "epoch": 1.0115716389648644, "grad_norm": 0.09730757772922516, "learning_rate": 1e-05, "loss": 2.3638, "step": 601 }, { "epoch": 1.0132547864506627, "grad_norm": 0.10626422613859177, "learning_rate": 1e-05, "loss": 2.2303, "step": 602 }, { "epoch": 1.0149379339364613, "grad_norm": 0.0958971306681633, "learning_rate": 1e-05, "loss": 2.3906, "step": 603 }, { "epoch": 1.0166210814222596, "grad_norm": 0.10065159201622009, "learning_rate": 1e-05, "loss": 2.3425, "step": 604 }, { "epoch": 1.0183042289080582, "grad_norm": 0.08671624213457108, "learning_rate": 1e-05, "loss": 2.2742, "step": 605 }, { "epoch": 1.0199873763938565, "grad_norm": 0.09528376907110214, "learning_rate": 1e-05, "loss": 2.3765, "step": 606 }, { "epoch": 1.0216705238796548, "grad_norm": 0.09153752028942108, "learning_rate": 1e-05, "loss": 2.2983, "step": 607 }, { "epoch": 1.0233536713654534, "grad_norm": 0.10145740956068039, "learning_rate": 1e-05, "loss": 2.1774, "step": 608 }, { "epoch": 1.0250368188512518, "grad_norm": 0.09908965229988098, "learning_rate": 1e-05, "loss": 2.3479, "step": 609 }, { "epoch": 1.0267199663370503, "grad_norm": 0.09253786504268646, "learning_rate": 1e-05, "loss": 2.3228, "step": 610 }, { "epoch": 1.0284031138228487, "grad_norm": 0.094690702855587, "learning_rate": 1e-05, "loss": 2.2864, "step": 611 }, { "epoch": 1.0300862613086472, "grad_norm": 0.09160283952951431, "learning_rate": 1e-05, "loss": 2.4285, "step": 612 }, { "epoch": 1.0317694087944456, "grad_norm": 0.10157333314418793, "learning_rate": 1e-05, "loss": 2.1316, "step": 613 }, { "epoch": 1.0334525562802441, "grad_norm": 0.10498999804258347, "learning_rate": 1e-05, "loss": 2.373, "step": 614 }, { "epoch": 1.0351357037660425, "grad_norm": 0.09599211066961288, "learning_rate": 1e-05, "loss": 2.3511, "step": 615 }, { "epoch": 1.036818851251841, "grad_norm": 0.1121436059474945, "learning_rate": 1e-05, "loss": 2.127, "step": 616 }, { "epoch": 1.0385019987376394, "grad_norm": 0.10269173234701157, "learning_rate": 1e-05, "loss": 2.2659, "step": 617 }, { "epoch": 1.040185146223438, "grad_norm": 0.0945139229297638, "learning_rate": 1e-05, "loss": 2.3281, "step": 618 }, { "epoch": 1.0418682937092363, "grad_norm": 0.09318878501653671, "learning_rate": 1e-05, "loss": 2.3247, "step": 619 }, { "epoch": 1.0435514411950346, "grad_norm": 0.10471779108047485, "learning_rate": 1e-05, "loss": 2.3098, "step": 620 }, { "epoch": 1.0452345886808332, "grad_norm": 0.10514305531978607, "learning_rate": 1e-05, "loss": 2.3647, "step": 621 }, { "epoch": 1.0469177361666315, "grad_norm": 0.09875541925430298, "learning_rate": 1e-05, "loss": 2.4204, "step": 622 }, { "epoch": 1.04860088365243, "grad_norm": 0.10112539678812027, "learning_rate": 1e-05, "loss": 2.3269, "step": 623 }, { "epoch": 1.0502840311382284, "grad_norm": 0.09719318896532059, "learning_rate": 1e-05, "loss": 2.3223, "step": 624 }, { "epoch": 1.051967178624027, "grad_norm": 0.09615301340818405, "learning_rate": 1e-05, "loss": 2.2798, "step": 625 }, { "epoch": 1.0536503261098253, "grad_norm": 0.09600812941789627, "learning_rate": 1e-05, "loss": 2.2738, "step": 626 }, { "epoch": 1.055333473595624, "grad_norm": 0.09326303005218506, "learning_rate": 1e-05, "loss": 2.23, "step": 627 }, { "epoch": 1.0570166210814222, "grad_norm": 0.09689430892467499, "learning_rate": 1e-05, "loss": 2.2582, "step": 628 }, { "epoch": 1.0586997685672208, "grad_norm": 0.10389314591884613, "learning_rate": 1e-05, "loss": 2.3733, "step": 629 }, { "epoch": 1.0603829160530192, "grad_norm": 0.09320785105228424, "learning_rate": 1e-05, "loss": 2.3315, "step": 630 }, { "epoch": 1.0620660635388175, "grad_norm": 0.10638166218996048, "learning_rate": 1e-05, "loss": 2.4058, "step": 631 }, { "epoch": 1.063749211024616, "grad_norm": 0.09525519609451294, "learning_rate": 1e-05, "loss": 2.2803, "step": 632 }, { "epoch": 1.0654323585104144, "grad_norm": 0.09904535114765167, "learning_rate": 1e-05, "loss": 2.3613, "step": 633 }, { "epoch": 1.067115505996213, "grad_norm": 0.10914106667041779, "learning_rate": 1e-05, "loss": 2.3955, "step": 634 }, { "epoch": 1.0687986534820113, "grad_norm": 0.10424593091011047, "learning_rate": 1e-05, "loss": 2.2332, "step": 635 }, { "epoch": 1.0704818009678099, "grad_norm": 0.10360780358314514, "learning_rate": 1e-05, "loss": 2.3127, "step": 636 }, { "epoch": 1.0721649484536082, "grad_norm": 0.11223631352186203, "learning_rate": 1e-05, "loss": 2.201, "step": 637 }, { "epoch": 1.0738480959394068, "grad_norm": 0.09491337090730667, "learning_rate": 1e-05, "loss": 2.3129, "step": 638 }, { "epoch": 1.0755312434252051, "grad_norm": 0.09244826436042786, "learning_rate": 1e-05, "loss": 2.3728, "step": 639 }, { "epoch": 1.0772143909110037, "grad_norm": 0.0922231450676918, "learning_rate": 1e-05, "loss": 2.3225, "step": 640 }, { "epoch": 1.078897538396802, "grad_norm": 0.10818596929311752, "learning_rate": 1e-05, "loss": 2.3104, "step": 641 }, { "epoch": 1.0805806858826004, "grad_norm": 0.09497258812189102, "learning_rate": 1e-05, "loss": 2.3176, "step": 642 }, { "epoch": 1.082263833368399, "grad_norm": 0.10034379363059998, "learning_rate": 1e-05, "loss": 2.3943, "step": 643 }, { "epoch": 1.0839469808541973, "grad_norm": 0.10024038702249527, "learning_rate": 1e-05, "loss": 2.3127, "step": 644 }, { "epoch": 1.0856301283399958, "grad_norm": 0.10074039548635483, "learning_rate": 1e-05, "loss": 2.2351, "step": 645 }, { "epoch": 1.0873132758257942, "grad_norm": 0.09631813317537308, "learning_rate": 1e-05, "loss": 2.3101, "step": 646 }, { "epoch": 1.0889964233115927, "grad_norm": 0.10632781684398651, "learning_rate": 1e-05, "loss": 2.3669, "step": 647 }, { "epoch": 1.090679570797391, "grad_norm": 0.10795175284147263, "learning_rate": 1e-05, "loss": 2.3064, "step": 648 }, { "epoch": 1.0923627182831896, "grad_norm": 0.11120691895484924, "learning_rate": 1e-05, "loss": 2.2911, "step": 649 }, { "epoch": 1.094045865768988, "grad_norm": 0.10034749656915665, "learning_rate": 1e-05, "loss": 2.3696, "step": 650 }, { "epoch": 1.0957290132547866, "grad_norm": 0.10955310612916946, "learning_rate": 1e-05, "loss": 2.3464, "step": 651 }, { "epoch": 1.097412160740585, "grad_norm": 0.09739572554826736, "learning_rate": 1e-05, "loss": 2.325, "step": 652 }, { "epoch": 1.0990953082263832, "grad_norm": 0.10152111947536469, "learning_rate": 1e-05, "loss": 2.3745, "step": 653 }, { "epoch": 1.1007784557121818, "grad_norm": 0.10103686153888702, "learning_rate": 1e-05, "loss": 2.3303, "step": 654 }, { "epoch": 1.1024616031979801, "grad_norm": 0.1003558412194252, "learning_rate": 1e-05, "loss": 2.312, "step": 655 }, { "epoch": 1.1041447506837787, "grad_norm": 0.10518987476825714, "learning_rate": 1e-05, "loss": 2.3444, "step": 656 }, { "epoch": 1.105827898169577, "grad_norm": 0.09896016865968704, "learning_rate": 1e-05, "loss": 2.2532, "step": 657 }, { "epoch": 1.1075110456553756, "grad_norm": 0.09725090116262436, "learning_rate": 1e-05, "loss": 2.3625, "step": 658 }, { "epoch": 1.109194193141174, "grad_norm": 0.09022284299135208, "learning_rate": 1e-05, "loss": 2.3743, "step": 659 }, { "epoch": 1.1108773406269725, "grad_norm": 0.10471490770578384, "learning_rate": 1e-05, "loss": 2.3416, "step": 660 }, { "epoch": 1.1125604881127709, "grad_norm": 0.10991263389587402, "learning_rate": 1e-05, "loss": 2.3214, "step": 661 }, { "epoch": 1.1142436355985694, "grad_norm": 0.10231148451566696, "learning_rate": 1e-05, "loss": 2.2832, "step": 662 }, { "epoch": 1.1159267830843678, "grad_norm": 0.09433937072753906, "learning_rate": 1e-05, "loss": 2.2645, "step": 663 }, { "epoch": 1.117609930570166, "grad_norm": 0.13238666951656342, "learning_rate": 1e-05, "loss": 2.2483, "step": 664 }, { "epoch": 1.1192930780559647, "grad_norm": 0.10956214368343353, "learning_rate": 1e-05, "loss": 2.2321, "step": 665 }, { "epoch": 1.120976225541763, "grad_norm": 0.11065597832202911, "learning_rate": 1e-05, "loss": 2.1869, "step": 666 }, { "epoch": 1.1226593730275616, "grad_norm": 0.10971678793430328, "learning_rate": 1e-05, "loss": 2.1855, "step": 667 }, { "epoch": 1.12434252051336, "grad_norm": 0.11080143600702286, "learning_rate": 1e-05, "loss": 2.198, "step": 668 }, { "epoch": 1.1260256679991585, "grad_norm": 0.10381001979112625, "learning_rate": 1e-05, "loss": 2.3384, "step": 669 }, { "epoch": 1.1277088154849568, "grad_norm": 0.1026921421289444, "learning_rate": 1e-05, "loss": 2.2458, "step": 670 }, { "epoch": 1.1293919629707554, "grad_norm": 0.10585295408964157, "learning_rate": 1e-05, "loss": 2.1859, "step": 671 }, { "epoch": 1.1310751104565537, "grad_norm": 0.10650487244129181, "learning_rate": 1e-05, "loss": 2.2662, "step": 672 }, { "epoch": 1.1327582579423523, "grad_norm": 0.10717649012804031, "learning_rate": 1e-05, "loss": 2.3088, "step": 673 }, { "epoch": 1.1344414054281506, "grad_norm": 0.10479724407196045, "learning_rate": 1e-05, "loss": 2.3042, "step": 674 }, { "epoch": 1.136124552913949, "grad_norm": 0.10629065334796906, "learning_rate": 1e-05, "loss": 2.3481, "step": 675 }, { "epoch": 1.1378077003997475, "grad_norm": 0.10375174880027771, "learning_rate": 1e-05, "loss": 2.3845, "step": 676 }, { "epoch": 1.1394908478855459, "grad_norm": 0.10122872143983841, "learning_rate": 1e-05, "loss": 2.335, "step": 677 }, { "epoch": 1.1411739953713445, "grad_norm": 0.09846247732639313, "learning_rate": 1e-05, "loss": 2.4028, "step": 678 }, { "epoch": 1.1428571428571428, "grad_norm": 0.11501342058181763, "learning_rate": 1e-05, "loss": 2.2419, "step": 679 }, { "epoch": 1.1445402903429414, "grad_norm": 0.11248493194580078, "learning_rate": 1e-05, "loss": 2.1294, "step": 680 }, { "epoch": 1.1462234378287397, "grad_norm": 0.1141652762889862, "learning_rate": 1e-05, "loss": 2.2842, "step": 681 }, { "epoch": 1.1479065853145383, "grad_norm": 0.10232444107532501, "learning_rate": 1e-05, "loss": 2.1798, "step": 682 }, { "epoch": 1.1495897328003366, "grad_norm": 0.10624698549509048, "learning_rate": 1e-05, "loss": 2.2474, "step": 683 }, { "epoch": 1.1512728802861352, "grad_norm": 0.10583934187889099, "learning_rate": 1e-05, "loss": 2.2917, "step": 684 }, { "epoch": 1.1529560277719335, "grad_norm": 0.10667344182729721, "learning_rate": 1e-05, "loss": 2.2581, "step": 685 }, { "epoch": 1.1546391752577319, "grad_norm": 0.10415381193161011, "learning_rate": 1e-05, "loss": 2.3325, "step": 686 }, { "epoch": 1.1563223227435304, "grad_norm": 0.109574094414711, "learning_rate": 1e-05, "loss": 2.3306, "step": 687 }, { "epoch": 1.1580054702293288, "grad_norm": 0.10537154227495193, "learning_rate": 1e-05, "loss": 2.3396, "step": 688 }, { "epoch": 1.1596886177151273, "grad_norm": 0.10670781880617142, "learning_rate": 1e-05, "loss": 2.2518, "step": 689 }, { "epoch": 1.1613717652009257, "grad_norm": 0.10296822339296341, "learning_rate": 1e-05, "loss": 2.3911, "step": 690 }, { "epoch": 1.1630549126867242, "grad_norm": 0.10323610156774521, "learning_rate": 1e-05, "loss": 2.415, "step": 691 }, { "epoch": 1.1647380601725226, "grad_norm": 0.09952528029680252, "learning_rate": 1e-05, "loss": 2.3674, "step": 692 }, { "epoch": 1.1664212076583211, "grad_norm": 0.10683920234441757, "learning_rate": 1e-05, "loss": 2.1606, "step": 693 }, { "epoch": 1.1681043551441195, "grad_norm": 0.10594907402992249, "learning_rate": 1e-05, "loss": 2.3633, "step": 694 }, { "epoch": 1.169787502629918, "grad_norm": 0.1164483055472374, "learning_rate": 1e-05, "loss": 2.272, "step": 695 }, { "epoch": 1.1714706501157164, "grad_norm": 0.1053275316953659, "learning_rate": 1e-05, "loss": 2.3361, "step": 696 }, { "epoch": 1.1731537976015147, "grad_norm": 0.11722961068153381, "learning_rate": 1e-05, "loss": 2.1008, "step": 697 }, { "epoch": 1.1748369450873133, "grad_norm": 0.11388476192951202, "learning_rate": 1e-05, "loss": 2.3129, "step": 698 }, { "epoch": 1.1765200925731116, "grad_norm": 0.1149948239326477, "learning_rate": 1e-05, "loss": 2.3503, "step": 699 }, { "epoch": 1.1782032400589102, "grad_norm": 0.09305736422538757, "learning_rate": 1e-05, "loss": 2.3811, "step": 700 }, { "epoch": 1.1798863875447085, "grad_norm": 0.1027708575129509, "learning_rate": 1e-05, "loss": 2.3262, "step": 701 }, { "epoch": 1.181569535030507, "grad_norm": 0.1058826595544815, "learning_rate": 1e-05, "loss": 2.2576, "step": 702 }, { "epoch": 1.1832526825163054, "grad_norm": 0.1003696396946907, "learning_rate": 1e-05, "loss": 2.2759, "step": 703 }, { "epoch": 1.184935830002104, "grad_norm": 0.11113473027944565, "learning_rate": 1e-05, "loss": 2.4163, "step": 704 }, { "epoch": 1.1866189774879023, "grad_norm": 0.10945228487253189, "learning_rate": 1e-05, "loss": 2.2725, "step": 705 }, { "epoch": 1.188302124973701, "grad_norm": 0.1079326868057251, "learning_rate": 1e-05, "loss": 2.3048, "step": 706 }, { "epoch": 1.1899852724594993, "grad_norm": 0.10752802342176437, "learning_rate": 1e-05, "loss": 2.2145, "step": 707 }, { "epoch": 1.1916684199452976, "grad_norm": 0.10588284581899643, "learning_rate": 1e-05, "loss": 2.3025, "step": 708 }, { "epoch": 1.1933515674310962, "grad_norm": 0.1051083654165268, "learning_rate": 1e-05, "loss": 2.3198, "step": 709 }, { "epoch": 1.1950347149168945, "grad_norm": 0.11915988475084305, "learning_rate": 1e-05, "loss": 2.2456, "step": 710 }, { "epoch": 1.196717862402693, "grad_norm": 0.10947719216346741, "learning_rate": 1e-05, "loss": 2.3479, "step": 711 }, { "epoch": 1.1984010098884914, "grad_norm": 0.11522776633501053, "learning_rate": 1e-05, "loss": 2.2898, "step": 712 }, { "epoch": 1.20008415737429, "grad_norm": 0.10741020739078522, "learning_rate": 1e-05, "loss": 2.3198, "step": 713 }, { "epoch": 1.2017673048600883, "grad_norm": 0.10589215159416199, "learning_rate": 1e-05, "loss": 2.2812, "step": 714 }, { "epoch": 1.2034504523458869, "grad_norm": 0.10151232033967972, "learning_rate": 1e-05, "loss": 2.429, "step": 715 }, { "epoch": 1.2051335998316852, "grad_norm": 0.11951622366905212, "learning_rate": 1e-05, "loss": 2.1932, "step": 716 }, { "epoch": 1.2068167473174838, "grad_norm": 0.11722715198993683, "learning_rate": 1e-05, "loss": 2.2356, "step": 717 }, { "epoch": 1.2084998948032821, "grad_norm": 0.11441315710544586, "learning_rate": 1e-05, "loss": 2.2891, "step": 718 }, { "epoch": 1.2101830422890805, "grad_norm": 0.10936987400054932, "learning_rate": 1e-05, "loss": 2.2843, "step": 719 }, { "epoch": 1.211866189774879, "grad_norm": 0.12374020367860794, "learning_rate": 1e-05, "loss": 2.2944, "step": 720 }, { "epoch": 1.2135493372606774, "grad_norm": 0.11024117469787598, "learning_rate": 1e-05, "loss": 2.2595, "step": 721 }, { "epoch": 1.215232484746476, "grad_norm": 0.09707245975732803, "learning_rate": 1e-05, "loss": 2.3867, "step": 722 }, { "epoch": 1.2169156322322743, "grad_norm": 0.11022404581308365, "learning_rate": 1e-05, "loss": 2.375, "step": 723 }, { "epoch": 1.2185987797180728, "grad_norm": 0.10732002556324005, "learning_rate": 1e-05, "loss": 2.3674, "step": 724 }, { "epoch": 1.2202819272038712, "grad_norm": 0.11548677086830139, "learning_rate": 1e-05, "loss": 2.3284, "step": 725 }, { "epoch": 1.2219650746896698, "grad_norm": 0.10313412547111511, "learning_rate": 1e-05, "loss": 2.4128, "step": 726 }, { "epoch": 1.223648222175468, "grad_norm": 0.12717945873737335, "learning_rate": 1e-05, "loss": 2.2847, "step": 727 }, { "epoch": 1.2253313696612667, "grad_norm": 0.11565182358026505, "learning_rate": 1e-05, "loss": 2.2695, "step": 728 }, { "epoch": 1.227014517147065, "grad_norm": 0.10489466041326523, "learning_rate": 1e-05, "loss": 2.3394, "step": 729 }, { "epoch": 1.2286976646328633, "grad_norm": 0.11056289076805115, "learning_rate": 1e-05, "loss": 2.4165, "step": 730 }, { "epoch": 1.230380812118662, "grad_norm": 0.12048956751823425, "learning_rate": 1e-05, "loss": 2.2289, "step": 731 }, { "epoch": 1.2320639596044602, "grad_norm": 0.10263136774301529, "learning_rate": 1e-05, "loss": 2.3306, "step": 732 }, { "epoch": 1.2337471070902588, "grad_norm": 0.11179950088262558, "learning_rate": 1e-05, "loss": 2.3481, "step": 733 }, { "epoch": 1.2354302545760572, "grad_norm": 0.10484311729669571, "learning_rate": 1e-05, "loss": 2.2703, "step": 734 }, { "epoch": 1.2371134020618557, "grad_norm": 0.1182483434677124, "learning_rate": 1e-05, "loss": 2.2328, "step": 735 }, { "epoch": 1.238796549547654, "grad_norm": 0.11377429217100143, "learning_rate": 1e-05, "loss": 2.3657, "step": 736 }, { "epoch": 1.2404796970334526, "grad_norm": 0.11151503771543503, "learning_rate": 1e-05, "loss": 2.3542, "step": 737 }, { "epoch": 1.242162844519251, "grad_norm": 0.12628555297851562, "learning_rate": 1e-05, "loss": 2.2634, "step": 738 }, { "epoch": 1.2438459920050495, "grad_norm": 0.10311713814735413, "learning_rate": 1e-05, "loss": 2.2717, "step": 739 }, { "epoch": 1.2455291394908479, "grad_norm": 0.12768767774105072, "learning_rate": 1e-05, "loss": 2.1725, "step": 740 }, { "epoch": 1.2472122869766462, "grad_norm": 0.12390502542257309, "learning_rate": 1e-05, "loss": 2.1708, "step": 741 }, { "epoch": 1.2488954344624448, "grad_norm": 0.10566207021474838, "learning_rate": 1e-05, "loss": 2.3469, "step": 742 }, { "epoch": 1.2505785819482433, "grad_norm": 0.10176009684801102, "learning_rate": 1e-05, "loss": 2.3159, "step": 743 }, { "epoch": 1.2522617294340417, "grad_norm": 0.10881732404232025, "learning_rate": 1e-05, "loss": 2.2966, "step": 744 }, { "epoch": 1.25394487691984, "grad_norm": 0.11917608976364136, "learning_rate": 1e-05, "loss": 2.395, "step": 745 }, { "epoch": 1.2556280244056386, "grad_norm": 0.09600858390331268, "learning_rate": 1e-05, "loss": 2.3479, "step": 746 }, { "epoch": 1.257311171891437, "grad_norm": 0.11550504714250565, "learning_rate": 1e-05, "loss": 2.301, "step": 747 }, { "epoch": 1.2589943193772355, "grad_norm": 0.10588584840297699, "learning_rate": 1e-05, "loss": 2.4163, "step": 748 }, { "epoch": 1.2606774668630338, "grad_norm": 0.10998673737049103, "learning_rate": 1e-05, "loss": 2.3379, "step": 749 }, { "epoch": 1.2623606143488324, "grad_norm": 0.10513128340244293, "learning_rate": 1e-05, "loss": 2.3795, "step": 750 }, { "epoch": 1.2640437618346307, "grad_norm": 0.11185754835605621, "learning_rate": 1e-05, "loss": 2.2583, "step": 751 }, { "epoch": 1.265726909320429, "grad_norm": 0.10794227570295334, "learning_rate": 1e-05, "loss": 2.285, "step": 752 }, { "epoch": 1.2674100568062276, "grad_norm": 0.12522459030151367, "learning_rate": 1e-05, "loss": 2.2292, "step": 753 }, { "epoch": 1.2690932042920262, "grad_norm": 0.11628364026546478, "learning_rate": 1e-05, "loss": 2.3342, "step": 754 }, { "epoch": 1.2707763517778246, "grad_norm": 0.12842795252799988, "learning_rate": 1e-05, "loss": 2.1455, "step": 755 }, { "epoch": 1.272459499263623, "grad_norm": 0.11268262565135956, "learning_rate": 1e-05, "loss": 2.2241, "step": 756 }, { "epoch": 1.2741426467494215, "grad_norm": 0.11674508452415466, "learning_rate": 1e-05, "loss": 2.2677, "step": 757 }, { "epoch": 1.2758257942352198, "grad_norm": 0.11475373059511185, "learning_rate": 1e-05, "loss": 2.4075, "step": 758 }, { "epoch": 1.2775089417210184, "grad_norm": 0.11378497630357742, "learning_rate": 1e-05, "loss": 2.3032, "step": 759 }, { "epoch": 1.2791920892068167, "grad_norm": 0.10426255315542221, "learning_rate": 1e-05, "loss": 2.2488, "step": 760 }, { "epoch": 1.2808752366926153, "grad_norm": 0.11820263415575027, "learning_rate": 1e-05, "loss": 2.197, "step": 761 }, { "epoch": 1.2825583841784136, "grad_norm": 0.10741489380598068, "learning_rate": 1e-05, "loss": 2.2811, "step": 762 }, { "epoch": 1.284241531664212, "grad_norm": 0.115534208714962, "learning_rate": 1e-05, "loss": 2.3105, "step": 763 }, { "epoch": 1.2859246791500105, "grad_norm": 0.1159248948097229, "learning_rate": 1e-05, "loss": 2.2963, "step": 764 }, { "epoch": 1.287607826635809, "grad_norm": 0.11940732598304749, "learning_rate": 1e-05, "loss": 2.3274, "step": 765 }, { "epoch": 1.2892909741216074, "grad_norm": 0.11882008612155914, "learning_rate": 1e-05, "loss": 2.2405, "step": 766 }, { "epoch": 1.2909741216074058, "grad_norm": 0.10939499735832214, "learning_rate": 1e-05, "loss": 2.3008, "step": 767 }, { "epoch": 1.2926572690932043, "grad_norm": 0.11414020508527756, "learning_rate": 1e-05, "loss": 2.3164, "step": 768 }, { "epoch": 1.2943404165790027, "grad_norm": 0.11446741968393326, "learning_rate": 1e-05, "loss": 2.2524, "step": 769 }, { "epoch": 1.2960235640648012, "grad_norm": 0.12233757227659225, "learning_rate": 1e-05, "loss": 2.3997, "step": 770 }, { "epoch": 1.2977067115505996, "grad_norm": 0.11746780574321747, "learning_rate": 1e-05, "loss": 2.2241, "step": 771 }, { "epoch": 1.2993898590363981, "grad_norm": 0.12653754651546478, "learning_rate": 1e-05, "loss": 2.2181, "step": 772 }, { "epoch": 1.3010730065221965, "grad_norm": 0.11092430353164673, "learning_rate": 1e-05, "loss": 2.194, "step": 773 }, { "epoch": 1.3027561540079948, "grad_norm": 0.11273445188999176, "learning_rate": 1e-05, "loss": 2.2821, "step": 774 }, { "epoch": 1.3044393014937934, "grad_norm": 0.10755831003189087, "learning_rate": 1e-05, "loss": 2.3381, "step": 775 }, { "epoch": 1.306122448979592, "grad_norm": 0.10324183851480484, "learning_rate": 1e-05, "loss": 2.4531, "step": 776 }, { "epoch": 1.3078055964653903, "grad_norm": 0.1238187626004219, "learning_rate": 1e-05, "loss": 2.2378, "step": 777 }, { "epoch": 1.3094887439511886, "grad_norm": 0.10919329524040222, "learning_rate": 1e-05, "loss": 2.3157, "step": 778 }, { "epoch": 1.3111718914369872, "grad_norm": 0.11661651730537415, "learning_rate": 1e-05, "loss": 2.3889, "step": 779 }, { "epoch": 1.3128550389227855, "grad_norm": 0.11324804276227951, "learning_rate": 1e-05, "loss": 2.366, "step": 780 }, { "epoch": 1.314538186408584, "grad_norm": 0.11539211124181747, "learning_rate": 1e-05, "loss": 2.2661, "step": 781 }, { "epoch": 1.3162213338943825, "grad_norm": 0.12013803422451019, "learning_rate": 1e-05, "loss": 2.2388, "step": 782 }, { "epoch": 1.317904481380181, "grad_norm": 0.1297876238822937, "learning_rate": 1e-05, "loss": 2.338, "step": 783 }, { "epoch": 1.3195876288659794, "grad_norm": 0.11792443692684174, "learning_rate": 1e-05, "loss": 2.3162, "step": 784 }, { "epoch": 1.3212707763517777, "grad_norm": 0.11543410271406174, "learning_rate": 1e-05, "loss": 2.325, "step": 785 }, { "epoch": 1.3229539238375763, "grad_norm": 0.11507069319486618, "learning_rate": 1e-05, "loss": 2.3389, "step": 786 }, { "epoch": 1.3246370713233748, "grad_norm": 0.11883421987295151, "learning_rate": 1e-05, "loss": 2.3784, "step": 787 }, { "epoch": 1.3263202188091732, "grad_norm": 0.11997753381729126, "learning_rate": 1e-05, "loss": 2.2183, "step": 788 }, { "epoch": 1.3280033662949715, "grad_norm": 0.12312667816877365, "learning_rate": 1e-05, "loss": 2.2661, "step": 789 }, { "epoch": 1.32968651378077, "grad_norm": 0.1280994415283203, "learning_rate": 1e-05, "loss": 2.235, "step": 790 }, { "epoch": 1.3313696612665684, "grad_norm": 0.12460897862911224, "learning_rate": 1e-05, "loss": 2.2775, "step": 791 }, { "epoch": 1.333052808752367, "grad_norm": 0.11441405862569809, "learning_rate": 1e-05, "loss": 2.2642, "step": 792 }, { "epoch": 1.3347359562381653, "grad_norm": 0.1078685000538826, "learning_rate": 1e-05, "loss": 2.3174, "step": 793 }, { "epoch": 1.3364191037239639, "grad_norm": 0.11945922672748566, "learning_rate": 1e-05, "loss": 2.3101, "step": 794 }, { "epoch": 1.3381022512097622, "grad_norm": 0.11506087332963943, "learning_rate": 1e-05, "loss": 2.3167, "step": 795 }, { "epoch": 1.3397853986955606, "grad_norm": 0.12365138530731201, "learning_rate": 1e-05, "loss": 2.3044, "step": 796 }, { "epoch": 1.3414685461813591, "grad_norm": 0.12331211566925049, "learning_rate": 1e-05, "loss": 2.2058, "step": 797 }, { "epoch": 1.3431516936671577, "grad_norm": 0.12298640608787537, "learning_rate": 1e-05, "loss": 2.21, "step": 798 }, { "epoch": 1.344834841152956, "grad_norm": 0.12047012150287628, "learning_rate": 1e-05, "loss": 2.2781, "step": 799 }, { "epoch": 1.3465179886387544, "grad_norm": 0.12428031861782074, "learning_rate": 1e-05, "loss": 2.3032, "step": 800 }, { "epoch": 1.348201136124553, "grad_norm": 0.1128249540925026, "learning_rate": 1e-05, "loss": 2.3135, "step": 801 }, { "epoch": 1.3498842836103513, "grad_norm": 0.12616464495658875, "learning_rate": 1e-05, "loss": 2.1487, "step": 802 }, { "epoch": 1.3515674310961499, "grad_norm": 0.11388704925775528, "learning_rate": 1e-05, "loss": 2.2346, "step": 803 }, { "epoch": 1.3532505785819482, "grad_norm": 0.10213828831911087, "learning_rate": 1e-05, "loss": 2.2859, "step": 804 }, { "epoch": 1.3549337260677468, "grad_norm": 0.1226121038198471, "learning_rate": 1e-05, "loss": 2.2183, "step": 805 }, { "epoch": 1.356616873553545, "grad_norm": 0.11445735394954681, "learning_rate": 1e-05, "loss": 2.3784, "step": 806 }, { "epoch": 1.3583000210393434, "grad_norm": 0.11648505181074142, "learning_rate": 1e-05, "loss": 2.3442, "step": 807 }, { "epoch": 1.359983168525142, "grad_norm": 0.1296563744544983, "learning_rate": 1e-05, "loss": 2.2469, "step": 808 }, { "epoch": 1.3616663160109406, "grad_norm": 0.12322400510311127, "learning_rate": 1e-05, "loss": 2.2915, "step": 809 }, { "epoch": 1.363349463496739, "grad_norm": 0.11419309675693512, "learning_rate": 1e-05, "loss": 2.3024, "step": 810 }, { "epoch": 1.3650326109825373, "grad_norm": 0.12253374606370926, "learning_rate": 1e-05, "loss": 2.2969, "step": 811 }, { "epoch": 1.3667157584683358, "grad_norm": 0.1254422962665558, "learning_rate": 1e-05, "loss": 2.364, "step": 812 }, { "epoch": 1.3683989059541342, "grad_norm": 0.12984994053840637, "learning_rate": 1e-05, "loss": 2.2936, "step": 813 }, { "epoch": 1.3700820534399327, "grad_norm": 0.1182006224989891, "learning_rate": 1e-05, "loss": 2.2673, "step": 814 }, { "epoch": 1.371765200925731, "grad_norm": 0.12920832633972168, "learning_rate": 1e-05, "loss": 2.1582, "step": 815 }, { "epoch": 1.3734483484115296, "grad_norm": 0.1216689869761467, "learning_rate": 1e-05, "loss": 2.3479, "step": 816 }, { "epoch": 1.375131495897328, "grad_norm": 0.12459319084882736, "learning_rate": 1e-05, "loss": 2.1868, "step": 817 }, { "epoch": 1.3768146433831263, "grad_norm": 0.11144936084747314, "learning_rate": 1e-05, "loss": 2.3663, "step": 818 }, { "epoch": 1.3784977908689249, "grad_norm": 0.1110294982790947, "learning_rate": 1e-05, "loss": 2.3164, "step": 819 }, { "epoch": 1.3801809383547234, "grad_norm": 0.11903022974729538, "learning_rate": 1e-05, "loss": 2.2589, "step": 820 }, { "epoch": 1.3818640858405218, "grad_norm": 0.10610275715589523, "learning_rate": 1e-05, "loss": 2.4153, "step": 821 }, { "epoch": 1.3835472333263201, "grad_norm": 0.11972808837890625, "learning_rate": 1e-05, "loss": 2.3901, "step": 822 }, { "epoch": 1.3852303808121187, "grad_norm": 0.10772975534200668, "learning_rate": 1e-05, "loss": 2.3555, "step": 823 }, { "epoch": 1.386913528297917, "grad_norm": 0.11757270246744156, "learning_rate": 1e-05, "loss": 2.2677, "step": 824 }, { "epoch": 1.3885966757837156, "grad_norm": 0.1217508539557457, "learning_rate": 1e-05, "loss": 2.2267, "step": 825 }, { "epoch": 1.390279823269514, "grad_norm": 0.10996967554092407, "learning_rate": 1e-05, "loss": 2.3965, "step": 826 }, { "epoch": 1.3919629707553125, "grad_norm": 0.13068005442619324, "learning_rate": 1e-05, "loss": 2.1991, "step": 827 }, { "epoch": 1.3936461182411108, "grad_norm": 0.12149260193109512, "learning_rate": 1e-05, "loss": 2.2775, "step": 828 }, { "epoch": 1.3953292657269092, "grad_norm": 0.1100870743393898, "learning_rate": 1e-05, "loss": 2.2571, "step": 829 }, { "epoch": 1.3970124132127077, "grad_norm": 0.10005280375480652, "learning_rate": 1e-05, "loss": 2.2808, "step": 830 }, { "epoch": 1.3986955606985063, "grad_norm": 0.11633820086717606, "learning_rate": 1e-05, "loss": 2.3215, "step": 831 }, { "epoch": 1.4003787081843047, "grad_norm": 0.11901983618736267, "learning_rate": 1e-05, "loss": 2.4236, "step": 832 }, { "epoch": 1.402061855670103, "grad_norm": 0.11173246055841446, "learning_rate": 1e-05, "loss": 2.3457, "step": 833 }, { "epoch": 1.4037450031559016, "grad_norm": 0.10333243012428284, "learning_rate": 1e-05, "loss": 2.2659, "step": 834 }, { "epoch": 1.4054281506417, "grad_norm": 0.13903972506523132, "learning_rate": 1e-05, "loss": 2.1946, "step": 835 }, { "epoch": 1.4071112981274985, "grad_norm": 0.11832322925329208, "learning_rate": 1e-05, "loss": 2.3223, "step": 836 }, { "epoch": 1.4087944456132968, "grad_norm": 0.10906493663787842, "learning_rate": 1e-05, "loss": 2.4316, "step": 837 }, { "epoch": 1.4104775930990954, "grad_norm": 0.10980133712291718, "learning_rate": 1e-05, "loss": 2.3525, "step": 838 }, { "epoch": 1.4121607405848937, "grad_norm": 0.12958386540412903, "learning_rate": 1e-05, "loss": 2.3081, "step": 839 }, { "epoch": 1.413843888070692, "grad_norm": 0.1342059075832367, "learning_rate": 1e-05, "loss": 2.3564, "step": 840 }, { "epoch": 1.4155270355564906, "grad_norm": 0.1362716406583786, "learning_rate": 1e-05, "loss": 2.2435, "step": 841 }, { "epoch": 1.4172101830422892, "grad_norm": 0.10814797878265381, "learning_rate": 1e-05, "loss": 2.3373, "step": 842 }, { "epoch": 1.4188933305280875, "grad_norm": 0.111182801425457, "learning_rate": 1e-05, "loss": 2.2921, "step": 843 }, { "epoch": 1.4205764780138859, "grad_norm": 0.11161399632692337, "learning_rate": 1e-05, "loss": 2.3816, "step": 844 }, { "epoch": 1.4222596254996844, "grad_norm": 0.1261526495218277, "learning_rate": 1e-05, "loss": 2.4082, "step": 845 }, { "epoch": 1.4239427729854828, "grad_norm": 0.10805182158946991, "learning_rate": 1e-05, "loss": 2.3622, "step": 846 }, { "epoch": 1.4256259204712813, "grad_norm": 0.12294517457485199, "learning_rate": 1e-05, "loss": 2.3638, "step": 847 }, { "epoch": 1.4273090679570797, "grad_norm": 0.10903607308864594, "learning_rate": 1e-05, "loss": 2.3484, "step": 848 }, { "epoch": 1.4289922154428782, "grad_norm": 0.12460491806268692, "learning_rate": 1e-05, "loss": 2.2046, "step": 849 }, { "epoch": 1.4306753629286766, "grad_norm": 0.13793089985847473, "learning_rate": 1e-05, "loss": 2.2437, "step": 850 }, { "epoch": 1.4323585104144752, "grad_norm": 0.11700379103422165, "learning_rate": 1e-05, "loss": 2.2288, "step": 851 }, { "epoch": 1.4340416579002735, "grad_norm": 0.11343109607696533, "learning_rate": 1e-05, "loss": 2.2501, "step": 852 }, { "epoch": 1.435724805386072, "grad_norm": 0.10918331891298294, "learning_rate": 1e-05, "loss": 2.47, "step": 853 }, { "epoch": 1.4374079528718704, "grad_norm": 0.12782573699951172, "learning_rate": 1e-05, "loss": 2.2281, "step": 854 }, { "epoch": 1.4390911003576687, "grad_norm": 0.12039442360401154, "learning_rate": 1e-05, "loss": 2.2766, "step": 855 }, { "epoch": 1.4407742478434673, "grad_norm": 0.13949096202850342, "learning_rate": 1e-05, "loss": 2.198, "step": 856 }, { "epoch": 1.4424573953292656, "grad_norm": 0.13327306509017944, "learning_rate": 1e-05, "loss": 2.2253, "step": 857 }, { "epoch": 1.4441405428150642, "grad_norm": 0.1229238212108612, "learning_rate": 1e-05, "loss": 2.3147, "step": 858 }, { "epoch": 1.4458236903008626, "grad_norm": 0.13407859206199646, "learning_rate": 1e-05, "loss": 2.2532, "step": 859 }, { "epoch": 1.4475068377866611, "grad_norm": 0.1280384659767151, "learning_rate": 1e-05, "loss": 2.3174, "step": 860 }, { "epoch": 1.4491899852724595, "grad_norm": 0.1532362997531891, "learning_rate": 1e-05, "loss": 2.1671, "step": 861 }, { "epoch": 1.450873132758258, "grad_norm": 0.1134854182600975, "learning_rate": 1e-05, "loss": 2.3607, "step": 862 }, { "epoch": 1.4525562802440564, "grad_norm": 0.11682198196649551, "learning_rate": 1e-05, "loss": 2.4041, "step": 863 }, { "epoch": 1.454239427729855, "grad_norm": 0.11356412619352341, "learning_rate": 1e-05, "loss": 2.2756, "step": 864 }, { "epoch": 1.4559225752156533, "grad_norm": 0.11278104037046432, "learning_rate": 1e-05, "loss": 2.2983, "step": 865 }, { "epoch": 1.4576057227014516, "grad_norm": 0.13442599773406982, "learning_rate": 1e-05, "loss": 2.2593, "step": 866 }, { "epoch": 1.4592888701872502, "grad_norm": 0.1254800707101822, "learning_rate": 1e-05, "loss": 2.3213, "step": 867 }, { "epoch": 1.4609720176730487, "grad_norm": 0.12374315410852432, "learning_rate": 1e-05, "loss": 2.4221, "step": 868 }, { "epoch": 1.462655165158847, "grad_norm": 0.13577024638652802, "learning_rate": 1e-05, "loss": 2.2473, "step": 869 }, { "epoch": 1.4643383126446454, "grad_norm": 0.12822799384593964, "learning_rate": 1e-05, "loss": 2.3057, "step": 870 }, { "epoch": 1.466021460130444, "grad_norm": 0.1283286213874817, "learning_rate": 1e-05, "loss": 2.374, "step": 871 }, { "epoch": 1.4677046076162423, "grad_norm": 0.12054271996021271, "learning_rate": 1e-05, "loss": 2.3369, "step": 872 }, { "epoch": 1.469387755102041, "grad_norm": 0.127189502120018, "learning_rate": 1e-05, "loss": 2.3167, "step": 873 }, { "epoch": 1.4710709025878392, "grad_norm": 0.12767814099788666, "learning_rate": 1e-05, "loss": 2.2695, "step": 874 }, { "epoch": 1.4727540500736378, "grad_norm": 0.12026406079530716, "learning_rate": 1e-05, "loss": 2.3313, "step": 875 }, { "epoch": 1.4744371975594361, "grad_norm": 0.13317981362342834, "learning_rate": 1e-05, "loss": 2.209, "step": 876 }, { "epoch": 1.4761203450452345, "grad_norm": 0.12904947996139526, "learning_rate": 1e-05, "loss": 2.2344, "step": 877 }, { "epoch": 1.477803492531033, "grad_norm": 0.13126946985721588, "learning_rate": 1e-05, "loss": 2.2888, "step": 878 }, { "epoch": 1.4794866400168316, "grad_norm": 0.128869891166687, "learning_rate": 1e-05, "loss": 2.1996, "step": 879 }, { "epoch": 1.48116978750263, "grad_norm": 0.1279861181974411, "learning_rate": 1e-05, "loss": 2.1873, "step": 880 }, { "epoch": 1.4828529349884283, "grad_norm": 0.11732237040996552, "learning_rate": 1e-05, "loss": 2.3259, "step": 881 }, { "epoch": 1.4845360824742269, "grad_norm": 0.1279248595237732, "learning_rate": 1e-05, "loss": 2.386, "step": 882 }, { "epoch": 1.4862192299600252, "grad_norm": 0.13578535616397858, "learning_rate": 1e-05, "loss": 2.2937, "step": 883 }, { "epoch": 1.4879023774458238, "grad_norm": 0.13534606993198395, "learning_rate": 1e-05, "loss": 2.239, "step": 884 }, { "epoch": 1.489585524931622, "grad_norm": 0.12359879165887833, "learning_rate": 1e-05, "loss": 2.3572, "step": 885 }, { "epoch": 1.4912686724174207, "grad_norm": 0.1236250028014183, "learning_rate": 1e-05, "loss": 2.188, "step": 886 }, { "epoch": 1.492951819903219, "grad_norm": 0.12695659697055817, "learning_rate": 1e-05, "loss": 2.2637, "step": 887 }, { "epoch": 1.4946349673890174, "grad_norm": 0.1281343400478363, "learning_rate": 1e-05, "loss": 2.2961, "step": 888 }, { "epoch": 1.496318114874816, "grad_norm": 0.12446150928735733, "learning_rate": 1e-05, "loss": 2.3362, "step": 889 }, { "epoch": 1.4980012623606145, "grad_norm": 0.12564988434314728, "learning_rate": 1e-05, "loss": 2.288, "step": 890 }, { "epoch": 1.4996844098464128, "grad_norm": 0.14049400389194489, "learning_rate": 1e-05, "loss": 2.2867, "step": 891 }, { "epoch": 1.5013675573322112, "grad_norm": 0.12252961844205856, "learning_rate": 1e-05, "loss": 2.3511, "step": 892 }, { "epoch": 1.5030507048180097, "grad_norm": 0.15993735194206238, "learning_rate": 1e-05, "loss": 2.0931, "step": 893 }, { "epoch": 1.504733852303808, "grad_norm": 0.13673749566078186, "learning_rate": 1e-05, "loss": 2.2998, "step": 894 }, { "epoch": 1.5064169997896064, "grad_norm": 0.11770147830247879, "learning_rate": 1e-05, "loss": 2.2883, "step": 895 }, { "epoch": 1.508100147275405, "grad_norm": 0.11792504787445068, "learning_rate": 1e-05, "loss": 2.1893, "step": 896 }, { "epoch": 1.5097832947612035, "grad_norm": 0.1405222862958908, "learning_rate": 1e-05, "loss": 2.2645, "step": 897 }, { "epoch": 1.5114664422470019, "grad_norm": 0.1401311457157135, "learning_rate": 1e-05, "loss": 2.2085, "step": 898 }, { "epoch": 1.5131495897328002, "grad_norm": 0.14068666100502014, "learning_rate": 1e-05, "loss": 2.2711, "step": 899 }, { "epoch": 1.5148327372185988, "grad_norm": 0.12995976209640503, "learning_rate": 1e-05, "loss": 2.2883, "step": 900 }, { "epoch": 1.5165158847043974, "grad_norm": 0.12454178184270859, "learning_rate": 1e-05, "loss": 2.2515, "step": 901 }, { "epoch": 1.5181990321901957, "grad_norm": 0.12165191769599915, "learning_rate": 1e-05, "loss": 2.3621, "step": 902 }, { "epoch": 1.519882179675994, "grad_norm": 0.1413601189851761, "learning_rate": 1e-05, "loss": 2.27, "step": 903 }, { "epoch": 1.5215653271617926, "grad_norm": 0.13545894622802734, "learning_rate": 1e-05, "loss": 2.3008, "step": 904 }, { "epoch": 1.523248474647591, "grad_norm": 0.12211872637271881, "learning_rate": 1e-05, "loss": 2.3921, "step": 905 }, { "epoch": 1.5249316221333893, "grad_norm": 0.13053253293037415, "learning_rate": 1e-05, "loss": 2.2434, "step": 906 }, { "epoch": 1.5266147696191879, "grad_norm": 0.12977124750614166, "learning_rate": 1e-05, "loss": 2.2366, "step": 907 }, { "epoch": 1.5282979171049864, "grad_norm": 0.13451719284057617, "learning_rate": 1e-05, "loss": 2.3154, "step": 908 }, { "epoch": 1.5299810645907848, "grad_norm": 0.11067184805870056, "learning_rate": 1e-05, "loss": 2.3296, "step": 909 }, { "epoch": 1.531664212076583, "grad_norm": 0.12281223386526108, "learning_rate": 1e-05, "loss": 2.2479, "step": 910 }, { "epoch": 1.5333473595623817, "grad_norm": 0.12240397185087204, "learning_rate": 1e-05, "loss": 2.3416, "step": 911 }, { "epoch": 1.5350305070481802, "grad_norm": 0.14465166628360748, "learning_rate": 1e-05, "loss": 2.1801, "step": 912 }, { "epoch": 1.5367136545339786, "grad_norm": 0.1263197958469391, "learning_rate": 1e-05, "loss": 2.2583, "step": 913 }, { "epoch": 1.538396802019777, "grad_norm": 0.14653970301151276, "learning_rate": 1e-05, "loss": 2.2939, "step": 914 }, { "epoch": 1.5400799495055755, "grad_norm": 0.1311267763376236, "learning_rate": 1e-05, "loss": 2.2517, "step": 915 }, { "epoch": 1.5417630969913738, "grad_norm": 0.13173674046993256, "learning_rate": 1e-05, "loss": 2.309, "step": 916 }, { "epoch": 1.5434462444771722, "grad_norm": 0.13140322268009186, "learning_rate": 1e-05, "loss": 2.1447, "step": 917 }, { "epoch": 1.5451293919629707, "grad_norm": 0.12431302666664124, "learning_rate": 1e-05, "loss": 2.3315, "step": 918 }, { "epoch": 1.5468125394487693, "grad_norm": 0.14358630776405334, "learning_rate": 1e-05, "loss": 2.2634, "step": 919 }, { "epoch": 1.5484956869345676, "grad_norm": 0.1297353357076645, "learning_rate": 1e-05, "loss": 2.2489, "step": 920 }, { "epoch": 1.550178834420366, "grad_norm": 0.12963449954986572, "learning_rate": 1e-05, "loss": 2.1533, "step": 921 }, { "epoch": 1.5518619819061645, "grad_norm": 0.11558603495359421, "learning_rate": 1e-05, "loss": 2.2688, "step": 922 }, { "epoch": 1.553545129391963, "grad_norm": 0.14222054183483124, "learning_rate": 1e-05, "loss": 2.2385, "step": 923 }, { "epoch": 1.5552282768777614, "grad_norm": 0.1376868486404419, "learning_rate": 1e-05, "loss": 2.2051, "step": 924 }, { "epoch": 1.5569114243635598, "grad_norm": 0.12993879616260529, "learning_rate": 1e-05, "loss": 2.3445, "step": 925 }, { "epoch": 1.5585945718493583, "grad_norm": 0.14503213763237, "learning_rate": 1e-05, "loss": 2.215, "step": 926 }, { "epoch": 1.560277719335157, "grad_norm": 0.1302722692489624, "learning_rate": 1e-05, "loss": 2.1945, "step": 927 }, { "epoch": 1.561960866820955, "grad_norm": 0.13545845448970795, "learning_rate": 1e-05, "loss": 2.3059, "step": 928 }, { "epoch": 1.5636440143067536, "grad_norm": 0.12279404699802399, "learning_rate": 1e-05, "loss": 2.3511, "step": 929 }, { "epoch": 1.5653271617925522, "grad_norm": 0.13220550119876862, "learning_rate": 1e-05, "loss": 2.2837, "step": 930 }, { "epoch": 1.5670103092783505, "grad_norm": 0.1407599151134491, "learning_rate": 1e-05, "loss": 2.2905, "step": 931 }, { "epoch": 1.5686934567641488, "grad_norm": 0.12597431242465973, "learning_rate": 1e-05, "loss": 2.366, "step": 932 }, { "epoch": 1.5703766042499474, "grad_norm": 0.12998835742473602, "learning_rate": 1e-05, "loss": 2.1067, "step": 933 }, { "epoch": 1.572059751735746, "grad_norm": 0.14708921313285828, "learning_rate": 1e-05, "loss": 2.2687, "step": 934 }, { "epoch": 1.5737428992215443, "grad_norm": 0.13333402574062347, "learning_rate": 1e-05, "loss": 2.3381, "step": 935 }, { "epoch": 1.5754260467073427, "grad_norm": 0.14774633944034576, "learning_rate": 1e-05, "loss": 2.163, "step": 936 }, { "epoch": 1.5771091941931412, "grad_norm": 0.1283462792634964, "learning_rate": 1e-05, "loss": 2.3892, "step": 937 }, { "epoch": 1.5787923416789398, "grad_norm": 0.12011823058128357, "learning_rate": 1e-05, "loss": 2.2758, "step": 938 }, { "epoch": 1.580475489164738, "grad_norm": 0.11618427187204361, "learning_rate": 1e-05, "loss": 2.2545, "step": 939 }, { "epoch": 1.5821586366505365, "grad_norm": 0.12683863937854767, "learning_rate": 1e-05, "loss": 2.291, "step": 940 }, { "epoch": 1.583841784136335, "grad_norm": 0.13158243894577026, "learning_rate": 1e-05, "loss": 2.3066, "step": 941 }, { "epoch": 1.5855249316221334, "grad_norm": 0.13269281387329102, "learning_rate": 1e-05, "loss": 2.3442, "step": 942 }, { "epoch": 1.5872080791079317, "grad_norm": 0.14047692716121674, "learning_rate": 1e-05, "loss": 2.3092, "step": 943 }, { "epoch": 1.5888912265937303, "grad_norm": 0.1387140154838562, "learning_rate": 1e-05, "loss": 2.1482, "step": 944 }, { "epoch": 1.5905743740795288, "grad_norm": 0.13907848298549652, "learning_rate": 1e-05, "loss": 2.3484, "step": 945 }, { "epoch": 1.5922575215653272, "grad_norm": 0.13114407658576965, "learning_rate": 1e-05, "loss": 2.2195, "step": 946 }, { "epoch": 1.5939406690511255, "grad_norm": 0.1368924379348755, "learning_rate": 1e-05, "loss": 2.322, "step": 947 }, { "epoch": 1.595623816536924, "grad_norm": 0.141913041472435, "learning_rate": 1e-05, "loss": 2.2336, "step": 948 }, { "epoch": 1.5973069640227227, "grad_norm": 0.13295848667621613, "learning_rate": 1e-05, "loss": 2.3081, "step": 949 }, { "epoch": 1.5989901115085208, "grad_norm": 0.12306110560894012, "learning_rate": 1e-05, "loss": 2.3354, "step": 950 }, { "epoch": 1.6006732589943193, "grad_norm": 0.12122649699449539, "learning_rate": 1e-05, "loss": 2.2839, "step": 951 }, { "epoch": 1.602356406480118, "grad_norm": 0.13046576082706451, "learning_rate": 1e-05, "loss": 2.385, "step": 952 }, { "epoch": 1.6040395539659162, "grad_norm": 0.1272476315498352, "learning_rate": 1e-05, "loss": 2.4153, "step": 953 }, { "epoch": 1.6057227014517146, "grad_norm": 0.13073799014091492, "learning_rate": 1e-05, "loss": 2.2854, "step": 954 }, { "epoch": 1.6074058489375131, "grad_norm": 0.12583526968955994, "learning_rate": 1e-05, "loss": 2.3318, "step": 955 }, { "epoch": 1.6090889964233117, "grad_norm": 0.1474972665309906, "learning_rate": 1e-05, "loss": 2.2542, "step": 956 }, { "epoch": 1.61077214390911, "grad_norm": 0.13445797562599182, "learning_rate": 1e-05, "loss": 2.3645, "step": 957 }, { "epoch": 1.6124552913949084, "grad_norm": 0.13466110825538635, "learning_rate": 1e-05, "loss": 2.3394, "step": 958 }, { "epoch": 1.614138438880707, "grad_norm": 0.13525816798210144, "learning_rate": 1e-05, "loss": 2.2471, "step": 959 }, { "epoch": 1.6158215863665055, "grad_norm": 0.1377459019422531, "learning_rate": 1e-05, "loss": 2.2478, "step": 960 }, { "epoch": 1.6175047338523036, "grad_norm": 0.1405583918094635, "learning_rate": 1e-05, "loss": 2.2146, "step": 961 }, { "epoch": 1.6191878813381022, "grad_norm": 0.11743167042732239, "learning_rate": 1e-05, "loss": 2.3555, "step": 962 }, { "epoch": 1.6208710288239008, "grad_norm": 0.13644517958164215, "learning_rate": 1e-05, "loss": 2.2155, "step": 963 }, { "epoch": 1.6225541763096991, "grad_norm": 0.12609997391700745, "learning_rate": 1e-05, "loss": 2.2593, "step": 964 }, { "epoch": 1.6242373237954975, "grad_norm": 0.13276560604572296, "learning_rate": 1e-05, "loss": 2.1737, "step": 965 }, { "epoch": 1.625920471281296, "grad_norm": 0.13567714393138885, "learning_rate": 1e-05, "loss": 2.3336, "step": 966 }, { "epoch": 1.6276036187670946, "grad_norm": 0.12559200823307037, "learning_rate": 1e-05, "loss": 2.3494, "step": 967 }, { "epoch": 1.629286766252893, "grad_norm": 0.13090649247169495, "learning_rate": 1e-05, "loss": 2.1851, "step": 968 }, { "epoch": 1.6309699137386913, "grad_norm": 0.15777987241744995, "learning_rate": 1e-05, "loss": 2.2205, "step": 969 }, { "epoch": 1.6326530612244898, "grad_norm": 0.1433715522289276, "learning_rate": 1e-05, "loss": 2.2295, "step": 970 }, { "epoch": 1.6343362087102884, "grad_norm": 0.1218508929014206, "learning_rate": 1e-05, "loss": 2.3762, "step": 971 }, { "epoch": 1.6360193561960865, "grad_norm": 0.14540942013263702, "learning_rate": 1e-05, "loss": 2.2139, "step": 972 }, { "epoch": 1.637702503681885, "grad_norm": 0.14829136431217194, "learning_rate": 1e-05, "loss": 2.2871, "step": 973 }, { "epoch": 1.6393856511676836, "grad_norm": 0.12728969752788544, "learning_rate": 1e-05, "loss": 2.2917, "step": 974 }, { "epoch": 1.641068798653482, "grad_norm": 0.1471221148967743, "learning_rate": 1e-05, "loss": 2.2012, "step": 975 }, { "epoch": 1.6427519461392803, "grad_norm": 0.13320200145244598, "learning_rate": 1e-05, "loss": 2.2771, "step": 976 }, { "epoch": 1.644435093625079, "grad_norm": 0.1363966464996338, "learning_rate": 1e-05, "loss": 2.3086, "step": 977 }, { "epoch": 1.6461182411108775, "grad_norm": 0.13870568573474884, "learning_rate": 1e-05, "loss": 2.2898, "step": 978 }, { "epoch": 1.6478013885966758, "grad_norm": 0.15152350068092346, "learning_rate": 1e-05, "loss": 2.2994, "step": 979 }, { "epoch": 1.6494845360824741, "grad_norm": 0.13830937445163727, "learning_rate": 1e-05, "loss": 2.2108, "step": 980 }, { "epoch": 1.6511676835682727, "grad_norm": 0.15544220805168152, "learning_rate": 1e-05, "loss": 2.4043, "step": 981 }, { "epoch": 1.6528508310540713, "grad_norm": 0.13135483860969543, "learning_rate": 1e-05, "loss": 2.2373, "step": 982 }, { "epoch": 1.6545339785398696, "grad_norm": 0.12355194985866547, "learning_rate": 1e-05, "loss": 2.4163, "step": 983 }, { "epoch": 1.656217126025668, "grad_norm": 0.14110660552978516, "learning_rate": 1e-05, "loss": 2.2031, "step": 984 }, { "epoch": 1.6579002735114665, "grad_norm": 0.13077346980571747, "learning_rate": 1e-05, "loss": 2.3601, "step": 985 }, { "epoch": 1.6595834209972649, "grad_norm": 0.14212660491466522, "learning_rate": 1e-05, "loss": 2.197, "step": 986 }, { "epoch": 1.6612665684830632, "grad_norm": 0.12336140871047974, "learning_rate": 1e-05, "loss": 2.4146, "step": 987 }, { "epoch": 1.6629497159688618, "grad_norm": 0.15291054546833038, "learning_rate": 1e-05, "loss": 2.2764, "step": 988 }, { "epoch": 1.6646328634546603, "grad_norm": 0.1272605061531067, "learning_rate": 1e-05, "loss": 2.2703, "step": 989 }, { "epoch": 1.6663160109404587, "grad_norm": 0.13462689518928528, "learning_rate": 1e-05, "loss": 2.3188, "step": 990 }, { "epoch": 1.667999158426257, "grad_norm": 0.13290910422801971, "learning_rate": 1e-05, "loss": 2.2172, "step": 991 }, { "epoch": 1.6696823059120556, "grad_norm": 0.15105758607387543, "learning_rate": 1e-05, "loss": 2.2156, "step": 992 }, { "epoch": 1.6713654533978541, "grad_norm": 0.13150456547737122, "learning_rate": 1e-05, "loss": 2.3362, "step": 993 }, { "epoch": 1.6730486008836525, "grad_norm": 0.13139204680919647, "learning_rate": 1e-05, "loss": 2.3833, "step": 994 }, { "epoch": 1.6747317483694508, "grad_norm": 0.14886420965194702, "learning_rate": 1e-05, "loss": 2.1893, "step": 995 }, { "epoch": 1.6764148958552494, "grad_norm": 0.13227102160453796, "learning_rate": 1e-05, "loss": 2.4055, "step": 996 }, { "epoch": 1.6780980433410477, "grad_norm": 0.12545333802700043, "learning_rate": 1e-05, "loss": 2.3311, "step": 997 }, { "epoch": 1.679781190826846, "grad_norm": 0.13391169905662537, "learning_rate": 1e-05, "loss": 2.3022, "step": 998 }, { "epoch": 1.6814643383126446, "grad_norm": 0.13013269007205963, "learning_rate": 1e-05, "loss": 2.2318, "step": 999 }, { "epoch": 1.6831474857984432, "grad_norm": 0.1331031173467636, "learning_rate": 1e-05, "loss": 2.3022, "step": 1000 }, { "epoch": 1.6848306332842415, "grad_norm": 0.14438873529434204, "learning_rate": 1e-05, "loss": 2.2388, "step": 1001 }, { "epoch": 1.6865137807700399, "grad_norm": 0.1422380954027176, "learning_rate": 1e-05, "loss": 2.3145, "step": 1002 }, { "epoch": 1.6881969282558384, "grad_norm": 0.13909044861793518, "learning_rate": 1e-05, "loss": 2.2249, "step": 1003 }, { "epoch": 1.689880075741637, "grad_norm": 0.14147858321666718, "learning_rate": 1e-05, "loss": 2.3179, "step": 1004 }, { "epoch": 1.6915632232274354, "grad_norm": 0.13203288614749908, "learning_rate": 1e-05, "loss": 2.1912, "step": 1005 }, { "epoch": 1.6932463707132337, "grad_norm": 0.14461839199066162, "learning_rate": 1e-05, "loss": 2.1982, "step": 1006 }, { "epoch": 1.6949295181990323, "grad_norm": 0.14539021253585815, "learning_rate": 1e-05, "loss": 2.2917, "step": 1007 }, { "epoch": 1.6966126656848306, "grad_norm": 0.14774973690509796, "learning_rate": 1e-05, "loss": 2.2639, "step": 1008 }, { "epoch": 1.698295813170629, "grad_norm": 0.14927157759666443, "learning_rate": 1e-05, "loss": 2.1956, "step": 1009 }, { "epoch": 1.6999789606564275, "grad_norm": 0.1286613643169403, "learning_rate": 1e-05, "loss": 2.292, "step": 1010 }, { "epoch": 1.701662108142226, "grad_norm": 0.12883049249649048, "learning_rate": 1e-05, "loss": 2.2573, "step": 1011 }, { "epoch": 1.7033452556280244, "grad_norm": 0.14129754900932312, "learning_rate": 1e-05, "loss": 2.334, "step": 1012 }, { "epoch": 1.7050284031138228, "grad_norm": 0.13216479122638702, "learning_rate": 1e-05, "loss": 2.2664, "step": 1013 }, { "epoch": 1.7067115505996213, "grad_norm": 0.12611788511276245, "learning_rate": 1e-05, "loss": 2.3159, "step": 1014 }, { "epoch": 1.7083946980854199, "grad_norm": 0.14012207090854645, "learning_rate": 1e-05, "loss": 2.4026, "step": 1015 }, { "epoch": 1.7100778455712182, "grad_norm": 0.14449255168437958, "learning_rate": 1e-05, "loss": 2.3313, "step": 1016 }, { "epoch": 1.7117609930570166, "grad_norm": 0.15093393623828888, "learning_rate": 1e-05, "loss": 2.2075, "step": 1017 }, { "epoch": 1.7134441405428151, "grad_norm": 0.15169350802898407, "learning_rate": 1e-05, "loss": 2.1926, "step": 1018 }, { "epoch": 1.7151272880286135, "grad_norm": 0.13613849878311157, "learning_rate": 1e-05, "loss": 2.3394, "step": 1019 }, { "epoch": 1.7168104355144118, "grad_norm": 0.13525283336639404, "learning_rate": 1e-05, "loss": 2.2234, "step": 1020 }, { "epoch": 1.7184935830002104, "grad_norm": 0.1529736965894699, "learning_rate": 1e-05, "loss": 2.1866, "step": 1021 }, { "epoch": 1.720176730486009, "grad_norm": 0.13723863661289215, "learning_rate": 1e-05, "loss": 2.3027, "step": 1022 }, { "epoch": 1.7218598779718073, "grad_norm": 0.16251115500926971, "learning_rate": 1e-05, "loss": 2.3428, "step": 1023 }, { "epoch": 1.7235430254576056, "grad_norm": 0.1440790742635727, "learning_rate": 1e-05, "loss": 2.3298, "step": 1024 }, { "epoch": 1.7252261729434042, "grad_norm": 0.13486018776893616, "learning_rate": 1e-05, "loss": 2.3826, "step": 1025 }, { "epoch": 1.7269093204292028, "grad_norm": 0.15616028010845184, "learning_rate": 1e-05, "loss": 2.0817, "step": 1026 }, { "epoch": 1.728592467915001, "grad_norm": 0.15306299924850464, "learning_rate": 1e-05, "loss": 2.2601, "step": 1027 }, { "epoch": 1.7302756154007994, "grad_norm": 0.14421014487743378, "learning_rate": 1e-05, "loss": 2.1998, "step": 1028 }, { "epoch": 1.731958762886598, "grad_norm": 0.14438478648662567, "learning_rate": 1e-05, "loss": 2.262, "step": 1029 }, { "epoch": 1.7336419103723963, "grad_norm": 0.13325351476669312, "learning_rate": 1e-05, "loss": 2.2852, "step": 1030 }, { "epoch": 1.7353250578581947, "grad_norm": 0.14232920110225677, "learning_rate": 1e-05, "loss": 2.3147, "step": 1031 }, { "epoch": 1.7370082053439933, "grad_norm": 0.1394515186548233, "learning_rate": 1e-05, "loss": 2.2781, "step": 1032 }, { "epoch": 1.7386913528297918, "grad_norm": 0.12838682532310486, "learning_rate": 1e-05, "loss": 2.2827, "step": 1033 }, { "epoch": 1.7403745003155902, "grad_norm": 0.15612417459487915, "learning_rate": 1e-05, "loss": 2.3108, "step": 1034 }, { "epoch": 1.7420576478013885, "grad_norm": 0.14740139245986938, "learning_rate": 1e-05, "loss": 2.2412, "step": 1035 }, { "epoch": 1.743740795287187, "grad_norm": 0.1541980355978012, "learning_rate": 1e-05, "loss": 2.3156, "step": 1036 }, { "epoch": 1.7454239427729856, "grad_norm": 0.14056488871574402, "learning_rate": 1e-05, "loss": 2.1829, "step": 1037 }, { "epoch": 1.747107090258784, "grad_norm": 0.143393874168396, "learning_rate": 1e-05, "loss": 2.2717, "step": 1038 }, { "epoch": 1.7487902377445823, "grad_norm": 0.14296631515026093, "learning_rate": 1e-05, "loss": 2.342, "step": 1039 }, { "epoch": 1.7504733852303809, "grad_norm": 0.13753627240657806, "learning_rate": 1e-05, "loss": 2.324, "step": 1040 }, { "epoch": 1.7521565327161792, "grad_norm": 0.13361461460590363, "learning_rate": 1e-05, "loss": 2.3549, "step": 1041 }, { "epoch": 1.7538396802019776, "grad_norm": 0.16176526248455048, "learning_rate": 1e-05, "loss": 2.0996, "step": 1042 }, { "epoch": 1.7555228276877761, "grad_norm": 0.14512574672698975, "learning_rate": 1e-05, "loss": 2.3289, "step": 1043 }, { "epoch": 1.7572059751735747, "grad_norm": 0.14329467713832855, "learning_rate": 1e-05, "loss": 2.2429, "step": 1044 }, { "epoch": 1.758889122659373, "grad_norm": 0.1415308713912964, "learning_rate": 1e-05, "loss": 2.2976, "step": 1045 }, { "epoch": 1.7605722701451714, "grad_norm": 0.13017630577087402, "learning_rate": 1e-05, "loss": 2.3142, "step": 1046 }, { "epoch": 1.76225541763097, "grad_norm": 0.14865103363990784, "learning_rate": 1e-05, "loss": 2.2659, "step": 1047 }, { "epoch": 1.7639385651167685, "grad_norm": 0.13973674178123474, "learning_rate": 1e-05, "loss": 2.1975, "step": 1048 }, { "epoch": 1.7656217126025668, "grad_norm": 0.12378077954053879, "learning_rate": 1e-05, "loss": 2.4469, "step": 1049 }, { "epoch": 1.7673048600883652, "grad_norm": 0.13462629914283752, "learning_rate": 1e-05, "loss": 2.332, "step": 1050 }, { "epoch": 1.7689880075741637, "grad_norm": 0.14375431835651398, "learning_rate": 1e-05, "loss": 2.2834, "step": 1051 }, { "epoch": 1.770671155059962, "grad_norm": 0.1413864940404892, "learning_rate": 1e-05, "loss": 2.2769, "step": 1052 }, { "epoch": 1.7723543025457604, "grad_norm": 0.15052342414855957, "learning_rate": 1e-05, "loss": 2.2522, "step": 1053 }, { "epoch": 1.774037450031559, "grad_norm": 0.15616975724697113, "learning_rate": 1e-05, "loss": 2.1501, "step": 1054 }, { "epoch": 1.7757205975173576, "grad_norm": 0.16257071495056152, "learning_rate": 1e-05, "loss": 2.1545, "step": 1055 }, { "epoch": 1.777403745003156, "grad_norm": 0.13512100279331207, "learning_rate": 1e-05, "loss": 2.2218, "step": 1056 }, { "epoch": 1.7790868924889542, "grad_norm": 0.1581428200006485, "learning_rate": 1e-05, "loss": 2.1865, "step": 1057 }, { "epoch": 1.7807700399747528, "grad_norm": 0.13829343020915985, "learning_rate": 1e-05, "loss": 2.3337, "step": 1058 }, { "epoch": 1.7824531874605514, "grad_norm": 0.16639141738414764, "learning_rate": 1e-05, "loss": 2.2325, "step": 1059 }, { "epoch": 1.7841363349463497, "grad_norm": 0.1412006914615631, "learning_rate": 1e-05, "loss": 2.3389, "step": 1060 }, { "epoch": 1.785819482432148, "grad_norm": 0.13130658864974976, "learning_rate": 1e-05, "loss": 2.3376, "step": 1061 }, { "epoch": 1.7875026299179466, "grad_norm": 0.1495353728532791, "learning_rate": 1e-05, "loss": 2.2666, "step": 1062 }, { "epoch": 1.789185777403745, "grad_norm": 0.15077506005764008, "learning_rate": 1e-05, "loss": 2.228, "step": 1063 }, { "epoch": 1.7908689248895433, "grad_norm": 0.1426386535167694, "learning_rate": 1e-05, "loss": 2.2727, "step": 1064 }, { "epoch": 1.7925520723753419, "grad_norm": 0.14268244802951813, "learning_rate": 1e-05, "loss": 2.3643, "step": 1065 }, { "epoch": 1.7942352198611404, "grad_norm": 0.14923584461212158, "learning_rate": 1e-05, "loss": 2.333, "step": 1066 }, { "epoch": 1.7959183673469388, "grad_norm": 0.15571311116218567, "learning_rate": 1e-05, "loss": 2.3171, "step": 1067 }, { "epoch": 1.7976015148327371, "grad_norm": 0.13931907713413239, "learning_rate": 1e-05, "loss": 2.2164, "step": 1068 }, { "epoch": 1.7992846623185357, "grad_norm": 0.1513443887233734, "learning_rate": 1e-05, "loss": 2.2885, "step": 1069 }, { "epoch": 1.8009678098043342, "grad_norm": 0.14123128354549408, "learning_rate": 1e-05, "loss": 2.3517, "step": 1070 }, { "epoch": 1.8026509572901326, "grad_norm": 0.16668306291103363, "learning_rate": 1e-05, "loss": 2.1907, "step": 1071 }, { "epoch": 1.804334104775931, "grad_norm": 0.14049063622951508, "learning_rate": 1e-05, "loss": 2.4216, "step": 1072 }, { "epoch": 1.8060172522617295, "grad_norm": 0.13806495070457458, "learning_rate": 1e-05, "loss": 2.3367, "step": 1073 }, { "epoch": 1.8077003997475278, "grad_norm": 0.14562048017978668, "learning_rate": 1e-05, "loss": 2.2303, "step": 1074 }, { "epoch": 1.8093835472333262, "grad_norm": 0.16803675889968872, "learning_rate": 1e-05, "loss": 2.2404, "step": 1075 }, { "epoch": 1.8110666947191247, "grad_norm": 0.14971864223480225, "learning_rate": 1e-05, "loss": 2.1941, "step": 1076 }, { "epoch": 1.8127498422049233, "grad_norm": 0.162116140127182, "learning_rate": 1e-05, "loss": 2.2034, "step": 1077 }, { "epoch": 1.8144329896907216, "grad_norm": 0.1417408138513565, "learning_rate": 1e-05, "loss": 2.2991, "step": 1078 }, { "epoch": 1.81611613717652, "grad_norm": 0.14334024488925934, "learning_rate": 1e-05, "loss": 2.3796, "step": 1079 }, { "epoch": 1.8177992846623185, "grad_norm": 0.13600003719329834, "learning_rate": 1e-05, "loss": 2.2322, "step": 1080 }, { "epoch": 1.8194824321481171, "grad_norm": 0.1557435244321823, "learning_rate": 1e-05, "loss": 2.2151, "step": 1081 }, { "epoch": 1.8211655796339155, "grad_norm": 0.14444471895694733, "learning_rate": 1e-05, "loss": 2.2778, "step": 1082 }, { "epoch": 1.8228487271197138, "grad_norm": 0.15237338840961456, "learning_rate": 1e-05, "loss": 2.1863, "step": 1083 }, { "epoch": 1.8245318746055124, "grad_norm": 0.1488647758960724, "learning_rate": 1e-05, "loss": 2.1194, "step": 1084 }, { "epoch": 1.8262150220913107, "grad_norm": 0.14532509446144104, "learning_rate": 1e-05, "loss": 2.3018, "step": 1085 }, { "epoch": 1.827898169577109, "grad_norm": 0.1438300609588623, "learning_rate": 1e-05, "loss": 2.3542, "step": 1086 }, { "epoch": 1.8295813170629076, "grad_norm": 0.13162897527217865, "learning_rate": 1e-05, "loss": 2.3762, "step": 1087 }, { "epoch": 1.8312644645487062, "grad_norm": 0.14388734102249146, "learning_rate": 1e-05, "loss": 2.3097, "step": 1088 }, { "epoch": 1.8329476120345045, "grad_norm": 0.1633898764848709, "learning_rate": 1e-05, "loss": 2.1975, "step": 1089 }, { "epoch": 1.8346307595203029, "grad_norm": 0.14513400197029114, "learning_rate": 1e-05, "loss": 2.3562, "step": 1090 }, { "epoch": 1.8363139070061014, "grad_norm": 0.1562061607837677, "learning_rate": 1e-05, "loss": 2.2384, "step": 1091 }, { "epoch": 1.8379970544919, "grad_norm": 0.14833082258701324, "learning_rate": 1e-05, "loss": 2.199, "step": 1092 }, { "epoch": 1.8396802019776983, "grad_norm": 0.14182843267917633, "learning_rate": 1e-05, "loss": 2.2632, "step": 1093 }, { "epoch": 1.8413633494634967, "grad_norm": 0.16517210006713867, "learning_rate": 1e-05, "loss": 2.2719, "step": 1094 }, { "epoch": 1.8430464969492952, "grad_norm": 0.1563366949558258, "learning_rate": 1e-05, "loss": 2.2285, "step": 1095 }, { "epoch": 1.8447296444350936, "grad_norm": 0.1349581480026245, "learning_rate": 1e-05, "loss": 2.2998, "step": 1096 }, { "epoch": 1.846412791920892, "grad_norm": 0.14647842943668365, "learning_rate": 1e-05, "loss": 2.2588, "step": 1097 }, { "epoch": 1.8480959394066905, "grad_norm": 0.1527308076620102, "learning_rate": 1e-05, "loss": 2.1945, "step": 1098 }, { "epoch": 1.849779086892489, "grad_norm": 0.16208425164222717, "learning_rate": 1e-05, "loss": 2.1692, "step": 1099 }, { "epoch": 1.8514622343782874, "grad_norm": 0.15897248685359955, "learning_rate": 1e-05, "loss": 2.3582, "step": 1100 }, { "epoch": 1.8531453818640857, "grad_norm": 0.14687612652778625, "learning_rate": 1e-05, "loss": 2.3057, "step": 1101 }, { "epoch": 1.8548285293498843, "grad_norm": 0.1631488800048828, "learning_rate": 1e-05, "loss": 2.2521, "step": 1102 }, { "epoch": 1.8565116768356829, "grad_norm": 0.14686156809329987, "learning_rate": 1e-05, "loss": 2.313, "step": 1103 }, { "epoch": 1.8581948243214812, "grad_norm": 0.162966787815094, "learning_rate": 1e-05, "loss": 2.1968, "step": 1104 }, { "epoch": 1.8598779718072795, "grad_norm": 0.15387648344039917, "learning_rate": 1e-05, "loss": 2.3059, "step": 1105 }, { "epoch": 1.861561119293078, "grad_norm": 0.1489906907081604, "learning_rate": 1e-05, "loss": 2.2195, "step": 1106 }, { "epoch": 1.8632442667788764, "grad_norm": 0.14351260662078857, "learning_rate": 1e-05, "loss": 2.2656, "step": 1107 }, { "epoch": 1.8649274142646748, "grad_norm": 0.16010256111621857, "learning_rate": 1e-05, "loss": 2.3252, "step": 1108 }, { "epoch": 1.8666105617504734, "grad_norm": 0.14475148916244507, "learning_rate": 1e-05, "loss": 2.2878, "step": 1109 }, { "epoch": 1.868293709236272, "grad_norm": 0.14097367227077484, "learning_rate": 1e-05, "loss": 2.3716, "step": 1110 }, { "epoch": 1.8699768567220703, "grad_norm": 0.15699978172779083, "learning_rate": 1e-05, "loss": 2.1678, "step": 1111 }, { "epoch": 1.8716600042078686, "grad_norm": 0.1370065063238144, "learning_rate": 1e-05, "loss": 2.3315, "step": 1112 }, { "epoch": 1.8733431516936672, "grad_norm": 0.1498231291770935, "learning_rate": 1e-05, "loss": 2.2949, "step": 1113 }, { "epoch": 1.8750262991794657, "grad_norm": 0.13267523050308228, "learning_rate": 1e-05, "loss": 2.3535, "step": 1114 }, { "epoch": 1.876709446665264, "grad_norm": 0.1453379988670349, "learning_rate": 1e-05, "loss": 2.2791, "step": 1115 }, { "epoch": 1.8783925941510624, "grad_norm": 0.15499484539031982, "learning_rate": 1e-05, "loss": 2.2085, "step": 1116 }, { "epoch": 1.880075741636861, "grad_norm": 0.14418251812458038, "learning_rate": 1e-05, "loss": 2.2793, "step": 1117 }, { "epoch": 1.8817588891226595, "grad_norm": 0.13686548173427582, "learning_rate": 1e-05, "loss": 2.4175, "step": 1118 }, { "epoch": 1.8834420366084577, "grad_norm": 0.17202888429164886, "learning_rate": 1e-05, "loss": 2.2196, "step": 1119 }, { "epoch": 1.8851251840942562, "grad_norm": 0.1437048763036728, "learning_rate": 1e-05, "loss": 2.2688, "step": 1120 }, { "epoch": 1.8868083315800548, "grad_norm": 0.13868288695812225, "learning_rate": 1e-05, "loss": 2.2971, "step": 1121 }, { "epoch": 1.8884914790658531, "grad_norm": 0.133874773979187, "learning_rate": 1e-05, "loss": 2.3228, "step": 1122 }, { "epoch": 1.8901746265516515, "grad_norm": 0.15967018902301788, "learning_rate": 1e-05, "loss": 2.2346, "step": 1123 }, { "epoch": 1.89185777403745, "grad_norm": 0.15074019134044647, "learning_rate": 1e-05, "loss": 2.3577, "step": 1124 }, { "epoch": 1.8935409215232486, "grad_norm": 0.13931475579738617, "learning_rate": 1e-05, "loss": 2.3789, "step": 1125 }, { "epoch": 1.895224069009047, "grad_norm": 0.15354882180690765, "learning_rate": 1e-05, "loss": 2.184, "step": 1126 }, { "epoch": 1.8969072164948453, "grad_norm": 0.15907764434814453, "learning_rate": 1e-05, "loss": 2.3638, "step": 1127 }, { "epoch": 1.8985903639806438, "grad_norm": 0.13138049840927124, "learning_rate": 1e-05, "loss": 2.4543, "step": 1128 }, { "epoch": 1.9002735114664424, "grad_norm": 0.14568856358528137, "learning_rate": 1e-05, "loss": 2.3064, "step": 1129 }, { "epoch": 1.9019566589522405, "grad_norm": 0.1426182985305786, "learning_rate": 1e-05, "loss": 2.3223, "step": 1130 }, { "epoch": 1.903639806438039, "grad_norm": 0.13313454389572144, "learning_rate": 1e-05, "loss": 2.3953, "step": 1131 }, { "epoch": 1.9053229539238377, "grad_norm": 0.16987952589988708, "learning_rate": 1e-05, "loss": 2.1274, "step": 1132 }, { "epoch": 1.907006101409636, "grad_norm": 0.1408863216638565, "learning_rate": 1e-05, "loss": 2.3242, "step": 1133 }, { "epoch": 1.9086892488954343, "grad_norm": 0.14704225957393646, "learning_rate": 1e-05, "loss": 2.3687, "step": 1134 }, { "epoch": 1.910372396381233, "grad_norm": 0.18410103023052216, "learning_rate": 1e-05, "loss": 2.1222, "step": 1135 }, { "epoch": 1.9120555438670315, "grad_norm": 0.13889069855213165, "learning_rate": 1e-05, "loss": 2.3165, "step": 1136 }, { "epoch": 1.9137386913528298, "grad_norm": 0.1532329022884369, "learning_rate": 1e-05, "loss": 2.2913, "step": 1137 }, { "epoch": 1.9154218388386282, "grad_norm": 0.14806988835334778, "learning_rate": 1e-05, "loss": 2.2239, "step": 1138 }, { "epoch": 1.9171049863244267, "grad_norm": 0.14964371919631958, "learning_rate": 1e-05, "loss": 2.2639, "step": 1139 }, { "epoch": 1.9187881338102253, "grad_norm": 0.15137715637683868, "learning_rate": 1e-05, "loss": 2.3096, "step": 1140 }, { "epoch": 1.9204712812960234, "grad_norm": 0.15892736613750458, "learning_rate": 1e-05, "loss": 2.3163, "step": 1141 }, { "epoch": 1.922154428781822, "grad_norm": 0.15544387698173523, "learning_rate": 1e-05, "loss": 2.1825, "step": 1142 }, { "epoch": 1.9238375762676205, "grad_norm": 0.14712852239608765, "learning_rate": 1e-05, "loss": 2.2659, "step": 1143 }, { "epoch": 1.9255207237534189, "grad_norm": 0.1436305195093155, "learning_rate": 1e-05, "loss": 2.3101, "step": 1144 }, { "epoch": 1.9272038712392172, "grad_norm": 0.16642406582832336, "learning_rate": 1e-05, "loss": 2.2156, "step": 1145 }, { "epoch": 1.9288870187250158, "grad_norm": 0.16517338156700134, "learning_rate": 1e-05, "loss": 2.2561, "step": 1146 }, { "epoch": 1.9305701662108143, "grad_norm": 0.1337500959634781, "learning_rate": 1e-05, "loss": 2.3818, "step": 1147 }, { "epoch": 1.9322533136966127, "grad_norm": 0.15977586805820465, "learning_rate": 1e-05, "loss": 2.2377, "step": 1148 }, { "epoch": 1.933936461182411, "grad_norm": 0.14951424300670624, "learning_rate": 1e-05, "loss": 2.2269, "step": 1149 }, { "epoch": 1.9356196086682096, "grad_norm": 0.13450993597507477, "learning_rate": 1e-05, "loss": 2.3442, "step": 1150 }, { "epoch": 1.9373027561540082, "grad_norm": 0.16469308733940125, "learning_rate": 1e-05, "loss": 2.3123, "step": 1151 }, { "epoch": 1.9389859036398063, "grad_norm": 0.14135532081127167, "learning_rate": 1e-05, "loss": 2.387, "step": 1152 }, { "epoch": 1.9406690511256048, "grad_norm": 0.13864876329898834, "learning_rate": 1e-05, "loss": 2.2661, "step": 1153 }, { "epoch": 1.9423521986114034, "grad_norm": 0.16291983425617218, "learning_rate": 1e-05, "loss": 2.2617, "step": 1154 }, { "epoch": 1.9440353460972017, "grad_norm": 0.13341820240020752, "learning_rate": 1e-05, "loss": 2.4299, "step": 1155 }, { "epoch": 1.945718493583, "grad_norm": 0.15701517462730408, "learning_rate": 1e-05, "loss": 2.2211, "step": 1156 }, { "epoch": 1.9474016410687987, "grad_norm": 0.16075365245342255, "learning_rate": 1e-05, "loss": 2.1801, "step": 1157 }, { "epoch": 1.9490847885545972, "grad_norm": 0.15631234645843506, "learning_rate": 1e-05, "loss": 2.2152, "step": 1158 }, { "epoch": 1.9507679360403956, "grad_norm": 0.16927126049995422, "learning_rate": 1e-05, "loss": 2.1776, "step": 1159 }, { "epoch": 1.952451083526194, "grad_norm": 0.15192179381847382, "learning_rate": 1e-05, "loss": 2.2812, "step": 1160 }, { "epoch": 1.9541342310119925, "grad_norm": 0.145833820104599, "learning_rate": 1e-05, "loss": 2.3124, "step": 1161 }, { "epoch": 1.955817378497791, "grad_norm": 0.16952313482761383, "learning_rate": 1e-05, "loss": 2.1085, "step": 1162 }, { "epoch": 1.9575005259835891, "grad_norm": 0.1629469394683838, "learning_rate": 1e-05, "loss": 2.2267, "step": 1163 }, { "epoch": 1.9591836734693877, "grad_norm": 0.16672489047050476, "learning_rate": 1e-05, "loss": 2.3783, "step": 1164 }, { "epoch": 1.9608668209551863, "grad_norm": 0.14810308814048767, "learning_rate": 1e-05, "loss": 2.3723, "step": 1165 }, { "epoch": 1.9625499684409846, "grad_norm": 0.1435479074716568, "learning_rate": 1e-05, "loss": 2.2615, "step": 1166 }, { "epoch": 1.964233115926783, "grad_norm": 0.149140864610672, "learning_rate": 1e-05, "loss": 2.2134, "step": 1167 }, { "epoch": 1.9659162634125815, "grad_norm": 0.17785809934139252, "learning_rate": 1e-05, "loss": 2.1993, "step": 1168 }, { "epoch": 1.96759941089838, "grad_norm": 0.15931861102581024, "learning_rate": 1e-05, "loss": 2.1807, "step": 1169 }, { "epoch": 1.9692825583841784, "grad_norm": 0.16015268862247467, "learning_rate": 1e-05, "loss": 2.2737, "step": 1170 }, { "epoch": 1.9709657058699768, "grad_norm": 0.14189362525939941, "learning_rate": 1e-05, "loss": 2.3416, "step": 1171 }, { "epoch": 1.9726488533557753, "grad_norm": 0.1655077338218689, "learning_rate": 1e-05, "loss": 2.184, "step": 1172 }, { "epoch": 1.974332000841574, "grad_norm": 0.17838408052921295, "learning_rate": 1e-05, "loss": 2.2466, "step": 1173 }, { "epoch": 1.9760151483273722, "grad_norm": 0.16605247557163239, "learning_rate": 1e-05, "loss": 2.2019, "step": 1174 }, { "epoch": 1.9776982958131706, "grad_norm": 0.15444627404212952, "learning_rate": 1e-05, "loss": 2.2382, "step": 1175 }, { "epoch": 1.9793814432989691, "grad_norm": 0.15730591118335724, "learning_rate": 1e-05, "loss": 2.3335, "step": 1176 }, { "epoch": 1.9810645907847675, "grad_norm": 0.17332051694393158, "learning_rate": 1e-05, "loss": 2.17, "step": 1177 }, { "epoch": 1.9827477382705658, "grad_norm": 0.15129022300243378, "learning_rate": 1e-05, "loss": 2.2584, "step": 1178 }, { "epoch": 1.9844308857563644, "grad_norm": 0.16302135586738586, "learning_rate": 1e-05, "loss": 2.1904, "step": 1179 }, { "epoch": 1.986114033242163, "grad_norm": 0.14117322862148285, "learning_rate": 1e-05, "loss": 2.3611, "step": 1180 }, { "epoch": 1.9877971807279613, "grad_norm": 0.14415599405765533, "learning_rate": 1e-05, "loss": 2.3503, "step": 1181 }, { "epoch": 1.9894803282137596, "grad_norm": 0.15894141793251038, "learning_rate": 1e-05, "loss": 2.2253, "step": 1182 }, { "epoch": 1.9911634756995582, "grad_norm": 0.15063215792179108, "learning_rate": 1e-05, "loss": 2.303, "step": 1183 }, { "epoch": 1.9928466231853568, "grad_norm": 0.15843670070171356, "learning_rate": 1e-05, "loss": 2.2959, "step": 1184 }, { "epoch": 1.9945297706711551, "grad_norm": 0.1457902193069458, "learning_rate": 1e-05, "loss": 2.3396, "step": 1185 }, { "epoch": 1.9962129181569535, "grad_norm": 0.1694038361310959, "learning_rate": 1e-05, "loss": 2.3169, "step": 1186 }, { "epoch": 1.997896065642752, "grad_norm": 0.16121593117713928, "learning_rate": 1e-05, "loss": 2.2754, "step": 1187 }, { "epoch": 1.9995792131285504, "grad_norm": 0.16226674616336823, "learning_rate": 1e-05, "loss": 2.2498, "step": 1188 }, { "epoch": 1.9995792131285504, "step": 1188, "total_flos": 2.494777289795961e+18, "train_loss": 2.3332799018834174, "train_runtime": 81586.2049, "train_samples_per_second": 0.932, "train_steps_per_second": 0.015 } ], "logging_steps": 1.0, "max_steps": 1188, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.494777289795961e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }