{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 5763, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005205622071837585, "grad_norm": 6.376532092079771, "learning_rate": 2.890173410404624e-07, "loss": 0.5233, "step": 1 }, { "epoch": 0.001041124414367517, "grad_norm": 6.4124152514036075, "learning_rate": 5.780346820809248e-07, "loss": 0.5204, "step": 2 }, { "epoch": 0.0015616866215512754, "grad_norm": 6.458779162822453, "learning_rate": 8.670520231213873e-07, "loss": 0.529, "step": 3 }, { "epoch": 0.002082248828735034, "grad_norm": 6.285519837284632, "learning_rate": 1.1560693641618497e-06, "loss": 0.5214, "step": 4 }, { "epoch": 0.0026028110359187923, "grad_norm": 5.796645692951652, "learning_rate": 1.4450867052023122e-06, "loss": 0.5185, "step": 5 }, { "epoch": 0.0031233732431025507, "grad_norm": 3.9879840763701693, "learning_rate": 1.7341040462427746e-06, "loss": 0.4823, "step": 6 }, { "epoch": 0.003643935450286309, "grad_norm": 3.6346238149034646, "learning_rate": 2.023121387283237e-06, "loss": 0.4531, "step": 7 }, { "epoch": 0.004164497657470068, "grad_norm": 3.3536127160309777, "learning_rate": 2.3121387283236993e-06, "loss": 0.427, "step": 8 }, { "epoch": 0.004685059864653826, "grad_norm": 2.7703984048593324, "learning_rate": 2.601156069364162e-06, "loss": 0.3954, "step": 9 }, { "epoch": 0.0052056220718375845, "grad_norm": 2.6269808449331644, "learning_rate": 2.8901734104046244e-06, "loss": 0.3784, "step": 10 }, { "epoch": 0.005726184279021343, "grad_norm": 1.7055430768828044, "learning_rate": 3.1791907514450866e-06, "loss": 0.3593, "step": 11 }, { "epoch": 0.006246746486205101, "grad_norm": 2.517105495712176, "learning_rate": 3.468208092485549e-06, "loss": 0.3521, "step": 12 }, { "epoch": 0.00676730869338886, "grad_norm": 3.195572501235165, "learning_rate": 3.757225433526012e-06, "loss": 0.3539, "step": 13 }, { "epoch": 0.007287870900572618, "grad_norm": 2.323008206835536, "learning_rate": 4.046242774566474e-06, "loss": 0.3425, "step": 14 }, { "epoch": 0.007808433107756377, "grad_norm": 1.5310630307639206, "learning_rate": 4.3352601156069365e-06, "loss": 0.3317, "step": 15 }, { "epoch": 0.008328995314940135, "grad_norm": 1.365492849675652, "learning_rate": 4.624277456647399e-06, "loss": 0.3148, "step": 16 }, { "epoch": 0.008849557522123894, "grad_norm": 1.4095650207655759, "learning_rate": 4.913294797687862e-06, "loss": 0.3157, "step": 17 }, { "epoch": 0.009370119729307652, "grad_norm": 1.5215580885689424, "learning_rate": 5.202312138728324e-06, "loss": 0.2976, "step": 18 }, { "epoch": 0.00989068193649141, "grad_norm": 1.2318061379322545, "learning_rate": 5.491329479768787e-06, "loss": 0.3052, "step": 19 }, { "epoch": 0.010411244143675169, "grad_norm": 1.1045087369383553, "learning_rate": 5.780346820809249e-06, "loss": 0.2965, "step": 20 }, { "epoch": 0.010931806350858927, "grad_norm": 1.1036987329271282, "learning_rate": 6.069364161849711e-06, "loss": 0.2897, "step": 21 }, { "epoch": 0.011452368558042686, "grad_norm": 1.0423572850985787, "learning_rate": 6.358381502890173e-06, "loss": 0.2845, "step": 22 }, { "epoch": 0.011972930765226444, "grad_norm": 0.9845462775484162, "learning_rate": 6.647398843930635e-06, "loss": 0.268, "step": 23 }, { "epoch": 0.012493492972410203, "grad_norm": 0.9205758761394418, "learning_rate": 6.936416184971098e-06, "loss": 0.2728, "step": 24 }, { "epoch": 0.013014055179593961, "grad_norm": 0.936483616234927, "learning_rate": 7.225433526011561e-06, "loss": 0.2594, "step": 25 }, { "epoch": 0.01353461738677772, "grad_norm": 0.8503591857964459, "learning_rate": 7.514450867052024e-06, "loss": 0.2628, "step": 26 }, { "epoch": 0.014055179593961478, "grad_norm": 0.8672336072786627, "learning_rate": 7.803468208092486e-06, "loss": 0.2555, "step": 27 }, { "epoch": 0.014575741801145237, "grad_norm": 0.9212778120486387, "learning_rate": 8.092485549132949e-06, "loss": 0.2547, "step": 28 }, { "epoch": 0.015096304008328995, "grad_norm": 1.0623750925591287, "learning_rate": 8.38150289017341e-06, "loss": 0.2518, "step": 29 }, { "epoch": 0.015616866215512754, "grad_norm": 1.0990772755341645, "learning_rate": 8.670520231213873e-06, "loss": 0.2542, "step": 30 }, { "epoch": 0.016137428422696512, "grad_norm": 0.7201508676237249, "learning_rate": 8.959537572254335e-06, "loss": 0.2411, "step": 31 }, { "epoch": 0.01665799062988027, "grad_norm": 0.8464593786312, "learning_rate": 9.248554913294797e-06, "loss": 0.2453, "step": 32 }, { "epoch": 0.01717855283706403, "grad_norm": 0.8038877087534331, "learning_rate": 9.53757225433526e-06, "loss": 0.2358, "step": 33 }, { "epoch": 0.017699115044247787, "grad_norm": 1.0702814348205445, "learning_rate": 9.826589595375723e-06, "loss": 0.2375, "step": 34 }, { "epoch": 0.018219677251431546, "grad_norm": 1.425305836752578, "learning_rate": 1.0115606936416185e-05, "loss": 0.2344, "step": 35 }, { "epoch": 0.018740239458615304, "grad_norm": 0.9940426565737253, "learning_rate": 1.0404624277456647e-05, "loss": 0.2388, "step": 36 }, { "epoch": 0.019260801665799063, "grad_norm": 0.7934472038238055, "learning_rate": 1.0693641618497111e-05, "loss": 0.2352, "step": 37 }, { "epoch": 0.01978136387298282, "grad_norm": 1.06277101439209, "learning_rate": 1.0982658959537573e-05, "loss": 0.2382, "step": 38 }, { "epoch": 0.02030192608016658, "grad_norm": 1.2617518893593866, "learning_rate": 1.1271676300578036e-05, "loss": 0.2299, "step": 39 }, { "epoch": 0.020822488287350338, "grad_norm": 1.0241590169095665, "learning_rate": 1.1560693641618498e-05, "loss": 0.2341, "step": 40 }, { "epoch": 0.021343050494534097, "grad_norm": 0.7725404532271828, "learning_rate": 1.184971098265896e-05, "loss": 0.2361, "step": 41 }, { "epoch": 0.021863612701717855, "grad_norm": 0.8376692345275107, "learning_rate": 1.2138728323699422e-05, "loss": 0.2362, "step": 42 }, { "epoch": 0.022384174908901613, "grad_norm": 1.0428393775559166, "learning_rate": 1.2427745664739884e-05, "loss": 0.2338, "step": 43 }, { "epoch": 0.022904737116085372, "grad_norm": 0.7784715293287426, "learning_rate": 1.2716763005780346e-05, "loss": 0.2316, "step": 44 }, { "epoch": 0.02342529932326913, "grad_norm": 0.9048629801576277, "learning_rate": 1.3005780346820809e-05, "loss": 0.2253, "step": 45 }, { "epoch": 0.02394586153045289, "grad_norm": 1.1048673864810445, "learning_rate": 1.329479768786127e-05, "loss": 0.229, "step": 46 }, { "epoch": 0.024466423737636647, "grad_norm": 0.7942075601745159, "learning_rate": 1.3583815028901733e-05, "loss": 0.2285, "step": 47 }, { "epoch": 0.024986985944820406, "grad_norm": 0.9826389811914579, "learning_rate": 1.3872832369942197e-05, "loss": 0.2276, "step": 48 }, { "epoch": 0.025507548152004164, "grad_norm": 1.6396773861852498, "learning_rate": 1.416184971098266e-05, "loss": 0.2242, "step": 49 }, { "epoch": 0.026028110359187923, "grad_norm": 1.0452409902447666, "learning_rate": 1.4450867052023123e-05, "loss": 0.2281, "step": 50 }, { "epoch": 0.02654867256637168, "grad_norm": 1.0756275987207078, "learning_rate": 1.4739884393063585e-05, "loss": 0.2277, "step": 51 }, { "epoch": 0.02706923477355544, "grad_norm": 0.8991098023314276, "learning_rate": 1.5028901734104049e-05, "loss": 0.2172, "step": 52 }, { "epoch": 0.027589796980739198, "grad_norm": 0.9582658807979682, "learning_rate": 1.531791907514451e-05, "loss": 0.2178, "step": 53 }, { "epoch": 0.028110359187922956, "grad_norm": 0.9911148408927335, "learning_rate": 1.5606936416184973e-05, "loss": 0.2341, "step": 54 }, { "epoch": 0.028630921395106715, "grad_norm": 1.3180692766328292, "learning_rate": 1.5895953757225435e-05, "loss": 0.2195, "step": 55 }, { "epoch": 0.029151483602290473, "grad_norm": 1.486871730799441, "learning_rate": 1.6184971098265897e-05, "loss": 0.2247, "step": 56 }, { "epoch": 0.029672045809474232, "grad_norm": 0.8943418358679089, "learning_rate": 1.647398843930636e-05, "loss": 0.2208, "step": 57 }, { "epoch": 0.03019260801665799, "grad_norm": 0.9549840398600492, "learning_rate": 1.676300578034682e-05, "loss": 0.2303, "step": 58 }, { "epoch": 0.03071317022384175, "grad_norm": 1.6440802970143786, "learning_rate": 1.7052023121387284e-05, "loss": 0.2189, "step": 59 }, { "epoch": 0.031233732431025507, "grad_norm": 0.8692681682580999, "learning_rate": 1.7341040462427746e-05, "loss": 0.2242, "step": 60 }, { "epoch": 0.031754294638209266, "grad_norm": 1.0685256597249488, "learning_rate": 1.7630057803468208e-05, "loss": 0.2124, "step": 61 }, { "epoch": 0.032274856845393024, "grad_norm": 1.401339524265495, "learning_rate": 1.791907514450867e-05, "loss": 0.222, "step": 62 }, { "epoch": 0.03279541905257678, "grad_norm": 0.8600585202663065, "learning_rate": 1.8208092485549132e-05, "loss": 0.2138, "step": 63 }, { "epoch": 0.03331598125976054, "grad_norm": 1.2477374949025173, "learning_rate": 1.8497109826589594e-05, "loss": 0.2282, "step": 64 }, { "epoch": 0.0338365434669443, "grad_norm": 1.097660347997359, "learning_rate": 1.8786127167630057e-05, "loss": 0.217, "step": 65 }, { "epoch": 0.03435710567412806, "grad_norm": 1.2951250009085689, "learning_rate": 1.907514450867052e-05, "loss": 0.2199, "step": 66 }, { "epoch": 0.034877667881311816, "grad_norm": 0.826872263396979, "learning_rate": 1.936416184971098e-05, "loss": 0.2134, "step": 67 }, { "epoch": 0.035398230088495575, "grad_norm": 1.326379381993655, "learning_rate": 1.9653179190751446e-05, "loss": 0.2253, "step": 68 }, { "epoch": 0.03591879229567933, "grad_norm": 0.8549714338599687, "learning_rate": 1.994219653179191e-05, "loss": 0.2118, "step": 69 }, { "epoch": 0.03643935450286309, "grad_norm": 1.1468222920831441, "learning_rate": 2.023121387283237e-05, "loss": 0.225, "step": 70 }, { "epoch": 0.03695991671004685, "grad_norm": 0.8454115307212339, "learning_rate": 2.0520231213872833e-05, "loss": 0.2165, "step": 71 }, { "epoch": 0.03748047891723061, "grad_norm": 0.8498885949939733, "learning_rate": 2.0809248554913295e-05, "loss": 0.2098, "step": 72 }, { "epoch": 0.03800104112441437, "grad_norm": 0.8214398654369537, "learning_rate": 2.1098265895953757e-05, "loss": 0.2183, "step": 73 }, { "epoch": 0.038521603331598125, "grad_norm": 1.0466964698294483, "learning_rate": 2.1387283236994223e-05, "loss": 0.2193, "step": 74 }, { "epoch": 0.039042165538781884, "grad_norm": 1.1184070427085229, "learning_rate": 2.1676300578034685e-05, "loss": 0.2187, "step": 75 }, { "epoch": 0.03956272774596564, "grad_norm": 1.040775059918436, "learning_rate": 2.1965317919075147e-05, "loss": 0.2194, "step": 76 }, { "epoch": 0.0400832899531494, "grad_norm": 1.2038697374050564, "learning_rate": 2.225433526011561e-05, "loss": 0.2184, "step": 77 }, { "epoch": 0.04060385216033316, "grad_norm": 0.8663131100942328, "learning_rate": 2.254335260115607e-05, "loss": 0.2107, "step": 78 }, { "epoch": 0.04112441436751692, "grad_norm": 0.7771080010057934, "learning_rate": 2.2832369942196533e-05, "loss": 0.2091, "step": 79 }, { "epoch": 0.041644976574700676, "grad_norm": 0.9075557788861287, "learning_rate": 2.3121387283236996e-05, "loss": 0.213, "step": 80 }, { "epoch": 0.042165538781884435, "grad_norm": 1.1814000831436555, "learning_rate": 2.3410404624277458e-05, "loss": 0.2111, "step": 81 }, { "epoch": 0.04268610098906819, "grad_norm": 0.808697178051325, "learning_rate": 2.369942196531792e-05, "loss": 0.2079, "step": 82 }, { "epoch": 0.04320666319625195, "grad_norm": 0.9213395387224003, "learning_rate": 2.3988439306358382e-05, "loss": 0.213, "step": 83 }, { "epoch": 0.04372722540343571, "grad_norm": 0.8345054388332048, "learning_rate": 2.4277456647398844e-05, "loss": 0.2099, "step": 84 }, { "epoch": 0.04424778761061947, "grad_norm": 0.8849593943677236, "learning_rate": 2.4566473988439306e-05, "loss": 0.222, "step": 85 }, { "epoch": 0.04476834981780323, "grad_norm": 0.8478338478744712, "learning_rate": 2.485549132947977e-05, "loss": 0.2203, "step": 86 }, { "epoch": 0.045288912024986985, "grad_norm": 0.9085767079232848, "learning_rate": 2.5144508670520234e-05, "loss": 0.2134, "step": 87 }, { "epoch": 0.045809474232170744, "grad_norm": 0.9152260021422175, "learning_rate": 2.5433526011560693e-05, "loss": 0.2212, "step": 88 }, { "epoch": 0.0463300364393545, "grad_norm": 0.7878287625575658, "learning_rate": 2.5722543352601158e-05, "loss": 0.2088, "step": 89 }, { "epoch": 0.04685059864653826, "grad_norm": 0.9443429001103073, "learning_rate": 2.6011560693641617e-05, "loss": 0.2126, "step": 90 }, { "epoch": 0.04737116085372202, "grad_norm": 1.0473598485121471, "learning_rate": 2.6300578034682083e-05, "loss": 0.2211, "step": 91 }, { "epoch": 0.04789172306090578, "grad_norm": 0.8361641199343858, "learning_rate": 2.658959537572254e-05, "loss": 0.2128, "step": 92 }, { "epoch": 0.048412285268089536, "grad_norm": 0.7931786259368718, "learning_rate": 2.6878612716763007e-05, "loss": 0.2166, "step": 93 }, { "epoch": 0.048932847475273294, "grad_norm": 0.8124807154897088, "learning_rate": 2.7167630057803466e-05, "loss": 0.2122, "step": 94 }, { "epoch": 0.04945340968245705, "grad_norm": 0.7637460807323695, "learning_rate": 2.745664739884393e-05, "loss": 0.2157, "step": 95 }, { "epoch": 0.04997397188964081, "grad_norm": 1.0197675734431848, "learning_rate": 2.7745664739884393e-05, "loss": 0.2202, "step": 96 }, { "epoch": 0.05049453409682457, "grad_norm": 0.7718672139219506, "learning_rate": 2.8034682080924855e-05, "loss": 0.2157, "step": 97 }, { "epoch": 0.05101509630400833, "grad_norm": 0.6967370402605544, "learning_rate": 2.832369942196532e-05, "loss": 0.2115, "step": 98 }, { "epoch": 0.05153565851119209, "grad_norm": 0.8860212093442391, "learning_rate": 2.861271676300578e-05, "loss": 0.2164, "step": 99 }, { "epoch": 0.052056220718375845, "grad_norm": 0.806336638090273, "learning_rate": 2.8901734104046245e-05, "loss": 0.2066, "step": 100 }, { "epoch": 0.052576782925559604, "grad_norm": 0.8839202966628119, "learning_rate": 2.9190751445086707e-05, "loss": 0.2133, "step": 101 }, { "epoch": 0.05309734513274336, "grad_norm": 1.0715053048576961, "learning_rate": 2.947976878612717e-05, "loss": 0.2094, "step": 102 }, { "epoch": 0.05361790733992712, "grad_norm": 0.9923778641494126, "learning_rate": 2.9768786127167632e-05, "loss": 0.2147, "step": 103 }, { "epoch": 0.05413846954711088, "grad_norm": 0.996319311639693, "learning_rate": 3.0057803468208097e-05, "loss": 0.2119, "step": 104 }, { "epoch": 0.05465903175429464, "grad_norm": 0.8155226884859121, "learning_rate": 3.0346820809248556e-05, "loss": 0.2078, "step": 105 }, { "epoch": 0.055179593961478396, "grad_norm": 0.7905094789778886, "learning_rate": 3.063583815028902e-05, "loss": 0.2096, "step": 106 }, { "epoch": 0.055700156168662154, "grad_norm": 0.8773860436858838, "learning_rate": 3.092485549132948e-05, "loss": 0.2167, "step": 107 }, { "epoch": 0.05622071837584591, "grad_norm": 0.8873612182478008, "learning_rate": 3.1213872832369946e-05, "loss": 0.2147, "step": 108 }, { "epoch": 0.05674128058302967, "grad_norm": 0.8093261489985791, "learning_rate": 3.1502890173410405e-05, "loss": 0.2147, "step": 109 }, { "epoch": 0.05726184279021343, "grad_norm": 0.7069615535722936, "learning_rate": 3.179190751445087e-05, "loss": 0.2175, "step": 110 }, { "epoch": 0.05778240499739719, "grad_norm": 0.7496643468031777, "learning_rate": 3.208092485549133e-05, "loss": 0.2133, "step": 111 }, { "epoch": 0.05830296720458095, "grad_norm": 0.6778780641394953, "learning_rate": 3.2369942196531794e-05, "loss": 0.2142, "step": 112 }, { "epoch": 0.058823529411764705, "grad_norm": 0.8038377581955503, "learning_rate": 3.265895953757225e-05, "loss": 0.2081, "step": 113 }, { "epoch": 0.059344091618948464, "grad_norm": 0.8262906806241153, "learning_rate": 3.294797687861272e-05, "loss": 0.2198, "step": 114 }, { "epoch": 0.05986465382613222, "grad_norm": 0.8225060449744769, "learning_rate": 3.323699421965318e-05, "loss": 0.2085, "step": 115 }, { "epoch": 0.06038521603331598, "grad_norm": 0.9474684268193776, "learning_rate": 3.352601156069364e-05, "loss": 0.2141, "step": 116 }, { "epoch": 0.06090577824049974, "grad_norm": 0.9490625592067119, "learning_rate": 3.381502890173411e-05, "loss": 0.2116, "step": 117 }, { "epoch": 0.0614263404476835, "grad_norm": 1.0111096680389842, "learning_rate": 3.410404624277457e-05, "loss": 0.2199, "step": 118 }, { "epoch": 0.061946902654867256, "grad_norm": 0.9650260633005836, "learning_rate": 3.439306358381503e-05, "loss": 0.2126, "step": 119 }, { "epoch": 0.062467464862051014, "grad_norm": 0.6980785653915276, "learning_rate": 3.468208092485549e-05, "loss": 0.2099, "step": 120 }, { "epoch": 0.06298802706923477, "grad_norm": 0.8534007635131398, "learning_rate": 3.497109826589596e-05, "loss": 0.2193, "step": 121 }, { "epoch": 0.06350858927641853, "grad_norm": 0.8531206959278465, "learning_rate": 3.5260115606936416e-05, "loss": 0.2121, "step": 122 }, { "epoch": 0.06402915148360229, "grad_norm": 0.8548017488318986, "learning_rate": 3.554913294797688e-05, "loss": 0.2228, "step": 123 }, { "epoch": 0.06454971369078605, "grad_norm": 0.6612943150792601, "learning_rate": 3.583815028901734e-05, "loss": 0.2103, "step": 124 }, { "epoch": 0.0650702758979698, "grad_norm": 0.7715488990577469, "learning_rate": 3.6127167630057806e-05, "loss": 0.2035, "step": 125 }, { "epoch": 0.06559083810515356, "grad_norm": 0.8671638801555337, "learning_rate": 3.6416184971098265e-05, "loss": 0.2152, "step": 126 }, { "epoch": 0.06611140031233732, "grad_norm": 0.9914635629797826, "learning_rate": 3.670520231213873e-05, "loss": 0.2162, "step": 127 }, { "epoch": 0.06663196251952108, "grad_norm": 0.979896089844123, "learning_rate": 3.699421965317919e-05, "loss": 0.2161, "step": 128 }, { "epoch": 0.06715252472670484, "grad_norm": 0.6633992741028703, "learning_rate": 3.7283236994219654e-05, "loss": 0.2121, "step": 129 }, { "epoch": 0.0676730869338886, "grad_norm": 0.7005104328408702, "learning_rate": 3.757225433526011e-05, "loss": 0.2068, "step": 130 }, { "epoch": 0.06819364914107236, "grad_norm": 0.7770874448510542, "learning_rate": 3.786127167630058e-05, "loss": 0.2148, "step": 131 }, { "epoch": 0.06871421134825612, "grad_norm": 0.8338865623089665, "learning_rate": 3.815028901734104e-05, "loss": 0.2143, "step": 132 }, { "epoch": 0.06923477355543987, "grad_norm": 0.7816413255198202, "learning_rate": 3.84393063583815e-05, "loss": 0.2189, "step": 133 }, { "epoch": 0.06975533576262363, "grad_norm": 0.65556462363938, "learning_rate": 3.872832369942196e-05, "loss": 0.2079, "step": 134 }, { "epoch": 0.07027589796980739, "grad_norm": 0.7701568466848703, "learning_rate": 3.901734104046243e-05, "loss": 0.2093, "step": 135 }, { "epoch": 0.07079646017699115, "grad_norm": 0.832197605955844, "learning_rate": 3.930635838150289e-05, "loss": 0.2131, "step": 136 }, { "epoch": 0.07131702238417491, "grad_norm": 0.8404782199225694, "learning_rate": 3.959537572254335e-05, "loss": 0.2183, "step": 137 }, { "epoch": 0.07183758459135867, "grad_norm": 0.6621631106005148, "learning_rate": 3.988439306358382e-05, "loss": 0.2176, "step": 138 }, { "epoch": 0.07235814679854242, "grad_norm": 0.800987879800019, "learning_rate": 4.0173410404624276e-05, "loss": 0.2068, "step": 139 }, { "epoch": 0.07287870900572618, "grad_norm": 0.8086309231367268, "learning_rate": 4.046242774566474e-05, "loss": 0.2149, "step": 140 }, { "epoch": 0.07339927121290994, "grad_norm": 0.6390918137014469, "learning_rate": 4.07514450867052e-05, "loss": 0.2067, "step": 141 }, { "epoch": 0.0739198334200937, "grad_norm": 0.6208193264026112, "learning_rate": 4.1040462427745666e-05, "loss": 0.2146, "step": 142 }, { "epoch": 0.07444039562727746, "grad_norm": 0.6102728010873583, "learning_rate": 4.132947976878613e-05, "loss": 0.2136, "step": 143 }, { "epoch": 0.07496095783446122, "grad_norm": 0.7098434352977124, "learning_rate": 4.161849710982659e-05, "loss": 0.2124, "step": 144 }, { "epoch": 0.07548152004164498, "grad_norm": 0.7136938829382273, "learning_rate": 4.1907514450867055e-05, "loss": 0.2176, "step": 145 }, { "epoch": 0.07600208224882873, "grad_norm": 0.6206135947118576, "learning_rate": 4.2196531791907514e-05, "loss": 0.2128, "step": 146 }, { "epoch": 0.07652264445601249, "grad_norm": 0.6662245815894154, "learning_rate": 4.248554913294798e-05, "loss": 0.2096, "step": 147 }, { "epoch": 0.07704320666319625, "grad_norm": 0.6047404577832085, "learning_rate": 4.2774566473988445e-05, "loss": 0.2067, "step": 148 }, { "epoch": 0.07756376887038001, "grad_norm": 0.6072140153823105, "learning_rate": 4.3063583815028904e-05, "loss": 0.2173, "step": 149 }, { "epoch": 0.07808433107756377, "grad_norm": 0.6556745346884483, "learning_rate": 4.335260115606937e-05, "loss": 0.214, "step": 150 }, { "epoch": 0.07860489328474753, "grad_norm": 0.593639230209489, "learning_rate": 4.364161849710983e-05, "loss": 0.2088, "step": 151 }, { "epoch": 0.07912545549193128, "grad_norm": 0.6371994241596302, "learning_rate": 4.3930635838150294e-05, "loss": 0.2046, "step": 152 }, { "epoch": 0.07964601769911504, "grad_norm": 0.638999242172759, "learning_rate": 4.421965317919075e-05, "loss": 0.2161, "step": 153 }, { "epoch": 0.0801665799062988, "grad_norm": 0.5754915699358542, "learning_rate": 4.450867052023122e-05, "loss": 0.2057, "step": 154 }, { "epoch": 0.08068714211348256, "grad_norm": 0.5766300270893668, "learning_rate": 4.4797687861271684e-05, "loss": 0.2087, "step": 155 }, { "epoch": 0.08120770432066632, "grad_norm": 0.6651018920554423, "learning_rate": 4.508670520231214e-05, "loss": 0.2111, "step": 156 }, { "epoch": 0.08172826652785008, "grad_norm": 0.6875773025621368, "learning_rate": 4.537572254335261e-05, "loss": 0.2168, "step": 157 }, { "epoch": 0.08224882873503384, "grad_norm": 0.6150201790012484, "learning_rate": 4.566473988439307e-05, "loss": 0.2141, "step": 158 }, { "epoch": 0.0827693909422176, "grad_norm": 0.6599147042410466, "learning_rate": 4.595375722543353e-05, "loss": 0.2153, "step": 159 }, { "epoch": 0.08328995314940135, "grad_norm": 0.7633380008679602, "learning_rate": 4.624277456647399e-05, "loss": 0.2118, "step": 160 }, { "epoch": 0.08381051535658511, "grad_norm": 0.7353603486570877, "learning_rate": 4.653179190751446e-05, "loss": 0.2125, "step": 161 }, { "epoch": 0.08433107756376887, "grad_norm": 0.7963914788392694, "learning_rate": 4.6820809248554915e-05, "loss": 0.2177, "step": 162 }, { "epoch": 0.08485163977095263, "grad_norm": 0.7816123652355957, "learning_rate": 4.710982658959538e-05, "loss": 0.2113, "step": 163 }, { "epoch": 0.08537220197813639, "grad_norm": 0.673743870011103, "learning_rate": 4.739884393063584e-05, "loss": 0.215, "step": 164 }, { "epoch": 0.08589276418532014, "grad_norm": 0.5227320089849515, "learning_rate": 4.7687861271676305e-05, "loss": 0.2103, "step": 165 }, { "epoch": 0.0864133263925039, "grad_norm": 0.678198278022982, "learning_rate": 4.7976878612716764e-05, "loss": 0.2193, "step": 166 }, { "epoch": 0.08693388859968766, "grad_norm": 0.6374039379559228, "learning_rate": 4.826589595375723e-05, "loss": 0.2167, "step": 167 }, { "epoch": 0.08745445080687142, "grad_norm": 0.5577509103478314, "learning_rate": 4.855491329479769e-05, "loss": 0.2047, "step": 168 }, { "epoch": 0.08797501301405518, "grad_norm": 0.6071594590963697, "learning_rate": 4.8843930635838154e-05, "loss": 0.2129, "step": 169 }, { "epoch": 0.08849557522123894, "grad_norm": 0.5535849448502077, "learning_rate": 4.913294797687861e-05, "loss": 0.2112, "step": 170 }, { "epoch": 0.0890161374284227, "grad_norm": 0.5570160624184318, "learning_rate": 4.942196531791908e-05, "loss": 0.2249, "step": 171 }, { "epoch": 0.08953669963560645, "grad_norm": 0.5926801003230638, "learning_rate": 4.971098265895954e-05, "loss": 0.2183, "step": 172 }, { "epoch": 0.09005726184279021, "grad_norm": 0.6340827920564475, "learning_rate": 5e-05, "loss": 0.2134, "step": 173 }, { "epoch": 0.09057782404997397, "grad_norm": 0.5566326852792648, "learning_rate": 4.99999960519183e-05, "loss": 0.2109, "step": 174 }, { "epoch": 0.09109838625715773, "grad_norm": 0.6364150749071997, "learning_rate": 4.999998420767445e-05, "loss": 0.2207, "step": 175 }, { "epoch": 0.09161894846434149, "grad_norm": 0.5946977297441665, "learning_rate": 4.999996446727219e-05, "loss": 0.22, "step": 176 }, { "epoch": 0.09213951067152525, "grad_norm": 0.5756761019081571, "learning_rate": 4.9999936830717745e-05, "loss": 0.2157, "step": 177 }, { "epoch": 0.092660072878709, "grad_norm": 0.755144779893014, "learning_rate": 4.999990129801986e-05, "loss": 0.2149, "step": 178 }, { "epoch": 0.09318063508589276, "grad_norm": 0.8329885805863328, "learning_rate": 4.9999857869189735e-05, "loss": 0.2104, "step": 179 }, { "epoch": 0.09370119729307652, "grad_norm": 0.7690781702453137, "learning_rate": 4.999980654424112e-05, "loss": 0.2078, "step": 180 }, { "epoch": 0.09422175950026028, "grad_norm": 0.6308760503667642, "learning_rate": 4.9999747323190195e-05, "loss": 0.2197, "step": 181 }, { "epoch": 0.09474232170744404, "grad_norm": 0.6148255176499823, "learning_rate": 4.9999680206055686e-05, "loss": 0.213, "step": 182 }, { "epoch": 0.0952628839146278, "grad_norm": 0.6734184929964855, "learning_rate": 4.999960519285878e-05, "loss": 0.203, "step": 183 }, { "epoch": 0.09578344612181156, "grad_norm": 0.5406620690992346, "learning_rate": 4.999952228362317e-05, "loss": 0.2051, "step": 184 }, { "epoch": 0.09630400832899531, "grad_norm": 0.5805819703164237, "learning_rate": 4.999943147837505e-05, "loss": 0.2243, "step": 185 }, { "epoch": 0.09682457053617907, "grad_norm": 0.6790309432200481, "learning_rate": 4.999933277714309e-05, "loss": 0.223, "step": 186 }, { "epoch": 0.09734513274336283, "grad_norm": 0.5725871196289053, "learning_rate": 4.999922617995847e-05, "loss": 0.2137, "step": 187 }, { "epoch": 0.09786569495054659, "grad_norm": 0.5475251087174847, "learning_rate": 4.999911168685486e-05, "loss": 0.2151, "step": 188 }, { "epoch": 0.09838625715773035, "grad_norm": 0.6035996664978388, "learning_rate": 4.999898929786842e-05, "loss": 0.2073, "step": 189 }, { "epoch": 0.0989068193649141, "grad_norm": 0.6086080086431157, "learning_rate": 4.999885901303781e-05, "loss": 0.2183, "step": 190 }, { "epoch": 0.09942738157209786, "grad_norm": 0.4937337226206942, "learning_rate": 4.999872083240418e-05, "loss": 0.2102, "step": 191 }, { "epoch": 0.09994794377928162, "grad_norm": 0.725349945989928, "learning_rate": 4.999857475601117e-05, "loss": 0.2089, "step": 192 }, { "epoch": 0.10046850598646538, "grad_norm": 0.7612664949980464, "learning_rate": 4.999842078390492e-05, "loss": 0.1991, "step": 193 }, { "epoch": 0.10098906819364914, "grad_norm": 0.5707299858159967, "learning_rate": 4.9998258916134055e-05, "loss": 0.2126, "step": 194 }, { "epoch": 0.1015096304008329, "grad_norm": 0.5643415112854695, "learning_rate": 4.999808915274971e-05, "loss": 0.2069, "step": 195 }, { "epoch": 0.10203019260801666, "grad_norm": 0.5633442640544284, "learning_rate": 4.999791149380549e-05, "loss": 0.2132, "step": 196 }, { "epoch": 0.10255075481520042, "grad_norm": 0.5614129286121543, "learning_rate": 4.999772593935752e-05, "loss": 0.2054, "step": 197 }, { "epoch": 0.10307131702238417, "grad_norm": 0.5646241630980633, "learning_rate": 4.999753248946441e-05, "loss": 0.21, "step": 198 }, { "epoch": 0.10359187922956793, "grad_norm": 0.5425474957654737, "learning_rate": 4.9997331144187255e-05, "loss": 0.2187, "step": 199 }, { "epoch": 0.10411244143675169, "grad_norm": 0.6177258752993052, "learning_rate": 4.999712190358965e-05, "loss": 0.2082, "step": 200 }, { "epoch": 0.10463300364393545, "grad_norm": 0.6029817403963842, "learning_rate": 4.999690476773767e-05, "loss": 0.212, "step": 201 }, { "epoch": 0.10515356585111921, "grad_norm": 0.48878469751460535, "learning_rate": 4.9996679736699924e-05, "loss": 0.2024, "step": 202 }, { "epoch": 0.10567412805830297, "grad_norm": 0.5654845375393931, "learning_rate": 4.9996446810547464e-05, "loss": 0.2107, "step": 203 }, { "epoch": 0.10619469026548672, "grad_norm": 0.5149953607057386, "learning_rate": 4.999620598935387e-05, "loss": 0.1983, "step": 204 }, { "epoch": 0.10671525247267048, "grad_norm": 0.4962347495654175, "learning_rate": 4.9995957273195206e-05, "loss": 0.2095, "step": 205 }, { "epoch": 0.10723581467985424, "grad_norm": 0.48397875587172107, "learning_rate": 4.9995700662150015e-05, "loss": 0.2062, "step": 206 }, { "epoch": 0.107756376887038, "grad_norm": 0.49662456882149275, "learning_rate": 4.9995436156299355e-05, "loss": 0.2186, "step": 207 }, { "epoch": 0.10827693909422176, "grad_norm": 0.5132623063501726, "learning_rate": 4.999516375572677e-05, "loss": 0.2106, "step": 208 }, { "epoch": 0.10879750130140552, "grad_norm": 0.48217335031801317, "learning_rate": 4.99948834605183e-05, "loss": 0.2196, "step": 209 }, { "epoch": 0.10931806350858927, "grad_norm": 0.5083613448407168, "learning_rate": 4.999459527076247e-05, "loss": 0.2092, "step": 210 }, { "epoch": 0.10983862571577303, "grad_norm": 0.45169614742277503, "learning_rate": 4.99942991865503e-05, "loss": 0.1936, "step": 211 }, { "epoch": 0.11035918792295679, "grad_norm": 0.5475430570698664, "learning_rate": 4.999399520797532e-05, "loss": 0.2089, "step": 212 }, { "epoch": 0.11087975013014055, "grad_norm": 0.4605893449333839, "learning_rate": 4.9993683335133535e-05, "loss": 0.2023, "step": 213 }, { "epoch": 0.11140031233732431, "grad_norm": 0.48657373710725793, "learning_rate": 4.999336356812344e-05, "loss": 0.2141, "step": 214 }, { "epoch": 0.11192087454450807, "grad_norm": 0.5120193381886232, "learning_rate": 4.9993035907046034e-05, "loss": 0.2026, "step": 215 }, { "epoch": 0.11244143675169183, "grad_norm": 0.49871162631911875, "learning_rate": 4.999270035200483e-05, "loss": 0.2207, "step": 216 }, { "epoch": 0.11296199895887558, "grad_norm": 0.4917375628104701, "learning_rate": 4.999235690310578e-05, "loss": 0.2045, "step": 217 }, { "epoch": 0.11348256116605934, "grad_norm": 0.5254314690487704, "learning_rate": 4.999200556045739e-05, "loss": 0.2068, "step": 218 }, { "epoch": 0.1140031233732431, "grad_norm": 0.5379406712902184, "learning_rate": 4.99916463241706e-05, "loss": 0.1999, "step": 219 }, { "epoch": 0.11452368558042686, "grad_norm": 0.5497570174948546, "learning_rate": 4.99912791943589e-05, "loss": 0.2174, "step": 220 }, { "epoch": 0.11504424778761062, "grad_norm": 0.5159970185644596, "learning_rate": 4.999090417113823e-05, "loss": 0.2121, "step": 221 }, { "epoch": 0.11556480999479438, "grad_norm": 0.5255410280335246, "learning_rate": 4.999052125462705e-05, "loss": 0.2005, "step": 222 }, { "epoch": 0.11608537220197813, "grad_norm": 0.4370433291491418, "learning_rate": 4.9990130444946295e-05, "loss": 0.2062, "step": 223 }, { "epoch": 0.1166059344091619, "grad_norm": 0.5368330251522332, "learning_rate": 4.9989731742219415e-05, "loss": 0.2053, "step": 224 }, { "epoch": 0.11712649661634565, "grad_norm": 0.4997030944367229, "learning_rate": 4.998932514657232e-05, "loss": 0.2035, "step": 225 }, { "epoch": 0.11764705882352941, "grad_norm": 0.48475596745975486, "learning_rate": 4.9988910658133445e-05, "loss": 0.2066, "step": 226 }, { "epoch": 0.11816762103071317, "grad_norm": 0.4549147383812301, "learning_rate": 4.99884882770337e-05, "loss": 0.1947, "step": 227 }, { "epoch": 0.11868818323789693, "grad_norm": 0.4829688354526709, "learning_rate": 4.998805800340649e-05, "loss": 0.2043, "step": 228 }, { "epoch": 0.11920874544508069, "grad_norm": 0.49782219243743464, "learning_rate": 4.998761983738772e-05, "loss": 0.2051, "step": 229 }, { "epoch": 0.11972930765226444, "grad_norm": 0.502253063495323, "learning_rate": 4.998717377911578e-05, "loss": 0.2169, "step": 230 }, { "epoch": 0.1202498698594482, "grad_norm": 0.5061752506717145, "learning_rate": 4.998671982873156e-05, "loss": 0.2035, "step": 231 }, { "epoch": 0.12077043206663196, "grad_norm": 0.511314601070742, "learning_rate": 4.9986257986378434e-05, "loss": 0.2034, "step": 232 }, { "epoch": 0.12129099427381572, "grad_norm": 0.4988500109798501, "learning_rate": 4.9985788252202284e-05, "loss": 0.2007, "step": 233 }, { "epoch": 0.12181155648099948, "grad_norm": 0.47625297037289466, "learning_rate": 4.9985310626351453e-05, "loss": 0.2065, "step": 234 }, { "epoch": 0.12233211868818324, "grad_norm": 0.5281298227193022, "learning_rate": 4.998482510897682e-05, "loss": 0.2063, "step": 235 }, { "epoch": 0.122852680895367, "grad_norm": 0.4762927778180656, "learning_rate": 4.9984331700231716e-05, "loss": 0.2091, "step": 236 }, { "epoch": 0.12337324310255075, "grad_norm": 0.5474311482218877, "learning_rate": 4.9983830400271995e-05, "loss": 0.2061, "step": 237 }, { "epoch": 0.12389380530973451, "grad_norm": 0.4774773588042615, "learning_rate": 4.998332120925598e-05, "loss": 0.2002, "step": 238 }, { "epoch": 0.12441436751691827, "grad_norm": 0.48763373747314165, "learning_rate": 4.9982804127344515e-05, "loss": 0.2012, "step": 239 }, { "epoch": 0.12493492972410203, "grad_norm": 0.5185578830596879, "learning_rate": 4.9982279154700905e-05, "loss": 0.2003, "step": 240 }, { "epoch": 0.1254554919312858, "grad_norm": 0.5024445332518108, "learning_rate": 4.9981746291490955e-05, "loss": 0.1951, "step": 241 }, { "epoch": 0.12597605413846955, "grad_norm": 0.5449525262128915, "learning_rate": 4.998120553788298e-05, "loss": 0.2032, "step": 242 }, { "epoch": 0.12649661634565332, "grad_norm": 0.4665771668363775, "learning_rate": 4.9980656894047776e-05, "loss": 0.2039, "step": 243 }, { "epoch": 0.12701717855283706, "grad_norm": 0.4372817573132417, "learning_rate": 4.998010036015862e-05, "loss": 0.2051, "step": 244 }, { "epoch": 0.12753774076002083, "grad_norm": 0.43781674957374894, "learning_rate": 4.997953593639129e-05, "loss": 0.2054, "step": 245 }, { "epoch": 0.12805830296720458, "grad_norm": 0.4541311939721181, "learning_rate": 4.997896362292407e-05, "loss": 0.1971, "step": 246 }, { "epoch": 0.12857886517438835, "grad_norm": 0.4384524770223584, "learning_rate": 4.997838341993772e-05, "loss": 0.198, "step": 247 }, { "epoch": 0.1290994273815721, "grad_norm": 0.4471256807532643, "learning_rate": 4.997779532761549e-05, "loss": 0.1986, "step": 248 }, { "epoch": 0.12961998958875587, "grad_norm": 0.5163351668297698, "learning_rate": 4.997719934614313e-05, "loss": 0.2006, "step": 249 }, { "epoch": 0.1301405517959396, "grad_norm": 0.5102778772697563, "learning_rate": 4.9976595475708873e-05, "loss": 0.2016, "step": 250 }, { "epoch": 0.13066111400312339, "grad_norm": 0.4253852174219019, "learning_rate": 4.997598371650346e-05, "loss": 0.1965, "step": 251 }, { "epoch": 0.13118167621030713, "grad_norm": 0.46645746563329804, "learning_rate": 4.9975364068720106e-05, "loss": 0.2031, "step": 252 }, { "epoch": 0.1317022384174909, "grad_norm": 0.43965296000744936, "learning_rate": 4.997473653255452e-05, "loss": 0.1995, "step": 253 }, { "epoch": 0.13222280062467465, "grad_norm": 0.44221288800503233, "learning_rate": 4.997410110820492e-05, "loss": 0.1982, "step": 254 }, { "epoch": 0.13274336283185842, "grad_norm": 0.4370059916055007, "learning_rate": 4.997345779587199e-05, "loss": 0.1939, "step": 255 }, { "epoch": 0.13326392503904216, "grad_norm": 0.5211107612902853, "learning_rate": 4.997280659575892e-05, "loss": 0.2026, "step": 256 }, { "epoch": 0.13378448724622594, "grad_norm": 0.488605855839782, "learning_rate": 4.997214750807141e-05, "loss": 0.2005, "step": 257 }, { "epoch": 0.13430504945340968, "grad_norm": 0.4932959365760645, "learning_rate": 4.99714805330176e-05, "loss": 0.2019, "step": 258 }, { "epoch": 0.13482561166059345, "grad_norm": 0.5546717733269304, "learning_rate": 4.997080567080817e-05, "loss": 0.1948, "step": 259 }, { "epoch": 0.1353461738677772, "grad_norm": 0.48295394900788036, "learning_rate": 4.9970122921656257e-05, "loss": 0.1955, "step": 260 }, { "epoch": 0.13586673607496097, "grad_norm": 0.5105683280432859, "learning_rate": 4.996943228577753e-05, "loss": 0.2075, "step": 261 }, { "epoch": 0.13638729828214471, "grad_norm": 0.5759006373431579, "learning_rate": 4.996873376339011e-05, "loss": 0.201, "step": 262 }, { "epoch": 0.1369078604893285, "grad_norm": 0.483482317837608, "learning_rate": 4.996802735471461e-05, "loss": 0.1938, "step": 263 }, { "epoch": 0.13742842269651223, "grad_norm": 0.48756843106965886, "learning_rate": 4.996731305997416e-05, "loss": 0.2, "step": 264 }, { "epoch": 0.137948984903696, "grad_norm": 0.5231891365741146, "learning_rate": 4.996659087939438e-05, "loss": 0.2009, "step": 265 }, { "epoch": 0.13846954711087975, "grad_norm": 0.518441009753121, "learning_rate": 4.9965860813203345e-05, "loss": 0.1831, "step": 266 }, { "epoch": 0.13899010931806352, "grad_norm": 0.45224708236227923, "learning_rate": 4.996512286163166e-05, "loss": 0.2059, "step": 267 }, { "epoch": 0.13951067152524727, "grad_norm": 0.5456839005168297, "learning_rate": 4.99643770249124e-05, "loss": 0.205, "step": 268 }, { "epoch": 0.14003123373243104, "grad_norm": 0.4891910966189773, "learning_rate": 4.996362330328113e-05, "loss": 0.2, "step": 269 }, { "epoch": 0.14055179593961478, "grad_norm": 0.5478174066545909, "learning_rate": 4.996286169697591e-05, "loss": 0.2004, "step": 270 }, { "epoch": 0.14107235814679855, "grad_norm": 0.5582267970906812, "learning_rate": 4.99620922062373e-05, "loss": 0.203, "step": 271 }, { "epoch": 0.1415929203539823, "grad_norm": 0.5105647459053501, "learning_rate": 4.996131483130833e-05, "loss": 0.2029, "step": 272 }, { "epoch": 0.14211348256116607, "grad_norm": 0.48249484447771235, "learning_rate": 4.9960529572434545e-05, "loss": 0.2015, "step": 273 }, { "epoch": 0.14263404476834982, "grad_norm": 0.5235774115905943, "learning_rate": 4.995973642986395e-05, "loss": 0.2036, "step": 274 }, { "epoch": 0.1431546069755336, "grad_norm": 0.47923742023520904, "learning_rate": 4.995893540384707e-05, "loss": 0.2017, "step": 275 }, { "epoch": 0.14367516918271733, "grad_norm": 0.43341131129494787, "learning_rate": 4.99581264946369e-05, "loss": 0.1995, "step": 276 }, { "epoch": 0.1441957313899011, "grad_norm": 0.5535433768591606, "learning_rate": 4.995730970248893e-05, "loss": 0.2048, "step": 277 }, { "epoch": 0.14471629359708485, "grad_norm": 0.3972052460404149, "learning_rate": 4.9956485027661136e-05, "loss": 0.2039, "step": 278 }, { "epoch": 0.14523685580426862, "grad_norm": 0.594647280576585, "learning_rate": 4.995565247041401e-05, "loss": 0.1998, "step": 279 }, { "epoch": 0.14575741801145237, "grad_norm": 0.4962198874500988, "learning_rate": 4.995481203101049e-05, "loss": 0.2018, "step": 280 }, { "epoch": 0.14627798021863614, "grad_norm": 0.4301303739409537, "learning_rate": 4.9953963709716034e-05, "loss": 0.2053, "step": 281 }, { "epoch": 0.14679854242581988, "grad_norm": 0.5069059225942892, "learning_rate": 4.995310750679858e-05, "loss": 0.1979, "step": 282 }, { "epoch": 0.14731910463300366, "grad_norm": 0.5279838955451033, "learning_rate": 4.995224342252855e-05, "loss": 0.2017, "step": 283 }, { "epoch": 0.1478396668401874, "grad_norm": 0.3970535952201932, "learning_rate": 4.995137145717889e-05, "loss": 0.2025, "step": 284 }, { "epoch": 0.14836022904737117, "grad_norm": 0.4049155457791142, "learning_rate": 4.9950491611024975e-05, "loss": 0.1893, "step": 285 }, { "epoch": 0.14888079125455492, "grad_norm": 0.4413501454399157, "learning_rate": 4.994960388434471e-05, "loss": 0.2012, "step": 286 }, { "epoch": 0.1494013534617387, "grad_norm": 0.39118558952416227, "learning_rate": 4.994870827741849e-05, "loss": 0.194, "step": 287 }, { "epoch": 0.14992191566892243, "grad_norm": 0.4209076336573591, "learning_rate": 4.9947804790529176e-05, "loss": 0.1923, "step": 288 }, { "epoch": 0.1504424778761062, "grad_norm": 0.4149330323573577, "learning_rate": 4.994689342396215e-05, "loss": 0.1944, "step": 289 }, { "epoch": 0.15096304008328995, "grad_norm": 0.41004710305075126, "learning_rate": 4.994597417800524e-05, "loss": 0.1993, "step": 290 }, { "epoch": 0.15148360229047372, "grad_norm": 0.41295205841010674, "learning_rate": 4.994504705294881e-05, "loss": 0.2048, "step": 291 }, { "epoch": 0.15200416449765747, "grad_norm": 0.3865464642966178, "learning_rate": 4.994411204908567e-05, "loss": 0.1952, "step": 292 }, { "epoch": 0.15252472670484124, "grad_norm": 0.4033991442815395, "learning_rate": 4.994316916671114e-05, "loss": 0.1945, "step": 293 }, { "epoch": 0.15304528891202498, "grad_norm": 0.37826488837927863, "learning_rate": 4.9942218406123045e-05, "loss": 0.1989, "step": 294 }, { "epoch": 0.15356585111920876, "grad_norm": 0.3925013641735211, "learning_rate": 4.994125976762167e-05, "loss": 0.1948, "step": 295 }, { "epoch": 0.1540864133263925, "grad_norm": 0.4297598640191415, "learning_rate": 4.9940293251509786e-05, "loss": 0.199, "step": 296 }, { "epoch": 0.15460697553357627, "grad_norm": 0.39427507343176355, "learning_rate": 4.9939318858092664e-05, "loss": 0.2038, "step": 297 }, { "epoch": 0.15512753774076002, "grad_norm": 0.41295108211994785, "learning_rate": 4.993833658767808e-05, "loss": 0.1855, "step": 298 }, { "epoch": 0.1556480999479438, "grad_norm": 0.428783102563254, "learning_rate": 4.993734644057627e-05, "loss": 0.1925, "step": 299 }, { "epoch": 0.15616866215512754, "grad_norm": 0.39643845709605496, "learning_rate": 4.993634841709998e-05, "loss": 0.1953, "step": 300 }, { "epoch": 0.1566892243623113, "grad_norm": 0.40315780437097026, "learning_rate": 4.993534251756441e-05, "loss": 0.197, "step": 301 }, { "epoch": 0.15720978656949505, "grad_norm": 0.42325725150557625, "learning_rate": 4.9934328742287285e-05, "loss": 0.1993, "step": 302 }, { "epoch": 0.15773034877667882, "grad_norm": 0.4080909081022836, "learning_rate": 4.9933307091588796e-05, "loss": 0.1872, "step": 303 }, { "epoch": 0.15825091098386257, "grad_norm": 0.4403757079324371, "learning_rate": 4.993227756579163e-05, "loss": 0.196, "step": 304 }, { "epoch": 0.15877147319104634, "grad_norm": 0.4315054835172639, "learning_rate": 4.993124016522097e-05, "loss": 0.1988, "step": 305 }, { "epoch": 0.1592920353982301, "grad_norm": 0.4181529747933808, "learning_rate": 4.993019489020446e-05, "loss": 0.1874, "step": 306 }, { "epoch": 0.15981259760541386, "grad_norm": 0.38805878488629814, "learning_rate": 4.992914174107225e-05, "loss": 0.1885, "step": 307 }, { "epoch": 0.1603331598125976, "grad_norm": 0.4442375624813517, "learning_rate": 4.992808071815698e-05, "loss": 0.1918, "step": 308 }, { "epoch": 0.16085372201978138, "grad_norm": 0.3809208737314787, "learning_rate": 4.9927011821793766e-05, "loss": 0.1997, "step": 309 }, { "epoch": 0.16137428422696512, "grad_norm": 0.4138404674084266, "learning_rate": 4.9925935052320214e-05, "loss": 0.1959, "step": 310 }, { "epoch": 0.1618948464341489, "grad_norm": 0.41341096380173886, "learning_rate": 4.9924850410076416e-05, "loss": 0.2007, "step": 311 }, { "epoch": 0.16241540864133264, "grad_norm": 0.43053394989910876, "learning_rate": 4.9923757895404966e-05, "loss": 0.1898, "step": 312 }, { "epoch": 0.1629359708485164, "grad_norm": 0.40770200417265084, "learning_rate": 4.992265750865091e-05, "loss": 0.1961, "step": 313 }, { "epoch": 0.16345653305570015, "grad_norm": 0.44481929282099114, "learning_rate": 4.9921549250161817e-05, "loss": 0.1903, "step": 314 }, { "epoch": 0.16397709526288393, "grad_norm": 0.4214318597046138, "learning_rate": 4.9920433120287726e-05, "loss": 0.1978, "step": 315 }, { "epoch": 0.16449765747006767, "grad_norm": 0.42351713283633147, "learning_rate": 4.9919309119381155e-05, "loss": 0.1913, "step": 316 }, { "epoch": 0.16501821967725144, "grad_norm": 0.40073423145965437, "learning_rate": 4.991817724779711e-05, "loss": 0.201, "step": 317 }, { "epoch": 0.1655387818844352, "grad_norm": 0.46802701541194947, "learning_rate": 4.99170375058931e-05, "loss": 0.1984, "step": 318 }, { "epoch": 0.16605934409161896, "grad_norm": 0.4391832742508613, "learning_rate": 4.9915889894029124e-05, "loss": 0.1896, "step": 319 }, { "epoch": 0.1665799062988027, "grad_norm": 0.4276877940360142, "learning_rate": 4.991473441256762e-05, "loss": 0.1883, "step": 320 }, { "epoch": 0.16710046850598648, "grad_norm": 0.4783858273158986, "learning_rate": 4.991357106187356e-05, "loss": 0.1914, "step": 321 }, { "epoch": 0.16762103071317022, "grad_norm": 0.37335690024180623, "learning_rate": 4.991239984231438e-05, "loss": 0.1861, "step": 322 }, { "epoch": 0.168141592920354, "grad_norm": 0.5571380480881387, "learning_rate": 4.991122075426001e-05, "loss": 0.1938, "step": 323 }, { "epoch": 0.16866215512753774, "grad_norm": 0.4673856871456706, "learning_rate": 4.991003379808286e-05, "loss": 0.1956, "step": 324 }, { "epoch": 0.1691827173347215, "grad_norm": 0.6907331185852513, "learning_rate": 4.990883897415781e-05, "loss": 0.1946, "step": 325 }, { "epoch": 0.16970327954190526, "grad_norm": 0.4420336300041284, "learning_rate": 4.9907636282862256e-05, "loss": 0.1917, "step": 326 }, { "epoch": 0.17022384174908903, "grad_norm": 0.4673927210202748, "learning_rate": 4.9906425724576075e-05, "loss": 0.1868, "step": 327 }, { "epoch": 0.17074440395627277, "grad_norm": 0.39164253998754844, "learning_rate": 4.99052072996816e-05, "loss": 0.1811, "step": 328 }, { "epoch": 0.17126496616345654, "grad_norm": 0.44534231109604294, "learning_rate": 4.990398100856367e-05, "loss": 0.1938, "step": 329 }, { "epoch": 0.1717855283706403, "grad_norm": 0.46887801231015935, "learning_rate": 4.990274685160961e-05, "loss": 0.1903, "step": 330 }, { "epoch": 0.17230609057782406, "grad_norm": 0.3968509774573516, "learning_rate": 4.990150482920921e-05, "loss": 0.1908, "step": 331 }, { "epoch": 0.1728266527850078, "grad_norm": 0.5263341947271295, "learning_rate": 4.990025494175477e-05, "loss": 0.1922, "step": 332 }, { "epoch": 0.17334721499219158, "grad_norm": 0.4775537352423801, "learning_rate": 4.989899718964107e-05, "loss": 0.1841, "step": 333 }, { "epoch": 0.17386777719937532, "grad_norm": 0.41924991923989763, "learning_rate": 4.989773157326535e-05, "loss": 0.2015, "step": 334 }, { "epoch": 0.1743883394065591, "grad_norm": 0.5252465757639925, "learning_rate": 4.989645809302736e-05, "loss": 0.1917, "step": 335 }, { "epoch": 0.17490890161374284, "grad_norm": 0.5169440105481551, "learning_rate": 4.9895176749329334e-05, "loss": 0.1917, "step": 336 }, { "epoch": 0.1754294638209266, "grad_norm": 0.42246102114852874, "learning_rate": 4.989388754257596e-05, "loss": 0.2005, "step": 337 }, { "epoch": 0.17595002602811036, "grad_norm": 0.5029332963068417, "learning_rate": 4.989259047317444e-05, "loss": 0.1903, "step": 338 }, { "epoch": 0.17647058823529413, "grad_norm": 0.41379461152486186, "learning_rate": 4.989128554153444e-05, "loss": 0.1844, "step": 339 }, { "epoch": 0.17699115044247787, "grad_norm": 0.45322329232876984, "learning_rate": 4.9889972748068134e-05, "loss": 0.188, "step": 340 }, { "epoch": 0.17751171264966165, "grad_norm": 0.48168712385017154, "learning_rate": 4.988865209319015e-05, "loss": 0.1981, "step": 341 }, { "epoch": 0.1780322748568454, "grad_norm": 0.41086331407455057, "learning_rate": 4.988732357731762e-05, "loss": 0.1942, "step": 342 }, { "epoch": 0.17855283706402916, "grad_norm": 0.540497000624346, "learning_rate": 4.988598720087015e-05, "loss": 0.1907, "step": 343 }, { "epoch": 0.1790733992712129, "grad_norm": 0.38794214622006457, "learning_rate": 4.9884642964269824e-05, "loss": 0.1874, "step": 344 }, { "epoch": 0.17959396147839668, "grad_norm": 0.5258434187192316, "learning_rate": 4.988329086794122e-05, "loss": 0.1877, "step": 345 }, { "epoch": 0.18011452368558042, "grad_norm": 0.5353658539595278, "learning_rate": 4.9881930912311394e-05, "loss": 0.194, "step": 346 }, { "epoch": 0.1806350858927642, "grad_norm": 0.4392015964672634, "learning_rate": 4.988056309780987e-05, "loss": 0.1904, "step": 347 }, { "epoch": 0.18115564809994794, "grad_norm": 0.5430979821127188, "learning_rate": 4.987918742486869e-05, "loss": 0.2045, "step": 348 }, { "epoch": 0.1816762103071317, "grad_norm": 0.47746127042266084, "learning_rate": 4.987780389392234e-05, "loss": 0.1877, "step": 349 }, { "epoch": 0.18219677251431546, "grad_norm": 0.48616627754776087, "learning_rate": 4.98764125054078e-05, "loss": 0.1856, "step": 350 }, { "epoch": 0.18271733472149923, "grad_norm": 0.4137932189199551, "learning_rate": 4.987501325976455e-05, "loss": 0.1882, "step": 351 }, { "epoch": 0.18323789692868298, "grad_norm": 0.48916635581424217, "learning_rate": 4.987360615743453e-05, "loss": 0.1872, "step": 352 }, { "epoch": 0.18375845913586675, "grad_norm": 0.45140194045768417, "learning_rate": 4.987219119886216e-05, "loss": 0.1974, "step": 353 }, { "epoch": 0.1842790213430505, "grad_norm": 0.4683554765097198, "learning_rate": 4.987076838449436e-05, "loss": 0.1887, "step": 354 }, { "epoch": 0.18479958355023426, "grad_norm": 0.45043420134623535, "learning_rate": 4.986933771478052e-05, "loss": 0.1953, "step": 355 }, { "epoch": 0.185320145757418, "grad_norm": 0.38632079781772455, "learning_rate": 4.9867899190172505e-05, "loss": 0.1944, "step": 356 }, { "epoch": 0.18584070796460178, "grad_norm": 0.4502927446904294, "learning_rate": 4.986645281112469e-05, "loss": 0.1993, "step": 357 }, { "epoch": 0.18636127017178553, "grad_norm": 0.35109988684784443, "learning_rate": 4.986499857809387e-05, "loss": 0.1832, "step": 358 }, { "epoch": 0.1868818323789693, "grad_norm": 0.4855353635745786, "learning_rate": 4.98635364915394e-05, "loss": 0.1928, "step": 359 }, { "epoch": 0.18740239458615304, "grad_norm": 0.3907280285159689, "learning_rate": 4.986206655192305e-05, "loss": 0.1876, "step": 360 }, { "epoch": 0.18792295679333682, "grad_norm": 0.42227390242070895, "learning_rate": 4.98605887597091e-05, "loss": 0.193, "step": 361 }, { "epoch": 0.18844351900052056, "grad_norm": 0.39584572126339823, "learning_rate": 4.985910311536431e-05, "loss": 0.184, "step": 362 }, { "epoch": 0.18896408120770433, "grad_norm": 0.3990052626089986, "learning_rate": 4.985760961935791e-05, "loss": 0.1881, "step": 363 }, { "epoch": 0.18948464341488808, "grad_norm": 0.42636340213787094, "learning_rate": 4.9856108272161614e-05, "loss": 0.1926, "step": 364 }, { "epoch": 0.19000520562207185, "grad_norm": 0.42577564707501814, "learning_rate": 4.9854599074249633e-05, "loss": 0.195, "step": 365 }, { "epoch": 0.1905257678292556, "grad_norm": 0.41616380256414365, "learning_rate": 4.985308202609863e-05, "loss": 0.1885, "step": 366 }, { "epoch": 0.19104633003643937, "grad_norm": 0.46495409518003894, "learning_rate": 4.9851557128187755e-05, "loss": 0.1908, "step": 367 }, { "epoch": 0.1915668922436231, "grad_norm": 0.41526376329857984, "learning_rate": 4.985002438099865e-05, "loss": 0.188, "step": 368 }, { "epoch": 0.19208745445080688, "grad_norm": 0.3935066236122138, "learning_rate": 4.984848378501542e-05, "loss": 0.1885, "step": 369 }, { "epoch": 0.19260801665799063, "grad_norm": 0.4088622746642285, "learning_rate": 4.984693534072467e-05, "loss": 0.1918, "step": 370 }, { "epoch": 0.1931285788651744, "grad_norm": 0.3809983710951146, "learning_rate": 4.984537904861546e-05, "loss": 0.1826, "step": 371 }, { "epoch": 0.19364914107235814, "grad_norm": 0.40658651643232196, "learning_rate": 4.9843814909179345e-05, "loss": 0.19, "step": 372 }, { "epoch": 0.19416970327954192, "grad_norm": 0.43969698475310975, "learning_rate": 4.9842242922910345e-05, "loss": 0.1894, "step": 373 }, { "epoch": 0.19469026548672566, "grad_norm": 0.3862559796395953, "learning_rate": 4.9840663090304965e-05, "loss": 0.1914, "step": 374 }, { "epoch": 0.19521082769390943, "grad_norm": 0.5347847427366343, "learning_rate": 4.983907541186221e-05, "loss": 0.1917, "step": 375 }, { "epoch": 0.19573138990109318, "grad_norm": 0.3974569413531495, "learning_rate": 4.983747988808352e-05, "loss": 0.184, "step": 376 }, { "epoch": 0.19625195210827695, "grad_norm": 0.4412640683782702, "learning_rate": 4.983587651947285e-05, "loss": 0.1855, "step": 377 }, { "epoch": 0.1967725143154607, "grad_norm": 0.4423096342665371, "learning_rate": 4.983426530653661e-05, "loss": 0.1923, "step": 378 }, { "epoch": 0.19729307652264447, "grad_norm": 0.43821702301793497, "learning_rate": 4.9832646249783694e-05, "loss": 0.189, "step": 379 }, { "epoch": 0.1978136387298282, "grad_norm": 0.3924545030570308, "learning_rate": 4.983101934972548e-05, "loss": 0.1891, "step": 380 }, { "epoch": 0.19833420093701198, "grad_norm": 0.42299618371561903, "learning_rate": 4.982938460687583e-05, "loss": 0.1903, "step": 381 }, { "epoch": 0.19885476314419573, "grad_norm": 0.4001882492331783, "learning_rate": 4.982774202175105e-05, "loss": 0.1803, "step": 382 }, { "epoch": 0.1993753253513795, "grad_norm": 0.49011114003395145, "learning_rate": 4.9826091594869974e-05, "loss": 0.1813, "step": 383 }, { "epoch": 0.19989588755856325, "grad_norm": 0.37442432473888976, "learning_rate": 4.982443332675385e-05, "loss": 0.1802, "step": 384 }, { "epoch": 0.20041644976574702, "grad_norm": 0.48051842908306147, "learning_rate": 4.9822767217926456e-05, "loss": 0.1947, "step": 385 }, { "epoch": 0.20093701197293076, "grad_norm": 0.4257153401498972, "learning_rate": 4.982109326891402e-05, "loss": 0.1884, "step": 386 }, { "epoch": 0.20145757418011453, "grad_norm": 0.47752005156773103, "learning_rate": 4.981941148024526e-05, "loss": 0.1857, "step": 387 }, { "epoch": 0.20197813638729828, "grad_norm": 0.40745960509516727, "learning_rate": 4.981772185245135e-05, "loss": 0.187, "step": 388 }, { "epoch": 0.20249869859448205, "grad_norm": 0.4359497459483064, "learning_rate": 4.9816024386065973e-05, "loss": 0.1871, "step": 389 }, { "epoch": 0.2030192608016658, "grad_norm": 0.3892740872530919, "learning_rate": 4.981431908162525e-05, "loss": 0.188, "step": 390 }, { "epoch": 0.20353982300884957, "grad_norm": 0.420465792965565, "learning_rate": 4.98126059396678e-05, "loss": 0.1936, "step": 391 }, { "epoch": 0.2040603852160333, "grad_norm": 0.3773127318291861, "learning_rate": 4.981088496073472e-05, "loss": 0.1875, "step": 392 }, { "epoch": 0.20458094742321709, "grad_norm": 0.4041436379650743, "learning_rate": 4.980915614536957e-05, "loss": 0.1915, "step": 393 }, { "epoch": 0.20510150963040083, "grad_norm": 0.3808004928284413, "learning_rate": 4.980741949411839e-05, "loss": 0.1886, "step": 394 }, { "epoch": 0.2056220718375846, "grad_norm": 0.39329042034751993, "learning_rate": 4.98056750075297e-05, "loss": 0.189, "step": 395 }, { "epoch": 0.20614263404476835, "grad_norm": 0.3950097377987076, "learning_rate": 4.980392268615447e-05, "loss": 0.1922, "step": 396 }, { "epoch": 0.20666319625195212, "grad_norm": 0.3768193415771947, "learning_rate": 4.980216253054619e-05, "loss": 0.1863, "step": 397 }, { "epoch": 0.20718375845913586, "grad_norm": 0.40683290447145076, "learning_rate": 4.98003945412608e-05, "loss": 0.1888, "step": 398 }, { "epoch": 0.20770432066631964, "grad_norm": 0.3547903108580832, "learning_rate": 4.979861871885669e-05, "loss": 0.1916, "step": 399 }, { "epoch": 0.20822488287350338, "grad_norm": 0.36494431710070346, "learning_rate": 4.9796835063894765e-05, "loss": 0.1806, "step": 400 }, { "epoch": 0.20874544508068715, "grad_norm": 0.36485565181417856, "learning_rate": 4.9795043576938384e-05, "loss": 0.1833, "step": 401 }, { "epoch": 0.2092660072878709, "grad_norm": 0.3632899653463087, "learning_rate": 4.9793244258553375e-05, "loss": 0.185, "step": 402 }, { "epoch": 0.20978656949505467, "grad_norm": 0.35512187990656785, "learning_rate": 4.979143710930805e-05, "loss": 0.1817, "step": 403 }, { "epoch": 0.21030713170223841, "grad_norm": 0.4054160927254296, "learning_rate": 4.9789622129773195e-05, "loss": 0.1807, "step": 404 }, { "epoch": 0.2108276939094222, "grad_norm": 0.348222334286123, "learning_rate": 4.978779932052206e-05, "loss": 0.1802, "step": 405 }, { "epoch": 0.21134825611660593, "grad_norm": 0.35217898427995553, "learning_rate": 4.978596868213037e-05, "loss": 0.1876, "step": 406 }, { "epoch": 0.2118688183237897, "grad_norm": 0.3399990298458504, "learning_rate": 4.978413021517634e-05, "loss": 0.1824, "step": 407 }, { "epoch": 0.21238938053097345, "grad_norm": 0.3914421983501113, "learning_rate": 4.978228392024063e-05, "loss": 0.1832, "step": 408 }, { "epoch": 0.21290994273815722, "grad_norm": 0.3599097944411457, "learning_rate": 4.978042979790639e-05, "loss": 0.1857, "step": 409 }, { "epoch": 0.21343050494534097, "grad_norm": 0.3764468760970812, "learning_rate": 4.977856784875924e-05, "loss": 0.1874, "step": 410 }, { "epoch": 0.21395106715252474, "grad_norm": 0.36370332252391396, "learning_rate": 4.977669807338726e-05, "loss": 0.1909, "step": 411 }, { "epoch": 0.21447162935970848, "grad_norm": 0.3590343415362341, "learning_rate": 4.9774820472381037e-05, "loss": 0.1808, "step": 412 }, { "epoch": 0.21499219156689225, "grad_norm": 0.3964803489327254, "learning_rate": 4.977293504633357e-05, "loss": 0.1993, "step": 413 }, { "epoch": 0.215512753774076, "grad_norm": 0.409647084566613, "learning_rate": 4.977104179584039e-05, "loss": 0.1828, "step": 414 }, { "epoch": 0.21603331598125977, "grad_norm": 0.37684217160913236, "learning_rate": 4.9769140721499466e-05, "loss": 0.1903, "step": 415 }, { "epoch": 0.21655387818844352, "grad_norm": 0.40627101878375, "learning_rate": 4.976723182391124e-05, "loss": 0.1947, "step": 416 }, { "epoch": 0.2170744403956273, "grad_norm": 0.4293268173755893, "learning_rate": 4.9765315103678646e-05, "loss": 0.1928, "step": 417 }, { "epoch": 0.21759500260281103, "grad_norm": 0.363143103690957, "learning_rate": 4.976339056140706e-05, "loss": 0.1859, "step": 418 }, { "epoch": 0.2181155648099948, "grad_norm": 0.3801901908980379, "learning_rate": 4.976145819770435e-05, "loss": 0.1839, "step": 419 }, { "epoch": 0.21863612701717855, "grad_norm": 0.3728199137683034, "learning_rate": 4.975951801318083e-05, "loss": 0.1836, "step": 420 }, { "epoch": 0.21915668922436232, "grad_norm": 0.39399419625104315, "learning_rate": 4.975757000844932e-05, "loss": 0.1869, "step": 421 }, { "epoch": 0.21967725143154607, "grad_norm": 0.352068817370931, "learning_rate": 4.975561418412509e-05, "loss": 0.1836, "step": 422 }, { "epoch": 0.22019781363872984, "grad_norm": 0.38356796463195586, "learning_rate": 4.9753650540825855e-05, "loss": 0.1813, "step": 423 }, { "epoch": 0.22071837584591358, "grad_norm": 0.3610859855606606, "learning_rate": 4.975167907917187e-05, "loss": 0.1862, "step": 424 }, { "epoch": 0.22123893805309736, "grad_norm": 0.35330258565992684, "learning_rate": 4.974969979978577e-05, "loss": 0.1769, "step": 425 }, { "epoch": 0.2217595002602811, "grad_norm": 0.384329198269658, "learning_rate": 4.9747712703292714e-05, "loss": 0.1807, "step": 426 }, { "epoch": 0.22228006246746487, "grad_norm": 0.34746642781914455, "learning_rate": 4.9745717790320344e-05, "loss": 0.1809, "step": 427 }, { "epoch": 0.22280062467464862, "grad_norm": 0.3880165680244598, "learning_rate": 4.9743715061498716e-05, "loss": 0.1867, "step": 428 }, { "epoch": 0.2233211868818324, "grad_norm": 0.351470867292313, "learning_rate": 4.9741704517460406e-05, "loss": 0.1894, "step": 429 }, { "epoch": 0.22384174908901613, "grad_norm": 0.3645151488001888, "learning_rate": 4.973968615884043e-05, "loss": 0.1788, "step": 430 }, { "epoch": 0.2243623112961999, "grad_norm": 0.3625527069125457, "learning_rate": 4.973765998627628e-05, "loss": 0.1914, "step": 431 }, { "epoch": 0.22488287350338365, "grad_norm": 0.343361514535999, "learning_rate": 4.973562600040791e-05, "loss": 0.1848, "step": 432 }, { "epoch": 0.22540343571056742, "grad_norm": 0.37503966422043744, "learning_rate": 4.973358420187776e-05, "loss": 0.1879, "step": 433 }, { "epoch": 0.22592399791775117, "grad_norm": 0.3474973215881361, "learning_rate": 4.973153459133071e-05, "loss": 0.1894, "step": 434 }, { "epoch": 0.22644456012493494, "grad_norm": 0.3508809916689539, "learning_rate": 4.972947716941413e-05, "loss": 0.1895, "step": 435 }, { "epoch": 0.22696512233211869, "grad_norm": 0.3643384265149809, "learning_rate": 4.9727411936777854e-05, "loss": 0.1829, "step": 436 }, { "epoch": 0.22748568453930246, "grad_norm": 0.36350332295792714, "learning_rate": 4.972533889407417e-05, "loss": 0.1813, "step": 437 }, { "epoch": 0.2280062467464862, "grad_norm": 0.3539689125746054, "learning_rate": 4.972325804195784e-05, "loss": 0.1842, "step": 438 }, { "epoch": 0.22852680895366997, "grad_norm": 0.3822911141420241, "learning_rate": 4.972116938108611e-05, "loss": 0.193, "step": 439 }, { "epoch": 0.22904737116085372, "grad_norm": 0.38773043508499666, "learning_rate": 4.971907291211866e-05, "loss": 0.188, "step": 440 }, { "epoch": 0.2295679333680375, "grad_norm": 0.35091644126465377, "learning_rate": 4.971696863571765e-05, "loss": 0.1824, "step": 441 }, { "epoch": 0.23008849557522124, "grad_norm": 0.34427733124831933, "learning_rate": 4.971485655254773e-05, "loss": 0.175, "step": 442 }, { "epoch": 0.230609057782405, "grad_norm": 0.3543495011979818, "learning_rate": 4.9712736663275974e-05, "loss": 0.1849, "step": 443 }, { "epoch": 0.23112961998958875, "grad_norm": 0.36206158086227636, "learning_rate": 4.971060896857195e-05, "loss": 0.1891, "step": 444 }, { "epoch": 0.23165018219677252, "grad_norm": 0.3599367526208678, "learning_rate": 4.9708473469107676e-05, "loss": 0.1853, "step": 445 }, { "epoch": 0.23217074440395627, "grad_norm": 0.34979470112428146, "learning_rate": 4.970633016555765e-05, "loss": 0.1761, "step": 446 }, { "epoch": 0.23269130661114004, "grad_norm": 0.4092021184005037, "learning_rate": 4.9704179058598824e-05, "loss": 0.1842, "step": 447 }, { "epoch": 0.2332118688183238, "grad_norm": 0.3745830889783029, "learning_rate": 4.970202014891062e-05, "loss": 0.1854, "step": 448 }, { "epoch": 0.23373243102550756, "grad_norm": 0.3787251823910667, "learning_rate": 4.969985343717492e-05, "loss": 0.1882, "step": 449 }, { "epoch": 0.2342529932326913, "grad_norm": 0.3986619843101879, "learning_rate": 4.9697678924076066e-05, "loss": 0.1856, "step": 450 }, { "epoch": 0.23477355543987508, "grad_norm": 0.3430314984382981, "learning_rate": 4.969549661030089e-05, "loss": 0.1839, "step": 451 }, { "epoch": 0.23529411764705882, "grad_norm": 0.3621352775817525, "learning_rate": 4.969330649653864e-05, "loss": 0.1848, "step": 452 }, { "epoch": 0.2358146798542426, "grad_norm": 0.35888862276629535, "learning_rate": 4.969110858348108e-05, "loss": 0.1874, "step": 453 }, { "epoch": 0.23633524206142634, "grad_norm": 0.3942881061832858, "learning_rate": 4.96889028718224e-05, "loss": 0.178, "step": 454 }, { "epoch": 0.2368558042686101, "grad_norm": 0.33338571503996683, "learning_rate": 4.968668936225928e-05, "loss": 0.1915, "step": 455 }, { "epoch": 0.23737636647579385, "grad_norm": 0.3789691197396563, "learning_rate": 4.968446805549082e-05, "loss": 0.1814, "step": 456 }, { "epoch": 0.23789692868297763, "grad_norm": 0.36459554389496945, "learning_rate": 4.968223895221865e-05, "loss": 0.1874, "step": 457 }, { "epoch": 0.23841749089016137, "grad_norm": 0.40802558988494547, "learning_rate": 4.96800020531468e-05, "loss": 0.1876, "step": 458 }, { "epoch": 0.23893805309734514, "grad_norm": 0.35893156732538567, "learning_rate": 4.967775735898179e-05, "loss": 0.1825, "step": 459 }, { "epoch": 0.2394586153045289, "grad_norm": 0.3651439723126297, "learning_rate": 4.967550487043261e-05, "loss": 0.1853, "step": 460 }, { "epoch": 0.23997917751171266, "grad_norm": 0.3624747101403952, "learning_rate": 4.9673244588210684e-05, "loss": 0.1783, "step": 461 }, { "epoch": 0.2404997397188964, "grad_norm": 0.3360578895946992, "learning_rate": 4.967097651302993e-05, "loss": 0.1786, "step": 462 }, { "epoch": 0.24102030192608018, "grad_norm": 0.3929754036367695, "learning_rate": 4.9668700645606704e-05, "loss": 0.188, "step": 463 }, { "epoch": 0.24154086413326392, "grad_norm": 0.34507860442461075, "learning_rate": 4.966641698665982e-05, "loss": 0.183, "step": 464 }, { "epoch": 0.2420614263404477, "grad_norm": 0.363947787934868, "learning_rate": 4.9664125536910597e-05, "loss": 0.179, "step": 465 }, { "epoch": 0.24258198854763144, "grad_norm": 0.3444630195014257, "learning_rate": 4.966182629708275e-05, "loss": 0.1815, "step": 466 }, { "epoch": 0.2431025507548152, "grad_norm": 0.35943929861594537, "learning_rate": 4.965951926790249e-05, "loss": 0.1873, "step": 467 }, { "epoch": 0.24362311296199896, "grad_norm": 0.3629355287385724, "learning_rate": 4.9657204450098496e-05, "loss": 0.1857, "step": 468 }, { "epoch": 0.24414367516918273, "grad_norm": 0.32513099997931477, "learning_rate": 4.9654881844401886e-05, "loss": 0.1845, "step": 469 }, { "epoch": 0.24466423737636647, "grad_norm": 0.375672815309898, "learning_rate": 4.965255145154625e-05, "loss": 0.1825, "step": 470 }, { "epoch": 0.24518479958355024, "grad_norm": 0.32902747038638014, "learning_rate": 4.965021327226764e-05, "loss": 0.1831, "step": 471 }, { "epoch": 0.245705361790734, "grad_norm": 0.3723575912403685, "learning_rate": 4.964786730730455e-05, "loss": 0.1828, "step": 472 }, { "epoch": 0.24622592399791776, "grad_norm": 0.3291696338939864, "learning_rate": 4.964551355739795e-05, "loss": 0.1825, "step": 473 }, { "epoch": 0.2467464862051015, "grad_norm": 0.40037826390407993, "learning_rate": 4.964315202329127e-05, "loss": 0.1948, "step": 474 }, { "epoch": 0.24726704841228528, "grad_norm": 0.3980798213979412, "learning_rate": 4.9640782705730394e-05, "loss": 0.1818, "step": 475 }, { "epoch": 0.24778761061946902, "grad_norm": 0.3307799216619339, "learning_rate": 4.9638405605463646e-05, "loss": 0.177, "step": 476 }, { "epoch": 0.2483081728266528, "grad_norm": 0.37858138177738815, "learning_rate": 4.963602072324184e-05, "loss": 0.1829, "step": 477 }, { "epoch": 0.24882873503383654, "grad_norm": 0.34060237598449605, "learning_rate": 4.963362805981823e-05, "loss": 0.1872, "step": 478 }, { "epoch": 0.2493492972410203, "grad_norm": 0.3491413889386904, "learning_rate": 4.9631227615948516e-05, "loss": 0.1719, "step": 479 }, { "epoch": 0.24986985944820406, "grad_norm": 0.3693858175983402, "learning_rate": 4.962881939239089e-05, "loss": 0.1855, "step": 480 }, { "epoch": 0.25039042165538783, "grad_norm": 0.329979856271684, "learning_rate": 4.962640338990598e-05, "loss": 0.1885, "step": 481 }, { "epoch": 0.2509109838625716, "grad_norm": 0.3796178436862751, "learning_rate": 4.962397960925686e-05, "loss": 0.1896, "step": 482 }, { "epoch": 0.2514315460697553, "grad_norm": 0.34129494053229703, "learning_rate": 4.9621548051209075e-05, "loss": 0.1772, "step": 483 }, { "epoch": 0.2519521082769391, "grad_norm": 0.33923486612711795, "learning_rate": 4.961910871653063e-05, "loss": 0.1809, "step": 484 }, { "epoch": 0.25247267048412286, "grad_norm": 0.32895841536481185, "learning_rate": 4.961666160599198e-05, "loss": 0.1801, "step": 485 }, { "epoch": 0.25299323269130664, "grad_norm": 0.34829345184997373, "learning_rate": 4.961420672036603e-05, "loss": 0.1783, "step": 486 }, { "epoch": 0.25351379489849035, "grad_norm": 0.36383868089193916, "learning_rate": 4.9611744060428156e-05, "loss": 0.1822, "step": 487 }, { "epoch": 0.2540343571056741, "grad_norm": 0.36477630270026584, "learning_rate": 4.960927362695617e-05, "loss": 0.1808, "step": 488 }, { "epoch": 0.2545549193128579, "grad_norm": 0.39127337500191006, "learning_rate": 4.960679542073036e-05, "loss": 0.1889, "step": 489 }, { "epoch": 0.25507548152004167, "grad_norm": 0.364605809032773, "learning_rate": 4.9604309442533454e-05, "loss": 0.1837, "step": 490 }, { "epoch": 0.2555960437272254, "grad_norm": 0.387563172322994, "learning_rate": 4.960181569315064e-05, "loss": 0.1813, "step": 491 }, { "epoch": 0.25611660593440916, "grad_norm": 0.3574697533499239, "learning_rate": 4.959931417336956e-05, "loss": 0.1778, "step": 492 }, { "epoch": 0.25663716814159293, "grad_norm": 0.3772487104771915, "learning_rate": 4.959680488398031e-05, "loss": 0.1852, "step": 493 }, { "epoch": 0.2571577303487767, "grad_norm": 0.37856588039150824, "learning_rate": 4.959428782577544e-05, "loss": 0.1852, "step": 494 }, { "epoch": 0.2576782925559604, "grad_norm": 0.3473512333204632, "learning_rate": 4.959176299954995e-05, "loss": 0.1883, "step": 495 }, { "epoch": 0.2581988547631442, "grad_norm": 0.3717884159305744, "learning_rate": 4.958923040610132e-05, "loss": 0.1901, "step": 496 }, { "epoch": 0.25871941697032796, "grad_norm": 0.3333259893749479, "learning_rate": 4.958669004622942e-05, "loss": 0.1755, "step": 497 }, { "epoch": 0.25923997917751174, "grad_norm": 0.352187312912872, "learning_rate": 4.9584141920736656e-05, "loss": 0.1814, "step": 498 }, { "epoch": 0.25976054138469545, "grad_norm": 0.35402911428193723, "learning_rate": 4.958158603042782e-05, "loss": 0.1862, "step": 499 }, { "epoch": 0.2602811035918792, "grad_norm": 0.351609210599415, "learning_rate": 4.957902237611018e-05, "loss": 0.1853, "step": 500 }, { "epoch": 0.260801665799063, "grad_norm": 0.3961682023708728, "learning_rate": 4.957645095859348e-05, "loss": 0.1804, "step": 501 }, { "epoch": 0.26132222800624677, "grad_norm": 0.3432093967728205, "learning_rate": 4.957387177868986e-05, "loss": 0.1777, "step": 502 }, { "epoch": 0.2618427902134305, "grad_norm": 0.43841871784060643, "learning_rate": 4.957128483721398e-05, "loss": 0.1967, "step": 503 }, { "epoch": 0.26236335242061426, "grad_norm": 0.32944557567419036, "learning_rate": 4.9568690134982884e-05, "loss": 0.1847, "step": 504 }, { "epoch": 0.26288391462779803, "grad_norm": 0.4072278216406123, "learning_rate": 4.956608767281612e-05, "loss": 0.1916, "step": 505 }, { "epoch": 0.2634044768349818, "grad_norm": 0.3470914106501254, "learning_rate": 4.9563477451535664e-05, "loss": 0.1686, "step": 506 }, { "epoch": 0.2639250390421655, "grad_norm": 0.3431768907003393, "learning_rate": 4.956085947196595e-05, "loss": 0.1844, "step": 507 }, { "epoch": 0.2644456012493493, "grad_norm": 0.373990259509472, "learning_rate": 4.955823373493385e-05, "loss": 0.1866, "step": 508 }, { "epoch": 0.26496616345653307, "grad_norm": 0.36035756528381147, "learning_rate": 4.955560024126868e-05, "loss": 0.1828, "step": 509 }, { "epoch": 0.26548672566371684, "grad_norm": 0.37277974712871015, "learning_rate": 4.9552958991802245e-05, "loss": 0.1877, "step": 510 }, { "epoch": 0.26600728787090056, "grad_norm": 0.3791475893537772, "learning_rate": 4.955030998736876e-05, "loss": 0.1787, "step": 511 }, { "epoch": 0.2665278500780843, "grad_norm": 0.34601195974476934, "learning_rate": 4.9547653228804915e-05, "loss": 0.178, "step": 512 }, { "epoch": 0.2670484122852681, "grad_norm": 0.3860891277749901, "learning_rate": 4.954498871694982e-05, "loss": 0.1786, "step": 513 }, { "epoch": 0.26756897449245187, "grad_norm": 0.36197045930962224, "learning_rate": 4.954231645264507e-05, "loss": 0.181, "step": 514 }, { "epoch": 0.2680895366996356, "grad_norm": 0.35660823996500324, "learning_rate": 4.953963643673468e-05, "loss": 0.1816, "step": 515 }, { "epoch": 0.26861009890681936, "grad_norm": 0.37899933398158164, "learning_rate": 4.953694867006513e-05, "loss": 0.1794, "step": 516 }, { "epoch": 0.26913066111400313, "grad_norm": 0.37134644471067046, "learning_rate": 4.953425315348534e-05, "loss": 0.1829, "step": 517 }, { "epoch": 0.2696512233211869, "grad_norm": 0.35629118982864444, "learning_rate": 4.953154988784667e-05, "loss": 0.1825, "step": 518 }, { "epoch": 0.2701717855283706, "grad_norm": 0.3569372189926499, "learning_rate": 4.952883887400296e-05, "loss": 0.1852, "step": 519 }, { "epoch": 0.2706923477355544, "grad_norm": 0.3784207348633246, "learning_rate": 4.9526120112810445e-05, "loss": 0.1792, "step": 520 }, { "epoch": 0.27121290994273817, "grad_norm": 0.3571363618294585, "learning_rate": 4.952339360512786e-05, "loss": 0.1829, "step": 521 }, { "epoch": 0.27173347214992194, "grad_norm": 0.37323568055590195, "learning_rate": 4.952065935181635e-05, "loss": 0.1844, "step": 522 }, { "epoch": 0.27225403435710566, "grad_norm": 0.3405402620718736, "learning_rate": 4.951791735373953e-05, "loss": 0.1817, "step": 523 }, { "epoch": 0.27277459656428943, "grad_norm": 0.3604521476507971, "learning_rate": 4.9515167611763434e-05, "loss": 0.1897, "step": 524 }, { "epoch": 0.2732951587714732, "grad_norm": 0.33208621053766485, "learning_rate": 4.951241012675657e-05, "loss": 0.1792, "step": 525 }, { "epoch": 0.273815720978657, "grad_norm": 0.3839270762469873, "learning_rate": 4.950964489958988e-05, "loss": 0.1841, "step": 526 }, { "epoch": 0.2743362831858407, "grad_norm": 0.31535848820876206, "learning_rate": 4.950687193113676e-05, "loss": 0.1743, "step": 527 }, { "epoch": 0.27485684539302446, "grad_norm": 0.37827478667026426, "learning_rate": 4.950409122227302e-05, "loss": 0.1883, "step": 528 }, { "epoch": 0.27537740760020823, "grad_norm": 0.3582790920870559, "learning_rate": 4.950130277387695e-05, "loss": 0.1772, "step": 529 }, { "epoch": 0.275897969807392, "grad_norm": 0.3584180526860497, "learning_rate": 4.949850658682929e-05, "loss": 0.1876, "step": 530 }, { "epoch": 0.2764185320145757, "grad_norm": 0.5738445813183063, "learning_rate": 4.949570266201317e-05, "loss": 0.178, "step": 531 }, { "epoch": 0.2769390942217595, "grad_norm": 0.3744948980693089, "learning_rate": 4.949289100031422e-05, "loss": 0.1823, "step": 532 }, { "epoch": 0.27745965642894327, "grad_norm": 0.3379331374106325, "learning_rate": 4.949007160262049e-05, "loss": 0.1768, "step": 533 }, { "epoch": 0.27798021863612704, "grad_norm": 0.3480919801415185, "learning_rate": 4.948724446982248e-05, "loss": 0.1859, "step": 534 }, { "epoch": 0.27850078084331076, "grad_norm": 0.3842609731913717, "learning_rate": 4.948440960281313e-05, "loss": 0.1753, "step": 535 }, { "epoch": 0.27902134305049453, "grad_norm": 0.37532802640166946, "learning_rate": 4.948156700248782e-05, "loss": 0.1837, "step": 536 }, { "epoch": 0.2795419052576783, "grad_norm": 0.37959061595323806, "learning_rate": 4.947871666974437e-05, "loss": 0.1833, "step": 537 }, { "epoch": 0.2800624674648621, "grad_norm": 0.4342019133871026, "learning_rate": 4.9475858605483074e-05, "loss": 0.1765, "step": 538 }, { "epoch": 0.2805830296720458, "grad_norm": 0.3600696888138904, "learning_rate": 4.94729928106066e-05, "loss": 0.1839, "step": 539 }, { "epoch": 0.28110359187922956, "grad_norm": 0.4075659366581729, "learning_rate": 4.9470119286020134e-05, "loss": 0.178, "step": 540 }, { "epoch": 0.28162415408641334, "grad_norm": 0.3288846470475826, "learning_rate": 4.946723803263125e-05, "loss": 0.1822, "step": 541 }, { "epoch": 0.2821447162935971, "grad_norm": 0.4265164240412001, "learning_rate": 4.946434905134999e-05, "loss": 0.1857, "step": 542 }, { "epoch": 0.2826652785007808, "grad_norm": 0.3490790807245664, "learning_rate": 4.9461452343088835e-05, "loss": 0.1792, "step": 543 }, { "epoch": 0.2831858407079646, "grad_norm": 0.3906850709137392, "learning_rate": 4.945854790876268e-05, "loss": 0.183, "step": 544 }, { "epoch": 0.28370640291514837, "grad_norm": 0.3673184770495506, "learning_rate": 4.94556357492889e-05, "loss": 0.1791, "step": 545 }, { "epoch": 0.28422696512233214, "grad_norm": 0.3435991187690188, "learning_rate": 4.9452715865587274e-05, "loss": 0.1799, "step": 546 }, { "epoch": 0.28474752732951586, "grad_norm": 0.33557383372158994, "learning_rate": 4.944978825858005e-05, "loss": 0.1754, "step": 547 }, { "epoch": 0.28526808953669963, "grad_norm": 0.3472232615973859, "learning_rate": 4.944685292919191e-05, "loss": 0.1829, "step": 548 }, { "epoch": 0.2857886517438834, "grad_norm": 0.34437324485625154, "learning_rate": 4.9443909878349945e-05, "loss": 0.1849, "step": 549 }, { "epoch": 0.2863092139510672, "grad_norm": 0.3366544790820249, "learning_rate": 4.944095910698372e-05, "loss": 0.1732, "step": 550 }, { "epoch": 0.2868297761582509, "grad_norm": 0.3249404378916035, "learning_rate": 4.9438000616025226e-05, "loss": 0.1756, "step": 551 }, { "epoch": 0.28735033836543467, "grad_norm": 0.3849346781758215, "learning_rate": 4.94350344064089e-05, "loss": 0.1856, "step": 552 }, { "epoch": 0.28787090057261844, "grad_norm": 0.3243739567095206, "learning_rate": 4.9432060479071584e-05, "loss": 0.183, "step": 553 }, { "epoch": 0.2883914627798022, "grad_norm": 0.3612277561023643, "learning_rate": 4.942907883495261e-05, "loss": 0.1786, "step": 554 }, { "epoch": 0.2889120249869859, "grad_norm": 0.3323067953459406, "learning_rate": 4.9426089474993696e-05, "loss": 0.1816, "step": 555 }, { "epoch": 0.2894325871941697, "grad_norm": 0.3316679874226504, "learning_rate": 4.942309240013905e-05, "loss": 0.1787, "step": 556 }, { "epoch": 0.28995314940135347, "grad_norm": 0.36297638930718357, "learning_rate": 4.9420087611335265e-05, "loss": 0.1895, "step": 557 }, { "epoch": 0.29047371160853724, "grad_norm": 0.34447907177875525, "learning_rate": 4.94170751095314e-05, "loss": 0.1831, "step": 558 }, { "epoch": 0.29099427381572096, "grad_norm": 0.37466310329246094, "learning_rate": 4.941405489567893e-05, "loss": 0.18, "step": 559 }, { "epoch": 0.29151483602290473, "grad_norm": 0.3240041218848788, "learning_rate": 4.9411026970731805e-05, "loss": 0.1695, "step": 560 }, { "epoch": 0.2920353982300885, "grad_norm": 0.34612245370701383, "learning_rate": 4.940799133564637e-05, "loss": 0.1722, "step": 561 }, { "epoch": 0.2925559604372723, "grad_norm": 0.35368918014173173, "learning_rate": 4.9404947991381416e-05, "loss": 0.1845, "step": 562 }, { "epoch": 0.293076522644456, "grad_norm": 0.3048992546896397, "learning_rate": 4.9401896938898185e-05, "loss": 0.1764, "step": 563 }, { "epoch": 0.29359708485163977, "grad_norm": 0.3780353682612039, "learning_rate": 4.9398838179160326e-05, "loss": 0.1763, "step": 564 }, { "epoch": 0.29411764705882354, "grad_norm": 0.33431520426027345, "learning_rate": 4.939577171313395e-05, "loss": 0.1738, "step": 565 }, { "epoch": 0.2946382092660073, "grad_norm": 0.3574740739801916, "learning_rate": 4.9392697541787587e-05, "loss": 0.1864, "step": 566 }, { "epoch": 0.29515877147319103, "grad_norm": 0.3080198068295583, "learning_rate": 4.93896156660922e-05, "loss": 0.1758, "step": 567 }, { "epoch": 0.2956793336803748, "grad_norm": 0.34221401560536524, "learning_rate": 4.938652608702119e-05, "loss": 0.1787, "step": 568 }, { "epoch": 0.2961998958875586, "grad_norm": 0.3422289356320483, "learning_rate": 4.938342880555039e-05, "loss": 0.1791, "step": 569 }, { "epoch": 0.29672045809474235, "grad_norm": 0.32294407935462033, "learning_rate": 4.938032382265807e-05, "loss": 0.1804, "step": 570 }, { "epoch": 0.29724102030192606, "grad_norm": 0.32293386743920566, "learning_rate": 4.937721113932493e-05, "loss": 0.1745, "step": 571 }, { "epoch": 0.29776158250910983, "grad_norm": 0.3339982925147738, "learning_rate": 4.937409075653408e-05, "loss": 0.1855, "step": 572 }, { "epoch": 0.2982821447162936, "grad_norm": 0.33201821976883394, "learning_rate": 4.9370962675271106e-05, "loss": 0.1867, "step": 573 }, { "epoch": 0.2988027069234774, "grad_norm": 0.36333200193928655, "learning_rate": 4.936782689652399e-05, "loss": 0.1798, "step": 574 }, { "epoch": 0.2993232691306611, "grad_norm": 0.3170978099956422, "learning_rate": 4.936468342128315e-05, "loss": 0.1734, "step": 575 }, { "epoch": 0.29984383133784487, "grad_norm": 0.3809200990521368, "learning_rate": 4.936153225054146e-05, "loss": 0.1731, "step": 576 }, { "epoch": 0.30036439354502864, "grad_norm": 0.32675460808837287, "learning_rate": 4.93583733852942e-05, "loss": 0.1796, "step": 577 }, { "epoch": 0.3008849557522124, "grad_norm": 0.34942233525996, "learning_rate": 4.935520682653908e-05, "loss": 0.182, "step": 578 }, { "epoch": 0.30140551795939613, "grad_norm": 0.34564982764778135, "learning_rate": 4.9352032575276255e-05, "loss": 0.1778, "step": 579 }, { "epoch": 0.3019260801665799, "grad_norm": 0.3716440462553411, "learning_rate": 4.9348850632508295e-05, "loss": 0.1793, "step": 580 }, { "epoch": 0.3024466423737637, "grad_norm": 0.38121896546844525, "learning_rate": 4.934566099924021e-05, "loss": 0.1846, "step": 581 }, { "epoch": 0.30296720458094745, "grad_norm": 0.34320281278816944, "learning_rate": 4.9342463676479424e-05, "loss": 0.1877, "step": 582 }, { "epoch": 0.30348776678813116, "grad_norm": 0.356611150010106, "learning_rate": 4.933925866523581e-05, "loss": 0.1787, "step": 583 }, { "epoch": 0.30400832899531494, "grad_norm": 0.3535551866752405, "learning_rate": 4.933604596652166e-05, "loss": 0.1753, "step": 584 }, { "epoch": 0.3045288912024987, "grad_norm": 0.3587966333569922, "learning_rate": 4.933282558135169e-05, "loss": 0.1767, "step": 585 }, { "epoch": 0.3050494534096825, "grad_norm": 0.32820492797103323, "learning_rate": 4.932959751074305e-05, "loss": 0.1765, "step": 586 }, { "epoch": 0.3055700156168662, "grad_norm": 0.3668795974443606, "learning_rate": 4.932636175571531e-05, "loss": 0.1847, "step": 587 }, { "epoch": 0.30609057782404997, "grad_norm": 0.3893815535302477, "learning_rate": 4.932311831729048e-05, "loss": 0.1769, "step": 588 }, { "epoch": 0.30661114003123374, "grad_norm": 0.32745455373903526, "learning_rate": 4.931986719649299e-05, "loss": 0.1803, "step": 589 }, { "epoch": 0.3071317022384175, "grad_norm": 0.33500065348948266, "learning_rate": 4.9316608394349684e-05, "loss": 0.1759, "step": 590 }, { "epoch": 0.30765226444560123, "grad_norm": 0.34040682897700947, "learning_rate": 4.931334191188985e-05, "loss": 0.1809, "step": 591 }, { "epoch": 0.308172826652785, "grad_norm": 0.30393987030284975, "learning_rate": 4.93100677501452e-05, "loss": 0.1707, "step": 592 }, { "epoch": 0.3086933888599688, "grad_norm": 0.3351048511454525, "learning_rate": 4.930678591014986e-05, "loss": 0.1792, "step": 593 }, { "epoch": 0.30921395106715255, "grad_norm": 0.36537436869160295, "learning_rate": 4.930349639294038e-05, "loss": 0.1808, "step": 594 }, { "epoch": 0.30973451327433627, "grad_norm": 0.32634199146147447, "learning_rate": 4.930019919955576e-05, "loss": 0.1736, "step": 595 }, { "epoch": 0.31025507548152004, "grad_norm": 0.3299845870559719, "learning_rate": 4.9296894331037405e-05, "loss": 0.1793, "step": 596 }, { "epoch": 0.3107756376887038, "grad_norm": 0.34165209773973343, "learning_rate": 4.9293581788429136e-05, "loss": 0.1814, "step": 597 }, { "epoch": 0.3112961998958876, "grad_norm": 0.3453422147378798, "learning_rate": 4.92902615727772e-05, "loss": 0.177, "step": 598 }, { "epoch": 0.3118167621030713, "grad_norm": 0.32274993522205253, "learning_rate": 4.92869336851303e-05, "loss": 0.1708, "step": 599 }, { "epoch": 0.31233732431025507, "grad_norm": 0.32030522939856365, "learning_rate": 4.9283598126539524e-05, "loss": 0.173, "step": 600 }, { "epoch": 0.31285788651743884, "grad_norm": 0.3482608585554554, "learning_rate": 4.92802548980584e-05, "loss": 0.179, "step": 601 }, { "epoch": 0.3133784487246226, "grad_norm": 0.3692046388241526, "learning_rate": 4.927690400074286e-05, "loss": 0.1812, "step": 602 }, { "epoch": 0.31389901093180633, "grad_norm": 0.33322086716168214, "learning_rate": 4.92735454356513e-05, "loss": 0.1852, "step": 603 }, { "epoch": 0.3144195731389901, "grad_norm": 0.3715077169273792, "learning_rate": 4.92701792038445e-05, "loss": 0.1767, "step": 604 }, { "epoch": 0.3149401353461739, "grad_norm": 0.33348425102156853, "learning_rate": 4.926680530638567e-05, "loss": 0.179, "step": 605 }, { "epoch": 0.31546069755335765, "grad_norm": 0.34957898455987335, "learning_rate": 4.926342374434043e-05, "loss": 0.1824, "step": 606 }, { "epoch": 0.31598125976054137, "grad_norm": 0.35273594539974945, "learning_rate": 4.926003451877687e-05, "loss": 0.1784, "step": 607 }, { "epoch": 0.31650182196772514, "grad_norm": 0.32859387991060474, "learning_rate": 4.9256637630765425e-05, "loss": 0.1761, "step": 608 }, { "epoch": 0.3170223841749089, "grad_norm": 0.3364281217569474, "learning_rate": 4.9253233081379024e-05, "loss": 0.1757, "step": 609 }, { "epoch": 0.3175429463820927, "grad_norm": 0.3180815183990718, "learning_rate": 4.924982087169296e-05, "loss": 0.1688, "step": 610 }, { "epoch": 0.3180635085892764, "grad_norm": 0.3265120523959636, "learning_rate": 4.9246401002784976e-05, "loss": 0.1794, "step": 611 }, { "epoch": 0.3185840707964602, "grad_norm": 0.3435491116451335, "learning_rate": 4.9242973475735224e-05, "loss": 0.1807, "step": 612 }, { "epoch": 0.31910463300364394, "grad_norm": 0.3601397832581144, "learning_rate": 4.923953829162628e-05, "loss": 0.1844, "step": 613 }, { "epoch": 0.3196251952108277, "grad_norm": 0.33676918035319736, "learning_rate": 4.923609545154313e-05, "loss": 0.1794, "step": 614 }, { "epoch": 0.32014575741801143, "grad_norm": 0.35847840458223534, "learning_rate": 4.923264495657319e-05, "loss": 0.1796, "step": 615 }, { "epoch": 0.3206663196251952, "grad_norm": 0.3517065818342989, "learning_rate": 4.9229186807806284e-05, "loss": 0.1772, "step": 616 }, { "epoch": 0.321186881832379, "grad_norm": 0.32840575469268135, "learning_rate": 4.9225721006334644e-05, "loss": 0.183, "step": 617 }, { "epoch": 0.32170744403956275, "grad_norm": 0.3304656690924349, "learning_rate": 4.922224755325295e-05, "loss": 0.1826, "step": 618 }, { "epoch": 0.32222800624674647, "grad_norm": 0.3365642578033948, "learning_rate": 4.921876644965827e-05, "loss": 0.1849, "step": 619 }, { "epoch": 0.32274856845393024, "grad_norm": 0.321873293657072, "learning_rate": 4.921527769665011e-05, "loss": 0.1842, "step": 620 }, { "epoch": 0.323269130661114, "grad_norm": 0.3304218237484672, "learning_rate": 4.921178129533036e-05, "loss": 0.1766, "step": 621 }, { "epoch": 0.3237896928682978, "grad_norm": 0.36159820669310355, "learning_rate": 4.920827724680336e-05, "loss": 0.1833, "step": 622 }, { "epoch": 0.3243102550754815, "grad_norm": 0.3125115981551096, "learning_rate": 4.9204765552175857e-05, "loss": 0.1687, "step": 623 }, { "epoch": 0.3248308172826653, "grad_norm": 0.355646049290069, "learning_rate": 4.920124621255699e-05, "loss": 0.1831, "step": 624 }, { "epoch": 0.32535137948984905, "grad_norm": 0.31743405014825066, "learning_rate": 4.9197719229058346e-05, "loss": 0.1719, "step": 625 }, { "epoch": 0.3258719416970328, "grad_norm": 0.3320164493908055, "learning_rate": 4.9194184602793904e-05, "loss": 0.1779, "step": 626 }, { "epoch": 0.32639250390421654, "grad_norm": 0.3569014079597437, "learning_rate": 4.919064233488006e-05, "loss": 0.1792, "step": 627 }, { "epoch": 0.3269130661114003, "grad_norm": 0.3422394188289934, "learning_rate": 4.9187092426435634e-05, "loss": 0.1853, "step": 628 }, { "epoch": 0.3274336283185841, "grad_norm": 0.45779658269843443, "learning_rate": 4.918353487858185e-05, "loss": 0.1865, "step": 629 }, { "epoch": 0.32795419052576785, "grad_norm": 0.36967762282280137, "learning_rate": 4.917996969244235e-05, "loss": 0.1842, "step": 630 }, { "epoch": 0.32847475273295157, "grad_norm": 0.3383443636578056, "learning_rate": 4.917639686914317e-05, "loss": 0.177, "step": 631 }, { "epoch": 0.32899531494013534, "grad_norm": 0.34576867722134114, "learning_rate": 4.91728164098128e-05, "loss": 0.1741, "step": 632 }, { "epoch": 0.3295158771473191, "grad_norm": 0.33889725319396574, "learning_rate": 4.9169228315582094e-05, "loss": 0.1733, "step": 633 }, { "epoch": 0.3300364393545029, "grad_norm": 0.3181574991844868, "learning_rate": 4.9165632587584346e-05, "loss": 0.1712, "step": 634 }, { "epoch": 0.3305570015616866, "grad_norm": 0.3150178248046848, "learning_rate": 4.916202922695526e-05, "loss": 0.1769, "step": 635 }, { "epoch": 0.3310775637688704, "grad_norm": 0.3231916056988664, "learning_rate": 4.9158418234832935e-05, "loss": 0.1711, "step": 636 }, { "epoch": 0.33159812597605415, "grad_norm": 0.3219246488562882, "learning_rate": 4.9154799612357905e-05, "loss": 0.1742, "step": 637 }, { "epoch": 0.3321186881832379, "grad_norm": 0.3490019970784047, "learning_rate": 4.915117336067308e-05, "loss": 0.187, "step": 638 }, { "epoch": 0.33263925039042164, "grad_norm": 0.3404922017535072, "learning_rate": 4.914753948092381e-05, "loss": 0.1814, "step": 639 }, { "epoch": 0.3331598125976054, "grad_norm": 0.33038580290518343, "learning_rate": 4.9143897974257845e-05, "loss": 0.1836, "step": 640 }, { "epoch": 0.3336803748047892, "grad_norm": 0.48447675580050537, "learning_rate": 4.914024884182534e-05, "loss": 0.1723, "step": 641 }, { "epoch": 0.33420093701197295, "grad_norm": 0.31792440208506284, "learning_rate": 4.913659208477886e-05, "loss": 0.1744, "step": 642 }, { "epoch": 0.33472149921915667, "grad_norm": 1.1187782722485253, "learning_rate": 4.9132927704273376e-05, "loss": 0.1782, "step": 643 }, { "epoch": 0.33524206142634044, "grad_norm": 0.46784530940515706, "learning_rate": 4.9129255701466284e-05, "loss": 0.1785, "step": 644 }, { "epoch": 0.3357626236335242, "grad_norm": 3.7910679408191483, "learning_rate": 4.9125576077517356e-05, "loss": 0.19, "step": 645 }, { "epoch": 0.336283185840708, "grad_norm": 0.5244754308415, "learning_rate": 4.9121888833588795e-05, "loss": 0.179, "step": 646 }, { "epoch": 0.3368037480478917, "grad_norm": 1.3332280542365016, "learning_rate": 4.91181939708452e-05, "loss": 0.1785, "step": 647 }, { "epoch": 0.3373243102550755, "grad_norm": 0.6152204952369942, "learning_rate": 4.9114491490453585e-05, "loss": 0.1817, "step": 648 }, { "epoch": 0.33784487246225925, "grad_norm": 0.6322972902919656, "learning_rate": 4.911078139358337e-05, "loss": 0.1834, "step": 649 }, { "epoch": 0.338365434669443, "grad_norm": 0.4481338546785872, "learning_rate": 4.9107063681406376e-05, "loss": 0.1825, "step": 650 }, { "epoch": 0.33888599687662674, "grad_norm": 0.9859952621710755, "learning_rate": 4.910333835509682e-05, "loss": 0.1809, "step": 651 }, { "epoch": 0.3394065590838105, "grad_norm": 0.4410398257105147, "learning_rate": 4.909960541583133e-05, "loss": 0.1758, "step": 652 }, { "epoch": 0.3399271212909943, "grad_norm": 0.46192330205640203, "learning_rate": 4.909586486478897e-05, "loss": 0.1745, "step": 653 }, { "epoch": 0.34044768349817806, "grad_norm": 0.4347198500546606, "learning_rate": 4.909211670315114e-05, "loss": 0.1763, "step": 654 }, { "epoch": 0.34096824570536177, "grad_norm": 0.47923777050652683, "learning_rate": 4.908836093210172e-05, "loss": 0.186, "step": 655 }, { "epoch": 0.34148880791254554, "grad_norm": 0.3778493845758958, "learning_rate": 4.9084597552826935e-05, "loss": 0.1778, "step": 656 }, { "epoch": 0.3420093701197293, "grad_norm": 0.4629741259881899, "learning_rate": 4.908082656651544e-05, "loss": 0.1835, "step": 657 }, { "epoch": 0.3425299323269131, "grad_norm": 0.43536744466295096, "learning_rate": 4.907704797435829e-05, "loss": 0.1836, "step": 658 }, { "epoch": 0.3430504945340968, "grad_norm": 0.390115416460036, "learning_rate": 4.9073261777548954e-05, "loss": 0.1835, "step": 659 }, { "epoch": 0.3435710567412806, "grad_norm": 0.46457793969633604, "learning_rate": 4.9069467977283255e-05, "loss": 0.1771, "step": 660 }, { "epoch": 0.34409161894846435, "grad_norm": 0.3963393707607232, "learning_rate": 4.906566657475949e-05, "loss": 0.1769, "step": 661 }, { "epoch": 0.3446121811556481, "grad_norm": 0.36870322720760557, "learning_rate": 4.906185757117829e-05, "loss": 0.1781, "step": 662 }, { "epoch": 0.34513274336283184, "grad_norm": 0.3979707755147091, "learning_rate": 4.905804096774274e-05, "loss": 0.1801, "step": 663 }, { "epoch": 0.3456533055700156, "grad_norm": 0.36701302139429726, "learning_rate": 4.905421676565827e-05, "loss": 0.181, "step": 664 }, { "epoch": 0.3461738677771994, "grad_norm": 0.3698838374588819, "learning_rate": 4.905038496613277e-05, "loss": 0.1877, "step": 665 }, { "epoch": 0.34669442998438316, "grad_norm": 0.352717338392521, "learning_rate": 4.9046545570376484e-05, "loss": 0.1789, "step": 666 }, { "epoch": 0.3472149921915669, "grad_norm": 0.3809906085196877, "learning_rate": 4.904269857960207e-05, "loss": 0.1779, "step": 667 }, { "epoch": 0.34773555439875065, "grad_norm": 0.3717632611173711, "learning_rate": 4.9038843995024606e-05, "loss": 0.1841, "step": 668 }, { "epoch": 0.3482561166059344, "grad_norm": 0.3616057869533888, "learning_rate": 4.9034981817861534e-05, "loss": 0.1832, "step": 669 }, { "epoch": 0.3487766788131182, "grad_norm": 0.3720170377297904, "learning_rate": 4.9031112049332715e-05, "loss": 0.1773, "step": 670 }, { "epoch": 0.3492972410203019, "grad_norm": 0.38176483972868525, "learning_rate": 4.9027234690660396e-05, "loss": 0.1795, "step": 671 }, { "epoch": 0.3498178032274857, "grad_norm": 0.3971475485254792, "learning_rate": 4.902334974306924e-05, "loss": 0.187, "step": 672 }, { "epoch": 0.35033836543466945, "grad_norm": 0.33041713668617295, "learning_rate": 4.9019457207786265e-05, "loss": 0.168, "step": 673 }, { "epoch": 0.3508589276418532, "grad_norm": 0.3660144363430606, "learning_rate": 4.901555708604095e-05, "loss": 0.1856, "step": 674 }, { "epoch": 0.35137948984903694, "grad_norm": 0.3766530978916359, "learning_rate": 4.901164937906511e-05, "loss": 0.1794, "step": 675 }, { "epoch": 0.3519000520562207, "grad_norm": 0.3378578696974802, "learning_rate": 4.900773408809299e-05, "loss": 0.1757, "step": 676 }, { "epoch": 0.3524206142634045, "grad_norm": 0.421166687268464, "learning_rate": 4.900381121436123e-05, "loss": 0.1824, "step": 677 }, { "epoch": 0.35294117647058826, "grad_norm": 0.36026982081841086, "learning_rate": 4.8999880759108844e-05, "loss": 0.1811, "step": 678 }, { "epoch": 0.353461738677772, "grad_norm": 0.3655859893200682, "learning_rate": 4.899594272357726e-05, "loss": 0.1739, "step": 679 }, { "epoch": 0.35398230088495575, "grad_norm": 0.3821275209491641, "learning_rate": 4.899199710901028e-05, "loss": 0.1783, "step": 680 }, { "epoch": 0.3545028630921395, "grad_norm": 0.3409239426235484, "learning_rate": 4.8988043916654126e-05, "loss": 0.1782, "step": 681 }, { "epoch": 0.3550234252993233, "grad_norm": 0.37076066161212423, "learning_rate": 4.89840831477574e-05, "loss": 0.1732, "step": 682 }, { "epoch": 0.355543987506507, "grad_norm": 0.3742846141003235, "learning_rate": 4.8980114803571084e-05, "loss": 0.1874, "step": 683 }, { "epoch": 0.3560645497136908, "grad_norm": 0.3537697722706192, "learning_rate": 4.8976138885348575e-05, "loss": 0.1813, "step": 684 }, { "epoch": 0.35658511192087455, "grad_norm": 0.34743989109554174, "learning_rate": 4.897215539434566e-05, "loss": 0.1759, "step": 685 }, { "epoch": 0.3571056741280583, "grad_norm": 0.3591186392260258, "learning_rate": 4.89681643318205e-05, "loss": 0.1677, "step": 686 }, { "epoch": 0.35762623633524204, "grad_norm": 0.35400264664937536, "learning_rate": 4.896416569903366e-05, "loss": 0.1755, "step": 687 }, { "epoch": 0.3581467985424258, "grad_norm": 0.36086365972267287, "learning_rate": 4.89601594972481e-05, "loss": 0.1738, "step": 688 }, { "epoch": 0.3586673607496096, "grad_norm": 0.34105586937400784, "learning_rate": 4.8956145727729156e-05, "loss": 0.1743, "step": 689 }, { "epoch": 0.35918792295679336, "grad_norm": 0.3373349436933188, "learning_rate": 4.895212439174457e-05, "loss": 0.1733, "step": 690 }, { "epoch": 0.3597084851639771, "grad_norm": 0.3503292696755782, "learning_rate": 4.894809549056447e-05, "loss": 0.1727, "step": 691 }, { "epoch": 0.36022904737116085, "grad_norm": 0.35086053297097963, "learning_rate": 4.894405902546136e-05, "loss": 0.1863, "step": 692 }, { "epoch": 0.3607496095783446, "grad_norm": 0.3839353193339784, "learning_rate": 4.894001499771015e-05, "loss": 0.183, "step": 693 }, { "epoch": 0.3612701717855284, "grad_norm": 0.32399392427850626, "learning_rate": 4.8935963408588134e-05, "loss": 0.1748, "step": 694 }, { "epoch": 0.3617907339927121, "grad_norm": 0.3585859771864591, "learning_rate": 4.893190425937499e-05, "loss": 0.1811, "step": 695 }, { "epoch": 0.3623112961998959, "grad_norm": 0.3227714956193789, "learning_rate": 4.8927837551352784e-05, "loss": 0.1782, "step": 696 }, { "epoch": 0.36283185840707965, "grad_norm": 0.3474440141200088, "learning_rate": 4.8923763285805965e-05, "loss": 0.1754, "step": 697 }, { "epoch": 0.3633524206142634, "grad_norm": 0.3322694368519195, "learning_rate": 4.89196814640214e-05, "loss": 0.1774, "step": 698 }, { "epoch": 0.36387298282144714, "grad_norm": 0.34820159808484086, "learning_rate": 4.89155920872883e-05, "loss": 0.177, "step": 699 }, { "epoch": 0.3643935450286309, "grad_norm": 0.3031746341954232, "learning_rate": 4.891149515689827e-05, "loss": 0.1736, "step": 700 }, { "epoch": 0.3649141072358147, "grad_norm": 0.36173867640457913, "learning_rate": 4.8907390674145335e-05, "loss": 0.1861, "step": 701 }, { "epoch": 0.36543466944299846, "grad_norm": 0.30797603475460056, "learning_rate": 4.890327864032587e-05, "loss": 0.1766, "step": 702 }, { "epoch": 0.3659552316501822, "grad_norm": 0.4370440494274063, "learning_rate": 4.8899159056738646e-05, "loss": 0.1791, "step": 703 }, { "epoch": 0.36647579385736595, "grad_norm": 0.33055435550054857, "learning_rate": 4.889503192468482e-05, "loss": 0.1794, "step": 704 }, { "epoch": 0.3669963560645497, "grad_norm": 0.3232689437227807, "learning_rate": 4.8890897245467934e-05, "loss": 0.1692, "step": 705 }, { "epoch": 0.3675169182717335, "grad_norm": 0.32294570761168684, "learning_rate": 4.8886755020393915e-05, "loss": 0.1815, "step": 706 }, { "epoch": 0.3680374804789172, "grad_norm": 0.34062024576611194, "learning_rate": 4.888260525077106e-05, "loss": 0.1702, "step": 707 }, { "epoch": 0.368558042686101, "grad_norm": 0.30465620374884, "learning_rate": 4.887844793791008e-05, "loss": 0.1751, "step": 708 }, { "epoch": 0.36907860489328476, "grad_norm": 0.3420067292282057, "learning_rate": 4.887428308312402e-05, "loss": 0.1787, "step": 709 }, { "epoch": 0.36959916710046853, "grad_norm": 0.32157102458891257, "learning_rate": 4.887011068772835e-05, "loss": 0.1757, "step": 710 }, { "epoch": 0.37011972930765225, "grad_norm": 0.3128376282076131, "learning_rate": 4.886593075304091e-05, "loss": 0.1743, "step": 711 }, { "epoch": 0.370640291514836, "grad_norm": 0.3401322277124052, "learning_rate": 4.886174328038191e-05, "loss": 0.1777, "step": 712 }, { "epoch": 0.3711608537220198, "grad_norm": 0.33579339844499495, "learning_rate": 4.885754827107395e-05, "loss": 0.1772, "step": 713 }, { "epoch": 0.37168141592920356, "grad_norm": 0.3153726122024893, "learning_rate": 4.885334572644202e-05, "loss": 0.1789, "step": 714 }, { "epoch": 0.3722019781363873, "grad_norm": 0.32118752532405626, "learning_rate": 4.884913564781346e-05, "loss": 0.1719, "step": 715 }, { "epoch": 0.37272254034357105, "grad_norm": 0.3213935784665974, "learning_rate": 4.884491803651803e-05, "loss": 0.169, "step": 716 }, { "epoch": 0.3732431025507548, "grad_norm": 0.33814274890520907, "learning_rate": 4.884069289388783e-05, "loss": 0.1781, "step": 717 }, { "epoch": 0.3737636647579386, "grad_norm": 0.30359841482721706, "learning_rate": 4.883646022125736e-05, "loss": 0.1733, "step": 718 }, { "epoch": 0.3742842269651223, "grad_norm": 0.30429885008494284, "learning_rate": 4.8832220019963514e-05, "loss": 0.1693, "step": 719 }, { "epoch": 0.3748047891723061, "grad_norm": 0.331016348005072, "learning_rate": 4.882797229134551e-05, "loss": 0.1771, "step": 720 }, { "epoch": 0.37532535137948986, "grad_norm": 0.3339222828334093, "learning_rate": 4.882371703674501e-05, "loss": 0.1782, "step": 721 }, { "epoch": 0.37584591358667363, "grad_norm": 0.2991710011140988, "learning_rate": 4.8819454257506015e-05, "loss": 0.1672, "step": 722 }, { "epoch": 0.37636647579385735, "grad_norm": 0.36396331898025386, "learning_rate": 4.8815183954974896e-05, "loss": 0.1715, "step": 723 }, { "epoch": 0.3768870380010411, "grad_norm": 0.33416943073891897, "learning_rate": 4.881090613050042e-05, "loss": 0.1771, "step": 724 }, { "epoch": 0.3774076002082249, "grad_norm": 0.36523650639258104, "learning_rate": 4.8806620785433726e-05, "loss": 0.164, "step": 725 }, { "epoch": 0.37792816241540866, "grad_norm": 0.32073950251793465, "learning_rate": 4.880232792112832e-05, "loss": 0.171, "step": 726 }, { "epoch": 0.3784487246225924, "grad_norm": 0.3484933060375919, "learning_rate": 4.879802753894009e-05, "loss": 0.1803, "step": 727 }, { "epoch": 0.37896928682977615, "grad_norm": 0.33643384179749536, "learning_rate": 4.879371964022731e-05, "loss": 0.1762, "step": 728 }, { "epoch": 0.3794898490369599, "grad_norm": 0.3844044688078634, "learning_rate": 4.878940422635059e-05, "loss": 0.1757, "step": 729 }, { "epoch": 0.3800104112441437, "grad_norm": 0.31212059732422176, "learning_rate": 4.878508129867296e-05, "loss": 0.1744, "step": 730 }, { "epoch": 0.3805309734513274, "grad_norm": 0.37952982057783435, "learning_rate": 4.8780750858559794e-05, "loss": 0.1873, "step": 731 }, { "epoch": 0.3810515356585112, "grad_norm": 0.36193360687044024, "learning_rate": 4.877641290737884e-05, "loss": 0.1746, "step": 732 }, { "epoch": 0.38157209786569496, "grad_norm": 0.3415108925924172, "learning_rate": 4.8772067446500235e-05, "loss": 0.176, "step": 733 }, { "epoch": 0.38209266007287873, "grad_norm": 0.35739113924155885, "learning_rate": 4.8767714477296475e-05, "loss": 0.1773, "step": 734 }, { "epoch": 0.38261322228006245, "grad_norm": 0.299683012812736, "learning_rate": 4.8763354001142426e-05, "loss": 0.1742, "step": 735 }, { "epoch": 0.3831337844872462, "grad_norm": 0.3281156150608031, "learning_rate": 4.875898601941533e-05, "loss": 0.1773, "step": 736 }, { "epoch": 0.38365434669443, "grad_norm": 0.29779263356766955, "learning_rate": 4.875461053349481e-05, "loss": 0.1754, "step": 737 }, { "epoch": 0.38417490890161377, "grad_norm": 0.36739569269258815, "learning_rate": 4.875022754476283e-05, "loss": 0.1768, "step": 738 }, { "epoch": 0.3846954711087975, "grad_norm": 0.3305403555222801, "learning_rate": 4.8745837054603746e-05, "loss": 0.1782, "step": 739 }, { "epoch": 0.38521603331598125, "grad_norm": 0.3461843255219805, "learning_rate": 4.87414390644043e-05, "loss": 0.1761, "step": 740 }, { "epoch": 0.385736595523165, "grad_norm": 0.34024915037224157, "learning_rate": 4.8737033575553556e-05, "loss": 0.1783, "step": 741 }, { "epoch": 0.3862571577303488, "grad_norm": 0.36955073863687815, "learning_rate": 4.873262058944299e-05, "loss": 0.1801, "step": 742 }, { "epoch": 0.3867777199375325, "grad_norm": 0.33810479957816336, "learning_rate": 4.872820010746641e-05, "loss": 0.1781, "step": 743 }, { "epoch": 0.3872982821447163, "grad_norm": 0.31816896200979733, "learning_rate": 4.872377213102003e-05, "loss": 0.1701, "step": 744 }, { "epoch": 0.38781884435190006, "grad_norm": 0.2994338543173188, "learning_rate": 4.871933666150239e-05, "loss": 0.1768, "step": 745 }, { "epoch": 0.38833940655908383, "grad_norm": 0.3143457501312435, "learning_rate": 4.8714893700314445e-05, "loss": 0.1745, "step": 746 }, { "epoch": 0.38885996876626755, "grad_norm": 0.3323503389224882, "learning_rate": 4.8710443248859464e-05, "loss": 0.1747, "step": 747 }, { "epoch": 0.3893805309734513, "grad_norm": 0.3144112661589175, "learning_rate": 4.870598530854312e-05, "loss": 0.1722, "step": 748 }, { "epoch": 0.3899010931806351, "grad_norm": 0.3299614127535381, "learning_rate": 4.870151988077343e-05, "loss": 0.177, "step": 749 }, { "epoch": 0.39042165538781887, "grad_norm": 0.3303857038262982, "learning_rate": 4.869704696696079e-05, "loss": 0.1686, "step": 750 }, { "epoch": 0.3909422175950026, "grad_norm": 0.30943696088141864, "learning_rate": 4.869256656851795e-05, "loss": 0.1748, "step": 751 }, { "epoch": 0.39146277980218636, "grad_norm": 0.3347597384412751, "learning_rate": 4.8688078686860025e-05, "loss": 0.1848, "step": 752 }, { "epoch": 0.39198334200937013, "grad_norm": 0.3357985035932315, "learning_rate": 4.8683583323404514e-05, "loss": 0.1722, "step": 753 }, { "epoch": 0.3925039042165539, "grad_norm": 0.3229921181970955, "learning_rate": 4.867908047957125e-05, "loss": 0.1716, "step": 754 }, { "epoch": 0.3930244664237376, "grad_norm": 0.4022384622310971, "learning_rate": 4.867457015678244e-05, "loss": 0.1786, "step": 755 }, { "epoch": 0.3935450286309214, "grad_norm": 0.3049771099599678, "learning_rate": 4.867005235646265e-05, "loss": 0.1793, "step": 756 }, { "epoch": 0.39406559083810516, "grad_norm": 0.4173130163682277, "learning_rate": 4.866552708003882e-05, "loss": 0.1778, "step": 757 }, { "epoch": 0.39458615304528893, "grad_norm": 0.3313304796655795, "learning_rate": 4.8660994328940235e-05, "loss": 0.179, "step": 758 }, { "epoch": 0.39510671525247265, "grad_norm": 0.3218402610994695, "learning_rate": 4.865645410459856e-05, "loss": 0.1628, "step": 759 }, { "epoch": 0.3956272774596564, "grad_norm": 0.32361763518417075, "learning_rate": 4.8651906408447795e-05, "loss": 0.1749, "step": 760 }, { "epoch": 0.3961478396668402, "grad_norm": 0.33838851288907523, "learning_rate": 4.864735124192432e-05, "loss": 0.1731, "step": 761 }, { "epoch": 0.39666840187402397, "grad_norm": 0.30076334190054943, "learning_rate": 4.8642788606466884e-05, "loss": 0.1697, "step": 762 }, { "epoch": 0.3971889640812077, "grad_norm": 0.2972937365361002, "learning_rate": 4.8638218503516554e-05, "loss": 0.1732, "step": 763 }, { "epoch": 0.39770952628839146, "grad_norm": 0.2930332941508139, "learning_rate": 4.863364093451679e-05, "loss": 0.1767, "step": 764 }, { "epoch": 0.39823008849557523, "grad_norm": 0.29879453433882425, "learning_rate": 4.862905590091341e-05, "loss": 0.1649, "step": 765 }, { "epoch": 0.398750650702759, "grad_norm": 0.30522662679820955, "learning_rate": 4.8624463404154575e-05, "loss": 0.1779, "step": 766 }, { "epoch": 0.3992712129099427, "grad_norm": 0.29429201663178467, "learning_rate": 4.8619863445690804e-05, "loss": 0.1742, "step": 767 }, { "epoch": 0.3997917751171265, "grad_norm": 0.2828872044198585, "learning_rate": 4.861525602697499e-05, "loss": 0.169, "step": 768 }, { "epoch": 0.40031233732431026, "grad_norm": 0.31056313318997847, "learning_rate": 4.861064114946236e-05, "loss": 0.1728, "step": 769 }, { "epoch": 0.40083289953149404, "grad_norm": 0.28343310391927706, "learning_rate": 4.860601881461051e-05, "loss": 0.171, "step": 770 }, { "epoch": 0.40135346173867775, "grad_norm": 0.31511791024068336, "learning_rate": 4.86013890238794e-05, "loss": 0.1754, "step": 771 }, { "epoch": 0.4018740239458615, "grad_norm": 0.3106130269067802, "learning_rate": 4.85967517787313e-05, "loss": 0.1755, "step": 772 }, { "epoch": 0.4023945861530453, "grad_norm": 0.3216761568260499, "learning_rate": 4.859210708063091e-05, "loss": 0.1709, "step": 773 }, { "epoch": 0.40291514836022907, "grad_norm": 0.3169473466081924, "learning_rate": 4.858745493104521e-05, "loss": 0.1677, "step": 774 }, { "epoch": 0.4034357105674128, "grad_norm": 0.2858364547750457, "learning_rate": 4.858279533144358e-05, "loss": 0.173, "step": 775 }, { "epoch": 0.40395627277459656, "grad_norm": 0.31534006666184045, "learning_rate": 4.8578128283297736e-05, "loss": 0.1701, "step": 776 }, { "epoch": 0.40447683498178033, "grad_norm": 0.2978277533533381, "learning_rate": 4.857345378808175e-05, "loss": 0.1642, "step": 777 }, { "epoch": 0.4049973971889641, "grad_norm": 0.30248488777067317, "learning_rate": 4.856877184727204e-05, "loss": 0.1763, "step": 778 }, { "epoch": 0.4055179593961478, "grad_norm": 0.31081161780013244, "learning_rate": 4.856408246234739e-05, "loss": 0.1672, "step": 779 }, { "epoch": 0.4060385216033316, "grad_norm": 0.33291363782085454, "learning_rate": 4.855938563478892e-05, "loss": 0.1773, "step": 780 }, { "epoch": 0.40655908381051536, "grad_norm": 0.3108957897270069, "learning_rate": 4.8554681366080104e-05, "loss": 0.1693, "step": 781 }, { "epoch": 0.40707964601769914, "grad_norm": 0.32908383574112354, "learning_rate": 4.8549969657706775e-05, "loss": 0.1708, "step": 782 }, { "epoch": 0.40760020822488285, "grad_norm": 0.3107460244869553, "learning_rate": 4.85452505111571e-05, "loss": 0.1683, "step": 783 }, { "epoch": 0.4081207704320666, "grad_norm": 0.31853260828349084, "learning_rate": 4.8540523927921616e-05, "loss": 0.1687, "step": 784 }, { "epoch": 0.4086413326392504, "grad_norm": 0.3316073002641256, "learning_rate": 4.85357899094932e-05, "loss": 0.1684, "step": 785 }, { "epoch": 0.40916189484643417, "grad_norm": 0.3027467331091613, "learning_rate": 4.853104845736706e-05, "loss": 0.1715, "step": 786 }, { "epoch": 0.4096824570536179, "grad_norm": 0.3278257429615449, "learning_rate": 4.852629957304078e-05, "loss": 0.1773, "step": 787 }, { "epoch": 0.41020301926080166, "grad_norm": 0.291153268830071, "learning_rate": 4.8521543258014276e-05, "loss": 0.1681, "step": 788 }, { "epoch": 0.41072358146798543, "grad_norm": 0.3559444910615816, "learning_rate": 4.8516779513789815e-05, "loss": 0.1807, "step": 789 }, { "epoch": 0.4112441436751692, "grad_norm": 0.3085670500497958, "learning_rate": 4.8512008341872e-05, "loss": 0.1719, "step": 790 }, { "epoch": 0.4117647058823529, "grad_norm": 0.35264383435574276, "learning_rate": 4.85072297437678e-05, "loss": 0.1794, "step": 791 }, { "epoch": 0.4122852680895367, "grad_norm": 0.3356960894753313, "learning_rate": 4.850244372098651e-05, "loss": 0.1769, "step": 792 }, { "epoch": 0.41280583029672047, "grad_norm": 0.32740161142053137, "learning_rate": 4.8497650275039795e-05, "loss": 0.1783, "step": 793 }, { "epoch": 0.41332639250390424, "grad_norm": 0.3351198703316088, "learning_rate": 4.849284940744163e-05, "loss": 0.1733, "step": 794 }, { "epoch": 0.41384695471108796, "grad_norm": 0.312070876021951, "learning_rate": 4.848804111970836e-05, "loss": 0.1866, "step": 795 }, { "epoch": 0.4143675169182717, "grad_norm": 0.3282848492692099, "learning_rate": 4.8483225413358663e-05, "loss": 0.1789, "step": 796 }, { "epoch": 0.4148880791254555, "grad_norm": 0.286266024247938, "learning_rate": 4.8478402289913566e-05, "loss": 0.1698, "step": 797 }, { "epoch": 0.4154086413326393, "grad_norm": 0.33058577040283377, "learning_rate": 4.847357175089643e-05, "loss": 0.1684, "step": 798 }, { "epoch": 0.415929203539823, "grad_norm": 0.2934521521206386, "learning_rate": 4.846873379783298e-05, "loss": 0.1753, "step": 799 }, { "epoch": 0.41644976574700676, "grad_norm": 0.31483778450446187, "learning_rate": 4.846388843225125e-05, "loss": 0.1665, "step": 800 }, { "epoch": 0.41697032795419053, "grad_norm": 0.30494366672931933, "learning_rate": 4.845903565568164e-05, "loss": 0.1791, "step": 801 }, { "epoch": 0.4174908901613743, "grad_norm": 0.2737424192822608, "learning_rate": 4.845417546965688e-05, "loss": 0.1689, "step": 802 }, { "epoch": 0.418011452368558, "grad_norm": 0.3176283439520924, "learning_rate": 4.844930787571204e-05, "loss": 0.1728, "step": 803 }, { "epoch": 0.4185320145757418, "grad_norm": 0.3013277659758763, "learning_rate": 4.844443287538454e-05, "loss": 0.1793, "step": 804 }, { "epoch": 0.41905257678292557, "grad_norm": 0.31841896666187924, "learning_rate": 4.8439550470214124e-05, "loss": 0.1725, "step": 805 }, { "epoch": 0.41957313899010934, "grad_norm": 0.32537384441510847, "learning_rate": 4.8434660661742894e-05, "loss": 0.1726, "step": 806 }, { "epoch": 0.42009370119729306, "grad_norm": 0.2918667124673871, "learning_rate": 4.8429763451515263e-05, "loss": 0.1713, "step": 807 }, { "epoch": 0.42061426340447683, "grad_norm": 0.3068100516789931, "learning_rate": 4.842485884107801e-05, "loss": 0.177, "step": 808 }, { "epoch": 0.4211348256116606, "grad_norm": 0.32460789829684644, "learning_rate": 4.8419946831980236e-05, "loss": 0.1742, "step": 809 }, { "epoch": 0.4216553878188444, "grad_norm": 0.2946290268979509, "learning_rate": 4.8415027425773386e-05, "loss": 0.1681, "step": 810 }, { "epoch": 0.4221759500260281, "grad_norm": 0.3258905754569639, "learning_rate": 4.841010062401123e-05, "loss": 0.1741, "step": 811 }, { "epoch": 0.42269651223321186, "grad_norm": 0.29790984995596925, "learning_rate": 4.840516642824988e-05, "loss": 0.1694, "step": 812 }, { "epoch": 0.42321707444039564, "grad_norm": 0.31430235551801683, "learning_rate": 4.8400224840047795e-05, "loss": 0.1692, "step": 813 }, { "epoch": 0.4237376366475794, "grad_norm": 0.28497704460198, "learning_rate": 4.839527586096575e-05, "loss": 0.1692, "step": 814 }, { "epoch": 0.4242581988547631, "grad_norm": 0.3251548281082162, "learning_rate": 4.839031949256687e-05, "loss": 0.1786, "step": 815 }, { "epoch": 0.4247787610619469, "grad_norm": 0.3037172282531429, "learning_rate": 4.838535573641661e-05, "loss": 0.1784, "step": 816 }, { "epoch": 0.42529932326913067, "grad_norm": 0.2921332397976667, "learning_rate": 4.838038459408273e-05, "loss": 0.1627, "step": 817 }, { "epoch": 0.42581988547631444, "grad_norm": 0.2871659618387349, "learning_rate": 4.837540606713538e-05, "loss": 0.1669, "step": 818 }, { "epoch": 0.42634044768349816, "grad_norm": 0.3022037954071766, "learning_rate": 4.837042015714698e-05, "loss": 0.1737, "step": 819 }, { "epoch": 0.42686100989068193, "grad_norm": 0.31565270517728566, "learning_rate": 4.8365426865692345e-05, "loss": 0.1787, "step": 820 }, { "epoch": 0.4273815720978657, "grad_norm": 0.32793352347608645, "learning_rate": 4.836042619434856e-05, "loss": 0.1748, "step": 821 }, { "epoch": 0.4279021343050495, "grad_norm": 0.317161732106535, "learning_rate": 4.835541814469509e-05, "loss": 0.1782, "step": 822 }, { "epoch": 0.4284226965122332, "grad_norm": 0.3049128671196971, "learning_rate": 4.83504027183137e-05, "loss": 0.1816, "step": 823 }, { "epoch": 0.42894325871941696, "grad_norm": 0.28919765868015046, "learning_rate": 4.8345379916788505e-05, "loss": 0.1672, "step": 824 }, { "epoch": 0.42946382092660074, "grad_norm": 0.33488488955581175, "learning_rate": 4.834034974170592e-05, "loss": 0.1688, "step": 825 }, { "epoch": 0.4299843831337845, "grad_norm": 0.31612472215220894, "learning_rate": 4.833531219465473e-05, "loss": 0.1639, "step": 826 }, { "epoch": 0.4305049453409682, "grad_norm": 0.30731831334383447, "learning_rate": 4.8330267277226006e-05, "loss": 0.1712, "step": 827 }, { "epoch": 0.431025507548152, "grad_norm": 0.31217223215929757, "learning_rate": 4.832521499101319e-05, "loss": 0.176, "step": 828 }, { "epoch": 0.43154606975533577, "grad_norm": 0.2795778724288852, "learning_rate": 4.8320155337612014e-05, "loss": 0.1703, "step": 829 }, { "epoch": 0.43206663196251954, "grad_norm": 0.2970259430025653, "learning_rate": 4.831508831862055e-05, "loss": 0.1693, "step": 830 }, { "epoch": 0.43258719416970326, "grad_norm": 0.31169918235532607, "learning_rate": 4.8310013935639206e-05, "loss": 0.1758, "step": 831 }, { "epoch": 0.43310775637688703, "grad_norm": 0.3282580950489392, "learning_rate": 4.830493219027071e-05, "loss": 0.1701, "step": 832 }, { "epoch": 0.4336283185840708, "grad_norm": 0.3035542216950865, "learning_rate": 4.829984308412011e-05, "loss": 0.1759, "step": 833 }, { "epoch": 0.4341488807912546, "grad_norm": 0.3170042392985621, "learning_rate": 4.8294746618794786e-05, "loss": 0.1714, "step": 834 }, { "epoch": 0.4346694429984383, "grad_norm": 0.30492935178098757, "learning_rate": 4.8289642795904433e-05, "loss": 0.1769, "step": 835 }, { "epoch": 0.43519000520562207, "grad_norm": 0.3023169478849347, "learning_rate": 4.828453161706108e-05, "loss": 0.1684, "step": 836 }, { "epoch": 0.43571056741280584, "grad_norm": 0.2934309140121328, "learning_rate": 4.8279413083879063e-05, "loss": 0.1726, "step": 837 }, { "epoch": 0.4362311296199896, "grad_norm": 0.33964067234560835, "learning_rate": 4.827428719797508e-05, "loss": 0.1811, "step": 838 }, { "epoch": 0.4367516918271733, "grad_norm": 0.3239469204735507, "learning_rate": 4.8269153960968094e-05, "loss": 0.1801, "step": 839 }, { "epoch": 0.4372722540343571, "grad_norm": 0.34044634387274025, "learning_rate": 4.8264013374479446e-05, "loss": 0.1734, "step": 840 }, { "epoch": 0.43779281624154087, "grad_norm": 0.3143167652108177, "learning_rate": 4.825886544013275e-05, "loss": 0.1742, "step": 841 }, { "epoch": 0.43831337844872464, "grad_norm": 0.2944577153853259, "learning_rate": 4.825371015955398e-05, "loss": 0.1653, "step": 842 }, { "epoch": 0.43883394065590836, "grad_norm": 0.3052896712979636, "learning_rate": 4.82485475343714e-05, "loss": 0.1704, "step": 843 }, { "epoch": 0.43935450286309213, "grad_norm": 0.3177119837657539, "learning_rate": 4.8243377566215616e-05, "loss": 0.1719, "step": 844 }, { "epoch": 0.4398750650702759, "grad_norm": 0.30859779939404325, "learning_rate": 4.8238200256719554e-05, "loss": 0.1779, "step": 845 }, { "epoch": 0.4403956272774597, "grad_norm": 0.3226603688721493, "learning_rate": 4.823301560751843e-05, "loss": 0.1686, "step": 846 }, { "epoch": 0.4409161894846434, "grad_norm": 0.29317543800324647, "learning_rate": 4.82278236202498e-05, "loss": 0.1673, "step": 847 }, { "epoch": 0.44143675169182717, "grad_norm": 0.31460096487006833, "learning_rate": 4.8222624296553554e-05, "loss": 0.1753, "step": 848 }, { "epoch": 0.44195731389901094, "grad_norm": 0.2919013066978429, "learning_rate": 4.821741763807186e-05, "loss": 0.1694, "step": 849 }, { "epoch": 0.4424778761061947, "grad_norm": 0.33306663050219115, "learning_rate": 4.821220364644923e-05, "loss": 0.1725, "step": 850 }, { "epoch": 0.44299843831337843, "grad_norm": 0.32508594947874425, "learning_rate": 4.8206982323332485e-05, "loss": 0.1703, "step": 851 }, { "epoch": 0.4435190005205622, "grad_norm": 0.312739185199514, "learning_rate": 4.820175367037076e-05, "loss": 0.1747, "step": 852 }, { "epoch": 0.444039562727746, "grad_norm": 0.3139934324616478, "learning_rate": 4.8196517689215515e-05, "loss": 0.1697, "step": 853 }, { "epoch": 0.44456012493492975, "grad_norm": 0.28334748849023433, "learning_rate": 4.8191274381520515e-05, "loss": 0.1701, "step": 854 }, { "epoch": 0.44508068714211346, "grad_norm": 0.31522836280773053, "learning_rate": 4.818602374894182e-05, "loss": 0.1724, "step": 855 }, { "epoch": 0.44560124934929723, "grad_norm": 0.28857799909498244, "learning_rate": 4.8180765793137856e-05, "loss": 0.1748, "step": 856 }, { "epoch": 0.446121811556481, "grad_norm": 0.2972500213315476, "learning_rate": 4.817550051576931e-05, "loss": 0.1653, "step": 857 }, { "epoch": 0.4466423737636648, "grad_norm": 0.29255400412368143, "learning_rate": 4.81702279184992e-05, "loss": 0.1615, "step": 858 }, { "epoch": 0.4471629359708485, "grad_norm": 0.2886655749254805, "learning_rate": 4.8164948002992874e-05, "loss": 0.1634, "step": 859 }, { "epoch": 0.44768349817803227, "grad_norm": 0.31437191386746677, "learning_rate": 4.815966077091796e-05, "loss": 0.1749, "step": 860 }, { "epoch": 0.44820406038521604, "grad_norm": 0.29304891186896304, "learning_rate": 4.815436622394441e-05, "loss": 0.1708, "step": 861 }, { "epoch": 0.4487246225923998, "grad_norm": 0.31745839082325317, "learning_rate": 4.814906436374451e-05, "loss": 0.1686, "step": 862 }, { "epoch": 0.44924518479958353, "grad_norm": 0.3046376488171113, "learning_rate": 4.814375519199281e-05, "loss": 0.1654, "step": 863 }, { "epoch": 0.4497657470067673, "grad_norm": 0.30408750750055136, "learning_rate": 4.8138438710366204e-05, "loss": 0.1692, "step": 864 }, { "epoch": 0.4502863092139511, "grad_norm": 0.2900405154788453, "learning_rate": 4.813311492054388e-05, "loss": 0.1719, "step": 865 }, { "epoch": 0.45080687142113485, "grad_norm": 0.307651302837432, "learning_rate": 4.8127783824207344e-05, "loss": 0.1768, "step": 866 }, { "epoch": 0.45132743362831856, "grad_norm": 0.3003033631697678, "learning_rate": 4.812244542304041e-05, "loss": 0.1687, "step": 867 }, { "epoch": 0.45184799583550234, "grad_norm": 0.3226224029065988, "learning_rate": 4.811709971872918e-05, "loss": 0.1737, "step": 868 }, { "epoch": 0.4523685580426861, "grad_norm": 0.3431087957838699, "learning_rate": 4.8111746712962066e-05, "loss": 0.1792, "step": 869 }, { "epoch": 0.4528891202498699, "grad_norm": 0.3241044954870311, "learning_rate": 4.810638640742983e-05, "loss": 0.1747, "step": 870 }, { "epoch": 0.4534096824570536, "grad_norm": 0.29640273607618955, "learning_rate": 4.810101880382548e-05, "loss": 0.165, "step": 871 }, { "epoch": 0.45393024466423737, "grad_norm": 0.2992144530417321, "learning_rate": 4.809564390384437e-05, "loss": 0.1734, "step": 872 }, { "epoch": 0.45445080687142114, "grad_norm": 0.33183641077540893, "learning_rate": 4.809026170918414e-05, "loss": 0.1737, "step": 873 }, { "epoch": 0.4549713690786049, "grad_norm": 0.28653542372832375, "learning_rate": 4.808487222154472e-05, "loss": 0.1634, "step": 874 }, { "epoch": 0.45549193128578863, "grad_norm": 0.32334785309979097, "learning_rate": 4.807947544262839e-05, "loss": 0.1733, "step": 875 }, { "epoch": 0.4560124934929724, "grad_norm": 0.3030305396087403, "learning_rate": 4.807407137413967e-05, "loss": 0.1658, "step": 876 }, { "epoch": 0.4565330557001562, "grad_norm": 0.296769998908498, "learning_rate": 4.806866001778545e-05, "loss": 0.1767, "step": 877 }, { "epoch": 0.45705361790733995, "grad_norm": 0.31049567683022117, "learning_rate": 4.806324137527487e-05, "loss": 0.1797, "step": 878 }, { "epoch": 0.45757418011452367, "grad_norm": 0.30534013515879393, "learning_rate": 4.8057815448319394e-05, "loss": 0.1691, "step": 879 }, { "epoch": 0.45809474232170744, "grad_norm": 0.31938583186414876, "learning_rate": 4.8052382238632774e-05, "loss": 0.1729, "step": 880 }, { "epoch": 0.4586153045288912, "grad_norm": 0.3046756327864025, "learning_rate": 4.804694174793108e-05, "loss": 0.1623, "step": 881 }, { "epoch": 0.459135866736075, "grad_norm": 0.30819124791549257, "learning_rate": 4.8041493977932685e-05, "loss": 0.1739, "step": 882 }, { "epoch": 0.4596564289432587, "grad_norm": 0.29113272544105995, "learning_rate": 4.803603893035822e-05, "loss": 0.1635, "step": 883 }, { "epoch": 0.46017699115044247, "grad_norm": 0.2908354554872441, "learning_rate": 4.803057660693065e-05, "loss": 0.1642, "step": 884 }, { "epoch": 0.46069755335762624, "grad_norm": 0.3228709959167957, "learning_rate": 4.8025107009375246e-05, "loss": 0.1687, "step": 885 }, { "epoch": 0.46121811556481, "grad_norm": 0.2875173089594343, "learning_rate": 4.8019630139419555e-05, "loss": 0.1779, "step": 886 }, { "epoch": 0.46173867777199373, "grad_norm": 0.33584341705384174, "learning_rate": 4.8014145998793416e-05, "loss": 0.1733, "step": 887 }, { "epoch": 0.4622592399791775, "grad_norm": 0.2896189188284826, "learning_rate": 4.8008654589228984e-05, "loss": 0.1684, "step": 888 }, { "epoch": 0.4627798021863613, "grad_norm": 0.3260162748091208, "learning_rate": 4.800315591246071e-05, "loss": 0.1679, "step": 889 }, { "epoch": 0.46330036439354505, "grad_norm": 0.28906298710463635, "learning_rate": 4.799764997022532e-05, "loss": 0.1748, "step": 890 }, { "epoch": 0.46382092660072877, "grad_norm": 0.28621158344042275, "learning_rate": 4.799213676426185e-05, "loss": 0.1745, "step": 891 }, { "epoch": 0.46434148880791254, "grad_norm": 0.309418801701727, "learning_rate": 4.798661629631163e-05, "loss": 0.168, "step": 892 }, { "epoch": 0.4648620510150963, "grad_norm": 0.2872933837593369, "learning_rate": 4.798108856811828e-05, "loss": 0.1695, "step": 893 }, { "epoch": 0.4653826132222801, "grad_norm": 0.31032860424563535, "learning_rate": 4.7975553581427715e-05, "loss": 0.164, "step": 894 }, { "epoch": 0.4659031754294638, "grad_norm": 0.2944281461376994, "learning_rate": 4.797001133798813e-05, "loss": 0.1715, "step": 895 }, { "epoch": 0.4664237376366476, "grad_norm": 0.3185115115776022, "learning_rate": 4.796446183955003e-05, "loss": 0.1664, "step": 896 }, { "epoch": 0.46694429984383135, "grad_norm": 0.2898044563512308, "learning_rate": 4.795890508786622e-05, "loss": 0.1681, "step": 897 }, { "epoch": 0.4674648620510151, "grad_norm": 0.30483790574973163, "learning_rate": 4.795334108469176e-05, "loss": 0.1687, "step": 898 }, { "epoch": 0.46798542425819883, "grad_norm": 0.30599381056013947, "learning_rate": 4.794776983178403e-05, "loss": 0.1711, "step": 899 }, { "epoch": 0.4685059864653826, "grad_norm": 0.3093671697842366, "learning_rate": 4.794219133090269e-05, "loss": 0.1696, "step": 900 }, { "epoch": 0.4690265486725664, "grad_norm": 0.3000416412774675, "learning_rate": 4.793660558380969e-05, "loss": 0.1748, "step": 901 }, { "epoch": 0.46954711087975015, "grad_norm": 0.3242303350202561, "learning_rate": 4.793101259226927e-05, "loss": 0.1775, "step": 902 }, { "epoch": 0.47006767308693387, "grad_norm": 0.31647313130174715, "learning_rate": 4.792541235804796e-05, "loss": 0.167, "step": 903 }, { "epoch": 0.47058823529411764, "grad_norm": 0.32293924392398315, "learning_rate": 4.791980488291456e-05, "loss": 0.1763, "step": 904 }, { "epoch": 0.4711087975013014, "grad_norm": 0.29224165341381086, "learning_rate": 4.7914190168640196e-05, "loss": 0.1678, "step": 905 }, { "epoch": 0.4716293597084852, "grad_norm": 0.30657161857494225, "learning_rate": 4.790856821699823e-05, "loss": 0.1739, "step": 906 }, { "epoch": 0.4721499219156689, "grad_norm": 0.29572827932510726, "learning_rate": 4.790293902976435e-05, "loss": 0.1635, "step": 907 }, { "epoch": 0.4726704841228527, "grad_norm": 0.3140759869076264, "learning_rate": 4.789730260871651e-05, "loss": 0.169, "step": 908 }, { "epoch": 0.47319104633003645, "grad_norm": 0.30710825268098163, "learning_rate": 4.7891658955634964e-05, "loss": 0.1658, "step": 909 }, { "epoch": 0.4737116085372202, "grad_norm": 0.31793790159265933, "learning_rate": 4.7886008072302235e-05, "loss": 0.1753, "step": 910 }, { "epoch": 0.47423217074440394, "grad_norm": 0.3014852750545304, "learning_rate": 4.788034996050314e-05, "loss": 0.1709, "step": 911 }, { "epoch": 0.4747527329515877, "grad_norm": 0.30747571574129023, "learning_rate": 4.787468462202476e-05, "loss": 0.1612, "step": 912 }, { "epoch": 0.4752732951587715, "grad_norm": 0.3065142379561827, "learning_rate": 4.786901205865647e-05, "loss": 0.1621, "step": 913 }, { "epoch": 0.47579385736595525, "grad_norm": 0.3017055320367006, "learning_rate": 4.786333227218995e-05, "loss": 0.1759, "step": 914 }, { "epoch": 0.47631441957313897, "grad_norm": 0.3063810493732478, "learning_rate": 4.785764526441913e-05, "loss": 0.1766, "step": 915 }, { "epoch": 0.47683498178032274, "grad_norm": 0.3139212066831126, "learning_rate": 4.7851951037140234e-05, "loss": 0.1686, "step": 916 }, { "epoch": 0.4773555439875065, "grad_norm": 0.2976317597214064, "learning_rate": 4.784624959215176e-05, "loss": 0.169, "step": 917 }, { "epoch": 0.4778761061946903, "grad_norm": 0.28710602424260284, "learning_rate": 4.78405409312545e-05, "loss": 0.1653, "step": 918 }, { "epoch": 0.478396668401874, "grad_norm": 0.30826508978927336, "learning_rate": 4.783482505625149e-05, "loss": 0.1716, "step": 919 }, { "epoch": 0.4789172306090578, "grad_norm": 0.31724938862767077, "learning_rate": 4.7829101968948095e-05, "loss": 0.1682, "step": 920 }, { "epoch": 0.47943779281624155, "grad_norm": 0.30073475133813604, "learning_rate": 4.782337167115193e-05, "loss": 0.1666, "step": 921 }, { "epoch": 0.4799583550234253, "grad_norm": 0.2941398432084064, "learning_rate": 4.7817634164672875e-05, "loss": 0.1654, "step": 922 }, { "epoch": 0.48047891723060904, "grad_norm": 0.30183359482377575, "learning_rate": 4.7811889451323114e-05, "loss": 0.1704, "step": 923 }, { "epoch": 0.4809994794377928, "grad_norm": 0.27257120459980766, "learning_rate": 4.7806137532917085e-05, "loss": 0.1588, "step": 924 }, { "epoch": 0.4815200416449766, "grad_norm": 0.3017092170804171, "learning_rate": 4.780037841127152e-05, "loss": 0.1632, "step": 925 }, { "epoch": 0.48204060385216035, "grad_norm": 0.31115577416098184, "learning_rate": 4.779461208820541e-05, "loss": 0.1723, "step": 926 }, { "epoch": 0.48256116605934407, "grad_norm": 0.294393697239707, "learning_rate": 4.778883856554004e-05, "loss": 0.1709, "step": 927 }, { "epoch": 0.48308172826652784, "grad_norm": 0.31179094440237864, "learning_rate": 4.778305784509894e-05, "loss": 0.1837, "step": 928 }, { "epoch": 0.4836022904737116, "grad_norm": 0.29561645952616644, "learning_rate": 4.7777269928707946e-05, "loss": 0.1679, "step": 929 }, { "epoch": 0.4841228526808954, "grad_norm": 0.31643861362801334, "learning_rate": 4.777147481819515e-05, "loss": 0.1652, "step": 930 }, { "epoch": 0.4846434148880791, "grad_norm": 0.2990057285412248, "learning_rate": 4.776567251539091e-05, "loss": 0.1652, "step": 931 }, { "epoch": 0.4851639770952629, "grad_norm": 0.31236323289513435, "learning_rate": 4.7759863022127864e-05, "loss": 0.1704, "step": 932 }, { "epoch": 0.48568453930244665, "grad_norm": 0.3453944960772348, "learning_rate": 4.775404634024093e-05, "loss": 0.1748, "step": 933 }, { "epoch": 0.4862051015096304, "grad_norm": 0.3629879836622649, "learning_rate": 4.7748222471567275e-05, "loss": 0.177, "step": 934 }, { "epoch": 0.48672566371681414, "grad_norm": 0.3120022773445736, "learning_rate": 4.7742391417946345e-05, "loss": 0.1741, "step": 935 }, { "epoch": 0.4872462259239979, "grad_norm": 0.3335724998214946, "learning_rate": 4.773655318121987e-05, "loss": 0.1748, "step": 936 }, { "epoch": 0.4877667881311817, "grad_norm": 0.34481706739223045, "learning_rate": 4.7730707763231844e-05, "loss": 0.1764, "step": 937 }, { "epoch": 0.48828735033836546, "grad_norm": 0.32529182532452, "learning_rate": 4.7724855165828497e-05, "loss": 0.1689, "step": 938 }, { "epoch": 0.4888079125455492, "grad_norm": 0.32960723240412076, "learning_rate": 4.771899539085837e-05, "loss": 0.1706, "step": 939 }, { "epoch": 0.48932847475273294, "grad_norm": 0.331766989087599, "learning_rate": 4.7713128440172244e-05, "loss": 0.1691, "step": 940 }, { "epoch": 0.4898490369599167, "grad_norm": 0.32802357802208776, "learning_rate": 4.770725431562318e-05, "loss": 0.1753, "step": 941 }, { "epoch": 0.4903695991671005, "grad_norm": 0.30860493564486086, "learning_rate": 4.77013730190665e-05, "loss": 0.1728, "step": 942 }, { "epoch": 0.4908901613742842, "grad_norm": 0.3022595357659953, "learning_rate": 4.7695484552359794e-05, "loss": 0.1722, "step": 943 }, { "epoch": 0.491410723581468, "grad_norm": 0.2934783342140039, "learning_rate": 4.7689588917362905e-05, "loss": 0.1608, "step": 944 }, { "epoch": 0.49193128578865175, "grad_norm": 0.28487899547455736, "learning_rate": 4.768368611593795e-05, "loss": 0.167, "step": 945 }, { "epoch": 0.4924518479958355, "grad_norm": 0.2990203794907809, "learning_rate": 4.7677776149949315e-05, "loss": 0.1651, "step": 946 }, { "epoch": 0.49297241020301924, "grad_norm": 0.32061252229163467, "learning_rate": 4.767185902126364e-05, "loss": 0.1626, "step": 947 }, { "epoch": 0.493492972410203, "grad_norm": 0.2807651296414107, "learning_rate": 4.7665934731749825e-05, "loss": 0.1668, "step": 948 }, { "epoch": 0.4940135346173868, "grad_norm": 0.2967222786779605, "learning_rate": 4.7660003283279045e-05, "loss": 0.1721, "step": 949 }, { "epoch": 0.49453409682457056, "grad_norm": 0.29814117868358586, "learning_rate": 4.765406467772472e-05, "loss": 0.1778, "step": 950 }, { "epoch": 0.4950546590317543, "grad_norm": 0.2788975714416808, "learning_rate": 4.7648118916962535e-05, "loss": 0.1681, "step": 951 }, { "epoch": 0.49557522123893805, "grad_norm": 0.28418700831999905, "learning_rate": 4.7642166002870455e-05, "loss": 0.1727, "step": 952 }, { "epoch": 0.4960957834461218, "grad_norm": 0.29906644459887316, "learning_rate": 4.763620593732867e-05, "loss": 0.1623, "step": 953 }, { "epoch": 0.4966163456533056, "grad_norm": 0.26491394611771796, "learning_rate": 4.763023872221965e-05, "loss": 0.1677, "step": 954 }, { "epoch": 0.4971369078604893, "grad_norm": 0.27853554570105976, "learning_rate": 4.762426435942812e-05, "loss": 0.1738, "step": 955 }, { "epoch": 0.4976574700676731, "grad_norm": 0.3177673300895612, "learning_rate": 4.761828285084107e-05, "loss": 0.1758, "step": 956 }, { "epoch": 0.49817803227485685, "grad_norm": 0.2822714144980152, "learning_rate": 4.761229419834772e-05, "loss": 0.1688, "step": 957 }, { "epoch": 0.4986985944820406, "grad_norm": 0.2931606790760609, "learning_rate": 4.7606298403839586e-05, "loss": 0.1714, "step": 958 }, { "epoch": 0.49921915668922434, "grad_norm": 0.3067169511093501, "learning_rate": 4.760029546921041e-05, "loss": 0.1707, "step": 959 }, { "epoch": 0.4997397188964081, "grad_norm": 0.28137140614635753, "learning_rate": 4.7594285396356184e-05, "loss": 0.1761, "step": 960 }, { "epoch": 0.5002602811035919, "grad_norm": 0.28844189796145525, "learning_rate": 4.75882681871752e-05, "loss": 0.1666, "step": 961 }, { "epoch": 0.5007808433107757, "grad_norm": 0.2878826071675348, "learning_rate": 4.758224384356795e-05, "loss": 0.1683, "step": 962 }, { "epoch": 0.5013014055179594, "grad_norm": 0.2998487365957084, "learning_rate": 4.75762123674372e-05, "loss": 0.1686, "step": 963 }, { "epoch": 0.5018219677251432, "grad_norm": 0.27860322662131287, "learning_rate": 4.757017376068799e-05, "loss": 0.1729, "step": 964 }, { "epoch": 0.5023425299323269, "grad_norm": 0.27665779300643556, "learning_rate": 4.7564128025227566e-05, "loss": 0.1669, "step": 965 }, { "epoch": 0.5028630921395106, "grad_norm": 0.2961362102520933, "learning_rate": 4.755807516296548e-05, "loss": 0.1731, "step": 966 }, { "epoch": 0.5033836543466944, "grad_norm": 0.28373948335406174, "learning_rate": 4.755201517581349e-05, "loss": 0.1647, "step": 967 }, { "epoch": 0.5039042165538782, "grad_norm": 0.3027972283519984, "learning_rate": 4.754594806568562e-05, "loss": 0.1744, "step": 968 }, { "epoch": 0.504424778761062, "grad_norm": 0.32929413135285035, "learning_rate": 4.753987383449816e-05, "loss": 0.1677, "step": 969 }, { "epoch": 0.5049453409682457, "grad_norm": 0.29541171416091216, "learning_rate": 4.753379248416963e-05, "loss": 0.1718, "step": 970 }, { "epoch": 0.5054659031754295, "grad_norm": 0.3445502102948572, "learning_rate": 4.75277040166208e-05, "loss": 0.1753, "step": 971 }, { "epoch": 0.5059864653826133, "grad_norm": 0.2832624623321996, "learning_rate": 4.752160843377469e-05, "loss": 0.1674, "step": 972 }, { "epoch": 0.5065070275897969, "grad_norm": 0.2837250649227678, "learning_rate": 4.751550573755658e-05, "loss": 0.1648, "step": 973 }, { "epoch": 0.5070275897969807, "grad_norm": 0.30259169690721716, "learning_rate": 4.750939592989396e-05, "loss": 0.1715, "step": 974 }, { "epoch": 0.5075481520041645, "grad_norm": 0.29554573173203236, "learning_rate": 4.750327901271662e-05, "loss": 0.1767, "step": 975 }, { "epoch": 0.5080687142113482, "grad_norm": 0.3048549802014968, "learning_rate": 4.7497154987956554e-05, "loss": 0.1736, "step": 976 }, { "epoch": 0.508589276418532, "grad_norm": 0.28859818824566424, "learning_rate": 4.749102385754802e-05, "loss": 0.1762, "step": 977 }, { "epoch": 0.5091098386257158, "grad_norm": 0.2907661277669279, "learning_rate": 4.74848856234275e-05, "loss": 0.169, "step": 978 }, { "epoch": 0.5096304008328996, "grad_norm": 0.3020314402188709, "learning_rate": 4.747874028753375e-05, "loss": 0.1705, "step": 979 }, { "epoch": 0.5101509630400833, "grad_norm": 0.28916453332411907, "learning_rate": 4.747258785180774e-05, "loss": 0.169, "step": 980 }, { "epoch": 0.510671525247267, "grad_norm": 0.27306730871455454, "learning_rate": 4.746642831819271e-05, "loss": 0.1712, "step": 981 }, { "epoch": 0.5111920874544508, "grad_norm": 0.3577975221682602, "learning_rate": 4.746026168863412e-05, "loss": 0.1714, "step": 982 }, { "epoch": 0.5117126496616345, "grad_norm": 0.28463660820354614, "learning_rate": 4.7454087965079675e-05, "loss": 0.1622, "step": 983 }, { "epoch": 0.5122332118688183, "grad_norm": 0.31003657876962476, "learning_rate": 4.744790714947932e-05, "loss": 0.1695, "step": 984 }, { "epoch": 0.5127537740760021, "grad_norm": 0.27961769726505104, "learning_rate": 4.744171924378526e-05, "loss": 0.163, "step": 985 }, { "epoch": 0.5132743362831859, "grad_norm": 0.33088943157458217, "learning_rate": 4.743552424995191e-05, "loss": 0.17, "step": 986 }, { "epoch": 0.5137948984903696, "grad_norm": 0.3056859101086278, "learning_rate": 4.7429322169935955e-05, "loss": 0.1697, "step": 987 }, { "epoch": 0.5143154606975534, "grad_norm": 0.31119710051950294, "learning_rate": 4.7423113005696275e-05, "loss": 0.1671, "step": 988 }, { "epoch": 0.5148360229047371, "grad_norm": 0.3143711776010868, "learning_rate": 4.741689675919403e-05, "loss": 0.1698, "step": 989 }, { "epoch": 0.5153565851119208, "grad_norm": 0.3061616967117495, "learning_rate": 4.7410673432392596e-05, "loss": 0.1842, "step": 990 }, { "epoch": 0.5158771473191046, "grad_norm": 0.29278315778967423, "learning_rate": 4.740444302725759e-05, "loss": 0.1678, "step": 991 }, { "epoch": 0.5163977095262884, "grad_norm": 0.2957634913771008, "learning_rate": 4.7398205545756863e-05, "loss": 0.1751, "step": 992 }, { "epoch": 0.5169182717334722, "grad_norm": 0.279576531903814, "learning_rate": 4.7391960989860504e-05, "loss": 0.1633, "step": 993 }, { "epoch": 0.5174388339406559, "grad_norm": 0.3073973561502015, "learning_rate": 4.738570936154083e-05, "loss": 0.1645, "step": 994 }, { "epoch": 0.5179593961478397, "grad_norm": 0.29041369434808123, "learning_rate": 4.7379450662772394e-05, "loss": 0.1626, "step": 995 }, { "epoch": 0.5184799583550235, "grad_norm": 0.30177127364477313, "learning_rate": 4.737318489553199e-05, "loss": 0.175, "step": 996 }, { "epoch": 0.5190005205622071, "grad_norm": 0.29152000130899824, "learning_rate": 4.736691206179864e-05, "loss": 0.1713, "step": 997 }, { "epoch": 0.5195210827693909, "grad_norm": 0.3104231023579747, "learning_rate": 4.7360632163553595e-05, "loss": 0.1703, "step": 998 }, { "epoch": 0.5200416449765747, "grad_norm": 0.28598295804809104, "learning_rate": 4.735434520278034e-05, "loss": 0.1685, "step": 999 }, { "epoch": 0.5205622071837585, "grad_norm": 0.31603631518913927, "learning_rate": 4.734805118146459e-05, "loss": 0.1786, "step": 1000 }, { "epoch": 0.5210827693909422, "grad_norm": 0.3129741436841726, "learning_rate": 4.734175010159428e-05, "loss": 0.1732, "step": 1001 }, { "epoch": 0.521603331598126, "grad_norm": 0.3176913349121328, "learning_rate": 4.73354419651596e-05, "loss": 0.1678, "step": 1002 }, { "epoch": 0.5221238938053098, "grad_norm": 0.33041896990240904, "learning_rate": 4.732912677415294e-05, "loss": 0.1645, "step": 1003 }, { "epoch": 0.5226444560124935, "grad_norm": 0.2830876561876967, "learning_rate": 4.732280453056894e-05, "loss": 0.1679, "step": 1004 }, { "epoch": 0.5231650182196772, "grad_norm": 0.34261613836366955, "learning_rate": 4.7316475236404454e-05, "loss": 0.1737, "step": 1005 }, { "epoch": 0.523685580426861, "grad_norm": 0.2760763443773014, "learning_rate": 4.731013889365857e-05, "loss": 0.1709, "step": 1006 }, { "epoch": 0.5242061426340447, "grad_norm": 0.28543357092624544, "learning_rate": 4.7303795504332604e-05, "loss": 0.1694, "step": 1007 }, { "epoch": 0.5247267048412285, "grad_norm": 0.2968860076469517, "learning_rate": 4.729744507043008e-05, "loss": 0.1686, "step": 1008 }, { "epoch": 0.5252472670484123, "grad_norm": 0.2955425625928012, "learning_rate": 4.729108759395677e-05, "loss": 0.1668, "step": 1009 }, { "epoch": 0.5257678292555961, "grad_norm": 0.3064862906090259, "learning_rate": 4.728472307692067e-05, "loss": 0.1717, "step": 1010 }, { "epoch": 0.5262883914627798, "grad_norm": 0.2944863868003504, "learning_rate": 4.727835152133197e-05, "loss": 0.1658, "step": 1011 }, { "epoch": 0.5268089536699636, "grad_norm": 0.29581781869973445, "learning_rate": 4.727197292920312e-05, "loss": 0.1664, "step": 1012 }, { "epoch": 0.5273295158771473, "grad_norm": 0.2655727030333062, "learning_rate": 4.7265587302548766e-05, "loss": 0.1588, "step": 1013 }, { "epoch": 0.527850078084331, "grad_norm": 0.32401890632087926, "learning_rate": 4.7259194643385796e-05, "loss": 0.1689, "step": 1014 }, { "epoch": 0.5283706402915148, "grad_norm": 0.31117831297374177, "learning_rate": 4.7252794953733294e-05, "loss": 0.1776, "step": 1015 }, { "epoch": 0.5288912024986986, "grad_norm": 0.3060852587762897, "learning_rate": 4.72463882356126e-05, "loss": 0.1671, "step": 1016 }, { "epoch": 0.5294117647058824, "grad_norm": 0.32556487990025834, "learning_rate": 4.7239974491047236e-05, "loss": 0.1775, "step": 1017 }, { "epoch": 0.5299323269130661, "grad_norm": 0.2799847281876232, "learning_rate": 4.723355372206297e-05, "loss": 0.1675, "step": 1018 }, { "epoch": 0.5304528891202499, "grad_norm": 0.2821768720138456, "learning_rate": 4.722712593068779e-05, "loss": 0.1664, "step": 1019 }, { "epoch": 0.5309734513274337, "grad_norm": 0.3123419119344323, "learning_rate": 4.722069111895187e-05, "loss": 0.1733, "step": 1020 }, { "epoch": 0.5314940135346173, "grad_norm": 0.30697486370866406, "learning_rate": 4.721424928888763e-05, "loss": 0.1722, "step": 1021 }, { "epoch": 0.5320145757418011, "grad_norm": 0.2895747411066602, "learning_rate": 4.7207800442529706e-05, "loss": 0.1647, "step": 1022 }, { "epoch": 0.5325351379489849, "grad_norm": 0.28336131313295926, "learning_rate": 4.720134458191494e-05, "loss": 0.1657, "step": 1023 }, { "epoch": 0.5330557001561687, "grad_norm": 0.2902463741761351, "learning_rate": 4.719488170908239e-05, "loss": 0.1579, "step": 1024 }, { "epoch": 0.5335762623633524, "grad_norm": 0.295514891496485, "learning_rate": 4.718841182607334e-05, "loss": 0.1716, "step": 1025 }, { "epoch": 0.5340968245705362, "grad_norm": 0.32915905626702163, "learning_rate": 4.718193493493127e-05, "loss": 0.1671, "step": 1026 }, { "epoch": 0.53461738677772, "grad_norm": 0.3026674578937171, "learning_rate": 4.717545103770189e-05, "loss": 0.158, "step": 1027 }, { "epoch": 0.5351379489849037, "grad_norm": 0.28381405365121987, "learning_rate": 4.716896013643313e-05, "loss": 0.1677, "step": 1028 }, { "epoch": 0.5356585111920874, "grad_norm": 0.29824689910486607, "learning_rate": 4.716246223317509e-05, "loss": 0.1703, "step": 1029 }, { "epoch": 0.5361790733992712, "grad_norm": 0.33329704323409864, "learning_rate": 4.7155957329980126e-05, "loss": 0.1698, "step": 1030 }, { "epoch": 0.536699635606455, "grad_norm": 0.3004672079802441, "learning_rate": 4.7149445428902786e-05, "loss": 0.1589, "step": 1031 }, { "epoch": 0.5372201978136387, "grad_norm": 0.31860987261597545, "learning_rate": 4.714292653199984e-05, "loss": 0.1672, "step": 1032 }, { "epoch": 0.5377407600208225, "grad_norm": 0.3442296126297135, "learning_rate": 4.713640064133025e-05, "loss": 0.1632, "step": 1033 }, { "epoch": 0.5382613222280063, "grad_norm": 0.3064387531445744, "learning_rate": 4.7129867758955196e-05, "loss": 0.1635, "step": 1034 }, { "epoch": 0.53878188443519, "grad_norm": 0.3123991398030029, "learning_rate": 4.7123327886938076e-05, "loss": 0.1664, "step": 1035 }, { "epoch": 0.5393024466423738, "grad_norm": 0.2900379032500118, "learning_rate": 4.711678102734447e-05, "loss": 0.1699, "step": 1036 }, { "epoch": 0.5398230088495575, "grad_norm": 0.3101402740827743, "learning_rate": 4.711022718224218e-05, "loss": 0.1674, "step": 1037 }, { "epoch": 0.5403435710567412, "grad_norm": 0.2978308638116788, "learning_rate": 4.710366635370124e-05, "loss": 0.1629, "step": 1038 }, { "epoch": 0.540864133263925, "grad_norm": 0.2934026273877542, "learning_rate": 4.709709854379385e-05, "loss": 0.1557, "step": 1039 }, { "epoch": 0.5413846954711088, "grad_norm": 0.29754336522736463, "learning_rate": 4.709052375459442e-05, "loss": 0.1708, "step": 1040 }, { "epoch": 0.5419052576782926, "grad_norm": 0.2941537759903196, "learning_rate": 4.7083941988179594e-05, "loss": 0.1688, "step": 1041 }, { "epoch": 0.5424258198854763, "grad_norm": 0.3197429755584853, "learning_rate": 4.707735324662818e-05, "loss": 0.1711, "step": 1042 }, { "epoch": 0.5429463820926601, "grad_norm": 0.29878401691495904, "learning_rate": 4.7070757532021224e-05, "loss": 0.1667, "step": 1043 }, { "epoch": 0.5434669442998439, "grad_norm": 0.34624169271494265, "learning_rate": 4.706415484644195e-05, "loss": 0.1719, "step": 1044 }, { "epoch": 0.5439875065070275, "grad_norm": 0.2825166282554898, "learning_rate": 4.705754519197581e-05, "loss": 0.163, "step": 1045 }, { "epoch": 0.5445080687142113, "grad_norm": 0.3328465767542314, "learning_rate": 4.705092857071042e-05, "loss": 0.1655, "step": 1046 }, { "epoch": 0.5450286309213951, "grad_norm": 0.3231665017996849, "learning_rate": 4.704430498473562e-05, "loss": 0.1666, "step": 1047 }, { "epoch": 0.5455491931285789, "grad_norm": 0.33401260004138217, "learning_rate": 4.7037674436143466e-05, "loss": 0.1704, "step": 1048 }, { "epoch": 0.5460697553357626, "grad_norm": 0.30948523577299875, "learning_rate": 4.703103692702817e-05, "loss": 0.1682, "step": 1049 }, { "epoch": 0.5465903175429464, "grad_norm": 0.33513343731742634, "learning_rate": 4.7024392459486176e-05, "loss": 0.1719, "step": 1050 }, { "epoch": 0.5471108797501302, "grad_norm": 0.29920676443214966, "learning_rate": 4.7017741035616124e-05, "loss": 0.1624, "step": 1051 }, { "epoch": 0.547631441957314, "grad_norm": 0.3063741016132584, "learning_rate": 4.701108265751884e-05, "loss": 0.1651, "step": 1052 }, { "epoch": 0.5481520041644976, "grad_norm": 0.2934234878431351, "learning_rate": 4.7004417327297325e-05, "loss": 0.1648, "step": 1053 }, { "epoch": 0.5486725663716814, "grad_norm": 0.30182853291753425, "learning_rate": 4.6997745047056836e-05, "loss": 0.1626, "step": 1054 }, { "epoch": 0.5491931285788652, "grad_norm": 0.3175333083074021, "learning_rate": 4.699106581890477e-05, "loss": 0.1719, "step": 1055 }, { "epoch": 0.5497136907860489, "grad_norm": 0.31314464288948607, "learning_rate": 4.698437964495074e-05, "loss": 0.1598, "step": 1056 }, { "epoch": 0.5502342529932327, "grad_norm": 0.32599179752881036, "learning_rate": 4.6977686527306556e-05, "loss": 0.1631, "step": 1057 }, { "epoch": 0.5507548152004165, "grad_norm": 0.3308531331011867, "learning_rate": 4.697098646808621e-05, "loss": 0.17, "step": 1058 }, { "epoch": 0.5512753774076002, "grad_norm": 0.2982979929430078, "learning_rate": 4.69642794694059e-05, "loss": 0.1708, "step": 1059 }, { "epoch": 0.551795939614784, "grad_norm": 0.32955914457005925, "learning_rate": 4.695756553338401e-05, "loss": 0.1671, "step": 1060 }, { "epoch": 0.5523165018219677, "grad_norm": 0.3070588254800077, "learning_rate": 4.6950844662141096e-05, "loss": 0.171, "step": 1061 }, { "epoch": 0.5528370640291514, "grad_norm": 0.31217947398655915, "learning_rate": 4.6944116857799936e-05, "loss": 0.1697, "step": 1062 }, { "epoch": 0.5533576262363352, "grad_norm": 0.2934020560226863, "learning_rate": 4.6937382122485484e-05, "loss": 0.1744, "step": 1063 }, { "epoch": 0.553878188443519, "grad_norm": 0.2854842827185457, "learning_rate": 4.693064045832488e-05, "loss": 0.161, "step": 1064 }, { "epoch": 0.5543987506507028, "grad_norm": 0.2815405598740039, "learning_rate": 4.692389186744745e-05, "loss": 0.1621, "step": 1065 }, { "epoch": 0.5549193128578865, "grad_norm": 0.28832423549074593, "learning_rate": 4.691713635198473e-05, "loss": 0.1621, "step": 1066 }, { "epoch": 0.5554398750650703, "grad_norm": 0.29233220575342456, "learning_rate": 4.6910373914070404e-05, "loss": 0.1664, "step": 1067 }, { "epoch": 0.5559604372722541, "grad_norm": 0.2904867981234466, "learning_rate": 4.6903604555840374e-05, "loss": 0.1709, "step": 1068 }, { "epoch": 0.5564809994794377, "grad_norm": 0.33583477749715884, "learning_rate": 4.6896828279432725e-05, "loss": 0.1592, "step": 1069 }, { "epoch": 0.5570015616866215, "grad_norm": 0.2809477801971266, "learning_rate": 4.689004508698771e-05, "loss": 0.1676, "step": 1070 }, { "epoch": 0.5575221238938053, "grad_norm": 0.32697745725327254, "learning_rate": 4.6883254980647787e-05, "loss": 0.1671, "step": 1071 }, { "epoch": 0.5580426861009891, "grad_norm": 0.28644207450920894, "learning_rate": 4.6876457962557575e-05, "loss": 0.1704, "step": 1072 }, { "epoch": 0.5585632483081728, "grad_norm": 0.3293642883398715, "learning_rate": 4.68696540348639e-05, "loss": 0.1722, "step": 1073 }, { "epoch": 0.5590838105153566, "grad_norm": 0.2844767472247252, "learning_rate": 4.686284319971576e-05, "loss": 0.166, "step": 1074 }, { "epoch": 0.5596043727225404, "grad_norm": 0.30371406025926573, "learning_rate": 4.685602545926432e-05, "loss": 0.1718, "step": 1075 }, { "epoch": 0.5601249349297241, "grad_norm": 0.303036966019252, "learning_rate": 4.684920081566295e-05, "loss": 0.1696, "step": 1076 }, { "epoch": 0.5606454971369078, "grad_norm": 0.2771701255154809, "learning_rate": 4.6842369271067185e-05, "loss": 0.1606, "step": 1077 }, { "epoch": 0.5611660593440916, "grad_norm": 0.3233636024191239, "learning_rate": 4.683553082763475e-05, "loss": 0.1699, "step": 1078 }, { "epoch": 0.5616866215512754, "grad_norm": 0.28548453667968193, "learning_rate": 4.6828685487525554e-05, "loss": 0.1621, "step": 1079 }, { "epoch": 0.5622071837584591, "grad_norm": 0.33431595584526136, "learning_rate": 4.6821833252901646e-05, "loss": 0.1579, "step": 1080 }, { "epoch": 0.5627277459656429, "grad_norm": 0.29278248350889036, "learning_rate": 4.6814974125927304e-05, "loss": 0.1622, "step": 1081 }, { "epoch": 0.5632483081728267, "grad_norm": 0.31082966905234066, "learning_rate": 4.680810810876895e-05, "loss": 0.1745, "step": 1082 }, { "epoch": 0.5637688703800104, "grad_norm": 0.304493159369903, "learning_rate": 4.6801235203595195e-05, "loss": 0.1657, "step": 1083 }, { "epoch": 0.5642894325871942, "grad_norm": 0.2949622274273174, "learning_rate": 4.679435541257682e-05, "loss": 0.1608, "step": 1084 }, { "epoch": 0.5648099947943779, "grad_norm": 0.27199415580975606, "learning_rate": 4.678746873788677e-05, "loss": 0.1626, "step": 1085 }, { "epoch": 0.5653305570015617, "grad_norm": 0.30641660626180384, "learning_rate": 4.678057518170021e-05, "loss": 0.1705, "step": 1086 }, { "epoch": 0.5658511192087454, "grad_norm": 0.3045178599347487, "learning_rate": 4.677367474619442e-05, "loss": 0.164, "step": 1087 }, { "epoch": 0.5663716814159292, "grad_norm": 0.2942177088983781, "learning_rate": 4.6766767433548885e-05, "loss": 0.1702, "step": 1088 }, { "epoch": 0.566892243623113, "grad_norm": 0.3112050068584472, "learning_rate": 4.6759853245945256e-05, "loss": 0.1727, "step": 1089 }, { "epoch": 0.5674128058302967, "grad_norm": 0.2968014199642348, "learning_rate": 4.675293218556735e-05, "loss": 0.1605, "step": 1090 }, { "epoch": 0.5679333680374805, "grad_norm": 0.30379202567617314, "learning_rate": 4.6746004254601184e-05, "loss": 0.1665, "step": 1091 }, { "epoch": 0.5684539302446643, "grad_norm": 0.4356850886928418, "learning_rate": 4.6739069455234886e-05, "loss": 0.1737, "step": 1092 }, { "epoch": 0.568974492451848, "grad_norm": 0.2873286927122915, "learning_rate": 4.673212778965881e-05, "loss": 0.1671, "step": 1093 }, { "epoch": 0.5694950546590317, "grad_norm": 0.307373163176704, "learning_rate": 4.672517926006545e-05, "loss": 0.1696, "step": 1094 }, { "epoch": 0.5700156168662155, "grad_norm": 0.284743433783671, "learning_rate": 4.671822386864948e-05, "loss": 0.1603, "step": 1095 }, { "epoch": 0.5705361790733993, "grad_norm": 0.3019989838556474, "learning_rate": 4.6711261617607725e-05, "loss": 0.1735, "step": 1096 }, { "epoch": 0.571056741280583, "grad_norm": 0.3033519694233394, "learning_rate": 4.670429250913921e-05, "loss": 0.1639, "step": 1097 }, { "epoch": 0.5715773034877668, "grad_norm": 0.31363574509134545, "learning_rate": 4.669731654544508e-05, "loss": 0.161, "step": 1098 }, { "epoch": 0.5720978656949506, "grad_norm": 0.28285834283513167, "learning_rate": 4.669033372872868e-05, "loss": 0.1665, "step": 1099 }, { "epoch": 0.5726184279021344, "grad_norm": 0.2628857284685492, "learning_rate": 4.668334406119551e-05, "loss": 0.1566, "step": 1100 }, { "epoch": 0.573138990109318, "grad_norm": 0.2655054843532405, "learning_rate": 4.667634754505323e-05, "loss": 0.1604, "step": 1101 }, { "epoch": 0.5736595523165018, "grad_norm": 0.26626184972682054, "learning_rate": 4.666934418251166e-05, "loss": 0.1554, "step": 1102 }, { "epoch": 0.5741801145236856, "grad_norm": 0.28305815343523066, "learning_rate": 4.6662333975782795e-05, "loss": 0.1698, "step": 1103 }, { "epoch": 0.5747006767308693, "grad_norm": 0.2707887337606054, "learning_rate": 4.6655316927080784e-05, "loss": 0.165, "step": 1104 }, { "epoch": 0.5752212389380531, "grad_norm": 0.2917568726342999, "learning_rate": 4.664829303862194e-05, "loss": 0.1686, "step": 1105 }, { "epoch": 0.5757418011452369, "grad_norm": 0.26069329637196703, "learning_rate": 4.664126231262472e-05, "loss": 0.1625, "step": 1106 }, { "epoch": 0.5762623633524206, "grad_norm": 0.27239030458009167, "learning_rate": 4.663422475130977e-05, "loss": 0.1537, "step": 1107 }, { "epoch": 0.5767829255596044, "grad_norm": 0.2575395498508633, "learning_rate": 4.662718035689987e-05, "loss": 0.1617, "step": 1108 }, { "epoch": 0.5773034877667881, "grad_norm": 0.27545374757966323, "learning_rate": 4.662012913161997e-05, "loss": 0.1616, "step": 1109 }, { "epoch": 0.5778240499739719, "grad_norm": 0.28448600250229067, "learning_rate": 4.661307107769718e-05, "loss": 0.1646, "step": 1110 }, { "epoch": 0.5783446121811556, "grad_norm": 0.27298548291298, "learning_rate": 4.660600619736076e-05, "loss": 0.1628, "step": 1111 }, { "epoch": 0.5788651743883394, "grad_norm": 0.3145003842578059, "learning_rate": 4.6598934492842114e-05, "loss": 0.1737, "step": 1112 }, { "epoch": 0.5793857365955232, "grad_norm": 0.27194978692066746, "learning_rate": 4.659185596637484e-05, "loss": 0.1621, "step": 1113 }, { "epoch": 0.5799062988027069, "grad_norm": 0.28956670097111964, "learning_rate": 4.658477062019465e-05, "loss": 0.1722, "step": 1114 }, { "epoch": 0.5804268610098907, "grad_norm": 0.2826807429961728, "learning_rate": 4.657767845653943e-05, "loss": 0.1688, "step": 1115 }, { "epoch": 0.5809474232170745, "grad_norm": 0.28708843741623624, "learning_rate": 4.657057947764922e-05, "loss": 0.1649, "step": 1116 }, { "epoch": 0.5814679854242581, "grad_norm": 0.29098772804179435, "learning_rate": 4.656347368576619e-05, "loss": 0.169, "step": 1117 }, { "epoch": 0.5819885476314419, "grad_norm": 0.3050742875550255, "learning_rate": 4.6556361083134705e-05, "loss": 0.1736, "step": 1118 }, { "epoch": 0.5825091098386257, "grad_norm": 0.2883430860003268, "learning_rate": 4.654924167200123e-05, "loss": 0.1636, "step": 1119 }, { "epoch": 0.5830296720458095, "grad_norm": 0.2694576550703567, "learning_rate": 4.654211545461443e-05, "loss": 0.1596, "step": 1120 }, { "epoch": 0.5835502342529932, "grad_norm": 0.29077979216048633, "learning_rate": 4.653498243322508e-05, "loss": 0.1614, "step": 1121 }, { "epoch": 0.584070796460177, "grad_norm": 0.30454032706720835, "learning_rate": 4.652784261008613e-05, "loss": 0.1657, "step": 1122 }, { "epoch": 0.5845913586673608, "grad_norm": 0.2809170140826117, "learning_rate": 4.652069598745267e-05, "loss": 0.1674, "step": 1123 }, { "epoch": 0.5851119208745446, "grad_norm": 0.2998875317771076, "learning_rate": 4.6513542567581914e-05, "loss": 0.1745, "step": 1124 }, { "epoch": 0.5856324830817282, "grad_norm": 0.2907341448806623, "learning_rate": 4.650638235273327e-05, "loss": 0.1722, "step": 1125 }, { "epoch": 0.586153045288912, "grad_norm": 0.3240717556080497, "learning_rate": 4.6499215345168255e-05, "loss": 0.1715, "step": 1126 }, { "epoch": 0.5866736074960958, "grad_norm": 0.2856383542388814, "learning_rate": 4.6492041547150555e-05, "loss": 0.1704, "step": 1127 }, { "epoch": 0.5871941697032795, "grad_norm": 0.29395504573057224, "learning_rate": 4.648486096094597e-05, "loss": 0.1644, "step": 1128 }, { "epoch": 0.5877147319104633, "grad_norm": 0.29455509913506334, "learning_rate": 4.647767358882249e-05, "loss": 0.1637, "step": 1129 }, { "epoch": 0.5882352941176471, "grad_norm": 0.27422990925352025, "learning_rate": 4.647047943305019e-05, "loss": 0.1581, "step": 1130 }, { "epoch": 0.5887558563248309, "grad_norm": 0.28624654561233115, "learning_rate": 4.646327849590134e-05, "loss": 0.1659, "step": 1131 }, { "epoch": 0.5892764185320146, "grad_norm": 0.2589094816834805, "learning_rate": 4.6456070779650326e-05, "loss": 0.1545, "step": 1132 }, { "epoch": 0.5897969807391983, "grad_norm": 0.28489485708163403, "learning_rate": 4.6448856286573684e-05, "loss": 0.1669, "step": 1133 }, { "epoch": 0.5903175429463821, "grad_norm": 0.27991494765414887, "learning_rate": 4.644163501895008e-05, "loss": 0.1692, "step": 1134 }, { "epoch": 0.5908381051535658, "grad_norm": 0.271863121129334, "learning_rate": 4.643440697906033e-05, "loss": 0.1703, "step": 1135 }, { "epoch": 0.5913586673607496, "grad_norm": 0.261056414536902, "learning_rate": 4.642717216918738e-05, "loss": 0.1621, "step": 1136 }, { "epoch": 0.5918792295679334, "grad_norm": 0.2929328942515898, "learning_rate": 4.6419930591616336e-05, "loss": 0.1629, "step": 1137 }, { "epoch": 0.5923997917751171, "grad_norm": 0.2606767168924215, "learning_rate": 4.641268224863441e-05, "loss": 0.1654, "step": 1138 }, { "epoch": 0.5929203539823009, "grad_norm": 0.2908040086603664, "learning_rate": 4.6405427142530954e-05, "loss": 0.1588, "step": 1139 }, { "epoch": 0.5934409161894847, "grad_norm": 0.2611032398127685, "learning_rate": 4.6398165275597494e-05, "loss": 0.1616, "step": 1140 }, { "epoch": 0.5939614783966684, "grad_norm": 0.27150334105117246, "learning_rate": 4.6390896650127656e-05, "loss": 0.1693, "step": 1141 }, { "epoch": 0.5944820406038521, "grad_norm": 0.27540568978983837, "learning_rate": 4.638362126841721e-05, "loss": 0.167, "step": 1142 }, { "epoch": 0.5950026028110359, "grad_norm": 0.2676573123405451, "learning_rate": 4.637633913276406e-05, "loss": 0.1643, "step": 1143 }, { "epoch": 0.5955231650182197, "grad_norm": 0.2607951523773393, "learning_rate": 4.6369050245468243e-05, "loss": 0.1672, "step": 1144 }, { "epoch": 0.5960437272254034, "grad_norm": 0.28522127225344346, "learning_rate": 4.636175460883193e-05, "loss": 0.1689, "step": 1145 }, { "epoch": 0.5965642894325872, "grad_norm": 0.28313291478789127, "learning_rate": 4.6354452225159416e-05, "loss": 0.1678, "step": 1146 }, { "epoch": 0.597084851639771, "grad_norm": 0.2781466635540155, "learning_rate": 4.634714309675714e-05, "loss": 0.1693, "step": 1147 }, { "epoch": 0.5976054138469548, "grad_norm": 0.2993701779876083, "learning_rate": 4.6339827225933665e-05, "loss": 0.178, "step": 1148 }, { "epoch": 0.5981259760541384, "grad_norm": 0.2757353899839654, "learning_rate": 4.6332504614999684e-05, "loss": 0.1686, "step": 1149 }, { "epoch": 0.5986465382613222, "grad_norm": 0.2893083043239989, "learning_rate": 4.6325175266268005e-05, "loss": 0.1613, "step": 1150 }, { "epoch": 0.599167100468506, "grad_norm": 0.28169522960512217, "learning_rate": 4.6317839182053603e-05, "loss": 0.1612, "step": 1151 }, { "epoch": 0.5996876626756897, "grad_norm": 0.32089924116837154, "learning_rate": 4.6310496364673534e-05, "loss": 0.1688, "step": 1152 }, { "epoch": 0.6002082248828735, "grad_norm": 0.259177259755011, "learning_rate": 4.630314681644701e-05, "loss": 0.159, "step": 1153 }, { "epoch": 0.6007287870900573, "grad_norm": 0.3124787935064602, "learning_rate": 4.6295790539695354e-05, "loss": 0.1642, "step": 1154 }, { "epoch": 0.601249349297241, "grad_norm": 0.297935731727392, "learning_rate": 4.628842753674203e-05, "loss": 0.1592, "step": 1155 }, { "epoch": 0.6017699115044248, "grad_norm": 0.2917605841000676, "learning_rate": 4.628105780991261e-05, "loss": 0.1738, "step": 1156 }, { "epoch": 0.6022904737116085, "grad_norm": 0.2742189116346638, "learning_rate": 4.6273681361534796e-05, "loss": 0.1676, "step": 1157 }, { "epoch": 0.6028110359187923, "grad_norm": 0.2831911499474275, "learning_rate": 4.626629819393842e-05, "loss": 0.1659, "step": 1158 }, { "epoch": 0.603331598125976, "grad_norm": 0.30018675422590946, "learning_rate": 4.6258908309455424e-05, "loss": 0.1624, "step": 1159 }, { "epoch": 0.6038521603331598, "grad_norm": 0.29631188813134685, "learning_rate": 4.625151171041988e-05, "loss": 0.1671, "step": 1160 }, { "epoch": 0.6043727225403436, "grad_norm": 0.29206096012142957, "learning_rate": 4.624410839916798e-05, "loss": 0.1755, "step": 1161 }, { "epoch": 0.6048932847475273, "grad_norm": 0.29805577625196694, "learning_rate": 4.6236698378038026e-05, "loss": 0.1674, "step": 1162 }, { "epoch": 0.6054138469547111, "grad_norm": 0.284263114326214, "learning_rate": 4.622928164937046e-05, "loss": 0.1611, "step": 1163 }, { "epoch": 0.6059344091618949, "grad_norm": 0.29763872316389595, "learning_rate": 4.622185821550782e-05, "loss": 0.1753, "step": 1164 }, { "epoch": 0.6064549713690786, "grad_norm": 0.2869777890373756, "learning_rate": 4.621442807879477e-05, "loss": 0.1554, "step": 1165 }, { "epoch": 0.6069755335762623, "grad_norm": 0.2743988330225703, "learning_rate": 4.6206991241578115e-05, "loss": 0.1662, "step": 1166 }, { "epoch": 0.6074960957834461, "grad_norm": 0.27540743834473974, "learning_rate": 4.6199547706206726e-05, "loss": 0.1632, "step": 1167 }, { "epoch": 0.6080166579906299, "grad_norm": 0.28312699949690634, "learning_rate": 4.619209747503163e-05, "loss": 0.1636, "step": 1168 }, { "epoch": 0.6085372201978136, "grad_norm": 0.2637175185952636, "learning_rate": 4.618464055040595e-05, "loss": 0.1649, "step": 1169 }, { "epoch": 0.6090577824049974, "grad_norm": 0.2907917728207116, "learning_rate": 4.617717693468494e-05, "loss": 0.1688, "step": 1170 }, { "epoch": 0.6095783446121812, "grad_norm": 0.2726681176203265, "learning_rate": 4.616970663022596e-05, "loss": 0.1695, "step": 1171 }, { "epoch": 0.610098906819365, "grad_norm": 0.3021128819540319, "learning_rate": 4.616222963938847e-05, "loss": 0.1689, "step": 1172 }, { "epoch": 0.6106194690265486, "grad_norm": 0.2611672657701669, "learning_rate": 4.615474596453405e-05, "loss": 0.1601, "step": 1173 }, { "epoch": 0.6111400312337324, "grad_norm": 0.27250524535283555, "learning_rate": 4.6147255608026394e-05, "loss": 0.1595, "step": 1174 }, { "epoch": 0.6116605934409162, "grad_norm": 0.2780403650376052, "learning_rate": 4.6139758572231315e-05, "loss": 0.1613, "step": 1175 }, { "epoch": 0.6121811556480999, "grad_norm": 0.28819496612165146, "learning_rate": 4.613225485951672e-05, "loss": 0.1701, "step": 1176 }, { "epoch": 0.6127017178552837, "grad_norm": 0.289358937511583, "learning_rate": 4.612474447225263e-05, "loss": 0.1663, "step": 1177 }, { "epoch": 0.6132222800624675, "grad_norm": 0.2720168641397797, "learning_rate": 4.611722741281118e-05, "loss": 0.1637, "step": 1178 }, { "epoch": 0.6137428422696513, "grad_norm": 0.278359207879303, "learning_rate": 4.610970368356659e-05, "loss": 0.1644, "step": 1179 }, { "epoch": 0.614263404476835, "grad_norm": 0.26805244317170895, "learning_rate": 4.610217328689522e-05, "loss": 0.1681, "step": 1180 }, { "epoch": 0.6147839666840187, "grad_norm": 0.28205230211004056, "learning_rate": 4.609463622517551e-05, "loss": 0.1669, "step": 1181 }, { "epoch": 0.6153045288912025, "grad_norm": 0.269236517123811, "learning_rate": 4.608709250078803e-05, "loss": 0.1619, "step": 1182 }, { "epoch": 0.6158250910983862, "grad_norm": 0.29722253174417096, "learning_rate": 4.607954211611543e-05, "loss": 0.1677, "step": 1183 }, { "epoch": 0.61634565330557, "grad_norm": 0.2726195453326547, "learning_rate": 4.6071985073542464e-05, "loss": 0.1656, "step": 1184 }, { "epoch": 0.6168662155127538, "grad_norm": 0.2945066289976058, "learning_rate": 4.606442137545602e-05, "loss": 0.1577, "step": 1185 }, { "epoch": 0.6173867777199376, "grad_norm": 0.3225752803253219, "learning_rate": 4.605685102424504e-05, "loss": 0.1711, "step": 1186 }, { "epoch": 0.6179073399271213, "grad_norm": 0.3155533738497284, "learning_rate": 4.6049274022300604e-05, "loss": 0.1711, "step": 1187 }, { "epoch": 0.6184279021343051, "grad_norm": 0.29404463902874495, "learning_rate": 4.604169037201589e-05, "loss": 0.1644, "step": 1188 }, { "epoch": 0.6189484643414888, "grad_norm": 0.28530333471749375, "learning_rate": 4.603410007578616e-05, "loss": 0.1611, "step": 1189 }, { "epoch": 0.6194690265486725, "grad_norm": 0.2841758773756127, "learning_rate": 4.602650313600878e-05, "loss": 0.1655, "step": 1190 }, { "epoch": 0.6199895887558563, "grad_norm": 0.29874212816673795, "learning_rate": 4.601889955508322e-05, "loss": 0.1678, "step": 1191 }, { "epoch": 0.6205101509630401, "grad_norm": 0.2790940324283885, "learning_rate": 4.601128933541105e-05, "loss": 0.1669, "step": 1192 }, { "epoch": 0.6210307131702238, "grad_norm": 0.2819144485571506, "learning_rate": 4.600367247939591e-05, "loss": 0.1594, "step": 1193 }, { "epoch": 0.6215512753774076, "grad_norm": 0.2789287724673938, "learning_rate": 4.5996048989443597e-05, "loss": 0.163, "step": 1194 }, { "epoch": 0.6220718375845914, "grad_norm": 0.2862188445057931, "learning_rate": 4.598841886796192e-05, "loss": 0.1752, "step": 1195 }, { "epoch": 0.6225923997917752, "grad_norm": 0.3007115626630004, "learning_rate": 4.598078211736086e-05, "loss": 0.1795, "step": 1196 }, { "epoch": 0.6231129619989588, "grad_norm": 0.2655000681844505, "learning_rate": 4.5973138740052455e-05, "loss": 0.1652, "step": 1197 }, { "epoch": 0.6236335242061426, "grad_norm": 0.3051733991758238, "learning_rate": 4.596548873845081e-05, "loss": 0.1638, "step": 1198 }, { "epoch": 0.6241540864133264, "grad_norm": 0.2928160917194925, "learning_rate": 4.595783211497219e-05, "loss": 0.169, "step": 1199 }, { "epoch": 0.6246746486205101, "grad_norm": 0.29112623504997737, "learning_rate": 4.5950168872034885e-05, "loss": 0.163, "step": 1200 }, { "epoch": 0.6251952108276939, "grad_norm": 0.2675061785401877, "learning_rate": 4.5942499012059316e-05, "loss": 0.1657, "step": 1201 }, { "epoch": 0.6257157730348777, "grad_norm": 0.3040715761688371, "learning_rate": 4.593482253746798e-05, "loss": 0.1623, "step": 1202 }, { "epoch": 0.6262363352420615, "grad_norm": 0.29614925372633366, "learning_rate": 4.592713945068545e-05, "loss": 0.1643, "step": 1203 }, { "epoch": 0.6267568974492452, "grad_norm": 0.2638537701916355, "learning_rate": 4.591944975413843e-05, "loss": 0.1618, "step": 1204 }, { "epoch": 0.6272774596564289, "grad_norm": 0.3005257646399674, "learning_rate": 4.5911753450255665e-05, "loss": 0.1598, "step": 1205 }, { "epoch": 0.6277980218636127, "grad_norm": 0.29494144677802137, "learning_rate": 4.590405054146802e-05, "loss": 0.1718, "step": 1206 }, { "epoch": 0.6283185840707964, "grad_norm": 0.29636762114195014, "learning_rate": 4.5896341030208415e-05, "loss": 0.1637, "step": 1207 }, { "epoch": 0.6288391462779802, "grad_norm": 0.2934192989066668, "learning_rate": 4.5888624918911884e-05, "loss": 0.158, "step": 1208 }, { "epoch": 0.629359708485164, "grad_norm": 0.3124230206823834, "learning_rate": 4.588090221001553e-05, "loss": 0.1672, "step": 1209 }, { "epoch": 0.6298802706923478, "grad_norm": 0.30615588299221824, "learning_rate": 4.587317290595855e-05, "loss": 0.1629, "step": 1210 }, { "epoch": 0.6304008328995315, "grad_norm": 0.2988825912755345, "learning_rate": 4.586543700918221e-05, "loss": 0.1723, "step": 1211 }, { "epoch": 0.6309213951067153, "grad_norm": 0.29738980460994413, "learning_rate": 4.5857694522129855e-05, "loss": 0.1573, "step": 1212 }, { "epoch": 0.631441957313899, "grad_norm": 0.2797032464087098, "learning_rate": 4.584994544724695e-05, "loss": 0.1678, "step": 1213 }, { "epoch": 0.6319625195210827, "grad_norm": 0.26491065606515474, "learning_rate": 4.584218978698099e-05, "loss": 0.1626, "step": 1214 }, { "epoch": 0.6324830817282665, "grad_norm": 0.2652322894153915, "learning_rate": 4.5834427543781596e-05, "loss": 0.1666, "step": 1215 }, { "epoch": 0.6330036439354503, "grad_norm": 0.264807354619328, "learning_rate": 4.582665872010043e-05, "loss": 0.1641, "step": 1216 }, { "epoch": 0.633524206142634, "grad_norm": 0.2738204236926736, "learning_rate": 4.581888331839125e-05, "loss": 0.1703, "step": 1217 }, { "epoch": 0.6340447683498178, "grad_norm": 0.299863744726911, "learning_rate": 4.581110134110989e-05, "loss": 0.1667, "step": 1218 }, { "epoch": 0.6345653305570016, "grad_norm": 0.2956315906472553, "learning_rate": 4.580331279071426e-05, "loss": 0.1685, "step": 1219 }, { "epoch": 0.6350858927641854, "grad_norm": 0.28753743007877214, "learning_rate": 4.579551766966435e-05, "loss": 0.1608, "step": 1220 }, { "epoch": 0.635606454971369, "grad_norm": 0.28657496675188177, "learning_rate": 4.578771598042221e-05, "loss": 0.1692, "step": 1221 }, { "epoch": 0.6361270171785528, "grad_norm": 0.27201718233205613, "learning_rate": 4.5779907725452e-05, "loss": 0.1592, "step": 1222 }, { "epoch": 0.6366475793857366, "grad_norm": 0.26136498905384203, "learning_rate": 4.577209290721991e-05, "loss": 0.1625, "step": 1223 }, { "epoch": 0.6371681415929203, "grad_norm": 0.2898833505811805, "learning_rate": 4.576427152819423e-05, "loss": 0.1699, "step": 1224 }, { "epoch": 0.6376887038001041, "grad_norm": 0.2668251937477724, "learning_rate": 4.575644359084532e-05, "loss": 0.1591, "step": 1225 }, { "epoch": 0.6382092660072879, "grad_norm": 0.3003314921218274, "learning_rate": 4.5748609097645595e-05, "loss": 0.1673, "step": 1226 }, { "epoch": 0.6387298282144717, "grad_norm": 0.2684194237776865, "learning_rate": 4.574076805106956e-05, "loss": 0.1587, "step": 1227 }, { "epoch": 0.6392503904216554, "grad_norm": 0.2845756731306716, "learning_rate": 4.5732920453593785e-05, "loss": 0.1642, "step": 1228 }, { "epoch": 0.6397709526288391, "grad_norm": 0.27739696831325483, "learning_rate": 4.572506630769691e-05, "loss": 0.1623, "step": 1229 }, { "epoch": 0.6402915148360229, "grad_norm": 0.27961191274533287, "learning_rate": 4.571720561585963e-05, "loss": 0.155, "step": 1230 }, { "epoch": 0.6408120770432066, "grad_norm": 0.2807787449921395, "learning_rate": 4.570933838056472e-05, "loss": 0.1569, "step": 1231 }, { "epoch": 0.6413326392503904, "grad_norm": 0.27535967325114985, "learning_rate": 4.570146460429701e-05, "loss": 0.1534, "step": 1232 }, { "epoch": 0.6418532014575742, "grad_norm": 0.31053243876551606, "learning_rate": 4.569358428954343e-05, "loss": 0.1687, "step": 1233 }, { "epoch": 0.642373763664758, "grad_norm": 0.2735613339054094, "learning_rate": 4.568569743879293e-05, "loss": 0.1619, "step": 1234 }, { "epoch": 0.6428943258719417, "grad_norm": 0.31025137855515883, "learning_rate": 4.5677804054536544e-05, "loss": 0.1669, "step": 1235 }, { "epoch": 0.6434148880791255, "grad_norm": 0.3108370328370114, "learning_rate": 4.566990413926738e-05, "loss": 0.1616, "step": 1236 }, { "epoch": 0.6439354502863092, "grad_norm": 0.3339548204834231, "learning_rate": 4.5661997695480595e-05, "loss": 0.1616, "step": 1237 }, { "epoch": 0.6444560124934929, "grad_norm": 0.3056231219128003, "learning_rate": 4.5654084725673404e-05, "loss": 0.1654, "step": 1238 }, { "epoch": 0.6449765747006767, "grad_norm": 0.29210355340648536, "learning_rate": 4.564616523234511e-05, "loss": 0.1686, "step": 1239 }, { "epoch": 0.6454971369078605, "grad_norm": 0.29213232167646325, "learning_rate": 4.5638239217997034e-05, "loss": 0.1621, "step": 1240 }, { "epoch": 0.6460176991150443, "grad_norm": 0.28705338754520415, "learning_rate": 4.56303066851326e-05, "loss": 0.166, "step": 1241 }, { "epoch": 0.646538261322228, "grad_norm": 0.3057168972516095, "learning_rate": 4.5622367636257264e-05, "loss": 0.161, "step": 1242 }, { "epoch": 0.6470588235294118, "grad_norm": 0.2716038210905968, "learning_rate": 4.561442207387854e-05, "loss": 0.1628, "step": 1243 }, { "epoch": 0.6475793857365956, "grad_norm": 0.3359502419591334, "learning_rate": 4.560647000050602e-05, "loss": 0.1641, "step": 1244 }, { "epoch": 0.6480999479437792, "grad_norm": 0.2677924387144485, "learning_rate": 4.5598511418651324e-05, "loss": 0.1692, "step": 1245 }, { "epoch": 0.648620510150963, "grad_norm": 0.2687659769423282, "learning_rate": 4.5590546330828154e-05, "loss": 0.1652, "step": 1246 }, { "epoch": 0.6491410723581468, "grad_norm": 0.27911537151113053, "learning_rate": 4.5582574739552254e-05, "loss": 0.1681, "step": 1247 }, { "epoch": 0.6496616345653305, "grad_norm": 0.258050298522539, "learning_rate": 4.557459664734141e-05, "loss": 0.1632, "step": 1248 }, { "epoch": 0.6501821967725143, "grad_norm": 0.2714387670557213, "learning_rate": 4.5566612056715494e-05, "loss": 0.1602, "step": 1249 }, { "epoch": 0.6507027589796981, "grad_norm": 0.3007289110925111, "learning_rate": 4.5558620970196406e-05, "loss": 0.1603, "step": 1250 }, { "epoch": 0.6512233211868819, "grad_norm": 0.2751562123537327, "learning_rate": 4.5550623390308086e-05, "loss": 0.1637, "step": 1251 }, { "epoch": 0.6517438833940656, "grad_norm": 0.26206723907101465, "learning_rate": 4.554261931957657e-05, "loss": 0.1602, "step": 1252 }, { "epoch": 0.6522644456012493, "grad_norm": 0.29168906928554644, "learning_rate": 4.5534608760529895e-05, "loss": 0.1655, "step": 1253 }, { "epoch": 0.6527850078084331, "grad_norm": 0.272148724671491, "learning_rate": 4.552659171569817e-05, "loss": 0.1567, "step": 1254 }, { "epoch": 0.6533055700156168, "grad_norm": 0.2918024931656026, "learning_rate": 4.551856818761357e-05, "loss": 0.1656, "step": 1255 }, { "epoch": 0.6538261322228006, "grad_norm": 0.2897642359961369, "learning_rate": 4.551053817881028e-05, "loss": 0.1635, "step": 1256 }, { "epoch": 0.6543466944299844, "grad_norm": 0.2675950487049928, "learning_rate": 4.550250169182455e-05, "loss": 0.1571, "step": 1257 }, { "epoch": 0.6548672566371682, "grad_norm": 0.28801786916053895, "learning_rate": 4.549445872919468e-05, "loss": 0.1616, "step": 1258 }, { "epoch": 0.6553878188443519, "grad_norm": 0.2602745738457694, "learning_rate": 4.548640929346102e-05, "loss": 0.1629, "step": 1259 }, { "epoch": 0.6559083810515357, "grad_norm": 0.2876893050906943, "learning_rate": 4.5478353387165946e-05, "loss": 0.1575, "step": 1260 }, { "epoch": 0.6564289432587194, "grad_norm": 0.2881481837684052, "learning_rate": 4.547029101285389e-05, "loss": 0.1625, "step": 1261 }, { "epoch": 0.6569495054659031, "grad_norm": 0.2612428197323997, "learning_rate": 4.5462222173071335e-05, "loss": 0.1652, "step": 1262 }, { "epoch": 0.6574700676730869, "grad_norm": 0.2701617714724092, "learning_rate": 4.5454146870366775e-05, "loss": 0.1652, "step": 1263 }, { "epoch": 0.6579906298802707, "grad_norm": 0.28509359219297975, "learning_rate": 4.5446065107290786e-05, "loss": 0.164, "step": 1264 }, { "epoch": 0.6585111920874545, "grad_norm": 0.26256618912001867, "learning_rate": 4.543797688639596e-05, "loss": 0.1554, "step": 1265 }, { "epoch": 0.6590317542946382, "grad_norm": 0.27155835804114575, "learning_rate": 4.5429882210236926e-05, "loss": 0.1613, "step": 1266 }, { "epoch": 0.659552316501822, "grad_norm": 0.2918933060230762, "learning_rate": 4.542178108137038e-05, "loss": 0.1656, "step": 1267 }, { "epoch": 0.6600728787090058, "grad_norm": 0.26850912684301664, "learning_rate": 4.5413673502355e-05, "loss": 0.1658, "step": 1268 }, { "epoch": 0.6605934409161894, "grad_norm": 0.2555610557403195, "learning_rate": 4.540555947575157e-05, "loss": 0.1555, "step": 1269 }, { "epoch": 0.6611140031233732, "grad_norm": 0.492044807765271, "learning_rate": 4.539743900412287e-05, "loss": 0.1668, "step": 1270 }, { "epoch": 0.661634565330557, "grad_norm": 0.2749406988327086, "learning_rate": 4.53893120900337e-05, "loss": 0.1571, "step": 1271 }, { "epoch": 0.6621551275377408, "grad_norm": 0.2755114222031435, "learning_rate": 4.538117873605094e-05, "loss": 0.1604, "step": 1272 }, { "epoch": 0.6626756897449245, "grad_norm": 0.27347742130970076, "learning_rate": 4.537303894474349e-05, "loss": 0.1628, "step": 1273 }, { "epoch": 0.6631962519521083, "grad_norm": 0.2859200379145871, "learning_rate": 4.536489271868225e-05, "loss": 0.1663, "step": 1274 }, { "epoch": 0.6637168141592921, "grad_norm": 0.2855335291443677, "learning_rate": 4.5356740060440194e-05, "loss": 0.1699, "step": 1275 }, { "epoch": 0.6642373763664758, "grad_norm": 0.26154237753790416, "learning_rate": 4.53485809725923e-05, "loss": 0.159, "step": 1276 }, { "epoch": 0.6647579385736595, "grad_norm": 0.2993939894006488, "learning_rate": 4.53404154577156e-05, "loss": 0.166, "step": 1277 }, { "epoch": 0.6652785007808433, "grad_norm": 0.2747912605662342, "learning_rate": 4.533224351838914e-05, "loss": 0.1515, "step": 1278 }, { "epoch": 0.665799062988027, "grad_norm": 0.2736057262584086, "learning_rate": 4.532406515719399e-05, "loss": 0.1642, "step": 1279 }, { "epoch": 0.6663196251952108, "grad_norm": 0.2616489484367182, "learning_rate": 4.531588037671326e-05, "loss": 0.161, "step": 1280 }, { "epoch": 0.6668401874023946, "grad_norm": 0.26322733292018735, "learning_rate": 4.5307689179532085e-05, "loss": 0.1574, "step": 1281 }, { "epoch": 0.6673607496095784, "grad_norm": 0.2522456597339631, "learning_rate": 4.529949156823764e-05, "loss": 0.1615, "step": 1282 }, { "epoch": 0.6678813118167621, "grad_norm": 0.2588976061704534, "learning_rate": 4.529128754541909e-05, "loss": 0.155, "step": 1283 }, { "epoch": 0.6684018740239459, "grad_norm": 0.27256034050552175, "learning_rate": 4.528307711366767e-05, "loss": 0.171, "step": 1284 }, { "epoch": 0.6689224362311296, "grad_norm": 0.2538123767756482, "learning_rate": 4.527486027557659e-05, "loss": 0.1617, "step": 1285 }, { "epoch": 0.6694429984383133, "grad_norm": 0.27020857968681916, "learning_rate": 4.526663703374113e-05, "loss": 0.1648, "step": 1286 }, { "epoch": 0.6699635606454971, "grad_norm": 0.2772645799194081, "learning_rate": 4.525840739075857e-05, "loss": 0.1679, "step": 1287 }, { "epoch": 0.6704841228526809, "grad_norm": 0.2842658573717009, "learning_rate": 4.525017134922821e-05, "loss": 0.1704, "step": 1288 }, { "epoch": 0.6710046850598647, "grad_norm": 0.2695068859454936, "learning_rate": 4.524192891175138e-05, "loss": 0.1685, "step": 1289 }, { "epoch": 0.6715252472670484, "grad_norm": 0.2563249875383298, "learning_rate": 4.5233680080931415e-05, "loss": 0.1575, "step": 1290 }, { "epoch": 0.6720458094742322, "grad_norm": 0.25282111526223566, "learning_rate": 4.522542485937369e-05, "loss": 0.1581, "step": 1291 }, { "epoch": 0.672566371681416, "grad_norm": 0.27360763527857895, "learning_rate": 4.521716324968558e-05, "loss": 0.1611, "step": 1292 }, { "epoch": 0.6730869338885996, "grad_norm": 0.2635465269224253, "learning_rate": 4.520889525447649e-05, "loss": 0.1613, "step": 1293 }, { "epoch": 0.6736074960957834, "grad_norm": 0.2826648530069749, "learning_rate": 4.520062087635784e-05, "loss": 0.1597, "step": 1294 }, { "epoch": 0.6741280583029672, "grad_norm": 0.27562512183656485, "learning_rate": 4.5192340117943063e-05, "loss": 0.1606, "step": 1295 }, { "epoch": 0.674648620510151, "grad_norm": 0.25422173470452253, "learning_rate": 4.51840529818476e-05, "loss": 0.1591, "step": 1296 }, { "epoch": 0.6751691827173347, "grad_norm": 0.28058709543239363, "learning_rate": 4.517575947068893e-05, "loss": 0.1595, "step": 1297 }, { "epoch": 0.6756897449245185, "grad_norm": 0.26381435173704143, "learning_rate": 4.516745958708652e-05, "loss": 0.1613, "step": 1298 }, { "epoch": 0.6762103071317023, "grad_norm": 0.28533524054694864, "learning_rate": 4.5159153333661854e-05, "loss": 0.1611, "step": 1299 }, { "epoch": 0.676730869338886, "grad_norm": 0.26541389475587734, "learning_rate": 4.515084071303843e-05, "loss": 0.1638, "step": 1300 }, { "epoch": 0.6772514315460697, "grad_norm": 0.2695669437026599, "learning_rate": 4.514252172784178e-05, "loss": 0.1654, "step": 1301 }, { "epoch": 0.6777719937532535, "grad_norm": 0.252500452369742, "learning_rate": 4.513419638069942e-05, "loss": 0.1612, "step": 1302 }, { "epoch": 0.6782925559604372, "grad_norm": 0.2593569283907477, "learning_rate": 4.512586467424087e-05, "loss": 0.1573, "step": 1303 }, { "epoch": 0.678813118167621, "grad_norm": 0.2783028111560752, "learning_rate": 4.511752661109768e-05, "loss": 0.1734, "step": 1304 }, { "epoch": 0.6793336803748048, "grad_norm": 0.26185446584580446, "learning_rate": 4.51091821939034e-05, "loss": 0.1634, "step": 1305 }, { "epoch": 0.6798542425819886, "grad_norm": 0.2713826383452847, "learning_rate": 4.510083142529359e-05, "loss": 0.1627, "step": 1306 }, { "epoch": 0.6803748047891723, "grad_norm": 0.31074372566311925, "learning_rate": 4.5092474307905785e-05, "loss": 0.1587, "step": 1307 }, { "epoch": 0.6808953669963561, "grad_norm": 0.2487669262126004, "learning_rate": 4.5084110844379584e-05, "loss": 0.1637, "step": 1308 }, { "epoch": 0.6814159292035398, "grad_norm": 0.2869857982196813, "learning_rate": 4.507574103735654e-05, "loss": 0.1698, "step": 1309 }, { "epoch": 0.6819364914107235, "grad_norm": 0.2790644846811937, "learning_rate": 4.506736488948024e-05, "loss": 0.1653, "step": 1310 }, { "epoch": 0.6824570536179073, "grad_norm": 0.27483161848797866, "learning_rate": 4.5058982403396244e-05, "loss": 0.1572, "step": 1311 }, { "epoch": 0.6829776158250911, "grad_norm": 0.26249570703461655, "learning_rate": 4.505059358175214e-05, "loss": 0.1545, "step": 1312 }, { "epoch": 0.6834981780322749, "grad_norm": 0.27003457349130416, "learning_rate": 4.504219842719751e-05, "loss": 0.1548, "step": 1313 }, { "epoch": 0.6840187402394586, "grad_norm": 0.27502243535142196, "learning_rate": 4.503379694238394e-05, "loss": 0.1626, "step": 1314 }, { "epoch": 0.6845393024466424, "grad_norm": 0.3145544670289913, "learning_rate": 4.502538912996499e-05, "loss": 0.1674, "step": 1315 }, { "epoch": 0.6850598646538262, "grad_norm": 0.2662239312987116, "learning_rate": 4.501697499259626e-05, "loss": 0.1597, "step": 1316 }, { "epoch": 0.6855804268610098, "grad_norm": 0.2745260554615778, "learning_rate": 4.500855453293532e-05, "loss": 0.163, "step": 1317 }, { "epoch": 0.6861009890681936, "grad_norm": 0.2657154339820778, "learning_rate": 4.500012775364173e-05, "loss": 0.1635, "step": 1318 }, { "epoch": 0.6866215512753774, "grad_norm": 0.2832483197266812, "learning_rate": 4.499169465737708e-05, "loss": 0.169, "step": 1319 }, { "epoch": 0.6871421134825612, "grad_norm": 0.2893358983294465, "learning_rate": 4.498325524680492e-05, "loss": 0.1657, "step": 1320 }, { "epoch": 0.6876626756897449, "grad_norm": 0.2716434135511077, "learning_rate": 4.4974809524590814e-05, "loss": 0.1682, "step": 1321 }, { "epoch": 0.6881832378969287, "grad_norm": 0.31689018683469905, "learning_rate": 4.496635749340231e-05, "loss": 0.1634, "step": 1322 }, { "epoch": 0.6887038001041125, "grad_norm": 0.27519106860257103, "learning_rate": 4.495789915590895e-05, "loss": 0.1571, "step": 1323 }, { "epoch": 0.6892243623112962, "grad_norm": 0.3070166511158656, "learning_rate": 4.494943451478229e-05, "loss": 0.1625, "step": 1324 }, { "epoch": 0.6897449245184799, "grad_norm": 0.26270345276600776, "learning_rate": 4.4940963572695836e-05, "loss": 0.1594, "step": 1325 }, { "epoch": 0.6902654867256637, "grad_norm": 0.2885003932017069, "learning_rate": 4.4932486332325115e-05, "loss": 0.1595, "step": 1326 }, { "epoch": 0.6907860489328475, "grad_norm": 0.2571626418463362, "learning_rate": 4.492400279634763e-05, "loss": 0.1616, "step": 1327 }, { "epoch": 0.6913066111400312, "grad_norm": 0.27955455405062174, "learning_rate": 4.491551296744288e-05, "loss": 0.172, "step": 1328 }, { "epoch": 0.691827173347215, "grad_norm": 0.2691060387491177, "learning_rate": 4.490701684829235e-05, "loss": 0.1613, "step": 1329 }, { "epoch": 0.6923477355543988, "grad_norm": 0.2739293243057916, "learning_rate": 4.48985144415795e-05, "loss": 0.1665, "step": 1330 }, { "epoch": 0.6928682977615825, "grad_norm": 0.29877047824582253, "learning_rate": 4.489000574998979e-05, "loss": 0.1644, "step": 1331 }, { "epoch": 0.6933888599687663, "grad_norm": 0.2707072638145847, "learning_rate": 4.488149077621067e-05, "loss": 0.1614, "step": 1332 }, { "epoch": 0.69390942217595, "grad_norm": 0.2782882849164749, "learning_rate": 4.4872969522931556e-05, "loss": 0.1646, "step": 1333 }, { "epoch": 0.6944299843831337, "grad_norm": 0.27327557400755875, "learning_rate": 4.486444199284386e-05, "loss": 0.1659, "step": 1334 }, { "epoch": 0.6949505465903175, "grad_norm": 0.2806747248809842, "learning_rate": 4.4855908188640973e-05, "loss": 0.1622, "step": 1335 }, { "epoch": 0.6954711087975013, "grad_norm": 0.2690587883243696, "learning_rate": 4.484736811301826e-05, "loss": 0.1573, "step": 1336 }, { "epoch": 0.6959916710046851, "grad_norm": 0.27243238739702624, "learning_rate": 4.483882176867308e-05, "loss": 0.1686, "step": 1337 }, { "epoch": 0.6965122332118688, "grad_norm": 0.28204045583043863, "learning_rate": 4.483026915830477e-05, "loss": 0.163, "step": 1338 }, { "epoch": 0.6970327954190526, "grad_norm": 0.2772739553002315, "learning_rate": 4.4821710284614636e-05, "loss": 0.1624, "step": 1339 }, { "epoch": 0.6975533576262364, "grad_norm": 0.29231455388507804, "learning_rate": 4.4813145150305965e-05, "loss": 0.1616, "step": 1340 }, { "epoch": 0.69807391983342, "grad_norm": 0.2640575107760888, "learning_rate": 4.4804573758084046e-05, "loss": 0.1585, "step": 1341 }, { "epoch": 0.6985944820406038, "grad_norm": 0.3071872152649576, "learning_rate": 4.4795996110656105e-05, "loss": 0.1673, "step": 1342 }, { "epoch": 0.6991150442477876, "grad_norm": 0.2749143884079691, "learning_rate": 4.478741221073136e-05, "loss": 0.1609, "step": 1343 }, { "epoch": 0.6996356064549714, "grad_norm": 0.2754640339891129, "learning_rate": 4.477882206102101e-05, "loss": 0.1606, "step": 1344 }, { "epoch": 0.7001561686621551, "grad_norm": 0.2639768649579088, "learning_rate": 4.477022566423823e-05, "loss": 0.162, "step": 1345 }, { "epoch": 0.7006767308693389, "grad_norm": 0.30604022298071126, "learning_rate": 4.476162302309815e-05, "loss": 0.1628, "step": 1346 }, { "epoch": 0.7011972930765227, "grad_norm": 0.26151484803979685, "learning_rate": 4.475301414031791e-05, "loss": 0.1561, "step": 1347 }, { "epoch": 0.7017178552837064, "grad_norm": 0.3069542520801269, "learning_rate": 4.4744399018616566e-05, "loss": 0.1717, "step": 1348 }, { "epoch": 0.7022384174908901, "grad_norm": 0.26118236966791997, "learning_rate": 4.4735777660715186e-05, "loss": 0.1594, "step": 1349 }, { "epoch": 0.7027589796980739, "grad_norm": 0.2821690722820644, "learning_rate": 4.472715006933681e-05, "loss": 0.17, "step": 1350 }, { "epoch": 0.7032795419052577, "grad_norm": 0.28425332393518327, "learning_rate": 4.47185162472064e-05, "loss": 0.1656, "step": 1351 }, { "epoch": 0.7038001041124414, "grad_norm": 0.2895241985407839, "learning_rate": 4.470987619705095e-05, "loss": 0.172, "step": 1352 }, { "epoch": 0.7043206663196252, "grad_norm": 0.2907907968206427, "learning_rate": 4.470122992159938e-05, "loss": 0.1577, "step": 1353 }, { "epoch": 0.704841228526809, "grad_norm": 0.26360172964644457, "learning_rate": 4.469257742358258e-05, "loss": 0.1641, "step": 1354 }, { "epoch": 0.7053617907339927, "grad_norm": 0.30825700427174524, "learning_rate": 4.468391870573342e-05, "loss": 0.1613, "step": 1355 }, { "epoch": 0.7058823529411765, "grad_norm": 0.2832274278276974, "learning_rate": 4.467525377078672e-05, "loss": 0.1616, "step": 1356 }, { "epoch": 0.7064029151483602, "grad_norm": 0.2918275203692509, "learning_rate": 4.466658262147927e-05, "loss": 0.1576, "step": 1357 }, { "epoch": 0.706923477355544, "grad_norm": 0.2623509716876954, "learning_rate": 4.465790526054983e-05, "loss": 0.1548, "step": 1358 }, { "epoch": 0.7074440395627277, "grad_norm": 0.2830506524726103, "learning_rate": 4.4649221690739095e-05, "loss": 0.1577, "step": 1359 }, { "epoch": 0.7079646017699115, "grad_norm": 0.2668706458778001, "learning_rate": 4.464053191478976e-05, "loss": 0.1693, "step": 1360 }, { "epoch": 0.7084851639770953, "grad_norm": 0.2529153576757616, "learning_rate": 4.463183593544647e-05, "loss": 0.1637, "step": 1361 }, { "epoch": 0.709005726184279, "grad_norm": 0.2624787559358483, "learning_rate": 4.462313375545579e-05, "loss": 0.1612, "step": 1362 }, { "epoch": 0.7095262883914628, "grad_norm": 0.2751465060223429, "learning_rate": 4.461442537756629e-05, "loss": 0.1549, "step": 1363 }, { "epoch": 0.7100468505986466, "grad_norm": 0.24739529719217984, "learning_rate": 4.4605710804528474e-05, "loss": 0.162, "step": 1364 }, { "epoch": 0.7105674128058302, "grad_norm": 0.26495855736115675, "learning_rate": 4.459699003909482e-05, "loss": 0.1659, "step": 1365 }, { "epoch": 0.711087975013014, "grad_norm": 0.27568214342295516, "learning_rate": 4.4588263084019746e-05, "loss": 0.1635, "step": 1366 }, { "epoch": 0.7116085372201978, "grad_norm": 0.25595986364541223, "learning_rate": 4.457952994205963e-05, "loss": 0.1569, "step": 1367 }, { "epoch": 0.7121290994273816, "grad_norm": 0.2731708239567679, "learning_rate": 4.457079061597281e-05, "loss": 0.1603, "step": 1368 }, { "epoch": 0.7126496616345653, "grad_norm": 0.2587032809571712, "learning_rate": 4.4562045108519565e-05, "loss": 0.1609, "step": 1369 }, { "epoch": 0.7131702238417491, "grad_norm": 0.26523949719681184, "learning_rate": 4.4553293422462134e-05, "loss": 0.1578, "step": 1370 }, { "epoch": 0.7136907860489329, "grad_norm": 0.2594662248421297, "learning_rate": 4.454453556056471e-05, "loss": 0.1585, "step": 1371 }, { "epoch": 0.7142113482561167, "grad_norm": 0.2557915985298669, "learning_rate": 4.4535771525593426e-05, "loss": 0.1536, "step": 1372 }, { "epoch": 0.7147319104633003, "grad_norm": 0.2742705642572711, "learning_rate": 4.452700132031638e-05, "loss": 0.1673, "step": 1373 }, { "epoch": 0.7152524726704841, "grad_norm": 0.2806193185281088, "learning_rate": 4.451822494750362e-05, "loss": 0.1648, "step": 1374 }, { "epoch": 0.7157730348776679, "grad_norm": 0.2809327056175134, "learning_rate": 4.450944240992711e-05, "loss": 0.1656, "step": 1375 }, { "epoch": 0.7162935970848516, "grad_norm": 0.2759622382555981, "learning_rate": 4.45006537103608e-05, "loss": 0.1612, "step": 1376 }, { "epoch": 0.7168141592920354, "grad_norm": 0.2624638331201247, "learning_rate": 4.449185885158056e-05, "loss": 0.1668, "step": 1377 }, { "epoch": 0.7173347214992192, "grad_norm": 0.2689887733722017, "learning_rate": 4.4483057836364225e-05, "loss": 0.1608, "step": 1378 }, { "epoch": 0.717855283706403, "grad_norm": 0.26933355871425685, "learning_rate": 4.4474250667491567e-05, "loss": 0.1629, "step": 1379 }, { "epoch": 0.7183758459135867, "grad_norm": 0.2735573234930184, "learning_rate": 4.4465437347744285e-05, "loss": 0.1647, "step": 1380 }, { "epoch": 0.7188964081207704, "grad_norm": 0.2680034708829736, "learning_rate": 4.4456617879906056e-05, "loss": 0.1678, "step": 1381 }, { "epoch": 0.7194169703279542, "grad_norm": 0.2519063095093893, "learning_rate": 4.444779226676246e-05, "loss": 0.1592, "step": 1382 }, { "epoch": 0.7199375325351379, "grad_norm": 0.2602912105474266, "learning_rate": 4.4438960511101046e-05, "loss": 0.1575, "step": 1383 }, { "epoch": 0.7204580947423217, "grad_norm": 0.2726477230959709, "learning_rate": 4.443012261571129e-05, "loss": 0.1604, "step": 1384 }, { "epoch": 0.7209786569495055, "grad_norm": 0.27905940371356286, "learning_rate": 4.442127858338462e-05, "loss": 0.1654, "step": 1385 }, { "epoch": 0.7214992191566892, "grad_norm": 0.2583560743210376, "learning_rate": 4.441242841691438e-05, "loss": 0.1528, "step": 1386 }, { "epoch": 0.722019781363873, "grad_norm": 0.27099777286648374, "learning_rate": 4.440357211909586e-05, "loss": 0.1667, "step": 1387 }, { "epoch": 0.7225403435710568, "grad_norm": 0.2894163802959934, "learning_rate": 4.439470969272631e-05, "loss": 0.1645, "step": 1388 }, { "epoch": 0.7230609057782404, "grad_norm": 0.2544924261078653, "learning_rate": 4.4385841140604884e-05, "loss": 0.1585, "step": 1389 }, { "epoch": 0.7235814679854242, "grad_norm": 0.27636086548700217, "learning_rate": 4.437696646553269e-05, "loss": 0.1523, "step": 1390 }, { "epoch": 0.724102030192608, "grad_norm": 0.2713258603157374, "learning_rate": 4.4368085670312755e-05, "loss": 0.1645, "step": 1391 }, { "epoch": 0.7246225923997918, "grad_norm": 0.2764758232297857, "learning_rate": 4.435919875775005e-05, "loss": 0.1526, "step": 1392 }, { "epoch": 0.7251431546069755, "grad_norm": 0.26060022386382026, "learning_rate": 4.435030573065149e-05, "loss": 0.1524, "step": 1393 }, { "epoch": 0.7256637168141593, "grad_norm": 0.29792250327932235, "learning_rate": 4.434140659182588e-05, "loss": 0.1666, "step": 1394 }, { "epoch": 0.7261842790213431, "grad_norm": 0.26037287596745995, "learning_rate": 4.433250134408401e-05, "loss": 0.1586, "step": 1395 }, { "epoch": 0.7267048412285269, "grad_norm": 0.26727896558958614, "learning_rate": 4.4323589990238545e-05, "loss": 0.164, "step": 1396 }, { "epoch": 0.7272254034357105, "grad_norm": 0.2754571354466698, "learning_rate": 4.431467253310413e-05, "loss": 0.1659, "step": 1397 }, { "epoch": 0.7277459656428943, "grad_norm": 0.26954157638947546, "learning_rate": 4.4305748975497294e-05, "loss": 0.1591, "step": 1398 }, { "epoch": 0.7282665278500781, "grad_norm": 0.262164138500046, "learning_rate": 4.4296819320236524e-05, "loss": 0.1605, "step": 1399 }, { "epoch": 0.7287870900572618, "grad_norm": 0.27456638161498215, "learning_rate": 4.428788357014222e-05, "loss": 0.1572, "step": 1400 }, { "epoch": 0.7293076522644456, "grad_norm": 0.2720040860336858, "learning_rate": 4.4278941728036696e-05, "loss": 0.1667, "step": 1401 }, { "epoch": 0.7298282144716294, "grad_norm": 0.2636935543695637, "learning_rate": 4.426999379674421e-05, "loss": 0.1678, "step": 1402 }, { "epoch": 0.7303487766788131, "grad_norm": 0.27172824164676596, "learning_rate": 4.426103977909094e-05, "loss": 0.1654, "step": 1403 }, { "epoch": 0.7308693388859969, "grad_norm": 0.2557001119750677, "learning_rate": 4.425207967790497e-05, "loss": 0.1598, "step": 1404 }, { "epoch": 0.7313899010931806, "grad_norm": 0.26111748434009874, "learning_rate": 4.4243113496016326e-05, "loss": 0.1587, "step": 1405 }, { "epoch": 0.7319104633003644, "grad_norm": 0.2833129354173884, "learning_rate": 4.423414123625694e-05, "loss": 0.162, "step": 1406 }, { "epoch": 0.7324310255075481, "grad_norm": 0.2575834825018457, "learning_rate": 4.4225162901460676e-05, "loss": 0.1597, "step": 1407 }, { "epoch": 0.7329515877147319, "grad_norm": 0.2753343459846711, "learning_rate": 4.42161784944633e-05, "loss": 0.17, "step": 1408 }, { "epoch": 0.7334721499219157, "grad_norm": 0.2971924494704701, "learning_rate": 4.420718801810252e-05, "loss": 0.1563, "step": 1409 }, { "epoch": 0.7339927121290994, "grad_norm": 0.2579981593383931, "learning_rate": 4.419819147521793e-05, "loss": 0.1586, "step": 1410 }, { "epoch": 0.7345132743362832, "grad_norm": 0.2763781873570336, "learning_rate": 4.418918886865108e-05, "loss": 0.1679, "step": 1411 }, { "epoch": 0.735033836543467, "grad_norm": 0.27888896994369716, "learning_rate": 4.418018020124538e-05, "loss": 0.158, "step": 1412 }, { "epoch": 0.7355543987506507, "grad_norm": 0.27992425465308046, "learning_rate": 4.417116547584621e-05, "loss": 0.1582, "step": 1413 }, { "epoch": 0.7360749609578344, "grad_norm": 0.28643991156800214, "learning_rate": 4.4162144695300834e-05, "loss": 0.1674, "step": 1414 }, { "epoch": 0.7365955231650182, "grad_norm": 0.2642558460300512, "learning_rate": 4.415311786245843e-05, "loss": 0.1645, "step": 1415 }, { "epoch": 0.737116085372202, "grad_norm": 0.27570896762545, "learning_rate": 4.41440849801701e-05, "loss": 0.1634, "step": 1416 }, { "epoch": 0.7376366475793857, "grad_norm": 0.26823860392672444, "learning_rate": 4.413504605128885e-05, "loss": 0.1615, "step": 1417 }, { "epoch": 0.7381572097865695, "grad_norm": 0.27413381020561156, "learning_rate": 4.4126001078669574e-05, "loss": 0.1638, "step": 1418 }, { "epoch": 0.7386777719937533, "grad_norm": 0.2682256952523028, "learning_rate": 4.4116950065169124e-05, "loss": 0.1604, "step": 1419 }, { "epoch": 0.7391983342009371, "grad_norm": 0.2934261761780093, "learning_rate": 4.410789301364621e-05, "loss": 0.1665, "step": 1420 }, { "epoch": 0.7397188964081207, "grad_norm": 0.26702280332753814, "learning_rate": 4.409882992696148e-05, "loss": 0.1584, "step": 1421 }, { "epoch": 0.7402394586153045, "grad_norm": 0.27945453060552594, "learning_rate": 4.4089760807977474e-05, "loss": 0.153, "step": 1422 }, { "epoch": 0.7407600208224883, "grad_norm": 0.26111156671410596, "learning_rate": 4.408068565955865e-05, "loss": 0.1633, "step": 1423 }, { "epoch": 0.741280583029672, "grad_norm": 0.2618834889479115, "learning_rate": 4.407160448457135e-05, "loss": 0.1633, "step": 1424 }, { "epoch": 0.7418011452368558, "grad_norm": 0.2548316686323859, "learning_rate": 4.406251728588384e-05, "loss": 0.1611, "step": 1425 }, { "epoch": 0.7423217074440396, "grad_norm": 0.27198720127436277, "learning_rate": 4.405342406636627e-05, "loss": 0.1518, "step": 1426 }, { "epoch": 0.7428422696512234, "grad_norm": 0.25780399068453724, "learning_rate": 4.4044324828890715e-05, "loss": 0.1539, "step": 1427 }, { "epoch": 0.7433628318584071, "grad_norm": 0.28708604768264273, "learning_rate": 4.403521957633113e-05, "loss": 0.1677, "step": 1428 }, { "epoch": 0.7438833940655908, "grad_norm": 0.2854990852578142, "learning_rate": 4.4026108311563394e-05, "loss": 0.1591, "step": 1429 }, { "epoch": 0.7444039562727746, "grad_norm": 0.2601422292209927, "learning_rate": 4.401699103746524e-05, "loss": 0.1628, "step": 1430 }, { "epoch": 0.7449245184799583, "grad_norm": 0.26792776342524677, "learning_rate": 4.4007867756916345e-05, "loss": 0.1576, "step": 1431 }, { "epoch": 0.7454450806871421, "grad_norm": 0.2553899632837398, "learning_rate": 4.399873847279827e-05, "loss": 0.1689, "step": 1432 }, { "epoch": 0.7459656428943259, "grad_norm": 0.2631912627429382, "learning_rate": 4.3989603187994454e-05, "loss": 0.1544, "step": 1433 }, { "epoch": 0.7464862051015096, "grad_norm": 0.2719680455212801, "learning_rate": 4.398046190539025e-05, "loss": 0.1648, "step": 1434 }, { "epoch": 0.7470067673086934, "grad_norm": 0.2623299175088546, "learning_rate": 4.39713146278729e-05, "loss": 0.1685, "step": 1435 }, { "epoch": 0.7475273295158772, "grad_norm": 0.27789094278392973, "learning_rate": 4.3962161358331546e-05, "loss": 0.1588, "step": 1436 }, { "epoch": 0.7480478917230609, "grad_norm": 0.2625407847886398, "learning_rate": 4.395300209965721e-05, "loss": 0.1616, "step": 1437 }, { "epoch": 0.7485684539302446, "grad_norm": 0.2506574404285458, "learning_rate": 4.394383685474281e-05, "loss": 0.1522, "step": 1438 }, { "epoch": 0.7490890161374284, "grad_norm": 0.26122428518589536, "learning_rate": 4.3934665626483175e-05, "loss": 0.1654, "step": 1439 }, { "epoch": 0.7496095783446122, "grad_norm": 0.25983335544974423, "learning_rate": 4.392548841777497e-05, "loss": 0.1592, "step": 1440 }, { "epoch": 0.7501301405517959, "grad_norm": 0.26244778657616386, "learning_rate": 4.391630523151683e-05, "loss": 0.163, "step": 1441 }, { "epoch": 0.7506507027589797, "grad_norm": 0.24661772307545587, "learning_rate": 4.390711607060919e-05, "loss": 0.156, "step": 1442 }, { "epoch": 0.7511712649661635, "grad_norm": 0.2681126416443138, "learning_rate": 4.389792093795444e-05, "loss": 0.1658, "step": 1443 }, { "epoch": 0.7516918271733473, "grad_norm": 0.28319213581654684, "learning_rate": 4.3888719836456823e-05, "loss": 0.1603, "step": 1444 }, { "epoch": 0.7522123893805309, "grad_norm": 0.2717403925825211, "learning_rate": 4.3879512769022485e-05, "loss": 0.1618, "step": 1445 }, { "epoch": 0.7527329515877147, "grad_norm": 0.2672807713995564, "learning_rate": 4.387029973855943e-05, "loss": 0.164, "step": 1446 }, { "epoch": 0.7532535137948985, "grad_norm": 0.29074179763902774, "learning_rate": 4.3861080747977565e-05, "loss": 0.159, "step": 1447 }, { "epoch": 0.7537740760020822, "grad_norm": 0.2644544725678007, "learning_rate": 4.385185580018869e-05, "loss": 0.1608, "step": 1448 }, { "epoch": 0.754294638209266, "grad_norm": 0.2786228789121159, "learning_rate": 4.3842624898106464e-05, "loss": 0.1663, "step": 1449 }, { "epoch": 0.7548152004164498, "grad_norm": 0.2568411909485992, "learning_rate": 4.383338804464643e-05, "loss": 0.1632, "step": 1450 }, { "epoch": 0.7553357626236336, "grad_norm": 0.2704063306890479, "learning_rate": 4.382414524272602e-05, "loss": 0.1559, "step": 1451 }, { "epoch": 0.7558563248308173, "grad_norm": 0.3052629008083044, "learning_rate": 4.3814896495264544e-05, "loss": 0.1667, "step": 1452 }, { "epoch": 0.756376887038001, "grad_norm": 0.25388945533730933, "learning_rate": 4.380564180518318e-05, "loss": 0.1596, "step": 1453 }, { "epoch": 0.7568974492451848, "grad_norm": 0.2876303341079924, "learning_rate": 4.3796381175405014e-05, "loss": 0.1571, "step": 1454 }, { "epoch": 0.7574180114523685, "grad_norm": 0.26037804849757196, "learning_rate": 4.378711460885494e-05, "loss": 0.157, "step": 1455 }, { "epoch": 0.7579385736595523, "grad_norm": 0.281072192425641, "learning_rate": 4.377784210845981e-05, "loss": 0.1605, "step": 1456 }, { "epoch": 0.7584591358667361, "grad_norm": 0.28910101581498615, "learning_rate": 4.376856367714829e-05, "loss": 0.1627, "step": 1457 }, { "epoch": 0.7589796980739199, "grad_norm": 0.27247204660508845, "learning_rate": 4.375927931785095e-05, "loss": 0.16, "step": 1458 }, { "epoch": 0.7595002602811036, "grad_norm": 0.24843057073483057, "learning_rate": 4.3749989033500224e-05, "loss": 0.1552, "step": 1459 }, { "epoch": 0.7600208224882874, "grad_norm": 0.27297992132744714, "learning_rate": 4.3740692827030404e-05, "loss": 0.1545, "step": 1460 }, { "epoch": 0.7605413846954711, "grad_norm": 0.25731476919380347, "learning_rate": 4.3731390701377675e-05, "loss": 0.1547, "step": 1461 }, { "epoch": 0.7610619469026548, "grad_norm": 0.27094205358391144, "learning_rate": 4.3722082659480076e-05, "loss": 0.1509, "step": 1462 }, { "epoch": 0.7615825091098386, "grad_norm": 0.27051817962373237, "learning_rate": 4.371276870427753e-05, "loss": 0.1514, "step": 1463 }, { "epoch": 0.7621030713170224, "grad_norm": 0.2776605729402276, "learning_rate": 4.37034488387118e-05, "loss": 0.1648, "step": 1464 }, { "epoch": 0.7626236335242061, "grad_norm": 0.2632398573808106, "learning_rate": 4.3694123065726553e-05, "loss": 0.1625, "step": 1465 }, { "epoch": 0.7631441957313899, "grad_norm": 0.2631443834365562, "learning_rate": 4.3684791388267287e-05, "loss": 0.1588, "step": 1466 }, { "epoch": 0.7636647579385737, "grad_norm": 0.26943148019545626, "learning_rate": 4.367545380928139e-05, "loss": 0.1552, "step": 1467 }, { "epoch": 0.7641853201457575, "grad_norm": 0.2649233188506122, "learning_rate": 4.36661103317181e-05, "loss": 0.1615, "step": 1468 }, { "epoch": 0.7647058823529411, "grad_norm": 0.26835305552005984, "learning_rate": 4.3656760958528506e-05, "loss": 0.1637, "step": 1469 }, { "epoch": 0.7652264445601249, "grad_norm": 0.2578891624619545, "learning_rate": 4.364740569266561e-05, "loss": 0.152, "step": 1470 }, { "epoch": 0.7657470067673087, "grad_norm": 0.26524236019881625, "learning_rate": 4.363804453708421e-05, "loss": 0.1676, "step": 1471 }, { "epoch": 0.7662675689744924, "grad_norm": 0.2583254156227514, "learning_rate": 4.362867749474101e-05, "loss": 0.1543, "step": 1472 }, { "epoch": 0.7667881311816762, "grad_norm": 0.2577114988061728, "learning_rate": 4.361930456859455e-05, "loss": 0.1546, "step": 1473 }, { "epoch": 0.76730869338886, "grad_norm": 0.2655743069628856, "learning_rate": 4.360992576160524e-05, "loss": 0.1563, "step": 1474 }, { "epoch": 0.7678292555960438, "grad_norm": 0.25925048966807795, "learning_rate": 4.3600541076735346e-05, "loss": 0.1562, "step": 1475 }, { "epoch": 0.7683498178032275, "grad_norm": 0.26321769667524203, "learning_rate": 4.359115051694898e-05, "loss": 0.1587, "step": 1476 }, { "epoch": 0.7688703800104112, "grad_norm": 0.2792326144746915, "learning_rate": 4.358175408521212e-05, "loss": 0.1555, "step": 1477 }, { "epoch": 0.769390942217595, "grad_norm": 0.26756537678776365, "learning_rate": 4.357235178449261e-05, "loss": 0.1666, "step": 1478 }, { "epoch": 0.7699115044247787, "grad_norm": 0.27412529165548094, "learning_rate": 4.356294361776012e-05, "loss": 0.164, "step": 1479 }, { "epoch": 0.7704320666319625, "grad_norm": 0.2632617066780282, "learning_rate": 4.3553529587986184e-05, "loss": 0.1554, "step": 1480 }, { "epoch": 0.7709526288391463, "grad_norm": 0.2665209273532276, "learning_rate": 4.3544109698144206e-05, "loss": 0.157, "step": 1481 }, { "epoch": 0.77147319104633, "grad_norm": 0.26609079034706307, "learning_rate": 4.3534683951209416e-05, "loss": 0.1561, "step": 1482 }, { "epoch": 0.7719937532535138, "grad_norm": 0.2782722665482425, "learning_rate": 4.3525252350158904e-05, "loss": 0.1633, "step": 1483 }, { "epoch": 0.7725143154606976, "grad_norm": 0.24586603055391748, "learning_rate": 4.351581489797161e-05, "loss": 0.1625, "step": 1484 }, { "epoch": 0.7730348776678813, "grad_norm": 0.27660175665693065, "learning_rate": 4.350637159762831e-05, "loss": 0.166, "step": 1485 }, { "epoch": 0.773555439875065, "grad_norm": 0.2718655540992913, "learning_rate": 4.3496922452111656e-05, "loss": 0.162, "step": 1486 }, { "epoch": 0.7740760020822488, "grad_norm": 0.2583348630376606, "learning_rate": 4.348746746440612e-05, "loss": 0.1632, "step": 1487 }, { "epoch": 0.7745965642894326, "grad_norm": 0.2692273301860101, "learning_rate": 4.347800663749801e-05, "loss": 0.1513, "step": 1488 }, { "epoch": 0.7751171264966163, "grad_norm": 0.26291032421691457, "learning_rate": 4.3468539974375534e-05, "loss": 0.1545, "step": 1489 }, { "epoch": 0.7756376887038001, "grad_norm": 0.27809527086476604, "learning_rate": 4.345906747802867e-05, "loss": 0.1593, "step": 1490 }, { "epoch": 0.7761582509109839, "grad_norm": 0.25920104592248266, "learning_rate": 4.344958915144929e-05, "loss": 0.1609, "step": 1491 }, { "epoch": 0.7766788131181677, "grad_norm": 0.29330224785096815, "learning_rate": 4.3440104997631084e-05, "loss": 0.1686, "step": 1492 }, { "epoch": 0.7771993753253513, "grad_norm": 0.2727948495509996, "learning_rate": 4.343061501956959e-05, "loss": 0.1592, "step": 1493 }, { "epoch": 0.7777199375325351, "grad_norm": 0.25156183079449285, "learning_rate": 4.3421119220262185e-05, "loss": 0.161, "step": 1494 }, { "epoch": 0.7782404997397189, "grad_norm": 0.26655483033933713, "learning_rate": 4.3411617602708085e-05, "loss": 0.1556, "step": 1495 }, { "epoch": 0.7787610619469026, "grad_norm": 0.25869405345561675, "learning_rate": 4.340211016990834e-05, "loss": 0.1518, "step": 1496 }, { "epoch": 0.7792816241540864, "grad_norm": 0.25895490167143587, "learning_rate": 4.3392596924865854e-05, "loss": 0.1583, "step": 1497 }, { "epoch": 0.7798021863612702, "grad_norm": 0.26930038307801346, "learning_rate": 4.3383077870585334e-05, "loss": 0.1578, "step": 1498 }, { "epoch": 0.780322748568454, "grad_norm": 0.2655149958667704, "learning_rate": 4.3373553010073355e-05, "loss": 0.1666, "step": 1499 }, { "epoch": 0.7808433107756377, "grad_norm": 0.2700875671699295, "learning_rate": 4.3364022346338295e-05, "loss": 0.1499, "step": 1500 }, { "epoch": 0.7813638729828214, "grad_norm": 0.26841277303835387, "learning_rate": 4.335448588239039e-05, "loss": 0.16, "step": 1501 }, { "epoch": 0.7818844351900052, "grad_norm": 0.2664241191896805, "learning_rate": 4.33449436212417e-05, "loss": 0.1566, "step": 1502 }, { "epoch": 0.7824049973971889, "grad_norm": 0.2748558495343656, "learning_rate": 4.333539556590612e-05, "loss": 0.1561, "step": 1503 }, { "epoch": 0.7829255596043727, "grad_norm": 0.260891571763646, "learning_rate": 4.332584171939936e-05, "loss": 0.1639, "step": 1504 }, { "epoch": 0.7834461218115565, "grad_norm": 0.26117170009910184, "learning_rate": 4.331628208473897e-05, "loss": 0.1535, "step": 1505 }, { "epoch": 0.7839666840187403, "grad_norm": 0.2501439526366261, "learning_rate": 4.3306716664944344e-05, "loss": 0.1558, "step": 1506 }, { "epoch": 0.784487246225924, "grad_norm": 0.24457951812109338, "learning_rate": 4.329714546303666e-05, "loss": 0.1539, "step": 1507 }, { "epoch": 0.7850078084331078, "grad_norm": 0.25274194830382657, "learning_rate": 4.328756848203897e-05, "loss": 0.1564, "step": 1508 }, { "epoch": 0.7855283706402915, "grad_norm": 0.24212505466279638, "learning_rate": 4.327798572497612e-05, "loss": 0.1585, "step": 1509 }, { "epoch": 0.7860489328474752, "grad_norm": 0.26756424368954107, "learning_rate": 4.3268397194874796e-05, "loss": 0.1617, "step": 1510 }, { "epoch": 0.786569495054659, "grad_norm": 0.24184787477349412, "learning_rate": 4.32588028947635e-05, "loss": 0.1619, "step": 1511 }, { "epoch": 0.7870900572618428, "grad_norm": 0.25168399053293283, "learning_rate": 4.3249202827672564e-05, "loss": 0.1602, "step": 1512 }, { "epoch": 0.7876106194690266, "grad_norm": 0.26161299334783633, "learning_rate": 4.3239596996634125e-05, "loss": 0.1576, "step": 1513 }, { "epoch": 0.7881311816762103, "grad_norm": 0.23097070104546066, "learning_rate": 4.322998540468216e-05, "loss": 0.149, "step": 1514 }, { "epoch": 0.7886517438833941, "grad_norm": 0.26176672102662274, "learning_rate": 4.322036805485245e-05, "loss": 0.1604, "step": 1515 }, { "epoch": 0.7891723060905779, "grad_norm": 0.2509842627379737, "learning_rate": 4.3210744950182603e-05, "loss": 0.1609, "step": 1516 }, { "epoch": 0.7896928682977615, "grad_norm": 0.25286215696526987, "learning_rate": 4.3201116093712045e-05, "loss": 0.1519, "step": 1517 }, { "epoch": 0.7902134305049453, "grad_norm": 0.2554977617373658, "learning_rate": 4.319148148848202e-05, "loss": 0.1507, "step": 1518 }, { "epoch": 0.7907339927121291, "grad_norm": 0.2610030848646668, "learning_rate": 4.3181841137535585e-05, "loss": 0.1565, "step": 1519 }, { "epoch": 0.7912545549193128, "grad_norm": 0.25847858497390264, "learning_rate": 4.317219504391761e-05, "loss": 0.1513, "step": 1520 }, { "epoch": 0.7917751171264966, "grad_norm": 0.25023349404312234, "learning_rate": 4.316254321067477e-05, "loss": 0.1632, "step": 1521 }, { "epoch": 0.7922956793336804, "grad_norm": 0.2641861638565703, "learning_rate": 4.315288564085558e-05, "loss": 0.1618, "step": 1522 }, { "epoch": 0.7928162415408642, "grad_norm": 0.2421590693178086, "learning_rate": 4.314322233751034e-05, "loss": 0.1558, "step": 1523 }, { "epoch": 0.7933368037480479, "grad_norm": 0.26085418193866505, "learning_rate": 4.313355330369117e-05, "loss": 0.1595, "step": 1524 }, { "epoch": 0.7938573659552316, "grad_norm": 0.2439161457038632, "learning_rate": 4.312387854245201e-05, "loss": 0.1466, "step": 1525 }, { "epoch": 0.7943779281624154, "grad_norm": 0.2713532358303694, "learning_rate": 4.3114198056848585e-05, "loss": 0.1608, "step": 1526 }, { "epoch": 0.7948984903695991, "grad_norm": 0.25922505436788407, "learning_rate": 4.3104511849938464e-05, "loss": 0.1557, "step": 1527 }, { "epoch": 0.7954190525767829, "grad_norm": 0.2886790825671015, "learning_rate": 4.309481992478098e-05, "loss": 0.1644, "step": 1528 }, { "epoch": 0.7959396147839667, "grad_norm": 0.24108201278459665, "learning_rate": 4.308512228443731e-05, "loss": 0.1596, "step": 1529 }, { "epoch": 0.7964601769911505, "grad_norm": 0.27215617885447624, "learning_rate": 4.30754189319704e-05, "loss": 0.1573, "step": 1530 }, { "epoch": 0.7969807391983342, "grad_norm": 0.28694200101027506, "learning_rate": 4.306570987044505e-05, "loss": 0.1686, "step": 1531 }, { "epoch": 0.797501301405518, "grad_norm": 0.24652959196034746, "learning_rate": 4.305599510292781e-05, "loss": 0.159, "step": 1532 }, { "epoch": 0.7980218636127017, "grad_norm": 0.2819197614862714, "learning_rate": 4.304627463248706e-05, "loss": 0.1588, "step": 1533 }, { "epoch": 0.7985424258198854, "grad_norm": 0.2909511370680243, "learning_rate": 4.3036548462192986e-05, "loss": 0.1606, "step": 1534 }, { "epoch": 0.7990629880270692, "grad_norm": 0.2530278943329065, "learning_rate": 4.302681659511755e-05, "loss": 0.1544, "step": 1535 }, { "epoch": 0.799583550234253, "grad_norm": 0.282363644399392, "learning_rate": 4.301707903433454e-05, "loss": 0.1556, "step": 1536 }, { "epoch": 0.8001041124414368, "grad_norm": 0.2676419532705495, "learning_rate": 4.300733578291953e-05, "loss": 0.1637, "step": 1537 }, { "epoch": 0.8006246746486205, "grad_norm": 0.25796205124564936, "learning_rate": 4.29975868439499e-05, "loss": 0.1609, "step": 1538 }, { "epoch": 0.8011452368558043, "grad_norm": 0.2819765513723248, "learning_rate": 4.29878322205048e-05, "loss": 0.1624, "step": 1539 }, { "epoch": 0.8016657990629881, "grad_norm": 0.27897029452667205, "learning_rate": 4.297807191566521e-05, "loss": 0.1572, "step": 1540 }, { "epoch": 0.8021863612701717, "grad_norm": 0.27824823196428977, "learning_rate": 4.2968305932513866e-05, "loss": 0.1622, "step": 1541 }, { "epoch": 0.8027069234773555, "grad_norm": 0.2738961827219555, "learning_rate": 4.295853427413535e-05, "loss": 0.1601, "step": 1542 }, { "epoch": 0.8032274856845393, "grad_norm": 0.2665241074273365, "learning_rate": 4.2948756943615985e-05, "loss": 0.1534, "step": 1543 }, { "epoch": 0.803748047891723, "grad_norm": 0.27182571850263426, "learning_rate": 4.293897394404392e-05, "loss": 0.1664, "step": 1544 }, { "epoch": 0.8042686100989068, "grad_norm": 0.2553846747993963, "learning_rate": 4.292918527850907e-05, "loss": 0.1543, "step": 1545 }, { "epoch": 0.8047891723060906, "grad_norm": 0.25252091088164785, "learning_rate": 4.291939095010316e-05, "loss": 0.1569, "step": 1546 }, { "epoch": 0.8053097345132744, "grad_norm": 0.3033445790518775, "learning_rate": 4.290959096191969e-05, "loss": 0.156, "step": 1547 }, { "epoch": 0.8058302967204581, "grad_norm": 0.26780210520674946, "learning_rate": 4.289978531705395e-05, "loss": 0.1629, "step": 1548 }, { "epoch": 0.8063508589276418, "grad_norm": 0.25340358566449683, "learning_rate": 4.288997401860303e-05, "loss": 0.1676, "step": 1549 }, { "epoch": 0.8068714211348256, "grad_norm": 0.26032260808346724, "learning_rate": 4.288015706966578e-05, "loss": 0.1599, "step": 1550 }, { "epoch": 0.8073919833420093, "grad_norm": 0.253837369944548, "learning_rate": 4.287033447334286e-05, "loss": 0.1584, "step": 1551 }, { "epoch": 0.8079125455491931, "grad_norm": 0.26639544259069964, "learning_rate": 4.2860506232736706e-05, "loss": 0.165, "step": 1552 }, { "epoch": 0.8084331077563769, "grad_norm": 0.2374985977664141, "learning_rate": 4.2850672350951516e-05, "loss": 0.1545, "step": 1553 }, { "epoch": 0.8089536699635607, "grad_norm": 0.2528600110588083, "learning_rate": 4.284083283109331e-05, "loss": 0.1579, "step": 1554 }, { "epoch": 0.8094742321707444, "grad_norm": 0.25899495284687546, "learning_rate": 4.283098767626984e-05, "loss": 0.1533, "step": 1555 }, { "epoch": 0.8099947943779282, "grad_norm": 0.2596540439723637, "learning_rate": 4.2821136889590696e-05, "loss": 0.1623, "step": 1556 }, { "epoch": 0.8105153565851119, "grad_norm": 0.2593899738580702, "learning_rate": 4.281128047416719e-05, "loss": 0.1622, "step": 1557 }, { "epoch": 0.8110359187922956, "grad_norm": 0.27318043091362676, "learning_rate": 4.280141843311244e-05, "loss": 0.1614, "step": 1558 }, { "epoch": 0.8115564809994794, "grad_norm": 0.2650371042158417, "learning_rate": 4.279155076954135e-05, "loss": 0.1551, "step": 1559 }, { "epoch": 0.8120770432066632, "grad_norm": 0.2554530943228691, "learning_rate": 4.2781677486570576e-05, "loss": 0.1622, "step": 1560 }, { "epoch": 0.812597605413847, "grad_norm": 0.2742519335456577, "learning_rate": 4.277179858731857e-05, "loss": 0.1619, "step": 1561 }, { "epoch": 0.8131181676210307, "grad_norm": 0.2613073326722418, "learning_rate": 4.276191407490553e-05, "loss": 0.1603, "step": 1562 }, { "epoch": 0.8136387298282145, "grad_norm": 0.26817158410006026, "learning_rate": 4.2752023952453465e-05, "loss": 0.1571, "step": 1563 }, { "epoch": 0.8141592920353983, "grad_norm": 0.26632267357937267, "learning_rate": 4.274212822308612e-05, "loss": 0.1599, "step": 1564 }, { "epoch": 0.8146798542425819, "grad_norm": 0.2596167990888178, "learning_rate": 4.273222688992904e-05, "loss": 0.1588, "step": 1565 }, { "epoch": 0.8152004164497657, "grad_norm": 0.27568862315736015, "learning_rate": 4.272231995610952e-05, "loss": 0.1542, "step": 1566 }, { "epoch": 0.8157209786569495, "grad_norm": 0.27114645918535957, "learning_rate": 4.271240742475664e-05, "loss": 0.1576, "step": 1567 }, { "epoch": 0.8162415408641333, "grad_norm": 0.24591250210741347, "learning_rate": 4.2702489299001224e-05, "loss": 0.1542, "step": 1568 }, { "epoch": 0.816762103071317, "grad_norm": 0.25998302637843507, "learning_rate": 4.269256558197588e-05, "loss": 0.1597, "step": 1569 }, { "epoch": 0.8172826652785008, "grad_norm": 0.277349113622191, "learning_rate": 4.2682636276815e-05, "loss": 0.1656, "step": 1570 }, { "epoch": 0.8178032274856846, "grad_norm": 0.2591716268174999, "learning_rate": 4.267270138665469e-05, "loss": 0.1636, "step": 1571 }, { "epoch": 0.8183237896928683, "grad_norm": 0.2626931448102875, "learning_rate": 4.266276091463286e-05, "loss": 0.1582, "step": 1572 }, { "epoch": 0.818844351900052, "grad_norm": 0.2533409992404636, "learning_rate": 4.26528148638892e-05, "loss": 0.1559, "step": 1573 }, { "epoch": 0.8193649141072358, "grad_norm": 0.2635890206268693, "learning_rate": 4.26428632375651e-05, "loss": 0.1587, "step": 1574 }, { "epoch": 0.8198854763144195, "grad_norm": 0.2781221733340093, "learning_rate": 4.2632906038803765e-05, "loss": 0.1661, "step": 1575 }, { "epoch": 0.8204060385216033, "grad_norm": 0.2466422831727149, "learning_rate": 4.262294327075014e-05, "loss": 0.1609, "step": 1576 }, { "epoch": 0.8209266007287871, "grad_norm": 0.26214772989894536, "learning_rate": 4.261297493655092e-05, "loss": 0.156, "step": 1577 }, { "epoch": 0.8214471629359709, "grad_norm": 0.25723469064882787, "learning_rate": 4.260300103935459e-05, "loss": 0.1562, "step": 1578 }, { "epoch": 0.8219677251431546, "grad_norm": 0.2597832572362618, "learning_rate": 4.2593021582311354e-05, "loss": 0.163, "step": 1579 }, { "epoch": 0.8224882873503384, "grad_norm": 0.25814069220724356, "learning_rate": 4.2583036568573184e-05, "loss": 0.1583, "step": 1580 }, { "epoch": 0.8230088495575221, "grad_norm": 0.2680515018520899, "learning_rate": 4.257304600129384e-05, "loss": 0.1544, "step": 1581 }, { "epoch": 0.8235294117647058, "grad_norm": 0.26943464813073753, "learning_rate": 4.256304988362878e-05, "loss": 0.1579, "step": 1582 }, { "epoch": 0.8240499739718896, "grad_norm": 0.2536098084394136, "learning_rate": 4.2553048218735256e-05, "loss": 0.155, "step": 1583 }, { "epoch": 0.8245705361790734, "grad_norm": 0.2757591782017878, "learning_rate": 4.254304100977225e-05, "loss": 0.1606, "step": 1584 }, { "epoch": 0.8250910983862572, "grad_norm": 0.24717329055472512, "learning_rate": 4.253302825990051e-05, "loss": 0.1456, "step": 1585 }, { "epoch": 0.8256116605934409, "grad_norm": 0.27846929929536657, "learning_rate": 4.2523009972282534e-05, "loss": 0.1619, "step": 1586 }, { "epoch": 0.8261322228006247, "grad_norm": 0.2456486639173495, "learning_rate": 4.2512986150082555e-05, "loss": 0.163, "step": 1587 }, { "epoch": 0.8266527850078085, "grad_norm": 0.28099219066923725, "learning_rate": 4.250295679646657e-05, "loss": 0.1581, "step": 1588 }, { "epoch": 0.8271733472149921, "grad_norm": 0.28012381846944834, "learning_rate": 4.24929219146023e-05, "loss": 0.1618, "step": 1589 }, { "epoch": 0.8276939094221759, "grad_norm": 0.2514914295217897, "learning_rate": 4.248288150765925e-05, "loss": 0.1565, "step": 1590 }, { "epoch": 0.8282144716293597, "grad_norm": 0.2716012154641059, "learning_rate": 4.2472835578808635e-05, "loss": 0.1589, "step": 1591 }, { "epoch": 0.8287350338365435, "grad_norm": 0.2815268850024281, "learning_rate": 4.2462784131223434e-05, "loss": 0.15, "step": 1592 }, { "epoch": 0.8292555960437272, "grad_norm": 0.26832812464875594, "learning_rate": 4.245272716807834e-05, "loss": 0.1516, "step": 1593 }, { "epoch": 0.829776158250911, "grad_norm": 0.255986260897498, "learning_rate": 4.244266469254984e-05, "loss": 0.1557, "step": 1594 }, { "epoch": 0.8302967204580948, "grad_norm": 0.27756007774738645, "learning_rate": 4.243259670781611e-05, "loss": 0.1582, "step": 1595 }, { "epoch": 0.8308172826652785, "grad_norm": 0.2788845280857182, "learning_rate": 4.2422523217057104e-05, "loss": 0.1611, "step": 1596 }, { "epoch": 0.8313378448724622, "grad_norm": 0.265204765669698, "learning_rate": 4.241244422345448e-05, "loss": 0.1654, "step": 1597 }, { "epoch": 0.831858407079646, "grad_norm": 0.29862671442559635, "learning_rate": 4.240235973019168e-05, "loss": 0.1628, "step": 1598 }, { "epoch": 0.8323789692868298, "grad_norm": 0.25068737790327783, "learning_rate": 4.239226974045383e-05, "loss": 0.1612, "step": 1599 }, { "epoch": 0.8328995314940135, "grad_norm": 0.26109969950747847, "learning_rate": 4.2382174257427845e-05, "loss": 0.1526, "step": 1600 }, { "epoch": 0.8334200937011973, "grad_norm": 0.2708108326367472, "learning_rate": 4.237207328430232e-05, "loss": 0.1574, "step": 1601 }, { "epoch": 0.8339406559083811, "grad_norm": 0.2603788294937712, "learning_rate": 4.236196682426762e-05, "loss": 0.1603, "step": 1602 }, { "epoch": 0.8344612181155648, "grad_norm": 0.28645615014842474, "learning_rate": 4.235185488051585e-05, "loss": 0.164, "step": 1603 }, { "epoch": 0.8349817803227486, "grad_norm": 0.25263310940554584, "learning_rate": 4.2341737456240815e-05, "loss": 0.152, "step": 1604 }, { "epoch": 0.8355023425299323, "grad_norm": 0.29024837078909443, "learning_rate": 4.233161455463809e-05, "loss": 0.1584, "step": 1605 }, { "epoch": 0.836022904737116, "grad_norm": 0.2698645950290543, "learning_rate": 4.232148617890493e-05, "loss": 0.1603, "step": 1606 }, { "epoch": 0.8365434669442998, "grad_norm": 0.28134226221542846, "learning_rate": 4.231135233224037e-05, "loss": 0.1492, "step": 1607 }, { "epoch": 0.8370640291514836, "grad_norm": 0.2436518446483368, "learning_rate": 4.2301213017845144e-05, "loss": 0.1543, "step": 1608 }, { "epoch": 0.8375845913586674, "grad_norm": 0.28214111662149993, "learning_rate": 4.2291068238921714e-05, "loss": 0.1689, "step": 1609 }, { "epoch": 0.8381051535658511, "grad_norm": 0.2863837316615571, "learning_rate": 4.228091799867427e-05, "loss": 0.1578, "step": 1610 }, { "epoch": 0.8386257157730349, "grad_norm": 0.28091970790308574, "learning_rate": 4.227076230030875e-05, "loss": 0.1577, "step": 1611 }, { "epoch": 0.8391462779802187, "grad_norm": 0.2593788595939422, "learning_rate": 4.226060114703278e-05, "loss": 0.1591, "step": 1612 }, { "epoch": 0.8396668401874023, "grad_norm": 0.270786976366135, "learning_rate": 4.225043454205573e-05, "loss": 0.1646, "step": 1613 }, { "epoch": 0.8401874023945861, "grad_norm": 0.2689847000833549, "learning_rate": 4.224026248858868e-05, "loss": 0.1574, "step": 1614 }, { "epoch": 0.8407079646017699, "grad_norm": 0.2577066953840738, "learning_rate": 4.2230084989844454e-05, "loss": 0.155, "step": 1615 }, { "epoch": 0.8412285268089537, "grad_norm": 0.2550154489290601, "learning_rate": 4.221990204903756e-05, "loss": 0.1564, "step": 1616 }, { "epoch": 0.8417490890161374, "grad_norm": 0.273560448413933, "learning_rate": 4.220971366938425e-05, "loss": 0.1578, "step": 1617 }, { "epoch": 0.8422696512233212, "grad_norm": 0.24371532149607608, "learning_rate": 4.21995198541025e-05, "loss": 0.1531, "step": 1618 }, { "epoch": 0.842790213430505, "grad_norm": 0.28921374195808397, "learning_rate": 4.218932060641198e-05, "loss": 0.1561, "step": 1619 }, { "epoch": 0.8433107756376887, "grad_norm": 0.2829460232916761, "learning_rate": 4.217911592953409e-05, "loss": 0.1621, "step": 1620 }, { "epoch": 0.8438313378448724, "grad_norm": 0.26061758786155376, "learning_rate": 4.216890582669194e-05, "loss": 0.1484, "step": 1621 }, { "epoch": 0.8443519000520562, "grad_norm": 0.2619069964371061, "learning_rate": 4.2158690301110366e-05, "loss": 0.1622, "step": 1622 }, { "epoch": 0.84487246225924, "grad_norm": 0.27728638024745567, "learning_rate": 4.2148469356015896e-05, "loss": 0.1573, "step": 1623 }, { "epoch": 0.8453930244664237, "grad_norm": 0.2660951427495475, "learning_rate": 4.213824299463678e-05, "loss": 0.1653, "step": 1624 }, { "epoch": 0.8459135866736075, "grad_norm": 0.2617492948395115, "learning_rate": 4.2128011220202976e-05, "loss": 0.1633, "step": 1625 }, { "epoch": 0.8464341488807913, "grad_norm": 0.29138627520898736, "learning_rate": 4.211777403594617e-05, "loss": 0.1596, "step": 1626 }, { "epoch": 0.846954711087975, "grad_norm": 0.24888350370609963, "learning_rate": 4.210753144509972e-05, "loss": 0.1606, "step": 1627 }, { "epoch": 0.8474752732951588, "grad_norm": 0.2505367676114268, "learning_rate": 4.209728345089873e-05, "loss": 0.1629, "step": 1628 }, { "epoch": 0.8479958355023425, "grad_norm": 0.25895259510640567, "learning_rate": 4.208703005657999e-05, "loss": 0.1591, "step": 1629 }, { "epoch": 0.8485163977095262, "grad_norm": 0.25678447658914816, "learning_rate": 4.207677126538199e-05, "loss": 0.1606, "step": 1630 }, { "epoch": 0.84903695991671, "grad_norm": 0.26827714044896583, "learning_rate": 4.206650708054494e-05, "loss": 0.1457, "step": 1631 }, { "epoch": 0.8495575221238938, "grad_norm": 0.2722481858513276, "learning_rate": 4.205623750531076e-05, "loss": 0.1575, "step": 1632 }, { "epoch": 0.8500780843310776, "grad_norm": 0.27881803489843254, "learning_rate": 4.204596254292303e-05, "loss": 0.1581, "step": 1633 }, { "epoch": 0.8505986465382613, "grad_norm": 0.2519593987498123, "learning_rate": 4.203568219662709e-05, "loss": 0.1565, "step": 1634 }, { "epoch": 0.8511192087454451, "grad_norm": 0.26920318449984887, "learning_rate": 4.202539646966993e-05, "loss": 0.1525, "step": 1635 }, { "epoch": 0.8516397709526289, "grad_norm": 0.26653026491152526, "learning_rate": 4.2015105365300276e-05, "loss": 0.1609, "step": 1636 }, { "epoch": 0.8521603331598125, "grad_norm": 0.2827093407759518, "learning_rate": 4.200480888676853e-05, "loss": 0.1611, "step": 1637 }, { "epoch": 0.8526808953669963, "grad_norm": 0.2652398385704472, "learning_rate": 4.199450703732681e-05, "loss": 0.1513, "step": 1638 }, { "epoch": 0.8532014575741801, "grad_norm": 0.2696612536923782, "learning_rate": 4.19841998202289e-05, "loss": 0.1566, "step": 1639 }, { "epoch": 0.8537220197813639, "grad_norm": 0.28633933577187565, "learning_rate": 4.197388723873032e-05, "loss": 0.1578, "step": 1640 }, { "epoch": 0.8542425819885476, "grad_norm": 0.2690705270209936, "learning_rate": 4.196356929608825e-05, "loss": 0.1558, "step": 1641 }, { "epoch": 0.8547631441957314, "grad_norm": 0.24483032302626148, "learning_rate": 4.195324599556158e-05, "loss": 0.1494, "step": 1642 }, { "epoch": 0.8552837064029152, "grad_norm": 0.25311938346196466, "learning_rate": 4.194291734041089e-05, "loss": 0.1585, "step": 1643 }, { "epoch": 0.855804268610099, "grad_norm": 0.2687437228508878, "learning_rate": 4.193258333389844e-05, "loss": 0.1585, "step": 1644 }, { "epoch": 0.8563248308172826, "grad_norm": 0.23790698337821162, "learning_rate": 4.1922243979288205e-05, "loss": 0.1591, "step": 1645 }, { "epoch": 0.8568453930244664, "grad_norm": 0.2720940365664535, "learning_rate": 4.191189927984583e-05, "loss": 0.1591, "step": 1646 }, { "epoch": 0.8573659552316502, "grad_norm": 0.23733908382168187, "learning_rate": 4.190154923883865e-05, "loss": 0.1497, "step": 1647 }, { "epoch": 0.8578865174388339, "grad_norm": 0.2767062941528966, "learning_rate": 4.1891193859535686e-05, "loss": 0.1606, "step": 1648 }, { "epoch": 0.8584070796460177, "grad_norm": 0.23093654286620274, "learning_rate": 4.1880833145207655e-05, "loss": 0.15, "step": 1649 }, { "epoch": 0.8589276418532015, "grad_norm": 0.24409975896473038, "learning_rate": 4.187046709912695e-05, "loss": 0.1608, "step": 1650 }, { "epoch": 0.8594482040603852, "grad_norm": 0.2641900349527886, "learning_rate": 4.186009572456765e-05, "loss": 0.1583, "step": 1651 }, { "epoch": 0.859968766267569, "grad_norm": 0.25565972911270796, "learning_rate": 4.184971902480552e-05, "loss": 0.1543, "step": 1652 }, { "epoch": 0.8604893284747527, "grad_norm": 0.26824705481294847, "learning_rate": 4.183933700311801e-05, "loss": 0.1673, "step": 1653 }, { "epoch": 0.8610098906819365, "grad_norm": 0.2430609538785938, "learning_rate": 4.1828949662784236e-05, "loss": 0.1462, "step": 1654 }, { "epoch": 0.8615304528891202, "grad_norm": 0.2846978931791095, "learning_rate": 4.1818557007085e-05, "loss": 0.163, "step": 1655 }, { "epoch": 0.862051015096304, "grad_norm": 0.25943378302982295, "learning_rate": 4.1808159039302795e-05, "loss": 0.1561, "step": 1656 }, { "epoch": 0.8625715773034878, "grad_norm": 0.26198173198500524, "learning_rate": 4.1797755762721787e-05, "loss": 0.1552, "step": 1657 }, { "epoch": 0.8630921395106715, "grad_norm": 0.24923720021291665, "learning_rate": 4.17873471806278e-05, "loss": 0.1586, "step": 1658 }, { "epoch": 0.8636127017178553, "grad_norm": 0.2974201345925281, "learning_rate": 4.177693329630837e-05, "loss": 0.1578, "step": 1659 }, { "epoch": 0.8641332639250391, "grad_norm": 0.24813334349056498, "learning_rate": 4.176651411305266e-05, "loss": 0.1585, "step": 1660 }, { "epoch": 0.8646538261322227, "grad_norm": 0.2688434182973993, "learning_rate": 4.175608963415155e-05, "loss": 0.1627, "step": 1661 }, { "epoch": 0.8651743883394065, "grad_norm": 0.2525026939637232, "learning_rate": 4.174565986289758e-05, "loss": 0.1588, "step": 1662 }, { "epoch": 0.8656949505465903, "grad_norm": 0.2734677290901909, "learning_rate": 4.1735224802584946e-05, "loss": 0.1566, "step": 1663 }, { "epoch": 0.8662155127537741, "grad_norm": 0.24890893238886064, "learning_rate": 4.172478445650953e-05, "loss": 0.1605, "step": 1664 }, { "epoch": 0.8667360749609578, "grad_norm": 0.2705875269181692, "learning_rate": 4.171433882796888e-05, "loss": 0.1629, "step": 1665 }, { "epoch": 0.8672566371681416, "grad_norm": 0.2576695174793914, "learning_rate": 4.1703887920262195e-05, "loss": 0.153, "step": 1666 }, { "epoch": 0.8677771993753254, "grad_norm": 0.2566955998695762, "learning_rate": 4.1693431736690386e-05, "loss": 0.1538, "step": 1667 }, { "epoch": 0.8682977615825092, "grad_norm": 0.2795710749963426, "learning_rate": 4.1682970280555986e-05, "loss": 0.1593, "step": 1668 }, { "epoch": 0.8688183237896928, "grad_norm": 0.2565679446639193, "learning_rate": 4.1672503555163215e-05, "loss": 0.167, "step": 1669 }, { "epoch": 0.8693388859968766, "grad_norm": 0.26408964383887445, "learning_rate": 4.166203156381795e-05, "loss": 0.1518, "step": 1670 }, { "epoch": 0.8698594482040604, "grad_norm": 0.24825446479368055, "learning_rate": 4.1651554309827725e-05, "loss": 0.1589, "step": 1671 }, { "epoch": 0.8703800104112441, "grad_norm": 0.2674815525271392, "learning_rate": 4.1641071796501764e-05, "loss": 0.1579, "step": 1672 }, { "epoch": 0.8709005726184279, "grad_norm": 0.26322901088276796, "learning_rate": 4.163058402715091e-05, "loss": 0.1587, "step": 1673 }, { "epoch": 0.8714211348256117, "grad_norm": 0.27350744005632355, "learning_rate": 4.1620091005087714e-05, "loss": 0.165, "step": 1674 }, { "epoch": 0.8719416970327954, "grad_norm": 0.263352010398878, "learning_rate": 4.1609592733626335e-05, "loss": 0.155, "step": 1675 }, { "epoch": 0.8724622592399792, "grad_norm": 0.2693710231228204, "learning_rate": 4.159908921608263e-05, "loss": 0.1589, "step": 1676 }, { "epoch": 0.8729828214471629, "grad_norm": 0.2846789455817984, "learning_rate": 4.158858045577409e-05, "loss": 0.1557, "step": 1677 }, { "epoch": 0.8735033836543467, "grad_norm": 0.25566156183607636, "learning_rate": 4.157806645601988e-05, "loss": 0.1645, "step": 1678 }, { "epoch": 0.8740239458615304, "grad_norm": 0.2817627765773898, "learning_rate": 4.1567547220140814e-05, "loss": 0.1601, "step": 1679 }, { "epoch": 0.8745445080687142, "grad_norm": 0.2505616741147018, "learning_rate": 4.155702275145934e-05, "loss": 0.1547, "step": 1680 }, { "epoch": 0.875065070275898, "grad_norm": 0.2553052002789369, "learning_rate": 4.154649305329958e-05, "loss": 0.1591, "step": 1681 }, { "epoch": 0.8755856324830817, "grad_norm": 0.25048785354273917, "learning_rate": 4.153595812898732e-05, "loss": 0.1597, "step": 1682 }, { "epoch": 0.8761061946902655, "grad_norm": 0.25427482435094684, "learning_rate": 4.152541798184995e-05, "loss": 0.161, "step": 1683 }, { "epoch": 0.8766267568974493, "grad_norm": 0.24530895241431033, "learning_rate": 4.151487261521656e-05, "loss": 0.1581, "step": 1684 }, { "epoch": 0.877147319104633, "grad_norm": 0.25440752591956056, "learning_rate": 4.1504322032417864e-05, "loss": 0.1638, "step": 1685 }, { "epoch": 0.8776678813118167, "grad_norm": 0.2587793514600556, "learning_rate": 4.149376623678623e-05, "loss": 0.1587, "step": 1686 }, { "epoch": 0.8781884435190005, "grad_norm": 0.24239699887208244, "learning_rate": 4.148320523165566e-05, "loss": 0.1514, "step": 1687 }, { "epoch": 0.8787090057261843, "grad_norm": 0.24553152349960053, "learning_rate": 4.147263902036181e-05, "loss": 0.1538, "step": 1688 }, { "epoch": 0.879229567933368, "grad_norm": 0.2499331408281633, "learning_rate": 4.146206760624199e-05, "loss": 0.1603, "step": 1689 }, { "epoch": 0.8797501301405518, "grad_norm": 0.24703228194514043, "learning_rate": 4.145149099263515e-05, "loss": 0.1577, "step": 1690 }, { "epoch": 0.8802706923477356, "grad_norm": 0.2723118994778378, "learning_rate": 4.1440909182881857e-05, "loss": 0.1587, "step": 1691 }, { "epoch": 0.8807912545549194, "grad_norm": 0.2754696773958591, "learning_rate": 4.143032218032435e-05, "loss": 0.1562, "step": 1692 }, { "epoch": 0.881311816762103, "grad_norm": 0.2617623912854439, "learning_rate": 4.141972998830651e-05, "loss": 0.1585, "step": 1693 }, { "epoch": 0.8818323789692868, "grad_norm": 0.30890725644539296, "learning_rate": 4.140913261017382e-05, "loss": 0.1624, "step": 1694 }, { "epoch": 0.8823529411764706, "grad_norm": 0.24884174691256375, "learning_rate": 4.139853004927344e-05, "loss": 0.1559, "step": 1695 }, { "epoch": 0.8828735033836543, "grad_norm": 0.27439772981131394, "learning_rate": 4.1387922308954154e-05, "loss": 0.1562, "step": 1696 }, { "epoch": 0.8833940655908381, "grad_norm": 0.2542935955914035, "learning_rate": 4.137730939256636e-05, "loss": 0.1619, "step": 1697 }, { "epoch": 0.8839146277980219, "grad_norm": 0.28136889554044375, "learning_rate": 4.1366691303462144e-05, "loss": 0.1625, "step": 1698 }, { "epoch": 0.8844351900052057, "grad_norm": 0.247375024118648, "learning_rate": 4.135606804499516e-05, "loss": 0.1599, "step": 1699 }, { "epoch": 0.8849557522123894, "grad_norm": 0.2608875547949653, "learning_rate": 4.1345439620520744e-05, "loss": 0.1639, "step": 1700 }, { "epoch": 0.8854763144195731, "grad_norm": 0.2556809971776182, "learning_rate": 4.1334806033395845e-05, "loss": 0.1596, "step": 1701 }, { "epoch": 0.8859968766267569, "grad_norm": 0.25036073808421017, "learning_rate": 4.132416728697905e-05, "loss": 0.1567, "step": 1702 }, { "epoch": 0.8865174388339406, "grad_norm": 0.24789643869487482, "learning_rate": 4.131352338463056e-05, "loss": 0.1551, "step": 1703 }, { "epoch": 0.8870380010411244, "grad_norm": 0.2597436931442242, "learning_rate": 4.130287432971222e-05, "loss": 0.1635, "step": 1704 }, { "epoch": 0.8875585632483082, "grad_norm": 0.25176527895305045, "learning_rate": 4.1292220125587494e-05, "loss": 0.1529, "step": 1705 }, { "epoch": 0.888079125455492, "grad_norm": 0.22936720495484114, "learning_rate": 4.1281560775621475e-05, "loss": 0.1536, "step": 1706 }, { "epoch": 0.8885996876626757, "grad_norm": 0.29208711846524826, "learning_rate": 4.1270896283180896e-05, "loss": 0.1669, "step": 1707 }, { "epoch": 0.8891202498698595, "grad_norm": 0.24185973845450207, "learning_rate": 4.1260226651634074e-05, "loss": 0.153, "step": 1708 }, { "epoch": 0.8896408120770432, "grad_norm": 0.2405834957600981, "learning_rate": 4.1249551884351e-05, "loss": 0.1548, "step": 1709 }, { "epoch": 0.8901613742842269, "grad_norm": 0.2640002798637536, "learning_rate": 4.1238871984703255e-05, "loss": 0.152, "step": 1710 }, { "epoch": 0.8906819364914107, "grad_norm": 0.3051529300584874, "learning_rate": 4.122818695606403e-05, "loss": 0.161, "step": 1711 }, { "epoch": 0.8912024986985945, "grad_norm": 0.2700106707452003, "learning_rate": 4.121749680180818e-05, "loss": 0.1564, "step": 1712 }, { "epoch": 0.8917230609057782, "grad_norm": 0.26880881799117995, "learning_rate": 4.1206801525312144e-05, "loss": 0.1584, "step": 1713 }, { "epoch": 0.892243623112962, "grad_norm": 0.2658890951024931, "learning_rate": 4.119610112995398e-05, "loss": 0.1567, "step": 1714 }, { "epoch": 0.8927641853201458, "grad_norm": 0.2546814903680581, "learning_rate": 4.118539561911339e-05, "loss": 0.1554, "step": 1715 }, { "epoch": 0.8932847475273296, "grad_norm": 0.24712755281104856, "learning_rate": 4.1174684996171644e-05, "loss": 0.1535, "step": 1716 }, { "epoch": 0.8938053097345132, "grad_norm": 0.2806241667002411, "learning_rate": 4.116396926451168e-05, "loss": 0.1563, "step": 1717 }, { "epoch": 0.894325871941697, "grad_norm": 0.27342193998178105, "learning_rate": 4.115324842751802e-05, "loss": 0.1592, "step": 1718 }, { "epoch": 0.8948464341488808, "grad_norm": 0.23962746337276375, "learning_rate": 4.114252248857679e-05, "loss": 0.1472, "step": 1719 }, { "epoch": 0.8953669963560645, "grad_norm": 0.26324287304353916, "learning_rate": 4.1131791451075755e-05, "loss": 0.159, "step": 1720 }, { "epoch": 0.8958875585632483, "grad_norm": 0.26150443069954205, "learning_rate": 4.1121055318404264e-05, "loss": 0.1539, "step": 1721 }, { "epoch": 0.8964081207704321, "grad_norm": 0.24755890951530543, "learning_rate": 4.1110314093953305e-05, "loss": 0.1582, "step": 1722 }, { "epoch": 0.8969286829776159, "grad_norm": 0.27523785393031364, "learning_rate": 4.109956778111544e-05, "loss": 0.1629, "step": 1723 }, { "epoch": 0.8974492451847996, "grad_norm": 0.2591994387571103, "learning_rate": 4.108881638328486e-05, "loss": 0.1478, "step": 1724 }, { "epoch": 0.8979698073919833, "grad_norm": 0.29966605704015586, "learning_rate": 4.1078059903857355e-05, "loss": 0.1601, "step": 1725 }, { "epoch": 0.8984903695991671, "grad_norm": 0.25390075851127014, "learning_rate": 4.1067298346230335e-05, "loss": 0.1501, "step": 1726 }, { "epoch": 0.8990109318063508, "grad_norm": 0.2687980166473991, "learning_rate": 4.105653171380278e-05, "loss": 0.1498, "step": 1727 }, { "epoch": 0.8995314940135346, "grad_norm": 0.28950396723952554, "learning_rate": 4.10457600099753e-05, "loss": 0.1549, "step": 1728 }, { "epoch": 0.9000520562207184, "grad_norm": 0.25514115726981085, "learning_rate": 4.103498323815011e-05, "loss": 0.1647, "step": 1729 }, { "epoch": 0.9005726184279021, "grad_norm": 0.2587811609001319, "learning_rate": 4.1024201401731005e-05, "loss": 0.1583, "step": 1730 }, { "epoch": 0.9010931806350859, "grad_norm": 0.2604547864469705, "learning_rate": 4.1013414504123396e-05, "loss": 0.1561, "step": 1731 }, { "epoch": 0.9016137428422697, "grad_norm": 0.24850657249226776, "learning_rate": 4.1002622548734296e-05, "loss": 0.1522, "step": 1732 }, { "epoch": 0.9021343050494534, "grad_norm": 0.2625089988338353, "learning_rate": 4.099182553897229e-05, "loss": 0.1618, "step": 1733 }, { "epoch": 0.9026548672566371, "grad_norm": 0.2632632791671295, "learning_rate": 4.098102347824758e-05, "loss": 0.1555, "step": 1734 }, { "epoch": 0.9031754294638209, "grad_norm": 0.23666054780572104, "learning_rate": 4.097021636997196e-05, "loss": 0.1595, "step": 1735 }, { "epoch": 0.9036959916710047, "grad_norm": 0.2591388444201557, "learning_rate": 4.095940421755883e-05, "loss": 0.1656, "step": 1736 }, { "epoch": 0.9042165538781884, "grad_norm": 0.2591712441056336, "learning_rate": 4.094858702442316e-05, "loss": 0.1526, "step": 1737 }, { "epoch": 0.9047371160853722, "grad_norm": 0.24844682559914494, "learning_rate": 4.093776479398151e-05, "loss": 0.157, "step": 1738 }, { "epoch": 0.905257678292556, "grad_norm": 0.2559696276702533, "learning_rate": 4.092693752965208e-05, "loss": 0.155, "step": 1739 }, { "epoch": 0.9057782404997398, "grad_norm": 0.2568180726661143, "learning_rate": 4.091610523485458e-05, "loss": 0.1567, "step": 1740 }, { "epoch": 0.9062988027069234, "grad_norm": 0.27176199980897775, "learning_rate": 4.09052679130104e-05, "loss": 0.1473, "step": 1741 }, { "epoch": 0.9068193649141072, "grad_norm": 0.26678573538868655, "learning_rate": 4.089442556754243e-05, "loss": 0.1591, "step": 1742 }, { "epoch": 0.907339927121291, "grad_norm": 0.2568264736060921, "learning_rate": 4.088357820187521e-05, "loss": 0.1624, "step": 1743 }, { "epoch": 0.9078604893284747, "grad_norm": 0.2551009665433581, "learning_rate": 4.087272581943483e-05, "loss": 0.164, "step": 1744 }, { "epoch": 0.9083810515356585, "grad_norm": 0.2596099419866776, "learning_rate": 4.0861868423648985e-05, "loss": 0.1593, "step": 1745 }, { "epoch": 0.9089016137428423, "grad_norm": 0.25573551965558855, "learning_rate": 4.085100601794695e-05, "loss": 0.1598, "step": 1746 }, { "epoch": 0.9094221759500261, "grad_norm": 0.25024824627264436, "learning_rate": 4.084013860575956e-05, "loss": 0.1499, "step": 1747 }, { "epoch": 0.9099427381572098, "grad_norm": 0.24563669418483275, "learning_rate": 4.0829266190519264e-05, "loss": 0.1533, "step": 1748 }, { "epoch": 0.9104633003643935, "grad_norm": 0.25857714800578796, "learning_rate": 4.0818388775660083e-05, "loss": 0.1616, "step": 1749 }, { "epoch": 0.9109838625715773, "grad_norm": 0.25995854352713227, "learning_rate": 4.08075063646176e-05, "loss": 0.1607, "step": 1750 }, { "epoch": 0.911504424778761, "grad_norm": 0.24199870728320377, "learning_rate": 4.079661896082899e-05, "loss": 0.155, "step": 1751 }, { "epoch": 0.9120249869859448, "grad_norm": 0.2585528070362897, "learning_rate": 4.0785726567733e-05, "loss": 0.1695, "step": 1752 }, { "epoch": 0.9125455491931286, "grad_norm": 0.2535260703066128, "learning_rate": 4.0774829188769946e-05, "loss": 0.1553, "step": 1753 }, { "epoch": 0.9130661114003124, "grad_norm": 0.24975570980491604, "learning_rate": 4.076392682738175e-05, "loss": 0.155, "step": 1754 }, { "epoch": 0.9135866736074961, "grad_norm": 0.23729277290872783, "learning_rate": 4.075301948701186e-05, "loss": 0.1559, "step": 1755 }, { "epoch": 0.9141072358146799, "grad_norm": 0.2576836288112198, "learning_rate": 4.074210717110534e-05, "loss": 0.1508, "step": 1756 }, { "epoch": 0.9146277980218636, "grad_norm": 0.23905943214710218, "learning_rate": 4.07311898831088e-05, "loss": 0.1595, "step": 1757 }, { "epoch": 0.9151483602290473, "grad_norm": 0.2623063356166442, "learning_rate": 4.072026762647043e-05, "loss": 0.155, "step": 1758 }, { "epoch": 0.9156689224362311, "grad_norm": 0.25004028247757853, "learning_rate": 4.070934040463998e-05, "loss": 0.151, "step": 1759 }, { "epoch": 0.9161894846434149, "grad_norm": 0.24742792524698254, "learning_rate": 4.069840822106879e-05, "loss": 0.158, "step": 1760 }, { "epoch": 0.9167100468505986, "grad_norm": 0.2536956962460334, "learning_rate": 4.068747107920974e-05, "loss": 0.1525, "step": 1761 }, { "epoch": 0.9172306090577824, "grad_norm": 0.2483073074101714, "learning_rate": 4.067652898251729e-05, "loss": 0.1516, "step": 1762 }, { "epoch": 0.9177511712649662, "grad_norm": 0.24230992836435572, "learning_rate": 4.066558193444746e-05, "loss": 0.1521, "step": 1763 }, { "epoch": 0.91827173347215, "grad_norm": 0.26776759518460846, "learning_rate": 4.065462993845784e-05, "loss": 0.1615, "step": 1764 }, { "epoch": 0.9187922956793336, "grad_norm": 0.247701252468107, "learning_rate": 4.0643672998007593e-05, "loss": 0.156, "step": 1765 }, { "epoch": 0.9193128578865174, "grad_norm": 0.2525494661335194, "learning_rate": 4.063271111655741e-05, "loss": 0.1544, "step": 1766 }, { "epoch": 0.9198334200937012, "grad_norm": 0.24152047171275015, "learning_rate": 4.062174429756958e-05, "loss": 0.1582, "step": 1767 }, { "epoch": 0.9203539823008849, "grad_norm": 0.24113773563979826, "learning_rate": 4.0610772544507925e-05, "loss": 0.157, "step": 1768 }, { "epoch": 0.9208745445080687, "grad_norm": 0.24499303443994122, "learning_rate": 4.059979586083783e-05, "loss": 0.1589, "step": 1769 }, { "epoch": 0.9213951067152525, "grad_norm": 0.2441852531096595, "learning_rate": 4.0588814250026255e-05, "loss": 0.1514, "step": 1770 }, { "epoch": 0.9219156689224363, "grad_norm": 0.2618027069982354, "learning_rate": 4.05778277155417e-05, "loss": 0.1581, "step": 1771 }, { "epoch": 0.92243623112962, "grad_norm": 0.2604906720770046, "learning_rate": 4.056683626085422e-05, "loss": 0.161, "step": 1772 }, { "epoch": 0.9229567933368037, "grad_norm": 0.23267751863286024, "learning_rate": 4.0555839889435446e-05, "loss": 0.1482, "step": 1773 }, { "epoch": 0.9234773555439875, "grad_norm": 0.25641954777935727, "learning_rate": 4.054483860475851e-05, "loss": 0.1541, "step": 1774 }, { "epoch": 0.9239979177511712, "grad_norm": 0.2596515890918495, "learning_rate": 4.053383241029815e-05, "loss": 0.1583, "step": 1775 }, { "epoch": 0.924518479958355, "grad_norm": 0.23481778190953714, "learning_rate": 4.0522821309530635e-05, "loss": 0.1548, "step": 1776 }, { "epoch": 0.9250390421655388, "grad_norm": 0.24362441004243998, "learning_rate": 4.051180530593379e-05, "loss": 0.1584, "step": 1777 }, { "epoch": 0.9255596043727226, "grad_norm": 0.2520010539214911, "learning_rate": 4.0500784402986956e-05, "loss": 0.1551, "step": 1778 }, { "epoch": 0.9260801665799063, "grad_norm": 0.26110890320468977, "learning_rate": 4.0489758604171076e-05, "loss": 0.1531, "step": 1779 }, { "epoch": 0.9266007287870901, "grad_norm": 0.27497633290869694, "learning_rate": 4.047872791296859e-05, "loss": 0.1618, "step": 1780 }, { "epoch": 0.9271212909942738, "grad_norm": 0.26975908040013474, "learning_rate": 4.0467692332863515e-05, "loss": 0.1487, "step": 1781 }, { "epoch": 0.9276418532014575, "grad_norm": 0.26862955487856527, "learning_rate": 4.04566518673414e-05, "loss": 0.1561, "step": 1782 }, { "epoch": 0.9281624154086413, "grad_norm": 0.25511699277211003, "learning_rate": 4.044560651988933e-05, "loss": 0.1615, "step": 1783 }, { "epoch": 0.9286829776158251, "grad_norm": 0.2667422054425098, "learning_rate": 4.043455629399594e-05, "loss": 0.1582, "step": 1784 }, { "epoch": 0.9292035398230089, "grad_norm": 0.2706278456881864, "learning_rate": 4.0423501193151416e-05, "loss": 0.1537, "step": 1785 }, { "epoch": 0.9297241020301926, "grad_norm": 0.2540911793302258, "learning_rate": 4.041244122084747e-05, "loss": 0.1513, "step": 1786 }, { "epoch": 0.9302446642373764, "grad_norm": 0.25585354118375886, "learning_rate": 4.040137638057735e-05, "loss": 0.1558, "step": 1787 }, { "epoch": 0.9307652264445602, "grad_norm": 0.23363718441591247, "learning_rate": 4.039030667583585e-05, "loss": 0.1468, "step": 1788 }, { "epoch": 0.9312857886517438, "grad_norm": 0.2779496142875094, "learning_rate": 4.037923211011929e-05, "loss": 0.1513, "step": 1789 }, { "epoch": 0.9318063508589276, "grad_norm": 0.235742116241504, "learning_rate": 4.036815268692556e-05, "loss": 0.1501, "step": 1790 }, { "epoch": 0.9323269130661114, "grad_norm": 0.2561251442991573, "learning_rate": 4.035706840975403e-05, "loss": 0.1611, "step": 1791 }, { "epoch": 0.9328474752732951, "grad_norm": 0.2564452558632798, "learning_rate": 4.0345979282105637e-05, "loss": 0.1573, "step": 1792 }, { "epoch": 0.9333680374804789, "grad_norm": 0.2545473340743804, "learning_rate": 4.033488530748285e-05, "loss": 0.1543, "step": 1793 }, { "epoch": 0.9338885996876627, "grad_norm": 0.25319840729825216, "learning_rate": 4.032378648938966e-05, "loss": 0.1591, "step": 1794 }, { "epoch": 0.9344091618948465, "grad_norm": 0.24462206700432024, "learning_rate": 4.031268283133158e-05, "loss": 0.1492, "step": 1795 }, { "epoch": 0.9349297241020302, "grad_norm": 0.2442034089324051, "learning_rate": 4.030157433681568e-05, "loss": 0.1564, "step": 1796 }, { "epoch": 0.9354502863092139, "grad_norm": 0.23610337924487748, "learning_rate": 4.0290461009350535e-05, "loss": 0.1446, "step": 1797 }, { "epoch": 0.9359708485163977, "grad_norm": 0.7555257854589889, "learning_rate": 4.0279342852446234e-05, "loss": 0.154, "step": 1798 }, { "epoch": 0.9364914107235814, "grad_norm": 0.2730083918906193, "learning_rate": 4.026821986961443e-05, "loss": 0.1557, "step": 1799 }, { "epoch": 0.9370119729307652, "grad_norm": 0.24844427506768513, "learning_rate": 4.0257092064368266e-05, "loss": 0.1556, "step": 1800 }, { "epoch": 0.937532535137949, "grad_norm": 0.2550146011348462, "learning_rate": 4.0245959440222425e-05, "loss": 0.1526, "step": 1801 }, { "epoch": 0.9380530973451328, "grad_norm": 0.2643186732751096, "learning_rate": 4.023482200069311e-05, "loss": 0.1648, "step": 1802 }, { "epoch": 0.9385736595523165, "grad_norm": 0.2375368349668477, "learning_rate": 4.0223679749298025e-05, "loss": 0.1558, "step": 1803 }, { "epoch": 0.9390942217595003, "grad_norm": 0.24780404748279455, "learning_rate": 4.021253268955644e-05, "loss": 0.1558, "step": 1804 }, { "epoch": 0.939614783966684, "grad_norm": 0.24937088179496925, "learning_rate": 4.02013808249891e-05, "loss": 0.1525, "step": 1805 }, { "epoch": 0.9401353461738677, "grad_norm": 0.247516280623822, "learning_rate": 4.019022415911828e-05, "loss": 0.1497, "step": 1806 }, { "epoch": 0.9406559083810515, "grad_norm": 0.2565387438992933, "learning_rate": 4.0179062695467784e-05, "loss": 0.154, "step": 1807 }, { "epoch": 0.9411764705882353, "grad_norm": 0.25957570816875336, "learning_rate": 4.016789643756291e-05, "loss": 0.1575, "step": 1808 }, { "epoch": 0.941697032795419, "grad_norm": 0.25711864420404396, "learning_rate": 4.0156725388930495e-05, "loss": 0.1583, "step": 1809 }, { "epoch": 0.9422175950026028, "grad_norm": 0.2439002536135534, "learning_rate": 4.014554955309886e-05, "loss": 0.1538, "step": 1810 }, { "epoch": 0.9427381572097866, "grad_norm": 0.2734062021761085, "learning_rate": 4.0134368933597863e-05, "loss": 0.1587, "step": 1811 }, { "epoch": 0.9432587194169704, "grad_norm": 0.24078383939318468, "learning_rate": 4.012318353395887e-05, "loss": 0.1541, "step": 1812 }, { "epoch": 0.943779281624154, "grad_norm": 0.25527670699498406, "learning_rate": 4.011199335771475e-05, "loss": 0.1641, "step": 1813 }, { "epoch": 0.9442998438313378, "grad_norm": 0.26131896589926484, "learning_rate": 4.010079840839987e-05, "loss": 0.1602, "step": 1814 }, { "epoch": 0.9448204060385216, "grad_norm": 0.2457034759853004, "learning_rate": 4.0089598689550126e-05, "loss": 0.149, "step": 1815 }, { "epoch": 0.9453409682457053, "grad_norm": 0.2466235816689623, "learning_rate": 4.0078394204702895e-05, "loss": 0.151, "step": 1816 }, { "epoch": 0.9458615304528891, "grad_norm": 0.261112482926145, "learning_rate": 4.0067184957397096e-05, "loss": 0.1536, "step": 1817 }, { "epoch": 0.9463820926600729, "grad_norm": 0.2758752170090889, "learning_rate": 4.0055970951173116e-05, "loss": 0.1633, "step": 1818 }, { "epoch": 0.9469026548672567, "grad_norm": 0.266272488103662, "learning_rate": 4.004475218957287e-05, "loss": 0.1568, "step": 1819 }, { "epoch": 0.9474232170744404, "grad_norm": 0.2733192385804763, "learning_rate": 4.003352867613975e-05, "loss": 0.1526, "step": 1820 }, { "epoch": 0.9479437792816241, "grad_norm": 0.2502565692501297, "learning_rate": 4.002230041441868e-05, "loss": 0.1498, "step": 1821 }, { "epoch": 0.9484643414888079, "grad_norm": 0.2981229168833489, "learning_rate": 4.001106740795607e-05, "loss": 0.1535, "step": 1822 }, { "epoch": 0.9489849036959916, "grad_norm": 0.25792533892269764, "learning_rate": 3.9999829660299806e-05, "loss": 0.1546, "step": 1823 }, { "epoch": 0.9495054659031754, "grad_norm": 0.2754778565483347, "learning_rate": 3.998858717499931e-05, "loss": 0.154, "step": 1824 }, { "epoch": 0.9500260281103592, "grad_norm": 0.25581628643187587, "learning_rate": 3.997733995560547e-05, "loss": 0.1588, "step": 1825 }, { "epoch": 0.950546590317543, "grad_norm": 0.29974295677568763, "learning_rate": 3.9966088005670686e-05, "loss": 0.1581, "step": 1826 }, { "epoch": 0.9510671525247267, "grad_norm": 0.2566154047129761, "learning_rate": 3.995483132874885e-05, "loss": 0.1521, "step": 1827 }, { "epoch": 0.9515877147319105, "grad_norm": 0.26876875526607946, "learning_rate": 3.994356992839535e-05, "loss": 0.1608, "step": 1828 }, { "epoch": 0.9521082769390942, "grad_norm": 0.2774171688241344, "learning_rate": 3.993230380816705e-05, "loss": 0.1515, "step": 1829 }, { "epoch": 0.9526288391462779, "grad_norm": 0.24564551896719408, "learning_rate": 3.9921032971622306e-05, "loss": 0.1561, "step": 1830 }, { "epoch": 0.9531494013534617, "grad_norm": 0.2463503110268932, "learning_rate": 3.9909757422321e-05, "loss": 0.1537, "step": 1831 }, { "epoch": 0.9536699635606455, "grad_norm": 0.267020735900316, "learning_rate": 3.9898477163824454e-05, "loss": 0.1522, "step": 1832 }, { "epoch": 0.9541905257678293, "grad_norm": 0.2602448863644486, "learning_rate": 3.98871921996955e-05, "loss": 0.1571, "step": 1833 }, { "epoch": 0.954711087975013, "grad_norm": 0.24776161325403143, "learning_rate": 3.9875902533498465e-05, "loss": 0.1578, "step": 1834 }, { "epoch": 0.9552316501821968, "grad_norm": 0.25779665449960754, "learning_rate": 3.986460816879913e-05, "loss": 0.1496, "step": 1835 }, { "epoch": 0.9557522123893806, "grad_norm": 0.2701760173076487, "learning_rate": 3.985330910916482e-05, "loss": 0.1569, "step": 1836 }, { "epoch": 0.9562727745965642, "grad_norm": 0.25325580552113314, "learning_rate": 3.984200535816427e-05, "loss": 0.1566, "step": 1837 }, { "epoch": 0.956793336803748, "grad_norm": 0.29664763959854834, "learning_rate": 3.983069691936773e-05, "loss": 0.1534, "step": 1838 }, { "epoch": 0.9573138990109318, "grad_norm": 0.27443913965950784, "learning_rate": 3.981938379634696e-05, "loss": 0.1587, "step": 1839 }, { "epoch": 0.9578344612181156, "grad_norm": 0.3036453223080745, "learning_rate": 3.980806599267514e-05, "loss": 0.1622, "step": 1840 }, { "epoch": 0.9583550234252993, "grad_norm": 0.2703683477848925, "learning_rate": 3.979674351192697e-05, "loss": 0.1512, "step": 1841 }, { "epoch": 0.9588755856324831, "grad_norm": 0.25118682959451855, "learning_rate": 3.978541635767862e-05, "loss": 0.1479, "step": 1842 }, { "epoch": 0.9593961478396669, "grad_norm": 0.27007643798115183, "learning_rate": 3.977408453350773e-05, "loss": 0.1478, "step": 1843 }, { "epoch": 0.9599167100468506, "grad_norm": 0.24905501625360876, "learning_rate": 3.976274804299342e-05, "loss": 0.1492, "step": 1844 }, { "epoch": 0.9604372722540343, "grad_norm": 0.2721594566362824, "learning_rate": 3.975140688971628e-05, "loss": 0.1577, "step": 1845 }, { "epoch": 0.9609578344612181, "grad_norm": 0.26822647420622714, "learning_rate": 3.974006107725837e-05, "loss": 0.1542, "step": 1846 }, { "epoch": 0.9614783966684018, "grad_norm": 0.2610073474569343, "learning_rate": 3.972871060920323e-05, "loss": 0.1573, "step": 1847 }, { "epoch": 0.9619989588755856, "grad_norm": 0.24741924184000724, "learning_rate": 3.971735548913586e-05, "loss": 0.1572, "step": 1848 }, { "epoch": 0.9625195210827694, "grad_norm": 0.2497560572681831, "learning_rate": 3.970599572064275e-05, "loss": 0.1627, "step": 1849 }, { "epoch": 0.9630400832899532, "grad_norm": 0.261255891586638, "learning_rate": 3.969463130731183e-05, "loss": 0.146, "step": 1850 }, { "epoch": 0.9635606454971369, "grad_norm": 0.2661862951079788, "learning_rate": 3.968326225273251e-05, "loss": 0.1549, "step": 1851 }, { "epoch": 0.9640812077043207, "grad_norm": 0.26696523143923034, "learning_rate": 3.9671888560495676e-05, "loss": 0.1584, "step": 1852 }, { "epoch": 0.9646017699115044, "grad_norm": 0.28425969486144664, "learning_rate": 3.966051023419366e-05, "loss": 0.1585, "step": 1853 }, { "epoch": 0.9651223321186881, "grad_norm": 0.2252880385971139, "learning_rate": 3.964912727742027e-05, "loss": 0.1468, "step": 1854 }, { "epoch": 0.9656428943258719, "grad_norm": 0.2746297685611525, "learning_rate": 3.963773969377077e-05, "loss": 0.1526, "step": 1855 }, { "epoch": 0.9661634565330557, "grad_norm": 0.2562090900129971, "learning_rate": 3.9626347486841896e-05, "loss": 0.1574, "step": 1856 }, { "epoch": 0.9666840187402395, "grad_norm": 0.2641947002589958, "learning_rate": 3.961495066023184e-05, "loss": 0.1579, "step": 1857 }, { "epoch": 0.9672045809474232, "grad_norm": 0.2576610491884646, "learning_rate": 3.9603549217540235e-05, "loss": 0.1566, "step": 1858 }, { "epoch": 0.967725143154607, "grad_norm": 0.22546307618447003, "learning_rate": 3.959214316236821e-05, "loss": 0.1451, "step": 1859 }, { "epoch": 0.9682457053617908, "grad_norm": 0.27228400782836965, "learning_rate": 3.95807324983183e-05, "loss": 0.1544, "step": 1860 }, { "epoch": 0.9687662675689744, "grad_norm": 0.2404970285210459, "learning_rate": 3.956931722899454e-05, "loss": 0.1586, "step": 1861 }, { "epoch": 0.9692868297761582, "grad_norm": 0.2483984351066557, "learning_rate": 3.955789735800241e-05, "loss": 0.1519, "step": 1862 }, { "epoch": 0.969807391983342, "grad_norm": 0.26080827724904837, "learning_rate": 3.954647288894883e-05, "loss": 0.1577, "step": 1863 }, { "epoch": 0.9703279541905258, "grad_norm": 0.24279634879644127, "learning_rate": 3.953504382544216e-05, "loss": 0.1529, "step": 1864 }, { "epoch": 0.9708485163977095, "grad_norm": 0.24834103365291627, "learning_rate": 3.952361017109226e-05, "loss": 0.1494, "step": 1865 }, { "epoch": 0.9713690786048933, "grad_norm": 0.24460951509898635, "learning_rate": 3.95121719295104e-05, "loss": 0.1571, "step": 1866 }, { "epoch": 0.9718896408120771, "grad_norm": 0.26683915335760894, "learning_rate": 3.95007291043093e-05, "loss": 0.1531, "step": 1867 }, { "epoch": 0.9724102030192608, "grad_norm": 0.24054568963656242, "learning_rate": 3.9489281699103145e-05, "loss": 0.1516, "step": 1868 }, { "epoch": 0.9729307652264445, "grad_norm": 0.24375115372025324, "learning_rate": 3.947782971750755e-05, "loss": 0.1488, "step": 1869 }, { "epoch": 0.9734513274336283, "grad_norm": 0.25412067934433286, "learning_rate": 3.94663731631396e-05, "loss": 0.1607, "step": 1870 }, { "epoch": 0.973971889640812, "grad_norm": 0.2397702543825332, "learning_rate": 3.945491203961779e-05, "loss": 0.1511, "step": 1871 }, { "epoch": 0.9744924518479958, "grad_norm": 0.2806540915038973, "learning_rate": 3.94434463505621e-05, "loss": 0.154, "step": 1872 }, { "epoch": 0.9750130140551796, "grad_norm": 0.2546854416214566, "learning_rate": 3.9431976099593896e-05, "loss": 0.1605, "step": 1873 }, { "epoch": 0.9755335762623634, "grad_norm": 0.2575288721304389, "learning_rate": 3.942050129033603e-05, "loss": 0.1485, "step": 1874 }, { "epoch": 0.9760541384695471, "grad_norm": 0.26908803144635207, "learning_rate": 3.9409021926412795e-05, "loss": 0.1549, "step": 1875 }, { "epoch": 0.9765747006767309, "grad_norm": 0.29289715788111687, "learning_rate": 3.9397538011449894e-05, "loss": 0.1563, "step": 1876 }, { "epoch": 0.9770952628839146, "grad_norm": 0.23831369536611632, "learning_rate": 3.938604954907449e-05, "loss": 0.1541, "step": 1877 }, { "epoch": 0.9776158250910983, "grad_norm": 0.2589658450603122, "learning_rate": 3.9374556542915167e-05, "loss": 0.1519, "step": 1878 }, { "epoch": 0.9781363872982821, "grad_norm": 0.3008301402515451, "learning_rate": 3.936305899660195e-05, "loss": 0.1554, "step": 1879 }, { "epoch": 0.9786569495054659, "grad_norm": 0.2579134695099253, "learning_rate": 3.935155691376631e-05, "loss": 0.16, "step": 1880 }, { "epoch": 0.9791775117126497, "grad_norm": 0.2755461345867494, "learning_rate": 3.934005029804112e-05, "loss": 0.1495, "step": 1881 }, { "epoch": 0.9796980739198334, "grad_norm": 0.25850824899183367, "learning_rate": 3.9328539153060725e-05, "loss": 0.1589, "step": 1882 }, { "epoch": 0.9802186361270172, "grad_norm": 0.2823202784003672, "learning_rate": 3.931702348246087e-05, "loss": 0.1588, "step": 1883 }, { "epoch": 0.980739198334201, "grad_norm": 0.25907134737796256, "learning_rate": 3.930550328987875e-05, "loss": 0.1516, "step": 1884 }, { "epoch": 0.9812597605413846, "grad_norm": 0.26429422471247077, "learning_rate": 3.929397857895297e-05, "loss": 0.1571, "step": 1885 }, { "epoch": 0.9817803227485684, "grad_norm": 0.25776033042758834, "learning_rate": 3.928244935332356e-05, "loss": 0.1565, "step": 1886 }, { "epoch": 0.9823008849557522, "grad_norm": 0.23117386280694466, "learning_rate": 3.9270915616632e-05, "loss": 0.1532, "step": 1887 }, { "epoch": 0.982821447162936, "grad_norm": 0.2589601378005726, "learning_rate": 3.9259377372521176e-05, "loss": 0.1603, "step": 1888 }, { "epoch": 0.9833420093701197, "grad_norm": 0.24736053021683718, "learning_rate": 3.924783462463541e-05, "loss": 0.1482, "step": 1889 }, { "epoch": 0.9838625715773035, "grad_norm": 0.24050172557296737, "learning_rate": 3.923628737662043e-05, "loss": 0.1558, "step": 1890 }, { "epoch": 0.9843831337844873, "grad_norm": 0.25204285474497545, "learning_rate": 3.9224735632123395e-05, "loss": 0.1588, "step": 1891 }, { "epoch": 0.984903695991671, "grad_norm": 0.24569292644864882, "learning_rate": 3.921317939479289e-05, "loss": 0.149, "step": 1892 }, { "epoch": 0.9854242581988547, "grad_norm": 0.2591242502253903, "learning_rate": 3.920161866827889e-05, "loss": 0.1515, "step": 1893 }, { "epoch": 0.9859448204060385, "grad_norm": 0.25314114488184825, "learning_rate": 3.919005345623285e-05, "loss": 0.1583, "step": 1894 }, { "epoch": 0.9864653826132223, "grad_norm": 0.2707662697095083, "learning_rate": 3.917848376230757e-05, "loss": 0.1474, "step": 1895 }, { "epoch": 0.986985944820406, "grad_norm": 0.2489741320109898, "learning_rate": 3.916690959015731e-05, "loss": 0.1604, "step": 1896 }, { "epoch": 0.9875065070275898, "grad_norm": 0.26650693500099853, "learning_rate": 3.915533094343773e-05, "loss": 0.1484, "step": 1897 }, { "epoch": 0.9880270692347736, "grad_norm": 0.24576821619143538, "learning_rate": 3.914374782580591e-05, "loss": 0.1533, "step": 1898 }, { "epoch": 0.9885476314419573, "grad_norm": 0.2242445667075516, "learning_rate": 3.913216024092032e-05, "loss": 0.1453, "step": 1899 }, { "epoch": 0.9890681936491411, "grad_norm": 0.24170044956082523, "learning_rate": 3.912056819244089e-05, "loss": 0.15, "step": 1900 }, { "epoch": 0.9895887558563248, "grad_norm": 0.23999183798149767, "learning_rate": 3.910897168402889e-05, "loss": 0.1445, "step": 1901 }, { "epoch": 0.9901093180635085, "grad_norm": 0.24978308599133406, "learning_rate": 3.909737071934707e-05, "loss": 0.1516, "step": 1902 }, { "epoch": 0.9906298802706923, "grad_norm": 0.2477655018309477, "learning_rate": 3.9085765302059554e-05, "loss": 0.1617, "step": 1903 }, { "epoch": 0.9911504424778761, "grad_norm": 0.24166671338937565, "learning_rate": 3.907415543583184e-05, "loss": 0.1537, "step": 1904 }, { "epoch": 0.9916710046850599, "grad_norm": 0.24364319933406497, "learning_rate": 3.9062541124330884e-05, "loss": 0.1529, "step": 1905 }, { "epoch": 0.9921915668922436, "grad_norm": 0.2646257000445511, "learning_rate": 3.905092237122504e-05, "loss": 0.1554, "step": 1906 }, { "epoch": 0.9927121290994274, "grad_norm": 0.2496525283069466, "learning_rate": 3.903929918018403e-05, "loss": 0.1511, "step": 1907 }, { "epoch": 0.9932326913066112, "grad_norm": 0.2309200833828225, "learning_rate": 3.902767155487901e-05, "loss": 0.154, "step": 1908 }, { "epoch": 0.9937532535137948, "grad_norm": 0.2451270375417471, "learning_rate": 3.9016039498982515e-05, "loss": 0.1453, "step": 1909 }, { "epoch": 0.9942738157209786, "grad_norm": 0.2699658100912516, "learning_rate": 3.90044030161685e-05, "loss": 0.1545, "step": 1910 }, { "epoch": 0.9947943779281624, "grad_norm": 0.26692292482327223, "learning_rate": 3.8992762110112304e-05, "loss": 0.1561, "step": 1911 }, { "epoch": 0.9953149401353462, "grad_norm": 0.24585278390934867, "learning_rate": 3.8981116784490666e-05, "loss": 0.147, "step": 1912 }, { "epoch": 0.9958355023425299, "grad_norm": 0.25147680505901215, "learning_rate": 3.896946704298172e-05, "loss": 0.1521, "step": 1913 }, { "epoch": 0.9963560645497137, "grad_norm": 0.25230893137578037, "learning_rate": 3.8957812889265e-05, "loss": 0.1541, "step": 1914 }, { "epoch": 0.9968766267568975, "grad_norm": 0.24038428447771637, "learning_rate": 3.8946154327021434e-05, "loss": 0.1498, "step": 1915 }, { "epoch": 0.9973971889640812, "grad_norm": 0.2466022950313137, "learning_rate": 3.893449135993333e-05, "loss": 0.1632, "step": 1916 }, { "epoch": 0.9979177511712649, "grad_norm": 0.25366577440341587, "learning_rate": 3.89228239916844e-05, "loss": 0.146, "step": 1917 }, { "epoch": 0.9984383133784487, "grad_norm": 0.24171289918790415, "learning_rate": 3.8911152225959743e-05, "loss": 0.1561, "step": 1918 }, { "epoch": 0.9989588755856325, "grad_norm": 0.2581505141982399, "learning_rate": 3.889947606644584e-05, "loss": 0.1595, "step": 1919 }, { "epoch": 0.9994794377928162, "grad_norm": 0.24747053780836764, "learning_rate": 3.888779551683057e-05, "loss": 0.1519, "step": 1920 }, { "epoch": 1.0, "grad_norm": 0.237124505232837, "learning_rate": 3.8876110580803186e-05, "loss": 0.1481, "step": 1921 }, { "epoch": 1.0005205622071838, "grad_norm": 0.2661627018308619, "learning_rate": 3.886442126205435e-05, "loss": 0.1194, "step": 1922 }, { "epoch": 1.0010411244143675, "grad_norm": 0.2520650370090721, "learning_rate": 3.8852727564276086e-05, "loss": 0.1134, "step": 1923 }, { "epoch": 1.0015616866215513, "grad_norm": 0.22725768987546488, "learning_rate": 3.884102949116181e-05, "loss": 0.1149, "step": 1924 }, { "epoch": 1.002082248828735, "grad_norm": 0.25305995229234013, "learning_rate": 3.8829327046406304e-05, "loss": 0.1111, "step": 1925 }, { "epoch": 1.0026028110359189, "grad_norm": 0.27939931613711055, "learning_rate": 3.881762023370576e-05, "loss": 0.1104, "step": 1926 }, { "epoch": 1.0031233732431026, "grad_norm": 0.3043520089700027, "learning_rate": 3.880590905675773e-05, "loss": 0.1232, "step": 1927 }, { "epoch": 1.0036439354502864, "grad_norm": 0.261721428329474, "learning_rate": 3.879419351926115e-05, "loss": 0.1076, "step": 1928 }, { "epoch": 1.0041644976574702, "grad_norm": 0.2700146216167829, "learning_rate": 3.878247362491633e-05, "loss": 0.1143, "step": 1929 }, { "epoch": 1.0046850598646537, "grad_norm": 0.2512971409772084, "learning_rate": 3.877074937742495e-05, "loss": 0.1063, "step": 1930 }, { "epoch": 1.0052056220718375, "grad_norm": 0.24157054726586163, "learning_rate": 3.8759020780490094e-05, "loss": 0.1139, "step": 1931 }, { "epoch": 1.0057261842790213, "grad_norm": 0.25231694869276755, "learning_rate": 3.8747287837816184e-05, "loss": 0.1167, "step": 1932 }, { "epoch": 1.006246746486205, "grad_norm": 0.28056071702883195, "learning_rate": 3.8735550553109024e-05, "loss": 0.1148, "step": 1933 }, { "epoch": 1.0067673086933888, "grad_norm": 0.23802118673016223, "learning_rate": 3.87238089300758e-05, "loss": 0.1061, "step": 1934 }, { "epoch": 1.0072878709005726, "grad_norm": 0.2518313655139843, "learning_rate": 3.8712062972425077e-05, "loss": 0.1144, "step": 1935 }, { "epoch": 1.0078084331077564, "grad_norm": 0.2687038083950964, "learning_rate": 3.870031268386676e-05, "loss": 0.1153, "step": 1936 }, { "epoch": 1.0083289953149401, "grad_norm": 0.2618181075208367, "learning_rate": 3.868855806811212e-05, "loss": 0.1177, "step": 1937 }, { "epoch": 1.008849557522124, "grad_norm": 0.2685854416587716, "learning_rate": 3.867679912887385e-05, "loss": 0.1186, "step": 1938 }, { "epoch": 1.0093701197293077, "grad_norm": 0.26559471123613715, "learning_rate": 3.866503586986595e-05, "loss": 0.1161, "step": 1939 }, { "epoch": 1.0098906819364915, "grad_norm": 0.24409238460263524, "learning_rate": 3.865326829480381e-05, "loss": 0.1102, "step": 1940 }, { "epoch": 1.0104112441436752, "grad_norm": 0.2937743298818115, "learning_rate": 3.864149640740417e-05, "loss": 0.1145, "step": 1941 }, { "epoch": 1.010931806350859, "grad_norm": 0.25001092468218145, "learning_rate": 3.862972021138514e-05, "loss": 0.1117, "step": 1942 }, { "epoch": 1.0114523685580428, "grad_norm": 0.24032698365614474, "learning_rate": 3.86179397104662e-05, "loss": 0.1083, "step": 1943 }, { "epoch": 1.0119729307652265, "grad_norm": 0.26454925482485414, "learning_rate": 3.860615490836817e-05, "loss": 0.1122, "step": 1944 }, { "epoch": 1.01249349297241, "grad_norm": 0.25931975068403923, "learning_rate": 3.859436580881325e-05, "loss": 0.113, "step": 1945 }, { "epoch": 1.0130140551795939, "grad_norm": 0.28362163389354017, "learning_rate": 3.858257241552498e-05, "loss": 0.1158, "step": 1946 }, { "epoch": 1.0135346173867776, "grad_norm": 0.24978572696444806, "learning_rate": 3.857077473222825e-05, "loss": 0.1084, "step": 1947 }, { "epoch": 1.0140551795939614, "grad_norm": 0.24629450541328152, "learning_rate": 3.855897276264934e-05, "loss": 0.1108, "step": 1948 }, { "epoch": 1.0145757418011452, "grad_norm": 0.24448076133842503, "learning_rate": 3.8547166510515854e-05, "loss": 0.1142, "step": 1949 }, { "epoch": 1.015096304008329, "grad_norm": 0.25912362617154444, "learning_rate": 3.8535355979556755e-05, "loss": 0.1185, "step": 1950 }, { "epoch": 1.0156168662155127, "grad_norm": 0.26406004163474434, "learning_rate": 3.852354117350235e-05, "loss": 0.1137, "step": 1951 }, { "epoch": 1.0161374284226965, "grad_norm": 0.23169769727224848, "learning_rate": 3.8511722096084313e-05, "loss": 0.1058, "step": 1952 }, { "epoch": 1.0166579906298803, "grad_norm": 0.2420173219027427, "learning_rate": 3.8499898751035656e-05, "loss": 0.1091, "step": 1953 }, { "epoch": 1.017178552837064, "grad_norm": 0.24237055917616832, "learning_rate": 3.848807114209074e-05, "loss": 0.1121, "step": 1954 }, { "epoch": 1.0176991150442478, "grad_norm": 0.2417178678114401, "learning_rate": 3.8476239272985284e-05, "loss": 0.1138, "step": 1955 }, { "epoch": 1.0182196772514316, "grad_norm": 0.24659252456869502, "learning_rate": 3.846440314745633e-05, "loss": 0.1173, "step": 1956 }, { "epoch": 1.0187402394586154, "grad_norm": 0.234545477823226, "learning_rate": 3.8452562769242276e-05, "loss": 0.1136, "step": 1957 }, { "epoch": 1.0192608016657991, "grad_norm": 0.23798416850355458, "learning_rate": 3.844071814208288e-05, "loss": 0.1121, "step": 1958 }, { "epoch": 1.019781363872983, "grad_norm": 0.26896997384709637, "learning_rate": 3.842886926971922e-05, "loss": 0.1119, "step": 1959 }, { "epoch": 1.0203019260801667, "grad_norm": 0.2461574213164607, "learning_rate": 3.8417016155893716e-05, "loss": 0.1127, "step": 1960 }, { "epoch": 1.0208224882873504, "grad_norm": 0.23767808403209464, "learning_rate": 3.840515880435013e-05, "loss": 0.1085, "step": 1961 }, { "epoch": 1.021343050494534, "grad_norm": 0.2713143335946204, "learning_rate": 3.839329721883358e-05, "loss": 0.1111, "step": 1962 }, { "epoch": 1.0218636127017178, "grad_norm": 0.2599294251623557, "learning_rate": 3.8381431403090494e-05, "loss": 0.1126, "step": 1963 }, { "epoch": 1.0223841749089015, "grad_norm": 0.24545293936346022, "learning_rate": 3.8369561360868656e-05, "loss": 0.1105, "step": 1964 }, { "epoch": 1.0229047371160853, "grad_norm": 0.24512412624134056, "learning_rate": 3.835768709591717e-05, "loss": 0.1097, "step": 1965 }, { "epoch": 1.023425299323269, "grad_norm": 0.2458438658532382, "learning_rate": 3.8345808611986485e-05, "loss": 0.1068, "step": 1966 }, { "epoch": 1.0239458615304529, "grad_norm": 0.2426897803892748, "learning_rate": 3.8333925912828384e-05, "loss": 0.1108, "step": 1967 }, { "epoch": 1.0244664237376366, "grad_norm": 0.24168253147617336, "learning_rate": 3.832203900219597e-05, "loss": 0.1103, "step": 1968 }, { "epoch": 1.0249869859448204, "grad_norm": 0.25236496660379437, "learning_rate": 3.8310147883843684e-05, "loss": 0.109, "step": 1969 }, { "epoch": 1.0255075481520042, "grad_norm": 0.25308298024794296, "learning_rate": 3.829825256152729e-05, "loss": 0.1119, "step": 1970 }, { "epoch": 1.026028110359188, "grad_norm": 0.256204183314334, "learning_rate": 3.82863530390039e-05, "loss": 0.1108, "step": 1971 }, { "epoch": 1.0265486725663717, "grad_norm": 0.24156358649582654, "learning_rate": 3.8274449320031926e-05, "loss": 0.1084, "step": 1972 }, { "epoch": 1.0270692347735555, "grad_norm": 0.24950618643926795, "learning_rate": 3.826254140837111e-05, "loss": 0.1087, "step": 1973 }, { "epoch": 1.0275897969807393, "grad_norm": 0.2704998223408513, "learning_rate": 3.8250629307782535e-05, "loss": 0.1133, "step": 1974 }, { "epoch": 1.028110359187923, "grad_norm": 0.25017914226933313, "learning_rate": 3.8238713022028595e-05, "loss": 0.1105, "step": 1975 }, { "epoch": 1.0286309213951068, "grad_norm": 0.2327093355495663, "learning_rate": 3.8226792554873004e-05, "loss": 0.113, "step": 1976 }, { "epoch": 1.0291514836022904, "grad_norm": 0.2719152461496494, "learning_rate": 3.821486791008081e-05, "loss": 0.1239, "step": 1977 }, { "epoch": 1.0296720458094741, "grad_norm": 0.2397787036646458, "learning_rate": 3.820293909141835e-05, "loss": 0.1153, "step": 1978 }, { "epoch": 1.030192608016658, "grad_norm": 0.24038853770587257, "learning_rate": 3.819100610265332e-05, "loss": 0.1184, "step": 1979 }, { "epoch": 1.0307131702238417, "grad_norm": 0.24961313789239628, "learning_rate": 3.8179068947554705e-05, "loss": 0.1124, "step": 1980 }, { "epoch": 1.0312337324310255, "grad_norm": 0.2382377547825293, "learning_rate": 3.8167127629892815e-05, "loss": 0.11, "step": 1981 }, { "epoch": 1.0317542946382092, "grad_norm": 0.2532706658437514, "learning_rate": 3.815518215343928e-05, "loss": 0.1179, "step": 1982 }, { "epoch": 1.032274856845393, "grad_norm": 0.2656733745200885, "learning_rate": 3.8143232521967023e-05, "loss": 0.1149, "step": 1983 }, { "epoch": 1.0327954190525768, "grad_norm": 0.23840852166865217, "learning_rate": 3.813127873925031e-05, "loss": 0.1068, "step": 1984 }, { "epoch": 1.0333159812597605, "grad_norm": 0.25521240664720873, "learning_rate": 3.811932080906468e-05, "loss": 0.1085, "step": 1985 }, { "epoch": 1.0338365434669443, "grad_norm": 0.24469566160163492, "learning_rate": 3.8107358735187036e-05, "loss": 0.1128, "step": 1986 }, { "epoch": 1.034357105674128, "grad_norm": 0.25281133665260414, "learning_rate": 3.809539252139553e-05, "loss": 0.1128, "step": 1987 }, { "epoch": 1.0348776678813119, "grad_norm": 0.236637048209141, "learning_rate": 3.8083422171469666e-05, "loss": 0.104, "step": 1988 }, { "epoch": 1.0353982300884956, "grad_norm": 0.2729698170542485, "learning_rate": 3.807144768919022e-05, "loss": 0.1153, "step": 1989 }, { "epoch": 1.0359187922956794, "grad_norm": 0.26111532376784635, "learning_rate": 3.8059469078339305e-05, "loss": 0.1096, "step": 1990 }, { "epoch": 1.0364393545028632, "grad_norm": 0.2528115929930001, "learning_rate": 3.8047486342700314e-05, "loss": 0.1117, "step": 1991 }, { "epoch": 1.036959916710047, "grad_norm": 0.2614038022166877, "learning_rate": 3.803549948605797e-05, "loss": 0.1145, "step": 1992 }, { "epoch": 1.0374804789172307, "grad_norm": 0.2584809095985116, "learning_rate": 3.8023508512198256e-05, "loss": 0.1125, "step": 1993 }, { "epoch": 1.0380010411244143, "grad_norm": 0.2510595198139831, "learning_rate": 3.8011513424908504e-05, "loss": 0.1182, "step": 1994 }, { "epoch": 1.038521603331598, "grad_norm": 0.2490798615545912, "learning_rate": 3.7999514227977304e-05, "loss": 0.1069, "step": 1995 }, { "epoch": 1.0390421655387818, "grad_norm": 0.23506443265432583, "learning_rate": 3.798751092519456e-05, "loss": 0.1091, "step": 1996 }, { "epoch": 1.0395627277459656, "grad_norm": 0.2542184882450879, "learning_rate": 3.7975503520351487e-05, "loss": 0.1168, "step": 1997 }, { "epoch": 1.0400832899531494, "grad_norm": 0.2622709219446434, "learning_rate": 3.796349201724058e-05, "loss": 0.1121, "step": 1998 }, { "epoch": 1.0406038521603331, "grad_norm": 0.24618731175514902, "learning_rate": 3.795147641965561e-05, "loss": 0.1143, "step": 1999 }, { "epoch": 1.041124414367517, "grad_norm": 0.26095944678719163, "learning_rate": 3.7939456731391684e-05, "loss": 0.1157, "step": 2000 }, { "epoch": 1.0416449765747007, "grad_norm": 0.24783987249021047, "learning_rate": 3.792743295624517e-05, "loss": 0.1168, "step": 2001 }, { "epoch": 1.0421655387818844, "grad_norm": 0.2528026533974678, "learning_rate": 3.791540509801373e-05, "loss": 0.1114, "step": 2002 }, { "epoch": 1.0426861009890682, "grad_norm": 0.23900993330244627, "learning_rate": 3.7903373160496345e-05, "loss": 0.1089, "step": 2003 }, { "epoch": 1.043206663196252, "grad_norm": 0.2464392822573349, "learning_rate": 3.789133714749323e-05, "loss": 0.1115, "step": 2004 }, { "epoch": 1.0437272254034358, "grad_norm": 0.26170131605706815, "learning_rate": 3.787929706280594e-05, "loss": 0.1181, "step": 2005 }, { "epoch": 1.0442477876106195, "grad_norm": 0.2366142963015739, "learning_rate": 3.786725291023728e-05, "loss": 0.1084, "step": 2006 }, { "epoch": 1.0447683498178033, "grad_norm": 0.266920747864476, "learning_rate": 3.785520469359138e-05, "loss": 0.1193, "step": 2007 }, { "epoch": 1.045288912024987, "grad_norm": 0.27917270091917423, "learning_rate": 3.784315241667359e-05, "loss": 0.1216, "step": 2008 }, { "epoch": 1.0458094742321706, "grad_norm": 0.2524069132942275, "learning_rate": 3.7831096083290606e-05, "loss": 0.1145, "step": 2009 }, { "epoch": 1.0463300364393544, "grad_norm": 0.27357050117506915, "learning_rate": 3.781903569725036e-05, "loss": 0.1171, "step": 2010 }, { "epoch": 1.0468505986465382, "grad_norm": 0.23696843787160707, "learning_rate": 3.780697126236211e-05, "loss": 0.1143, "step": 2011 }, { "epoch": 1.047371160853722, "grad_norm": 0.23491005269928766, "learning_rate": 3.779490278243634e-05, "loss": 0.1082, "step": 2012 }, { "epoch": 1.0478917230609057, "grad_norm": 0.24529859449463776, "learning_rate": 3.778283026128485e-05, "loss": 0.1138, "step": 2013 }, { "epoch": 1.0484122852680895, "grad_norm": 0.24704977515279603, "learning_rate": 3.7770753702720704e-05, "loss": 0.1124, "step": 2014 }, { "epoch": 1.0489328474752733, "grad_norm": 0.26691179513701707, "learning_rate": 3.775867311055823e-05, "loss": 0.1162, "step": 2015 }, { "epoch": 1.049453409682457, "grad_norm": 0.2427635803168682, "learning_rate": 3.7746588488613066e-05, "loss": 0.1085, "step": 2016 }, { "epoch": 1.0499739718896408, "grad_norm": 0.2551238836654876, "learning_rate": 3.773449984070207e-05, "loss": 0.11, "step": 2017 }, { "epoch": 1.0504945340968246, "grad_norm": 0.24382085261536607, "learning_rate": 3.7722407170643416e-05, "loss": 0.1103, "step": 2018 }, { "epoch": 1.0510150963040084, "grad_norm": 0.2349225844442972, "learning_rate": 3.7710310482256526e-05, "loss": 0.115, "step": 2019 }, { "epoch": 1.0515356585111921, "grad_norm": 0.24160862206632094, "learning_rate": 3.7698209779362105e-05, "loss": 0.1111, "step": 2020 }, { "epoch": 1.052056220718376, "grad_norm": 0.24412984723875725, "learning_rate": 3.768610506578211e-05, "loss": 0.1079, "step": 2021 }, { "epoch": 1.0525767829255597, "grad_norm": 0.24330022038065835, "learning_rate": 3.7673996345339765e-05, "loss": 0.1101, "step": 2022 }, { "epoch": 1.0530973451327434, "grad_norm": 0.26374011548874077, "learning_rate": 3.7661883621859585e-05, "loss": 0.1063, "step": 2023 }, { "epoch": 1.0536179073399272, "grad_norm": 0.24698833556767213, "learning_rate": 3.764976689916732e-05, "loss": 0.1128, "step": 2024 }, { "epoch": 1.054138469547111, "grad_norm": 0.2534386235100205, "learning_rate": 3.7637646181090006e-05, "loss": 0.1116, "step": 2025 }, { "epoch": 1.0546590317542945, "grad_norm": 0.2698669420224393, "learning_rate": 3.7625521471455914e-05, "loss": 0.1146, "step": 2026 }, { "epoch": 1.0551795939614783, "grad_norm": 0.2399199660805788, "learning_rate": 3.76133927740946e-05, "loss": 0.1147, "step": 2027 }, { "epoch": 1.055700156168662, "grad_norm": 0.24762045246035963, "learning_rate": 3.760126009283688e-05, "loss": 0.1114, "step": 2028 }, { "epoch": 1.0562207183758459, "grad_norm": 0.24458494730365296, "learning_rate": 3.758912343151481e-05, "loss": 0.1138, "step": 2029 }, { "epoch": 1.0567412805830296, "grad_norm": 0.23831510708786413, "learning_rate": 3.75769827939617e-05, "loss": 0.1093, "step": 2030 }, { "epoch": 1.0572618427902134, "grad_norm": 0.24160472062392452, "learning_rate": 3.7564838184012155e-05, "loss": 0.1113, "step": 2031 }, { "epoch": 1.0577824049973972, "grad_norm": 0.25777500815863746, "learning_rate": 3.755268960550199e-05, "loss": 0.1126, "step": 2032 }, { "epoch": 1.058302967204581, "grad_norm": 0.2469273261838509, "learning_rate": 3.754053706226829e-05, "loss": 0.1093, "step": 2033 }, { "epoch": 1.0588235294117647, "grad_norm": 0.2403499464753065, "learning_rate": 3.75283805581494e-05, "loss": 0.1093, "step": 2034 }, { "epoch": 1.0593440916189485, "grad_norm": 0.28349200510154626, "learning_rate": 3.751622009698492e-05, "loss": 0.1128, "step": 2035 }, { "epoch": 1.0598646538261323, "grad_norm": 0.25717282657195967, "learning_rate": 3.7504055682615676e-05, "loss": 0.1128, "step": 2036 }, { "epoch": 1.060385216033316, "grad_norm": 0.2604631820265362, "learning_rate": 3.749188731888375e-05, "loss": 0.1094, "step": 2037 }, { "epoch": 1.0609057782404998, "grad_norm": 0.26157189061505526, "learning_rate": 3.7479715009632486e-05, "loss": 0.1152, "step": 2038 }, { "epoch": 1.0614263404476836, "grad_norm": 0.2558378121103914, "learning_rate": 3.7467538758706476e-05, "loss": 0.1084, "step": 2039 }, { "epoch": 1.0619469026548674, "grad_norm": 0.24072034047832436, "learning_rate": 3.7455358569951535e-05, "loss": 0.109, "step": 2040 }, { "epoch": 1.062467464862051, "grad_norm": 0.2560891463466215, "learning_rate": 3.744317444721473e-05, "loss": 0.1097, "step": 2041 }, { "epoch": 1.0629880270692347, "grad_norm": 0.2441562331368522, "learning_rate": 3.743098639434438e-05, "loss": 0.1126, "step": 2042 }, { "epoch": 1.0635085892764184, "grad_norm": 0.25909422394913734, "learning_rate": 3.7418794415190037e-05, "loss": 0.1126, "step": 2043 }, { "epoch": 1.0640291514836022, "grad_norm": 0.25192159362102, "learning_rate": 3.74065985136025e-05, "loss": 0.113, "step": 2044 }, { "epoch": 1.064549713690786, "grad_norm": 0.24579868026393167, "learning_rate": 3.73943986934338e-05, "loss": 0.1136, "step": 2045 }, { "epoch": 1.0650702758979698, "grad_norm": 0.25827104066791506, "learning_rate": 3.738219495853721e-05, "loss": 0.1161, "step": 2046 }, { "epoch": 1.0655908381051535, "grad_norm": 0.23400282041641796, "learning_rate": 3.736998731276722e-05, "loss": 0.1115, "step": 2047 }, { "epoch": 1.0661114003123373, "grad_norm": 0.26815101610132586, "learning_rate": 3.7357775759979605e-05, "loss": 0.1113, "step": 2048 }, { "epoch": 1.066631962519521, "grad_norm": 0.24275941822819658, "learning_rate": 3.734556030403131e-05, "loss": 0.1115, "step": 2049 }, { "epoch": 1.0671525247267049, "grad_norm": 0.2718950911696723, "learning_rate": 3.733334094878057e-05, "loss": 0.1155, "step": 2050 }, { "epoch": 1.0676730869338886, "grad_norm": 0.2598394861210938, "learning_rate": 3.73211176980868e-05, "loss": 0.112, "step": 2051 }, { "epoch": 1.0681936491410724, "grad_norm": 0.2694385440911815, "learning_rate": 3.73088905558107e-05, "loss": 0.1155, "step": 2052 }, { "epoch": 1.0687142113482562, "grad_norm": 0.24106350307174138, "learning_rate": 3.7296659525814146e-05, "loss": 0.1106, "step": 2053 }, { "epoch": 1.06923477355544, "grad_norm": 0.262198101864761, "learning_rate": 3.7284424611960275e-05, "loss": 0.1111, "step": 2054 }, { "epoch": 1.0697553357626237, "grad_norm": 0.2546146585988391, "learning_rate": 3.727218581811346e-05, "loss": 0.1142, "step": 2055 }, { "epoch": 1.0702758979698075, "grad_norm": 0.24247156568617306, "learning_rate": 3.725994314813925e-05, "loss": 0.1093, "step": 2056 }, { "epoch": 1.0707964601769913, "grad_norm": 0.23958179241477287, "learning_rate": 3.724769660590447e-05, "loss": 0.1062, "step": 2057 }, { "epoch": 1.0713170223841748, "grad_norm": 0.2396103519223926, "learning_rate": 3.723544619527714e-05, "loss": 0.1158, "step": 2058 }, { "epoch": 1.0718375845913586, "grad_norm": 0.24141752361795885, "learning_rate": 3.722319192012652e-05, "loss": 0.1163, "step": 2059 }, { "epoch": 1.0723581467985424, "grad_norm": 0.2530546244951208, "learning_rate": 3.721093378432306e-05, "loss": 0.1103, "step": 2060 }, { "epoch": 1.0728787090057261, "grad_norm": 0.2304892834331245, "learning_rate": 3.7198671791738475e-05, "loss": 0.1138, "step": 2061 }, { "epoch": 1.07339927121291, "grad_norm": 0.24116172581054715, "learning_rate": 3.718640594624566e-05, "loss": 0.1131, "step": 2062 }, { "epoch": 1.0739198334200937, "grad_norm": 0.2514432482339406, "learning_rate": 3.7174136251718736e-05, "loss": 0.1147, "step": 2063 }, { "epoch": 1.0744403956272774, "grad_norm": 0.24575627264010017, "learning_rate": 3.716186271203305e-05, "loss": 0.1112, "step": 2064 }, { "epoch": 1.0749609578344612, "grad_norm": 0.2408037168496382, "learning_rate": 3.714958533106515e-05, "loss": 0.1094, "step": 2065 }, { "epoch": 1.075481520041645, "grad_norm": 0.248061764942323, "learning_rate": 3.713730411269282e-05, "loss": 0.1165, "step": 2066 }, { "epoch": 1.0760020822488288, "grad_norm": 0.2372533367139487, "learning_rate": 3.7125019060795024e-05, "loss": 0.1062, "step": 2067 }, { "epoch": 1.0765226444560125, "grad_norm": 0.2383605045988699, "learning_rate": 3.711273017925196e-05, "loss": 0.1029, "step": 2068 }, { "epoch": 1.0770432066631963, "grad_norm": 0.2609070795749889, "learning_rate": 3.710043747194503e-05, "loss": 0.1099, "step": 2069 }, { "epoch": 1.07756376887038, "grad_norm": 0.25094133107875816, "learning_rate": 3.708814094275683e-05, "loss": 0.1104, "step": 2070 }, { "epoch": 1.0780843310775639, "grad_norm": 0.2555320702860627, "learning_rate": 3.70758405955712e-05, "loss": 0.1118, "step": 2071 }, { "epoch": 1.0786048932847476, "grad_norm": 0.2557360841701921, "learning_rate": 3.7063536434273135e-05, "loss": 0.1078, "step": 2072 }, { "epoch": 1.0791254554919312, "grad_norm": 0.30210930503756817, "learning_rate": 3.705122846274889e-05, "loss": 0.1116, "step": 2073 }, { "epoch": 1.079646017699115, "grad_norm": 0.283413324681584, "learning_rate": 3.703891668488587e-05, "loss": 0.1168, "step": 2074 }, { "epoch": 1.0801665799062987, "grad_norm": 0.22741963249503896, "learning_rate": 3.702660110457272e-05, "loss": 0.1086, "step": 2075 }, { "epoch": 1.0806871421134825, "grad_norm": 0.24697100297311528, "learning_rate": 3.7014281725699276e-05, "loss": 0.1138, "step": 2076 }, { "epoch": 1.0812077043206663, "grad_norm": 0.2555266097287812, "learning_rate": 3.700195855215656e-05, "loss": 0.1094, "step": 2077 }, { "epoch": 1.08172826652785, "grad_norm": 0.25611897463687516, "learning_rate": 3.6989631587836814e-05, "loss": 0.1131, "step": 2078 }, { "epoch": 1.0822488287350338, "grad_norm": 0.24097390848814207, "learning_rate": 3.6977300836633466e-05, "loss": 0.1082, "step": 2079 }, { "epoch": 1.0827693909422176, "grad_norm": 0.27090232208959586, "learning_rate": 3.696496630244114e-05, "loss": 0.113, "step": 2080 }, { "epoch": 1.0832899531494014, "grad_norm": 0.2435755711000546, "learning_rate": 3.695262798915564e-05, "loss": 0.1082, "step": 2081 }, { "epoch": 1.0838105153565851, "grad_norm": 0.2440321008013806, "learning_rate": 3.694028590067401e-05, "loss": 0.1088, "step": 2082 }, { "epoch": 1.084331077563769, "grad_norm": 0.2523999632061997, "learning_rate": 3.6927940040894424e-05, "loss": 0.1123, "step": 2083 }, { "epoch": 1.0848516397709527, "grad_norm": 0.25809490414472963, "learning_rate": 3.691559041371631e-05, "loss": 0.1156, "step": 2084 }, { "epoch": 1.0853722019781364, "grad_norm": 0.2764366832810147, "learning_rate": 3.6903237023040235e-05, "loss": 0.1098, "step": 2085 }, { "epoch": 1.0858927641853202, "grad_norm": 0.24467790732933922, "learning_rate": 3.689087987276797e-05, "loss": 0.1083, "step": 2086 }, { "epoch": 1.086413326392504, "grad_norm": 0.2590589962094532, "learning_rate": 3.68785189668025e-05, "loss": 0.1142, "step": 2087 }, { "epoch": 1.0869338885996878, "grad_norm": 0.23869572486032326, "learning_rate": 3.686615430904795e-05, "loss": 0.1141, "step": 2088 }, { "epoch": 1.0874544508068715, "grad_norm": 0.24558579990983817, "learning_rate": 3.685378590340968e-05, "loss": 0.1086, "step": 2089 }, { "epoch": 1.087975013014055, "grad_norm": 0.2451590372072089, "learning_rate": 3.684141375379418e-05, "loss": 0.1132, "step": 2090 }, { "epoch": 1.0884955752212389, "grad_norm": 0.24564234860463044, "learning_rate": 3.6829037864109176e-05, "loss": 0.1109, "step": 2091 }, { "epoch": 1.0890161374284226, "grad_norm": 0.26529150512656974, "learning_rate": 3.6816658238263525e-05, "loss": 0.1199, "step": 2092 }, { "epoch": 1.0895366996356064, "grad_norm": 0.2585931700922823, "learning_rate": 3.680427488016731e-05, "loss": 0.1194, "step": 2093 }, { "epoch": 1.0900572618427902, "grad_norm": 0.2464657000667077, "learning_rate": 3.679188779373177e-05, "loss": 0.1147, "step": 2094 }, { "epoch": 1.090577824049974, "grad_norm": 0.2452855437235244, "learning_rate": 3.677949698286931e-05, "loss": 0.1149, "step": 2095 }, { "epoch": 1.0910983862571577, "grad_norm": 0.2534553426066007, "learning_rate": 3.676710245149353e-05, "loss": 0.1171, "step": 2096 }, { "epoch": 1.0916189484643415, "grad_norm": 0.23973694981857727, "learning_rate": 3.675470420351921e-05, "loss": 0.1158, "step": 2097 }, { "epoch": 1.0921395106715253, "grad_norm": 0.24051783041794145, "learning_rate": 3.6742302242862284e-05, "loss": 0.1105, "step": 2098 }, { "epoch": 1.092660072878709, "grad_norm": 0.24443405018149944, "learning_rate": 3.6729896573439867e-05, "loss": 0.1116, "step": 2099 }, { "epoch": 1.0931806350858928, "grad_norm": 0.2597312632422425, "learning_rate": 3.671748719917025e-05, "loss": 0.1124, "step": 2100 }, { "epoch": 1.0937011972930766, "grad_norm": 0.24822503893302403, "learning_rate": 3.6705074123972885e-05, "loss": 0.1096, "step": 2101 }, { "epoch": 1.0942217595002603, "grad_norm": 0.24740697095379377, "learning_rate": 3.669265735176842e-05, "loss": 0.1083, "step": 2102 }, { "epoch": 1.0947423217074441, "grad_norm": 0.2645450510720223, "learning_rate": 3.668023688647863e-05, "loss": 0.1126, "step": 2103 }, { "epoch": 1.095262883914628, "grad_norm": 0.2535454890824587, "learning_rate": 3.666781273202646e-05, "loss": 0.1208, "step": 2104 }, { "epoch": 1.0957834461218114, "grad_norm": 0.2575428275855921, "learning_rate": 3.6655384892336075e-05, "loss": 0.1168, "step": 2105 }, { "epoch": 1.0963040083289952, "grad_norm": 0.2309286179235019, "learning_rate": 3.664295337133274e-05, "loss": 0.1124, "step": 2106 }, { "epoch": 1.096824570536179, "grad_norm": 0.2512339973023405, "learning_rate": 3.6630518172942915e-05, "loss": 0.1151, "step": 2107 }, { "epoch": 1.0973451327433628, "grad_norm": 0.2386785747665995, "learning_rate": 3.6618079301094216e-05, "loss": 0.1172, "step": 2108 }, { "epoch": 1.0978656949505465, "grad_norm": 0.23724836929193713, "learning_rate": 3.660563675971541e-05, "loss": 0.1115, "step": 2109 }, { "epoch": 1.0983862571577303, "grad_norm": 0.23600725505996434, "learning_rate": 3.659319055273644e-05, "loss": 0.1144, "step": 2110 }, { "epoch": 1.098906819364914, "grad_norm": 0.23772868705388486, "learning_rate": 3.6580740684088396e-05, "loss": 0.113, "step": 2111 }, { "epoch": 1.0994273815720979, "grad_norm": 0.237076578253062, "learning_rate": 3.656828715770352e-05, "loss": 0.1093, "step": 2112 }, { "epoch": 1.0999479437792816, "grad_norm": 0.24382321669254056, "learning_rate": 3.655582997751521e-05, "loss": 0.1106, "step": 2113 }, { "epoch": 1.1004685059864654, "grad_norm": 0.2377191561622143, "learning_rate": 3.654336914745804e-05, "loss": 0.1149, "step": 2114 }, { "epoch": 1.1009890681936492, "grad_norm": 0.2490830706313632, "learning_rate": 3.65309046714677e-05, "loss": 0.1216, "step": 2115 }, { "epoch": 1.101509630400833, "grad_norm": 0.23768207413624293, "learning_rate": 3.651843655348107e-05, "loss": 0.1116, "step": 2116 }, { "epoch": 1.1020301926080167, "grad_norm": 0.23880298996606503, "learning_rate": 3.650596479743616e-05, "loss": 0.1111, "step": 2117 }, { "epoch": 1.1025507548152005, "grad_norm": 0.241400343263037, "learning_rate": 3.649348940727212e-05, "loss": 0.1106, "step": 2118 }, { "epoch": 1.1030713170223843, "grad_norm": 0.22973487630981618, "learning_rate": 3.6481010386929264e-05, "loss": 0.1107, "step": 2119 }, { "epoch": 1.103591879229568, "grad_norm": 0.24451820553198225, "learning_rate": 3.6468527740349045e-05, "loss": 0.115, "step": 2120 }, { "epoch": 1.1041124414367518, "grad_norm": 0.27311985417564316, "learning_rate": 3.645604147147408e-05, "loss": 0.1123, "step": 2121 }, { "epoch": 1.1046330036439354, "grad_norm": 0.2438128553800211, "learning_rate": 3.644355158424808e-05, "loss": 0.1138, "step": 2122 }, { "epoch": 1.1051535658511191, "grad_norm": 0.25561227871892656, "learning_rate": 3.6431058082615964e-05, "loss": 0.1236, "step": 2123 }, { "epoch": 1.105674128058303, "grad_norm": 0.24292979052263167, "learning_rate": 3.6418560970523745e-05, "loss": 0.1126, "step": 2124 }, { "epoch": 1.1061946902654867, "grad_norm": 0.23356756437347412, "learning_rate": 3.6406060251918604e-05, "loss": 0.1092, "step": 2125 }, { "epoch": 1.1067152524726704, "grad_norm": 0.2510189996756236, "learning_rate": 3.6393555930748846e-05, "loss": 0.1134, "step": 2126 }, { "epoch": 1.1072358146798542, "grad_norm": 0.2389541755449117, "learning_rate": 3.63810480109639e-05, "loss": 0.1101, "step": 2127 }, { "epoch": 1.107756376887038, "grad_norm": 0.24947876309951245, "learning_rate": 3.636853649651438e-05, "loss": 0.1134, "step": 2128 }, { "epoch": 1.1082769390942218, "grad_norm": 0.24327455652806065, "learning_rate": 3.6356021391351976e-05, "loss": 0.1128, "step": 2129 }, { "epoch": 1.1087975013014055, "grad_norm": 0.25621361610147303, "learning_rate": 3.634350269942956e-05, "loss": 0.115, "step": 2130 }, { "epoch": 1.1093180635085893, "grad_norm": 0.24171392174728737, "learning_rate": 3.633098042470111e-05, "loss": 0.1145, "step": 2131 }, { "epoch": 1.109838625715773, "grad_norm": 0.24255117880153898, "learning_rate": 3.631845457112174e-05, "loss": 0.1118, "step": 2132 }, { "epoch": 1.1103591879229568, "grad_norm": 0.2589821697017109, "learning_rate": 3.63059251426477e-05, "loss": 0.1118, "step": 2133 }, { "epoch": 1.1108797501301406, "grad_norm": 0.2546318202945326, "learning_rate": 3.6293392143236374e-05, "loss": 0.1149, "step": 2134 }, { "epoch": 1.1114003123373244, "grad_norm": 0.24797440602614088, "learning_rate": 3.628085557684625e-05, "loss": 0.1151, "step": 2135 }, { "epoch": 1.1119208745445082, "grad_norm": 0.25596352809519574, "learning_rate": 3.6268315447436976e-05, "loss": 0.1104, "step": 2136 }, { "epoch": 1.1124414367516917, "grad_norm": 0.24903661132125504, "learning_rate": 3.6255771758969303e-05, "loss": 0.1149, "step": 2137 }, { "epoch": 1.1129619989588755, "grad_norm": 0.24389645955042044, "learning_rate": 3.62432245154051e-05, "loss": 0.1155, "step": 2138 }, { "epoch": 1.1134825611660593, "grad_norm": 0.235781861888092, "learning_rate": 3.6230673720707393e-05, "loss": 0.112, "step": 2139 }, { "epoch": 1.114003123373243, "grad_norm": 0.2308900894585995, "learning_rate": 3.621811937884029e-05, "loss": 0.1077, "step": 2140 }, { "epoch": 1.1145236855804268, "grad_norm": 0.2753295219850416, "learning_rate": 3.620556149376904e-05, "loss": 0.1156, "step": 2141 }, { "epoch": 1.1150442477876106, "grad_norm": 0.2470693560397974, "learning_rate": 3.6193000069460006e-05, "loss": 0.1096, "step": 2142 }, { "epoch": 1.1155648099947943, "grad_norm": 0.24067635208687654, "learning_rate": 3.618043510988068e-05, "loss": 0.1102, "step": 2143 }, { "epoch": 1.1160853722019781, "grad_norm": 0.2553351905414518, "learning_rate": 3.616786661899965e-05, "loss": 0.1135, "step": 2144 }, { "epoch": 1.116605934409162, "grad_norm": 0.26177624105631986, "learning_rate": 3.6155294600786625e-05, "loss": 0.1082, "step": 2145 }, { "epoch": 1.1171264966163457, "grad_norm": 0.2657931741696369, "learning_rate": 3.6142719059212456e-05, "loss": 0.1168, "step": 2146 }, { "epoch": 1.1176470588235294, "grad_norm": 0.25488377144439966, "learning_rate": 3.613013999824906e-05, "loss": 0.113, "step": 2147 }, { "epoch": 1.1181676210307132, "grad_norm": 0.2755475530717958, "learning_rate": 3.6117557421869506e-05, "loss": 0.1098, "step": 2148 }, { "epoch": 1.118688183237897, "grad_norm": 0.25201657034289093, "learning_rate": 3.6104971334047956e-05, "loss": 0.1086, "step": 2149 }, { "epoch": 1.1192087454450808, "grad_norm": 0.2357938145901859, "learning_rate": 3.609238173875966e-05, "loss": 0.1087, "step": 2150 }, { "epoch": 1.1197293076522645, "grad_norm": 0.2540699466524578, "learning_rate": 3.6079788639981036e-05, "loss": 0.1143, "step": 2151 }, { "epoch": 1.1202498698594483, "grad_norm": 0.24546722256380496, "learning_rate": 3.606719204168954e-05, "loss": 0.1117, "step": 2152 }, { "epoch": 1.120770432066632, "grad_norm": 0.23557984732078704, "learning_rate": 3.6054591947863784e-05, "loss": 0.1108, "step": 2153 }, { "epoch": 1.1212909942738156, "grad_norm": 0.24342719518637673, "learning_rate": 3.604198836248344e-05, "loss": 0.1106, "step": 2154 }, { "epoch": 1.1218115564809994, "grad_norm": 0.24766981664804752, "learning_rate": 3.602938128952933e-05, "loss": 0.1145, "step": 2155 }, { "epoch": 1.1223321186881832, "grad_norm": 0.25337709609875236, "learning_rate": 3.6016770732983344e-05, "loss": 0.1107, "step": 2156 }, { "epoch": 1.122852680895367, "grad_norm": 0.2777055344080702, "learning_rate": 3.600415669682849e-05, "loss": 0.1171, "step": 2157 }, { "epoch": 1.1233732431025507, "grad_norm": 0.24802200183807002, "learning_rate": 3.599153918504886e-05, "loss": 0.1148, "step": 2158 }, { "epoch": 1.1238938053097345, "grad_norm": 0.249326912022627, "learning_rate": 3.597891820162964e-05, "loss": 0.114, "step": 2159 }, { "epoch": 1.1244143675169183, "grad_norm": 0.24976539124029015, "learning_rate": 3.596629375055716e-05, "loss": 0.1108, "step": 2160 }, { "epoch": 1.124934929724102, "grad_norm": 0.2549908652545141, "learning_rate": 3.5953665835818765e-05, "loss": 0.1161, "step": 2161 }, { "epoch": 1.1254554919312858, "grad_norm": 0.2589331285801467, "learning_rate": 3.594103446140297e-05, "loss": 0.1144, "step": 2162 }, { "epoch": 1.1259760541384696, "grad_norm": 0.24712526867777315, "learning_rate": 3.592839963129934e-05, "loss": 0.1202, "step": 2163 }, { "epoch": 1.1264966163456533, "grad_norm": 0.252358949933663, "learning_rate": 3.591576134949854e-05, "loss": 0.1127, "step": 2164 }, { "epoch": 1.1270171785528371, "grad_norm": 0.2357834325734446, "learning_rate": 3.590311961999233e-05, "loss": 0.1059, "step": 2165 }, { "epoch": 1.127537740760021, "grad_norm": 0.25635650100122037, "learning_rate": 3.589047444677355e-05, "loss": 0.1154, "step": 2166 }, { "epoch": 1.1280583029672047, "grad_norm": 0.28412878079518017, "learning_rate": 3.587782583383615e-05, "loss": 0.1173, "step": 2167 }, { "epoch": 1.1285788651743884, "grad_norm": 0.26029129448235633, "learning_rate": 3.586517378517514e-05, "loss": 0.1131, "step": 2168 }, { "epoch": 1.129099427381572, "grad_norm": 0.2592810117661404, "learning_rate": 3.585251830478663e-05, "loss": 0.1118, "step": 2169 }, { "epoch": 1.1296199895887558, "grad_norm": 0.25743920274899534, "learning_rate": 3.58398593966678e-05, "loss": 0.1113, "step": 2170 }, { "epoch": 1.1301405517959395, "grad_norm": 0.26024239886218314, "learning_rate": 3.582719706481693e-05, "loss": 0.1187, "step": 2171 }, { "epoch": 1.1306611140031233, "grad_norm": 0.2534072347951432, "learning_rate": 3.581453131323337e-05, "loss": 0.1157, "step": 2172 }, { "epoch": 1.131181676210307, "grad_norm": 0.2537355619987752, "learning_rate": 3.580186214591756e-05, "loss": 0.1095, "step": 2173 }, { "epoch": 1.1317022384174908, "grad_norm": 0.23632427238631776, "learning_rate": 3.578918956687101e-05, "loss": 0.112, "step": 2174 }, { "epoch": 1.1322228006246746, "grad_norm": 0.2412909737617186, "learning_rate": 3.5776513580096315e-05, "loss": 0.111, "step": 2175 }, { "epoch": 1.1327433628318584, "grad_norm": 0.2483748166841977, "learning_rate": 3.576383418959713e-05, "loss": 0.1088, "step": 2176 }, { "epoch": 1.1332639250390422, "grad_norm": 0.25025876140950926, "learning_rate": 3.57511513993782e-05, "loss": 0.1154, "step": 2177 }, { "epoch": 1.133784487246226, "grad_norm": 0.24527483686430052, "learning_rate": 3.5738465213445345e-05, "loss": 0.1133, "step": 2178 }, { "epoch": 1.1343050494534097, "grad_norm": 0.2400418074455827, "learning_rate": 3.572577563580545e-05, "loss": 0.1087, "step": 2179 }, { "epoch": 1.1348256116605935, "grad_norm": 0.2473622001205245, "learning_rate": 3.571308267046647e-05, "loss": 0.1094, "step": 2180 }, { "epoch": 1.1353461738677773, "grad_norm": 0.22926206403165528, "learning_rate": 3.5700386321437446e-05, "loss": 0.103, "step": 2181 }, { "epoch": 1.135866736074961, "grad_norm": 0.2530508376636812, "learning_rate": 3.5687686592728465e-05, "loss": 0.1164, "step": 2182 }, { "epoch": 1.1363872982821448, "grad_norm": 0.2547621831001377, "learning_rate": 3.567498348835069e-05, "loss": 0.1165, "step": 2183 }, { "epoch": 1.1369078604893286, "grad_norm": 0.24555495677708386, "learning_rate": 3.566227701231637e-05, "loss": 0.1146, "step": 2184 }, { "epoch": 1.1374284226965123, "grad_norm": 0.24522163832061175, "learning_rate": 3.5649567168638786e-05, "loss": 0.1094, "step": 2185 }, { "epoch": 1.1379489849036961, "grad_norm": 0.24316629123363587, "learning_rate": 3.56368539613323e-05, "loss": 0.111, "step": 2186 }, { "epoch": 1.1384695471108797, "grad_norm": 0.24441005322777853, "learning_rate": 3.562413739441234e-05, "loss": 0.1114, "step": 2187 }, { "epoch": 1.1389901093180634, "grad_norm": 0.25037434739181624, "learning_rate": 3.561141747189538e-05, "loss": 0.1154, "step": 2188 }, { "epoch": 1.1395106715252472, "grad_norm": 0.2561173459299334, "learning_rate": 3.559869419779897e-05, "loss": 0.1142, "step": 2189 }, { "epoch": 1.140031233732431, "grad_norm": 0.23707593564275278, "learning_rate": 3.558596757614172e-05, "loss": 0.1099, "step": 2190 }, { "epoch": 1.1405517959396148, "grad_norm": 0.2500778362026866, "learning_rate": 3.5573237610943264e-05, "loss": 0.1161, "step": 2191 }, { "epoch": 1.1410723581467985, "grad_norm": 0.27492670789651996, "learning_rate": 3.556050430622435e-05, "loss": 0.1192, "step": 2192 }, { "epoch": 1.1415929203539823, "grad_norm": 0.2605598668612606, "learning_rate": 3.5547767666006735e-05, "loss": 0.1132, "step": 2193 }, { "epoch": 1.142113482561166, "grad_norm": 0.2605622258673461, "learning_rate": 3.553502769431323e-05, "loss": 0.114, "step": 2194 }, { "epoch": 1.1426340447683498, "grad_norm": 0.2577447414552421, "learning_rate": 3.5522284395167724e-05, "loss": 0.1149, "step": 2195 }, { "epoch": 1.1431546069755336, "grad_norm": 0.2728095113979058, "learning_rate": 3.550953777259515e-05, "loss": 0.1194, "step": 2196 }, { "epoch": 1.1436751691827174, "grad_norm": 0.22575779206167243, "learning_rate": 3.549678783062147e-05, "loss": 0.1104, "step": 2197 }, { "epoch": 1.1441957313899012, "grad_norm": 0.2551987577448281, "learning_rate": 3.54840345732737e-05, "loss": 0.1158, "step": 2198 }, { "epoch": 1.144716293597085, "grad_norm": 0.24948023939005337, "learning_rate": 3.547127800457994e-05, "loss": 0.1139, "step": 2199 }, { "epoch": 1.1452368558042687, "grad_norm": 0.260804602778483, "learning_rate": 3.54585181285693e-05, "loss": 0.1142, "step": 2200 }, { "epoch": 1.1457574180114523, "grad_norm": 0.24626439137019285, "learning_rate": 3.5445754949271924e-05, "loss": 0.114, "step": 2201 }, { "epoch": 1.146277980218636, "grad_norm": 0.23624464258694697, "learning_rate": 3.543298847071904e-05, "loss": 0.1101, "step": 2202 }, { "epoch": 1.1467985424258198, "grad_norm": 0.2557734293730534, "learning_rate": 3.542021869694289e-05, "loss": 0.1176, "step": 2203 }, { "epoch": 1.1473191046330036, "grad_norm": 0.25015139608483516, "learning_rate": 3.5407445631976756e-05, "loss": 0.1131, "step": 2204 }, { "epoch": 1.1478396668401873, "grad_norm": 0.23610501538977566, "learning_rate": 3.5394669279854966e-05, "loss": 0.1153, "step": 2205 }, { "epoch": 1.1483602290473711, "grad_norm": 0.22976718602322604, "learning_rate": 3.53818896446129e-05, "loss": 0.1184, "step": 2206 }, { "epoch": 1.148880791254555, "grad_norm": 0.240542256545283, "learning_rate": 3.536910673028695e-05, "loss": 0.1121, "step": 2207 }, { "epoch": 1.1494013534617387, "grad_norm": 0.23366916566434953, "learning_rate": 3.5356320540914556e-05, "loss": 0.1139, "step": 2208 }, { "epoch": 1.1499219156689224, "grad_norm": 0.2277321141291514, "learning_rate": 3.534353108053419e-05, "loss": 0.1108, "step": 2209 }, { "epoch": 1.1504424778761062, "grad_norm": 0.2245190916941551, "learning_rate": 3.5330738353185364e-05, "loss": 0.1106, "step": 2210 }, { "epoch": 1.15096304008329, "grad_norm": 0.2441026791856793, "learning_rate": 3.531794236290862e-05, "loss": 0.1111, "step": 2211 }, { "epoch": 1.1514836022904738, "grad_norm": 0.2693550306160406, "learning_rate": 3.530514311374552e-05, "loss": 0.1185, "step": 2212 }, { "epoch": 1.1520041644976575, "grad_norm": 0.2583486674649021, "learning_rate": 3.529234060973867e-05, "loss": 0.1053, "step": 2213 }, { "epoch": 1.1525247267048413, "grad_norm": 0.24969080269353847, "learning_rate": 3.527953485493168e-05, "loss": 0.1149, "step": 2214 }, { "epoch": 1.153045288912025, "grad_norm": 0.29109726318240187, "learning_rate": 3.526672585336923e-05, "loss": 0.113, "step": 2215 }, { "epoch": 1.1535658511192088, "grad_norm": 0.2342749727367158, "learning_rate": 3.525391360909697e-05, "loss": 0.109, "step": 2216 }, { "epoch": 1.1540864133263926, "grad_norm": 0.2500797999992455, "learning_rate": 3.524109812616161e-05, "loss": 0.1144, "step": 2217 }, { "epoch": 1.1546069755335764, "grad_norm": 0.2500104491421713, "learning_rate": 3.5228279408610895e-05, "loss": 0.1139, "step": 2218 }, { "epoch": 1.15512753774076, "grad_norm": 0.23246173483256596, "learning_rate": 3.521545746049356e-05, "loss": 0.1107, "step": 2219 }, { "epoch": 1.1556480999479437, "grad_norm": 0.25759769841196517, "learning_rate": 3.5202632285859364e-05, "loss": 0.115, "step": 2220 }, { "epoch": 1.1561686621551275, "grad_norm": 0.2518042175741102, "learning_rate": 3.518980388875911e-05, "loss": 0.1126, "step": 2221 }, { "epoch": 1.1566892243623113, "grad_norm": 0.24279674908930854, "learning_rate": 3.517697227324459e-05, "loss": 0.1125, "step": 2222 }, { "epoch": 1.157209786569495, "grad_norm": 0.24501321952749994, "learning_rate": 3.516413744336863e-05, "loss": 0.1096, "step": 2223 }, { "epoch": 1.1577303487766788, "grad_norm": 0.24613219825749424, "learning_rate": 3.5151299403185075e-05, "loss": 0.1119, "step": 2224 }, { "epoch": 1.1582509109838626, "grad_norm": 0.2614423597655891, "learning_rate": 3.513845815674877e-05, "loss": 0.1139, "step": 2225 }, { "epoch": 1.1587714731910463, "grad_norm": 0.24042769917249954, "learning_rate": 3.512561370811556e-05, "loss": 0.1088, "step": 2226 }, { "epoch": 1.1592920353982301, "grad_norm": 0.25698514658916083, "learning_rate": 3.5112766061342344e-05, "loss": 0.112, "step": 2227 }, { "epoch": 1.1598125976054139, "grad_norm": 0.27566732529397137, "learning_rate": 3.5099915220487e-05, "loss": 0.1123, "step": 2228 }, { "epoch": 1.1603331598125977, "grad_norm": 0.2612613930290791, "learning_rate": 3.5087061189608425e-05, "loss": 0.1138, "step": 2229 }, { "epoch": 1.1608537220197814, "grad_norm": 0.2609146607471247, "learning_rate": 3.50742039727665e-05, "loss": 0.1131, "step": 2230 }, { "epoch": 1.1613742842269652, "grad_norm": 0.23452789382351913, "learning_rate": 3.506134357402216e-05, "loss": 0.1155, "step": 2231 }, { "epoch": 1.161894846434149, "grad_norm": 0.25381787284216584, "learning_rate": 3.504847999743729e-05, "loss": 0.1169, "step": 2232 }, { "epoch": 1.1624154086413325, "grad_norm": 0.2467153144538376, "learning_rate": 3.503561324707484e-05, "loss": 0.1113, "step": 2233 }, { "epoch": 1.1629359708485163, "grad_norm": 0.2365562667950988, "learning_rate": 3.50227433269987e-05, "loss": 0.1152, "step": 2234 }, { "epoch": 1.1634565330557, "grad_norm": 0.23867811869220612, "learning_rate": 3.50098702412738e-05, "loss": 0.1157, "step": 2235 }, { "epoch": 1.1639770952628838, "grad_norm": 0.26099442663987993, "learning_rate": 3.4996993993966057e-05, "loss": 0.117, "step": 2236 }, { "epoch": 1.1644976574700676, "grad_norm": 0.23935359957958538, "learning_rate": 3.498411458914238e-05, "loss": 0.1111, "step": 2237 }, { "epoch": 1.1650182196772514, "grad_norm": 0.25105828547638254, "learning_rate": 3.497123203087072e-05, "loss": 0.1158, "step": 2238 }, { "epoch": 1.1655387818844352, "grad_norm": 0.23331248980187663, "learning_rate": 3.4958346323219946e-05, "loss": 0.1037, "step": 2239 }, { "epoch": 1.166059344091619, "grad_norm": 0.23447637588391998, "learning_rate": 3.494545747025999e-05, "loss": 0.1111, "step": 2240 }, { "epoch": 1.1665799062988027, "grad_norm": 0.2526432724715221, "learning_rate": 3.493256547606174e-05, "loss": 0.1169, "step": 2241 }, { "epoch": 1.1671004685059865, "grad_norm": 0.23593617679344556, "learning_rate": 3.4919670344697085e-05, "loss": 0.1119, "step": 2242 }, { "epoch": 1.1676210307131702, "grad_norm": 0.24062003884271277, "learning_rate": 3.490677208023892e-05, "loss": 0.11, "step": 2243 }, { "epoch": 1.168141592920354, "grad_norm": 0.2606487027308462, "learning_rate": 3.489387068676111e-05, "loss": 0.1188, "step": 2244 }, { "epoch": 1.1686621551275378, "grad_norm": 0.23693423265149932, "learning_rate": 3.4880966168338515e-05, "loss": 0.1099, "step": 2245 }, { "epoch": 1.1691827173347216, "grad_norm": 0.24304853176359928, "learning_rate": 3.4868058529046985e-05, "loss": 0.1194, "step": 2246 }, { "epoch": 1.1697032795419053, "grad_norm": 0.25302863952962923, "learning_rate": 3.485514777296335e-05, "loss": 0.1136, "step": 2247 }, { "epoch": 1.170223841749089, "grad_norm": 0.26167932489486967, "learning_rate": 3.484223390416543e-05, "loss": 0.113, "step": 2248 }, { "epoch": 1.1707444039562729, "grad_norm": 0.2571613695259598, "learning_rate": 3.4829316926732025e-05, "loss": 0.1168, "step": 2249 }, { "epoch": 1.1712649661634567, "grad_norm": 0.24293615449947745, "learning_rate": 3.481639684474291e-05, "loss": 0.1082, "step": 2250 }, { "epoch": 1.1717855283706402, "grad_norm": 0.2473405511928646, "learning_rate": 3.4803473662278865e-05, "loss": 0.1122, "step": 2251 }, { "epoch": 1.172306090577824, "grad_norm": 0.25163795289435603, "learning_rate": 3.479054738342162e-05, "loss": 0.1146, "step": 2252 }, { "epoch": 1.1728266527850078, "grad_norm": 0.25041465835608206, "learning_rate": 3.47776180122539e-05, "loss": 0.1166, "step": 2253 }, { "epoch": 1.1733472149921915, "grad_norm": 0.24547313540848606, "learning_rate": 3.47646855528594e-05, "loss": 0.1148, "step": 2254 }, { "epoch": 1.1738677771993753, "grad_norm": 0.23742977846106733, "learning_rate": 3.4751750009322795e-05, "loss": 0.1063, "step": 2255 }, { "epoch": 1.174388339406559, "grad_norm": 0.23708783235578984, "learning_rate": 3.473881138572973e-05, "loss": 0.1155, "step": 2256 }, { "epoch": 1.1749089016137428, "grad_norm": 0.23348367213721333, "learning_rate": 3.472586968616682e-05, "loss": 0.1067, "step": 2257 }, { "epoch": 1.1754294638209266, "grad_norm": 0.2502161259943564, "learning_rate": 3.4712924914721664e-05, "loss": 0.111, "step": 2258 }, { "epoch": 1.1759500260281104, "grad_norm": 0.26009313181484484, "learning_rate": 3.469997707548281e-05, "loss": 0.117, "step": 2259 }, { "epoch": 1.1764705882352942, "grad_norm": 0.23101072131923964, "learning_rate": 3.468702617253981e-05, "loss": 0.1092, "step": 2260 }, { "epoch": 1.176991150442478, "grad_norm": 0.2546409766040149, "learning_rate": 3.4674072209983144e-05, "loss": 0.1142, "step": 2261 }, { "epoch": 1.1775117126496617, "grad_norm": 0.23922221142024416, "learning_rate": 3.466111519190428e-05, "loss": 0.1107, "step": 2262 }, { "epoch": 1.1780322748568455, "grad_norm": 0.244686444864269, "learning_rate": 3.464815512239565e-05, "loss": 0.1113, "step": 2263 }, { "epoch": 1.1785528370640292, "grad_norm": 0.23856819223838555, "learning_rate": 3.463519200555064e-05, "loss": 0.1113, "step": 2264 }, { "epoch": 1.1790733992712128, "grad_norm": 0.26366348156801095, "learning_rate": 3.462222584546363e-05, "loss": 0.1079, "step": 2265 }, { "epoch": 1.1795939614783966, "grad_norm": 0.24664767010017477, "learning_rate": 3.46092566462299e-05, "loss": 0.1131, "step": 2266 }, { "epoch": 1.1801145236855803, "grad_norm": 0.24616840057630468, "learning_rate": 3.4596284411945755e-05, "loss": 0.1173, "step": 2267 }, { "epoch": 1.1806350858927641, "grad_norm": 0.25367146760865916, "learning_rate": 3.4583309146708406e-05, "loss": 0.1154, "step": 2268 }, { "epoch": 1.1811556480999479, "grad_norm": 0.23507264194144847, "learning_rate": 3.457033085461607e-05, "loss": 0.1037, "step": 2269 }, { "epoch": 1.1816762103071317, "grad_norm": 0.23989782007015442, "learning_rate": 3.455734953976789e-05, "loss": 0.1128, "step": 2270 }, { "epoch": 1.1821967725143154, "grad_norm": 0.2405068672443556, "learning_rate": 3.454436520626396e-05, "loss": 0.1128, "step": 2271 }, { "epoch": 1.1827173347214992, "grad_norm": 0.2301126062735549, "learning_rate": 3.453137785820534e-05, "loss": 0.1091, "step": 2272 }, { "epoch": 1.183237896928683, "grad_norm": 0.25627883239774635, "learning_rate": 3.451838749969404e-05, "loss": 0.1088, "step": 2273 }, { "epoch": 1.1837584591358667, "grad_norm": 0.23620278190068159, "learning_rate": 3.450539413483302e-05, "loss": 0.1144, "step": 2274 }, { "epoch": 1.1842790213430505, "grad_norm": 0.25644395644621587, "learning_rate": 3.4492397767726195e-05, "loss": 0.1152, "step": 2275 }, { "epoch": 1.1847995835502343, "grad_norm": 0.22348749190590206, "learning_rate": 3.4479398402478406e-05, "loss": 0.108, "step": 2276 }, { "epoch": 1.185320145757418, "grad_norm": 0.2526453426350537, "learning_rate": 3.4466396043195484e-05, "loss": 0.1153, "step": 2277 }, { "epoch": 1.1858407079646018, "grad_norm": 0.2562747166960448, "learning_rate": 3.445339069398415e-05, "loss": 0.1194, "step": 2278 }, { "epoch": 1.1863612701717856, "grad_norm": 0.23680974190421952, "learning_rate": 3.444038235895212e-05, "loss": 0.11, "step": 2279 }, { "epoch": 1.1868818323789694, "grad_norm": 0.2545225578398651, "learning_rate": 3.442737104220801e-05, "loss": 0.1118, "step": 2280 }, { "epoch": 1.1874023945861532, "grad_norm": 0.2461192137870807, "learning_rate": 3.441435674786143e-05, "loss": 0.1104, "step": 2281 }, { "epoch": 1.187922956793337, "grad_norm": 0.2538161714997669, "learning_rate": 3.4401339480022874e-05, "loss": 0.1118, "step": 2282 }, { "epoch": 1.1884435190005205, "grad_norm": 0.25314168955138006, "learning_rate": 3.4388319242803806e-05, "loss": 0.1091, "step": 2283 }, { "epoch": 1.1889640812077042, "grad_norm": 0.2587245929932049, "learning_rate": 3.437529604031663e-05, "loss": 0.1125, "step": 2284 }, { "epoch": 1.189484643414888, "grad_norm": 0.24732781819326619, "learning_rate": 3.436226987667467e-05, "loss": 0.1095, "step": 2285 }, { "epoch": 1.1900052056220718, "grad_norm": 0.26399681078473874, "learning_rate": 3.4349240755992216e-05, "loss": 0.1144, "step": 2286 }, { "epoch": 1.1905257678292556, "grad_norm": 0.254509691032438, "learning_rate": 3.433620868238444e-05, "loss": 0.1195, "step": 2287 }, { "epoch": 1.1910463300364393, "grad_norm": 0.27960613887789765, "learning_rate": 3.4323173659967506e-05, "loss": 0.1147, "step": 2288 }, { "epoch": 1.191566892243623, "grad_norm": 0.23122617317358943, "learning_rate": 3.431013569285846e-05, "loss": 0.1082, "step": 2289 }, { "epoch": 1.1920874544508069, "grad_norm": 0.2452337995607666, "learning_rate": 3.429709478517531e-05, "loss": 0.1121, "step": 2290 }, { "epoch": 1.1926080166579907, "grad_norm": 0.24517649301196232, "learning_rate": 3.428405094103696e-05, "loss": 0.1145, "step": 2291 }, { "epoch": 1.1931285788651744, "grad_norm": 0.24289572986099975, "learning_rate": 3.42710041645633e-05, "loss": 0.1138, "step": 2292 }, { "epoch": 1.1936491410723582, "grad_norm": 0.26963497608876424, "learning_rate": 3.425795445987508e-05, "loss": 0.1186, "step": 2293 }, { "epoch": 1.194169703279542, "grad_norm": 0.2525547241342057, "learning_rate": 3.4244901831094014e-05, "loss": 0.1145, "step": 2294 }, { "epoch": 1.1946902654867257, "grad_norm": 0.25154080236686166, "learning_rate": 3.4231846282342725e-05, "loss": 0.1158, "step": 2295 }, { "epoch": 1.1952108276939095, "grad_norm": 0.2607865126803791, "learning_rate": 3.4218787817744773e-05, "loss": 0.1145, "step": 2296 }, { "epoch": 1.195731389901093, "grad_norm": 0.28235116430206414, "learning_rate": 3.420572644142463e-05, "loss": 0.1157, "step": 2297 }, { "epoch": 1.1962519521082768, "grad_norm": 0.23089667791953067, "learning_rate": 3.419266215750767e-05, "loss": 0.1131, "step": 2298 }, { "epoch": 1.1967725143154606, "grad_norm": 0.2293790099648737, "learning_rate": 3.4179594970120215e-05, "loss": 0.1151, "step": 2299 }, { "epoch": 1.1972930765226444, "grad_norm": 0.24575677024213127, "learning_rate": 3.4166524883389476e-05, "loss": 0.1128, "step": 2300 }, { "epoch": 1.1978136387298282, "grad_norm": 0.25216046527261354, "learning_rate": 3.415345190144362e-05, "loss": 0.1137, "step": 2301 }, { "epoch": 1.198334200937012, "grad_norm": 0.24305569802868002, "learning_rate": 3.414037602841168e-05, "loss": 0.1147, "step": 2302 }, { "epoch": 1.1988547631441957, "grad_norm": 0.27127800645249683, "learning_rate": 3.4127297268423636e-05, "loss": 0.1181, "step": 2303 }, { "epoch": 1.1993753253513795, "grad_norm": 0.23242248281346625, "learning_rate": 3.411421562561037e-05, "loss": 0.1128, "step": 2304 }, { "epoch": 1.1998958875585632, "grad_norm": 0.2745166695623737, "learning_rate": 3.410113110410366e-05, "loss": 0.1117, "step": 2305 }, { "epoch": 1.200416449765747, "grad_norm": 0.23978776170340857, "learning_rate": 3.408804370803623e-05, "loss": 0.1108, "step": 2306 }, { "epoch": 1.2009370119729308, "grad_norm": 0.24928731609898175, "learning_rate": 3.407495344154167e-05, "loss": 0.1161, "step": 2307 }, { "epoch": 1.2014575741801146, "grad_norm": 0.23242055883453808, "learning_rate": 3.40618603087545e-05, "loss": 0.1114, "step": 2308 }, { "epoch": 1.2019781363872983, "grad_norm": 0.2418508216693535, "learning_rate": 3.404876431381014e-05, "loss": 0.1097, "step": 2309 }, { "epoch": 1.202498698594482, "grad_norm": 0.2644872421572009, "learning_rate": 3.403566546084493e-05, "loss": 0.1198, "step": 2310 }, { "epoch": 1.2030192608016659, "grad_norm": 0.2432882408100227, "learning_rate": 3.4022563753996075e-05, "loss": 0.1161, "step": 2311 }, { "epoch": 1.2035398230088497, "grad_norm": 0.25959575629843024, "learning_rate": 3.400945919740171e-05, "loss": 0.1136, "step": 2312 }, { "epoch": 1.2040603852160334, "grad_norm": 0.2331391415291296, "learning_rate": 3.3996351795200865e-05, "loss": 0.1096, "step": 2313 }, { "epoch": 1.2045809474232172, "grad_norm": 0.24943855765809428, "learning_rate": 3.3983241551533465e-05, "loss": 0.1091, "step": 2314 }, { "epoch": 1.2051015096304007, "grad_norm": 0.24791662409738544, "learning_rate": 3.397012847054035e-05, "loss": 0.1148, "step": 2315 }, { "epoch": 1.2056220718375845, "grad_norm": 0.2618458677286621, "learning_rate": 3.3957012556363224e-05, "loss": 0.1119, "step": 2316 }, { "epoch": 1.2061426340447683, "grad_norm": 0.2635221612106524, "learning_rate": 3.394389381314471e-05, "loss": 0.112, "step": 2317 }, { "epoch": 1.206663196251952, "grad_norm": 0.24840413289756066, "learning_rate": 3.393077224502832e-05, "loss": 0.1104, "step": 2318 }, { "epoch": 1.2071837584591358, "grad_norm": 0.2504601503439982, "learning_rate": 3.391764785615845e-05, "loss": 0.1137, "step": 2319 }, { "epoch": 1.2077043206663196, "grad_norm": 0.22826339447866847, "learning_rate": 3.3904520650680405e-05, "loss": 0.1084, "step": 2320 }, { "epoch": 1.2082248828735034, "grad_norm": 0.23110036065955847, "learning_rate": 3.3891390632740345e-05, "loss": 0.1063, "step": 2321 }, { "epoch": 1.2087454450806872, "grad_norm": 0.25396241886265036, "learning_rate": 3.387825780648536e-05, "loss": 0.1209, "step": 2322 }, { "epoch": 1.209266007287871, "grad_norm": 0.2437164195108459, "learning_rate": 3.386512217606339e-05, "loss": 0.1202, "step": 2323 }, { "epoch": 1.2097865694950547, "grad_norm": 0.23771865004331263, "learning_rate": 3.38519837456233e-05, "loss": 0.1127, "step": 2324 }, { "epoch": 1.2103071317022385, "grad_norm": 0.22893673951370747, "learning_rate": 3.383884251931481e-05, "loss": 0.1098, "step": 2325 }, { "epoch": 1.2108276939094222, "grad_norm": 0.24464255415023003, "learning_rate": 3.3825698501288516e-05, "loss": 0.1114, "step": 2326 }, { "epoch": 1.211348256116606, "grad_norm": 0.24641719397311446, "learning_rate": 3.381255169569594e-05, "loss": 0.1163, "step": 2327 }, { "epoch": 1.2118688183237898, "grad_norm": 0.24202099555940623, "learning_rate": 3.3799402106689416e-05, "loss": 0.1122, "step": 2328 }, { "epoch": 1.2123893805309733, "grad_norm": 0.23976840972172184, "learning_rate": 3.3786249738422235e-05, "loss": 0.1138, "step": 2329 }, { "epoch": 1.212909942738157, "grad_norm": 0.26118454788338863, "learning_rate": 3.3773094595048506e-05, "loss": 0.1105, "step": 2330 }, { "epoch": 1.2134305049453409, "grad_norm": 0.2504712191955974, "learning_rate": 3.375993668072324e-05, "loss": 0.1124, "step": 2331 }, { "epoch": 1.2139510671525247, "grad_norm": 0.2325507565735545, "learning_rate": 3.374677599960231e-05, "loss": 0.1098, "step": 2332 }, { "epoch": 1.2144716293597084, "grad_norm": 0.2590513905450211, "learning_rate": 3.3733612555842486e-05, "loss": 0.1124, "step": 2333 }, { "epoch": 1.2149921915668922, "grad_norm": 0.24603883688487388, "learning_rate": 3.3720446353601394e-05, "loss": 0.1122, "step": 2334 }, { "epoch": 1.215512753774076, "grad_norm": 0.2377595048373133, "learning_rate": 3.370727739703752e-05, "loss": 0.1124, "step": 2335 }, { "epoch": 1.2160333159812597, "grad_norm": 0.23641590886231106, "learning_rate": 3.369410569031024e-05, "loss": 0.1114, "step": 2336 }, { "epoch": 1.2165538781884435, "grad_norm": 0.2498625675970159, "learning_rate": 3.36809312375798e-05, "loss": 0.1145, "step": 2337 }, { "epoch": 1.2170744403956273, "grad_norm": 0.22842565347219712, "learning_rate": 3.36677540430073e-05, "loss": 0.1129, "step": 2338 }, { "epoch": 1.217595002602811, "grad_norm": 0.22898607545193125, "learning_rate": 3.365457411075471e-05, "loss": 0.1127, "step": 2339 }, { "epoch": 1.2181155648099948, "grad_norm": 0.23953214969485537, "learning_rate": 3.3641391444984864e-05, "loss": 0.1112, "step": 2340 }, { "epoch": 1.2186361270171786, "grad_norm": 0.24104626520668057, "learning_rate": 3.362820604986147e-05, "loss": 0.1125, "step": 2341 }, { "epoch": 1.2191566892243624, "grad_norm": 0.23748691484838422, "learning_rate": 3.361501792954908e-05, "loss": 0.1095, "step": 2342 }, { "epoch": 1.2196772514315461, "grad_norm": 0.2483740020552502, "learning_rate": 3.360182708821312e-05, "loss": 0.1158, "step": 2343 }, { "epoch": 1.22019781363873, "grad_norm": 0.25425874287469785, "learning_rate": 3.358863353001987e-05, "loss": 0.1225, "step": 2344 }, { "epoch": 1.2207183758459137, "grad_norm": 0.2586255317538337, "learning_rate": 3.3575437259136474e-05, "loss": 0.1164, "step": 2345 }, { "epoch": 1.2212389380530975, "grad_norm": 0.23402320390581927, "learning_rate": 3.3562238279730916e-05, "loss": 0.1147, "step": 2346 }, { "epoch": 1.221759500260281, "grad_norm": 0.2477059723049737, "learning_rate": 3.354903659597207e-05, "loss": 0.1115, "step": 2347 }, { "epoch": 1.2222800624674648, "grad_norm": 0.23650878937783718, "learning_rate": 3.353583221202962e-05, "loss": 0.1067, "step": 2348 }, { "epoch": 1.2228006246746486, "grad_norm": 0.24360478631058402, "learning_rate": 3.352262513207413e-05, "loss": 0.1113, "step": 2349 }, { "epoch": 1.2233211868818323, "grad_norm": 0.239918587840492, "learning_rate": 3.350941536027702e-05, "loss": 0.1103, "step": 2350 }, { "epoch": 1.223841749089016, "grad_norm": 0.23335823537471334, "learning_rate": 3.349620290081055e-05, "loss": 0.107, "step": 2351 }, { "epoch": 1.2243623112961999, "grad_norm": 0.23023317202230778, "learning_rate": 3.348298775784782e-05, "loss": 0.1139, "step": 2352 }, { "epoch": 1.2248828735033837, "grad_norm": 0.24137499470962637, "learning_rate": 3.3469769935562796e-05, "loss": 0.1111, "step": 2353 }, { "epoch": 1.2254034357105674, "grad_norm": 0.23984175856275775, "learning_rate": 3.345654943813027e-05, "loss": 0.1126, "step": 2354 }, { "epoch": 1.2259239979177512, "grad_norm": 0.2511343023919521, "learning_rate": 3.34433262697259e-05, "loss": 0.1235, "step": 2355 }, { "epoch": 1.226444560124935, "grad_norm": 0.2248493142834007, "learning_rate": 3.343010043452618e-05, "loss": 0.11, "step": 2356 }, { "epoch": 1.2269651223321187, "grad_norm": 0.23619956004612885, "learning_rate": 3.3416871936708436e-05, "loss": 0.1122, "step": 2357 }, { "epoch": 1.2274856845393025, "grad_norm": 0.24154490816188087, "learning_rate": 3.340364078045085e-05, "loss": 0.118, "step": 2358 }, { "epoch": 1.2280062467464863, "grad_norm": 0.23906919393439444, "learning_rate": 3.339040696993243e-05, "loss": 0.112, "step": 2359 }, { "epoch": 1.22852680895367, "grad_norm": 0.23515780174889195, "learning_rate": 3.3377170509333035e-05, "loss": 0.1065, "step": 2360 }, { "epoch": 1.2290473711608536, "grad_norm": 0.2448541840615606, "learning_rate": 3.336393140283335e-05, "loss": 0.1104, "step": 2361 }, { "epoch": 1.2295679333680374, "grad_norm": 0.23502697559009228, "learning_rate": 3.33506896546149e-05, "loss": 0.1168, "step": 2362 }, { "epoch": 1.2300884955752212, "grad_norm": 0.24547086115985342, "learning_rate": 3.333744526886006e-05, "loss": 0.1178, "step": 2363 }, { "epoch": 1.230609057782405, "grad_norm": 0.2238019166845714, "learning_rate": 3.3324198249752004e-05, "loss": 0.1089, "step": 2364 }, { "epoch": 1.2311296199895887, "grad_norm": 0.25163269197743565, "learning_rate": 3.331094860147477e-05, "loss": 0.1149, "step": 2365 }, { "epoch": 1.2316501821967725, "grad_norm": 0.24302067777188527, "learning_rate": 3.329769632821321e-05, "loss": 0.1126, "step": 2366 }, { "epoch": 1.2321707444039562, "grad_norm": 0.24052489202021649, "learning_rate": 3.328444143415301e-05, "loss": 0.1126, "step": 2367 }, { "epoch": 1.23269130661114, "grad_norm": 0.23963819746590062, "learning_rate": 3.327118392348068e-05, "loss": 0.1111, "step": 2368 }, { "epoch": 1.2332118688183238, "grad_norm": 0.2554674901122427, "learning_rate": 3.325792380038356e-05, "loss": 0.122, "step": 2369 }, { "epoch": 1.2337324310255076, "grad_norm": 0.24101776937698122, "learning_rate": 3.324466106904981e-05, "loss": 0.1108, "step": 2370 }, { "epoch": 1.2342529932326913, "grad_norm": 0.23475211548593766, "learning_rate": 3.323139573366842e-05, "loss": 0.1143, "step": 2371 }, { "epoch": 1.234773555439875, "grad_norm": 0.23437107317802344, "learning_rate": 3.32181277984292e-05, "loss": 0.109, "step": 2372 }, { "epoch": 1.2352941176470589, "grad_norm": 0.23380076784273857, "learning_rate": 3.320485726752278e-05, "loss": 0.1119, "step": 2373 }, { "epoch": 1.2358146798542426, "grad_norm": 0.2540138119726505, "learning_rate": 3.3191584145140626e-05, "loss": 0.1171, "step": 2374 }, { "epoch": 1.2363352420614264, "grad_norm": 0.2411702497970775, "learning_rate": 3.317830843547499e-05, "loss": 0.1161, "step": 2375 }, { "epoch": 1.2368558042686102, "grad_norm": 0.233210046923344, "learning_rate": 3.316503014271896e-05, "loss": 0.1103, "step": 2376 }, { "epoch": 1.237376366475794, "grad_norm": 0.24776647975273905, "learning_rate": 3.3151749271066444e-05, "loss": 0.1155, "step": 2377 }, { "epoch": 1.2378969286829777, "grad_norm": 0.2273242590925185, "learning_rate": 3.3138465824712164e-05, "loss": 0.1124, "step": 2378 }, { "epoch": 1.2384174908901613, "grad_norm": 0.2394314835262025, "learning_rate": 3.312517980785164e-05, "loss": 0.1128, "step": 2379 }, { "epoch": 1.238938053097345, "grad_norm": 0.2448659548311951, "learning_rate": 3.311189122468122e-05, "loss": 0.1196, "step": 2380 }, { "epoch": 1.2394586153045288, "grad_norm": 0.23448498331985973, "learning_rate": 3.309860007939806e-05, "loss": 0.1116, "step": 2381 }, { "epoch": 1.2399791775117126, "grad_norm": 0.22911095022022188, "learning_rate": 3.308530637620011e-05, "loss": 0.1121, "step": 2382 }, { "epoch": 1.2404997397188964, "grad_norm": 0.24025893459427192, "learning_rate": 3.307201011928616e-05, "loss": 0.1144, "step": 2383 }, { "epoch": 1.2410203019260801, "grad_norm": 0.23259241617711043, "learning_rate": 3.305871131285577e-05, "loss": 0.1094, "step": 2384 }, { "epoch": 1.241540864133264, "grad_norm": 0.2361852244544814, "learning_rate": 3.3045409961109324e-05, "loss": 0.1071, "step": 2385 }, { "epoch": 1.2420614263404477, "grad_norm": 0.2561842268409719, "learning_rate": 3.3032106068248014e-05, "loss": 0.1198, "step": 2386 }, { "epoch": 1.2425819885476315, "grad_norm": 0.2511394825375184, "learning_rate": 3.301879963847383e-05, "loss": 0.1106, "step": 2387 }, { "epoch": 1.2431025507548152, "grad_norm": 0.22935577553554087, "learning_rate": 3.3005490675989545e-05, "loss": 0.1103, "step": 2388 }, { "epoch": 1.243623112961999, "grad_norm": 0.24634598430303592, "learning_rate": 3.2992179184998774e-05, "loss": 0.1118, "step": 2389 }, { "epoch": 1.2441436751691828, "grad_norm": 0.24037850632563773, "learning_rate": 3.2978865169705885e-05, "loss": 0.115, "step": 2390 }, { "epoch": 1.2446642373763666, "grad_norm": 0.24026168911843124, "learning_rate": 3.296554863431607e-05, "loss": 0.1093, "step": 2391 }, { "epoch": 1.2451847995835503, "grad_norm": 0.24022065182744629, "learning_rate": 3.295222958303532e-05, "loss": 0.1115, "step": 2392 }, { "epoch": 1.2457053617907339, "grad_norm": 0.23185501118099175, "learning_rate": 3.2938908020070404e-05, "loss": 0.1101, "step": 2393 }, { "epoch": 1.2462259239979177, "grad_norm": 0.23099404002234822, "learning_rate": 3.292558394962888e-05, "loss": 0.1132, "step": 2394 }, { "epoch": 1.2467464862051014, "grad_norm": 0.24176376810983632, "learning_rate": 3.2912257375919126e-05, "loss": 0.1148, "step": 2395 }, { "epoch": 1.2472670484122852, "grad_norm": 0.24222391889430964, "learning_rate": 3.289892830315028e-05, "loss": 0.1098, "step": 2396 }, { "epoch": 1.247787610619469, "grad_norm": 0.23277468724181424, "learning_rate": 3.28855967355323e-05, "loss": 0.1081, "step": 2397 }, { "epoch": 1.2483081728266527, "grad_norm": 0.2319208224863446, "learning_rate": 3.2872262677275906e-05, "loss": 0.1127, "step": 2398 }, { "epoch": 1.2488287350338365, "grad_norm": 0.22600332372977186, "learning_rate": 3.285892613259261e-05, "loss": 0.1082, "step": 2399 }, { "epoch": 1.2493492972410203, "grad_norm": 0.2380744039289219, "learning_rate": 3.2845587105694716e-05, "loss": 0.1157, "step": 2400 }, { "epoch": 1.249869859448204, "grad_norm": 0.24199537993154474, "learning_rate": 3.283224560079532e-05, "loss": 0.1085, "step": 2401 }, { "epoch": 1.2503904216553878, "grad_norm": 0.25841375982911335, "learning_rate": 3.281890162210829e-05, "loss": 0.1129, "step": 2402 }, { "epoch": 1.2509109838625716, "grad_norm": 0.23379214223234784, "learning_rate": 3.2805555173848254e-05, "loss": 0.11, "step": 2403 }, { "epoch": 1.2514315460697554, "grad_norm": 0.24808893501537926, "learning_rate": 3.279220626023065e-05, "loss": 0.1103, "step": 2404 }, { "epoch": 1.2519521082769391, "grad_norm": 0.23963781789500202, "learning_rate": 3.27788548854717e-05, "loss": 0.1126, "step": 2405 }, { "epoch": 1.252472670484123, "grad_norm": 0.2536793830039519, "learning_rate": 3.276550105378838e-05, "loss": 0.1188, "step": 2406 }, { "epoch": 1.2529932326913067, "grad_norm": 0.22592288227783439, "learning_rate": 3.275214476939845e-05, "loss": 0.1111, "step": 2407 }, { "epoch": 1.2535137948984905, "grad_norm": 0.2184518861484896, "learning_rate": 3.273878603652045e-05, "loss": 0.1101, "step": 2408 }, { "epoch": 1.2540343571056742, "grad_norm": 0.22611064467072162, "learning_rate": 3.272542485937369e-05, "loss": 0.1078, "step": 2409 }, { "epoch": 1.254554919312858, "grad_norm": 0.24252402472546458, "learning_rate": 3.271206124217825e-05, "loss": 0.1089, "step": 2410 }, { "epoch": 1.2550754815200418, "grad_norm": 0.24827439034038407, "learning_rate": 3.269869518915497e-05, "loss": 0.1153, "step": 2411 }, { "epoch": 1.2555960437272253, "grad_norm": 0.24447695733165611, "learning_rate": 3.268532670452549e-05, "loss": 0.111, "step": 2412 }, { "epoch": 1.256116605934409, "grad_norm": 0.24933714928678632, "learning_rate": 3.2671955792512186e-05, "loss": 0.1113, "step": 2413 }, { "epoch": 1.2566371681415929, "grad_norm": 0.2378454669519632, "learning_rate": 3.265858245733824e-05, "loss": 0.1152, "step": 2414 }, { "epoch": 1.2571577303487766, "grad_norm": 0.2512478037323881, "learning_rate": 3.2645206703227536e-05, "loss": 0.1206, "step": 2415 }, { "epoch": 1.2576782925559604, "grad_norm": 0.250096683928609, "learning_rate": 3.263182853440479e-05, "loss": 0.1135, "step": 2416 }, { "epoch": 1.2581988547631442, "grad_norm": 0.23599764911190643, "learning_rate": 3.261844795509542e-05, "loss": 0.116, "step": 2417 }, { "epoch": 1.258719416970328, "grad_norm": 0.2311757103757818, "learning_rate": 3.260506496952567e-05, "loss": 0.1154, "step": 2418 }, { "epoch": 1.2592399791775117, "grad_norm": 0.23201479378425932, "learning_rate": 3.259167958192249e-05, "loss": 0.114, "step": 2419 }, { "epoch": 1.2597605413846955, "grad_norm": 0.2365740831566155, "learning_rate": 3.257829179651361e-05, "loss": 0.1095, "step": 2420 }, { "epoch": 1.2602811035918793, "grad_norm": 0.22738663769119088, "learning_rate": 3.2564901617527513e-05, "loss": 0.1069, "step": 2421 }, { "epoch": 1.260801665799063, "grad_norm": 0.24513932299659597, "learning_rate": 3.2551509049193444e-05, "loss": 0.1126, "step": 2422 }, { "epoch": 1.2613222280062468, "grad_norm": 0.2295326150055126, "learning_rate": 3.253811409574141e-05, "loss": 0.1135, "step": 2423 }, { "epoch": 1.2618427902134304, "grad_norm": 0.2392260915445472, "learning_rate": 3.252471676140215e-05, "loss": 0.1133, "step": 2424 }, { "epoch": 1.2623633524206141, "grad_norm": 0.2281032715398985, "learning_rate": 3.251131705040716e-05, "loss": 0.1086, "step": 2425 }, { "epoch": 1.262883914627798, "grad_norm": 0.240395636614695, "learning_rate": 3.24979149669887e-05, "loss": 0.1145, "step": 2426 }, { "epoch": 1.2634044768349817, "grad_norm": 0.2406505726843499, "learning_rate": 3.2484510515379776e-05, "loss": 0.1122, "step": 2427 }, { "epoch": 1.2639250390421655, "grad_norm": 0.2645085343890866, "learning_rate": 3.247110369981413e-05, "loss": 0.1151, "step": 2428 }, { "epoch": 1.2644456012493492, "grad_norm": 0.22557528612610186, "learning_rate": 3.245769452452626e-05, "loss": 0.1092, "step": 2429 }, { "epoch": 1.264966163456533, "grad_norm": 0.23590364280733392, "learning_rate": 3.244428299375141e-05, "loss": 0.1157, "step": 2430 }, { "epoch": 1.2654867256637168, "grad_norm": 0.23355810219083442, "learning_rate": 3.243086911172555e-05, "loss": 0.1112, "step": 2431 }, { "epoch": 1.2660072878709006, "grad_norm": 0.22768610677426723, "learning_rate": 3.241745288268544e-05, "loss": 0.1112, "step": 2432 }, { "epoch": 1.2665278500780843, "grad_norm": 0.2213720698521258, "learning_rate": 3.240403431086853e-05, "loss": 0.1115, "step": 2433 }, { "epoch": 1.267048412285268, "grad_norm": 0.2511275509634126, "learning_rate": 3.239061340051302e-05, "loss": 0.1144, "step": 2434 }, { "epoch": 1.2675689744924519, "grad_norm": 0.22474601806469852, "learning_rate": 3.237719015585787e-05, "loss": 0.1084, "step": 2435 }, { "epoch": 1.2680895366996356, "grad_norm": 0.22967485481677846, "learning_rate": 3.236376458114276e-05, "loss": 0.1053, "step": 2436 }, { "epoch": 1.2686100989068194, "grad_norm": 0.24508106364086055, "learning_rate": 3.235033668060813e-05, "loss": 0.1141, "step": 2437 }, { "epoch": 1.2691306611140032, "grad_norm": 0.23936751039871776, "learning_rate": 3.233690645849511e-05, "loss": 0.1126, "step": 2438 }, { "epoch": 1.269651223321187, "grad_norm": 0.2423970044613412, "learning_rate": 3.23234739190456e-05, "loss": 0.11, "step": 2439 }, { "epoch": 1.2701717855283707, "grad_norm": 0.2408077607865761, "learning_rate": 3.2310039066502224e-05, "loss": 0.1186, "step": 2440 }, { "epoch": 1.2706923477355545, "grad_norm": 0.24264263902809302, "learning_rate": 3.229660190510833e-05, "loss": 0.1168, "step": 2441 }, { "epoch": 1.2712129099427383, "grad_norm": 0.23451047905283273, "learning_rate": 3.2283162439108004e-05, "loss": 0.1114, "step": 2442 }, { "epoch": 1.271733472149922, "grad_norm": 0.24187390316258348, "learning_rate": 3.226972067274605e-05, "loss": 0.1111, "step": 2443 }, { "epoch": 1.2722540343571056, "grad_norm": 0.23365628213616502, "learning_rate": 3.2256276610268e-05, "loss": 0.1177, "step": 2444 }, { "epoch": 1.2727745965642894, "grad_norm": 0.2427390698922539, "learning_rate": 3.224283025592011e-05, "loss": 0.1111, "step": 2445 }, { "epoch": 1.2732951587714731, "grad_norm": 0.2512975614866838, "learning_rate": 3.22293816139494e-05, "loss": 0.1182, "step": 2446 }, { "epoch": 1.273815720978657, "grad_norm": 0.24295218403998542, "learning_rate": 3.2215930688603525e-05, "loss": 0.1165, "step": 2447 }, { "epoch": 1.2743362831858407, "grad_norm": 0.23644527244752322, "learning_rate": 3.220247748413094e-05, "loss": 0.113, "step": 2448 }, { "epoch": 1.2748568453930245, "grad_norm": 0.23785821712712837, "learning_rate": 3.21890220047808e-05, "loss": 0.114, "step": 2449 }, { "epoch": 1.2753774076002082, "grad_norm": 0.23632989350892925, "learning_rate": 3.217556425480296e-05, "loss": 0.1111, "step": 2450 }, { "epoch": 1.275897969807392, "grad_norm": 0.23263875426854946, "learning_rate": 3.216210423844801e-05, "loss": 0.1102, "step": 2451 }, { "epoch": 1.2764185320145758, "grad_norm": 0.23188957689089493, "learning_rate": 3.214864195996723e-05, "loss": 0.1075, "step": 2452 }, { "epoch": 1.2769390942217596, "grad_norm": 0.23375003589359838, "learning_rate": 3.213517742361267e-05, "loss": 0.1082, "step": 2453 }, { "epoch": 1.2774596564289433, "grad_norm": 0.24021397633189484, "learning_rate": 3.212171063363702e-05, "loss": 0.1166, "step": 2454 }, { "epoch": 1.277980218636127, "grad_norm": 0.2322622124803816, "learning_rate": 3.2108241594293756e-05, "loss": 0.1139, "step": 2455 }, { "epoch": 1.2785007808433106, "grad_norm": 0.23120316428655097, "learning_rate": 3.209477030983698e-05, "loss": 0.112, "step": 2456 }, { "epoch": 1.2790213430504944, "grad_norm": 0.23819406522964934, "learning_rate": 3.20812967845216e-05, "loss": 0.1065, "step": 2457 }, { "epoch": 1.2795419052576782, "grad_norm": 0.23305129874267824, "learning_rate": 3.206782102260316e-05, "loss": 0.1111, "step": 2458 }, { "epoch": 1.280062467464862, "grad_norm": 0.26056719528908606, "learning_rate": 3.205434302833792e-05, "loss": 0.1104, "step": 2459 }, { "epoch": 1.2805830296720457, "grad_norm": 0.23577408055653468, "learning_rate": 3.204086280598289e-05, "loss": 0.1192, "step": 2460 }, { "epoch": 1.2811035918792295, "grad_norm": 0.2659768658535487, "learning_rate": 3.202738035979571e-05, "loss": 0.1197, "step": 2461 }, { "epoch": 1.2816241540864133, "grad_norm": 0.2439223726100321, "learning_rate": 3.2013895694034804e-05, "loss": 0.107, "step": 2462 }, { "epoch": 1.282144716293597, "grad_norm": 0.23562944836320895, "learning_rate": 3.200040881295922e-05, "loss": 0.1104, "step": 2463 }, { "epoch": 1.2826652785007808, "grad_norm": 0.25461295273652873, "learning_rate": 3.198691972082878e-05, "loss": 0.1142, "step": 2464 }, { "epoch": 1.2831858407079646, "grad_norm": 0.23055086268252875, "learning_rate": 3.197342842190394e-05, "loss": 0.1072, "step": 2465 }, { "epoch": 1.2837064029151484, "grad_norm": 0.2770919732305317, "learning_rate": 3.1959934920445894e-05, "loss": 0.1161, "step": 2466 }, { "epoch": 1.2842269651223321, "grad_norm": 0.24915210522689957, "learning_rate": 3.194643922071651e-05, "loss": 0.1139, "step": 2467 }, { "epoch": 1.284747527329516, "grad_norm": 0.23485697563408167, "learning_rate": 3.193294132697835e-05, "loss": 0.1111, "step": 2468 }, { "epoch": 1.2852680895366997, "grad_norm": 0.2376200199679929, "learning_rate": 3.191944124349471e-05, "loss": 0.1075, "step": 2469 }, { "epoch": 1.2857886517438835, "grad_norm": 0.23788594569376167, "learning_rate": 3.190593897452951e-05, "loss": 0.1086, "step": 2470 }, { "epoch": 1.2863092139510672, "grad_norm": 0.2338576028196406, "learning_rate": 3.189243452434741e-05, "loss": 0.1147, "step": 2471 }, { "epoch": 1.286829776158251, "grad_norm": 0.23732556832773136, "learning_rate": 3.187892789721373e-05, "loss": 0.1112, "step": 2472 }, { "epoch": 1.2873503383654348, "grad_norm": 0.24378666037810956, "learning_rate": 3.186541909739452e-05, "loss": 0.1124, "step": 2473 }, { "epoch": 1.2878709005726185, "grad_norm": 0.23279147802128494, "learning_rate": 3.185190812915646e-05, "loss": 0.1099, "step": 2474 }, { "epoch": 1.2883914627798023, "grad_norm": 0.2237628651766288, "learning_rate": 3.1838394996766946e-05, "loss": 0.1116, "step": 2475 }, { "epoch": 1.2889120249869859, "grad_norm": 0.22815623654435227, "learning_rate": 3.182487970449407e-05, "loss": 0.1094, "step": 2476 }, { "epoch": 1.2894325871941696, "grad_norm": 0.2324375751464136, "learning_rate": 3.181136225660657e-05, "loss": 0.1078, "step": 2477 }, { "epoch": 1.2899531494013534, "grad_norm": 0.24381943224649627, "learning_rate": 3.179784265737392e-05, "loss": 0.1186, "step": 2478 }, { "epoch": 1.2904737116085372, "grad_norm": 0.22358232434006062, "learning_rate": 3.178432091106619e-05, "loss": 0.1086, "step": 2479 }, { "epoch": 1.290994273815721, "grad_norm": 0.22505649527448798, "learning_rate": 3.1770797021954216e-05, "loss": 0.1149, "step": 2480 }, { "epoch": 1.2915148360229047, "grad_norm": 0.2271327003271489, "learning_rate": 3.1757270994309445e-05, "loss": 0.1151, "step": 2481 }, { "epoch": 1.2920353982300885, "grad_norm": 0.22354543223054793, "learning_rate": 3.174374283240405e-05, "loss": 0.1157, "step": 2482 }, { "epoch": 1.2925559604372723, "grad_norm": 0.23601648934888142, "learning_rate": 3.1730212540510835e-05, "loss": 0.1134, "step": 2483 }, { "epoch": 1.293076522644456, "grad_norm": 0.23493593163696477, "learning_rate": 3.1716680122903294e-05, "loss": 0.1146, "step": 2484 }, { "epoch": 1.2935970848516398, "grad_norm": 0.24903210748416696, "learning_rate": 3.170314558385562e-05, "loss": 0.112, "step": 2485 }, { "epoch": 1.2941176470588236, "grad_norm": 0.24092859338470038, "learning_rate": 3.1689608927642624e-05, "loss": 0.114, "step": 2486 }, { "epoch": 1.2946382092660074, "grad_norm": 0.23580201734250328, "learning_rate": 3.1676070158539825e-05, "loss": 0.1132, "step": 2487 }, { "epoch": 1.295158771473191, "grad_norm": 0.2356380954462469, "learning_rate": 3.166252928082339e-05, "loss": 0.1087, "step": 2488 }, { "epoch": 1.2956793336803747, "grad_norm": 0.23755644465406894, "learning_rate": 3.164898629877016e-05, "loss": 0.1109, "step": 2489 }, { "epoch": 1.2961998958875585, "grad_norm": 0.2319470903145725, "learning_rate": 3.1635441216657636e-05, "loss": 0.1085, "step": 2490 }, { "epoch": 1.2967204580947422, "grad_norm": 0.23994882725651862, "learning_rate": 3.1621894038763995e-05, "loss": 0.1142, "step": 2491 }, { "epoch": 1.297241020301926, "grad_norm": 0.23110760341343395, "learning_rate": 3.1608344769368056e-05, "loss": 0.105, "step": 2492 }, { "epoch": 1.2977615825091098, "grad_norm": 0.2493451968765941, "learning_rate": 3.1594793412749315e-05, "loss": 0.1144, "step": 2493 }, { "epoch": 1.2982821447162936, "grad_norm": 0.23459448167768657, "learning_rate": 3.158123997318792e-05, "loss": 0.1117, "step": 2494 }, { "epoch": 1.2988027069234773, "grad_norm": 0.23267743984705816, "learning_rate": 3.1567684454964675e-05, "loss": 0.1092, "step": 2495 }, { "epoch": 1.299323269130661, "grad_norm": 0.23495505350422138, "learning_rate": 3.155412686236105e-05, "loss": 0.1099, "step": 2496 }, { "epoch": 1.2998438313378449, "grad_norm": 0.2271281943706107, "learning_rate": 3.1540567199659154e-05, "loss": 0.1104, "step": 2497 }, { "epoch": 1.3003643935450286, "grad_norm": 0.25317690129572223, "learning_rate": 3.152700547114177e-05, "loss": 0.1118, "step": 2498 }, { "epoch": 1.3008849557522124, "grad_norm": 0.24086272384333318, "learning_rate": 3.15134416810923e-05, "loss": 0.1132, "step": 2499 }, { "epoch": 1.3014055179593962, "grad_norm": 0.25095110856441216, "learning_rate": 3.149987583379486e-05, "loss": 0.1184, "step": 2500 }, { "epoch": 1.30192608016658, "grad_norm": 0.23992994556687544, "learning_rate": 3.1486307933534143e-05, "loss": 0.1095, "step": 2501 }, { "epoch": 1.3024466423737637, "grad_norm": 0.2363209984551393, "learning_rate": 3.147273798459553e-05, "loss": 0.1114, "step": 2502 }, { "epoch": 1.3029672045809475, "grad_norm": 0.2524480432523037, "learning_rate": 3.145916599126506e-05, "loss": 0.1124, "step": 2503 }, { "epoch": 1.3034877667881313, "grad_norm": 0.22828236428464427, "learning_rate": 3.1445591957829374e-05, "loss": 0.105, "step": 2504 }, { "epoch": 1.304008328995315, "grad_norm": 0.25760549471351335, "learning_rate": 3.14320158885758e-05, "loss": 0.1099, "step": 2505 }, { "epoch": 1.3045288912024988, "grad_norm": 0.24230787911345597, "learning_rate": 3.141843778779229e-05, "loss": 0.1096, "step": 2506 }, { "epoch": 1.3050494534096826, "grad_norm": 0.24593151994037304, "learning_rate": 3.140485765976743e-05, "loss": 0.1068, "step": 2507 }, { "epoch": 1.3055700156168661, "grad_norm": 0.2367624744628246, "learning_rate": 3.1391275508790476e-05, "loss": 0.1089, "step": 2508 }, { "epoch": 1.30609057782405, "grad_norm": 0.25001842938229535, "learning_rate": 3.1377691339151285e-05, "loss": 0.1154, "step": 2509 }, { "epoch": 1.3066111400312337, "grad_norm": 0.23444841996962343, "learning_rate": 3.136410515514038e-05, "loss": 0.1111, "step": 2510 }, { "epoch": 1.3071317022384175, "grad_norm": 0.25857566739815296, "learning_rate": 3.13505169610489e-05, "loss": 0.1184, "step": 2511 }, { "epoch": 1.3076522644456012, "grad_norm": 0.23910560119186738, "learning_rate": 3.133692676116865e-05, "loss": 0.1072, "step": 2512 }, { "epoch": 1.308172826652785, "grad_norm": 0.23446735180490522, "learning_rate": 3.132333455979202e-05, "loss": 0.1109, "step": 2513 }, { "epoch": 1.3086933888599688, "grad_norm": 0.24121685895466385, "learning_rate": 3.130974036121208e-05, "loss": 0.1137, "step": 2514 }, { "epoch": 1.3092139510671525, "grad_norm": 0.2471492387424443, "learning_rate": 3.12961441697225e-05, "loss": 0.1115, "step": 2515 }, { "epoch": 1.3097345132743363, "grad_norm": 0.23961316944400285, "learning_rate": 3.1282545989617595e-05, "loss": 0.1174, "step": 2516 }, { "epoch": 1.31025507548152, "grad_norm": 0.22528913743073048, "learning_rate": 3.126894582519231e-05, "loss": 0.1122, "step": 2517 }, { "epoch": 1.3107756376887039, "grad_norm": 0.2222728768218916, "learning_rate": 3.1255343680742195e-05, "loss": 0.1088, "step": 2518 }, { "epoch": 1.3112961998958876, "grad_norm": 0.2364525553318129, "learning_rate": 3.1241739560563446e-05, "loss": 0.1092, "step": 2519 }, { "epoch": 1.3118167621030712, "grad_norm": 0.23549419626533213, "learning_rate": 3.122813346895288e-05, "loss": 0.1126, "step": 2520 }, { "epoch": 1.312337324310255, "grad_norm": 0.22438892761571402, "learning_rate": 3.121452541020793e-05, "loss": 0.1057, "step": 2521 }, { "epoch": 1.3128578865174387, "grad_norm": 0.2331411062533715, "learning_rate": 3.1200915388626654e-05, "loss": 0.1132, "step": 2522 }, { "epoch": 1.3133784487246225, "grad_norm": 0.23459665267739663, "learning_rate": 3.118730340850774e-05, "loss": 0.1075, "step": 2523 }, { "epoch": 1.3138990109318063, "grad_norm": 0.2347453526216641, "learning_rate": 3.1173689474150476e-05, "loss": 0.1098, "step": 2524 }, { "epoch": 1.31441957313899, "grad_norm": 0.23323798829157819, "learning_rate": 3.116007358985477e-05, "loss": 0.1091, "step": 2525 }, { "epoch": 1.3149401353461738, "grad_norm": 0.23608030886582382, "learning_rate": 3.1146455759921166e-05, "loss": 0.1116, "step": 2526 }, { "epoch": 1.3154606975533576, "grad_norm": 0.24356320449137384, "learning_rate": 3.11328359886508e-05, "loss": 0.1072, "step": 2527 }, { "epoch": 1.3159812597605414, "grad_norm": 0.23069187776813016, "learning_rate": 3.111921428034544e-05, "loss": 0.1072, "step": 2528 }, { "epoch": 1.3165018219677251, "grad_norm": 0.24927589485206192, "learning_rate": 3.110559063930743e-05, "loss": 0.1165, "step": 2529 }, { "epoch": 1.317022384174909, "grad_norm": 0.22698192027918843, "learning_rate": 3.109196506983978e-05, "loss": 0.1098, "step": 2530 }, { "epoch": 1.3175429463820927, "grad_norm": 0.2503079852778464, "learning_rate": 3.107833757624605e-05, "loss": 0.1106, "step": 2531 }, { "epoch": 1.3180635085892765, "grad_norm": 0.2459913765268515, "learning_rate": 3.1064708162830466e-05, "loss": 0.1176, "step": 2532 }, { "epoch": 1.3185840707964602, "grad_norm": 0.24673270505003705, "learning_rate": 3.105107683389781e-05, "loss": 0.1162, "step": 2533 }, { "epoch": 1.319104633003644, "grad_norm": 0.22092262416559946, "learning_rate": 3.10374435937535e-05, "loss": 0.1067, "step": 2534 }, { "epoch": 1.3196251952108278, "grad_norm": 0.23778340367233575, "learning_rate": 3.102380844670355e-05, "loss": 0.1156, "step": 2535 }, { "epoch": 1.3201457574180115, "grad_norm": 0.24483607513262326, "learning_rate": 3.101017139705455e-05, "loss": 0.1145, "step": 2536 }, { "epoch": 1.3206663196251953, "grad_norm": 0.22769610941345692, "learning_rate": 3.099653244911375e-05, "loss": 0.1144, "step": 2537 }, { "epoch": 1.321186881832379, "grad_norm": 0.24187095569563516, "learning_rate": 3.098289160718895e-05, "loss": 0.1146, "step": 2538 }, { "epoch": 1.3217074440395629, "grad_norm": 0.22264068349148974, "learning_rate": 3.096924887558855e-05, "loss": 0.1111, "step": 2539 }, { "epoch": 1.3222280062467464, "grad_norm": 0.24298069839995018, "learning_rate": 3.095560425862157e-05, "loss": 0.1138, "step": 2540 }, { "epoch": 1.3227485684539302, "grad_norm": 0.23672363156144616, "learning_rate": 3.094195776059763e-05, "loss": 0.1086, "step": 2541 }, { "epoch": 1.323269130661114, "grad_norm": 0.23121118320526707, "learning_rate": 3.09283093858269e-05, "loss": 0.1157, "step": 2542 }, { "epoch": 1.3237896928682977, "grad_norm": 0.22330970680800596, "learning_rate": 3.0914659138620186e-05, "loss": 0.1062, "step": 2543 }, { "epoch": 1.3243102550754815, "grad_norm": 0.24667833728809804, "learning_rate": 3.090100702328888e-05, "loss": 0.1203, "step": 2544 }, { "epoch": 1.3248308172826653, "grad_norm": 0.24428638534778402, "learning_rate": 3.088735304414494e-05, "loss": 0.1183, "step": 2545 }, { "epoch": 1.325351379489849, "grad_norm": 0.2357143337600258, "learning_rate": 3.087369720550094e-05, "loss": 0.1158, "step": 2546 }, { "epoch": 1.3258719416970328, "grad_norm": 0.21965988630691494, "learning_rate": 3.0860039511670024e-05, "loss": 0.1103, "step": 2547 }, { "epoch": 1.3263925039042166, "grad_norm": 0.22044710681349058, "learning_rate": 3.084637996696592e-05, "loss": 0.1148, "step": 2548 }, { "epoch": 1.3269130661114004, "grad_norm": 0.2305177255025451, "learning_rate": 3.083271857570297e-05, "loss": 0.1093, "step": 2549 }, { "epoch": 1.3274336283185841, "grad_norm": 0.23136166042256415, "learning_rate": 3.0819055342196054e-05, "loss": 0.1078, "step": 2550 }, { "epoch": 1.327954190525768, "grad_norm": 0.2421297560595075, "learning_rate": 3.080539027076066e-05, "loss": 0.1125, "step": 2551 }, { "epoch": 1.3284747527329515, "grad_norm": 0.22632609868379913, "learning_rate": 3.0791723365712867e-05, "loss": 0.1098, "step": 2552 }, { "epoch": 1.3289953149401352, "grad_norm": 0.23994272711042058, "learning_rate": 3.077805463136931e-05, "loss": 0.111, "step": 2553 }, { "epoch": 1.329515877147319, "grad_norm": 0.24457717811072027, "learning_rate": 3.07643840720472e-05, "loss": 0.1168, "step": 2554 }, { "epoch": 1.3300364393545028, "grad_norm": 0.2375550486078291, "learning_rate": 3.075071169206437e-05, "loss": 0.1164, "step": 2555 }, { "epoch": 1.3305570015616865, "grad_norm": 0.23885122754311797, "learning_rate": 3.073703749573916e-05, "loss": 0.1135, "step": 2556 }, { "epoch": 1.3310775637688703, "grad_norm": 0.24172905569894784, "learning_rate": 3.072336148739053e-05, "loss": 0.1105, "step": 2557 }, { "epoch": 1.331598125976054, "grad_norm": 0.24554361390660961, "learning_rate": 3.0709683671338e-05, "loss": 0.1169, "step": 2558 }, { "epoch": 1.3321186881832379, "grad_norm": 0.22416722525242608, "learning_rate": 3.069600405190167e-05, "loss": 0.1099, "step": 2559 }, { "epoch": 1.3326392503904216, "grad_norm": 0.23312900420360225, "learning_rate": 3.068232263340218e-05, "loss": 0.1126, "step": 2560 }, { "epoch": 1.3331598125976054, "grad_norm": 0.23606607210162725, "learning_rate": 3.066863942016077e-05, "loss": 0.1089, "step": 2561 }, { "epoch": 1.3336803748047892, "grad_norm": 0.2373211884453036, "learning_rate": 3.0654954416499244e-05, "loss": 0.109, "step": 2562 }, { "epoch": 1.334200937011973, "grad_norm": 0.2343989018463782, "learning_rate": 3.064126762673994e-05, "loss": 0.1125, "step": 2563 }, { "epoch": 1.3347214992191567, "grad_norm": 0.26796891784177396, "learning_rate": 3.062757905520582e-05, "loss": 0.122, "step": 2564 }, { "epoch": 1.3352420614263405, "grad_norm": 0.24223466770689397, "learning_rate": 3.0613888706220336e-05, "loss": 0.1134, "step": 2565 }, { "epoch": 1.3357626236335243, "grad_norm": 0.2549438338386172, "learning_rate": 3.060019658410755e-05, "loss": 0.117, "step": 2566 }, { "epoch": 1.336283185840708, "grad_norm": 0.25893651098604287, "learning_rate": 3.0586502693192074e-05, "loss": 0.116, "step": 2567 }, { "epoch": 1.3368037480478918, "grad_norm": 0.22061547809765672, "learning_rate": 3.0572807037799075e-05, "loss": 0.1069, "step": 2568 }, { "epoch": 1.3373243102550756, "grad_norm": 0.2707795969101939, "learning_rate": 3.055910962225428e-05, "loss": 0.1084, "step": 2569 }, { "epoch": 1.3378448724622594, "grad_norm": 0.23308154556319333, "learning_rate": 3.054541045088396e-05, "loss": 0.115, "step": 2570 }, { "epoch": 1.3383654346694431, "grad_norm": 0.2567417526177613, "learning_rate": 3.053170952801496e-05, "loss": 0.115, "step": 2571 }, { "epoch": 1.3388859968766267, "grad_norm": 0.2362561720344537, "learning_rate": 3.0518006857974666e-05, "loss": 0.1142, "step": 2572 }, { "epoch": 1.3394065590838105, "grad_norm": 0.24896491322044217, "learning_rate": 3.0504302445091027e-05, "loss": 0.1142, "step": 2573 }, { "epoch": 1.3399271212909942, "grad_norm": 0.26160908249539727, "learning_rate": 3.0490596293692525e-05, "loss": 0.1179, "step": 2574 }, { "epoch": 1.340447683498178, "grad_norm": 0.23521571929263785, "learning_rate": 3.0476888408108202e-05, "loss": 0.1162, "step": 2575 }, { "epoch": 1.3409682457053618, "grad_norm": 0.24915979845519137, "learning_rate": 3.0463178792667645e-05, "loss": 0.1127, "step": 2576 }, { "epoch": 1.3414888079125455, "grad_norm": 0.23236370763142386, "learning_rate": 3.0449467451700997e-05, "loss": 0.1052, "step": 2577 }, { "epoch": 1.3420093701197293, "grad_norm": 0.2204117728067703, "learning_rate": 3.0435754389538928e-05, "loss": 0.1048, "step": 2578 }, { "epoch": 1.342529932326913, "grad_norm": 0.23813101057229322, "learning_rate": 3.0422039610512666e-05, "loss": 0.109, "step": 2579 }, { "epoch": 1.3430504945340969, "grad_norm": 0.24713378801317953, "learning_rate": 3.0408323118953968e-05, "loss": 0.1121, "step": 2580 }, { "epoch": 1.3435710567412806, "grad_norm": 0.25584268149164247, "learning_rate": 3.0394604919195156e-05, "loss": 0.1118, "step": 2581 }, { "epoch": 1.3440916189484644, "grad_norm": 0.24392930456555278, "learning_rate": 3.0380885015569067e-05, "loss": 0.1083, "step": 2582 }, { "epoch": 1.3446121811556482, "grad_norm": 0.2352412179715951, "learning_rate": 3.036716341240908e-05, "loss": 0.1099, "step": 2583 }, { "epoch": 1.3451327433628317, "grad_norm": 0.25301035157344787, "learning_rate": 3.0353440114049126e-05, "loss": 0.1077, "step": 2584 }, { "epoch": 1.3456533055700155, "grad_norm": 0.27102376417563373, "learning_rate": 3.0339715124823652e-05, "loss": 0.1133, "step": 2585 }, { "epoch": 1.3461738677771993, "grad_norm": 0.245471066517425, "learning_rate": 3.0325988449067654e-05, "loss": 0.1063, "step": 2586 }, { "epoch": 1.346694429984383, "grad_norm": 0.25444136673567697, "learning_rate": 3.031226009111665e-05, "loss": 0.1114, "step": 2587 }, { "epoch": 1.3472149921915668, "grad_norm": 0.24276271471445565, "learning_rate": 3.0298530055306708e-05, "loss": 0.1131, "step": 2588 }, { "epoch": 1.3477355543987506, "grad_norm": 0.26370328884810446, "learning_rate": 3.028479834597439e-05, "loss": 0.1118, "step": 2589 }, { "epoch": 1.3482561166059344, "grad_norm": 0.252507222324151, "learning_rate": 3.027106496745683e-05, "loss": 0.1088, "step": 2590 }, { "epoch": 1.3487766788131181, "grad_norm": 0.22861875840700396, "learning_rate": 3.025732992409166e-05, "loss": 0.1117, "step": 2591 }, { "epoch": 1.349297241020302, "grad_norm": 0.24956628943142115, "learning_rate": 3.0243593220217044e-05, "loss": 0.1116, "step": 2592 }, { "epoch": 1.3498178032274857, "grad_norm": 0.2390835790110749, "learning_rate": 3.0229854860171662e-05, "loss": 0.1131, "step": 2593 }, { "epoch": 1.3503383654346695, "grad_norm": 0.24074530354960633, "learning_rate": 3.021611484829475e-05, "loss": 0.1111, "step": 2594 }, { "epoch": 1.3508589276418532, "grad_norm": 0.2444101290555521, "learning_rate": 3.0202373188926037e-05, "loss": 0.1084, "step": 2595 }, { "epoch": 1.351379489849037, "grad_norm": 0.24713525990174354, "learning_rate": 3.0188629886405763e-05, "loss": 0.1054, "step": 2596 }, { "epoch": 1.3519000520562208, "grad_norm": 0.25164204870542045, "learning_rate": 3.017488494507471e-05, "loss": 0.1121, "step": 2597 }, { "epoch": 1.3524206142634045, "grad_norm": 0.24569308639230078, "learning_rate": 3.0161138369274177e-05, "loss": 0.1121, "step": 2598 }, { "epoch": 1.3529411764705883, "grad_norm": 0.24172753415281717, "learning_rate": 3.0147390163345972e-05, "loss": 0.1104, "step": 2599 }, { "epoch": 1.353461738677772, "grad_norm": 0.2713692191610928, "learning_rate": 3.013364033163241e-05, "loss": 0.1175, "step": 2600 }, { "epoch": 1.3539823008849559, "grad_norm": 0.2610319613907833, "learning_rate": 3.0119888878476338e-05, "loss": 0.1154, "step": 2601 }, { "epoch": 1.3545028630921396, "grad_norm": 0.22606906047311914, "learning_rate": 3.0106135808221093e-05, "loss": 0.1057, "step": 2602 }, { "epoch": 1.3550234252993234, "grad_norm": 0.2416710905492764, "learning_rate": 3.009238112521054e-05, "loss": 0.1111, "step": 2603 }, { "epoch": 1.355543987506507, "grad_norm": 0.23725466138349213, "learning_rate": 3.007862483378906e-05, "loss": 0.114, "step": 2604 }, { "epoch": 1.3560645497136907, "grad_norm": 0.22201764810825417, "learning_rate": 3.0064866938301507e-05, "loss": 0.1091, "step": 2605 }, { "epoch": 1.3565851119208745, "grad_norm": 0.26553203846707873, "learning_rate": 3.005110744309328e-05, "loss": 0.1095, "step": 2606 }, { "epoch": 1.3571056741280583, "grad_norm": 0.22897934347302173, "learning_rate": 3.003734635251026e-05, "loss": 0.1042, "step": 2607 }, { "epoch": 1.357626236335242, "grad_norm": 0.22965225339232517, "learning_rate": 3.0023583670898848e-05, "loss": 0.1123, "step": 2608 }, { "epoch": 1.3581467985424258, "grad_norm": 0.2374884978816423, "learning_rate": 3.0009819402605938e-05, "loss": 0.1065, "step": 2609 }, { "epoch": 1.3586673607496096, "grad_norm": 0.23724855307575515, "learning_rate": 2.999605355197892e-05, "loss": 0.1138, "step": 2610 }, { "epoch": 1.3591879229567934, "grad_norm": 0.2522681898496753, "learning_rate": 2.9982286123365694e-05, "loss": 0.1127, "step": 2611 }, { "epoch": 1.3597084851639771, "grad_norm": 0.2390455975944688, "learning_rate": 2.9968517121114652e-05, "loss": 0.1103, "step": 2612 }, { "epoch": 1.360229047371161, "grad_norm": 0.22922231459543907, "learning_rate": 2.9954746549574697e-05, "loss": 0.1017, "step": 2613 }, { "epoch": 1.3607496095783447, "grad_norm": 0.23580922604859134, "learning_rate": 2.9940974413095203e-05, "loss": 0.1099, "step": 2614 }, { "epoch": 1.3612701717855284, "grad_norm": 0.22858569901780046, "learning_rate": 2.9927200716026055e-05, "loss": 0.111, "step": 2615 }, { "epoch": 1.361790733992712, "grad_norm": 0.2532805937856553, "learning_rate": 2.9913425462717625e-05, "loss": 0.113, "step": 2616 }, { "epoch": 1.3623112961998958, "grad_norm": 0.23253715435079703, "learning_rate": 2.989964865752079e-05, "loss": 0.11, "step": 2617 }, { "epoch": 1.3628318584070795, "grad_norm": 0.23333001277583243, "learning_rate": 2.9885870304786896e-05, "loss": 0.1116, "step": 2618 }, { "epoch": 1.3633524206142633, "grad_norm": 0.21615247744152524, "learning_rate": 2.9872090408867785e-05, "loss": 0.1058, "step": 2619 }, { "epoch": 1.363872982821447, "grad_norm": 0.2367438794292196, "learning_rate": 2.9858308974115808e-05, "loss": 0.112, "step": 2620 }, { "epoch": 1.3643935450286309, "grad_norm": 0.23367571489326353, "learning_rate": 2.9844526004883755e-05, "loss": 0.1045, "step": 2621 }, { "epoch": 1.3649141072358146, "grad_norm": 0.24556639627942822, "learning_rate": 2.9830741505524958e-05, "loss": 0.1217, "step": 2622 }, { "epoch": 1.3654346694429984, "grad_norm": 0.23637425075744797, "learning_rate": 2.9816955480393187e-05, "loss": 0.111, "step": 2623 }, { "epoch": 1.3659552316501822, "grad_norm": 0.25593497993008707, "learning_rate": 2.9803167933842714e-05, "loss": 0.11, "step": 2624 }, { "epoch": 1.366475793857366, "grad_norm": 0.23985020724360726, "learning_rate": 2.9789378870228283e-05, "loss": 0.1109, "step": 2625 }, { "epoch": 1.3669963560645497, "grad_norm": 0.23470299239423129, "learning_rate": 2.9775588293905132e-05, "loss": 0.1123, "step": 2626 }, { "epoch": 1.3675169182717335, "grad_norm": 0.2331930805433966, "learning_rate": 2.976179620922896e-05, "loss": 0.1086, "step": 2627 }, { "epoch": 1.3680374804789173, "grad_norm": 0.23535519041672734, "learning_rate": 2.9748002620555944e-05, "loss": 0.1084, "step": 2628 }, { "epoch": 1.368558042686101, "grad_norm": 0.2506834392739139, "learning_rate": 2.9734207532242754e-05, "loss": 0.1134, "step": 2629 }, { "epoch": 1.3690786048932848, "grad_norm": 0.22986664916695576, "learning_rate": 2.9720410948646504e-05, "loss": 0.1081, "step": 2630 }, { "epoch": 1.3695991671004686, "grad_norm": 0.24834546542727068, "learning_rate": 2.970661287412482e-05, "loss": 0.1162, "step": 2631 }, { "epoch": 1.3701197293076524, "grad_norm": 0.2308044732796431, "learning_rate": 2.969281331303576e-05, "loss": 0.1112, "step": 2632 }, { "epoch": 1.3706402915148361, "grad_norm": 0.24372786534936366, "learning_rate": 2.967901226973787e-05, "loss": 0.1123, "step": 2633 }, { "epoch": 1.37116085372202, "grad_norm": 0.23353876211260266, "learning_rate": 2.966520974859016e-05, "loss": 0.1063, "step": 2634 }, { "epoch": 1.3716814159292037, "grad_norm": 0.23840470512997244, "learning_rate": 2.965140575395211e-05, "loss": 0.1123, "step": 2635 }, { "epoch": 1.3722019781363872, "grad_norm": 0.2308432390077942, "learning_rate": 2.9637600290183675e-05, "loss": 0.1126, "step": 2636 }, { "epoch": 1.372722540343571, "grad_norm": 0.23561930060975914, "learning_rate": 2.9623793361645247e-05, "loss": 0.1106, "step": 2637 }, { "epoch": 1.3732431025507548, "grad_norm": 0.21718824756279217, "learning_rate": 2.96099849726977e-05, "loss": 0.1078, "step": 2638 }, { "epoch": 1.3737636647579385, "grad_norm": 0.23549809364298932, "learning_rate": 2.9596175127702368e-05, "loss": 0.1155, "step": 2639 }, { "epoch": 1.3742842269651223, "grad_norm": 0.2351146169717648, "learning_rate": 2.958236383102105e-05, "loss": 0.1089, "step": 2640 }, { "epoch": 1.374804789172306, "grad_norm": 0.230720734914438, "learning_rate": 2.956855108701599e-05, "loss": 0.1125, "step": 2641 }, { "epoch": 1.3753253513794899, "grad_norm": 0.23350953234570837, "learning_rate": 2.9554736900049883e-05, "loss": 0.1123, "step": 2642 }, { "epoch": 1.3758459135866736, "grad_norm": 0.2272348761594922, "learning_rate": 2.954092127448591e-05, "loss": 0.1126, "step": 2643 }, { "epoch": 1.3763664757938574, "grad_norm": 0.26121567593064826, "learning_rate": 2.9527104214687685e-05, "loss": 0.1171, "step": 2644 }, { "epoch": 1.3768870380010412, "grad_norm": 0.24347665782131095, "learning_rate": 2.951328572501928e-05, "loss": 0.114, "step": 2645 }, { "epoch": 1.377407600208225, "grad_norm": 0.22666019429301196, "learning_rate": 2.94994658098452e-05, "loss": 0.109, "step": 2646 }, { "epoch": 1.3779281624154087, "grad_norm": 0.24888395219943815, "learning_rate": 2.9485644473530437e-05, "loss": 0.114, "step": 2647 }, { "epoch": 1.3784487246225923, "grad_norm": 0.2339315941555209, "learning_rate": 2.9471821720440406e-05, "loss": 0.1054, "step": 2648 }, { "epoch": 1.378969286829776, "grad_norm": 0.22715324176204416, "learning_rate": 2.9457997554940974e-05, "loss": 0.1071, "step": 2649 }, { "epoch": 1.3794898490369598, "grad_norm": 0.23984980511126433, "learning_rate": 2.944417198139846e-05, "loss": 0.1104, "step": 2650 }, { "epoch": 1.3800104112441436, "grad_norm": 0.2388805658491324, "learning_rate": 2.9430345004179614e-05, "loss": 0.1112, "step": 2651 }, { "epoch": 1.3805309734513274, "grad_norm": 0.2609240796303999, "learning_rate": 2.9416516627651647e-05, "loss": 0.1201, "step": 2652 }, { "epoch": 1.3810515356585111, "grad_norm": 0.22009015612627597, "learning_rate": 2.9402686856182205e-05, "loss": 0.1059, "step": 2653 }, { "epoch": 1.381572097865695, "grad_norm": 0.25819973582961897, "learning_rate": 2.9388855694139373e-05, "loss": 0.1136, "step": 2654 }, { "epoch": 1.3820926600728787, "grad_norm": 0.2626233123096237, "learning_rate": 2.9375023145891666e-05, "loss": 0.1182, "step": 2655 }, { "epoch": 1.3826132222800624, "grad_norm": 0.24547878318931401, "learning_rate": 2.936118921580806e-05, "loss": 0.112, "step": 2656 }, { "epoch": 1.3831337844872462, "grad_norm": 0.22938479950092555, "learning_rate": 2.9347353908257936e-05, "loss": 0.1112, "step": 2657 }, { "epoch": 1.38365434669443, "grad_norm": 0.23335117709653277, "learning_rate": 2.9333517227611152e-05, "loss": 0.1116, "step": 2658 }, { "epoch": 1.3841749089016138, "grad_norm": 0.2514180524501941, "learning_rate": 2.9319679178237957e-05, "loss": 0.1115, "step": 2659 }, { "epoch": 1.3846954711087975, "grad_norm": 0.22641541429241346, "learning_rate": 2.9305839764509058e-05, "loss": 0.1071, "step": 2660 }, { "epoch": 1.3852160333159813, "grad_norm": 0.23200240748290513, "learning_rate": 2.929199899079558e-05, "loss": 0.1067, "step": 2661 }, { "epoch": 1.385736595523165, "grad_norm": 0.2390937747494344, "learning_rate": 2.9278156861469096e-05, "loss": 0.1081, "step": 2662 }, { "epoch": 1.3862571577303489, "grad_norm": 0.23468619522538356, "learning_rate": 2.9264313380901588e-05, "loss": 0.1097, "step": 2663 }, { "epoch": 1.3867777199375326, "grad_norm": 0.24345058195506755, "learning_rate": 2.9250468553465466e-05, "loss": 0.1089, "step": 2664 }, { "epoch": 1.3872982821447164, "grad_norm": 0.25475277322641765, "learning_rate": 2.9236622383533575e-05, "loss": 0.1113, "step": 2665 }, { "epoch": 1.3878188443519002, "grad_norm": 0.2532468565384966, "learning_rate": 2.9222774875479176e-05, "loss": 0.1079, "step": 2666 }, { "epoch": 1.388339406559084, "grad_norm": 0.2512503551401563, "learning_rate": 2.920892603367596e-05, "loss": 0.1107, "step": 2667 }, { "epoch": 1.3888599687662675, "grad_norm": 0.24339764746526638, "learning_rate": 2.919507586249805e-05, "loss": 0.1161, "step": 2668 }, { "epoch": 1.3893805309734513, "grad_norm": 0.2523651063330094, "learning_rate": 2.9181224366319947e-05, "loss": 0.1139, "step": 2669 }, { "epoch": 1.389901093180635, "grad_norm": 0.22988428164397576, "learning_rate": 2.916737154951662e-05, "loss": 0.1095, "step": 2670 }, { "epoch": 1.3904216553878188, "grad_norm": 0.24090438564750494, "learning_rate": 2.9153517416463418e-05, "loss": 0.1097, "step": 2671 }, { "epoch": 1.3909422175950026, "grad_norm": 0.24074539123393496, "learning_rate": 2.913966197153613e-05, "loss": 0.1068, "step": 2672 }, { "epoch": 1.3914627798021864, "grad_norm": 0.2374411651100617, "learning_rate": 2.9125805219110952e-05, "loss": 0.1135, "step": 2673 }, { "epoch": 1.3919833420093701, "grad_norm": 0.24364817979891987, "learning_rate": 2.9111947163564478e-05, "loss": 0.1077, "step": 2674 }, { "epoch": 1.392503904216554, "grad_norm": 0.239001833403062, "learning_rate": 2.9098087809273743e-05, "loss": 0.1089, "step": 2675 }, { "epoch": 1.3930244664237377, "grad_norm": 0.2415390586855587, "learning_rate": 2.908422716061617e-05, "loss": 0.108, "step": 2676 }, { "epoch": 1.3935450286309214, "grad_norm": 0.26779259818902096, "learning_rate": 2.9070365221969598e-05, "loss": 0.1099, "step": 2677 }, { "epoch": 1.3940655908381052, "grad_norm": 0.23464593639934386, "learning_rate": 2.9056501997712267e-05, "loss": 0.1111, "step": 2678 }, { "epoch": 1.394586153045289, "grad_norm": 0.2343125934015767, "learning_rate": 2.904263749222283e-05, "loss": 0.1055, "step": 2679 }, { "epoch": 1.3951067152524725, "grad_norm": 0.2366101209417697, "learning_rate": 2.9028771709880342e-05, "loss": 0.1104, "step": 2680 }, { "epoch": 1.3956272774596563, "grad_norm": 0.24371168991559053, "learning_rate": 2.9014904655064273e-05, "loss": 0.1132, "step": 2681 }, { "epoch": 1.39614783966684, "grad_norm": 0.23630952126438518, "learning_rate": 2.9001036332154474e-05, "loss": 0.1088, "step": 2682 }, { "epoch": 1.3966684018740239, "grad_norm": 0.24538418864834158, "learning_rate": 2.8987166745531207e-05, "loss": 0.1126, "step": 2683 }, { "epoch": 1.3971889640812076, "grad_norm": 0.23134351061219888, "learning_rate": 2.897329589957514e-05, "loss": 0.1144, "step": 2684 }, { "epoch": 1.3977095262883914, "grad_norm": 0.23678786376287386, "learning_rate": 2.8959423798667317e-05, "loss": 0.1088, "step": 2685 }, { "epoch": 1.3982300884955752, "grad_norm": 0.23975398835505177, "learning_rate": 2.894555044718921e-05, "loss": 0.1089, "step": 2686 }, { "epoch": 1.398750650702759, "grad_norm": 0.23608185526452935, "learning_rate": 2.893167584952266e-05, "loss": 0.11, "step": 2687 }, { "epoch": 1.3992712129099427, "grad_norm": 0.24040631010421307, "learning_rate": 2.8917800010049917e-05, "loss": 0.1169, "step": 2688 }, { "epoch": 1.3997917751171265, "grad_norm": 0.2414692039364617, "learning_rate": 2.8903922933153606e-05, "loss": 0.1097, "step": 2689 }, { "epoch": 1.4003123373243103, "grad_norm": 0.24897062492291208, "learning_rate": 2.8890044623216763e-05, "loss": 0.1149, "step": 2690 }, { "epoch": 1.400832899531494, "grad_norm": 0.23371010023798855, "learning_rate": 2.8876165084622797e-05, "loss": 0.1115, "step": 2691 }, { "epoch": 1.4013534617386778, "grad_norm": 0.21963840974379567, "learning_rate": 2.8862284321755517e-05, "loss": 0.1007, "step": 2692 }, { "epoch": 1.4018740239458616, "grad_norm": 0.23159324050922378, "learning_rate": 2.8848402338999115e-05, "loss": 0.1115, "step": 2693 }, { "epoch": 1.4023945861530454, "grad_norm": 0.23496545202276656, "learning_rate": 2.8834519140738158e-05, "loss": 0.1143, "step": 2694 }, { "epoch": 1.4029151483602291, "grad_norm": 0.23229098782752947, "learning_rate": 2.882063473135763e-05, "loss": 0.1135, "step": 2695 }, { "epoch": 1.403435710567413, "grad_norm": 0.23844892538632909, "learning_rate": 2.880674911524284e-05, "loss": 0.1125, "step": 2696 }, { "epoch": 1.4039562727745967, "grad_norm": 0.22375205668172357, "learning_rate": 2.8792862296779538e-05, "loss": 0.1086, "step": 2697 }, { "epoch": 1.4044768349817804, "grad_norm": 0.243507183609586, "learning_rate": 2.8778974280353817e-05, "loss": 0.1111, "step": 2698 }, { "epoch": 1.4049973971889642, "grad_norm": 0.23876948702376113, "learning_rate": 2.8765085070352153e-05, "loss": 0.1116, "step": 2699 }, { "epoch": 1.4055179593961478, "grad_norm": 0.23325769806586572, "learning_rate": 2.8751194671161423e-05, "loss": 0.1098, "step": 2700 }, { "epoch": 1.4060385216033315, "grad_norm": 0.2508356243169636, "learning_rate": 2.8737303087168837e-05, "loss": 0.1071, "step": 2701 }, { "epoch": 1.4065590838105153, "grad_norm": 0.24922685226760763, "learning_rate": 2.8723410322762027e-05, "loss": 0.1152, "step": 2702 }, { "epoch": 1.407079646017699, "grad_norm": 0.2487978849373613, "learning_rate": 2.8709516382328962e-05, "loss": 0.113, "step": 2703 }, { "epoch": 1.4076002082248829, "grad_norm": 0.24928009890621933, "learning_rate": 2.8695621270258e-05, "loss": 0.1174, "step": 2704 }, { "epoch": 1.4081207704320666, "grad_norm": 0.23579163389801594, "learning_rate": 2.8681724990937857e-05, "loss": 0.1105, "step": 2705 }, { "epoch": 1.4086413326392504, "grad_norm": 0.23415300436456926, "learning_rate": 2.8667827548757624e-05, "loss": 0.1102, "step": 2706 }, { "epoch": 1.4091618948464342, "grad_norm": 0.22972294165859275, "learning_rate": 2.865392894810678e-05, "loss": 0.1061, "step": 2707 }, { "epoch": 1.409682457053618, "grad_norm": 0.2345573651214216, "learning_rate": 2.8640029193375128e-05, "loss": 0.1046, "step": 2708 }, { "epoch": 1.4102030192608017, "grad_norm": 0.23651806913382753, "learning_rate": 2.8626128288952862e-05, "loss": 0.1069, "step": 2709 }, { "epoch": 1.4107235814679855, "grad_norm": 0.24904333761243203, "learning_rate": 2.8612226239230532e-05, "loss": 0.1129, "step": 2710 }, { "epoch": 1.4112441436751693, "grad_norm": 0.23917081649980002, "learning_rate": 2.8598323048599067e-05, "loss": 0.1081, "step": 2711 }, { "epoch": 1.4117647058823528, "grad_norm": 0.2456854467567518, "learning_rate": 2.8584418721449724e-05, "loss": 0.1103, "step": 2712 }, { "epoch": 1.4122852680895366, "grad_norm": 0.2452287893186469, "learning_rate": 2.8570513262174152e-05, "loss": 0.1174, "step": 2713 }, { "epoch": 1.4128058302967204, "grad_norm": 0.23238334523577805, "learning_rate": 2.855660667516433e-05, "loss": 0.1099, "step": 2714 }, { "epoch": 1.4133263925039041, "grad_norm": 0.23389506698918613, "learning_rate": 2.854269896481261e-05, "loss": 0.1084, "step": 2715 }, { "epoch": 1.413846954711088, "grad_norm": 0.23215202724524295, "learning_rate": 2.8528790135511695e-05, "loss": 0.1136, "step": 2716 }, { "epoch": 1.4143675169182717, "grad_norm": 0.2394218131173443, "learning_rate": 2.851488019165464e-05, "loss": 0.1073, "step": 2717 }, { "epoch": 1.4148880791254554, "grad_norm": 0.2260015168454527, "learning_rate": 2.8500969137634853e-05, "loss": 0.1111, "step": 2718 }, { "epoch": 1.4154086413326392, "grad_norm": 0.2196531969922503, "learning_rate": 2.8487056977846083e-05, "loss": 0.1122, "step": 2719 }, { "epoch": 1.415929203539823, "grad_norm": 0.23260699506685206, "learning_rate": 2.8473143716682455e-05, "loss": 0.1157, "step": 2720 }, { "epoch": 1.4164497657470068, "grad_norm": 0.23668138862358012, "learning_rate": 2.8459229358538407e-05, "loss": 0.1115, "step": 2721 }, { "epoch": 1.4169703279541905, "grad_norm": 0.2246223176402912, "learning_rate": 2.8445313907808756e-05, "loss": 0.1084, "step": 2722 }, { "epoch": 1.4174908901613743, "grad_norm": 0.2301496115534156, "learning_rate": 2.843139736888864e-05, "loss": 0.1076, "step": 2723 }, { "epoch": 1.418011452368558, "grad_norm": 0.22756253474564722, "learning_rate": 2.841747974617355e-05, "loss": 0.1086, "step": 2724 }, { "epoch": 1.4185320145757419, "grad_norm": 0.22250208266300767, "learning_rate": 2.8403561044059324e-05, "loss": 0.1086, "step": 2725 }, { "epoch": 1.4190525767829256, "grad_norm": 0.2254005836949918, "learning_rate": 2.8389641266942124e-05, "loss": 0.104, "step": 2726 }, { "epoch": 1.4195731389901094, "grad_norm": 0.23920028284992914, "learning_rate": 2.8375720419218488e-05, "loss": 0.1161, "step": 2727 }, { "epoch": 1.4200937011972932, "grad_norm": 0.23743660758278437, "learning_rate": 2.836179850528523e-05, "loss": 0.1085, "step": 2728 }, { "epoch": 1.420614263404477, "grad_norm": 0.22280403669098997, "learning_rate": 2.8347875529539576e-05, "loss": 0.1074, "step": 2729 }, { "epoch": 1.4211348256116607, "grad_norm": 0.2304981362316628, "learning_rate": 2.8333951496379023e-05, "loss": 0.1071, "step": 2730 }, { "epoch": 1.4216553878188445, "grad_norm": 0.23280393574418745, "learning_rate": 2.8320026410201445e-05, "loss": 0.1102, "step": 2731 }, { "epoch": 1.422175950026028, "grad_norm": 0.23611313379761717, "learning_rate": 2.8306100275405024e-05, "loss": 0.1101, "step": 2732 }, { "epoch": 1.4226965122332118, "grad_norm": 0.2241908878458328, "learning_rate": 2.829217309638828e-05, "loss": 0.1102, "step": 2733 }, { "epoch": 1.4232170744403956, "grad_norm": 0.2447870254970879, "learning_rate": 2.827824487755007e-05, "loss": 0.1119, "step": 2734 }, { "epoch": 1.4237376366475794, "grad_norm": 0.23587811793675378, "learning_rate": 2.8264315623289568e-05, "loss": 0.115, "step": 2735 }, { "epoch": 1.4242581988547631, "grad_norm": 0.23445583906080528, "learning_rate": 2.8250385338006297e-05, "loss": 0.1127, "step": 2736 }, { "epoch": 1.424778761061947, "grad_norm": 0.22467550766807312, "learning_rate": 2.823645402610006e-05, "loss": 0.1073, "step": 2737 }, { "epoch": 1.4252993232691307, "grad_norm": 0.22942028138580609, "learning_rate": 2.8222521691971037e-05, "loss": 0.1081, "step": 2738 }, { "epoch": 1.4258198854763144, "grad_norm": 0.2258282159083254, "learning_rate": 2.8208588340019703e-05, "loss": 0.1137, "step": 2739 }, { "epoch": 1.4263404476834982, "grad_norm": 0.2385778073200138, "learning_rate": 2.8194653974646858e-05, "loss": 0.1173, "step": 2740 }, { "epoch": 1.426861009890682, "grad_norm": 0.2279308225257302, "learning_rate": 2.8180718600253613e-05, "loss": 0.1118, "step": 2741 }, { "epoch": 1.4273815720978658, "grad_norm": 0.23064677854327556, "learning_rate": 2.8166782221241418e-05, "loss": 0.112, "step": 2742 }, { "epoch": 1.4279021343050495, "grad_norm": 0.23251063489281165, "learning_rate": 2.8152844842012034e-05, "loss": 0.1141, "step": 2743 }, { "epoch": 1.428422696512233, "grad_norm": 0.23020617762610876, "learning_rate": 2.8138906466967518e-05, "loss": 0.1081, "step": 2744 }, { "epoch": 1.4289432587194169, "grad_norm": 0.2222890423486755, "learning_rate": 2.812496710051028e-05, "loss": 0.1065, "step": 2745 }, { "epoch": 1.4294638209266006, "grad_norm": 0.2274018551434016, "learning_rate": 2.8111026747043002e-05, "loss": 0.1057, "step": 2746 }, { "epoch": 1.4299843831337844, "grad_norm": 0.236085240555113, "learning_rate": 2.80970854109687e-05, "loss": 0.1056, "step": 2747 }, { "epoch": 1.4305049453409682, "grad_norm": 0.2224073810854335, "learning_rate": 2.80831430966907e-05, "loss": 0.1094, "step": 2748 }, { "epoch": 1.431025507548152, "grad_norm": 0.22383924274617603, "learning_rate": 2.806919980861264e-05, "loss": 0.1106, "step": 2749 }, { "epoch": 1.4315460697553357, "grad_norm": 0.23203392604509374, "learning_rate": 2.805525555113845e-05, "loss": 0.1113, "step": 2750 }, { "epoch": 1.4320666319625195, "grad_norm": 0.22742895594575752, "learning_rate": 2.804131032867237e-05, "loss": 0.1094, "step": 2751 }, { "epoch": 1.4325871941697033, "grad_norm": 0.22277598495191936, "learning_rate": 2.8027364145618967e-05, "loss": 0.1107, "step": 2752 }, { "epoch": 1.433107756376887, "grad_norm": 0.23109249376380572, "learning_rate": 2.8013417006383076e-05, "loss": 0.1089, "step": 2753 }, { "epoch": 1.4336283185840708, "grad_norm": 0.23662969693774688, "learning_rate": 2.799946891536987e-05, "loss": 0.1129, "step": 2754 }, { "epoch": 1.4341488807912546, "grad_norm": 0.2260021154026626, "learning_rate": 2.7985519876984795e-05, "loss": 0.1082, "step": 2755 }, { "epoch": 1.4346694429984383, "grad_norm": 0.23726551030073853, "learning_rate": 2.7971569895633604e-05, "loss": 0.114, "step": 2756 }, { "epoch": 1.4351900052056221, "grad_norm": 0.23082581298133342, "learning_rate": 2.7957618975722362e-05, "loss": 0.1095, "step": 2757 }, { "epoch": 1.435710567412806, "grad_norm": 0.24003960853374687, "learning_rate": 2.7943667121657412e-05, "loss": 0.1118, "step": 2758 }, { "epoch": 1.4362311296199897, "grad_norm": 0.234089806557142, "learning_rate": 2.7929714337845396e-05, "loss": 0.1101, "step": 2759 }, { "epoch": 1.4367516918271734, "grad_norm": 0.23113107032487773, "learning_rate": 2.7915760628693256e-05, "loss": 0.1101, "step": 2760 }, { "epoch": 1.4372722540343572, "grad_norm": 0.2219497904207552, "learning_rate": 2.7901805998608217e-05, "loss": 0.1059, "step": 2761 }, { "epoch": 1.437792816241541, "grad_norm": 0.22605082077104915, "learning_rate": 2.7887850451997805e-05, "loss": 0.1068, "step": 2762 }, { "epoch": 1.4383133784487248, "grad_norm": 0.22649858856913815, "learning_rate": 2.787389399326984e-05, "loss": 0.1095, "step": 2763 }, { "epoch": 1.4388339406559083, "grad_norm": 0.2368963632813257, "learning_rate": 2.7859936626832407e-05, "loss": 0.1098, "step": 2764 }, { "epoch": 1.439354502863092, "grad_norm": 0.22188698169925974, "learning_rate": 2.7845978357093895e-05, "loss": 0.1114, "step": 2765 }, { "epoch": 1.4398750650702759, "grad_norm": 0.22991662693273585, "learning_rate": 2.7832019188462977e-05, "loss": 0.1128, "step": 2766 }, { "epoch": 1.4403956272774596, "grad_norm": 0.25211405079579136, "learning_rate": 2.7818059125348616e-05, "loss": 0.1215, "step": 2767 }, { "epoch": 1.4409161894846434, "grad_norm": 0.24479114751525108, "learning_rate": 2.7804098172160038e-05, "loss": 0.1172, "step": 2768 }, { "epoch": 1.4414367516918272, "grad_norm": 0.22826017987069053, "learning_rate": 2.779013633330676e-05, "loss": 0.1108, "step": 2769 }, { "epoch": 1.441957313899011, "grad_norm": 0.2346902016087554, "learning_rate": 2.7776173613198592e-05, "loss": 0.107, "step": 2770 }, { "epoch": 1.4424778761061947, "grad_norm": 0.2353334199015863, "learning_rate": 2.7762210016245605e-05, "loss": 0.1132, "step": 2771 }, { "epoch": 1.4429984383133785, "grad_norm": 0.2153683891298237, "learning_rate": 2.7748245546858155e-05, "loss": 0.1088, "step": 2772 }, { "epoch": 1.4435190005205623, "grad_norm": 0.23715391047211573, "learning_rate": 2.7734280209446865e-05, "loss": 0.1097, "step": 2773 }, { "epoch": 1.444039562727746, "grad_norm": 0.23698576922062276, "learning_rate": 2.7720314008422636e-05, "loss": 0.1105, "step": 2774 }, { "epoch": 1.4445601249349298, "grad_norm": 0.24630914098283596, "learning_rate": 2.770634694819666e-05, "loss": 0.1119, "step": 2775 }, { "epoch": 1.4450806871421134, "grad_norm": 0.24175525799827727, "learning_rate": 2.7692379033180376e-05, "loss": 0.111, "step": 2776 }, { "epoch": 1.4456012493492971, "grad_norm": 0.22721483418002544, "learning_rate": 2.7678410267785492e-05, "loss": 0.1094, "step": 2777 }, { "epoch": 1.446121811556481, "grad_norm": 0.24330824085525268, "learning_rate": 2.7664440656424014e-05, "loss": 0.112, "step": 2778 }, { "epoch": 1.4466423737636647, "grad_norm": 0.22575100884114555, "learning_rate": 2.7650470203508177e-05, "loss": 0.1048, "step": 2779 }, { "epoch": 1.4471629359708484, "grad_norm": 0.2302645170045773, "learning_rate": 2.7636498913450508e-05, "loss": 0.1088, "step": 2780 }, { "epoch": 1.4476834981780322, "grad_norm": 0.24802303184573393, "learning_rate": 2.7622526790663795e-05, "loss": 0.1163, "step": 2781 }, { "epoch": 1.448204060385216, "grad_norm": 0.23957252091431663, "learning_rate": 2.760855383956108e-05, "loss": 0.1104, "step": 2782 }, { "epoch": 1.4487246225923998, "grad_norm": 0.2226274711661623, "learning_rate": 2.7594580064555664e-05, "loss": 0.1073, "step": 2783 }, { "epoch": 1.4492451847995835, "grad_norm": 0.22154143748354907, "learning_rate": 2.7580605470061126e-05, "loss": 0.1063, "step": 2784 }, { "epoch": 1.4497657470067673, "grad_norm": 0.2213471609280991, "learning_rate": 2.7566630060491288e-05, "loss": 0.1068, "step": 2785 }, { "epoch": 1.450286309213951, "grad_norm": 0.22697992240357193, "learning_rate": 2.7552653840260234e-05, "loss": 0.1122, "step": 2786 }, { "epoch": 1.4508068714211348, "grad_norm": 0.23909687959750933, "learning_rate": 2.7538676813782315e-05, "loss": 0.1109, "step": 2787 }, { "epoch": 1.4513274336283186, "grad_norm": 0.22865206395405765, "learning_rate": 2.752469898547211e-05, "loss": 0.111, "step": 2788 }, { "epoch": 1.4518479958355024, "grad_norm": 0.230864102308563, "learning_rate": 2.751072035974448e-05, "loss": 0.1101, "step": 2789 }, { "epoch": 1.4523685580426862, "grad_norm": 0.22847986253277383, "learning_rate": 2.749674094101452e-05, "loss": 0.1059, "step": 2790 }, { "epoch": 1.45288912024987, "grad_norm": 0.22746929861816742, "learning_rate": 2.748276073369759e-05, "loss": 0.1077, "step": 2791 }, { "epoch": 1.4534096824570537, "grad_norm": 0.2490634493102354, "learning_rate": 2.7468779742209272e-05, "loss": 0.1149, "step": 2792 }, { "epoch": 1.4539302446642375, "grad_norm": 0.23550416923749126, "learning_rate": 2.745479797096543e-05, "loss": 0.1078, "step": 2793 }, { "epoch": 1.4544508068714213, "grad_norm": 0.23727643035605134, "learning_rate": 2.744081542438215e-05, "loss": 0.1102, "step": 2794 }, { "epoch": 1.454971369078605, "grad_norm": 0.23639067924333218, "learning_rate": 2.7426832106875772e-05, "loss": 0.1123, "step": 2795 }, { "epoch": 1.4554919312857886, "grad_norm": 0.22586950416876225, "learning_rate": 2.741284802286288e-05, "loss": 0.1079, "step": 2796 }, { "epoch": 1.4560124934929723, "grad_norm": 0.23428485678259794, "learning_rate": 2.7398863176760297e-05, "loss": 0.1103, "step": 2797 }, { "epoch": 1.4565330557001561, "grad_norm": 0.23671222076156578, "learning_rate": 2.7384877572985096e-05, "loss": 0.1131, "step": 2798 }, { "epoch": 1.45705361790734, "grad_norm": 0.24175271615548818, "learning_rate": 2.7370891215954568e-05, "loss": 0.1079, "step": 2799 }, { "epoch": 1.4575741801145237, "grad_norm": 0.25201562246791964, "learning_rate": 2.7356904110086267e-05, "loss": 0.1139, "step": 2800 }, { "epoch": 1.4580947423217074, "grad_norm": 0.23688810004618258, "learning_rate": 2.7342916259797964e-05, "loss": 0.114, "step": 2801 }, { "epoch": 1.4586153045288912, "grad_norm": 0.25496678426814795, "learning_rate": 2.7328927669507675e-05, "loss": 0.1123, "step": 2802 }, { "epoch": 1.459135866736075, "grad_norm": 0.24119391804523413, "learning_rate": 2.7314938343633656e-05, "loss": 0.1117, "step": 2803 }, { "epoch": 1.4596564289432588, "grad_norm": 0.23981545441457577, "learning_rate": 2.7300948286594373e-05, "loss": 0.1138, "step": 2804 }, { "epoch": 1.4601769911504425, "grad_norm": 0.238290521313305, "learning_rate": 2.7286957502808546e-05, "loss": 0.1088, "step": 2805 }, { "epoch": 1.4606975533576263, "grad_norm": 0.23496530630575055, "learning_rate": 2.7272965996695116e-05, "loss": 0.1091, "step": 2806 }, { "epoch": 1.46121811556481, "grad_norm": 0.2219092228421417, "learning_rate": 2.7258973772673247e-05, "loss": 0.1069, "step": 2807 }, { "epoch": 1.4617386777719936, "grad_norm": 0.23788495609882543, "learning_rate": 2.7244980835162342e-05, "loss": 0.1059, "step": 2808 }, { "epoch": 1.4622592399791774, "grad_norm": 0.22578800709661323, "learning_rate": 2.7230987188582008e-05, "loss": 0.1111, "step": 2809 }, { "epoch": 1.4627798021863612, "grad_norm": 0.2642738917819999, "learning_rate": 2.7216992837352108e-05, "loss": 0.111, "step": 2810 }, { "epoch": 1.463300364393545, "grad_norm": 0.23319413750842335, "learning_rate": 2.7202997785892688e-05, "loss": 0.1102, "step": 2811 }, { "epoch": 1.4638209266007287, "grad_norm": 0.25360100989894624, "learning_rate": 2.718900203862406e-05, "loss": 0.1128, "step": 2812 }, { "epoch": 1.4643414888079125, "grad_norm": 0.2639966129821883, "learning_rate": 2.7175005599966718e-05, "loss": 0.1165, "step": 2813 }, { "epoch": 1.4648620510150963, "grad_norm": 0.22862001367725718, "learning_rate": 2.7161008474341393e-05, "loss": 0.1059, "step": 2814 }, { "epoch": 1.46538261322228, "grad_norm": 0.24334822660492816, "learning_rate": 2.714701066616902e-05, "loss": 0.1124, "step": 2815 }, { "epoch": 1.4659031754294638, "grad_norm": 0.23329003664914877, "learning_rate": 2.713301217987077e-05, "loss": 0.1096, "step": 2816 }, { "epoch": 1.4664237376366476, "grad_norm": 0.24018365846353792, "learning_rate": 2.7119013019868013e-05, "loss": 0.1137, "step": 2817 }, { "epoch": 1.4669442998438313, "grad_norm": 0.22712008229642378, "learning_rate": 2.710501319058233e-05, "loss": 0.1098, "step": 2818 }, { "epoch": 1.4674648620510151, "grad_norm": 0.22145732141152524, "learning_rate": 2.7091012696435525e-05, "loss": 0.1097, "step": 2819 }, { "epoch": 1.467985424258199, "grad_norm": 0.22548306513700106, "learning_rate": 2.70770115418496e-05, "loss": 0.1084, "step": 2820 }, { "epoch": 1.4685059864653827, "grad_norm": 0.2316717588456251, "learning_rate": 2.706300973124678e-05, "loss": 0.1119, "step": 2821 }, { "epoch": 1.4690265486725664, "grad_norm": 0.24444595497147328, "learning_rate": 2.7049007269049483e-05, "loss": 0.112, "step": 2822 }, { "epoch": 1.4695471108797502, "grad_norm": 0.223605647603151, "learning_rate": 2.7035004159680332e-05, "loss": 0.1069, "step": 2823 }, { "epoch": 1.470067673086934, "grad_norm": 0.22488128372014973, "learning_rate": 2.702100040756217e-05, "loss": 0.1057, "step": 2824 }, { "epoch": 1.4705882352941178, "grad_norm": 0.24269771382490615, "learning_rate": 2.700699601711803e-05, "loss": 0.111, "step": 2825 }, { "epoch": 1.4711087975013015, "grad_norm": 0.23271593883892053, "learning_rate": 2.699299099277115e-05, "loss": 0.1034, "step": 2826 }, { "epoch": 1.4716293597084853, "grad_norm": 0.2238190756200398, "learning_rate": 2.6978985338944966e-05, "loss": 0.1096, "step": 2827 }, { "epoch": 1.4721499219156688, "grad_norm": 0.2466339353589823, "learning_rate": 2.6964979060063123e-05, "loss": 0.1116, "step": 2828 }, { "epoch": 1.4726704841228526, "grad_norm": 0.2540666543177738, "learning_rate": 2.6950972160549444e-05, "loss": 0.1133, "step": 2829 }, { "epoch": 1.4731910463300364, "grad_norm": 0.2250856415814824, "learning_rate": 2.6936964644827973e-05, "loss": 0.1078, "step": 2830 }, { "epoch": 1.4737116085372202, "grad_norm": 0.23250731739398878, "learning_rate": 2.692295651732293e-05, "loss": 0.1102, "step": 2831 }, { "epoch": 1.474232170744404, "grad_norm": 0.24620466821834674, "learning_rate": 2.6908947782458728e-05, "loss": 0.1111, "step": 2832 }, { "epoch": 1.4747527329515877, "grad_norm": 0.2245002310950658, "learning_rate": 2.6894938444659974e-05, "loss": 0.1032, "step": 2833 }, { "epoch": 1.4752732951587715, "grad_norm": 0.2327464778529271, "learning_rate": 2.6880928508351484e-05, "loss": 0.1098, "step": 2834 }, { "epoch": 1.4757938573659553, "grad_norm": 0.2376686699449246, "learning_rate": 2.6866917977958246e-05, "loss": 0.1101, "step": 2835 }, { "epoch": 1.476314419573139, "grad_norm": 0.2413687823738175, "learning_rate": 2.685290685790542e-05, "loss": 0.1125, "step": 2836 }, { "epoch": 1.4768349817803228, "grad_norm": 0.23618732799015563, "learning_rate": 2.6838895152618387e-05, "loss": 0.112, "step": 2837 }, { "epoch": 1.4773555439875066, "grad_norm": 0.22955043165979686, "learning_rate": 2.682488286652269e-05, "loss": 0.1059, "step": 2838 }, { "epoch": 1.4778761061946903, "grad_norm": 0.27218809301190455, "learning_rate": 2.6810870004044063e-05, "loss": 0.1124, "step": 2839 }, { "epoch": 1.478396668401874, "grad_norm": 0.2370646380825962, "learning_rate": 2.6796856569608414e-05, "loss": 0.1144, "step": 2840 }, { "epoch": 1.4789172306090577, "grad_norm": 0.2379047779082474, "learning_rate": 2.6782842567641842e-05, "loss": 0.107, "step": 2841 }, { "epoch": 1.4794377928162414, "grad_norm": 0.24483160304496035, "learning_rate": 2.6768828002570623e-05, "loss": 0.1099, "step": 2842 }, { "epoch": 1.4799583550234252, "grad_norm": 0.2323379686773366, "learning_rate": 2.6754812878821206e-05, "loss": 0.1085, "step": 2843 }, { "epoch": 1.480478917230609, "grad_norm": 0.23946063633545, "learning_rate": 2.6740797200820228e-05, "loss": 0.1096, "step": 2844 }, { "epoch": 1.4809994794377928, "grad_norm": 0.24336186169601903, "learning_rate": 2.672678097299447e-05, "loss": 0.1111, "step": 2845 }, { "epoch": 1.4815200416449765, "grad_norm": 0.23709636885936694, "learning_rate": 2.6712764199770936e-05, "loss": 0.1129, "step": 2846 }, { "epoch": 1.4820406038521603, "grad_norm": 0.23972559290010753, "learning_rate": 2.6698746885576746e-05, "loss": 0.1137, "step": 2847 }, { "epoch": 1.482561166059344, "grad_norm": 0.23976136195183834, "learning_rate": 2.668472903483925e-05, "loss": 0.1121, "step": 2848 }, { "epoch": 1.4830817282665278, "grad_norm": 0.24400625472890314, "learning_rate": 2.6670710651985924e-05, "loss": 0.1135, "step": 2849 }, { "epoch": 1.4836022904737116, "grad_norm": 0.2367697438563428, "learning_rate": 2.6656691741444423e-05, "loss": 0.107, "step": 2850 }, { "epoch": 1.4841228526808954, "grad_norm": 0.23848746621745118, "learning_rate": 2.6642672307642575e-05, "loss": 0.1079, "step": 2851 }, { "epoch": 1.4846434148880792, "grad_norm": 0.23917458954785298, "learning_rate": 2.6628652355008364e-05, "loss": 0.1135, "step": 2852 }, { "epoch": 1.485163977095263, "grad_norm": 0.23079110138661027, "learning_rate": 2.661463188796996e-05, "loss": 0.113, "step": 2853 }, { "epoch": 1.4856845393024467, "grad_norm": 0.22660723854160528, "learning_rate": 2.6600610910955652e-05, "loss": 0.1084, "step": 2854 }, { "epoch": 1.4862051015096305, "grad_norm": 0.24763834613940575, "learning_rate": 2.6586589428393944e-05, "loss": 0.1139, "step": 2855 }, { "epoch": 1.4867256637168142, "grad_norm": 0.22651685710665284, "learning_rate": 2.6572567444713453e-05, "loss": 0.1088, "step": 2856 }, { "epoch": 1.487246225923998, "grad_norm": 0.23203385277212307, "learning_rate": 2.655854496434299e-05, "loss": 0.1082, "step": 2857 }, { "epoch": 1.4877667881311818, "grad_norm": 0.23006346404589428, "learning_rate": 2.6544521991711498e-05, "loss": 0.109, "step": 2858 }, { "epoch": 1.4882873503383656, "grad_norm": 0.24467746804171445, "learning_rate": 2.6530498531248078e-05, "loss": 0.1078, "step": 2859 }, { "epoch": 1.4888079125455491, "grad_norm": 0.2376855115614867, "learning_rate": 2.6516474587382002e-05, "loss": 0.1038, "step": 2860 }, { "epoch": 1.489328474752733, "grad_norm": 0.2278413610792057, "learning_rate": 2.650245016454268e-05, "loss": 0.1084, "step": 2861 }, { "epoch": 1.4898490369599167, "grad_norm": 0.24707716899895538, "learning_rate": 2.6488425267159688e-05, "loss": 0.1044, "step": 2862 }, { "epoch": 1.4903695991671004, "grad_norm": 0.2587262411800931, "learning_rate": 2.6474399899662715e-05, "loss": 0.1116, "step": 2863 }, { "epoch": 1.4908901613742842, "grad_norm": 0.2435423020073306, "learning_rate": 2.646037406648165e-05, "loss": 0.1115, "step": 2864 }, { "epoch": 1.491410723581468, "grad_norm": 0.23050479938532595, "learning_rate": 2.6446347772046492e-05, "loss": 0.1084, "step": 2865 }, { "epoch": 1.4919312857886518, "grad_norm": 0.2374569801662734, "learning_rate": 2.6432321020787403e-05, "loss": 0.1088, "step": 2866 }, { "epoch": 1.4924518479958355, "grad_norm": 0.2312604690786547, "learning_rate": 2.641829381713468e-05, "loss": 0.1109, "step": 2867 }, { "epoch": 1.4929724102030193, "grad_norm": 0.23090596721393158, "learning_rate": 2.6404266165518767e-05, "loss": 0.114, "step": 2868 }, { "epoch": 1.493492972410203, "grad_norm": 0.23229013774491258, "learning_rate": 2.6390238070370255e-05, "loss": 0.108, "step": 2869 }, { "epoch": 1.4940135346173868, "grad_norm": 0.22185984755752802, "learning_rate": 2.6376209536119856e-05, "loss": 0.1074, "step": 2870 }, { "epoch": 1.4945340968245706, "grad_norm": 0.22282503223676492, "learning_rate": 2.6362180567198447e-05, "loss": 0.1084, "step": 2871 }, { "epoch": 1.4950546590317542, "grad_norm": 0.22439529261749314, "learning_rate": 2.6348151168037028e-05, "loss": 0.106, "step": 2872 }, { "epoch": 1.495575221238938, "grad_norm": 0.23223806149855644, "learning_rate": 2.633412134306672e-05, "loss": 0.1107, "step": 2873 }, { "epoch": 1.4960957834461217, "grad_norm": 0.23202694735599594, "learning_rate": 2.6320091096718817e-05, "loss": 0.1108, "step": 2874 }, { "epoch": 1.4966163456533055, "grad_norm": 0.22150351815304223, "learning_rate": 2.6306060433424712e-05, "loss": 0.1089, "step": 2875 }, { "epoch": 1.4971369078604893, "grad_norm": 0.2356343347877159, "learning_rate": 2.629202935761595e-05, "loss": 0.1066, "step": 2876 }, { "epoch": 1.497657470067673, "grad_norm": 0.22946145700792592, "learning_rate": 2.6277997873724182e-05, "loss": 0.1092, "step": 2877 }, { "epoch": 1.4981780322748568, "grad_norm": 0.24032297573317024, "learning_rate": 2.6263965986181215e-05, "loss": 0.1172, "step": 2878 }, { "epoch": 1.4986985944820406, "grad_norm": 0.23337639493576984, "learning_rate": 2.6249933699418965e-05, "loss": 0.1092, "step": 2879 }, { "epoch": 1.4992191566892243, "grad_norm": 0.23600113599122988, "learning_rate": 2.6235901017869495e-05, "loss": 0.1109, "step": 2880 }, { "epoch": 1.4997397188964081, "grad_norm": 0.2239919599001527, "learning_rate": 2.6221867945964966e-05, "loss": 0.107, "step": 2881 }, { "epoch": 1.5002602811035919, "grad_norm": 0.2438916295121852, "learning_rate": 2.6207834488137677e-05, "loss": 0.1071, "step": 2882 }, { "epoch": 1.5007808433107757, "grad_norm": 0.24034447143613874, "learning_rate": 2.6193800648820055e-05, "loss": 0.1119, "step": 2883 }, { "epoch": 1.5013014055179594, "grad_norm": 0.21632247281172787, "learning_rate": 2.6179766432444624e-05, "loss": 0.105, "step": 2884 }, { "epoch": 1.5018219677251432, "grad_norm": 0.23446966101143912, "learning_rate": 2.6165731843444063e-05, "loss": 0.1053, "step": 2885 }, { "epoch": 1.502342529932327, "grad_norm": 0.23530370722928862, "learning_rate": 2.6151696886251126e-05, "loss": 0.1079, "step": 2886 }, { "epoch": 1.5028630921395107, "grad_norm": 0.2489383930332058, "learning_rate": 2.6137661565298726e-05, "loss": 0.1121, "step": 2887 }, { "epoch": 1.5033836543466945, "grad_norm": 0.24643956447343202, "learning_rate": 2.6123625885019854e-05, "loss": 0.1091, "step": 2888 }, { "epoch": 1.5039042165538783, "grad_norm": 0.23991702848744126, "learning_rate": 2.6109589849847643e-05, "loss": 0.1164, "step": 2889 }, { "epoch": 1.504424778761062, "grad_norm": 0.2381158559586777, "learning_rate": 2.6095553464215322e-05, "loss": 0.1087, "step": 2890 }, { "epoch": 1.5049453409682458, "grad_norm": 0.2264207570228489, "learning_rate": 2.6081516732556226e-05, "loss": 0.1077, "step": 2891 }, { "epoch": 1.5054659031754296, "grad_norm": 0.23184141890912552, "learning_rate": 2.606747965930383e-05, "loss": 0.1093, "step": 2892 }, { "epoch": 1.5059864653826134, "grad_norm": 0.2300329290920324, "learning_rate": 2.605344224889167e-05, "loss": 0.1114, "step": 2893 }, { "epoch": 1.506507027589797, "grad_norm": 0.23186346307565206, "learning_rate": 2.6039404505753433e-05, "loss": 0.1052, "step": 2894 }, { "epoch": 1.5070275897969807, "grad_norm": 0.22392258697887885, "learning_rate": 2.602536643432288e-05, "loss": 0.1041, "step": 2895 }, { "epoch": 1.5075481520041645, "grad_norm": 0.24950950930317453, "learning_rate": 2.601132803903389e-05, "loss": 0.1097, "step": 2896 }, { "epoch": 1.5080687142113482, "grad_norm": 0.22767870898021633, "learning_rate": 2.599728932432044e-05, "loss": 0.1049, "step": 2897 }, { "epoch": 1.508589276418532, "grad_norm": 0.23533974509856762, "learning_rate": 2.5983250294616618e-05, "loss": 0.108, "step": 2898 }, { "epoch": 1.5091098386257158, "grad_norm": 0.24447947730787556, "learning_rate": 2.596921095435659e-05, "loss": 0.1109, "step": 2899 }, { "epoch": 1.5096304008328996, "grad_norm": 0.24152156647479783, "learning_rate": 2.595517130797464e-05, "loss": 0.1164, "step": 2900 }, { "epoch": 1.5101509630400833, "grad_norm": 0.23201234882111582, "learning_rate": 2.5941131359905146e-05, "loss": 0.1057, "step": 2901 }, { "epoch": 1.5106715252472669, "grad_norm": 0.22589534043240497, "learning_rate": 2.5927091114582565e-05, "loss": 0.1099, "step": 2902 }, { "epoch": 1.5111920874544507, "grad_norm": 0.22101671654491578, "learning_rate": 2.5913050576441477e-05, "loss": 0.109, "step": 2903 }, { "epoch": 1.5117126496616344, "grad_norm": 0.2186466588558665, "learning_rate": 2.589900974991652e-05, "loss": 0.1088, "step": 2904 }, { "epoch": 1.5122332118688182, "grad_norm": 0.22498932105359454, "learning_rate": 2.588496863944244e-05, "loss": 0.1034, "step": 2905 }, { "epoch": 1.512753774076002, "grad_norm": 0.21166235287356577, "learning_rate": 2.5870927249454097e-05, "loss": 0.1037, "step": 2906 }, { "epoch": 1.5132743362831858, "grad_norm": 0.22559742909440408, "learning_rate": 2.5856885584386393e-05, "loss": 0.1045, "step": 2907 }, { "epoch": 1.5137948984903695, "grad_norm": 0.22659709971519004, "learning_rate": 2.584284364867435e-05, "loss": 0.106, "step": 2908 }, { "epoch": 1.5143154606975533, "grad_norm": 0.24273341457463493, "learning_rate": 2.582880144675305e-05, "loss": 0.1155, "step": 2909 }, { "epoch": 1.514836022904737, "grad_norm": 0.23888844864219327, "learning_rate": 2.5814758983057684e-05, "loss": 0.1058, "step": 2910 }, { "epoch": 1.5153565851119208, "grad_norm": 0.24063817042473679, "learning_rate": 2.5800716262023515e-05, "loss": 0.1078, "step": 2911 }, { "epoch": 1.5158771473191046, "grad_norm": 0.2403037315106119, "learning_rate": 2.5786673288085898e-05, "loss": 0.1127, "step": 2912 }, { "epoch": 1.5163977095262884, "grad_norm": 0.2273826223314166, "learning_rate": 2.577263006568025e-05, "loss": 0.1064, "step": 2913 }, { "epoch": 1.5169182717334722, "grad_norm": 0.21909822724724917, "learning_rate": 2.5758586599242057e-05, "loss": 0.1036, "step": 2914 }, { "epoch": 1.517438833940656, "grad_norm": 0.23969847647923984, "learning_rate": 2.5744542893206924e-05, "loss": 0.1106, "step": 2915 }, { "epoch": 1.5179593961478397, "grad_norm": 0.24016614079483722, "learning_rate": 2.5730498952010502e-05, "loss": 0.1074, "step": 2916 }, { "epoch": 1.5184799583550235, "grad_norm": 0.2308633624750452, "learning_rate": 2.5716454780088512e-05, "loss": 0.1115, "step": 2917 }, { "epoch": 1.5190005205622072, "grad_norm": 0.23198908006842925, "learning_rate": 2.570241038187675e-05, "loss": 0.1128, "step": 2918 }, { "epoch": 1.519521082769391, "grad_norm": 0.23442644263464943, "learning_rate": 2.5688365761811116e-05, "loss": 0.1049, "step": 2919 }, { "epoch": 1.5200416449765748, "grad_norm": 0.22883414451848424, "learning_rate": 2.5674320924327533e-05, "loss": 0.1097, "step": 2920 }, { "epoch": 1.5205622071837586, "grad_norm": 0.2361125411690794, "learning_rate": 2.566027587386203e-05, "loss": 0.1072, "step": 2921 }, { "epoch": 1.5210827693909423, "grad_norm": 0.23120541766705457, "learning_rate": 2.5646230614850673e-05, "loss": 0.1082, "step": 2922 }, { "epoch": 1.521603331598126, "grad_norm": 0.22388494741212342, "learning_rate": 2.5632185151729616e-05, "loss": 0.1091, "step": 2923 }, { "epoch": 1.5221238938053099, "grad_norm": 0.2260163271545998, "learning_rate": 2.5618139488935072e-05, "loss": 0.1066, "step": 2924 }, { "epoch": 1.5226444560124937, "grad_norm": 0.2365959253566713, "learning_rate": 2.5604093630903307e-05, "loss": 0.1147, "step": 2925 }, { "epoch": 1.5231650182196772, "grad_norm": 0.2195917844320001, "learning_rate": 2.559004758207067e-05, "loss": 0.1016, "step": 2926 }, { "epoch": 1.523685580426861, "grad_norm": 0.22196040442631998, "learning_rate": 2.557600134687354e-05, "loss": 0.1106, "step": 2927 }, { "epoch": 1.5242061426340447, "grad_norm": 0.2249212499365945, "learning_rate": 2.5561954929748382e-05, "loss": 0.1091, "step": 2928 }, { "epoch": 1.5247267048412285, "grad_norm": 0.21665296060051842, "learning_rate": 2.5547908335131704e-05, "loss": 0.1066, "step": 2929 }, { "epoch": 1.5252472670484123, "grad_norm": 0.23396144154716453, "learning_rate": 2.5533861567460077e-05, "loss": 0.1122, "step": 2930 }, { "epoch": 1.525767829255596, "grad_norm": 0.23254614834471804, "learning_rate": 2.5519814631170125e-05, "loss": 0.1099, "step": 2931 }, { "epoch": 1.5262883914627798, "grad_norm": 0.2313903436634836, "learning_rate": 2.550576753069852e-05, "loss": 0.111, "step": 2932 }, { "epoch": 1.5268089536699636, "grad_norm": 0.22589258005003615, "learning_rate": 2.5491720270481994e-05, "loss": 0.1086, "step": 2933 }, { "epoch": 1.5273295158771472, "grad_norm": 0.24350507728473914, "learning_rate": 2.5477672854957325e-05, "loss": 0.1149, "step": 2934 }, { "epoch": 1.527850078084331, "grad_norm": 0.2296799862656636, "learning_rate": 2.5463625288561343e-05, "loss": 0.1095, "step": 2935 }, { "epoch": 1.5283706402915147, "grad_norm": 0.21068213738843436, "learning_rate": 2.5449577575730908e-05, "loss": 0.1037, "step": 2936 }, { "epoch": 1.5288912024986985, "grad_norm": 0.2282417696222234, "learning_rate": 2.5435529720902955e-05, "loss": 0.1078, "step": 2937 }, { "epoch": 1.5294117647058822, "grad_norm": 0.2287731855036949, "learning_rate": 2.5421481728514456e-05, "loss": 0.1077, "step": 2938 }, { "epoch": 1.529932326913066, "grad_norm": 0.24416895094839683, "learning_rate": 2.5407433603002417e-05, "loss": 0.116, "step": 2939 }, { "epoch": 1.5304528891202498, "grad_norm": 0.2254694810188856, "learning_rate": 2.5393385348803877e-05, "loss": 0.1101, "step": 2940 }, { "epoch": 1.5309734513274336, "grad_norm": 0.22885825764719317, "learning_rate": 2.5379336970355938e-05, "loss": 0.1105, "step": 2941 }, { "epoch": 1.5314940135346173, "grad_norm": 0.2240956766046063, "learning_rate": 2.5365288472095734e-05, "loss": 0.1105, "step": 2942 }, { "epoch": 1.532014575741801, "grad_norm": 0.22399611393029464, "learning_rate": 2.5351239858460423e-05, "loss": 0.1116, "step": 2943 }, { "epoch": 1.5325351379489849, "grad_norm": 0.2291513822745778, "learning_rate": 2.5337191133887232e-05, "loss": 0.1106, "step": 2944 }, { "epoch": 1.5330557001561687, "grad_norm": 0.2278099394652461, "learning_rate": 2.5323142302813384e-05, "loss": 0.1108, "step": 2945 }, { "epoch": 1.5335762623633524, "grad_norm": 0.23495759529423726, "learning_rate": 2.5309093369676158e-05, "loss": 0.1122, "step": 2946 }, { "epoch": 1.5340968245705362, "grad_norm": 0.22433879417057873, "learning_rate": 2.529504433891286e-05, "loss": 0.1094, "step": 2947 }, { "epoch": 1.53461738677772, "grad_norm": 0.2312010873910697, "learning_rate": 2.5280995214960835e-05, "loss": 0.1108, "step": 2948 }, { "epoch": 1.5351379489849037, "grad_norm": 0.22683977448776502, "learning_rate": 2.5266946002257447e-05, "loss": 0.1078, "step": 2949 }, { "epoch": 1.5356585111920875, "grad_norm": 0.24008434305692997, "learning_rate": 2.525289670524008e-05, "loss": 0.1058, "step": 2950 }, { "epoch": 1.5361790733992713, "grad_norm": 0.2276628830953262, "learning_rate": 2.523884732834617e-05, "loss": 0.1156, "step": 2951 }, { "epoch": 1.536699635606455, "grad_norm": 0.23018808698410154, "learning_rate": 2.522479787601315e-05, "loss": 0.1107, "step": 2952 }, { "epoch": 1.5372201978136388, "grad_norm": 0.22488086495321882, "learning_rate": 2.521074835267851e-05, "loss": 0.1084, "step": 2953 }, { "epoch": 1.5377407600208226, "grad_norm": 0.23467533242982103, "learning_rate": 2.519669876277973e-05, "loss": 0.11, "step": 2954 }, { "epoch": 1.5382613222280064, "grad_norm": 0.24014862239225046, "learning_rate": 2.5182649110754324e-05, "loss": 0.117, "step": 2955 }, { "epoch": 1.5387818844351902, "grad_norm": 0.2313151546820494, "learning_rate": 2.5168599401039833e-05, "loss": 0.1118, "step": 2956 }, { "epoch": 1.539302446642374, "grad_norm": 0.24807712232283863, "learning_rate": 2.515454963807381e-05, "loss": 0.1088, "step": 2957 }, { "epoch": 1.5398230088495575, "grad_norm": 0.22300185095946995, "learning_rate": 2.514049982629381e-05, "loss": 0.107, "step": 2958 }, { "epoch": 1.5403435710567412, "grad_norm": 0.24240071957864304, "learning_rate": 2.5126449970137427e-05, "loss": 0.1118, "step": 2959 }, { "epoch": 1.540864133263925, "grad_norm": 0.22785169569105423, "learning_rate": 2.5112400074042264e-05, "loss": 0.1097, "step": 2960 }, { "epoch": 1.5413846954711088, "grad_norm": 0.23607456456416684, "learning_rate": 2.509835014244592e-05, "loss": 0.1047, "step": 2961 }, { "epoch": 1.5419052576782926, "grad_norm": 0.24205109765791188, "learning_rate": 2.5084300179786036e-05, "loss": 0.1086, "step": 2962 }, { "epoch": 1.5424258198854763, "grad_norm": 0.22178597028098188, "learning_rate": 2.5070250190500223e-05, "loss": 0.1079, "step": 2963 }, { "epoch": 1.54294638209266, "grad_norm": 0.24576503225959828, "learning_rate": 2.5056200179026128e-05, "loss": 0.1124, "step": 2964 }, { "epoch": 1.5434669442998439, "grad_norm": 0.219334903872518, "learning_rate": 2.5042150149801408e-05, "loss": 0.1045, "step": 2965 }, { "epoch": 1.5439875065070274, "grad_norm": 0.2253492148046559, "learning_rate": 2.5028100107263714e-05, "loss": 0.1078, "step": 2966 }, { "epoch": 1.5445080687142112, "grad_norm": 0.21980719949962024, "learning_rate": 2.501405005585069e-05, "loss": 0.1101, "step": 2967 }, { "epoch": 1.545028630921395, "grad_norm": 0.22443739038898264, "learning_rate": 2.5e-05, "loss": 0.1075, "step": 2968 }, { "epoch": 1.5455491931285787, "grad_norm": 0.22056246246510713, "learning_rate": 2.4985949944149315e-05, "loss": 0.1076, "step": 2969 }, { "epoch": 1.5460697553357625, "grad_norm": 0.22545188726465185, "learning_rate": 2.4971899892736295e-05, "loss": 0.1095, "step": 2970 }, { "epoch": 1.5465903175429463, "grad_norm": 0.2127466187917586, "learning_rate": 2.4957849850198588e-05, "loss": 0.1023, "step": 2971 }, { "epoch": 1.54711087975013, "grad_norm": 0.24537811434768622, "learning_rate": 2.494379982097387e-05, "loss": 0.112, "step": 2972 }, { "epoch": 1.5476314419573138, "grad_norm": 0.23557533614284934, "learning_rate": 2.4929749809499786e-05, "loss": 0.1109, "step": 2973 }, { "epoch": 1.5481520041644976, "grad_norm": 0.23075351228444654, "learning_rate": 2.4915699820213973e-05, "loss": 0.1092, "step": 2974 }, { "epoch": 1.5486725663716814, "grad_norm": 0.23468413572684516, "learning_rate": 2.4901649857554082e-05, "loss": 0.1106, "step": 2975 }, { "epoch": 1.5491931285788652, "grad_norm": 0.2282770092156467, "learning_rate": 2.488759992595774e-05, "loss": 0.1055, "step": 2976 }, { "epoch": 1.549713690786049, "grad_norm": 0.2344915741816126, "learning_rate": 2.487355002986258e-05, "loss": 0.1105, "step": 2977 }, { "epoch": 1.5502342529932327, "grad_norm": 0.23092806515292114, "learning_rate": 2.4859500173706195e-05, "loss": 0.1101, "step": 2978 }, { "epoch": 1.5507548152004165, "grad_norm": 0.25062475394919426, "learning_rate": 2.48454503619262e-05, "loss": 0.1087, "step": 2979 }, { "epoch": 1.5512753774076002, "grad_norm": 0.22139929458785265, "learning_rate": 2.4831400598960162e-05, "loss": 0.1072, "step": 2980 }, { "epoch": 1.551795939614784, "grad_norm": 0.24617079179057558, "learning_rate": 2.4817350889245675e-05, "loss": 0.1092, "step": 2981 }, { "epoch": 1.5523165018219678, "grad_norm": 0.23491552403906255, "learning_rate": 2.4803301237220277e-05, "loss": 0.1052, "step": 2982 }, { "epoch": 1.5528370640291516, "grad_norm": 0.22975191075816628, "learning_rate": 2.4789251647321497e-05, "loss": 0.1085, "step": 2983 }, { "epoch": 1.5533576262363353, "grad_norm": 0.23209505246845444, "learning_rate": 2.4775202123986855e-05, "loss": 0.1108, "step": 2984 }, { "epoch": 1.553878188443519, "grad_norm": 0.22191390499400956, "learning_rate": 2.4761152671653835e-05, "loss": 0.1046, "step": 2985 }, { "epoch": 1.5543987506507029, "grad_norm": 0.22032710409929804, "learning_rate": 2.4747103294759928e-05, "loss": 0.1037, "step": 2986 }, { "epoch": 1.5549193128578866, "grad_norm": 0.22728163589619113, "learning_rate": 2.4733053997742562e-05, "loss": 0.1102, "step": 2987 }, { "epoch": 1.5554398750650704, "grad_norm": 0.22954424667321993, "learning_rate": 2.4719004785039168e-05, "loss": 0.1086, "step": 2988 }, { "epoch": 1.5559604372722542, "grad_norm": 0.22332893329361106, "learning_rate": 2.4704955661087137e-05, "loss": 0.1095, "step": 2989 }, { "epoch": 1.5564809994794377, "grad_norm": 0.2228519433822219, "learning_rate": 2.4690906630323844e-05, "loss": 0.1113, "step": 2990 }, { "epoch": 1.5570015616866215, "grad_norm": 0.22383437614333498, "learning_rate": 2.4676857697186625e-05, "loss": 0.1107, "step": 2991 }, { "epoch": 1.5575221238938053, "grad_norm": 0.2230728740150953, "learning_rate": 2.4662808866112773e-05, "loss": 0.1112, "step": 2992 }, { "epoch": 1.558042686100989, "grad_norm": 0.21685734122134906, "learning_rate": 2.464876014153958e-05, "loss": 0.1088, "step": 2993 }, { "epoch": 1.5585632483081728, "grad_norm": 0.2203672898784778, "learning_rate": 2.4634711527904272e-05, "loss": 0.1081, "step": 2994 }, { "epoch": 1.5590838105153566, "grad_norm": 0.2197549868768869, "learning_rate": 2.4620663029644068e-05, "loss": 0.1042, "step": 2995 }, { "epoch": 1.5596043727225404, "grad_norm": 0.22043617116963546, "learning_rate": 2.460661465119613e-05, "loss": 0.1069, "step": 2996 }, { "epoch": 1.5601249349297241, "grad_norm": 0.23842553602217398, "learning_rate": 2.459256639699759e-05, "loss": 0.1137, "step": 2997 }, { "epoch": 1.5606454971369077, "grad_norm": 0.21377025661406016, "learning_rate": 2.457851827148554e-05, "loss": 0.104, "step": 2998 }, { "epoch": 1.5611660593440915, "grad_norm": 0.2274138298267601, "learning_rate": 2.456447027909704e-05, "loss": 0.1097, "step": 2999 }, { "epoch": 1.5616866215512752, "grad_norm": 0.22165119897394892, "learning_rate": 2.45504224242691e-05, "loss": 0.1094, "step": 3000 }, { "epoch": 1.562207183758459, "grad_norm": 0.21490549783538904, "learning_rate": 2.453637471143867e-05, "loss": 0.1125, "step": 3001 }, { "epoch": 1.5627277459656428, "grad_norm": 0.21227852422993695, "learning_rate": 2.4522327145042684e-05, "loss": 0.1069, "step": 3002 }, { "epoch": 1.5632483081728266, "grad_norm": 0.21598368023419817, "learning_rate": 2.450827972951801e-05, "loss": 0.1067, "step": 3003 }, { "epoch": 1.5637688703800103, "grad_norm": 0.21825295793980276, "learning_rate": 2.4494232469301485e-05, "loss": 0.1071, "step": 3004 }, { "epoch": 1.564289432587194, "grad_norm": 0.22548740973175538, "learning_rate": 2.4480185368829877e-05, "loss": 0.1129, "step": 3005 }, { "epoch": 1.5648099947943779, "grad_norm": 0.22232346176457024, "learning_rate": 2.446613843253993e-05, "loss": 0.1067, "step": 3006 }, { "epoch": 1.5653305570015617, "grad_norm": 0.24009122736060118, "learning_rate": 2.4452091664868298e-05, "loss": 0.111, "step": 3007 }, { "epoch": 1.5658511192087454, "grad_norm": 0.2243097388008218, "learning_rate": 2.4438045070251624e-05, "loss": 0.1036, "step": 3008 }, { "epoch": 1.5663716814159292, "grad_norm": 0.23190645268050183, "learning_rate": 2.4423998653126472e-05, "loss": 0.1124, "step": 3009 }, { "epoch": 1.566892243623113, "grad_norm": 0.3486307228352415, "learning_rate": 2.440995241792934e-05, "loss": 0.1043, "step": 3010 }, { "epoch": 1.5674128058302967, "grad_norm": 0.23470724899809928, "learning_rate": 2.43959063690967e-05, "loss": 0.1075, "step": 3011 }, { "epoch": 1.5679333680374805, "grad_norm": 0.23329367665913123, "learning_rate": 2.4381860511064933e-05, "loss": 0.1139, "step": 3012 }, { "epoch": 1.5684539302446643, "grad_norm": 0.23185338653990387, "learning_rate": 2.4367814848270387e-05, "loss": 0.1082, "step": 3013 }, { "epoch": 1.568974492451848, "grad_norm": 0.2259910671539203, "learning_rate": 2.435376938514933e-05, "loss": 0.1114, "step": 3014 }, { "epoch": 1.5694950546590318, "grad_norm": 0.2427562483119206, "learning_rate": 2.4339724126137974e-05, "loss": 0.1119, "step": 3015 }, { "epoch": 1.5700156168662156, "grad_norm": 0.25899173179638624, "learning_rate": 2.4325679075672462e-05, "loss": 0.116, "step": 3016 }, { "epoch": 1.5705361790733994, "grad_norm": 0.22878997419672623, "learning_rate": 2.431163423818889e-05, "loss": 0.1085, "step": 3017 }, { "epoch": 1.5710567412805831, "grad_norm": 0.2384972240792906, "learning_rate": 2.4297589618123258e-05, "loss": 0.1147, "step": 3018 }, { "epoch": 1.571577303487767, "grad_norm": 0.22510819520011185, "learning_rate": 2.4283545219911503e-05, "loss": 0.1069, "step": 3019 }, { "epoch": 1.5720978656949507, "grad_norm": 0.2230992006833652, "learning_rate": 2.4269501047989514e-05, "loss": 0.1089, "step": 3020 }, { "epoch": 1.5726184279021345, "grad_norm": 0.22853011702632323, "learning_rate": 2.425545710679308e-05, "loss": 0.1091, "step": 3021 }, { "epoch": 1.573138990109318, "grad_norm": 0.2242485692166422, "learning_rate": 2.424141340075795e-05, "loss": 0.1049, "step": 3022 }, { "epoch": 1.5736595523165018, "grad_norm": 0.2322211744480236, "learning_rate": 2.422736993431976e-05, "loss": 0.1125, "step": 3023 }, { "epoch": 1.5741801145236856, "grad_norm": 0.2076629175930266, "learning_rate": 2.4213326711914108e-05, "loss": 0.102, "step": 3024 }, { "epoch": 1.5747006767308693, "grad_norm": 0.2217478061639669, "learning_rate": 2.419928373797648e-05, "loss": 0.1059, "step": 3025 }, { "epoch": 1.575221238938053, "grad_norm": 0.22413761407508365, "learning_rate": 2.418524101694232e-05, "loss": 0.1088, "step": 3026 }, { "epoch": 1.5757418011452369, "grad_norm": 0.2261030701823896, "learning_rate": 2.4171198553246967e-05, "loss": 0.113, "step": 3027 }, { "epoch": 1.5762623633524206, "grad_norm": 0.24473343847561574, "learning_rate": 2.4157156351325668e-05, "loss": 0.1062, "step": 3028 }, { "epoch": 1.5767829255596044, "grad_norm": 0.22190551778893397, "learning_rate": 2.414311441561362e-05, "loss": 0.1086, "step": 3029 }, { "epoch": 1.577303487766788, "grad_norm": 0.23472397117142715, "learning_rate": 2.4129072750545912e-05, "loss": 0.1137, "step": 3030 }, { "epoch": 1.5778240499739717, "grad_norm": 0.22344372474806895, "learning_rate": 2.4115031360557562e-05, "loss": 0.1087, "step": 3031 }, { "epoch": 1.5783446121811555, "grad_norm": 0.25965872299574905, "learning_rate": 2.4100990250083487e-05, "loss": 0.1139, "step": 3032 }, { "epoch": 1.5788651743883393, "grad_norm": 0.22568577090311404, "learning_rate": 2.4086949423558526e-05, "loss": 0.1098, "step": 3033 }, { "epoch": 1.579385736595523, "grad_norm": 0.21814595333115117, "learning_rate": 2.4072908885417438e-05, "loss": 0.1084, "step": 3034 }, { "epoch": 1.5799062988027068, "grad_norm": 0.22292260467366531, "learning_rate": 2.4058868640094857e-05, "loss": 0.1087, "step": 3035 }, { "epoch": 1.5804268610098906, "grad_norm": 0.22464387093431798, "learning_rate": 2.404482869202537e-05, "loss": 0.1043, "step": 3036 }, { "epoch": 1.5809474232170744, "grad_norm": 0.22574493698081854, "learning_rate": 2.4030789045643418e-05, "loss": 0.1046, "step": 3037 }, { "epoch": 1.5814679854242581, "grad_norm": 0.23794985547646166, "learning_rate": 2.401674970538339e-05, "loss": 0.1092, "step": 3038 }, { "epoch": 1.581988547631442, "grad_norm": 0.23061947403120528, "learning_rate": 2.4002710675679565e-05, "loss": 0.1083, "step": 3039 }, { "epoch": 1.5825091098386257, "grad_norm": 0.3114813728434248, "learning_rate": 2.3988671960966113e-05, "loss": 0.1135, "step": 3040 }, { "epoch": 1.5830296720458095, "grad_norm": 0.24350087448310825, "learning_rate": 2.3974633565677126e-05, "loss": 0.1126, "step": 3041 }, { "epoch": 1.5835502342529932, "grad_norm": 0.23325201805390236, "learning_rate": 2.3960595494246573e-05, "loss": 0.1088, "step": 3042 }, { "epoch": 1.584070796460177, "grad_norm": 0.22534337357637105, "learning_rate": 2.394655775110833e-05, "loss": 0.1109, "step": 3043 }, { "epoch": 1.5845913586673608, "grad_norm": 0.2186403250472862, "learning_rate": 2.393252034069617e-05, "loss": 0.1078, "step": 3044 }, { "epoch": 1.5851119208745446, "grad_norm": 0.23893857707748875, "learning_rate": 2.3918483267443777e-05, "loss": 0.1096, "step": 3045 }, { "epoch": 1.5856324830817283, "grad_norm": 0.2296564204097977, "learning_rate": 2.3904446535784687e-05, "loss": 0.1117, "step": 3046 }, { "epoch": 1.586153045288912, "grad_norm": 0.2256094183111909, "learning_rate": 2.3890410150152363e-05, "loss": 0.11, "step": 3047 }, { "epoch": 1.5866736074960959, "grad_norm": 0.2197512896748002, "learning_rate": 2.387637411498015e-05, "loss": 0.1085, "step": 3048 }, { "epoch": 1.5871941697032796, "grad_norm": 0.22684246333884714, "learning_rate": 2.386233843470128e-05, "loss": 0.107, "step": 3049 }, { "epoch": 1.5877147319104634, "grad_norm": 0.22474155235201673, "learning_rate": 2.384830311374888e-05, "loss": 0.1067, "step": 3050 }, { "epoch": 1.5882352941176472, "grad_norm": 0.22213948016741225, "learning_rate": 2.3834268156555943e-05, "loss": 0.1066, "step": 3051 }, { "epoch": 1.588755856324831, "grad_norm": 0.234760326851895, "learning_rate": 2.3820233567555378e-05, "loss": 0.1059, "step": 3052 }, { "epoch": 1.5892764185320147, "grad_norm": 0.22463049283556413, "learning_rate": 2.3806199351179948e-05, "loss": 0.1119, "step": 3053 }, { "epoch": 1.5897969807391983, "grad_norm": 0.2233971474985051, "learning_rate": 2.379216551186233e-05, "loss": 0.1039, "step": 3054 }, { "epoch": 1.590317542946382, "grad_norm": 0.23716623599975512, "learning_rate": 2.3778132054035043e-05, "loss": 0.1077, "step": 3055 }, { "epoch": 1.5908381051535658, "grad_norm": 0.22256838771775073, "learning_rate": 2.3764098982130507e-05, "loss": 0.1058, "step": 3056 }, { "epoch": 1.5913586673607496, "grad_norm": 0.22537466848570817, "learning_rate": 2.3750066300581037e-05, "loss": 0.1039, "step": 3057 }, { "epoch": 1.5918792295679334, "grad_norm": 0.2796215036036369, "learning_rate": 2.3736034013818788e-05, "loss": 0.1144, "step": 3058 }, { "epoch": 1.5923997917751171, "grad_norm": 0.23128267449784587, "learning_rate": 2.3722002126275824e-05, "loss": 0.1105, "step": 3059 }, { "epoch": 1.592920353982301, "grad_norm": 0.22108759830456698, "learning_rate": 2.3707970642384056e-05, "loss": 0.1059, "step": 3060 }, { "epoch": 1.5934409161894847, "grad_norm": 0.23163491105689502, "learning_rate": 2.3693939566575287e-05, "loss": 0.1085, "step": 3061 }, { "epoch": 1.5939614783966682, "grad_norm": 0.226474109078299, "learning_rate": 2.3679908903281182e-05, "loss": 0.1108, "step": 3062 }, { "epoch": 1.594482040603852, "grad_norm": 0.2360898550334602, "learning_rate": 2.366587865693328e-05, "loss": 0.1046, "step": 3063 }, { "epoch": 1.5950026028110358, "grad_norm": 0.21596317159749592, "learning_rate": 2.3651848831962985e-05, "loss": 0.1065, "step": 3064 }, { "epoch": 1.5955231650182196, "grad_norm": 0.2340159211967676, "learning_rate": 2.363781943280156e-05, "loss": 0.1064, "step": 3065 }, { "epoch": 1.5960437272254033, "grad_norm": 0.23811590741012165, "learning_rate": 2.3623790463880153e-05, "loss": 0.1101, "step": 3066 }, { "epoch": 1.596564289432587, "grad_norm": 0.24057517611837248, "learning_rate": 2.3609761929629755e-05, "loss": 0.1038, "step": 3067 }, { "epoch": 1.5970848516397709, "grad_norm": 0.22888857683722608, "learning_rate": 2.3595733834481236e-05, "loss": 0.1093, "step": 3068 }, { "epoch": 1.5976054138469546, "grad_norm": 0.23000911098983517, "learning_rate": 2.3581706182865324e-05, "loss": 0.1133, "step": 3069 }, { "epoch": 1.5981259760541384, "grad_norm": 0.23001119374170786, "learning_rate": 2.3567678979212602e-05, "loss": 0.1064, "step": 3070 }, { "epoch": 1.5986465382613222, "grad_norm": 0.22412995503191058, "learning_rate": 2.3553652227953507e-05, "loss": 0.1138, "step": 3071 }, { "epoch": 1.599167100468506, "grad_norm": 0.23039758715555397, "learning_rate": 2.3539625933518354e-05, "loss": 0.1095, "step": 3072 }, { "epoch": 1.5996876626756897, "grad_norm": 0.2356318998221753, "learning_rate": 2.3525600100337294e-05, "loss": 0.1139, "step": 3073 }, { "epoch": 1.6002082248828735, "grad_norm": 0.21699411021910314, "learning_rate": 2.3511574732840325e-05, "loss": 0.1024, "step": 3074 }, { "epoch": 1.6007287870900573, "grad_norm": 0.2185662528539939, "learning_rate": 2.3497549835457328e-05, "loss": 0.1085, "step": 3075 }, { "epoch": 1.601249349297241, "grad_norm": 0.22922474819533187, "learning_rate": 2.3483525412618e-05, "loss": 0.1097, "step": 3076 }, { "epoch": 1.6017699115044248, "grad_norm": 0.21577899387549374, "learning_rate": 2.3469501468751928e-05, "loss": 0.1074, "step": 3077 }, { "epoch": 1.6022904737116086, "grad_norm": 0.22051359137252985, "learning_rate": 2.3455478008288508e-05, "loss": 0.1039, "step": 3078 }, { "epoch": 1.6028110359187924, "grad_norm": 0.23827856861336888, "learning_rate": 2.3441455035657013e-05, "loss": 0.1068, "step": 3079 }, { "epoch": 1.6033315981259761, "grad_norm": 0.2338622329393293, "learning_rate": 2.3427432555286543e-05, "loss": 0.1072, "step": 3080 }, { "epoch": 1.60385216033316, "grad_norm": 0.23740858644971805, "learning_rate": 2.341341057160606e-05, "loss": 0.1114, "step": 3081 }, { "epoch": 1.6043727225403437, "grad_norm": 0.23132959270597508, "learning_rate": 2.3399389089044354e-05, "loss": 0.1058, "step": 3082 }, { "epoch": 1.6048932847475275, "grad_norm": 0.253632972193079, "learning_rate": 2.3385368112030052e-05, "loss": 0.1089, "step": 3083 }, { "epoch": 1.6054138469547112, "grad_norm": 0.2249408980909488, "learning_rate": 2.337134764499164e-05, "loss": 0.1059, "step": 3084 }, { "epoch": 1.605934409161895, "grad_norm": 0.2220495865666503, "learning_rate": 2.335732769235743e-05, "loss": 0.105, "step": 3085 }, { "epoch": 1.6064549713690786, "grad_norm": 0.2442265679104099, "learning_rate": 2.3343308258555587e-05, "loss": 0.1086, "step": 3086 }, { "epoch": 1.6069755335762623, "grad_norm": 0.2336403991069815, "learning_rate": 2.332928934801408e-05, "loss": 0.1075, "step": 3087 }, { "epoch": 1.607496095783446, "grad_norm": 0.22276349197581477, "learning_rate": 2.3315270965160753e-05, "loss": 0.1055, "step": 3088 }, { "epoch": 1.6080166579906299, "grad_norm": 0.2280395730745695, "learning_rate": 2.330125311442325e-05, "loss": 0.1044, "step": 3089 }, { "epoch": 1.6085372201978136, "grad_norm": 0.23389298695008165, "learning_rate": 2.3287235800229073e-05, "loss": 0.1047, "step": 3090 }, { "epoch": 1.6090577824049974, "grad_norm": 0.22779890717062137, "learning_rate": 2.327321902700554e-05, "loss": 0.1092, "step": 3091 }, { "epoch": 1.6095783446121812, "grad_norm": 0.21949265647763663, "learning_rate": 2.3259202799179785e-05, "loss": 0.1036, "step": 3092 }, { "epoch": 1.610098906819365, "grad_norm": 0.22277867696874984, "learning_rate": 2.3245187121178804e-05, "loss": 0.1106, "step": 3093 }, { "epoch": 1.6106194690265485, "grad_norm": 0.23819505127712554, "learning_rate": 2.323117199742938e-05, "loss": 0.1102, "step": 3094 }, { "epoch": 1.6111400312337323, "grad_norm": 0.22647009120490577, "learning_rate": 2.3217157432358164e-05, "loss": 0.1048, "step": 3095 }, { "epoch": 1.611660593440916, "grad_norm": 0.23823079455303328, "learning_rate": 2.320314343039159e-05, "loss": 0.1067, "step": 3096 }, { "epoch": 1.6121811556480998, "grad_norm": 0.22520992501315207, "learning_rate": 2.3189129995955943e-05, "loss": 0.1053, "step": 3097 }, { "epoch": 1.6127017178552836, "grad_norm": 0.2258137397120401, "learning_rate": 2.3175117133477313e-05, "loss": 0.1053, "step": 3098 }, { "epoch": 1.6132222800624674, "grad_norm": 0.2271783410129132, "learning_rate": 2.3161104847381608e-05, "loss": 0.1102, "step": 3099 }, { "epoch": 1.6137428422696511, "grad_norm": 0.2271396794206383, "learning_rate": 2.314709314209459e-05, "loss": 0.1025, "step": 3100 }, { "epoch": 1.614263404476835, "grad_norm": 0.2512629230037695, "learning_rate": 2.3133082022041767e-05, "loss": 0.1088, "step": 3101 }, { "epoch": 1.6147839666840187, "grad_norm": 0.23119313550176498, "learning_rate": 2.3119071491648525e-05, "loss": 0.108, "step": 3102 }, { "epoch": 1.6153045288912025, "grad_norm": 0.21317276454406853, "learning_rate": 2.310506155534003e-05, "loss": 0.1034, "step": 3103 }, { "epoch": 1.6158250910983862, "grad_norm": 0.24282928727082653, "learning_rate": 2.3091052217541278e-05, "loss": 0.11, "step": 3104 }, { "epoch": 1.61634565330557, "grad_norm": 0.2408607507520557, "learning_rate": 2.307704348267708e-05, "loss": 0.1087, "step": 3105 }, { "epoch": 1.6168662155127538, "grad_norm": 0.23585883743968147, "learning_rate": 2.3063035355172026e-05, "loss": 0.1117, "step": 3106 }, { "epoch": 1.6173867777199376, "grad_norm": 0.23123042557678272, "learning_rate": 2.304902783945056e-05, "loss": 0.1056, "step": 3107 }, { "epoch": 1.6179073399271213, "grad_norm": 0.23648141825819088, "learning_rate": 2.3035020939936876e-05, "loss": 0.1037, "step": 3108 }, { "epoch": 1.618427902134305, "grad_norm": 0.22436906961143638, "learning_rate": 2.302101466105504e-05, "loss": 0.1017, "step": 3109 }, { "epoch": 1.6189484643414889, "grad_norm": 0.21301990592667625, "learning_rate": 2.3007009007228857e-05, "loss": 0.1059, "step": 3110 }, { "epoch": 1.6194690265486726, "grad_norm": 0.23396887540615707, "learning_rate": 2.2993003982881975e-05, "loss": 0.1062, "step": 3111 }, { "epoch": 1.6199895887558564, "grad_norm": 0.2433106836264054, "learning_rate": 2.2978999592437837e-05, "loss": 0.1058, "step": 3112 }, { "epoch": 1.6205101509630402, "grad_norm": 0.2216670932634246, "learning_rate": 2.296499584031967e-05, "loss": 0.1049, "step": 3113 }, { "epoch": 1.621030713170224, "grad_norm": 0.23403580449109418, "learning_rate": 2.2950992730950523e-05, "loss": 0.1047, "step": 3114 }, { "epoch": 1.6215512753774077, "grad_norm": 0.23966172688921225, "learning_rate": 2.293699026875322e-05, "loss": 0.1048, "step": 3115 }, { "epoch": 1.6220718375845915, "grad_norm": 0.22806661382306423, "learning_rate": 2.29229884581504e-05, "loss": 0.1082, "step": 3116 }, { "epoch": 1.6225923997917753, "grad_norm": 0.23681871295661272, "learning_rate": 2.2908987303564474e-05, "loss": 0.1044, "step": 3117 }, { "epoch": 1.6231129619989588, "grad_norm": 0.22656571389726227, "learning_rate": 2.2894986809417676e-05, "loss": 0.1077, "step": 3118 }, { "epoch": 1.6236335242061426, "grad_norm": 0.23062976184034878, "learning_rate": 2.2880986980131996e-05, "loss": 0.1031, "step": 3119 }, { "epoch": 1.6241540864133264, "grad_norm": 0.22553061329604537, "learning_rate": 2.2866987820129234e-05, "loss": 0.1053, "step": 3120 }, { "epoch": 1.6246746486205101, "grad_norm": 0.22883909177019793, "learning_rate": 2.2852989333830988e-05, "loss": 0.1059, "step": 3121 }, { "epoch": 1.625195210827694, "grad_norm": 0.2267238669608568, "learning_rate": 2.2838991525658616e-05, "loss": 0.1092, "step": 3122 }, { "epoch": 1.6257157730348777, "grad_norm": 0.2221538641240812, "learning_rate": 2.282499440003329e-05, "loss": 0.1019, "step": 3123 }, { "epoch": 1.6262363352420615, "grad_norm": 0.23065218500300072, "learning_rate": 2.281099796137594e-05, "loss": 0.1073, "step": 3124 }, { "epoch": 1.6267568974492452, "grad_norm": 0.21528998054447926, "learning_rate": 2.279700221410731e-05, "loss": 0.1038, "step": 3125 }, { "epoch": 1.6272774596564288, "grad_norm": 0.22406127559380892, "learning_rate": 2.2783007162647894e-05, "loss": 0.1092, "step": 3126 }, { "epoch": 1.6277980218636126, "grad_norm": 0.22732856950130653, "learning_rate": 2.2769012811417998e-05, "loss": 0.1043, "step": 3127 }, { "epoch": 1.6283185840707963, "grad_norm": 0.24391940916747334, "learning_rate": 2.275501916483767e-05, "loss": 0.1084, "step": 3128 }, { "epoch": 1.62883914627798, "grad_norm": 0.212950903066082, "learning_rate": 2.274102622732676e-05, "loss": 0.1021, "step": 3129 }, { "epoch": 1.6293597084851639, "grad_norm": 0.22582328484751868, "learning_rate": 2.2727034003304893e-05, "loss": 0.1078, "step": 3130 }, { "epoch": 1.6298802706923476, "grad_norm": 0.23178125241301198, "learning_rate": 2.2713042497191456e-05, "loss": 0.1052, "step": 3131 }, { "epoch": 1.6304008328995314, "grad_norm": 0.22817672478922343, "learning_rate": 2.2699051713405633e-05, "loss": 0.113, "step": 3132 }, { "epoch": 1.6309213951067152, "grad_norm": 0.21735110824642498, "learning_rate": 2.2685061656366347e-05, "loss": 0.1048, "step": 3133 }, { "epoch": 1.631441957313899, "grad_norm": 0.219691390085283, "learning_rate": 2.2671072330492328e-05, "loss": 0.1063, "step": 3134 }, { "epoch": 1.6319625195210827, "grad_norm": 0.23114194424793824, "learning_rate": 2.2657083740202035e-05, "loss": 0.1096, "step": 3135 }, { "epoch": 1.6324830817282665, "grad_norm": 0.2194312392722462, "learning_rate": 2.264309588991374e-05, "loss": 0.1066, "step": 3136 }, { "epoch": 1.6330036439354503, "grad_norm": 0.21507036864686763, "learning_rate": 2.2629108784045438e-05, "loss": 0.1077, "step": 3137 }, { "epoch": 1.633524206142634, "grad_norm": 0.22777523976635303, "learning_rate": 2.2615122427014913e-05, "loss": 0.1086, "step": 3138 }, { "epoch": 1.6340447683498178, "grad_norm": 0.22055091654855918, "learning_rate": 2.260113682323971e-05, "loss": 0.1035, "step": 3139 }, { "epoch": 1.6345653305570016, "grad_norm": 0.2238767964565613, "learning_rate": 2.2587151977137122e-05, "loss": 0.1077, "step": 3140 }, { "epoch": 1.6350858927641854, "grad_norm": 0.22760165259373993, "learning_rate": 2.2573167893124237e-05, "loss": 0.1128, "step": 3141 }, { "epoch": 1.6356064549713691, "grad_norm": 0.22042911403521775, "learning_rate": 2.2559184575617857e-05, "loss": 0.1073, "step": 3142 }, { "epoch": 1.636127017178553, "grad_norm": 0.2278962819089735, "learning_rate": 2.254520202903458e-05, "loss": 0.107, "step": 3143 }, { "epoch": 1.6366475793857367, "grad_norm": 0.22694087159169066, "learning_rate": 2.253122025779073e-05, "loss": 0.1092, "step": 3144 }, { "epoch": 1.6371681415929205, "grad_norm": 0.22199970972024982, "learning_rate": 2.2517239266302424e-05, "loss": 0.1088, "step": 3145 }, { "epoch": 1.6376887038001042, "grad_norm": 0.22587462774553319, "learning_rate": 2.2503259058985487e-05, "loss": 0.1039, "step": 3146 }, { "epoch": 1.638209266007288, "grad_norm": 0.22588919883230146, "learning_rate": 2.2489279640255526e-05, "loss": 0.1018, "step": 3147 }, { "epoch": 1.6387298282144718, "grad_norm": 0.2361781102822983, "learning_rate": 2.2475301014527897e-05, "loss": 0.1078, "step": 3148 }, { "epoch": 1.6392503904216555, "grad_norm": 0.22178776083606402, "learning_rate": 2.246132318621769e-05, "loss": 0.1065, "step": 3149 }, { "epoch": 1.639770952628839, "grad_norm": 0.2301808192860147, "learning_rate": 2.2447346159739772e-05, "loss": 0.1069, "step": 3150 }, { "epoch": 1.6402915148360229, "grad_norm": 0.23564349495869355, "learning_rate": 2.2433369939508718e-05, "loss": 0.1132, "step": 3151 }, { "epoch": 1.6408120770432066, "grad_norm": 0.2142534971466308, "learning_rate": 2.241939452993888e-05, "loss": 0.1035, "step": 3152 }, { "epoch": 1.6413326392503904, "grad_norm": 0.2269015837271862, "learning_rate": 2.2405419935444338e-05, "loss": 0.1146, "step": 3153 }, { "epoch": 1.6418532014575742, "grad_norm": 0.23097131531972812, "learning_rate": 2.2391446160438933e-05, "loss": 0.1092, "step": 3154 }, { "epoch": 1.642373763664758, "grad_norm": 0.22445851872935704, "learning_rate": 2.2377473209336214e-05, "loss": 0.1059, "step": 3155 }, { "epoch": 1.6428943258719417, "grad_norm": 0.21849931770987602, "learning_rate": 2.2363501086549498e-05, "loss": 0.1067, "step": 3156 }, { "epoch": 1.6434148880791255, "grad_norm": 0.22288897541665642, "learning_rate": 2.234952979649183e-05, "loss": 0.1072, "step": 3157 }, { "epoch": 1.643935450286309, "grad_norm": 0.23784579556458776, "learning_rate": 2.233555934357599e-05, "loss": 0.1146, "step": 3158 }, { "epoch": 1.6444560124934928, "grad_norm": 0.22729963389585045, "learning_rate": 2.232158973221451e-05, "loss": 0.1048, "step": 3159 }, { "epoch": 1.6449765747006766, "grad_norm": 0.22444048797078522, "learning_rate": 2.230762096681963e-05, "loss": 0.1032, "step": 3160 }, { "epoch": 1.6454971369078604, "grad_norm": 0.2274706569746683, "learning_rate": 2.2293653051803344e-05, "loss": 0.1125, "step": 3161 }, { "epoch": 1.6460176991150441, "grad_norm": 0.2215985773170227, "learning_rate": 2.2279685991577363e-05, "loss": 0.1106, "step": 3162 }, { "epoch": 1.646538261322228, "grad_norm": 0.2248236336469974, "learning_rate": 2.2265719790553147e-05, "loss": 0.1056, "step": 3163 }, { "epoch": 1.6470588235294117, "grad_norm": 0.21576271667103167, "learning_rate": 2.225175445314186e-05, "loss": 0.1046, "step": 3164 }, { "epoch": 1.6475793857365955, "grad_norm": 0.23144058603703913, "learning_rate": 2.2237789983754405e-05, "loss": 0.1113, "step": 3165 }, { "epoch": 1.6480999479437792, "grad_norm": 0.2143838630761341, "learning_rate": 2.2223826386801417e-05, "loss": 0.1041, "step": 3166 }, { "epoch": 1.648620510150963, "grad_norm": 0.2199889181336657, "learning_rate": 2.2209863666693244e-05, "loss": 0.1041, "step": 3167 }, { "epoch": 1.6491410723581468, "grad_norm": 0.23070227108778804, "learning_rate": 2.2195901827839965e-05, "loss": 0.1092, "step": 3168 }, { "epoch": 1.6496616345653305, "grad_norm": 0.21963888599506054, "learning_rate": 2.2181940874651393e-05, "loss": 0.1067, "step": 3169 }, { "epoch": 1.6501821967725143, "grad_norm": 0.22331117871742942, "learning_rate": 2.216798081153702e-05, "loss": 0.107, "step": 3170 }, { "epoch": 1.650702758979698, "grad_norm": 0.23747544142009083, "learning_rate": 2.2154021642906107e-05, "loss": 0.1125, "step": 3171 }, { "epoch": 1.6512233211868819, "grad_norm": 0.2304050364200822, "learning_rate": 2.2140063373167606e-05, "loss": 0.1068, "step": 3172 }, { "epoch": 1.6517438833940656, "grad_norm": 0.23358412449420016, "learning_rate": 2.212610600673017e-05, "loss": 0.1059, "step": 3173 }, { "epoch": 1.6522644456012494, "grad_norm": 0.21636641417173472, "learning_rate": 2.21121495480022e-05, "loss": 0.1055, "step": 3174 }, { "epoch": 1.6527850078084332, "grad_norm": 0.22691961656100734, "learning_rate": 2.2098194001391785e-05, "loss": 0.1039, "step": 3175 }, { "epoch": 1.653305570015617, "grad_norm": 0.23081013769275266, "learning_rate": 2.2084239371306753e-05, "loss": 0.1108, "step": 3176 }, { "epoch": 1.6538261322228007, "grad_norm": 0.24275473044090903, "learning_rate": 2.2070285662154607e-05, "loss": 0.1108, "step": 3177 }, { "epoch": 1.6543466944299845, "grad_norm": 0.2161343954274846, "learning_rate": 2.2056332878342594e-05, "loss": 0.106, "step": 3178 }, { "epoch": 1.6548672566371683, "grad_norm": 0.22720407589672256, "learning_rate": 2.2042381024277637e-05, "loss": 0.1044, "step": 3179 }, { "epoch": 1.655387818844352, "grad_norm": 0.23757875519411803, "learning_rate": 2.202843010436639e-05, "loss": 0.1122, "step": 3180 }, { "epoch": 1.6559083810515358, "grad_norm": 0.209322137149078, "learning_rate": 2.2014480123015214e-05, "loss": 0.1052, "step": 3181 }, { "epoch": 1.6564289432587194, "grad_norm": 0.2238782072859011, "learning_rate": 2.2000531084630137e-05, "loss": 0.1092, "step": 3182 }, { "epoch": 1.6569495054659031, "grad_norm": 0.21411441676663537, "learning_rate": 2.1986582993616926e-05, "loss": 0.0983, "step": 3183 }, { "epoch": 1.657470067673087, "grad_norm": 0.23607167726054, "learning_rate": 2.1972635854381042e-05, "loss": 0.1062, "step": 3184 }, { "epoch": 1.6579906298802707, "grad_norm": 0.23634245749199465, "learning_rate": 2.1958689671327635e-05, "loss": 0.1077, "step": 3185 }, { "epoch": 1.6585111920874545, "grad_norm": 0.26476311947101455, "learning_rate": 2.1944744448861557e-05, "loss": 0.1029, "step": 3186 }, { "epoch": 1.6590317542946382, "grad_norm": 0.2225889940109205, "learning_rate": 2.1930800191387366e-05, "loss": 0.1019, "step": 3187 }, { "epoch": 1.659552316501822, "grad_norm": 0.2123029031111446, "learning_rate": 2.1916856903309298e-05, "loss": 0.1013, "step": 3188 }, { "epoch": 1.6600728787090058, "grad_norm": 0.23091095057929104, "learning_rate": 2.19029145890313e-05, "loss": 0.1065, "step": 3189 }, { "epoch": 1.6605934409161893, "grad_norm": 0.24015311654835608, "learning_rate": 2.188897325295701e-05, "loss": 0.1039, "step": 3190 }, { "epoch": 1.661114003123373, "grad_norm": 0.23370275976758934, "learning_rate": 2.187503289948973e-05, "loss": 0.1104, "step": 3191 }, { "epoch": 1.6616345653305569, "grad_norm": 0.22827631936172066, "learning_rate": 2.1861093533032488e-05, "loss": 0.1033, "step": 3192 }, { "epoch": 1.6621551275377406, "grad_norm": 0.2208651631543092, "learning_rate": 2.1847155157987972e-05, "loss": 0.1046, "step": 3193 }, { "epoch": 1.6626756897449244, "grad_norm": 0.22601221991866668, "learning_rate": 2.1833217778758584e-05, "loss": 0.1077, "step": 3194 }, { "epoch": 1.6631962519521082, "grad_norm": 0.22968672480262192, "learning_rate": 2.1819281399746392e-05, "loss": 0.104, "step": 3195 }, { "epoch": 1.663716814159292, "grad_norm": 0.23140457155651345, "learning_rate": 2.180534602535315e-05, "loss": 0.1068, "step": 3196 }, { "epoch": 1.6642373763664757, "grad_norm": 0.2153723166687647, "learning_rate": 2.17914116599803e-05, "loss": 0.1063, "step": 3197 }, { "epoch": 1.6647579385736595, "grad_norm": 0.23114435518218374, "learning_rate": 2.1777478308028965e-05, "loss": 0.1089, "step": 3198 }, { "epoch": 1.6652785007808433, "grad_norm": 0.22821342024755512, "learning_rate": 2.176354597389995e-05, "loss": 0.1054, "step": 3199 }, { "epoch": 1.665799062988027, "grad_norm": 0.2339992932085678, "learning_rate": 2.1749614661993715e-05, "loss": 0.1113, "step": 3200 }, { "epoch": 1.6663196251952108, "grad_norm": 0.22259182696219096, "learning_rate": 2.1735684376710435e-05, "loss": 0.1044, "step": 3201 }, { "epoch": 1.6668401874023946, "grad_norm": 0.2219044472166739, "learning_rate": 2.1721755122449932e-05, "loss": 0.1077, "step": 3202 }, { "epoch": 1.6673607496095784, "grad_norm": 0.21727491507702684, "learning_rate": 2.1707826903611726e-05, "loss": 0.1028, "step": 3203 }, { "epoch": 1.6678813118167621, "grad_norm": 0.2299535668577538, "learning_rate": 2.169389972459498e-05, "loss": 0.1098, "step": 3204 }, { "epoch": 1.668401874023946, "grad_norm": 0.21563012365258863, "learning_rate": 2.1679973589798564e-05, "loss": 0.103, "step": 3205 }, { "epoch": 1.6689224362311297, "grad_norm": 0.22726073168216962, "learning_rate": 2.166604850362098e-05, "loss": 0.1082, "step": 3206 }, { "epoch": 1.6694429984383135, "grad_norm": 0.23344147957321737, "learning_rate": 2.165212447046043e-05, "loss": 0.11, "step": 3207 }, { "epoch": 1.6699635606454972, "grad_norm": 0.21101470102115977, "learning_rate": 2.163820149471478e-05, "loss": 0.1067, "step": 3208 }, { "epoch": 1.670484122852681, "grad_norm": 0.21852389821603915, "learning_rate": 2.1624279580781525e-05, "loss": 0.1094, "step": 3209 }, { "epoch": 1.6710046850598648, "grad_norm": 0.21706627954133206, "learning_rate": 2.1610358733057882e-05, "loss": 0.1092, "step": 3210 }, { "epoch": 1.6715252472670485, "grad_norm": 0.2260538276253177, "learning_rate": 2.1596438955940682e-05, "loss": 0.1079, "step": 3211 }, { "epoch": 1.6720458094742323, "grad_norm": 0.23132753540493692, "learning_rate": 2.1582520253826454e-05, "loss": 0.1061, "step": 3212 }, { "epoch": 1.672566371681416, "grad_norm": 0.22764404827151757, "learning_rate": 2.156860263111136e-05, "loss": 0.1087, "step": 3213 }, { "epoch": 1.6730869338885996, "grad_norm": 0.23998945954385206, "learning_rate": 2.155468609219125e-05, "loss": 0.1068, "step": 3214 }, { "epoch": 1.6736074960957834, "grad_norm": 0.22270579464295384, "learning_rate": 2.154077064146159e-05, "loss": 0.1031, "step": 3215 }, { "epoch": 1.6741280583029672, "grad_norm": 0.23422116557049794, "learning_rate": 2.152685628331755e-05, "loss": 0.1083, "step": 3216 }, { "epoch": 1.674648620510151, "grad_norm": 0.23808475492769957, "learning_rate": 2.1512943022153926e-05, "loss": 0.1108, "step": 3217 }, { "epoch": 1.6751691827173347, "grad_norm": 0.2336184489419489, "learning_rate": 2.149903086236516e-05, "loss": 0.1072, "step": 3218 }, { "epoch": 1.6756897449245185, "grad_norm": 0.21905361177253538, "learning_rate": 2.1485119808345372e-05, "loss": 0.1037, "step": 3219 }, { "epoch": 1.6762103071317023, "grad_norm": 0.20981047900721037, "learning_rate": 2.147120986448831e-05, "loss": 0.0976, "step": 3220 }, { "epoch": 1.676730869338886, "grad_norm": 0.2545642241580596, "learning_rate": 2.1457301035187397e-05, "loss": 0.1093, "step": 3221 }, { "epoch": 1.6772514315460696, "grad_norm": 0.2218754138460833, "learning_rate": 2.1443393324835675e-05, "loss": 0.1047, "step": 3222 }, { "epoch": 1.6777719937532534, "grad_norm": 0.23088102794993018, "learning_rate": 2.1429486737825854e-05, "loss": 0.1104, "step": 3223 }, { "epoch": 1.6782925559604371, "grad_norm": 0.22005937755272137, "learning_rate": 2.1415581278550275e-05, "loss": 0.1073, "step": 3224 }, { "epoch": 1.678813118167621, "grad_norm": 0.23400677719772744, "learning_rate": 2.140167695140094e-05, "loss": 0.1103, "step": 3225 }, { "epoch": 1.6793336803748047, "grad_norm": 0.2365964192307508, "learning_rate": 2.1387773760769474e-05, "loss": 0.1085, "step": 3226 }, { "epoch": 1.6798542425819885, "grad_norm": 0.21637550071988382, "learning_rate": 2.137387171104715e-05, "loss": 0.107, "step": 3227 }, { "epoch": 1.6803748047891722, "grad_norm": 0.22333862046046274, "learning_rate": 2.1359970806624885e-05, "loss": 0.1084, "step": 3228 }, { "epoch": 1.680895366996356, "grad_norm": 0.23203824461964603, "learning_rate": 2.134607105189323e-05, "loss": 0.1074, "step": 3229 }, { "epoch": 1.6814159292035398, "grad_norm": 0.22765546349056484, "learning_rate": 2.1332172451242378e-05, "loss": 0.1069, "step": 3230 }, { "epoch": 1.6819364914107235, "grad_norm": 0.23666560244713625, "learning_rate": 2.131827500906215e-05, "loss": 0.1034, "step": 3231 }, { "epoch": 1.6824570536179073, "grad_norm": 0.2374292849318095, "learning_rate": 2.1304378729742007e-05, "loss": 0.1068, "step": 3232 }, { "epoch": 1.682977615825091, "grad_norm": 0.2324037821047654, "learning_rate": 2.129048361767104e-05, "loss": 0.1038, "step": 3233 }, { "epoch": 1.6834981780322749, "grad_norm": 0.23887705711017124, "learning_rate": 2.127658967723797e-05, "loss": 0.1116, "step": 3234 }, { "epoch": 1.6840187402394586, "grad_norm": 0.24478758959999747, "learning_rate": 2.126269691283117e-05, "loss": 0.1078, "step": 3235 }, { "epoch": 1.6845393024466424, "grad_norm": 0.24871328706272613, "learning_rate": 2.124880532883859e-05, "loss": 0.1081, "step": 3236 }, { "epoch": 1.6850598646538262, "grad_norm": 0.2516650860777226, "learning_rate": 2.123491492964785e-05, "loss": 0.1085, "step": 3237 }, { "epoch": 1.68558042686101, "grad_norm": 0.2253323571868253, "learning_rate": 2.1221025719646193e-05, "loss": 0.1019, "step": 3238 }, { "epoch": 1.6861009890681937, "grad_norm": 0.22356011592203995, "learning_rate": 2.1207137703220465e-05, "loss": 0.1069, "step": 3239 }, { "epoch": 1.6866215512753775, "grad_norm": 0.22129126827686282, "learning_rate": 2.119325088475716e-05, "loss": 0.1021, "step": 3240 }, { "epoch": 1.6871421134825613, "grad_norm": 0.24363724324274239, "learning_rate": 2.1179365268642374e-05, "loss": 0.1073, "step": 3241 }, { "epoch": 1.687662675689745, "grad_norm": 0.22571502175977035, "learning_rate": 2.1165480859261838e-05, "loss": 0.1086, "step": 3242 }, { "epoch": 1.6881832378969288, "grad_norm": 0.2333938281028589, "learning_rate": 2.1151597661000884e-05, "loss": 0.1116, "step": 3243 }, { "epoch": 1.6887038001041126, "grad_norm": 0.2418535802191957, "learning_rate": 2.1137715678244492e-05, "loss": 0.1122, "step": 3244 }, { "epoch": 1.6892243623112964, "grad_norm": 0.2226092711205871, "learning_rate": 2.1123834915377212e-05, "loss": 0.1077, "step": 3245 }, { "epoch": 1.68974492451848, "grad_norm": 0.2218422154239549, "learning_rate": 2.1109955376783247e-05, "loss": 0.1063, "step": 3246 }, { "epoch": 1.6902654867256637, "grad_norm": 0.22432155018614183, "learning_rate": 2.1096077066846404e-05, "loss": 0.1042, "step": 3247 }, { "epoch": 1.6907860489328475, "grad_norm": 0.23580900748913308, "learning_rate": 2.1082199989950093e-05, "loss": 0.1044, "step": 3248 }, { "epoch": 1.6913066111400312, "grad_norm": 0.22358704155742054, "learning_rate": 2.1068324150477346e-05, "loss": 0.1069, "step": 3249 }, { "epoch": 1.691827173347215, "grad_norm": 0.22829186908041899, "learning_rate": 2.105444955281079e-05, "loss": 0.1048, "step": 3250 }, { "epoch": 1.6923477355543988, "grad_norm": 0.22170478945882835, "learning_rate": 2.1040576201332685e-05, "loss": 0.1044, "step": 3251 }, { "epoch": 1.6928682977615825, "grad_norm": 0.23340110469012113, "learning_rate": 2.1026704100424864e-05, "loss": 0.1078, "step": 3252 }, { "epoch": 1.6933888599687663, "grad_norm": 0.22247959374329498, "learning_rate": 2.10128332544688e-05, "loss": 0.1072, "step": 3253 }, { "epoch": 1.6939094221759499, "grad_norm": 0.24563634319942448, "learning_rate": 2.0998963667845535e-05, "loss": 0.1069, "step": 3254 }, { "epoch": 1.6944299843831336, "grad_norm": 0.2222993782818447, "learning_rate": 2.0985095344935733e-05, "loss": 0.1089, "step": 3255 }, { "epoch": 1.6949505465903174, "grad_norm": 0.23032803072350344, "learning_rate": 2.0971228290119664e-05, "loss": 0.1065, "step": 3256 }, { "epoch": 1.6954711087975012, "grad_norm": 0.22538817632991662, "learning_rate": 2.0957362507777176e-05, "loss": 0.1109, "step": 3257 }, { "epoch": 1.695991671004685, "grad_norm": 0.22220876229186903, "learning_rate": 2.0943498002287743e-05, "loss": 0.1089, "step": 3258 }, { "epoch": 1.6965122332118687, "grad_norm": 0.22756130766303528, "learning_rate": 2.0929634778030408e-05, "loss": 0.1082, "step": 3259 }, { "epoch": 1.6970327954190525, "grad_norm": 0.2324022767031974, "learning_rate": 2.0915772839383834e-05, "loss": 0.1048, "step": 3260 }, { "epoch": 1.6975533576262363, "grad_norm": 0.2224932908070099, "learning_rate": 2.0901912190726256e-05, "loss": 0.1052, "step": 3261 }, { "epoch": 1.69807391983342, "grad_norm": 0.22064157879488797, "learning_rate": 2.0888052836435524e-05, "loss": 0.1034, "step": 3262 }, { "epoch": 1.6985944820406038, "grad_norm": 0.22010208727090902, "learning_rate": 2.087419478088906e-05, "loss": 0.1048, "step": 3263 }, { "epoch": 1.6991150442477876, "grad_norm": 0.2454058370043552, "learning_rate": 2.0860338028463876e-05, "loss": 0.1032, "step": 3264 }, { "epoch": 1.6996356064549714, "grad_norm": 0.22591759232291647, "learning_rate": 2.084648258353659e-05, "loss": 0.1041, "step": 3265 }, { "epoch": 1.7001561686621551, "grad_norm": 0.2269876270874768, "learning_rate": 2.0832628450483388e-05, "loss": 0.1054, "step": 3266 }, { "epoch": 1.700676730869339, "grad_norm": 0.2323776942869489, "learning_rate": 2.081877563368006e-05, "loss": 0.107, "step": 3267 }, { "epoch": 1.7011972930765227, "grad_norm": 0.23268241409871376, "learning_rate": 2.0804924137501955e-05, "loss": 0.1056, "step": 3268 }, { "epoch": 1.7017178552837064, "grad_norm": 0.2356301736735351, "learning_rate": 2.0791073966324037e-05, "loss": 0.1035, "step": 3269 }, { "epoch": 1.7022384174908902, "grad_norm": 0.22499277174575544, "learning_rate": 2.0777225124520823e-05, "loss": 0.1054, "step": 3270 }, { "epoch": 1.702758979698074, "grad_norm": 0.2304220231846641, "learning_rate": 2.0763377616466427e-05, "loss": 0.108, "step": 3271 }, { "epoch": 1.7032795419052578, "grad_norm": 0.21825438924376672, "learning_rate": 2.0749531446534546e-05, "loss": 0.1046, "step": 3272 }, { "epoch": 1.7038001041124415, "grad_norm": 0.2237918265772953, "learning_rate": 2.073568661909842e-05, "loss": 0.1069, "step": 3273 }, { "epoch": 1.7043206663196253, "grad_norm": 0.22614928472918566, "learning_rate": 2.072184313853091e-05, "loss": 0.1063, "step": 3274 }, { "epoch": 1.704841228526809, "grad_norm": 0.21936797781205583, "learning_rate": 2.070800100920442e-05, "loss": 0.105, "step": 3275 }, { "epoch": 1.7053617907339929, "grad_norm": 0.21812208623696525, "learning_rate": 2.069416023549095e-05, "loss": 0.1045, "step": 3276 }, { "epoch": 1.7058823529411766, "grad_norm": 0.21349218293678732, "learning_rate": 2.068032082176205e-05, "loss": 0.1017, "step": 3277 }, { "epoch": 1.7064029151483602, "grad_norm": 0.23388102739671973, "learning_rate": 2.0666482772388853e-05, "loss": 0.1065, "step": 3278 }, { "epoch": 1.706923477355544, "grad_norm": 0.2114916830371581, "learning_rate": 2.0652646091742063e-05, "loss": 0.1001, "step": 3279 }, { "epoch": 1.7074440395627277, "grad_norm": 0.22727043325186014, "learning_rate": 2.0638810784191946e-05, "loss": 0.1113, "step": 3280 }, { "epoch": 1.7079646017699115, "grad_norm": 0.2509541792776318, "learning_rate": 2.0624976854108347e-05, "loss": 0.1133, "step": 3281 }, { "epoch": 1.7084851639770953, "grad_norm": 0.22983253253459388, "learning_rate": 2.061114430586064e-05, "loss": 0.1056, "step": 3282 }, { "epoch": 1.709005726184279, "grad_norm": 0.22546592705998422, "learning_rate": 2.0597313143817804e-05, "loss": 0.1074, "step": 3283 }, { "epoch": 1.7095262883914628, "grad_norm": 0.22447733642224385, "learning_rate": 2.0583483372348356e-05, "loss": 0.1051, "step": 3284 }, { "epoch": 1.7100468505986466, "grad_norm": 0.22044225269872403, "learning_rate": 2.056965499582039e-05, "loss": 0.107, "step": 3285 }, { "epoch": 1.7105674128058301, "grad_norm": 0.23172108119736853, "learning_rate": 2.055582801860155e-05, "loss": 0.1101, "step": 3286 }, { "epoch": 1.711087975013014, "grad_norm": 0.22418510588075263, "learning_rate": 2.0542002445059032e-05, "loss": 0.1031, "step": 3287 }, { "epoch": 1.7116085372201977, "grad_norm": 0.21742221769182057, "learning_rate": 2.0528178279559596e-05, "loss": 0.1112, "step": 3288 }, { "epoch": 1.7121290994273815, "grad_norm": 0.2285310956948703, "learning_rate": 2.0514355526469566e-05, "loss": 0.1082, "step": 3289 }, { "epoch": 1.7126496616345652, "grad_norm": 0.23733520699349175, "learning_rate": 2.0500534190154808e-05, "loss": 0.1108, "step": 3290 }, { "epoch": 1.713170223841749, "grad_norm": 0.224702532100698, "learning_rate": 2.0486714274980732e-05, "loss": 0.1054, "step": 3291 }, { "epoch": 1.7136907860489328, "grad_norm": 0.23380286724902663, "learning_rate": 2.0472895785312324e-05, "loss": 0.1042, "step": 3292 }, { "epoch": 1.7142113482561165, "grad_norm": 0.22363815675761445, "learning_rate": 2.0459078725514092e-05, "loss": 0.0998, "step": 3293 }, { "epoch": 1.7147319104633003, "grad_norm": 0.23504429753004497, "learning_rate": 2.0445263099950123e-05, "loss": 0.1084, "step": 3294 }, { "epoch": 1.715252472670484, "grad_norm": 0.22724060712973546, "learning_rate": 2.043144891298402e-05, "loss": 0.1048, "step": 3295 }, { "epoch": 1.7157730348776679, "grad_norm": 0.23183122000489004, "learning_rate": 2.0417636168978954e-05, "loss": 0.0994, "step": 3296 }, { "epoch": 1.7162935970848516, "grad_norm": 0.2417008008534072, "learning_rate": 2.040382487229763e-05, "loss": 0.108, "step": 3297 }, { "epoch": 1.7168141592920354, "grad_norm": 0.21706572332557506, "learning_rate": 2.03900150273023e-05, "loss": 0.1044, "step": 3298 }, { "epoch": 1.7173347214992192, "grad_norm": 0.2250832016912324, "learning_rate": 2.0376206638354766e-05, "loss": 0.1072, "step": 3299 }, { "epoch": 1.717855283706403, "grad_norm": 0.21945238165897196, "learning_rate": 2.036239970981633e-05, "loss": 0.1056, "step": 3300 }, { "epoch": 1.7183758459135867, "grad_norm": 0.2217903336181355, "learning_rate": 2.0348594246047893e-05, "loss": 0.1022, "step": 3301 }, { "epoch": 1.7188964081207705, "grad_norm": 0.21983648892682253, "learning_rate": 2.0334790251409845e-05, "loss": 0.1006, "step": 3302 }, { "epoch": 1.7194169703279543, "grad_norm": 0.2271718283385726, "learning_rate": 2.0320987730262132e-05, "loss": 0.1071, "step": 3303 }, { "epoch": 1.719937532535138, "grad_norm": 0.2199145587427751, "learning_rate": 2.0307186686964245e-05, "loss": 0.1073, "step": 3304 }, { "epoch": 1.7204580947423218, "grad_norm": 0.2231189412324603, "learning_rate": 2.029338712587518e-05, "loss": 0.1042, "step": 3305 }, { "epoch": 1.7209786569495056, "grad_norm": 0.22383118284350628, "learning_rate": 2.027958905135349e-05, "loss": 0.1073, "step": 3306 }, { "epoch": 1.7214992191566894, "grad_norm": 0.2179631138427114, "learning_rate": 2.0265792467757248e-05, "loss": 0.107, "step": 3307 }, { "epoch": 1.7220197813638731, "grad_norm": 0.2367574980395077, "learning_rate": 2.0251997379444062e-05, "loss": 0.1114, "step": 3308 }, { "epoch": 1.722540343571057, "grad_norm": 0.2103000613187075, "learning_rate": 2.0238203790771054e-05, "loss": 0.1028, "step": 3309 }, { "epoch": 1.7230609057782404, "grad_norm": 0.21798620187439052, "learning_rate": 2.0224411706094877e-05, "loss": 0.1044, "step": 3310 }, { "epoch": 1.7235814679854242, "grad_norm": 0.20804477935141746, "learning_rate": 2.0210621129771722e-05, "loss": 0.101, "step": 3311 }, { "epoch": 1.724102030192608, "grad_norm": 0.22253756149900092, "learning_rate": 2.019683206615729e-05, "loss": 0.1045, "step": 3312 }, { "epoch": 1.7246225923997918, "grad_norm": 0.21956300028974318, "learning_rate": 2.018304451960682e-05, "loss": 0.1073, "step": 3313 }, { "epoch": 1.7251431546069755, "grad_norm": 0.21104867004615668, "learning_rate": 2.016925849447504e-05, "loss": 0.1049, "step": 3314 }, { "epoch": 1.7256637168141593, "grad_norm": 0.21007160419878684, "learning_rate": 2.015547399511624e-05, "loss": 0.1002, "step": 3315 }, { "epoch": 1.726184279021343, "grad_norm": 0.21210442829270762, "learning_rate": 2.0141691025884195e-05, "loss": 0.1039, "step": 3316 }, { "epoch": 1.7267048412285269, "grad_norm": 0.20410554097153225, "learning_rate": 2.0127909591132217e-05, "loss": 0.1004, "step": 3317 }, { "epoch": 1.7272254034357104, "grad_norm": 0.23935102740693862, "learning_rate": 2.0114129695213114e-05, "loss": 0.1056, "step": 3318 }, { "epoch": 1.7277459656428942, "grad_norm": 0.23313707905631195, "learning_rate": 2.0100351342479216e-05, "loss": 0.1102, "step": 3319 }, { "epoch": 1.728266527850078, "grad_norm": 0.23139683180145568, "learning_rate": 2.008657453728238e-05, "loss": 0.1099, "step": 3320 }, { "epoch": 1.7287870900572617, "grad_norm": 0.24872510544166382, "learning_rate": 2.007279928397395e-05, "loss": 0.1066, "step": 3321 }, { "epoch": 1.7293076522644455, "grad_norm": 0.23200706308434033, "learning_rate": 2.0059025586904807e-05, "loss": 0.1054, "step": 3322 }, { "epoch": 1.7298282144716293, "grad_norm": 0.21942541308934396, "learning_rate": 2.004525345042531e-05, "loss": 0.1028, "step": 3323 }, { "epoch": 1.730348776678813, "grad_norm": 0.2609746790506161, "learning_rate": 2.003148287888535e-05, "loss": 0.1088, "step": 3324 }, { "epoch": 1.7308693388859968, "grad_norm": 0.2366478927625931, "learning_rate": 2.0017713876634305e-05, "loss": 0.1059, "step": 3325 }, { "epoch": 1.7313899010931806, "grad_norm": 0.22258749938323874, "learning_rate": 2.000394644802109e-05, "loss": 0.105, "step": 3326 }, { "epoch": 1.7319104633003644, "grad_norm": 0.22200613011043596, "learning_rate": 1.9990180597394075e-05, "loss": 0.1024, "step": 3327 }, { "epoch": 1.7324310255075481, "grad_norm": 0.22635255297590737, "learning_rate": 1.9976416329101154e-05, "loss": 0.1112, "step": 3328 }, { "epoch": 1.732951587714732, "grad_norm": 0.21816833986696646, "learning_rate": 1.9962653647489745e-05, "loss": 0.1083, "step": 3329 }, { "epoch": 1.7334721499219157, "grad_norm": 0.2320214874149184, "learning_rate": 1.9948892556906727e-05, "loss": 0.1063, "step": 3330 }, { "epoch": 1.7339927121290994, "grad_norm": 0.21304345539614472, "learning_rate": 1.99351330616985e-05, "loss": 0.1058, "step": 3331 }, { "epoch": 1.7345132743362832, "grad_norm": 0.21513285495170326, "learning_rate": 1.9921375166210948e-05, "loss": 0.1045, "step": 3332 }, { "epoch": 1.735033836543467, "grad_norm": 0.23454273842647397, "learning_rate": 1.990761887478946e-05, "loss": 0.1115, "step": 3333 }, { "epoch": 1.7355543987506508, "grad_norm": 0.22338156044764015, "learning_rate": 1.989386419177891e-05, "loss": 0.1082, "step": 3334 }, { "epoch": 1.7360749609578345, "grad_norm": 0.20836963730690877, "learning_rate": 1.988011112152367e-05, "loss": 0.1041, "step": 3335 }, { "epoch": 1.7365955231650183, "grad_norm": 0.2236097112876602, "learning_rate": 1.9866359668367594e-05, "loss": 0.1086, "step": 3336 }, { "epoch": 1.737116085372202, "grad_norm": 0.21123759334297273, "learning_rate": 1.9852609836654034e-05, "loss": 0.1062, "step": 3337 }, { "epoch": 1.7376366475793859, "grad_norm": 0.22237040640657335, "learning_rate": 1.9838861630725826e-05, "loss": 0.108, "step": 3338 }, { "epoch": 1.7381572097865696, "grad_norm": 0.2495840089981045, "learning_rate": 1.982511505492529e-05, "loss": 0.1076, "step": 3339 }, { "epoch": 1.7386777719937534, "grad_norm": 0.22958506111561988, "learning_rate": 1.9811370113594246e-05, "loss": 0.1047, "step": 3340 }, { "epoch": 1.7391983342009372, "grad_norm": 0.216172102598489, "learning_rate": 1.9797626811073972e-05, "loss": 0.1027, "step": 3341 }, { "epoch": 1.7397188964081207, "grad_norm": 0.2221979457825088, "learning_rate": 1.9783885151705252e-05, "loss": 0.1056, "step": 3342 }, { "epoch": 1.7402394586153045, "grad_norm": 0.22693748401563166, "learning_rate": 1.9770145139828333e-05, "loss": 0.1086, "step": 3343 }, { "epoch": 1.7407600208224883, "grad_norm": 0.2273052352822081, "learning_rate": 1.975640677978297e-05, "loss": 0.1023, "step": 3344 }, { "epoch": 1.741280583029672, "grad_norm": 0.23018043315752323, "learning_rate": 1.974267007590835e-05, "loss": 0.1073, "step": 3345 }, { "epoch": 1.7418011452368558, "grad_norm": 0.21977354781533529, "learning_rate": 1.9728935032543174e-05, "loss": 0.1025, "step": 3346 }, { "epoch": 1.7423217074440396, "grad_norm": 0.22170578071619307, "learning_rate": 1.9715201654025614e-05, "loss": 0.1053, "step": 3347 }, { "epoch": 1.7428422696512234, "grad_norm": 0.22697352459866138, "learning_rate": 1.9701469944693298e-05, "loss": 0.1045, "step": 3348 }, { "epoch": 1.7433628318584071, "grad_norm": 0.21983474061455138, "learning_rate": 1.9687739908883352e-05, "loss": 0.1015, "step": 3349 }, { "epoch": 1.7438833940655907, "grad_norm": 0.22230299934979136, "learning_rate": 1.967401155093235e-05, "loss": 0.1075, "step": 3350 }, { "epoch": 1.7444039562727744, "grad_norm": 0.2278927921689901, "learning_rate": 1.9660284875176354e-05, "loss": 0.104, "step": 3351 }, { "epoch": 1.7449245184799582, "grad_norm": 0.2258066335818804, "learning_rate": 1.9646559885950876e-05, "loss": 0.1072, "step": 3352 }, { "epoch": 1.745445080687142, "grad_norm": 0.21809762204524716, "learning_rate": 1.9632836587590928e-05, "loss": 0.1057, "step": 3353 }, { "epoch": 1.7459656428943258, "grad_norm": 0.2332642653387811, "learning_rate": 1.9619114984430946e-05, "loss": 0.1101, "step": 3354 }, { "epoch": 1.7464862051015095, "grad_norm": 0.22377591593309243, "learning_rate": 1.960539508080485e-05, "loss": 0.1034, "step": 3355 }, { "epoch": 1.7470067673086933, "grad_norm": 0.2214624251282975, "learning_rate": 1.9591676881046038e-05, "loss": 0.1051, "step": 3356 }, { "epoch": 1.747527329515877, "grad_norm": 0.22969375504708825, "learning_rate": 1.957796038948734e-05, "loss": 0.1042, "step": 3357 }, { "epoch": 1.7480478917230609, "grad_norm": 0.2212002516504855, "learning_rate": 1.9564245610461078e-05, "loss": 0.1056, "step": 3358 }, { "epoch": 1.7485684539302446, "grad_norm": 0.226595022233284, "learning_rate": 1.955053254829901e-05, "loss": 0.1022, "step": 3359 }, { "epoch": 1.7490890161374284, "grad_norm": 0.21300220127544178, "learning_rate": 1.9536821207332357e-05, "loss": 0.1032, "step": 3360 }, { "epoch": 1.7496095783446122, "grad_norm": 0.2237621475683394, "learning_rate": 1.95231115918918e-05, "loss": 0.1029, "step": 3361 }, { "epoch": 1.750130140551796, "grad_norm": 0.22048593748404666, "learning_rate": 1.9509403706307484e-05, "loss": 0.1053, "step": 3362 }, { "epoch": 1.7506507027589797, "grad_norm": 0.22433810240135404, "learning_rate": 1.9495697554908986e-05, "loss": 0.102, "step": 3363 }, { "epoch": 1.7511712649661635, "grad_norm": 0.22380034181909983, "learning_rate": 1.948199314202534e-05, "loss": 0.0986, "step": 3364 }, { "epoch": 1.7516918271733473, "grad_norm": 0.22338005341310205, "learning_rate": 1.946829047198505e-05, "loss": 0.1035, "step": 3365 }, { "epoch": 1.752212389380531, "grad_norm": 0.230442140004699, "learning_rate": 1.945458954911605e-05, "loss": 0.108, "step": 3366 }, { "epoch": 1.7527329515877148, "grad_norm": 0.2304987431815853, "learning_rate": 1.944089037774573e-05, "loss": 0.1082, "step": 3367 }, { "epoch": 1.7532535137948986, "grad_norm": 0.2214657123371355, "learning_rate": 1.942719296220093e-05, "loss": 0.1027, "step": 3368 }, { "epoch": 1.7537740760020823, "grad_norm": 0.20755785482599343, "learning_rate": 1.9413497306807925e-05, "loss": 0.1005, "step": 3369 }, { "epoch": 1.7542946382092661, "grad_norm": 0.2109976471942146, "learning_rate": 1.9399803415892454e-05, "loss": 0.1001, "step": 3370 }, { "epoch": 1.75481520041645, "grad_norm": 0.22412574394179605, "learning_rate": 1.9386111293779673e-05, "loss": 0.1011, "step": 3371 }, { "epoch": 1.7553357626236337, "grad_norm": 0.2276297126469934, "learning_rate": 1.937242094479419e-05, "loss": 0.1016, "step": 3372 }, { "epoch": 1.7558563248308174, "grad_norm": 0.224838207969345, "learning_rate": 1.9358732373260056e-05, "loss": 0.1049, "step": 3373 }, { "epoch": 1.756376887038001, "grad_norm": 0.2211883964455484, "learning_rate": 1.934504558350076e-05, "loss": 0.1054, "step": 3374 }, { "epoch": 1.7568974492451848, "grad_norm": 0.22067590673603887, "learning_rate": 1.933136057983923e-05, "loss": 0.1013, "step": 3375 }, { "epoch": 1.7574180114523685, "grad_norm": 0.22690702349673778, "learning_rate": 1.931767736659782e-05, "loss": 0.1032, "step": 3376 }, { "epoch": 1.7579385736595523, "grad_norm": 0.22520906999437268, "learning_rate": 1.930399594809834e-05, "loss": 0.1069, "step": 3377 }, { "epoch": 1.758459135866736, "grad_norm": 0.22649242565754607, "learning_rate": 1.9290316328662e-05, "loss": 0.1049, "step": 3378 }, { "epoch": 1.7589796980739199, "grad_norm": 0.22332071105437934, "learning_rate": 1.9276638512609474e-05, "loss": 0.1059, "step": 3379 }, { "epoch": 1.7595002602811036, "grad_norm": 0.21894476859270845, "learning_rate": 1.926296250426085e-05, "loss": 0.1038, "step": 3380 }, { "epoch": 1.7600208224882874, "grad_norm": 0.2303137164013817, "learning_rate": 1.9249288307935642e-05, "loss": 0.1055, "step": 3381 }, { "epoch": 1.760541384695471, "grad_norm": 0.22126996251729855, "learning_rate": 1.9235615927952804e-05, "loss": 0.107, "step": 3382 }, { "epoch": 1.7610619469026547, "grad_norm": 0.21780067256343733, "learning_rate": 1.92219453686307e-05, "loss": 0.1032, "step": 3383 }, { "epoch": 1.7615825091098385, "grad_norm": 0.21443493771388783, "learning_rate": 1.9208276634287143e-05, "loss": 0.1047, "step": 3384 }, { "epoch": 1.7621030713170223, "grad_norm": 0.22561009131363186, "learning_rate": 1.9194609729239344e-05, "loss": 0.1056, "step": 3385 }, { "epoch": 1.762623633524206, "grad_norm": 0.22455128134416372, "learning_rate": 1.9180944657803956e-05, "loss": 0.1071, "step": 3386 }, { "epoch": 1.7631441957313898, "grad_norm": 0.2264693768229407, "learning_rate": 1.9167281424297035e-05, "loss": 0.1109, "step": 3387 }, { "epoch": 1.7636647579385736, "grad_norm": 0.2107756363167944, "learning_rate": 1.9153620033034075e-05, "loss": 0.1059, "step": 3388 }, { "epoch": 1.7641853201457574, "grad_norm": 0.23268370370945063, "learning_rate": 1.9139960488329985e-05, "loss": 0.1125, "step": 3389 }, { "epoch": 1.7647058823529411, "grad_norm": 0.2373633791421892, "learning_rate": 1.912630279449906e-05, "loss": 0.1025, "step": 3390 }, { "epoch": 1.765226444560125, "grad_norm": 0.2310037923502031, "learning_rate": 1.911264695585506e-05, "loss": 0.1089, "step": 3391 }, { "epoch": 1.7657470067673087, "grad_norm": 0.2123561360419581, "learning_rate": 1.9098992976711123e-05, "loss": 0.1023, "step": 3392 }, { "epoch": 1.7662675689744924, "grad_norm": 0.2306983798259102, "learning_rate": 1.9085340861379813e-05, "loss": 0.1126, "step": 3393 }, { "epoch": 1.7667881311816762, "grad_norm": 0.23277860448314053, "learning_rate": 1.9071690614173102e-05, "loss": 0.1029, "step": 3394 }, { "epoch": 1.76730869338886, "grad_norm": 0.2087088586057, "learning_rate": 1.9058042239402378e-05, "loss": 0.0985, "step": 3395 }, { "epoch": 1.7678292555960438, "grad_norm": 0.21479438193487035, "learning_rate": 1.9044395741378425e-05, "loss": 0.1002, "step": 3396 }, { "epoch": 1.7683498178032275, "grad_norm": 0.22921709100541876, "learning_rate": 1.903075112441145e-05, "loss": 0.1058, "step": 3397 }, { "epoch": 1.7688703800104113, "grad_norm": 0.22434060260788052, "learning_rate": 1.9017108392811065e-05, "loss": 0.1044, "step": 3398 }, { "epoch": 1.769390942217595, "grad_norm": 0.23525431934160432, "learning_rate": 1.9003467550886253e-05, "loss": 0.1125, "step": 3399 }, { "epoch": 1.7699115044247788, "grad_norm": 0.22009803271545916, "learning_rate": 1.8989828602945454e-05, "loss": 0.1047, "step": 3400 }, { "epoch": 1.7704320666319626, "grad_norm": 0.22537635925080748, "learning_rate": 1.897619155329646e-05, "loss": 0.1043, "step": 3401 }, { "epoch": 1.7709526288391464, "grad_norm": 0.2367752202655961, "learning_rate": 1.8962556406246505e-05, "loss": 0.1035, "step": 3402 }, { "epoch": 1.7714731910463302, "grad_norm": 0.20474450022021, "learning_rate": 1.8948923166102192e-05, "loss": 0.0994, "step": 3403 }, { "epoch": 1.771993753253514, "grad_norm": 0.21437274278815563, "learning_rate": 1.893529183716954e-05, "loss": 0.0997, "step": 3404 }, { "epoch": 1.7725143154606977, "grad_norm": 0.22322450199964058, "learning_rate": 1.892166242375395e-05, "loss": 0.0996, "step": 3405 }, { "epoch": 1.7730348776678813, "grad_norm": 0.22023539278452614, "learning_rate": 1.8908034930160228e-05, "loss": 0.1039, "step": 3406 }, { "epoch": 1.773555439875065, "grad_norm": 0.2114044276184025, "learning_rate": 1.889440936069258e-05, "loss": 0.1026, "step": 3407 }, { "epoch": 1.7740760020822488, "grad_norm": 0.21695286591737722, "learning_rate": 1.8880785719654577e-05, "loss": 0.1037, "step": 3408 }, { "epoch": 1.7745965642894326, "grad_norm": 0.2363632816801158, "learning_rate": 1.8867164011349208e-05, "loss": 0.1071, "step": 3409 }, { "epoch": 1.7751171264966163, "grad_norm": 0.22771210911879075, "learning_rate": 1.885354424007884e-05, "loss": 0.1083, "step": 3410 }, { "epoch": 1.7756376887038001, "grad_norm": 0.2274868597566939, "learning_rate": 1.8839926410145235e-05, "loss": 0.1077, "step": 3411 }, { "epoch": 1.776158250910984, "grad_norm": 0.21891521186548685, "learning_rate": 1.882631052584953e-05, "loss": 0.1022, "step": 3412 }, { "epoch": 1.7766788131181677, "grad_norm": 0.22044753666985667, "learning_rate": 1.8812696591492265e-05, "loss": 0.1027, "step": 3413 }, { "epoch": 1.7771993753253512, "grad_norm": 0.23004226671888997, "learning_rate": 1.8799084611373345e-05, "loss": 0.1085, "step": 3414 }, { "epoch": 1.777719937532535, "grad_norm": 0.21243628528006495, "learning_rate": 1.8785474589792074e-05, "loss": 0.1044, "step": 3415 }, { "epoch": 1.7782404997397188, "grad_norm": 0.220110579204587, "learning_rate": 1.8771866531047133e-05, "loss": 0.1023, "step": 3416 }, { "epoch": 1.7787610619469025, "grad_norm": 0.22305394623742394, "learning_rate": 1.8758260439436563e-05, "loss": 0.1081, "step": 3417 }, { "epoch": 1.7792816241540863, "grad_norm": 0.2276585098729061, "learning_rate": 1.8744656319257817e-05, "loss": 0.1073, "step": 3418 }, { "epoch": 1.77980218636127, "grad_norm": 0.22252174513686673, "learning_rate": 1.87310541748077e-05, "loss": 0.1079, "step": 3419 }, { "epoch": 1.7803227485684539, "grad_norm": 0.21570267838003582, "learning_rate": 1.8717454010382407e-05, "loss": 0.1049, "step": 3420 }, { "epoch": 1.7808433107756376, "grad_norm": 0.21778548124027283, "learning_rate": 1.87038558302775e-05, "loss": 0.1014, "step": 3421 }, { "epoch": 1.7813638729828214, "grad_norm": 0.21552687109805088, "learning_rate": 1.8690259638787926e-05, "loss": 0.0991, "step": 3422 }, { "epoch": 1.7818844351900052, "grad_norm": 0.2345671900413482, "learning_rate": 1.867666544020798e-05, "loss": 0.107, "step": 3423 }, { "epoch": 1.782404997397189, "grad_norm": 0.23258757335297145, "learning_rate": 1.866307323883136e-05, "loss": 0.1105, "step": 3424 }, { "epoch": 1.7829255596043727, "grad_norm": 0.21889647232741902, "learning_rate": 1.8649483038951107e-05, "loss": 0.107, "step": 3425 }, { "epoch": 1.7834461218115565, "grad_norm": 0.22215635618720045, "learning_rate": 1.863589484485963e-05, "loss": 0.1066, "step": 3426 }, { "epoch": 1.7839666840187403, "grad_norm": 0.21415158989479102, "learning_rate": 1.8622308660848724e-05, "loss": 0.1029, "step": 3427 }, { "epoch": 1.784487246225924, "grad_norm": 0.22781030552769713, "learning_rate": 1.860872449120953e-05, "loss": 0.1087, "step": 3428 }, { "epoch": 1.7850078084331078, "grad_norm": 0.22290517689214315, "learning_rate": 1.8595142340232575e-05, "loss": 0.1019, "step": 3429 }, { "epoch": 1.7855283706402916, "grad_norm": 0.21633222759985674, "learning_rate": 1.8581562212207714e-05, "loss": 0.1021, "step": 3430 }, { "epoch": 1.7860489328474753, "grad_norm": 0.230030532368222, "learning_rate": 1.8567984111424205e-05, "loss": 0.1109, "step": 3431 }, { "epoch": 1.7865694950546591, "grad_norm": 0.22735979206413187, "learning_rate": 1.8554408042170628e-05, "loss": 0.1089, "step": 3432 }, { "epoch": 1.787090057261843, "grad_norm": 0.2253631691551036, "learning_rate": 1.8540834008734943e-05, "loss": 0.1023, "step": 3433 }, { "epoch": 1.7876106194690267, "grad_norm": 0.21969328560623388, "learning_rate": 1.8527262015404477e-05, "loss": 0.1046, "step": 3434 }, { "epoch": 1.7881311816762104, "grad_norm": 0.22505681618644893, "learning_rate": 1.8513692066465866e-05, "loss": 0.1061, "step": 3435 }, { "epoch": 1.7886517438833942, "grad_norm": 0.22392160124207264, "learning_rate": 1.8500124166205152e-05, "loss": 0.1055, "step": 3436 }, { "epoch": 1.789172306090578, "grad_norm": 0.22703973700198465, "learning_rate": 1.84865583189077e-05, "loss": 0.1053, "step": 3437 }, { "epoch": 1.7896928682977615, "grad_norm": 0.2235806887467908, "learning_rate": 1.847299452885824e-05, "loss": 0.1059, "step": 3438 }, { "epoch": 1.7902134305049453, "grad_norm": 0.22095971299700543, "learning_rate": 1.8459432800340855e-05, "loss": 0.1015, "step": 3439 }, { "epoch": 1.790733992712129, "grad_norm": 0.23339506867569496, "learning_rate": 1.8445873137638953e-05, "loss": 0.1044, "step": 3440 }, { "epoch": 1.7912545549193128, "grad_norm": 0.21709782713883458, "learning_rate": 1.8432315545035328e-05, "loss": 0.1034, "step": 3441 }, { "epoch": 1.7917751171264966, "grad_norm": 0.2165477854014385, "learning_rate": 1.841876002681208e-05, "loss": 0.1041, "step": 3442 }, { "epoch": 1.7922956793336804, "grad_norm": 0.23599068704496834, "learning_rate": 1.840520658725069e-05, "loss": 0.1082, "step": 3443 }, { "epoch": 1.7928162415408642, "grad_norm": 0.22660187330424075, "learning_rate": 1.8391655230631953e-05, "loss": 0.1083, "step": 3444 }, { "epoch": 1.793336803748048, "grad_norm": 0.21818235771020988, "learning_rate": 1.837810596123601e-05, "loss": 0.1041, "step": 3445 }, { "epoch": 1.7938573659552315, "grad_norm": 0.21184422424865806, "learning_rate": 1.836455878334237e-05, "loss": 0.1013, "step": 3446 }, { "epoch": 1.7943779281624153, "grad_norm": 0.21802505383850326, "learning_rate": 1.8351013701229846e-05, "loss": 0.1054, "step": 3447 }, { "epoch": 1.794898490369599, "grad_norm": 0.2155910468353952, "learning_rate": 1.833747071917662e-05, "loss": 0.1011, "step": 3448 }, { "epoch": 1.7954190525767828, "grad_norm": 0.21140794526123036, "learning_rate": 1.832392984146018e-05, "loss": 0.1022, "step": 3449 }, { "epoch": 1.7959396147839666, "grad_norm": 0.20930278134313027, "learning_rate": 1.8310391072357382e-05, "loss": 0.0955, "step": 3450 }, { "epoch": 1.7964601769911503, "grad_norm": 0.2316473180383841, "learning_rate": 1.829685441614438e-05, "loss": 0.1055, "step": 3451 }, { "epoch": 1.7969807391983341, "grad_norm": 0.23166286020742097, "learning_rate": 1.8283319877096705e-05, "loss": 0.1025, "step": 3452 }, { "epoch": 1.797501301405518, "grad_norm": 0.22381973015644815, "learning_rate": 1.8269787459489174e-05, "loss": 0.1048, "step": 3453 }, { "epoch": 1.7980218636127017, "grad_norm": 0.22774693899244658, "learning_rate": 1.8256257167595957e-05, "loss": 0.1028, "step": 3454 }, { "epoch": 1.7985424258198854, "grad_norm": 0.22509542841432917, "learning_rate": 1.8242729005690557e-05, "loss": 0.1012, "step": 3455 }, { "epoch": 1.7990629880270692, "grad_norm": 0.22202303234657486, "learning_rate": 1.822920297804579e-05, "loss": 0.1025, "step": 3456 }, { "epoch": 1.799583550234253, "grad_norm": 0.2268089623653908, "learning_rate": 1.8215679088933813e-05, "loss": 0.1046, "step": 3457 }, { "epoch": 1.8001041124414368, "grad_norm": 0.22470714993267865, "learning_rate": 1.8202157342626087e-05, "loss": 0.1064, "step": 3458 }, { "epoch": 1.8006246746486205, "grad_norm": 0.2153205950636323, "learning_rate": 1.8188637743393423e-05, "loss": 0.1003, "step": 3459 }, { "epoch": 1.8011452368558043, "grad_norm": 0.20954593624683357, "learning_rate": 1.8175120295505925e-05, "loss": 0.1031, "step": 3460 }, { "epoch": 1.801665799062988, "grad_norm": 0.21853659329976693, "learning_rate": 1.8161605003233056e-05, "loss": 0.1043, "step": 3461 }, { "epoch": 1.8021863612701718, "grad_norm": 0.21719611628693516, "learning_rate": 1.8148091870843554e-05, "loss": 0.1029, "step": 3462 }, { "epoch": 1.8027069234773556, "grad_norm": 0.21032988240795134, "learning_rate": 1.813458090260549e-05, "loss": 0.1012, "step": 3463 }, { "epoch": 1.8032274856845394, "grad_norm": 0.22087081939591202, "learning_rate": 1.8121072102786274e-05, "loss": 0.1082, "step": 3464 }, { "epoch": 1.8037480478917232, "grad_norm": 0.2190982660554764, "learning_rate": 1.81075654756526e-05, "loss": 0.1058, "step": 3465 }, { "epoch": 1.804268610098907, "grad_norm": 0.22040569387096418, "learning_rate": 1.8094061025470498e-05, "loss": 0.1001, "step": 3466 }, { "epoch": 1.8047891723060907, "grad_norm": 0.21603368220817645, "learning_rate": 1.8080558756505294e-05, "loss": 0.0998, "step": 3467 }, { "epoch": 1.8053097345132745, "grad_norm": 0.20574278416396982, "learning_rate": 1.8067058673021646e-05, "loss": 0.099, "step": 3468 }, { "epoch": 1.8058302967204583, "grad_norm": 0.2277361813536661, "learning_rate": 1.8053560779283495e-05, "loss": 0.1053, "step": 3469 }, { "epoch": 1.8063508589276418, "grad_norm": 0.21219090161169762, "learning_rate": 1.804006507955411e-05, "loss": 0.1013, "step": 3470 }, { "epoch": 1.8068714211348256, "grad_norm": 0.22700160243005704, "learning_rate": 1.8026571578096068e-05, "loss": 0.1038, "step": 3471 }, { "epoch": 1.8073919833420093, "grad_norm": 0.2369122624279523, "learning_rate": 1.8013080279171228e-05, "loss": 0.1123, "step": 3472 }, { "epoch": 1.8079125455491931, "grad_norm": 0.22360440845976917, "learning_rate": 1.799959118704078e-05, "loss": 0.1048, "step": 3473 }, { "epoch": 1.808433107756377, "grad_norm": 0.2104592319944269, "learning_rate": 1.7986104305965205e-05, "loss": 0.0973, "step": 3474 }, { "epoch": 1.8089536699635607, "grad_norm": 0.218593446956845, "learning_rate": 1.7972619640204296e-05, "loss": 0.1043, "step": 3475 }, { "epoch": 1.8094742321707444, "grad_norm": 0.22148524142264853, "learning_rate": 1.795913719401712e-05, "loss": 0.1004, "step": 3476 }, { "epoch": 1.8099947943779282, "grad_norm": 0.2155675656645299, "learning_rate": 1.7945656971662085e-05, "loss": 0.1029, "step": 3477 }, { "epoch": 1.8105153565851118, "grad_norm": 0.23004321978824924, "learning_rate": 1.7932178977396848e-05, "loss": 0.1071, "step": 3478 }, { "epoch": 1.8110359187922955, "grad_norm": 0.2323235989966905, "learning_rate": 1.79187032154784e-05, "loss": 0.1088, "step": 3479 }, { "epoch": 1.8115564809994793, "grad_norm": 0.2174771770164572, "learning_rate": 1.7905229690163023e-05, "loss": 0.1032, "step": 3480 }, { "epoch": 1.812077043206663, "grad_norm": 0.21461562545103127, "learning_rate": 1.789175840570626e-05, "loss": 0.1015, "step": 3481 }, { "epoch": 1.8125976054138468, "grad_norm": 0.21080577930269348, "learning_rate": 1.7878289366362984e-05, "loss": 0.1057, "step": 3482 }, { "epoch": 1.8131181676210306, "grad_norm": 0.2166558884121671, "learning_rate": 1.7864822576387342e-05, "loss": 0.1021, "step": 3483 }, { "epoch": 1.8136387298282144, "grad_norm": 0.22936772903160424, "learning_rate": 1.7851358040032774e-05, "loss": 0.1058, "step": 3484 }, { "epoch": 1.8141592920353982, "grad_norm": 0.21693822231883433, "learning_rate": 1.7837895761552002e-05, "loss": 0.0993, "step": 3485 }, { "epoch": 1.814679854242582, "grad_norm": 0.21424260587284594, "learning_rate": 1.782443574519705e-05, "loss": 0.1045, "step": 3486 }, { "epoch": 1.8152004164497657, "grad_norm": 0.21784231944571822, "learning_rate": 1.7810977995219203e-05, "loss": 0.101, "step": 3487 }, { "epoch": 1.8157209786569495, "grad_norm": 0.2337885632751051, "learning_rate": 1.7797522515869062e-05, "loss": 0.1076, "step": 3488 }, { "epoch": 1.8162415408641333, "grad_norm": 0.2281419186886461, "learning_rate": 1.778406931139649e-05, "loss": 0.1054, "step": 3489 }, { "epoch": 1.816762103071317, "grad_norm": 0.21997153529814273, "learning_rate": 1.777061838605062e-05, "loss": 0.1057, "step": 3490 }, { "epoch": 1.8172826652785008, "grad_norm": 0.21614117988760198, "learning_rate": 1.7757169744079893e-05, "loss": 0.1016, "step": 3491 }, { "epoch": 1.8178032274856846, "grad_norm": 0.214012852621221, "learning_rate": 1.774372338973201e-05, "loss": 0.1044, "step": 3492 }, { "epoch": 1.8183237896928683, "grad_norm": 0.21662067485067615, "learning_rate": 1.7730279327253962e-05, "loss": 0.1017, "step": 3493 }, { "epoch": 1.8188443519000521, "grad_norm": 0.22564115079401081, "learning_rate": 1.7716837560892e-05, "loss": 0.1052, "step": 3494 }, { "epoch": 1.8193649141072359, "grad_norm": 0.21323173454794608, "learning_rate": 1.7703398094891673e-05, "loss": 0.0973, "step": 3495 }, { "epoch": 1.8198854763144197, "grad_norm": 0.21838902892220904, "learning_rate": 1.7689960933497778e-05, "loss": 0.1028, "step": 3496 }, { "epoch": 1.8204060385216034, "grad_norm": 0.2271905848355706, "learning_rate": 1.76765260809544e-05, "loss": 0.1036, "step": 3497 }, { "epoch": 1.8209266007287872, "grad_norm": 0.22011061951043087, "learning_rate": 1.7663093541504905e-05, "loss": 0.1019, "step": 3498 }, { "epoch": 1.821447162935971, "grad_norm": 0.22881874170767702, "learning_rate": 1.7649663319391883e-05, "loss": 0.1073, "step": 3499 }, { "epoch": 1.8219677251431547, "grad_norm": 0.22620885690592923, "learning_rate": 1.7636235418857245e-05, "loss": 0.1025, "step": 3500 }, { "epoch": 1.8224882873503385, "grad_norm": 0.23410418806456498, "learning_rate": 1.7622809844142137e-05, "loss": 0.102, "step": 3501 }, { "epoch": 1.823008849557522, "grad_norm": 0.22101701713784802, "learning_rate": 1.760938659948699e-05, "loss": 0.102, "step": 3502 }, { "epoch": 1.8235294117647058, "grad_norm": 0.22714340766149158, "learning_rate": 1.7595965689131484e-05, "loss": 0.1001, "step": 3503 }, { "epoch": 1.8240499739718896, "grad_norm": 0.22057919964249992, "learning_rate": 1.7582547117314563e-05, "loss": 0.102, "step": 3504 }, { "epoch": 1.8245705361790734, "grad_norm": 0.21511543280496126, "learning_rate": 1.7569130888274446e-05, "loss": 0.1043, "step": 3505 }, { "epoch": 1.8250910983862572, "grad_norm": 0.21440452554779588, "learning_rate": 1.7555717006248594e-05, "loss": 0.0959, "step": 3506 }, { "epoch": 1.825611660593441, "grad_norm": 0.22709181091908154, "learning_rate": 1.7542305475473746e-05, "loss": 0.1043, "step": 3507 }, { "epoch": 1.8261322228006247, "grad_norm": 0.2268531896037747, "learning_rate": 1.7528896300185878e-05, "loss": 0.103, "step": 3508 }, { "epoch": 1.8266527850078085, "grad_norm": 0.22430552658597233, "learning_rate": 1.751548948462023e-05, "loss": 0.1041, "step": 3509 }, { "epoch": 1.827173347214992, "grad_norm": 0.22793206081910328, "learning_rate": 1.7502085033011302e-05, "loss": 0.1019, "step": 3510 }, { "epoch": 1.8276939094221758, "grad_norm": 0.23464781588284941, "learning_rate": 1.748868294959284e-05, "loss": 0.1054, "step": 3511 }, { "epoch": 1.8282144716293596, "grad_norm": 0.22172768017872468, "learning_rate": 1.7475283238597857e-05, "loss": 0.1017, "step": 3512 }, { "epoch": 1.8287350338365433, "grad_norm": 0.2518798662916246, "learning_rate": 1.746188590425859e-05, "loss": 0.1055, "step": 3513 }, { "epoch": 1.8292555960437271, "grad_norm": 0.21708297533262325, "learning_rate": 1.7448490950806552e-05, "loss": 0.1041, "step": 3514 }, { "epoch": 1.829776158250911, "grad_norm": 0.23188297562996182, "learning_rate": 1.7435098382472486e-05, "loss": 0.1094, "step": 3515 }, { "epoch": 1.8302967204580947, "grad_norm": 0.21477736177121873, "learning_rate": 1.74217082034864e-05, "loss": 0.1037, "step": 3516 }, { "epoch": 1.8308172826652784, "grad_norm": 0.2174218899457457, "learning_rate": 1.740832041807752e-05, "loss": 0.1041, "step": 3517 }, { "epoch": 1.8313378448724622, "grad_norm": 0.22509449830210138, "learning_rate": 1.7394935030474335e-05, "loss": 0.0994, "step": 3518 }, { "epoch": 1.831858407079646, "grad_norm": 0.2141160924638327, "learning_rate": 1.738155204490458e-05, "loss": 0.0995, "step": 3519 }, { "epoch": 1.8323789692868298, "grad_norm": 0.23187384577530246, "learning_rate": 1.7368171465595222e-05, "loss": 0.1086, "step": 3520 }, { "epoch": 1.8328995314940135, "grad_norm": 0.2130230900152308, "learning_rate": 1.735479329677247e-05, "loss": 0.1006, "step": 3521 }, { "epoch": 1.8334200937011973, "grad_norm": 0.2235989907682875, "learning_rate": 1.7341417542661767e-05, "loss": 0.1055, "step": 3522 }, { "epoch": 1.833940655908381, "grad_norm": 0.2304050686195293, "learning_rate": 1.732804420748781e-05, "loss": 0.1042, "step": 3523 }, { "epoch": 1.8344612181155648, "grad_norm": 0.21354769016325825, "learning_rate": 1.731467329547451e-05, "loss": 0.0985, "step": 3524 }, { "epoch": 1.8349817803227486, "grad_norm": 0.22881459978388433, "learning_rate": 1.7301304810845037e-05, "loss": 0.1009, "step": 3525 }, { "epoch": 1.8355023425299324, "grad_norm": 0.21108826324764815, "learning_rate": 1.7287938757821765e-05, "loss": 0.1042, "step": 3526 }, { "epoch": 1.8360229047371162, "grad_norm": 0.23139125405119665, "learning_rate": 1.7274575140626318e-05, "loss": 0.1075, "step": 3527 }, { "epoch": 1.8365434669443, "grad_norm": 0.2174665102211108, "learning_rate": 1.7261213963479556e-05, "loss": 0.1041, "step": 3528 }, { "epoch": 1.8370640291514837, "grad_norm": 0.22610247555368754, "learning_rate": 1.724785523060155e-05, "loss": 0.1009, "step": 3529 }, { "epoch": 1.8375845913586675, "grad_norm": 0.22191495291139376, "learning_rate": 1.7234498946211625e-05, "loss": 0.1066, "step": 3530 }, { "epoch": 1.8381051535658512, "grad_norm": 0.22121309383263021, "learning_rate": 1.7221145114528297e-05, "loss": 0.1068, "step": 3531 }, { "epoch": 1.838625715773035, "grad_norm": 0.2133266726862183, "learning_rate": 1.7207793739769352e-05, "loss": 0.1021, "step": 3532 }, { "epoch": 1.8391462779802188, "grad_norm": 0.218435643614945, "learning_rate": 1.7194444826151752e-05, "loss": 0.1027, "step": 3533 }, { "epoch": 1.8396668401874023, "grad_norm": 0.21306246655698843, "learning_rate": 1.7181098377891723e-05, "loss": 0.1028, "step": 3534 }, { "epoch": 1.8401874023945861, "grad_norm": 0.2316381773531802, "learning_rate": 1.7167754399204683e-05, "loss": 0.1087, "step": 3535 }, { "epoch": 1.8407079646017699, "grad_norm": 0.2324977841120054, "learning_rate": 1.7154412894305283e-05, "loss": 0.1083, "step": 3536 }, { "epoch": 1.8412285268089537, "grad_norm": 0.21543732870890425, "learning_rate": 1.7141073867407397e-05, "loss": 0.1004, "step": 3537 }, { "epoch": 1.8417490890161374, "grad_norm": 0.21111111952706546, "learning_rate": 1.71277373227241e-05, "loss": 0.1035, "step": 3538 }, { "epoch": 1.8422696512233212, "grad_norm": 0.20655633250382696, "learning_rate": 1.7114403264467703e-05, "loss": 0.098, "step": 3539 }, { "epoch": 1.842790213430505, "grad_norm": 0.22438903873520755, "learning_rate": 1.710107169684972e-05, "loss": 0.1067, "step": 3540 }, { "epoch": 1.8433107756376887, "grad_norm": 0.2199314935079129, "learning_rate": 1.7087742624080883e-05, "loss": 0.1021, "step": 3541 }, { "epoch": 1.8438313378448723, "grad_norm": 0.2188004654542698, "learning_rate": 1.7074416050371122e-05, "loss": 0.1063, "step": 3542 }, { "epoch": 1.844351900052056, "grad_norm": 0.22136277736501705, "learning_rate": 1.7061091979929612e-05, "loss": 0.1041, "step": 3543 }, { "epoch": 1.8448724622592398, "grad_norm": 0.22383412015373572, "learning_rate": 1.7047770416964688e-05, "loss": 0.1037, "step": 3544 }, { "epoch": 1.8453930244664236, "grad_norm": 0.22366283880625565, "learning_rate": 1.7034451365683927e-05, "loss": 0.1057, "step": 3545 }, { "epoch": 1.8459135866736074, "grad_norm": 0.22075942408527013, "learning_rate": 1.702113483029412e-05, "loss": 0.1017, "step": 3546 }, { "epoch": 1.8464341488807912, "grad_norm": 0.2313958085020113, "learning_rate": 1.700782081500123e-05, "loss": 0.1046, "step": 3547 }, { "epoch": 1.846954711087975, "grad_norm": 0.22215124468148365, "learning_rate": 1.6994509324010457e-05, "loss": 0.1038, "step": 3548 }, { "epoch": 1.8474752732951587, "grad_norm": 0.2140104139101645, "learning_rate": 1.6981200361526177e-05, "loss": 0.1016, "step": 3549 }, { "epoch": 1.8479958355023425, "grad_norm": 0.2199758204376425, "learning_rate": 1.6967893931751988e-05, "loss": 0.0985, "step": 3550 }, { "epoch": 1.8485163977095262, "grad_norm": 0.21980628404627928, "learning_rate": 1.695459003889068e-05, "loss": 0.1027, "step": 3551 }, { "epoch": 1.84903695991671, "grad_norm": 0.22964814499456035, "learning_rate": 1.694128868714424e-05, "loss": 0.1008, "step": 3552 }, { "epoch": 1.8495575221238938, "grad_norm": 0.2194752903061473, "learning_rate": 1.692798988071385e-05, "loss": 0.1043, "step": 3553 }, { "epoch": 1.8500780843310776, "grad_norm": 0.22695773488044693, "learning_rate": 1.6914693623799894e-05, "loss": 0.1063, "step": 3554 }, { "epoch": 1.8505986465382613, "grad_norm": 0.22960187036725627, "learning_rate": 1.690139992060195e-05, "loss": 0.1032, "step": 3555 }, { "epoch": 1.851119208745445, "grad_norm": 0.2174160418036627, "learning_rate": 1.6888108775318785e-05, "loss": 0.1009, "step": 3556 }, { "epoch": 1.8516397709526289, "grad_norm": 0.2234493817495722, "learning_rate": 1.6874820192148365e-05, "loss": 0.104, "step": 3557 }, { "epoch": 1.8521603331598127, "grad_norm": 0.220450115790806, "learning_rate": 1.686153417528784e-05, "loss": 0.1016, "step": 3558 }, { "epoch": 1.8526808953669964, "grad_norm": 0.2274125870596465, "learning_rate": 1.684825072893356e-05, "loss": 0.1098, "step": 3559 }, { "epoch": 1.8532014575741802, "grad_norm": 0.22368136276174644, "learning_rate": 1.6834969857281042e-05, "loss": 0.1048, "step": 3560 }, { "epoch": 1.853722019781364, "grad_norm": 0.22278441077025057, "learning_rate": 1.682169156452502e-05, "loss": 0.1021, "step": 3561 }, { "epoch": 1.8542425819885477, "grad_norm": 0.2144032315295473, "learning_rate": 1.6808415854859384e-05, "loss": 0.1009, "step": 3562 }, { "epoch": 1.8547631441957315, "grad_norm": 0.21883351924688985, "learning_rate": 1.679514273247722e-05, "loss": 0.1046, "step": 3563 }, { "epoch": 1.8552837064029153, "grad_norm": 0.21804994979018316, "learning_rate": 1.678187220157081e-05, "loss": 0.105, "step": 3564 }, { "epoch": 1.855804268610099, "grad_norm": 0.23179533965357424, "learning_rate": 1.6768604266331585e-05, "loss": 0.1049, "step": 3565 }, { "epoch": 1.8563248308172826, "grad_norm": 0.21771973013714693, "learning_rate": 1.675533893095019e-05, "loss": 0.1042, "step": 3566 }, { "epoch": 1.8568453930244664, "grad_norm": 0.21510845344937907, "learning_rate": 1.6742076199616448e-05, "loss": 0.1035, "step": 3567 }, { "epoch": 1.8573659552316502, "grad_norm": 0.22503185695899153, "learning_rate": 1.6728816076519322e-05, "loss": 0.1045, "step": 3568 }, { "epoch": 1.857886517438834, "grad_norm": 0.21278464572186798, "learning_rate": 1.671555856584699e-05, "loss": 0.1043, "step": 3569 }, { "epoch": 1.8584070796460177, "grad_norm": 0.21442678992556535, "learning_rate": 1.6702303671786797e-05, "loss": 0.1019, "step": 3570 }, { "epoch": 1.8589276418532015, "grad_norm": 0.21995013323000376, "learning_rate": 1.6689051398525233e-05, "loss": 0.0996, "step": 3571 }, { "epoch": 1.8594482040603852, "grad_norm": 0.2166463003463736, "learning_rate": 1.6675801750247998e-05, "loss": 0.1001, "step": 3572 }, { "epoch": 1.859968766267569, "grad_norm": 0.21168851210455933, "learning_rate": 1.6662554731139944e-05, "loss": 0.0986, "step": 3573 }, { "epoch": 1.8604893284747526, "grad_norm": 0.2187117583742439, "learning_rate": 1.66493103453851e-05, "loss": 0.099, "step": 3574 }, { "epoch": 1.8610098906819363, "grad_norm": 0.2323426842133383, "learning_rate": 1.6636068597166655e-05, "loss": 0.1083, "step": 3575 }, { "epoch": 1.8615304528891201, "grad_norm": 0.226528190668166, "learning_rate": 1.6622829490666974e-05, "loss": 0.1015, "step": 3576 }, { "epoch": 1.8620510150963039, "grad_norm": 0.2099990904532976, "learning_rate": 1.6609593030067574e-05, "loss": 0.102, "step": 3577 }, { "epoch": 1.8625715773034877, "grad_norm": 0.21653151598199424, "learning_rate": 1.6596359219549158e-05, "loss": 0.102, "step": 3578 }, { "epoch": 1.8630921395106714, "grad_norm": 0.23703398831513672, "learning_rate": 1.6583128063291576e-05, "loss": 0.0998, "step": 3579 }, { "epoch": 1.8636127017178552, "grad_norm": 0.2204375202877476, "learning_rate": 1.6569899565473828e-05, "loss": 0.1021, "step": 3580 }, { "epoch": 1.864133263925039, "grad_norm": 0.21805115310094073, "learning_rate": 1.6556673730274107e-05, "loss": 0.1049, "step": 3581 }, { "epoch": 1.8646538261322227, "grad_norm": 0.22105359053989074, "learning_rate": 1.6543450561869732e-05, "loss": 0.1069, "step": 3582 }, { "epoch": 1.8651743883394065, "grad_norm": 0.226961101870007, "learning_rate": 1.6530230064437213e-05, "loss": 0.1039, "step": 3583 }, { "epoch": 1.8656949505465903, "grad_norm": 0.21715026468515009, "learning_rate": 1.651701224215218e-05, "loss": 0.1058, "step": 3584 }, { "epoch": 1.866215512753774, "grad_norm": 0.2185791885590469, "learning_rate": 1.6503797099189453e-05, "loss": 0.1033, "step": 3585 }, { "epoch": 1.8667360749609578, "grad_norm": 0.21828087490161857, "learning_rate": 1.6490584639722976e-05, "loss": 0.1044, "step": 3586 }, { "epoch": 1.8672566371681416, "grad_norm": 0.2258399116146915, "learning_rate": 1.6477374867925867e-05, "loss": 0.1046, "step": 3587 }, { "epoch": 1.8677771993753254, "grad_norm": 0.21880046178089685, "learning_rate": 1.646416778797039e-05, "loss": 0.1029, "step": 3588 }, { "epoch": 1.8682977615825092, "grad_norm": 0.22303778060525425, "learning_rate": 1.645096340402794e-05, "loss": 0.1014, "step": 3589 }, { "epoch": 1.868818323789693, "grad_norm": 0.22130295830879798, "learning_rate": 1.6437761720269087e-05, "loss": 0.1021, "step": 3590 }, { "epoch": 1.8693388859968767, "grad_norm": 0.2164153839615916, "learning_rate": 1.642456274086353e-05, "loss": 0.1033, "step": 3591 }, { "epoch": 1.8698594482040605, "grad_norm": 0.22284825275166165, "learning_rate": 1.6411366469980134e-05, "loss": 0.1024, "step": 3592 }, { "epoch": 1.8703800104112442, "grad_norm": 0.2110508578950555, "learning_rate": 1.6398172911786883e-05, "loss": 0.1024, "step": 3593 }, { "epoch": 1.870900572618428, "grad_norm": 0.20736136520524462, "learning_rate": 1.6384982070450922e-05, "loss": 0.0949, "step": 3594 }, { "epoch": 1.8714211348256118, "grad_norm": 0.21550440293843662, "learning_rate": 1.637179395013853e-05, "loss": 0.1, "step": 3595 }, { "epoch": 1.8719416970327956, "grad_norm": 0.22901002637779894, "learning_rate": 1.6358608555015135e-05, "loss": 0.1085, "step": 3596 }, { "epoch": 1.8724622592399793, "grad_norm": 0.21708597501510324, "learning_rate": 1.6345425889245298e-05, "loss": 0.0956, "step": 3597 }, { "epoch": 1.8729828214471629, "grad_norm": 0.21299348538165125, "learning_rate": 1.6332245956992703e-05, "loss": 0.0975, "step": 3598 }, { "epoch": 1.8735033836543467, "grad_norm": 0.21235644848347995, "learning_rate": 1.6319068762420204e-05, "loss": 0.1009, "step": 3599 }, { "epoch": 1.8740239458615304, "grad_norm": 0.21460664666278179, "learning_rate": 1.6305894309689763e-05, "loss": 0.1024, "step": 3600 }, { "epoch": 1.8745445080687142, "grad_norm": 0.22828274571427395, "learning_rate": 1.629272260296249e-05, "loss": 0.1071, "step": 3601 }, { "epoch": 1.875065070275898, "grad_norm": 0.22326950946792548, "learning_rate": 1.6279553646398615e-05, "loss": 0.1061, "step": 3602 }, { "epoch": 1.8755856324830817, "grad_norm": 0.21887665058394934, "learning_rate": 1.626638744415752e-05, "loss": 0.1052, "step": 3603 }, { "epoch": 1.8761061946902655, "grad_norm": 0.22556670911635793, "learning_rate": 1.625322400039769e-05, "loss": 0.1034, "step": 3604 }, { "epoch": 1.8766267568974493, "grad_norm": 0.22477039664046272, "learning_rate": 1.6240063319276767e-05, "loss": 0.1066, "step": 3605 }, { "epoch": 1.8771473191046328, "grad_norm": 0.208448037377274, "learning_rate": 1.6226905404951503e-05, "loss": 0.0964, "step": 3606 }, { "epoch": 1.8776678813118166, "grad_norm": 0.20964154673856936, "learning_rate": 1.621375026157777e-05, "loss": 0.0974, "step": 3607 }, { "epoch": 1.8781884435190004, "grad_norm": 0.21982252986036593, "learning_rate": 1.6200597893310586e-05, "loss": 0.1041, "step": 3608 }, { "epoch": 1.8787090057261842, "grad_norm": 0.21630070655518385, "learning_rate": 1.618744830430407e-05, "loss": 0.1028, "step": 3609 }, { "epoch": 1.879229567933368, "grad_norm": 0.22686308982838688, "learning_rate": 1.6174301498711486e-05, "loss": 0.1037, "step": 3610 }, { "epoch": 1.8797501301405517, "grad_norm": 0.22043831684254314, "learning_rate": 1.6161157480685197e-05, "loss": 0.103, "step": 3611 }, { "epoch": 1.8802706923477355, "grad_norm": 0.2178337556402931, "learning_rate": 1.6148016254376702e-05, "loss": 0.0978, "step": 3612 }, { "epoch": 1.8807912545549192, "grad_norm": 0.22627246524746905, "learning_rate": 1.613487782393661e-05, "loss": 0.1046, "step": 3613 }, { "epoch": 1.881311816762103, "grad_norm": 0.22358601640561931, "learning_rate": 1.6121742193514648e-05, "loss": 0.0986, "step": 3614 }, { "epoch": 1.8818323789692868, "grad_norm": 0.22423924786442362, "learning_rate": 1.610860936725967e-05, "loss": 0.1001, "step": 3615 }, { "epoch": 1.8823529411764706, "grad_norm": 0.2187539346723351, "learning_rate": 1.6095479349319607e-05, "loss": 0.106, "step": 3616 }, { "epoch": 1.8828735033836543, "grad_norm": 0.22405774990004992, "learning_rate": 1.6082352143841555e-05, "loss": 0.0999, "step": 3617 }, { "epoch": 1.883394065590838, "grad_norm": 0.2268274693952615, "learning_rate": 1.6069227754971683e-05, "loss": 0.1007, "step": 3618 }, { "epoch": 1.8839146277980219, "grad_norm": 0.2387625325422801, "learning_rate": 1.6056106186855292e-05, "loss": 0.1072, "step": 3619 }, { "epoch": 1.8844351900052057, "grad_norm": 0.21912658060501727, "learning_rate": 1.6042987443636775e-05, "loss": 0.1043, "step": 3620 }, { "epoch": 1.8849557522123894, "grad_norm": 0.22608005928160363, "learning_rate": 1.6029871529459656e-05, "loss": 0.1034, "step": 3621 }, { "epoch": 1.8854763144195732, "grad_norm": 0.21470577776047628, "learning_rate": 1.601675844846653e-05, "loss": 0.0991, "step": 3622 }, { "epoch": 1.885996876626757, "grad_norm": 0.21496261618823018, "learning_rate": 1.600364820479914e-05, "loss": 0.0995, "step": 3623 }, { "epoch": 1.8865174388339407, "grad_norm": 0.20783054254928857, "learning_rate": 1.5990540802598302e-05, "loss": 0.0999, "step": 3624 }, { "epoch": 1.8870380010411245, "grad_norm": 0.20378345368619807, "learning_rate": 1.5977436246003937e-05, "loss": 0.0936, "step": 3625 }, { "epoch": 1.8875585632483083, "grad_norm": 0.2186409812177155, "learning_rate": 1.5964334539155084e-05, "loss": 0.1048, "step": 3626 }, { "epoch": 1.888079125455492, "grad_norm": 0.22599765981965825, "learning_rate": 1.5951235686189857e-05, "loss": 0.1043, "step": 3627 }, { "epoch": 1.8885996876626758, "grad_norm": 0.22083477562213608, "learning_rate": 1.5938139691245505e-05, "loss": 0.1018, "step": 3628 }, { "epoch": 1.8891202498698596, "grad_norm": 0.2170688955855864, "learning_rate": 1.5925046558458333e-05, "loss": 0.1021, "step": 3629 }, { "epoch": 1.8896408120770432, "grad_norm": 0.2148159717859146, "learning_rate": 1.5911956291963775e-05, "loss": 0.0983, "step": 3630 }, { "epoch": 1.890161374284227, "grad_norm": 0.20873764640196554, "learning_rate": 1.5898868895896334e-05, "loss": 0.0974, "step": 3631 }, { "epoch": 1.8906819364914107, "grad_norm": 0.3148651473791158, "learning_rate": 1.5885784374389632e-05, "loss": 0.1065, "step": 3632 }, { "epoch": 1.8912024986985945, "grad_norm": 0.22482430722857094, "learning_rate": 1.5872702731576373e-05, "loss": 0.1042, "step": 3633 }, { "epoch": 1.8917230609057782, "grad_norm": 0.22637467340246845, "learning_rate": 1.585962397158833e-05, "loss": 0.1087, "step": 3634 }, { "epoch": 1.892243623112962, "grad_norm": 0.22354140941775033, "learning_rate": 1.584654809855639e-05, "loss": 0.0974, "step": 3635 }, { "epoch": 1.8927641853201458, "grad_norm": 0.21626315203073032, "learning_rate": 1.5833475116610523e-05, "loss": 0.1005, "step": 3636 }, { "epoch": 1.8932847475273296, "grad_norm": 0.2113336278345908, "learning_rate": 1.582040502987979e-05, "loss": 0.1041, "step": 3637 }, { "epoch": 1.893805309734513, "grad_norm": 0.22087048570487774, "learning_rate": 1.5807337842492337e-05, "loss": 0.1036, "step": 3638 }, { "epoch": 1.8943258719416969, "grad_norm": 0.21572578777284565, "learning_rate": 1.5794273558575374e-05, "loss": 0.0984, "step": 3639 }, { "epoch": 1.8948464341488807, "grad_norm": 0.22769892060924338, "learning_rate": 1.5781212182255226e-05, "loss": 0.1122, "step": 3640 }, { "epoch": 1.8953669963560644, "grad_norm": 0.213430358027192, "learning_rate": 1.5768153717657268e-05, "loss": 0.0988, "step": 3641 }, { "epoch": 1.8958875585632482, "grad_norm": 0.2622995076619564, "learning_rate": 1.5755098168905992e-05, "loss": 0.098, "step": 3642 }, { "epoch": 1.896408120770432, "grad_norm": 0.2307094720665387, "learning_rate": 1.574204554012493e-05, "loss": 0.0983, "step": 3643 }, { "epoch": 1.8969286829776157, "grad_norm": 0.22053805471841584, "learning_rate": 1.572899583543671e-05, "loss": 0.1017, "step": 3644 }, { "epoch": 1.8974492451847995, "grad_norm": 0.21911020706637074, "learning_rate": 1.571594905896304e-05, "loss": 0.1048, "step": 3645 }, { "epoch": 1.8979698073919833, "grad_norm": 0.21973155556064952, "learning_rate": 1.5702905214824705e-05, "loss": 0.1047, "step": 3646 }, { "epoch": 1.898490369599167, "grad_norm": 0.2446184787139059, "learning_rate": 1.5689864307141548e-05, "loss": 0.1003, "step": 3647 }, { "epoch": 1.8990109318063508, "grad_norm": 0.21721103382874216, "learning_rate": 1.56768263400325e-05, "loss": 0.0934, "step": 3648 }, { "epoch": 1.8995314940135346, "grad_norm": 0.22688257718189353, "learning_rate": 1.566379131761556e-05, "loss": 0.104, "step": 3649 }, { "epoch": 1.9000520562207184, "grad_norm": 0.21482237576043078, "learning_rate": 1.5650759244007783e-05, "loss": 0.0964, "step": 3650 }, { "epoch": 1.9005726184279021, "grad_norm": 0.22722159279382412, "learning_rate": 1.5637730123325327e-05, "loss": 0.104, "step": 3651 }, { "epoch": 1.901093180635086, "grad_norm": 0.22648407542435803, "learning_rate": 1.562470395968338e-05, "loss": 0.1043, "step": 3652 }, { "epoch": 1.9016137428422697, "grad_norm": 0.21123512550603266, "learning_rate": 1.56116807571962e-05, "loss": 0.0991, "step": 3653 }, { "epoch": 1.9021343050494535, "grad_norm": 0.22587891194139853, "learning_rate": 1.559866051997714e-05, "loss": 0.1019, "step": 3654 }, { "epoch": 1.9026548672566372, "grad_norm": 0.22485896129735808, "learning_rate": 1.5585643252138577e-05, "loss": 0.0991, "step": 3655 }, { "epoch": 1.903175429463821, "grad_norm": 0.21654611342402638, "learning_rate": 1.557262895779199e-05, "loss": 0.1018, "step": 3656 }, { "epoch": 1.9036959916710048, "grad_norm": 0.22992554106238536, "learning_rate": 1.5559617641047886e-05, "loss": 0.1052, "step": 3657 }, { "epoch": 1.9042165538781886, "grad_norm": 0.2261900928255639, "learning_rate": 1.5546609306015856e-05, "loss": 0.1038, "step": 3658 }, { "epoch": 1.9047371160853723, "grad_norm": 0.22144384444707318, "learning_rate": 1.5533603956804522e-05, "loss": 0.1015, "step": 3659 }, { "epoch": 1.905257678292556, "grad_norm": 0.21449867047073035, "learning_rate": 1.5520601597521596e-05, "loss": 0.0986, "step": 3660 }, { "epoch": 1.9057782404997399, "grad_norm": 0.22083165166943855, "learning_rate": 1.5507602232273814e-05, "loss": 0.0978, "step": 3661 }, { "epoch": 1.9062988027069234, "grad_norm": 0.21908132571042455, "learning_rate": 1.5494605865166983e-05, "loss": 0.1009, "step": 3662 }, { "epoch": 1.9068193649141072, "grad_norm": 0.2256794889232402, "learning_rate": 1.5481612500305964e-05, "loss": 0.1064, "step": 3663 }, { "epoch": 1.907339927121291, "grad_norm": 0.20932873751015288, "learning_rate": 1.5468622141794664e-05, "loss": 0.1019, "step": 3664 }, { "epoch": 1.9078604893284747, "grad_norm": 0.22557308680633772, "learning_rate": 1.5455634793736046e-05, "loss": 0.1051, "step": 3665 }, { "epoch": 1.9083810515356585, "grad_norm": 0.21717795714503, "learning_rate": 1.544265046023211e-05, "loss": 0.1043, "step": 3666 }, { "epoch": 1.9089016137428423, "grad_norm": 0.21382166007000752, "learning_rate": 1.542966914538393e-05, "loss": 0.1003, "step": 3667 }, { "epoch": 1.909422175950026, "grad_norm": 0.21047577575173457, "learning_rate": 1.541669085329159e-05, "loss": 0.1021, "step": 3668 }, { "epoch": 1.9099427381572098, "grad_norm": 0.21885604669207556, "learning_rate": 1.540371558805425e-05, "loss": 0.1045, "step": 3669 }, { "epoch": 1.9104633003643934, "grad_norm": 0.2263376773671937, "learning_rate": 1.539074335377011e-05, "loss": 0.1079, "step": 3670 }, { "epoch": 1.9109838625715772, "grad_norm": 0.21167752773460963, "learning_rate": 1.537777415453638e-05, "loss": 0.0996, "step": 3671 }, { "epoch": 1.911504424778761, "grad_norm": 0.21780236850803966, "learning_rate": 1.536480799444936e-05, "loss": 0.104, "step": 3672 }, { "epoch": 1.9120249869859447, "grad_norm": 0.2202957087255931, "learning_rate": 1.5351844877604353e-05, "loss": 0.101, "step": 3673 }, { "epoch": 1.9125455491931285, "grad_norm": 0.2215233885356788, "learning_rate": 1.5338884808095726e-05, "loss": 0.1031, "step": 3674 }, { "epoch": 1.9130661114003122, "grad_norm": 0.221036149093325, "learning_rate": 1.5325927790016858e-05, "loss": 0.1009, "step": 3675 }, { "epoch": 1.913586673607496, "grad_norm": 0.21072634910142585, "learning_rate": 1.5312973827460194e-05, "loss": 0.1022, "step": 3676 }, { "epoch": 1.9141072358146798, "grad_norm": 0.21579345962748916, "learning_rate": 1.5300022924517186e-05, "loss": 0.1014, "step": 3677 }, { "epoch": 1.9146277980218636, "grad_norm": 0.22972340517503928, "learning_rate": 1.528707508527834e-05, "loss": 0.1066, "step": 3678 }, { "epoch": 1.9151483602290473, "grad_norm": 0.22399399495289485, "learning_rate": 1.527413031383319e-05, "loss": 0.1062, "step": 3679 }, { "epoch": 1.915668922436231, "grad_norm": 0.22928343502766305, "learning_rate": 1.5261188614270278e-05, "loss": 0.1044, "step": 3680 }, { "epoch": 1.9161894846434149, "grad_norm": 0.20874236255449022, "learning_rate": 1.5248249990677212e-05, "loss": 0.1011, "step": 3681 }, { "epoch": 1.9167100468505986, "grad_norm": 0.2238709567201973, "learning_rate": 1.5235314447140603e-05, "loss": 0.1049, "step": 3682 }, { "epoch": 1.9172306090577824, "grad_norm": 0.22429081990026067, "learning_rate": 1.5222381987746104e-05, "loss": 0.1058, "step": 3683 }, { "epoch": 1.9177511712649662, "grad_norm": 0.21422872549018027, "learning_rate": 1.5209452616578379e-05, "loss": 0.0997, "step": 3684 }, { "epoch": 1.91827173347215, "grad_norm": 0.2133382379880559, "learning_rate": 1.5196526337721137e-05, "loss": 0.1031, "step": 3685 }, { "epoch": 1.9187922956793337, "grad_norm": 0.2191123247317634, "learning_rate": 1.5183603155257087e-05, "loss": 0.1003, "step": 3686 }, { "epoch": 1.9193128578865175, "grad_norm": 0.2203372017928507, "learning_rate": 1.517068307326798e-05, "loss": 0.1008, "step": 3687 }, { "epoch": 1.9198334200937013, "grad_norm": 0.22009662025394322, "learning_rate": 1.5157766095834581e-05, "loss": 0.1015, "step": 3688 }, { "epoch": 1.920353982300885, "grad_norm": 0.21359112700683444, "learning_rate": 1.5144852227036658e-05, "loss": 0.1021, "step": 3689 }, { "epoch": 1.9208745445080688, "grad_norm": 0.2333651788145784, "learning_rate": 1.5131941470953026e-05, "loss": 0.1082, "step": 3690 }, { "epoch": 1.9213951067152526, "grad_norm": 0.2199424676535258, "learning_rate": 1.5119033831661489e-05, "loss": 0.0969, "step": 3691 }, { "epoch": 1.9219156689224364, "grad_norm": 0.21869088466324416, "learning_rate": 1.5106129313238898e-05, "loss": 0.1047, "step": 3692 }, { "epoch": 1.9224362311296201, "grad_norm": 0.2257137266532471, "learning_rate": 1.5093227919761082e-05, "loss": 0.1059, "step": 3693 }, { "epoch": 1.9229567933368037, "grad_norm": 0.2042823698266939, "learning_rate": 1.5080329655302916e-05, "loss": 0.0964, "step": 3694 }, { "epoch": 1.9234773555439875, "grad_norm": 0.21289243041563377, "learning_rate": 1.5067434523938263e-05, "loss": 0.0969, "step": 3695 }, { "epoch": 1.9239979177511712, "grad_norm": 0.2157354773694145, "learning_rate": 1.5054542529740009e-05, "loss": 0.1054, "step": 3696 }, { "epoch": 1.924518479958355, "grad_norm": 0.20594700902414997, "learning_rate": 1.504165367678006e-05, "loss": 0.0991, "step": 3697 }, { "epoch": 1.9250390421655388, "grad_norm": 0.2251525996090509, "learning_rate": 1.5028767969129288e-05, "loss": 0.0977, "step": 3698 }, { "epoch": 1.9255596043727226, "grad_norm": 0.21960241687587326, "learning_rate": 1.5015885410857616e-05, "loss": 0.0994, "step": 3699 }, { "epoch": 1.9260801665799063, "grad_norm": 0.20915888582744993, "learning_rate": 1.5003006006033948e-05, "loss": 0.0996, "step": 3700 }, { "epoch": 1.92660072878709, "grad_norm": 0.2225064521826276, "learning_rate": 1.4990129758726203e-05, "loss": 0.1028, "step": 3701 }, { "epoch": 1.9271212909942737, "grad_norm": 0.22458058269108447, "learning_rate": 1.4977256673001305e-05, "loss": 0.1027, "step": 3702 }, { "epoch": 1.9276418532014574, "grad_norm": 0.21114252037998082, "learning_rate": 1.4964386752925163e-05, "loss": 0.0958, "step": 3703 }, { "epoch": 1.9281624154086412, "grad_norm": 0.21678013606532165, "learning_rate": 1.4951520002562705e-05, "loss": 0.0992, "step": 3704 }, { "epoch": 1.928682977615825, "grad_norm": 0.20924637264966345, "learning_rate": 1.4938656425977842e-05, "loss": 0.0971, "step": 3705 }, { "epoch": 1.9292035398230087, "grad_norm": 0.21121389074080546, "learning_rate": 1.4925796027233505e-05, "loss": 0.1028, "step": 3706 }, { "epoch": 1.9297241020301925, "grad_norm": 0.2188455100447437, "learning_rate": 1.4912938810391591e-05, "loss": 0.1019, "step": 3707 }, { "epoch": 1.9302446642373763, "grad_norm": 0.22810063283033222, "learning_rate": 1.4900084779513004e-05, "loss": 0.1034, "step": 3708 }, { "epoch": 1.93076522644456, "grad_norm": 0.2144538872962715, "learning_rate": 1.488723393865766e-05, "loss": 0.1016, "step": 3709 }, { "epoch": 1.9312857886517438, "grad_norm": 0.20918446229699433, "learning_rate": 1.487438629188444e-05, "loss": 0.0993, "step": 3710 }, { "epoch": 1.9318063508589276, "grad_norm": 0.23023524817864033, "learning_rate": 1.4861541843251242e-05, "loss": 0.1019, "step": 3711 }, { "epoch": 1.9323269130661114, "grad_norm": 0.22076188050168008, "learning_rate": 1.4848700596814926e-05, "loss": 0.101, "step": 3712 }, { "epoch": 1.9328474752732951, "grad_norm": 0.21201820939269886, "learning_rate": 1.483586255663137e-05, "loss": 0.1034, "step": 3713 }, { "epoch": 1.933368037480479, "grad_norm": 0.2154132891161572, "learning_rate": 1.482302772675541e-05, "loss": 0.0982, "step": 3714 }, { "epoch": 1.9338885996876627, "grad_norm": 0.22979358381751835, "learning_rate": 1.4810196111240898e-05, "loss": 0.105, "step": 3715 }, { "epoch": 1.9344091618948465, "grad_norm": 0.2170365978314031, "learning_rate": 1.4797367714140642e-05, "loss": 0.1036, "step": 3716 }, { "epoch": 1.9349297241020302, "grad_norm": 0.21104199550912994, "learning_rate": 1.4784542539506447e-05, "loss": 0.0989, "step": 3717 }, { "epoch": 1.935450286309214, "grad_norm": 0.21704690446239305, "learning_rate": 1.477172059138911e-05, "loss": 0.0988, "step": 3718 }, { "epoch": 1.9359708485163978, "grad_norm": 0.2216423717123099, "learning_rate": 1.4758901873838387e-05, "loss": 0.1008, "step": 3719 }, { "epoch": 1.9364914107235816, "grad_norm": 0.2224915756445421, "learning_rate": 1.4746086390903041e-05, "loss": 0.1028, "step": 3720 }, { "epoch": 1.9370119729307653, "grad_norm": 0.22165523853861174, "learning_rate": 1.4733274146630782e-05, "loss": 0.1027, "step": 3721 }, { "epoch": 1.937532535137949, "grad_norm": 0.2104242110992599, "learning_rate": 1.472046514506832e-05, "loss": 0.0999, "step": 3722 }, { "epoch": 1.9380530973451329, "grad_norm": 0.2183583161427088, "learning_rate": 1.4707659390261336e-05, "loss": 0.1008, "step": 3723 }, { "epoch": 1.9385736595523166, "grad_norm": 0.2205753671797551, "learning_rate": 1.4694856886254484e-05, "loss": 0.0993, "step": 3724 }, { "epoch": 1.9390942217595004, "grad_norm": 0.21265574296368048, "learning_rate": 1.4682057637091386e-05, "loss": 0.099, "step": 3725 }, { "epoch": 1.939614783966684, "grad_norm": 0.21653176063173926, "learning_rate": 1.4669261646814637e-05, "loss": 0.0994, "step": 3726 }, { "epoch": 1.9401353461738677, "grad_norm": 0.23412308864317125, "learning_rate": 1.4656468919465816e-05, "loss": 0.1081, "step": 3727 }, { "epoch": 1.9406559083810515, "grad_norm": 0.22672128076546094, "learning_rate": 1.4643679459085451e-05, "loss": 0.0958, "step": 3728 }, { "epoch": 1.9411764705882353, "grad_norm": 0.21722673840560922, "learning_rate": 1.4630893269713058e-05, "loss": 0.101, "step": 3729 }, { "epoch": 1.941697032795419, "grad_norm": 0.21147974417849058, "learning_rate": 1.4618110355387105e-05, "loss": 0.1029, "step": 3730 }, { "epoch": 1.9422175950026028, "grad_norm": 0.21231022676958888, "learning_rate": 1.4605330720145036e-05, "loss": 0.1004, "step": 3731 }, { "epoch": 1.9427381572097866, "grad_norm": 0.2202858395550634, "learning_rate": 1.4592554368023248e-05, "loss": 0.1061, "step": 3732 }, { "epoch": 1.9432587194169704, "grad_norm": 0.2079520954395182, "learning_rate": 1.4579781303057122e-05, "loss": 0.0993, "step": 3733 }, { "epoch": 1.943779281624154, "grad_norm": 0.2232459002547435, "learning_rate": 1.4567011529280963e-05, "loss": 0.1016, "step": 3734 }, { "epoch": 1.9442998438313377, "grad_norm": 0.2131387702891851, "learning_rate": 1.4554245050728085e-05, "loss": 0.0976, "step": 3735 }, { "epoch": 1.9448204060385215, "grad_norm": 0.21742449638032005, "learning_rate": 1.4541481871430712e-05, "loss": 0.0997, "step": 3736 }, { "epoch": 1.9453409682457052, "grad_norm": 0.21341755263048834, "learning_rate": 1.4528721995420065e-05, "loss": 0.1032, "step": 3737 }, { "epoch": 1.945861530452889, "grad_norm": 0.22009074356689978, "learning_rate": 1.4515965426726297e-05, "loss": 0.1059, "step": 3738 }, { "epoch": 1.9463820926600728, "grad_norm": 0.21912610193463983, "learning_rate": 1.4503212169378549e-05, "loss": 0.1001, "step": 3739 }, { "epoch": 1.9469026548672566, "grad_norm": 0.2170439457377092, "learning_rate": 1.449046222740486e-05, "loss": 0.1057, "step": 3740 }, { "epoch": 1.9474232170744403, "grad_norm": 0.21785895921005488, "learning_rate": 1.4477715604832277e-05, "loss": 0.1031, "step": 3741 }, { "epoch": 1.947943779281624, "grad_norm": 0.21495265954061052, "learning_rate": 1.4464972305686777e-05, "loss": 0.0986, "step": 3742 }, { "epoch": 1.9484643414888079, "grad_norm": 0.20444844893689895, "learning_rate": 1.4452232333993271e-05, "loss": 0.0953, "step": 3743 }, { "epoch": 1.9489849036959916, "grad_norm": 0.22666704840339114, "learning_rate": 1.4439495693775657e-05, "loss": 0.1055, "step": 3744 }, { "epoch": 1.9495054659031754, "grad_norm": 0.21327639711763433, "learning_rate": 1.4426762389056735e-05, "loss": 0.1034, "step": 3745 }, { "epoch": 1.9500260281103592, "grad_norm": 0.2184419717310296, "learning_rate": 1.4414032423858287e-05, "loss": 0.1047, "step": 3746 }, { "epoch": 1.950546590317543, "grad_norm": 0.21193552764734155, "learning_rate": 1.4401305802201029e-05, "loss": 0.1026, "step": 3747 }, { "epoch": 1.9510671525247267, "grad_norm": 0.2209102393312676, "learning_rate": 1.4388582528104628e-05, "loss": 0.1011, "step": 3748 }, { "epoch": 1.9515877147319105, "grad_norm": 0.21657115471489805, "learning_rate": 1.4375862605587668e-05, "loss": 0.0986, "step": 3749 }, { "epoch": 1.9521082769390943, "grad_norm": 0.21434326778156354, "learning_rate": 1.4363146038667702e-05, "loss": 0.1012, "step": 3750 }, { "epoch": 1.952628839146278, "grad_norm": 0.21320540069825253, "learning_rate": 1.4350432831361221e-05, "loss": 0.0988, "step": 3751 }, { "epoch": 1.9531494013534618, "grad_norm": 0.22627494817868693, "learning_rate": 1.4337722987683632e-05, "loss": 0.101, "step": 3752 }, { "epoch": 1.9536699635606456, "grad_norm": 0.2194395485328437, "learning_rate": 1.4325016511649315e-05, "loss": 0.0982, "step": 3753 }, { "epoch": 1.9541905257678294, "grad_norm": 0.21191340497198383, "learning_rate": 1.4312313407271543e-05, "loss": 0.099, "step": 3754 }, { "epoch": 1.9547110879750131, "grad_norm": 0.21476576576191445, "learning_rate": 1.429961367856256e-05, "loss": 0.0995, "step": 3755 }, { "epoch": 1.955231650182197, "grad_norm": 0.21770044547304337, "learning_rate": 1.4286917329533528e-05, "loss": 0.1015, "step": 3756 }, { "epoch": 1.9557522123893807, "grad_norm": 0.21158808266880086, "learning_rate": 1.427422436419456e-05, "loss": 0.098, "step": 3757 }, { "epoch": 1.9562727745965642, "grad_norm": 0.21361489248889914, "learning_rate": 1.4261534786554661e-05, "loss": 0.1001, "step": 3758 }, { "epoch": 1.956793336803748, "grad_norm": 0.21171491539393147, "learning_rate": 1.4248848600621801e-05, "loss": 0.1007, "step": 3759 }, { "epoch": 1.9573138990109318, "grad_norm": 0.21237977936588603, "learning_rate": 1.4236165810402879e-05, "loss": 0.0974, "step": 3760 }, { "epoch": 1.9578344612181156, "grad_norm": 0.22772433653264357, "learning_rate": 1.4223486419903692e-05, "loss": 0.105, "step": 3761 }, { "epoch": 1.9583550234252993, "grad_norm": 0.21822253874910252, "learning_rate": 1.4210810433128997e-05, "loss": 0.1027, "step": 3762 }, { "epoch": 1.958875585632483, "grad_norm": 0.21968663113465622, "learning_rate": 1.4198137854082443e-05, "loss": 0.0979, "step": 3763 }, { "epoch": 1.9593961478396669, "grad_norm": 0.21509071607759678, "learning_rate": 1.4185468686766628e-05, "loss": 0.1002, "step": 3764 }, { "epoch": 1.9599167100468506, "grad_norm": 0.21833030367787082, "learning_rate": 1.4172802935183071e-05, "loss": 0.104, "step": 3765 }, { "epoch": 1.9604372722540342, "grad_norm": 0.21150315924836155, "learning_rate": 1.41601406033322e-05, "loss": 0.0976, "step": 3766 }, { "epoch": 1.960957834461218, "grad_norm": 0.22119462315092003, "learning_rate": 1.4147481695213377e-05, "loss": 0.101, "step": 3767 }, { "epoch": 1.9614783966684017, "grad_norm": 0.22377655608418107, "learning_rate": 1.4134826214824859e-05, "loss": 0.1087, "step": 3768 }, { "epoch": 1.9619989588755855, "grad_norm": 0.214726683314058, "learning_rate": 1.4122174166163853e-05, "loss": 0.1011, "step": 3769 }, { "epoch": 1.9625195210827693, "grad_norm": 0.22428658200555723, "learning_rate": 1.410952555322645e-05, "loss": 0.1002, "step": 3770 }, { "epoch": 1.963040083289953, "grad_norm": 0.22440945050923508, "learning_rate": 1.4096880380007673e-05, "loss": 0.1079, "step": 3771 }, { "epoch": 1.9635606454971368, "grad_norm": 0.22006424334394925, "learning_rate": 1.4084238650501471e-05, "loss": 0.103, "step": 3772 }, { "epoch": 1.9640812077043206, "grad_norm": 0.2078275175323549, "learning_rate": 1.4071600368700668e-05, "loss": 0.0965, "step": 3773 }, { "epoch": 1.9646017699115044, "grad_norm": 0.20908961325681313, "learning_rate": 1.4058965538597033e-05, "loss": 0.0958, "step": 3774 }, { "epoch": 1.9651223321186881, "grad_norm": 0.23113004080134392, "learning_rate": 1.4046334164181232e-05, "loss": 0.1046, "step": 3775 }, { "epoch": 1.965642894325872, "grad_norm": 0.21341710865458288, "learning_rate": 1.4033706249442852e-05, "loss": 0.0992, "step": 3776 }, { "epoch": 1.9661634565330557, "grad_norm": 0.2255057605756392, "learning_rate": 1.4021081798370356e-05, "loss": 0.1076, "step": 3777 }, { "epoch": 1.9666840187402395, "grad_norm": 0.22209298875300015, "learning_rate": 1.4008460814951151e-05, "loss": 0.1057, "step": 3778 }, { "epoch": 1.9672045809474232, "grad_norm": 0.22187334155214908, "learning_rate": 1.3995843303171517e-05, "loss": 0.0982, "step": 3779 }, { "epoch": 1.967725143154607, "grad_norm": 0.2103303538717618, "learning_rate": 1.3983229267016653e-05, "loss": 0.1011, "step": 3780 }, { "epoch": 1.9682457053617908, "grad_norm": 0.2040935740244857, "learning_rate": 1.3970618710470676e-05, "loss": 0.0998, "step": 3781 }, { "epoch": 1.9687662675689745, "grad_norm": 0.22402864584457863, "learning_rate": 1.395801163751656e-05, "loss": 0.1026, "step": 3782 }, { "epoch": 1.9692868297761583, "grad_norm": 0.20692376854563577, "learning_rate": 1.3945408052136222e-05, "loss": 0.0981, "step": 3783 }, { "epoch": 1.969807391983342, "grad_norm": 0.22980259328124997, "learning_rate": 1.3932807958310456e-05, "loss": 0.1076, "step": 3784 }, { "epoch": 1.9703279541905259, "grad_norm": 0.20367490497979784, "learning_rate": 1.392021136001897e-05, "loss": 0.0968, "step": 3785 }, { "epoch": 1.9708485163977096, "grad_norm": 0.20898246158228093, "learning_rate": 1.3907618261240334e-05, "loss": 0.1033, "step": 3786 }, { "epoch": 1.9713690786048934, "grad_norm": 0.2113639999243892, "learning_rate": 1.3895028665952058e-05, "loss": 0.1014, "step": 3787 }, { "epoch": 1.9718896408120772, "grad_norm": 0.2175367277530161, "learning_rate": 1.38824425781305e-05, "loss": 0.1008, "step": 3788 }, { "epoch": 1.972410203019261, "grad_norm": 0.21182935807043565, "learning_rate": 1.3869860001750942e-05, "loss": 0.1013, "step": 3789 }, { "epoch": 1.9729307652264445, "grad_norm": 0.22349166273303656, "learning_rate": 1.3857280940787559e-05, "loss": 0.0988, "step": 3790 }, { "epoch": 1.9734513274336283, "grad_norm": 0.21190197883778825, "learning_rate": 1.3844705399213379e-05, "loss": 0.0995, "step": 3791 }, { "epoch": 1.973971889640812, "grad_norm": 0.2193754184132712, "learning_rate": 1.3832133381000359e-05, "loss": 0.105, "step": 3792 }, { "epoch": 1.9744924518479958, "grad_norm": 0.22565894195227415, "learning_rate": 1.3819564890119325e-05, "loss": 0.0992, "step": 3793 }, { "epoch": 1.9750130140551796, "grad_norm": 0.2223063511163848, "learning_rate": 1.3806999930539998e-05, "loss": 0.1082, "step": 3794 }, { "epoch": 1.9755335762623634, "grad_norm": 0.2182868394326183, "learning_rate": 1.3794438506230967e-05, "loss": 0.0969, "step": 3795 }, { "epoch": 1.9760541384695471, "grad_norm": 0.21985719559638187, "learning_rate": 1.378188062115972e-05, "loss": 0.1031, "step": 3796 }, { "epoch": 1.976574700676731, "grad_norm": 0.21709916965896411, "learning_rate": 1.3769326279292616e-05, "loss": 0.1, "step": 3797 }, { "epoch": 1.9770952628839145, "grad_norm": 0.21263835779355764, "learning_rate": 1.3756775484594896e-05, "loss": 0.1004, "step": 3798 }, { "epoch": 1.9776158250910982, "grad_norm": 0.21782936591583624, "learning_rate": 1.374422824103071e-05, "loss": 0.102, "step": 3799 }, { "epoch": 1.978136387298282, "grad_norm": 0.21792766988581672, "learning_rate": 1.373168455256303e-05, "loss": 0.1007, "step": 3800 }, { "epoch": 1.9786569495054658, "grad_norm": 0.20963925747095527, "learning_rate": 1.3719144423153751e-05, "loss": 0.0944, "step": 3801 }, { "epoch": 1.9791775117126496, "grad_norm": 0.2122594378833588, "learning_rate": 1.370660785676363e-05, "loss": 0.0998, "step": 3802 }, { "epoch": 1.9796980739198333, "grad_norm": 0.2213288489164458, "learning_rate": 1.3694074857352308e-05, "loss": 0.1002, "step": 3803 }, { "epoch": 1.980218636127017, "grad_norm": 0.21750190142433626, "learning_rate": 1.3681545428878268e-05, "loss": 0.1015, "step": 3804 }, { "epoch": 1.9807391983342009, "grad_norm": 0.21026897260525107, "learning_rate": 1.3669019575298902e-05, "loss": 0.097, "step": 3805 }, { "epoch": 1.9812597605413846, "grad_norm": 0.22344596462757446, "learning_rate": 1.3656497300570448e-05, "loss": 0.1017, "step": 3806 }, { "epoch": 1.9817803227485684, "grad_norm": 0.23421057868440046, "learning_rate": 1.3643978608648028e-05, "loss": 0.1042, "step": 3807 }, { "epoch": 1.9823008849557522, "grad_norm": 0.2200839534373584, "learning_rate": 1.3631463503485634e-05, "loss": 0.0985, "step": 3808 }, { "epoch": 1.982821447162936, "grad_norm": 0.21843886850429678, "learning_rate": 1.3618951989036102e-05, "loss": 0.1027, "step": 3809 }, { "epoch": 1.9833420093701197, "grad_norm": 0.21096923350471475, "learning_rate": 1.3606444069251162e-05, "loss": 0.0993, "step": 3810 }, { "epoch": 1.9838625715773035, "grad_norm": 0.21343331941361038, "learning_rate": 1.3593939748081393e-05, "loss": 0.0993, "step": 3811 }, { "epoch": 1.9843831337844873, "grad_norm": 0.22152103395034903, "learning_rate": 1.3581439029476255e-05, "loss": 0.1011, "step": 3812 }, { "epoch": 1.984903695991671, "grad_norm": 0.20750708820454272, "learning_rate": 1.3568941917384036e-05, "loss": 0.1011, "step": 3813 }, { "epoch": 1.9854242581988548, "grad_norm": 0.22282972962697625, "learning_rate": 1.3556448415751927e-05, "loss": 0.1, "step": 3814 }, { "epoch": 1.9859448204060386, "grad_norm": 0.21973378480721914, "learning_rate": 1.3543958528525934e-05, "loss": 0.0998, "step": 3815 }, { "epoch": 1.9864653826132224, "grad_norm": 0.21975469487000557, "learning_rate": 1.3531472259650956e-05, "loss": 0.1027, "step": 3816 }, { "epoch": 1.9869859448204061, "grad_norm": 0.20704264881854378, "learning_rate": 1.3518989613070745e-05, "loss": 0.0992, "step": 3817 }, { "epoch": 1.98750650702759, "grad_norm": 0.2276001326716296, "learning_rate": 1.3506510592727889e-05, "loss": 0.103, "step": 3818 }, { "epoch": 1.9880270692347737, "grad_norm": 0.21862242414923339, "learning_rate": 1.3494035202563842e-05, "loss": 0.102, "step": 3819 }, { "epoch": 1.9885476314419575, "grad_norm": 0.20514072992531857, "learning_rate": 1.3481563446518924e-05, "loss": 0.0979, "step": 3820 }, { "epoch": 1.9890681936491412, "grad_norm": 0.219390795219004, "learning_rate": 1.3469095328532305e-05, "loss": 0.1015, "step": 3821 }, { "epoch": 1.9895887558563248, "grad_norm": 0.2234331591905043, "learning_rate": 1.3456630852541968e-05, "loss": 0.0986, "step": 3822 }, { "epoch": 1.9901093180635085, "grad_norm": 0.22043672801905884, "learning_rate": 1.3444170022484803e-05, "loss": 0.1, "step": 3823 }, { "epoch": 1.9906298802706923, "grad_norm": 0.2121077118944676, "learning_rate": 1.3431712842296495e-05, "loss": 0.0992, "step": 3824 }, { "epoch": 1.991150442477876, "grad_norm": 0.2098280218821776, "learning_rate": 1.3419259315911612e-05, "loss": 0.0974, "step": 3825 }, { "epoch": 1.9916710046850599, "grad_norm": 0.22151509852945767, "learning_rate": 1.3406809447263569e-05, "loss": 0.1058, "step": 3826 }, { "epoch": 1.9921915668922436, "grad_norm": 0.2188216176247912, "learning_rate": 1.3394363240284596e-05, "loss": 0.1001, "step": 3827 }, { "epoch": 1.9927121290994274, "grad_norm": 0.2175007496547397, "learning_rate": 1.3381920698905787e-05, "loss": 0.0997, "step": 3828 }, { "epoch": 1.9932326913066112, "grad_norm": 0.22386791931729272, "learning_rate": 1.3369481827057084e-05, "loss": 0.1046, "step": 3829 }, { "epoch": 1.9937532535137947, "grad_norm": 0.22189589045670546, "learning_rate": 1.3357046628667266e-05, "loss": 0.102, "step": 3830 }, { "epoch": 1.9942738157209785, "grad_norm": 0.22222680495180996, "learning_rate": 1.3344615107663929e-05, "loss": 0.1015, "step": 3831 }, { "epoch": 1.9947943779281623, "grad_norm": 0.21165370943689257, "learning_rate": 1.3332187267973545e-05, "loss": 0.1, "step": 3832 }, { "epoch": 1.995314940135346, "grad_norm": 0.21493189063483512, "learning_rate": 1.3319763113521388e-05, "loss": 0.0988, "step": 3833 }, { "epoch": 1.9958355023425298, "grad_norm": 0.2152720610885134, "learning_rate": 1.3307342648231587e-05, "loss": 0.1046, "step": 3834 }, { "epoch": 1.9963560645497136, "grad_norm": 0.22504773486380045, "learning_rate": 1.3294925876027112e-05, "loss": 0.1018, "step": 3835 }, { "epoch": 1.9968766267568974, "grad_norm": 0.21719340129386458, "learning_rate": 1.3282512800829761e-05, "loss": 0.0994, "step": 3836 }, { "epoch": 1.9973971889640811, "grad_norm": 0.2351408224727131, "learning_rate": 1.3270103426560138e-05, "loss": 0.1031, "step": 3837 }, { "epoch": 1.997917751171265, "grad_norm": 0.22730448971582903, "learning_rate": 1.3257697757137722e-05, "loss": 0.1018, "step": 3838 }, { "epoch": 1.9984383133784487, "grad_norm": 0.2240262905997672, "learning_rate": 1.3245295796480789e-05, "loss": 0.0991, "step": 3839 }, { "epoch": 1.9989588755856325, "grad_norm": 0.20159272727142402, "learning_rate": 1.3232897548506473e-05, "loss": 0.0929, "step": 3840 }, { "epoch": 1.9994794377928162, "grad_norm": 0.2185282556675355, "learning_rate": 1.3220503017130702e-05, "loss": 0.1029, "step": 3841 }, { "epoch": 2.0, "grad_norm": 0.21010022623920993, "learning_rate": 1.3208112206268241e-05, "loss": 0.0951, "step": 3842 }, { "epoch": 2.0005205622071838, "grad_norm": 0.23888745487077462, "learning_rate": 1.3195725119832692e-05, "loss": 0.0589, "step": 3843 }, { "epoch": 2.0010411244143675, "grad_norm": 0.22113727902582608, "learning_rate": 1.3183341761736474e-05, "loss": 0.059, "step": 3844 }, { "epoch": 2.0015616866215513, "grad_norm": 0.2100996955115601, "learning_rate": 1.3170962135890838e-05, "loss": 0.057, "step": 3845 }, { "epoch": 2.002082248828735, "grad_norm": 0.21218819629958469, "learning_rate": 1.3158586246205823e-05, "loss": 0.0553, "step": 3846 }, { "epoch": 2.002602811035919, "grad_norm": 0.20870010973163983, "learning_rate": 1.3146214096590326e-05, "loss": 0.0527, "step": 3847 }, { "epoch": 2.0031233732431026, "grad_norm": 0.2324010648172641, "learning_rate": 1.3133845690952045e-05, "loss": 0.0533, "step": 3848 }, { "epoch": 2.0036439354502864, "grad_norm": 0.27402223811767706, "learning_rate": 1.3121481033197508e-05, "loss": 0.0548, "step": 3849 }, { "epoch": 2.00416449765747, "grad_norm": 0.3136132661537596, "learning_rate": 1.3109120127232039e-05, "loss": 0.0561, "step": 3850 }, { "epoch": 2.004685059864654, "grad_norm": 0.28713280826038895, "learning_rate": 1.3096762976959776e-05, "loss": 0.0554, "step": 3851 }, { "epoch": 2.0052056220718377, "grad_norm": 0.2568407408897936, "learning_rate": 1.3084409586283696e-05, "loss": 0.0537, "step": 3852 }, { "epoch": 2.0057261842790215, "grad_norm": 0.2422352941267545, "learning_rate": 1.307205995910557e-05, "loss": 0.0562, "step": 3853 }, { "epoch": 2.0062467464862053, "grad_norm": 0.2393420163757943, "learning_rate": 1.3059714099326e-05, "loss": 0.055, "step": 3854 }, { "epoch": 2.006767308693389, "grad_norm": 0.22062809727347588, "learning_rate": 1.3047372010844361e-05, "loss": 0.0532, "step": 3855 }, { "epoch": 2.007287870900573, "grad_norm": 0.22318736134320938, "learning_rate": 1.3035033697558868e-05, "loss": 0.052, "step": 3856 }, { "epoch": 2.0078084331077566, "grad_norm": 0.2175095710848304, "learning_rate": 1.302269916336653e-05, "loss": 0.0532, "step": 3857 }, { "epoch": 2.0083289953149404, "grad_norm": 0.2043884688220176, "learning_rate": 1.3010368412163187e-05, "loss": 0.0522, "step": 3858 }, { "epoch": 2.0088495575221237, "grad_norm": 0.21515354889304936, "learning_rate": 1.2998041447843448e-05, "loss": 0.055, "step": 3859 }, { "epoch": 2.0093701197293075, "grad_norm": 0.22420587505565287, "learning_rate": 1.2985718274300731e-05, "loss": 0.0516, "step": 3860 }, { "epoch": 2.0098906819364912, "grad_norm": 0.2049175307303528, "learning_rate": 1.2973398895427283e-05, "loss": 0.0519, "step": 3861 }, { "epoch": 2.010411244143675, "grad_norm": 0.21591901082933884, "learning_rate": 1.2961083315114131e-05, "loss": 0.0527, "step": 3862 }, { "epoch": 2.0109318063508588, "grad_norm": 0.22484304767098973, "learning_rate": 1.2948771537251119e-05, "loss": 0.053, "step": 3863 }, { "epoch": 2.0114523685580425, "grad_norm": 0.22695563724281367, "learning_rate": 1.2936463565726864e-05, "loss": 0.0522, "step": 3864 }, { "epoch": 2.0119729307652263, "grad_norm": 0.22138427669788502, "learning_rate": 1.2924159404428803e-05, "loss": 0.052, "step": 3865 }, { "epoch": 2.01249349297241, "grad_norm": 0.2260270817057609, "learning_rate": 1.2911859057243165e-05, "loss": 0.052, "step": 3866 }, { "epoch": 2.013014055179594, "grad_norm": 0.22818335275933718, "learning_rate": 1.2899562528054981e-05, "loss": 0.0514, "step": 3867 }, { "epoch": 2.0135346173867776, "grad_norm": 0.2291490389139092, "learning_rate": 1.2887269820748044e-05, "loss": 0.0522, "step": 3868 }, { "epoch": 2.0140551795939614, "grad_norm": 0.2253847436578246, "learning_rate": 1.2874980939204984e-05, "loss": 0.0514, "step": 3869 }, { "epoch": 2.014575741801145, "grad_norm": 0.2376467323645594, "learning_rate": 1.2862695887307186e-05, "loss": 0.0531, "step": 3870 }, { "epoch": 2.015096304008329, "grad_norm": 0.22645666631710104, "learning_rate": 1.285041466893485e-05, "loss": 0.052, "step": 3871 }, { "epoch": 2.0156168662155127, "grad_norm": 0.2422877418801926, "learning_rate": 1.2838137287966961e-05, "loss": 0.0507, "step": 3872 }, { "epoch": 2.0161374284226965, "grad_norm": 0.2506319561703479, "learning_rate": 1.282586374828127e-05, "loss": 0.0503, "step": 3873 }, { "epoch": 2.0166579906298803, "grad_norm": 0.23453280208493882, "learning_rate": 1.2813594053754346e-05, "loss": 0.0543, "step": 3874 }, { "epoch": 2.017178552837064, "grad_norm": 0.22699601063395236, "learning_rate": 1.2801328208261526e-05, "loss": 0.0524, "step": 3875 }, { "epoch": 2.017699115044248, "grad_norm": 0.24112589242280094, "learning_rate": 1.2789066215676943e-05, "loss": 0.0522, "step": 3876 }, { "epoch": 2.0182196772514316, "grad_norm": 0.23871504911812172, "learning_rate": 1.2776808079873487e-05, "loss": 0.0535, "step": 3877 }, { "epoch": 2.0187402394586154, "grad_norm": 0.23353444655198632, "learning_rate": 1.2764553804722867e-05, "loss": 0.052, "step": 3878 }, { "epoch": 2.019260801665799, "grad_norm": 0.21500168671332048, "learning_rate": 1.2752303394095538e-05, "loss": 0.0499, "step": 3879 }, { "epoch": 2.019781363872983, "grad_norm": 0.22650014917043784, "learning_rate": 1.2740056851860754e-05, "loss": 0.0556, "step": 3880 }, { "epoch": 2.0203019260801667, "grad_norm": 0.23063435161871682, "learning_rate": 1.2727814181886555e-05, "loss": 0.0529, "step": 3881 }, { "epoch": 2.0208224882873504, "grad_norm": 0.22502317978846956, "learning_rate": 1.2715575388039724e-05, "loss": 0.0531, "step": 3882 }, { "epoch": 2.021343050494534, "grad_norm": 0.23365639149073505, "learning_rate": 1.2703340474185854e-05, "loss": 0.0542, "step": 3883 }, { "epoch": 2.021863612701718, "grad_norm": 0.22024899646428664, "learning_rate": 1.2691109444189303e-05, "loss": 0.0503, "step": 3884 }, { "epoch": 2.0223841749089018, "grad_norm": 0.23270362452207452, "learning_rate": 1.2678882301913202e-05, "loss": 0.0551, "step": 3885 }, { "epoch": 2.0229047371160855, "grad_norm": 0.23138418992597184, "learning_rate": 1.2666659051219437e-05, "loss": 0.053, "step": 3886 }, { "epoch": 2.0234252993232693, "grad_norm": 0.2239621991704845, "learning_rate": 1.2654439695968696e-05, "loss": 0.0508, "step": 3887 }, { "epoch": 2.023945861530453, "grad_norm": 0.21409030138660093, "learning_rate": 1.2642224240020404e-05, "loss": 0.0521, "step": 3888 }, { "epoch": 2.024466423737637, "grad_norm": 0.2342885910165884, "learning_rate": 1.263001268723278e-05, "loss": 0.0511, "step": 3889 }, { "epoch": 2.02498698594482, "grad_norm": 0.230010672562304, "learning_rate": 1.2617805041462805e-05, "loss": 0.0532, "step": 3890 }, { "epoch": 2.025507548152004, "grad_norm": 0.22439174131590303, "learning_rate": 1.2605601306566205e-05, "loss": 0.0506, "step": 3891 }, { "epoch": 2.0260281103591877, "grad_norm": 0.22388621675313647, "learning_rate": 1.2593401486397499e-05, "loss": 0.0517, "step": 3892 }, { "epoch": 2.0265486725663715, "grad_norm": 0.22601607131458107, "learning_rate": 1.258120558480996e-05, "loss": 0.0526, "step": 3893 }, { "epoch": 2.0270692347735553, "grad_norm": 0.22319982470499325, "learning_rate": 1.2569013605655627e-05, "loss": 0.0503, "step": 3894 }, { "epoch": 2.027589796980739, "grad_norm": 0.23487166926356898, "learning_rate": 1.2556825552785273e-05, "loss": 0.0527, "step": 3895 }, { "epoch": 2.028110359187923, "grad_norm": 0.23072170266065373, "learning_rate": 1.2544641430048479e-05, "loss": 0.052, "step": 3896 }, { "epoch": 2.0286309213951066, "grad_norm": 0.22670085314206623, "learning_rate": 1.2532461241293531e-05, "loss": 0.0505, "step": 3897 }, { "epoch": 2.0291514836022904, "grad_norm": 0.23480717165950493, "learning_rate": 1.2520284990367514e-05, "loss": 0.052, "step": 3898 }, { "epoch": 2.029672045809474, "grad_norm": 0.22717862102204162, "learning_rate": 1.2508112681116263e-05, "loss": 0.0513, "step": 3899 }, { "epoch": 2.030192608016658, "grad_norm": 0.2259905619825204, "learning_rate": 1.2495944317384337e-05, "loss": 0.0513, "step": 3900 }, { "epoch": 2.0307131702238417, "grad_norm": 0.23979551126797663, "learning_rate": 1.2483779903015086e-05, "loss": 0.0498, "step": 3901 }, { "epoch": 2.0312337324310255, "grad_norm": 0.22847333964606212, "learning_rate": 1.2471619441850596e-05, "loss": 0.0509, "step": 3902 }, { "epoch": 2.0317542946382092, "grad_norm": 0.23177006094278615, "learning_rate": 1.2459462937731708e-05, "loss": 0.0506, "step": 3903 }, { "epoch": 2.032274856845393, "grad_norm": 0.24262261994408266, "learning_rate": 1.2447310394498019e-05, "loss": 0.053, "step": 3904 }, { "epoch": 2.0327954190525768, "grad_norm": 0.23652782752700235, "learning_rate": 1.2435161815987859e-05, "loss": 0.0528, "step": 3905 }, { "epoch": 2.0333159812597605, "grad_norm": 0.23162625176396265, "learning_rate": 1.2423017206038307e-05, "loss": 0.0537, "step": 3906 }, { "epoch": 2.0338365434669443, "grad_norm": 0.22628246516711711, "learning_rate": 1.2410876568485203e-05, "loss": 0.0523, "step": 3907 }, { "epoch": 2.034357105674128, "grad_norm": 0.23814151638305578, "learning_rate": 1.2398739907163124e-05, "loss": 0.0518, "step": 3908 }, { "epoch": 2.034877667881312, "grad_norm": 0.24212468958455002, "learning_rate": 1.2386607225905405e-05, "loss": 0.0522, "step": 3909 }, { "epoch": 2.0353982300884956, "grad_norm": 0.22458499825758288, "learning_rate": 1.2374478528544092e-05, "loss": 0.0503, "step": 3910 }, { "epoch": 2.0359187922956794, "grad_norm": 0.2154706609637643, "learning_rate": 1.2362353818910002e-05, "loss": 0.0505, "step": 3911 }, { "epoch": 2.036439354502863, "grad_norm": 0.23207911313268095, "learning_rate": 1.2350233100832678e-05, "loss": 0.0523, "step": 3912 }, { "epoch": 2.036959916710047, "grad_norm": 0.2275097951477535, "learning_rate": 1.2338116378140424e-05, "loss": 0.0508, "step": 3913 }, { "epoch": 2.0374804789172307, "grad_norm": 0.24046310726808087, "learning_rate": 1.2326003654660249e-05, "loss": 0.0486, "step": 3914 }, { "epoch": 2.0380010411244145, "grad_norm": 0.25638052879623396, "learning_rate": 1.2313894934217907e-05, "loss": 0.0506, "step": 3915 }, { "epoch": 2.0385216033315983, "grad_norm": 0.2241861365775457, "learning_rate": 1.2301790220637904e-05, "loss": 0.0504, "step": 3916 }, { "epoch": 2.039042165538782, "grad_norm": 0.21754058305545665, "learning_rate": 1.2289689517743475e-05, "loss": 0.0478, "step": 3917 }, { "epoch": 2.039562727745966, "grad_norm": 0.2471236978327747, "learning_rate": 1.2277592829356593e-05, "loss": 0.0539, "step": 3918 }, { "epoch": 2.0400832899531496, "grad_norm": 0.23244298598394517, "learning_rate": 1.2265500159297935e-05, "loss": 0.0514, "step": 3919 }, { "epoch": 2.0406038521603334, "grad_norm": 0.2434195925375697, "learning_rate": 1.2253411511386938e-05, "loss": 0.0529, "step": 3920 }, { "epoch": 2.041124414367517, "grad_norm": 0.23277356231448715, "learning_rate": 1.2241326889441763e-05, "loss": 0.0509, "step": 3921 }, { "epoch": 2.041644976574701, "grad_norm": 0.2326730216527539, "learning_rate": 1.2229246297279302e-05, "loss": 0.0499, "step": 3922 }, { "epoch": 2.0421655387818842, "grad_norm": 0.22920084540866484, "learning_rate": 1.2217169738715162e-05, "loss": 0.052, "step": 3923 }, { "epoch": 2.042686100989068, "grad_norm": 0.2283017342208316, "learning_rate": 1.2205097217563668e-05, "loss": 0.0503, "step": 3924 }, { "epoch": 2.0432066631962518, "grad_norm": 0.2316172935733046, "learning_rate": 1.2193028737637897e-05, "loss": 0.0519, "step": 3925 }, { "epoch": 2.0437272254034355, "grad_norm": 0.2165080484448344, "learning_rate": 1.2180964302749637e-05, "loss": 0.0483, "step": 3926 }, { "epoch": 2.0442477876106193, "grad_norm": 0.21978990050785774, "learning_rate": 1.2168903916709404e-05, "loss": 0.0499, "step": 3927 }, { "epoch": 2.044768349817803, "grad_norm": 0.24452756991961144, "learning_rate": 1.2156847583326414e-05, "loss": 0.0532, "step": 3928 }, { "epoch": 2.045288912024987, "grad_norm": 0.21604477471728578, "learning_rate": 1.2144795306408626e-05, "loss": 0.0484, "step": 3929 }, { "epoch": 2.0458094742321706, "grad_norm": 0.22490194735030902, "learning_rate": 1.213274708976271e-05, "loss": 0.0507, "step": 3930 }, { "epoch": 2.0463300364393544, "grad_norm": 0.23117597236920664, "learning_rate": 1.2120702937194061e-05, "loss": 0.0499, "step": 3931 }, { "epoch": 2.046850598646538, "grad_norm": 0.2302153815021224, "learning_rate": 1.2108662852506778e-05, "loss": 0.0514, "step": 3932 }, { "epoch": 2.047371160853722, "grad_norm": 0.22991731459055237, "learning_rate": 1.2096626839503666e-05, "loss": 0.0504, "step": 3933 }, { "epoch": 2.0478917230609057, "grad_norm": 0.2336931030300491, "learning_rate": 1.2084594901986271e-05, "loss": 0.051, "step": 3934 }, { "epoch": 2.0484122852680895, "grad_norm": 0.23611354689827643, "learning_rate": 1.2072567043754837e-05, "loss": 0.0505, "step": 3935 }, { "epoch": 2.0489328474752733, "grad_norm": 0.24039532523121837, "learning_rate": 1.2060543268608329e-05, "loss": 0.0533, "step": 3936 }, { "epoch": 2.049453409682457, "grad_norm": 0.23726727147966595, "learning_rate": 1.2048523580344398e-05, "loss": 0.0507, "step": 3937 }, { "epoch": 2.049973971889641, "grad_norm": 0.2385742302487926, "learning_rate": 1.2036507982759431e-05, "loss": 0.0526, "step": 3938 }, { "epoch": 2.0504945340968246, "grad_norm": 0.22713140069481194, "learning_rate": 1.2024496479648514e-05, "loss": 0.0507, "step": 3939 }, { "epoch": 2.0510150963040084, "grad_norm": 0.22422405740318607, "learning_rate": 1.2012489074805444e-05, "loss": 0.049, "step": 3940 }, { "epoch": 2.051535658511192, "grad_norm": 0.23105843716868302, "learning_rate": 1.200048577202271e-05, "loss": 0.0529, "step": 3941 }, { "epoch": 2.052056220718376, "grad_norm": 0.23528988007664547, "learning_rate": 1.1988486575091507e-05, "loss": 0.0533, "step": 3942 }, { "epoch": 2.0525767829255597, "grad_norm": 0.23894532699169194, "learning_rate": 1.1976491487801748e-05, "loss": 0.0524, "step": 3943 }, { "epoch": 2.0530973451327434, "grad_norm": 0.23433351722426457, "learning_rate": 1.1964500513942034e-05, "loss": 0.0503, "step": 3944 }, { "epoch": 2.053617907339927, "grad_norm": 0.2429361501735258, "learning_rate": 1.1952513657299691e-05, "loss": 0.0525, "step": 3945 }, { "epoch": 2.054138469547111, "grad_norm": 0.2357366347477472, "learning_rate": 1.1940530921660703e-05, "loss": 0.0508, "step": 3946 }, { "epoch": 2.0546590317542948, "grad_norm": 0.22479006903552398, "learning_rate": 1.1928552310809785e-05, "loss": 0.0499, "step": 3947 }, { "epoch": 2.0551795939614785, "grad_norm": 0.22971193296557763, "learning_rate": 1.191657782853034e-05, "loss": 0.0505, "step": 3948 }, { "epoch": 2.0557001561686623, "grad_norm": 0.2251516109143194, "learning_rate": 1.1904607478604476e-05, "loss": 0.0508, "step": 3949 }, { "epoch": 2.056220718375846, "grad_norm": 0.21994611285526833, "learning_rate": 1.1892641264812978e-05, "loss": 0.0491, "step": 3950 }, { "epoch": 2.05674128058303, "grad_norm": 0.23498529308957097, "learning_rate": 1.1880679190935323e-05, "loss": 0.0505, "step": 3951 }, { "epoch": 2.0572618427902136, "grad_norm": 0.24181869335547906, "learning_rate": 1.1868721260749699e-05, "loss": 0.0506, "step": 3952 }, { "epoch": 2.0577824049973974, "grad_norm": 0.251727766194006, "learning_rate": 1.1856767478032979e-05, "loss": 0.0524, "step": 3953 }, { "epoch": 2.0583029672045807, "grad_norm": 0.22110211479371478, "learning_rate": 1.1844817846560732e-05, "loss": 0.0489, "step": 3954 }, { "epoch": 2.0588235294117645, "grad_norm": 0.25357982403761625, "learning_rate": 1.183287237010719e-05, "loss": 0.0524, "step": 3955 }, { "epoch": 2.0593440916189483, "grad_norm": 0.2435984952058527, "learning_rate": 1.1820931052445297e-05, "loss": 0.053, "step": 3956 }, { "epoch": 2.059864653826132, "grad_norm": 0.23320265572221055, "learning_rate": 1.180899389734668e-05, "loss": 0.0494, "step": 3957 }, { "epoch": 2.060385216033316, "grad_norm": 0.23298375809694774, "learning_rate": 1.1797060908581656e-05, "loss": 0.052, "step": 3958 }, { "epoch": 2.0609057782404996, "grad_norm": 0.24071234192220964, "learning_rate": 1.1785132089919208e-05, "loss": 0.0513, "step": 3959 }, { "epoch": 2.0614263404476834, "grad_norm": 0.21300744847374067, "learning_rate": 1.1773207445127005e-05, "loss": 0.0488, "step": 3960 }, { "epoch": 2.061946902654867, "grad_norm": 0.22968959003299735, "learning_rate": 1.1761286977971412e-05, "loss": 0.0515, "step": 3961 }, { "epoch": 2.062467464862051, "grad_norm": 0.2293890367687117, "learning_rate": 1.1749370692217465e-05, "loss": 0.0525, "step": 3962 }, { "epoch": 2.0629880270692347, "grad_norm": 0.22697976848422804, "learning_rate": 1.1737458591628897e-05, "loss": 0.053, "step": 3963 }, { "epoch": 2.0635085892764184, "grad_norm": 0.23070259973942472, "learning_rate": 1.1725550679968084e-05, "loss": 0.0515, "step": 3964 }, { "epoch": 2.064029151483602, "grad_norm": 0.21962051644788558, "learning_rate": 1.1713646960996102e-05, "loss": 0.0509, "step": 3965 }, { "epoch": 2.064549713690786, "grad_norm": 0.23542235445056928, "learning_rate": 1.1701747438472704e-05, "loss": 0.0513, "step": 3966 }, { "epoch": 2.0650702758979698, "grad_norm": 0.2249101062878514, "learning_rate": 1.1689852116156313e-05, "loss": 0.051, "step": 3967 }, { "epoch": 2.0655908381051535, "grad_norm": 0.22365269003898253, "learning_rate": 1.1677960997804047e-05, "loss": 0.0519, "step": 3968 }, { "epoch": 2.0661114003123373, "grad_norm": 0.23393573206151985, "learning_rate": 1.1666074087171627e-05, "loss": 0.0505, "step": 3969 }, { "epoch": 2.066631962519521, "grad_norm": 0.22998763268787198, "learning_rate": 1.1654191388013521e-05, "loss": 0.0484, "step": 3970 }, { "epoch": 2.067152524726705, "grad_norm": 0.2439217995498246, "learning_rate": 1.1642312904082835e-05, "loss": 0.0517, "step": 3971 }, { "epoch": 2.0676730869338886, "grad_norm": 0.24085294418610648, "learning_rate": 1.163043863913135e-05, "loss": 0.0538, "step": 3972 }, { "epoch": 2.0681936491410724, "grad_norm": 0.22810652141332463, "learning_rate": 1.1618568596909512e-05, "loss": 0.0488, "step": 3973 }, { "epoch": 2.068714211348256, "grad_norm": 0.23657856963046003, "learning_rate": 1.1606702781166423e-05, "loss": 0.0501, "step": 3974 }, { "epoch": 2.06923477355544, "grad_norm": 0.23158656572125352, "learning_rate": 1.1594841195649866e-05, "loss": 0.0516, "step": 3975 }, { "epoch": 2.0697553357626237, "grad_norm": 0.23564568907963002, "learning_rate": 1.1582983844106282e-05, "loss": 0.0523, "step": 3976 }, { "epoch": 2.0702758979698075, "grad_norm": 0.21940900341049915, "learning_rate": 1.1571130730280786e-05, "loss": 0.0505, "step": 3977 }, { "epoch": 2.0707964601769913, "grad_norm": 0.23826470500495703, "learning_rate": 1.1559281857917125e-05, "loss": 0.0502, "step": 3978 }, { "epoch": 2.071317022384175, "grad_norm": 0.22049292585636585, "learning_rate": 1.1547437230757726e-05, "loss": 0.0504, "step": 3979 }, { "epoch": 2.071837584591359, "grad_norm": 0.23925268301002867, "learning_rate": 1.1535596852543675e-05, "loss": 0.0528, "step": 3980 }, { "epoch": 2.0723581467985426, "grad_norm": 0.22626393109224524, "learning_rate": 1.1523760727014721e-05, "loss": 0.0521, "step": 3981 }, { "epoch": 2.0728787090057263, "grad_norm": 0.23358296619905644, "learning_rate": 1.1511928857909266e-05, "loss": 0.0521, "step": 3982 }, { "epoch": 2.07339927121291, "grad_norm": 0.22578239853084178, "learning_rate": 1.1500101248964348e-05, "loss": 0.049, "step": 3983 }, { "epoch": 2.073919833420094, "grad_norm": 0.22302089050267151, "learning_rate": 1.1488277903915689e-05, "loss": 0.0496, "step": 3984 }, { "epoch": 2.0744403956272777, "grad_norm": 0.22645035221882673, "learning_rate": 1.147645882649765e-05, "loss": 0.0511, "step": 3985 }, { "epoch": 2.0749609578344614, "grad_norm": 0.23257285368266278, "learning_rate": 1.1464644020443253e-05, "loss": 0.0498, "step": 3986 }, { "epoch": 2.0754815200416448, "grad_norm": 0.23394643638643065, "learning_rate": 1.1452833489484155e-05, "loss": 0.0508, "step": 3987 }, { "epoch": 2.0760020822488285, "grad_norm": 0.2302396225330739, "learning_rate": 1.1441027237350663e-05, "loss": 0.0501, "step": 3988 }, { "epoch": 2.0765226444560123, "grad_norm": 0.2364153136455663, "learning_rate": 1.142922526777175e-05, "loss": 0.0532, "step": 3989 }, { "epoch": 2.077043206663196, "grad_norm": 0.24046454093029992, "learning_rate": 1.1417427584475027e-05, "loss": 0.0526, "step": 3990 }, { "epoch": 2.07756376887038, "grad_norm": 0.23257494130614306, "learning_rate": 1.1405634191186759e-05, "loss": 0.0515, "step": 3991 }, { "epoch": 2.0780843310775636, "grad_norm": 0.2327035472109291, "learning_rate": 1.1393845091631833e-05, "loss": 0.0497, "step": 3992 }, { "epoch": 2.0786048932847474, "grad_norm": 0.23438697636486627, "learning_rate": 1.1382060289533804e-05, "loss": 0.0507, "step": 3993 }, { "epoch": 2.079125455491931, "grad_norm": 0.23128247990194226, "learning_rate": 1.1370279788614856e-05, "loss": 0.0508, "step": 3994 }, { "epoch": 2.079646017699115, "grad_norm": 0.23693622205792916, "learning_rate": 1.1358503592595837e-05, "loss": 0.0508, "step": 3995 }, { "epoch": 2.0801665799062987, "grad_norm": 0.23110049467220117, "learning_rate": 1.1346731705196204e-05, "loss": 0.0501, "step": 3996 }, { "epoch": 2.0806871421134825, "grad_norm": 0.24306539756134846, "learning_rate": 1.1334964130134055e-05, "loss": 0.0538, "step": 3997 }, { "epoch": 2.0812077043206663, "grad_norm": 0.219892553239681, "learning_rate": 1.132320087112615e-05, "loss": 0.0478, "step": 3998 }, { "epoch": 2.08172826652785, "grad_norm": 0.24126403657509093, "learning_rate": 1.1311441931887873e-05, "loss": 0.0519, "step": 3999 }, { "epoch": 2.082248828735034, "grad_norm": 0.2248359123191256, "learning_rate": 1.1299687316133256e-05, "loss": 0.0497, "step": 4000 }, { "epoch": 2.0827693909422176, "grad_norm": 0.24495968302747279, "learning_rate": 1.1287937027574933e-05, "loss": 0.0536, "step": 4001 }, { "epoch": 2.0832899531494014, "grad_norm": 0.2280996896351997, "learning_rate": 1.1276191069924197e-05, "loss": 0.0516, "step": 4002 }, { "epoch": 2.083810515356585, "grad_norm": 0.22983408181704557, "learning_rate": 1.1264449446890975e-05, "loss": 0.0497, "step": 4003 }, { "epoch": 2.084331077563769, "grad_norm": 0.22345330078372413, "learning_rate": 1.1252712162183825e-05, "loss": 0.0481, "step": 4004 }, { "epoch": 2.0848516397709527, "grad_norm": 0.22929949981841313, "learning_rate": 1.1240979219509917e-05, "loss": 0.0508, "step": 4005 }, { "epoch": 2.0853722019781364, "grad_norm": 0.22837492034567583, "learning_rate": 1.1229250622575052e-05, "loss": 0.0494, "step": 4006 }, { "epoch": 2.08589276418532, "grad_norm": 0.2343978470902539, "learning_rate": 1.1217526375083675e-05, "loss": 0.0499, "step": 4007 }, { "epoch": 2.086413326392504, "grad_norm": 0.2336189431913341, "learning_rate": 1.120580648073885e-05, "loss": 0.0516, "step": 4008 }, { "epoch": 2.0869338885996878, "grad_norm": 0.23282836656117412, "learning_rate": 1.1194090943242278e-05, "loss": 0.0503, "step": 4009 }, { "epoch": 2.0874544508068715, "grad_norm": 0.24746886009644017, "learning_rate": 1.1182379766294244e-05, "loss": 0.0522, "step": 4010 }, { "epoch": 2.0879750130140553, "grad_norm": 0.23120603188037314, "learning_rate": 1.1170672953593696e-05, "loss": 0.0501, "step": 4011 }, { "epoch": 2.088495575221239, "grad_norm": 0.23047710533447974, "learning_rate": 1.1158970508838193e-05, "loss": 0.0482, "step": 4012 }, { "epoch": 2.089016137428423, "grad_norm": 0.2524446742959235, "learning_rate": 1.1147272435723918e-05, "loss": 0.05, "step": 4013 }, { "epoch": 2.0895366996356066, "grad_norm": 0.25459941682903553, "learning_rate": 1.113557873794566e-05, "loss": 0.0521, "step": 4014 }, { "epoch": 2.0900572618427904, "grad_norm": 0.22652476025578092, "learning_rate": 1.1123889419196821e-05, "loss": 0.0514, "step": 4015 }, { "epoch": 2.090577824049974, "grad_norm": 0.2288103168103935, "learning_rate": 1.1112204483169439e-05, "loss": 0.0506, "step": 4016 }, { "epoch": 2.091098386257158, "grad_norm": 0.24150820992396882, "learning_rate": 1.1100523933554166e-05, "loss": 0.05, "step": 4017 }, { "epoch": 2.0916189484643413, "grad_norm": 0.2322103337709988, "learning_rate": 1.108884777404027e-05, "loss": 0.0507, "step": 4018 }, { "epoch": 2.092139510671525, "grad_norm": 0.22947198807908026, "learning_rate": 1.1077176008315606e-05, "loss": 0.0509, "step": 4019 }, { "epoch": 2.092660072878709, "grad_norm": 0.23943146290614956, "learning_rate": 1.1065508640066672e-05, "loss": 0.0505, "step": 4020 }, { "epoch": 2.0931806350858926, "grad_norm": 0.2487064925589514, "learning_rate": 1.1053845672978567e-05, "loss": 0.0535, "step": 4021 }, { "epoch": 2.0937011972930764, "grad_norm": 0.23593594385346, "learning_rate": 1.1042187110735e-05, "loss": 0.0503, "step": 4022 }, { "epoch": 2.09422175950026, "grad_norm": 0.24229015214508823, "learning_rate": 1.1030532957018288e-05, "loss": 0.0512, "step": 4023 }, { "epoch": 2.094742321707444, "grad_norm": 0.2257929833420632, "learning_rate": 1.1018883215509343e-05, "loss": 0.0485, "step": 4024 }, { "epoch": 2.0952628839146277, "grad_norm": 0.23317922327013857, "learning_rate": 1.10072378898877e-05, "loss": 0.0496, "step": 4025 }, { "epoch": 2.0957834461218114, "grad_norm": 0.2375777735308176, "learning_rate": 1.0995596983831502e-05, "loss": 0.0512, "step": 4026 }, { "epoch": 2.096304008328995, "grad_norm": 0.23856279889569232, "learning_rate": 1.0983960501017492e-05, "loss": 0.0495, "step": 4027 }, { "epoch": 2.096824570536179, "grad_norm": 0.23551216040588757, "learning_rate": 1.0972328445121e-05, "loss": 0.0495, "step": 4028 }, { "epoch": 2.0973451327433628, "grad_norm": 0.24501438351869415, "learning_rate": 1.0960700819815973e-05, "loss": 0.0491, "step": 4029 }, { "epoch": 2.0978656949505465, "grad_norm": 0.23782057609972138, "learning_rate": 1.0949077628774961e-05, "loss": 0.0488, "step": 4030 }, { "epoch": 2.0983862571577303, "grad_norm": 0.23261361096280053, "learning_rate": 1.0937458875669119e-05, "loss": 0.0513, "step": 4031 }, { "epoch": 2.098906819364914, "grad_norm": 0.24475133620833012, "learning_rate": 1.0925844564168175e-05, "loss": 0.05, "step": 4032 }, { "epoch": 2.099427381572098, "grad_norm": 0.28758926057418177, "learning_rate": 1.0914234697940465e-05, "loss": 0.0496, "step": 4033 }, { "epoch": 2.0999479437792816, "grad_norm": 0.22436454655048274, "learning_rate": 1.0902629280652931e-05, "loss": 0.0506, "step": 4034 }, { "epoch": 2.1004685059864654, "grad_norm": 0.234024748448541, "learning_rate": 1.0891028315971105e-05, "loss": 0.0485, "step": 4035 }, { "epoch": 2.100989068193649, "grad_norm": 0.22684434792800956, "learning_rate": 1.0879431807559116e-05, "loss": 0.0463, "step": 4036 }, { "epoch": 2.101509630400833, "grad_norm": 0.23052518036453548, "learning_rate": 1.0867839759079682e-05, "loss": 0.0489, "step": 4037 }, { "epoch": 2.1020301926080167, "grad_norm": 0.23554676365556773, "learning_rate": 1.0856252174194096e-05, "loss": 0.0501, "step": 4038 }, { "epoch": 2.1025507548152005, "grad_norm": 0.2364179349009896, "learning_rate": 1.084466905656227e-05, "loss": 0.0486, "step": 4039 }, { "epoch": 2.1030713170223843, "grad_norm": 0.2315928309233159, "learning_rate": 1.0833090409842694e-05, "loss": 0.0485, "step": 4040 }, { "epoch": 2.103591879229568, "grad_norm": 0.24327580350563419, "learning_rate": 1.0821516237692434e-05, "loss": 0.0524, "step": 4041 }, { "epoch": 2.104112441436752, "grad_norm": 0.22953852222832158, "learning_rate": 1.080994654376716e-05, "loss": 0.0475, "step": 4042 }, { "epoch": 2.1046330036439356, "grad_norm": 0.23269426088814224, "learning_rate": 1.0798381331721109e-05, "loss": 0.0491, "step": 4043 }, { "epoch": 2.1051535658511193, "grad_norm": 0.2393902668332969, "learning_rate": 1.0786820605207117e-05, "loss": 0.0496, "step": 4044 }, { "epoch": 2.105674128058303, "grad_norm": 0.23157080328354862, "learning_rate": 1.0775264367876605e-05, "loss": 0.0509, "step": 4045 }, { "epoch": 2.106194690265487, "grad_norm": 0.24639805362000278, "learning_rate": 1.0763712623379577e-05, "loss": 0.0517, "step": 4046 }, { "epoch": 2.1067152524726707, "grad_norm": 0.25130598484708905, "learning_rate": 1.0752165375364593e-05, "loss": 0.0528, "step": 4047 }, { "epoch": 2.1072358146798544, "grad_norm": 0.22906779523998783, "learning_rate": 1.0740622627478821e-05, "loss": 0.0505, "step": 4048 }, { "epoch": 2.107756376887038, "grad_norm": 0.22056035942244043, "learning_rate": 1.0729084383368005e-05, "loss": 0.0489, "step": 4049 }, { "epoch": 2.108276939094222, "grad_norm": 0.24068696987121577, "learning_rate": 1.0717550646676443e-05, "loss": 0.0508, "step": 4050 }, { "epoch": 2.1087975013014053, "grad_norm": 0.22714859742711338, "learning_rate": 1.0706021421047047e-05, "loss": 0.0506, "step": 4051 }, { "epoch": 2.109318063508589, "grad_norm": 0.22747067082854852, "learning_rate": 1.0694496710121257e-05, "loss": 0.0491, "step": 4052 }, { "epoch": 2.109838625715773, "grad_norm": 0.23375027504810572, "learning_rate": 1.0682976517539128e-05, "loss": 0.05, "step": 4053 }, { "epoch": 2.1103591879229566, "grad_norm": 0.22132944162068763, "learning_rate": 1.0671460846939274e-05, "loss": 0.0482, "step": 4054 }, { "epoch": 2.1108797501301404, "grad_norm": 0.22882347387684723, "learning_rate": 1.0659949701958885e-05, "loss": 0.0499, "step": 4055 }, { "epoch": 2.111400312337324, "grad_norm": 0.2312263646586655, "learning_rate": 1.0648443086233697e-05, "loss": 0.0511, "step": 4056 }, { "epoch": 2.111920874544508, "grad_norm": 0.24220776541349695, "learning_rate": 1.063694100339805e-05, "loss": 0.0516, "step": 4057 }, { "epoch": 2.1124414367516917, "grad_norm": 0.22625059430843475, "learning_rate": 1.0625443457084841e-05, "loss": 0.0494, "step": 4058 }, { "epoch": 2.1129619989588755, "grad_norm": 0.23299358120448602, "learning_rate": 1.0613950450925513e-05, "loss": 0.0495, "step": 4059 }, { "epoch": 2.1134825611660593, "grad_norm": 0.23342314525886373, "learning_rate": 1.060246198855011e-05, "loss": 0.0494, "step": 4060 }, { "epoch": 2.114003123373243, "grad_norm": 0.2421280557624441, "learning_rate": 1.059097807358721e-05, "loss": 0.0503, "step": 4061 }, { "epoch": 2.114523685580427, "grad_norm": 0.22689249003133796, "learning_rate": 1.0579498709663968e-05, "loss": 0.0479, "step": 4062 }, { "epoch": 2.1150442477876106, "grad_norm": 0.2392848913272859, "learning_rate": 1.0568023900406107e-05, "loss": 0.052, "step": 4063 }, { "epoch": 2.1155648099947943, "grad_norm": 0.25646230307725143, "learning_rate": 1.0556553649437914e-05, "loss": 0.0513, "step": 4064 }, { "epoch": 2.116085372201978, "grad_norm": 0.2416279360872608, "learning_rate": 1.0545087960382211e-05, "loss": 0.0515, "step": 4065 }, { "epoch": 2.116605934409162, "grad_norm": 0.2231969043709799, "learning_rate": 1.05336268368604e-05, "loss": 0.0491, "step": 4066 }, { "epoch": 2.1171264966163457, "grad_norm": 0.2425556380715717, "learning_rate": 1.0522170282492444e-05, "loss": 0.0495, "step": 4067 }, { "epoch": 2.1176470588235294, "grad_norm": 0.2383579552897591, "learning_rate": 1.051071830089686e-05, "loss": 0.0517, "step": 4068 }, { "epoch": 2.118167621030713, "grad_norm": 0.24083746604101072, "learning_rate": 1.049927089569071e-05, "loss": 0.0514, "step": 4069 }, { "epoch": 2.118688183237897, "grad_norm": 0.23603705033415373, "learning_rate": 1.048782807048961e-05, "loss": 0.0509, "step": 4070 }, { "epoch": 2.1192087454450808, "grad_norm": 0.22699841700025328, "learning_rate": 1.0476389828907743e-05, "loss": 0.0507, "step": 4071 }, { "epoch": 2.1197293076522645, "grad_norm": 0.2384365332964614, "learning_rate": 1.046495617455784e-05, "loss": 0.0506, "step": 4072 }, { "epoch": 2.1202498698594483, "grad_norm": 0.23686492769560208, "learning_rate": 1.0453527111051184e-05, "loss": 0.0499, "step": 4073 }, { "epoch": 2.120770432066632, "grad_norm": 0.23537377480311067, "learning_rate": 1.0442102641997594e-05, "loss": 0.0495, "step": 4074 }, { "epoch": 2.121290994273816, "grad_norm": 0.23207116343425835, "learning_rate": 1.0430682771005456e-05, "loss": 0.0525, "step": 4075 }, { "epoch": 2.1218115564809996, "grad_norm": 0.23332099551170052, "learning_rate": 1.0419267501681699e-05, "loss": 0.0498, "step": 4076 }, { "epoch": 2.1223321186881834, "grad_norm": 0.23684848540512882, "learning_rate": 1.0407856837631798e-05, "loss": 0.0521, "step": 4077 }, { "epoch": 2.122852680895367, "grad_norm": 0.22933556276740186, "learning_rate": 1.0396450782459771e-05, "loss": 0.0497, "step": 4078 }, { "epoch": 2.123373243102551, "grad_norm": 0.2288593333175558, "learning_rate": 1.0385049339768168e-05, "loss": 0.0506, "step": 4079 }, { "epoch": 2.1238938053097347, "grad_norm": 0.23192903613988014, "learning_rate": 1.0373652513158105e-05, "loss": 0.0513, "step": 4080 }, { "epoch": 2.1244143675169185, "grad_norm": 0.23148424092687478, "learning_rate": 1.0362260306229229e-05, "loss": 0.0517, "step": 4081 }, { "epoch": 2.124934929724102, "grad_norm": 0.23760449025559408, "learning_rate": 1.0350872722579741e-05, "loss": 0.0506, "step": 4082 }, { "epoch": 2.1254554919312856, "grad_norm": 0.23072320281257291, "learning_rate": 1.0339489765806345e-05, "loss": 0.0519, "step": 4083 }, { "epoch": 2.1259760541384694, "grad_norm": 0.2273989358106142, "learning_rate": 1.032811143950433e-05, "loss": 0.0492, "step": 4084 }, { "epoch": 2.126496616345653, "grad_norm": 0.2286725964642402, "learning_rate": 1.0316737747267486e-05, "loss": 0.0508, "step": 4085 }, { "epoch": 2.127017178552837, "grad_norm": 0.2403403105915755, "learning_rate": 1.0305368692688174e-05, "loss": 0.0498, "step": 4086 }, { "epoch": 2.1275377407600207, "grad_norm": 0.22835020945257542, "learning_rate": 1.0294004279357259e-05, "loss": 0.0494, "step": 4087 }, { "epoch": 2.1280583029672044, "grad_norm": 0.23886208707779596, "learning_rate": 1.028264451086414e-05, "loss": 0.05, "step": 4088 }, { "epoch": 2.128578865174388, "grad_norm": 0.23621522502410017, "learning_rate": 1.0271289390796771e-05, "loss": 0.0495, "step": 4089 }, { "epoch": 2.129099427381572, "grad_norm": 0.23986277451442098, "learning_rate": 1.0259938922741627e-05, "loss": 0.0505, "step": 4090 }, { "epoch": 2.1296199895887558, "grad_norm": 0.2306221065006331, "learning_rate": 1.0248593110283725e-05, "loss": 0.0503, "step": 4091 }, { "epoch": 2.1301405517959395, "grad_norm": 0.23069214444668543, "learning_rate": 1.023725195700658e-05, "loss": 0.0497, "step": 4092 }, { "epoch": 2.1306611140031233, "grad_norm": 0.22731226780333583, "learning_rate": 1.0225915466492267e-05, "loss": 0.0492, "step": 4093 }, { "epoch": 2.131181676210307, "grad_norm": 0.23439529582276133, "learning_rate": 1.0214583642321376e-05, "loss": 0.0499, "step": 4094 }, { "epoch": 2.131702238417491, "grad_norm": 0.2390100567918747, "learning_rate": 1.0203256488073034e-05, "loss": 0.0498, "step": 4095 }, { "epoch": 2.1322228006246746, "grad_norm": 0.22979409308662924, "learning_rate": 1.0191934007324874e-05, "loss": 0.0503, "step": 4096 }, { "epoch": 2.1327433628318584, "grad_norm": 0.23224468806672807, "learning_rate": 1.0180616203653054e-05, "loss": 0.0498, "step": 4097 }, { "epoch": 2.133263925039042, "grad_norm": 0.2310744812755954, "learning_rate": 1.0169303080632272e-05, "loss": 0.05, "step": 4098 }, { "epoch": 2.133784487246226, "grad_norm": 0.2266485347614057, "learning_rate": 1.0157994641835736e-05, "loss": 0.05, "step": 4099 }, { "epoch": 2.1343050494534097, "grad_norm": 0.231117064885941, "learning_rate": 1.014669089083518e-05, "loss": 0.0484, "step": 4100 }, { "epoch": 2.1348256116605935, "grad_norm": 0.23246610511746577, "learning_rate": 1.0135391831200866e-05, "loss": 0.0509, "step": 4101 }, { "epoch": 2.1353461738677773, "grad_norm": 0.24212771834516583, "learning_rate": 1.012409746650154e-05, "loss": 0.053, "step": 4102 }, { "epoch": 2.135866736074961, "grad_norm": 0.23361341416282064, "learning_rate": 1.01128078003045e-05, "loss": 0.0482, "step": 4103 }, { "epoch": 2.136387298282145, "grad_norm": 0.23836572088407046, "learning_rate": 1.0101522836175555e-05, "loss": 0.0504, "step": 4104 }, { "epoch": 2.1369078604893286, "grad_norm": 0.2259664441526069, "learning_rate": 1.0090242577679005e-05, "loss": 0.0495, "step": 4105 }, { "epoch": 2.1374284226965123, "grad_norm": 0.22082689150726414, "learning_rate": 1.0078967028377697e-05, "loss": 0.049, "step": 4106 }, { "epoch": 2.137948984903696, "grad_norm": 0.2342587477443236, "learning_rate": 1.006769619183296e-05, "loss": 0.0502, "step": 4107 }, { "epoch": 2.13846954711088, "grad_norm": 0.22943741247932514, "learning_rate": 1.0056430071604653e-05, "loss": 0.0487, "step": 4108 }, { "epoch": 2.1389901093180637, "grad_norm": 0.22043167610871012, "learning_rate": 1.0045168671251143e-05, "loss": 0.049, "step": 4109 }, { "epoch": 2.1395106715252474, "grad_norm": 0.2405489283322427, "learning_rate": 1.0033911994329314e-05, "loss": 0.051, "step": 4110 }, { "epoch": 2.140031233732431, "grad_norm": 0.234193224915324, "learning_rate": 1.0022660044394534e-05, "loss": 0.0483, "step": 4111 }, { "epoch": 2.140551795939615, "grad_norm": 0.22465165622300198, "learning_rate": 1.0011412825000694e-05, "loss": 0.0478, "step": 4112 }, { "epoch": 2.1410723581467987, "grad_norm": 0.23278809432046188, "learning_rate": 1.00001703397002e-05, "loss": 0.0479, "step": 4113 }, { "epoch": 2.1415929203539825, "grad_norm": 0.23156952891318974, "learning_rate": 9.988932592043937e-06, "loss": 0.048, "step": 4114 }, { "epoch": 2.1421134825611663, "grad_norm": 0.24878058256395674, "learning_rate": 9.977699585581324e-06, "loss": 0.0529, "step": 4115 }, { "epoch": 2.1426340447683496, "grad_norm": 0.2401759125829604, "learning_rate": 9.966471323860251e-06, "loss": 0.0507, "step": 4116 }, { "epoch": 2.1431546069755334, "grad_norm": 0.2318433491416454, "learning_rate": 9.955247810427137e-06, "loss": 0.0506, "step": 4117 }, { "epoch": 2.143675169182717, "grad_norm": 0.23626591123934843, "learning_rate": 9.944029048826887e-06, "loss": 0.0498, "step": 4118 }, { "epoch": 2.144195731389901, "grad_norm": 0.23359239052440017, "learning_rate": 9.932815042602913e-06, "loss": 0.053, "step": 4119 }, { "epoch": 2.1447162935970847, "grad_norm": 0.24007159621963173, "learning_rate": 9.921605795297109e-06, "loss": 0.0495, "step": 4120 }, { "epoch": 2.1452368558042685, "grad_norm": 0.23718809404737304, "learning_rate": 9.910401310449883e-06, "loss": 0.0497, "step": 4121 }, { "epoch": 2.1457574180114523, "grad_norm": 0.23382317156311533, "learning_rate": 9.899201591600138e-06, "loss": 0.0487, "step": 4122 }, { "epoch": 2.146277980218636, "grad_norm": 0.23007554978616346, "learning_rate": 9.888006642285255e-06, "loss": 0.0514, "step": 4123 }, { "epoch": 2.14679854242582, "grad_norm": 0.2360311167478857, "learning_rate": 9.876816466041133e-06, "loss": 0.0514, "step": 4124 }, { "epoch": 2.1473191046330036, "grad_norm": 0.23498144817531677, "learning_rate": 9.865631066402137e-06, "loss": 0.0528, "step": 4125 }, { "epoch": 2.1478396668401873, "grad_norm": 0.22215864699197535, "learning_rate": 9.854450446901143e-06, "loss": 0.0463, "step": 4126 }, { "epoch": 2.148360229047371, "grad_norm": 0.22531836871998293, "learning_rate": 9.843274611069509e-06, "loss": 0.0472, "step": 4127 }, { "epoch": 2.148880791254555, "grad_norm": 0.24034863053588068, "learning_rate": 9.832103562437096e-06, "loss": 0.0514, "step": 4128 }, { "epoch": 2.1494013534617387, "grad_norm": 0.2526527336235642, "learning_rate": 9.820937304532221e-06, "loss": 0.0511, "step": 4129 }, { "epoch": 2.1499219156689224, "grad_norm": 0.23965378370422113, "learning_rate": 9.809775840881718e-06, "loss": 0.0493, "step": 4130 }, { "epoch": 2.150442477876106, "grad_norm": 0.23284542516117132, "learning_rate": 9.798619175010907e-06, "loss": 0.0486, "step": 4131 }, { "epoch": 2.15096304008329, "grad_norm": 0.2517441778747228, "learning_rate": 9.787467310443562e-06, "loss": 0.0518, "step": 4132 }, { "epoch": 2.1514836022904738, "grad_norm": 0.24113620179268663, "learning_rate": 9.77632025070198e-06, "loss": 0.0487, "step": 4133 }, { "epoch": 2.1520041644976575, "grad_norm": 0.23483751096204794, "learning_rate": 9.765177999306904e-06, "loss": 0.05, "step": 4134 }, { "epoch": 2.1525247267048413, "grad_norm": 0.2372927103941721, "learning_rate": 9.754040559777583e-06, "loss": 0.0498, "step": 4135 }, { "epoch": 2.153045288912025, "grad_norm": 0.23825778506276546, "learning_rate": 9.742907935631737e-06, "loss": 0.049, "step": 4136 }, { "epoch": 2.153565851119209, "grad_norm": 0.24176693832701435, "learning_rate": 9.731780130385578e-06, "loss": 0.0497, "step": 4137 }, { "epoch": 2.1540864133263926, "grad_norm": 0.2335246650469084, "learning_rate": 9.720657147553769e-06, "loss": 0.05, "step": 4138 }, { "epoch": 2.1546069755335764, "grad_norm": 0.23861089006529668, "learning_rate": 9.709538990649472e-06, "loss": 0.0503, "step": 4139 }, { "epoch": 2.15512753774076, "grad_norm": 0.2291209928615605, "learning_rate": 9.698425663184324e-06, "loss": 0.0485, "step": 4140 }, { "epoch": 2.155648099947944, "grad_norm": 0.2342784317879674, "learning_rate": 9.68731716866842e-06, "loss": 0.049, "step": 4141 }, { "epoch": 2.1561686621551277, "grad_norm": 0.2320214212518064, "learning_rate": 9.676213510610352e-06, "loss": 0.0485, "step": 4142 }, { "epoch": 2.1566892243623115, "grad_norm": 0.2377587377352587, "learning_rate": 9.665114692517158e-06, "loss": 0.0523, "step": 4143 }, { "epoch": 2.1572097865694952, "grad_norm": 0.2348296445693297, "learning_rate": 9.654020717894366e-06, "loss": 0.0502, "step": 4144 }, { "epoch": 2.157730348776679, "grad_norm": 0.21794445970232668, "learning_rate": 9.642931590245973e-06, "loss": 0.0474, "step": 4145 }, { "epoch": 2.1582509109838623, "grad_norm": 0.2350475368286949, "learning_rate": 9.63184731307445e-06, "loss": 0.0502, "step": 4146 }, { "epoch": 2.158771473191046, "grad_norm": 0.23095612217802725, "learning_rate": 9.620767889880708e-06, "loss": 0.0483, "step": 4147 }, { "epoch": 2.15929203539823, "grad_norm": 0.21882801219665537, "learning_rate": 9.609693324164154e-06, "loss": 0.0476, "step": 4148 }, { "epoch": 2.1598125976054137, "grad_norm": 0.23959480333572203, "learning_rate": 9.59862361942266e-06, "loss": 0.0499, "step": 4149 }, { "epoch": 2.1603331598125974, "grad_norm": 0.2382618434451453, "learning_rate": 9.587558779152536e-06, "loss": 0.0483, "step": 4150 }, { "epoch": 2.160853722019781, "grad_norm": 0.2481562495202432, "learning_rate": 9.576498806848591e-06, "loss": 0.0515, "step": 4151 }, { "epoch": 2.161374284226965, "grad_norm": 0.23415717741022837, "learning_rate": 9.565443706004065e-06, "loss": 0.0506, "step": 4152 }, { "epoch": 2.1618948464341488, "grad_norm": 0.2310272602868718, "learning_rate": 9.554393480110677e-06, "loss": 0.0501, "step": 4153 }, { "epoch": 2.1624154086413325, "grad_norm": 0.21717437554416877, "learning_rate": 9.543348132658602e-06, "loss": 0.0471, "step": 4154 }, { "epoch": 2.1629359708485163, "grad_norm": 0.24030597109135618, "learning_rate": 9.532307667136493e-06, "loss": 0.0503, "step": 4155 }, { "epoch": 2.1634565330557, "grad_norm": 0.2271124234150207, "learning_rate": 9.521272087031414e-06, "loss": 0.0474, "step": 4156 }, { "epoch": 2.163977095262884, "grad_norm": 0.23172566333461286, "learning_rate": 9.510241395828926e-06, "loss": 0.0509, "step": 4157 }, { "epoch": 2.1644976574700676, "grad_norm": 0.23463565756110716, "learning_rate": 9.499215597013048e-06, "loss": 0.0519, "step": 4158 }, { "epoch": 2.1650182196772514, "grad_norm": 0.2269853064996004, "learning_rate": 9.488194694066219e-06, "loss": 0.0503, "step": 4159 }, { "epoch": 2.165538781884435, "grad_norm": 0.22448361065078423, "learning_rate": 9.47717869046937e-06, "loss": 0.0488, "step": 4160 }, { "epoch": 2.166059344091619, "grad_norm": 0.2298514916947985, "learning_rate": 9.466167589701855e-06, "loss": 0.0492, "step": 4161 }, { "epoch": 2.1665799062988027, "grad_norm": 0.2411385315001518, "learning_rate": 9.455161395241496e-06, "loss": 0.0514, "step": 4162 }, { "epoch": 2.1671004685059865, "grad_norm": 0.23344249520016908, "learning_rate": 9.444160110564562e-06, "loss": 0.0507, "step": 4163 }, { "epoch": 2.1676210307131702, "grad_norm": 0.22970482297994116, "learning_rate": 9.433163739145773e-06, "loss": 0.049, "step": 4164 }, { "epoch": 2.168141592920354, "grad_norm": 0.24137131622878738, "learning_rate": 9.422172284458303e-06, "loss": 0.0501, "step": 4165 }, { "epoch": 2.168662155127538, "grad_norm": 0.2391941048873966, "learning_rate": 9.411185749973744e-06, "loss": 0.0491, "step": 4166 }, { "epoch": 2.1691827173347216, "grad_norm": 0.22597120615731292, "learning_rate": 9.400204139162178e-06, "loss": 0.0478, "step": 4167 }, { "epoch": 2.1697032795419053, "grad_norm": 0.22407522799206378, "learning_rate": 9.389227455492083e-06, "loss": 0.0488, "step": 4168 }, { "epoch": 2.170223841749089, "grad_norm": 0.23341996865358303, "learning_rate": 9.378255702430425e-06, "loss": 0.0495, "step": 4169 }, { "epoch": 2.170744403956273, "grad_norm": 0.22756065098090528, "learning_rate": 9.367288883442596e-06, "loss": 0.0465, "step": 4170 }, { "epoch": 2.1712649661634567, "grad_norm": 0.24032557476752722, "learning_rate": 9.356327001992412e-06, "loss": 0.0514, "step": 4171 }, { "epoch": 2.1717855283706404, "grad_norm": 0.23279773656960784, "learning_rate": 9.345370061542158e-06, "loss": 0.049, "step": 4172 }, { "epoch": 2.172306090577824, "grad_norm": 0.227184511533179, "learning_rate": 9.334418065552538e-06, "loss": 0.048, "step": 4173 }, { "epoch": 2.172826652785008, "grad_norm": 0.2388318118994056, "learning_rate": 9.323471017482718e-06, "loss": 0.0492, "step": 4174 }, { "epoch": 2.1733472149921917, "grad_norm": 0.23862458637474987, "learning_rate": 9.312528920790265e-06, "loss": 0.0507, "step": 4175 }, { "epoch": 2.1738677771993755, "grad_norm": 0.2300694111237474, "learning_rate": 9.301591778931218e-06, "loss": 0.0496, "step": 4176 }, { "epoch": 2.1743883394065593, "grad_norm": 0.22487725179561208, "learning_rate": 9.290659595360018e-06, "loss": 0.0474, "step": 4177 }, { "epoch": 2.174908901613743, "grad_norm": 0.24227495997845988, "learning_rate": 9.27973237352957e-06, "loss": 0.0484, "step": 4178 }, { "epoch": 2.175429463820927, "grad_norm": 0.24971312513542707, "learning_rate": 9.268810116891205e-06, "loss": 0.0522, "step": 4179 }, { "epoch": 2.17595002602811, "grad_norm": 0.23240074321999316, "learning_rate": 9.257892828894663e-06, "loss": 0.0509, "step": 4180 }, { "epoch": 2.176470588235294, "grad_norm": 0.23411180744875743, "learning_rate": 9.24698051298814e-06, "loss": 0.0507, "step": 4181 }, { "epoch": 2.1769911504424777, "grad_norm": 0.2283915870645207, "learning_rate": 9.236073172618254e-06, "loss": 0.0496, "step": 4182 }, { "epoch": 2.1775117126496615, "grad_norm": 0.21862188186366638, "learning_rate": 9.225170811230058e-06, "loss": 0.0488, "step": 4183 }, { "epoch": 2.1780322748568453, "grad_norm": 0.25870559139681704, "learning_rate": 9.214273432267009e-06, "loss": 0.0491, "step": 4184 }, { "epoch": 2.178552837064029, "grad_norm": 0.2338660696928279, "learning_rate": 9.203381039171022e-06, "loss": 0.0488, "step": 4185 }, { "epoch": 2.179073399271213, "grad_norm": 0.23653181541138485, "learning_rate": 9.192493635382407e-06, "loss": 0.0482, "step": 4186 }, { "epoch": 2.1795939614783966, "grad_norm": 0.2475444397957008, "learning_rate": 9.181611224339917e-06, "loss": 0.051, "step": 4187 }, { "epoch": 2.1801145236855803, "grad_norm": 0.23144206027749797, "learning_rate": 9.170733809480738e-06, "loss": 0.0478, "step": 4188 }, { "epoch": 2.180635085892764, "grad_norm": 0.23938498072235687, "learning_rate": 9.159861394240444e-06, "loss": 0.0494, "step": 4189 }, { "epoch": 2.181155648099948, "grad_norm": 0.24401500298605283, "learning_rate": 9.148993982053058e-06, "loss": 0.05, "step": 4190 }, { "epoch": 2.1816762103071317, "grad_norm": 0.23262393333357667, "learning_rate": 9.138131576351014e-06, "loss": 0.0487, "step": 4191 }, { "epoch": 2.1821967725143154, "grad_norm": 0.23923195938864855, "learning_rate": 9.127274180565177e-06, "loss": 0.0497, "step": 4192 }, { "epoch": 2.182717334721499, "grad_norm": 0.2385711033015507, "learning_rate": 9.116421798124794e-06, "loss": 0.0508, "step": 4193 }, { "epoch": 2.183237896928683, "grad_norm": 0.2305410960253734, "learning_rate": 9.105574432457576e-06, "loss": 0.0504, "step": 4194 }, { "epoch": 2.1837584591358667, "grad_norm": 0.2315059122487158, "learning_rate": 9.094732086989608e-06, "loss": 0.0481, "step": 4195 }, { "epoch": 2.1842790213430505, "grad_norm": 0.2388138458267806, "learning_rate": 9.083894765145412e-06, "loss": 0.0495, "step": 4196 }, { "epoch": 2.1847995835502343, "grad_norm": 0.24338846982564916, "learning_rate": 9.073062470347928e-06, "loss": 0.0497, "step": 4197 }, { "epoch": 2.185320145757418, "grad_norm": 0.24700761283541492, "learning_rate": 9.062235206018488e-06, "loss": 0.0512, "step": 4198 }, { "epoch": 2.185840707964602, "grad_norm": 0.22202347104845527, "learning_rate": 9.051412975576849e-06, "loss": 0.0481, "step": 4199 }, { "epoch": 2.1863612701717856, "grad_norm": 0.23160644856981238, "learning_rate": 9.040595782441172e-06, "loss": 0.0495, "step": 4200 }, { "epoch": 2.1868818323789694, "grad_norm": 0.2192099194389428, "learning_rate": 9.029783630028044e-06, "loss": 0.0495, "step": 4201 }, { "epoch": 2.187402394586153, "grad_norm": 0.23563639581238982, "learning_rate": 9.018976521752426e-06, "loss": 0.0519, "step": 4202 }, { "epoch": 2.187922956793337, "grad_norm": 0.2229417962071288, "learning_rate": 9.008174461027724e-06, "loss": 0.0482, "step": 4203 }, { "epoch": 2.1884435190005207, "grad_norm": 0.23972449679471294, "learning_rate": 8.997377451265715e-06, "loss": 0.0513, "step": 4204 }, { "epoch": 2.1889640812077045, "grad_norm": 0.226247431389423, "learning_rate": 8.986585495876605e-06, "loss": 0.0493, "step": 4205 }, { "epoch": 2.1894846434148882, "grad_norm": 0.23325199021404394, "learning_rate": 8.975798598269002e-06, "loss": 0.0506, "step": 4206 }, { "epoch": 2.190005205622072, "grad_norm": 0.2245553471409382, "learning_rate": 8.965016761849898e-06, "loss": 0.0487, "step": 4207 }, { "epoch": 2.190525767829256, "grad_norm": 0.23199873840527224, "learning_rate": 8.954239990024704e-06, "loss": 0.0485, "step": 4208 }, { "epoch": 2.1910463300364396, "grad_norm": 0.22307074327270973, "learning_rate": 8.943468286197224e-06, "loss": 0.0477, "step": 4209 }, { "epoch": 2.191566892243623, "grad_norm": 0.23538404056571152, "learning_rate": 8.932701653769676e-06, "loss": 0.0492, "step": 4210 }, { "epoch": 2.1920874544508067, "grad_norm": 0.23621294854326094, "learning_rate": 8.921940096142645e-06, "loss": 0.0495, "step": 4211 }, { "epoch": 2.1926080166579904, "grad_norm": 0.2286930249072395, "learning_rate": 8.911183616715148e-06, "loss": 0.0481, "step": 4212 }, { "epoch": 2.193128578865174, "grad_norm": 0.2466558181780694, "learning_rate": 8.900432218884567e-06, "loss": 0.0509, "step": 4213 }, { "epoch": 2.193649141072358, "grad_norm": 0.2372365759392663, "learning_rate": 8.8896859060467e-06, "loss": 0.0497, "step": 4214 }, { "epoch": 2.1941697032795417, "grad_norm": 0.2319389953511392, "learning_rate": 8.878944681595742e-06, "loss": 0.0496, "step": 4215 }, { "epoch": 2.1946902654867255, "grad_norm": 0.23364662530422042, "learning_rate": 8.868208548924253e-06, "loss": 0.0491, "step": 4216 }, { "epoch": 2.1952108276939093, "grad_norm": 0.24276463476612706, "learning_rate": 8.857477511423215e-06, "loss": 0.05, "step": 4217 }, { "epoch": 2.195731389901093, "grad_norm": 0.23011307596727132, "learning_rate": 8.846751572481984e-06, "loss": 0.0477, "step": 4218 }, { "epoch": 2.196251952108277, "grad_norm": 0.2337315315374658, "learning_rate": 8.836030735488327e-06, "loss": 0.0489, "step": 4219 }, { "epoch": 2.1967725143154606, "grad_norm": 0.2325310579261457, "learning_rate": 8.825315003828358e-06, "loss": 0.0465, "step": 4220 }, { "epoch": 2.1972930765226444, "grad_norm": 0.22830561027317142, "learning_rate": 8.814604380886623e-06, "loss": 0.0476, "step": 4221 }, { "epoch": 2.197813638729828, "grad_norm": 0.24012212633480134, "learning_rate": 8.803898870046023e-06, "loss": 0.0488, "step": 4222 }, { "epoch": 2.198334200937012, "grad_norm": 0.23919713359665457, "learning_rate": 8.79319847468786e-06, "loss": 0.0489, "step": 4223 }, { "epoch": 2.1988547631441957, "grad_norm": 0.2506742677947288, "learning_rate": 8.782503198191828e-06, "loss": 0.0514, "step": 4224 }, { "epoch": 2.1993753253513795, "grad_norm": 0.23617636186386712, "learning_rate": 8.771813043935972e-06, "loss": 0.0494, "step": 4225 }, { "epoch": 2.1998958875585632, "grad_norm": 0.23474992131532096, "learning_rate": 8.761128015296754e-06, "loss": 0.0494, "step": 4226 }, { "epoch": 2.200416449765747, "grad_norm": 0.24128412011120356, "learning_rate": 8.750448115649001e-06, "loss": 0.0506, "step": 4227 }, { "epoch": 2.200937011972931, "grad_norm": 0.23586375626059694, "learning_rate": 8.739773348365928e-06, "loss": 0.05, "step": 4228 }, { "epoch": 2.2014575741801146, "grad_norm": 0.22292550967559244, "learning_rate": 8.729103716819112e-06, "loss": 0.0481, "step": 4229 }, { "epoch": 2.2019781363872983, "grad_norm": 0.23125396362101913, "learning_rate": 8.71843922437853e-06, "loss": 0.0473, "step": 4230 }, { "epoch": 2.202498698594482, "grad_norm": 0.22704085751147418, "learning_rate": 8.707779874412514e-06, "loss": 0.047, "step": 4231 }, { "epoch": 2.203019260801666, "grad_norm": 0.23357200148046692, "learning_rate": 8.697125670287787e-06, "loss": 0.0486, "step": 4232 }, { "epoch": 2.2035398230088497, "grad_norm": 0.24120365615695358, "learning_rate": 8.686476615369451e-06, "loss": 0.0507, "step": 4233 }, { "epoch": 2.2040603852160334, "grad_norm": 0.2440416527287281, "learning_rate": 8.67583271302096e-06, "loss": 0.0498, "step": 4234 }, { "epoch": 2.204580947423217, "grad_norm": 0.22550408972403665, "learning_rate": 8.665193966604157e-06, "loss": 0.0461, "step": 4235 }, { "epoch": 2.205101509630401, "grad_norm": 0.24657257357860465, "learning_rate": 8.654560379479257e-06, "loss": 0.0508, "step": 4236 }, { "epoch": 2.2056220718375847, "grad_norm": 0.2378711148974515, "learning_rate": 8.643931955004839e-06, "loss": 0.0504, "step": 4237 }, { "epoch": 2.2061426340447685, "grad_norm": 0.23414764136210095, "learning_rate": 8.633308696537865e-06, "loss": 0.0486, "step": 4238 }, { "epoch": 2.2066631962519523, "grad_norm": 0.24236213181260452, "learning_rate": 8.622690607433644e-06, "loss": 0.0506, "step": 4239 }, { "epoch": 2.207183758459136, "grad_norm": 0.22663158777101486, "learning_rate": 8.612077691045856e-06, "loss": 0.0481, "step": 4240 }, { "epoch": 2.20770432066632, "grad_norm": 0.23127324675787353, "learning_rate": 8.601469950726562e-06, "loss": 0.0494, "step": 4241 }, { "epoch": 2.2082248828735036, "grad_norm": 0.23219971597920153, "learning_rate": 8.59086738982618e-06, "loss": 0.0482, "step": 4242 }, { "epoch": 2.2087454450806874, "grad_norm": 0.22566023224763826, "learning_rate": 8.580270011693498e-06, "loss": 0.0501, "step": 4243 }, { "epoch": 2.2092660072878707, "grad_norm": 0.22931201717326824, "learning_rate": 8.569677819675646e-06, "loss": 0.0469, "step": 4244 }, { "epoch": 2.2097865694950545, "grad_norm": 0.22932887506999267, "learning_rate": 8.55909081711814e-06, "loss": 0.0474, "step": 4245 }, { "epoch": 2.2103071317022382, "grad_norm": 0.23290519077904787, "learning_rate": 8.548509007364849e-06, "loss": 0.0484, "step": 4246 }, { "epoch": 2.210827693909422, "grad_norm": 0.23050032981873889, "learning_rate": 8.537932393758008e-06, "loss": 0.0495, "step": 4247 }, { "epoch": 2.211348256116606, "grad_norm": 0.23320169879502017, "learning_rate": 8.527360979638196e-06, "loss": 0.0487, "step": 4248 }, { "epoch": 2.2118688183237896, "grad_norm": 0.22937417737271626, "learning_rate": 8.51679476834435e-06, "loss": 0.0492, "step": 4249 }, { "epoch": 2.2123893805309733, "grad_norm": 0.23866247343146085, "learning_rate": 8.506233763213776e-06, "loss": 0.0481, "step": 4250 }, { "epoch": 2.212909942738157, "grad_norm": 0.2544913382799314, "learning_rate": 8.495677967582135e-06, "loss": 0.0493, "step": 4251 }, { "epoch": 2.213430504945341, "grad_norm": 0.2335235599829452, "learning_rate": 8.485127384783446e-06, "loss": 0.0486, "step": 4252 }, { "epoch": 2.2139510671525247, "grad_norm": 0.24283971222288098, "learning_rate": 8.474582018150054e-06, "loss": 0.0484, "step": 4253 }, { "epoch": 2.2144716293597084, "grad_norm": 0.23164997555170744, "learning_rate": 8.464041871012687e-06, "loss": 0.0497, "step": 4254 }, { "epoch": 2.214992191566892, "grad_norm": 0.2413814098062649, "learning_rate": 8.453506946700418e-06, "loss": 0.0493, "step": 4255 }, { "epoch": 2.215512753774076, "grad_norm": 0.2454607147482343, "learning_rate": 8.442977248540667e-06, "loss": 0.0479, "step": 4256 }, { "epoch": 2.2160333159812597, "grad_norm": 0.23779862580138247, "learning_rate": 8.4324527798592e-06, "loss": 0.0497, "step": 4257 }, { "epoch": 2.2165538781884435, "grad_norm": 0.23762422063366767, "learning_rate": 8.421933543980126e-06, "loss": 0.0505, "step": 4258 }, { "epoch": 2.2170744403956273, "grad_norm": 0.23130632658477607, "learning_rate": 8.411419544225913e-06, "loss": 0.0487, "step": 4259 }, { "epoch": 2.217595002602811, "grad_norm": 0.22356019790315676, "learning_rate": 8.400910783917377e-06, "loss": 0.0472, "step": 4260 }, { "epoch": 2.218115564809995, "grad_norm": 0.22887925032024237, "learning_rate": 8.390407266373674e-06, "loss": 0.0476, "step": 4261 }, { "epoch": 2.2186361270171786, "grad_norm": 0.23463897003025824, "learning_rate": 8.379908994912294e-06, "loss": 0.049, "step": 4262 }, { "epoch": 2.2191566892243624, "grad_norm": 0.23240197161886428, "learning_rate": 8.369415972849088e-06, "loss": 0.0495, "step": 4263 }, { "epoch": 2.219677251431546, "grad_norm": 0.22919016167185535, "learning_rate": 8.358928203498236e-06, "loss": 0.0502, "step": 4264 }, { "epoch": 2.22019781363873, "grad_norm": 0.2202440420250215, "learning_rate": 8.348445690172274e-06, "loss": 0.0476, "step": 4265 }, { "epoch": 2.2207183758459137, "grad_norm": 0.23110580036654815, "learning_rate": 8.337968436182054e-06, "loss": 0.0503, "step": 4266 }, { "epoch": 2.2212389380530975, "grad_norm": 0.23758638402574997, "learning_rate": 8.327496444836793e-06, "loss": 0.0509, "step": 4267 }, { "epoch": 2.2217595002602812, "grad_norm": 0.22791558989136954, "learning_rate": 8.317029719444016e-06, "loss": 0.0469, "step": 4268 }, { "epoch": 2.222280062467465, "grad_norm": 0.23245148595947804, "learning_rate": 8.306568263309616e-06, "loss": 0.049, "step": 4269 }, { "epoch": 2.222800624674649, "grad_norm": 0.22057289323649362, "learning_rate": 8.296112079737808e-06, "loss": 0.047, "step": 4270 }, { "epoch": 2.2233211868818326, "grad_norm": 0.2291814857700997, "learning_rate": 8.28566117203113e-06, "loss": 0.0493, "step": 4271 }, { "epoch": 2.2238417490890163, "grad_norm": 0.23917673247453083, "learning_rate": 8.275215543490475e-06, "loss": 0.0492, "step": 4272 }, { "epoch": 2.2243623112962, "grad_norm": 0.23646886044374246, "learning_rate": 8.264775197415053e-06, "loss": 0.0495, "step": 4273 }, { "epoch": 2.2248828735033834, "grad_norm": 0.23541029580737305, "learning_rate": 8.254340137102426e-06, "loss": 0.0485, "step": 4274 }, { "epoch": 2.225403435710567, "grad_norm": 0.25343510785437356, "learning_rate": 8.243910365848448e-06, "loss": 0.051, "step": 4275 }, { "epoch": 2.225923997917751, "grad_norm": 0.2298835859276625, "learning_rate": 8.233485886947346e-06, "loss": 0.0489, "step": 4276 }, { "epoch": 2.2264445601249347, "grad_norm": 0.22912963408681325, "learning_rate": 8.22306670369164e-06, "loss": 0.0487, "step": 4277 }, { "epoch": 2.2269651223321185, "grad_norm": 0.23556170673164437, "learning_rate": 8.2126528193722e-06, "loss": 0.0485, "step": 4278 }, { "epoch": 2.2274856845393023, "grad_norm": 0.22736010372358842, "learning_rate": 8.202244237278223e-06, "loss": 0.0474, "step": 4279 }, { "epoch": 2.228006246746486, "grad_norm": 0.2427914749744837, "learning_rate": 8.19184096069721e-06, "loss": 0.0501, "step": 4280 }, { "epoch": 2.22852680895367, "grad_norm": 0.2351054578306983, "learning_rate": 8.181442992915e-06, "loss": 0.0486, "step": 4281 }, { "epoch": 2.2290473711608536, "grad_norm": 0.22778548939677618, "learning_rate": 8.171050337215767e-06, "loss": 0.0483, "step": 4282 }, { "epoch": 2.2295679333680374, "grad_norm": 0.22980336566734544, "learning_rate": 8.160662996881996e-06, "loss": 0.0468, "step": 4283 }, { "epoch": 2.230088495575221, "grad_norm": 0.23245660876536978, "learning_rate": 8.150280975194478e-06, "loss": 0.0477, "step": 4284 }, { "epoch": 2.230609057782405, "grad_norm": 0.22749204175239007, "learning_rate": 8.139904275432354e-06, "loss": 0.0489, "step": 4285 }, { "epoch": 2.2311296199895887, "grad_norm": 0.23848791969754046, "learning_rate": 8.129532900873051e-06, "loss": 0.0495, "step": 4286 }, { "epoch": 2.2316501821967725, "grad_norm": 0.24418712222939992, "learning_rate": 8.119166854792345e-06, "loss": 0.048, "step": 4287 }, { "epoch": 2.2321707444039562, "grad_norm": 0.23832224809407973, "learning_rate": 8.10880614046432e-06, "loss": 0.0502, "step": 4288 }, { "epoch": 2.23269130661114, "grad_norm": 0.2349337209019424, "learning_rate": 8.098450761161356e-06, "loss": 0.0488, "step": 4289 }, { "epoch": 2.233211868818324, "grad_norm": 0.236101048702289, "learning_rate": 8.08810072015417e-06, "loss": 0.0486, "step": 4290 }, { "epoch": 2.2337324310255076, "grad_norm": 0.2436594526550554, "learning_rate": 8.07775602071179e-06, "loss": 0.0502, "step": 4291 }, { "epoch": 2.2342529932326913, "grad_norm": 0.24215638153778066, "learning_rate": 8.067416666101562e-06, "loss": 0.0501, "step": 4292 }, { "epoch": 2.234773555439875, "grad_norm": 0.22910339256480491, "learning_rate": 8.057082659589115e-06, "loss": 0.0485, "step": 4293 }, { "epoch": 2.235294117647059, "grad_norm": 0.23759220672459413, "learning_rate": 8.046754004438429e-06, "loss": 0.0501, "step": 4294 }, { "epoch": 2.2358146798542426, "grad_norm": 0.2396620539258785, "learning_rate": 8.036430703911754e-06, "loss": 0.0505, "step": 4295 }, { "epoch": 2.2363352420614264, "grad_norm": 0.21656808633265354, "learning_rate": 8.026112761269683e-06, "loss": 0.0467, "step": 4296 }, { "epoch": 2.23685580426861, "grad_norm": 0.23694210045408912, "learning_rate": 8.015800179771105e-06, "loss": 0.0492, "step": 4297 }, { "epoch": 2.237376366475794, "grad_norm": 0.23066475593043279, "learning_rate": 8.005492962673197e-06, "loss": 0.0485, "step": 4298 }, { "epoch": 2.2378969286829777, "grad_norm": 0.23780414518123882, "learning_rate": 7.99519111323147e-06, "loss": 0.0502, "step": 4299 }, { "epoch": 2.2384174908901615, "grad_norm": 0.22874416790471078, "learning_rate": 7.984894634699725e-06, "loss": 0.0477, "step": 4300 }, { "epoch": 2.2389380530973453, "grad_norm": 0.225516589384533, "learning_rate": 7.974603530330069e-06, "loss": 0.0478, "step": 4301 }, { "epoch": 2.239458615304529, "grad_norm": 0.24222066580737353, "learning_rate": 7.964317803372918e-06, "loss": 0.0509, "step": 4302 }, { "epoch": 2.239979177511713, "grad_norm": 0.24287142818992802, "learning_rate": 7.95403745707698e-06, "loss": 0.0497, "step": 4303 }, { "epoch": 2.2404997397188966, "grad_norm": 0.24632751357990357, "learning_rate": 7.943762494689252e-06, "loss": 0.0503, "step": 4304 }, { "epoch": 2.2410203019260804, "grad_norm": 0.2450475168217388, "learning_rate": 7.93349291945506e-06, "loss": 0.0484, "step": 4305 }, { "epoch": 2.241540864133264, "grad_norm": 0.24379993992170595, "learning_rate": 7.92322873461801e-06, "loss": 0.0478, "step": 4306 }, { "epoch": 2.242061426340448, "grad_norm": 0.24148406455105237, "learning_rate": 7.912969943420018e-06, "loss": 0.049, "step": 4307 }, { "epoch": 2.2425819885476312, "grad_norm": 0.25211178202149287, "learning_rate": 7.902716549101272e-06, "loss": 0.0519, "step": 4308 }, { "epoch": 2.243102550754815, "grad_norm": 0.22380206855764131, "learning_rate": 7.892468554900278e-06, "loss": 0.0457, "step": 4309 }, { "epoch": 2.243623112961999, "grad_norm": 0.23150832565863866, "learning_rate": 7.88222596405383e-06, "loss": 0.048, "step": 4310 }, { "epoch": 2.2441436751691826, "grad_norm": 0.24560507817753072, "learning_rate": 7.871988779797024e-06, "loss": 0.0509, "step": 4311 }, { "epoch": 2.2446642373763663, "grad_norm": 0.22181091893953384, "learning_rate": 7.861757005363232e-06, "loss": 0.0475, "step": 4312 }, { "epoch": 2.24518479958355, "grad_norm": 0.23679248354187557, "learning_rate": 7.851530643984111e-06, "loss": 0.0484, "step": 4313 }, { "epoch": 2.245705361790734, "grad_norm": 0.23595668958031096, "learning_rate": 7.841309698889638e-06, "loss": 0.0501, "step": 4314 }, { "epoch": 2.2462259239979177, "grad_norm": 0.22600608264316838, "learning_rate": 7.831094173308056e-06, "loss": 0.0483, "step": 4315 }, { "epoch": 2.2467464862051014, "grad_norm": 0.23927132977588947, "learning_rate": 7.820884070465914e-06, "loss": 0.0497, "step": 4316 }, { "epoch": 2.247267048412285, "grad_norm": 0.2291797616221194, "learning_rate": 7.810679393588025e-06, "loss": 0.049, "step": 4317 }, { "epoch": 2.247787610619469, "grad_norm": 0.2252726239114742, "learning_rate": 7.800480145897501e-06, "loss": 0.0479, "step": 4318 }, { "epoch": 2.2483081728266527, "grad_norm": 0.2209306053422971, "learning_rate": 7.790286330615749e-06, "loss": 0.0466, "step": 4319 }, { "epoch": 2.2488287350338365, "grad_norm": 0.21256290410992668, "learning_rate": 7.780097950962447e-06, "loss": 0.0459, "step": 4320 }, { "epoch": 2.2493492972410203, "grad_norm": 0.23304751234583473, "learning_rate": 7.76991501015556e-06, "loss": 0.0494, "step": 4321 }, { "epoch": 2.249869859448204, "grad_norm": 0.24109946012783512, "learning_rate": 7.759737511411325e-06, "loss": 0.05, "step": 4322 }, { "epoch": 2.250390421655388, "grad_norm": 0.23100844503880485, "learning_rate": 7.749565457944274e-06, "loss": 0.0477, "step": 4323 }, { "epoch": 2.2509109838625716, "grad_norm": 0.22756640638272935, "learning_rate": 7.73939885296722e-06, "loss": 0.0488, "step": 4324 }, { "epoch": 2.2514315460697554, "grad_norm": 0.2411358442421993, "learning_rate": 7.729237699691254e-06, "loss": 0.0514, "step": 4325 }, { "epoch": 2.251952108276939, "grad_norm": 0.2385049139121903, "learning_rate": 7.719082001325728e-06, "loss": 0.0489, "step": 4326 }, { "epoch": 2.252472670484123, "grad_norm": 0.24245891150143284, "learning_rate": 7.70893176107829e-06, "loss": 0.0492, "step": 4327 }, { "epoch": 2.2529932326913067, "grad_norm": 0.23962567363070575, "learning_rate": 7.698786982154857e-06, "loss": 0.0486, "step": 4328 }, { "epoch": 2.2535137948984905, "grad_norm": 0.23067229481030735, "learning_rate": 7.688647667759633e-06, "loss": 0.0482, "step": 4329 }, { "epoch": 2.2540343571056742, "grad_norm": 0.23572306878031657, "learning_rate": 7.678513821095076e-06, "loss": 0.0488, "step": 4330 }, { "epoch": 2.254554919312858, "grad_norm": 0.22767821391268697, "learning_rate": 7.668385445361923e-06, "loss": 0.0478, "step": 4331 }, { "epoch": 2.255075481520042, "grad_norm": 0.22836659984706298, "learning_rate": 7.658262543759184e-06, "loss": 0.0465, "step": 4332 }, { "epoch": 2.2555960437272256, "grad_norm": 0.2393729125564174, "learning_rate": 7.648145119484152e-06, "loss": 0.0482, "step": 4333 }, { "epoch": 2.2561166059344093, "grad_norm": 0.23094639008000092, "learning_rate": 7.638033175732385e-06, "loss": 0.0483, "step": 4334 }, { "epoch": 2.256637168141593, "grad_norm": 0.22558491929936555, "learning_rate": 7.627926715697689e-06, "loss": 0.0473, "step": 4335 }, { "epoch": 2.257157730348777, "grad_norm": 0.23284991884566408, "learning_rate": 7.617825742572163e-06, "loss": 0.0488, "step": 4336 }, { "epoch": 2.25767829255596, "grad_norm": 0.23896169808363904, "learning_rate": 7.607730259546164e-06, "loss": 0.0491, "step": 4337 }, { "epoch": 2.258198854763144, "grad_norm": 0.229443652276642, "learning_rate": 7.597640269808323e-06, "loss": 0.0483, "step": 4338 }, { "epoch": 2.2587194169703277, "grad_norm": 0.22477269530377628, "learning_rate": 7.5875557765455245e-06, "loss": 0.0485, "step": 4339 }, { "epoch": 2.2592399791775115, "grad_norm": 0.2248750508583482, "learning_rate": 7.577476782942905e-06, "loss": 0.046, "step": 4340 }, { "epoch": 2.2597605413846953, "grad_norm": 0.24282761232837097, "learning_rate": 7.567403292183892e-06, "loss": 0.0488, "step": 4341 }, { "epoch": 2.260281103591879, "grad_norm": 0.24559688750387063, "learning_rate": 7.557335307450164e-06, "loss": 0.0487, "step": 4342 }, { "epoch": 2.260801665799063, "grad_norm": 0.2279229009237183, "learning_rate": 7.547272831921665e-06, "loss": 0.0473, "step": 4343 }, { "epoch": 2.2613222280062466, "grad_norm": 0.24163113811010245, "learning_rate": 7.5372158687765784e-06, "loss": 0.0488, "step": 4344 }, { "epoch": 2.2618427902134304, "grad_norm": 0.22542343579424412, "learning_rate": 7.527164421191369e-06, "loss": 0.0473, "step": 4345 }, { "epoch": 2.262363352420614, "grad_norm": 0.24354092974905714, "learning_rate": 7.51711849234075e-06, "loss": 0.0493, "step": 4346 }, { "epoch": 2.262883914627798, "grad_norm": 0.2315919364564454, "learning_rate": 7.507078085397701e-06, "loss": 0.0481, "step": 4347 }, { "epoch": 2.2634044768349817, "grad_norm": 0.23710368286301195, "learning_rate": 7.497043203533444e-06, "loss": 0.0481, "step": 4348 }, { "epoch": 2.2639250390421655, "grad_norm": 0.23553140969761996, "learning_rate": 7.487013849917454e-06, "loss": 0.0479, "step": 4349 }, { "epoch": 2.2644456012493492, "grad_norm": 0.24202468474373762, "learning_rate": 7.476990027717473e-06, "loss": 0.0502, "step": 4350 }, { "epoch": 2.264966163456533, "grad_norm": 0.24252991253463546, "learning_rate": 7.46697174009949e-06, "loss": 0.0506, "step": 4351 }, { "epoch": 2.265486725663717, "grad_norm": 0.24450739997748402, "learning_rate": 7.456958990227761e-06, "loss": 0.0493, "step": 4352 }, { "epoch": 2.2660072878709006, "grad_norm": 0.23512299197952063, "learning_rate": 7.446951781264755e-06, "loss": 0.05, "step": 4353 }, { "epoch": 2.2665278500780843, "grad_norm": 0.2349636567214315, "learning_rate": 7.436950116371225e-06, "loss": 0.0478, "step": 4354 }, { "epoch": 2.267048412285268, "grad_norm": 0.22313490108520748, "learning_rate": 7.4269539987061625e-06, "loss": 0.0489, "step": 4355 }, { "epoch": 2.267568974492452, "grad_norm": 0.24266098411924822, "learning_rate": 7.416963431426815e-06, "loss": 0.0508, "step": 4356 }, { "epoch": 2.2680895366996356, "grad_norm": 0.22709180519413907, "learning_rate": 7.406978417688659e-06, "loss": 0.0476, "step": 4357 }, { "epoch": 2.2686100989068194, "grad_norm": 0.2252843789851944, "learning_rate": 7.396998960645418e-06, "loss": 0.0475, "step": 4358 }, { "epoch": 2.269130661114003, "grad_norm": 0.2280964931147517, "learning_rate": 7.387025063449082e-06, "loss": 0.0479, "step": 4359 }, { "epoch": 2.269651223321187, "grad_norm": 0.22442020507845567, "learning_rate": 7.377056729249865e-06, "loss": 0.0474, "step": 4360 }, { "epoch": 2.2701717855283707, "grad_norm": 0.23536807823855144, "learning_rate": 7.3670939611962446e-06, "loss": 0.0471, "step": 4361 }, { "epoch": 2.2706923477355545, "grad_norm": 0.22341853555386254, "learning_rate": 7.357136762434908e-06, "loss": 0.0467, "step": 4362 }, { "epoch": 2.2712129099427383, "grad_norm": 0.2314457941140156, "learning_rate": 7.347185136110807e-06, "loss": 0.0474, "step": 4363 }, { "epoch": 2.271733472149922, "grad_norm": 0.23444389464917653, "learning_rate": 7.337239085367134e-06, "loss": 0.0473, "step": 4364 }, { "epoch": 2.272254034357106, "grad_norm": 0.22895085279319216, "learning_rate": 7.32729861334531e-06, "loss": 0.0482, "step": 4365 }, { "epoch": 2.2727745965642896, "grad_norm": 0.2301198469063954, "learning_rate": 7.317363723185017e-06, "loss": 0.0466, "step": 4366 }, { "epoch": 2.2732951587714734, "grad_norm": 0.2375738571090302, "learning_rate": 7.3074344180241225e-06, "loss": 0.0493, "step": 4367 }, { "epoch": 2.273815720978657, "grad_norm": 0.23547330811705733, "learning_rate": 7.297510700998783e-06, "loss": 0.0467, "step": 4368 }, { "epoch": 2.274336283185841, "grad_norm": 0.23704718711434258, "learning_rate": 7.2875925752433655e-06, "loss": 0.0484, "step": 4369 }, { "epoch": 2.2748568453930247, "grad_norm": 0.23007390803788796, "learning_rate": 7.277680043890475e-06, "loss": 0.0462, "step": 4370 }, { "epoch": 2.2753774076002085, "grad_norm": 0.2382599584192981, "learning_rate": 7.267773110070964e-06, "loss": 0.0489, "step": 4371 }, { "epoch": 2.2758979698073922, "grad_norm": 0.23792105081665424, "learning_rate": 7.25787177691388e-06, "loss": 0.0485, "step": 4372 }, { "epoch": 2.2764185320145756, "grad_norm": 0.22893605244305398, "learning_rate": 7.2479760475465395e-06, "loss": 0.0457, "step": 4373 }, { "epoch": 2.2769390942217593, "grad_norm": 0.2372593732772901, "learning_rate": 7.238085925094468e-06, "loss": 0.0486, "step": 4374 }, { "epoch": 2.277459656428943, "grad_norm": 0.23222430381532452, "learning_rate": 7.22820141268144e-06, "loss": 0.0473, "step": 4375 }, { "epoch": 2.277980218636127, "grad_norm": 0.23165532275317738, "learning_rate": 7.2183225134294345e-06, "loss": 0.0489, "step": 4376 }, { "epoch": 2.2785007808433106, "grad_norm": 0.22857397337810045, "learning_rate": 7.2084492304586586e-06, "loss": 0.0485, "step": 4377 }, { "epoch": 2.2790213430504944, "grad_norm": 0.2308547953684119, "learning_rate": 7.19858156688756e-06, "loss": 0.0459, "step": 4378 }, { "epoch": 2.279541905257678, "grad_norm": 0.22575144523819185, "learning_rate": 7.188719525832813e-06, "loss": 0.0476, "step": 4379 }, { "epoch": 2.280062467464862, "grad_norm": 0.24027575061928344, "learning_rate": 7.1788631104093145e-06, "loss": 0.0496, "step": 4380 }, { "epoch": 2.2805830296720457, "grad_norm": 0.23540572654705236, "learning_rate": 7.1690123237301596e-06, "loss": 0.0501, "step": 4381 }, { "epoch": 2.2811035918792295, "grad_norm": 0.24772965477869333, "learning_rate": 7.159167168906694e-06, "loss": 0.049, "step": 4382 }, { "epoch": 2.2816241540864133, "grad_norm": 0.2320276288812207, "learning_rate": 7.149327649048482e-06, "loss": 0.047, "step": 4383 }, { "epoch": 2.282144716293597, "grad_norm": 0.23303073974073554, "learning_rate": 7.1394937672633e-06, "loss": 0.0481, "step": 4384 }, { "epoch": 2.282665278500781, "grad_norm": 0.2353449984385499, "learning_rate": 7.129665526657145e-06, "loss": 0.0479, "step": 4385 }, { "epoch": 2.2831858407079646, "grad_norm": 0.23270277910500065, "learning_rate": 7.119842930334222e-06, "loss": 0.0479, "step": 4386 }, { "epoch": 2.2837064029151484, "grad_norm": 0.24467249136218208, "learning_rate": 7.110025981396975e-06, "loss": 0.0503, "step": 4387 }, { "epoch": 2.284226965122332, "grad_norm": 0.23634108219594238, "learning_rate": 7.100214682946049e-06, "loss": 0.0483, "step": 4388 }, { "epoch": 2.284747527329516, "grad_norm": 0.2243524743535289, "learning_rate": 7.090409038080317e-06, "loss": 0.0477, "step": 4389 }, { "epoch": 2.2852680895366997, "grad_norm": 0.2272063584018309, "learning_rate": 7.080609049896844e-06, "loss": 0.0486, "step": 4390 }, { "epoch": 2.2857886517438835, "grad_norm": 0.22104862431300737, "learning_rate": 7.0708147214909315e-06, "loss": 0.0479, "step": 4391 }, { "epoch": 2.2863092139510672, "grad_norm": 0.2324349902565072, "learning_rate": 7.06102605595608e-06, "loss": 0.0489, "step": 4392 }, { "epoch": 2.286829776158251, "grad_norm": 0.23945786205814168, "learning_rate": 7.051243056384016e-06, "loss": 0.0473, "step": 4393 }, { "epoch": 2.2873503383654348, "grad_norm": 0.2290382839925783, "learning_rate": 7.04146572586466e-06, "loss": 0.0476, "step": 4394 }, { "epoch": 2.2878709005726185, "grad_norm": 0.22924448362356376, "learning_rate": 7.031694067486136e-06, "loss": 0.0474, "step": 4395 }, { "epoch": 2.2883914627798023, "grad_norm": 0.23714424830350492, "learning_rate": 7.0219280843348e-06, "loss": 0.05, "step": 4396 }, { "epoch": 2.288912024986986, "grad_norm": 0.2261718669538399, "learning_rate": 7.012167779495201e-06, "loss": 0.0471, "step": 4397 }, { "epoch": 2.28943258719417, "grad_norm": 0.22758504532465043, "learning_rate": 7.002413156050108e-06, "loss": 0.0467, "step": 4398 }, { "epoch": 2.2899531494013536, "grad_norm": 0.23789065938410056, "learning_rate": 6.9926642170804665e-06, "loss": 0.0485, "step": 4399 }, { "epoch": 2.2904737116085374, "grad_norm": 0.24091702145465044, "learning_rate": 6.982920965665457e-06, "loss": 0.0455, "step": 4400 }, { "epoch": 2.2909942738157207, "grad_norm": 0.2359330189743294, "learning_rate": 6.9731834048824465e-06, "loss": 0.0486, "step": 4401 }, { "epoch": 2.2915148360229045, "grad_norm": 0.23437207513741382, "learning_rate": 6.963451537807023e-06, "loss": 0.0473, "step": 4402 }, { "epoch": 2.2920353982300883, "grad_norm": 0.23735518742716438, "learning_rate": 6.953725367512951e-06, "loss": 0.0476, "step": 4403 }, { "epoch": 2.292555960437272, "grad_norm": 0.2462383376150093, "learning_rate": 6.944004897072201e-06, "loss": 0.0487, "step": 4404 }, { "epoch": 2.293076522644456, "grad_norm": 0.2411982549992137, "learning_rate": 6.934290129554957e-06, "loss": 0.0477, "step": 4405 }, { "epoch": 2.2935970848516396, "grad_norm": 0.218847924533852, "learning_rate": 6.924581068029598e-06, "loss": 0.0456, "step": 4406 }, { "epoch": 2.2941176470588234, "grad_norm": 0.22811627743109275, "learning_rate": 6.914877715562704e-06, "loss": 0.0464, "step": 4407 }, { "epoch": 2.294638209266007, "grad_norm": 0.2463222783668148, "learning_rate": 6.905180075219025e-06, "loss": 0.0476, "step": 4408 }, { "epoch": 2.295158771473191, "grad_norm": 0.2494378835645998, "learning_rate": 6.895488150061541e-06, "loss": 0.0481, "step": 4409 }, { "epoch": 2.2956793336803747, "grad_norm": 0.22371944092940846, "learning_rate": 6.88580194315141e-06, "loss": 0.0456, "step": 4410 }, { "epoch": 2.2961998958875585, "grad_norm": 0.23726791642543021, "learning_rate": 6.876121457547996e-06, "loss": 0.0477, "step": 4411 }, { "epoch": 2.2967204580947422, "grad_norm": 0.22025059869257912, "learning_rate": 6.866446696308837e-06, "loss": 0.0444, "step": 4412 }, { "epoch": 2.297241020301926, "grad_norm": 0.2373689563171938, "learning_rate": 6.856777662489669e-06, "loss": 0.0473, "step": 4413 }, { "epoch": 2.29776158250911, "grad_norm": 0.22953651254536686, "learning_rate": 6.847114359144427e-06, "loss": 0.0476, "step": 4414 }, { "epoch": 2.2982821447162936, "grad_norm": 0.2314904782948286, "learning_rate": 6.83745678932523e-06, "loss": 0.0475, "step": 4415 }, { "epoch": 2.2988027069234773, "grad_norm": 0.23068456271896137, "learning_rate": 6.8278049560824035e-06, "loss": 0.0497, "step": 4416 }, { "epoch": 2.299323269130661, "grad_norm": 0.22509485065033707, "learning_rate": 6.818158862464422e-06, "loss": 0.0473, "step": 4417 }, { "epoch": 2.299843831337845, "grad_norm": 0.23754030254959319, "learning_rate": 6.8085185115179836e-06, "loss": 0.0484, "step": 4418 }, { "epoch": 2.3003643935450286, "grad_norm": 0.23154423326948806, "learning_rate": 6.798883906287956e-06, "loss": 0.0476, "step": 4419 }, { "epoch": 2.3008849557522124, "grad_norm": 0.2276877249333266, "learning_rate": 6.789255049817406e-06, "loss": 0.0474, "step": 4420 }, { "epoch": 2.301405517959396, "grad_norm": 0.2455145987692723, "learning_rate": 6.779631945147566e-06, "loss": 0.0484, "step": 4421 }, { "epoch": 2.30192608016658, "grad_norm": 0.23746214997113513, "learning_rate": 6.770014595317853e-06, "loss": 0.0464, "step": 4422 }, { "epoch": 2.3024466423737637, "grad_norm": 0.22566035826376565, "learning_rate": 6.760403003365884e-06, "loss": 0.0471, "step": 4423 }, { "epoch": 2.3029672045809475, "grad_norm": 0.23320331066661007, "learning_rate": 6.750797172327442e-06, "loss": 0.0492, "step": 4424 }, { "epoch": 2.3034877667881313, "grad_norm": 0.22163307688943074, "learning_rate": 6.741197105236505e-06, "loss": 0.0456, "step": 4425 }, { "epoch": 2.304008328995315, "grad_norm": 0.24047914367787057, "learning_rate": 6.731602805125206e-06, "loss": 0.0492, "step": 4426 }, { "epoch": 2.304528891202499, "grad_norm": 0.22844822202025908, "learning_rate": 6.72201427502388e-06, "loss": 0.0477, "step": 4427 }, { "epoch": 2.3050494534096826, "grad_norm": 0.22813096140516748, "learning_rate": 6.712431517961029e-06, "loss": 0.0472, "step": 4428 }, { "epoch": 2.3055700156168664, "grad_norm": 0.22811760187365832, "learning_rate": 6.702854536963343e-06, "loss": 0.0472, "step": 4429 }, { "epoch": 2.30609057782405, "grad_norm": 0.23994023415059773, "learning_rate": 6.69328333505567e-06, "loss": 0.0484, "step": 4430 }, { "epoch": 2.306611140031234, "grad_norm": 0.23874833178382263, "learning_rate": 6.683717915261034e-06, "loss": 0.0498, "step": 4431 }, { "epoch": 2.3071317022384177, "grad_norm": 0.2331323610830941, "learning_rate": 6.674158280600645e-06, "loss": 0.0479, "step": 4432 }, { "epoch": 2.3076522644456015, "grad_norm": 0.23190741443914148, "learning_rate": 6.6646044340938854e-06, "loss": 0.0474, "step": 4433 }, { "epoch": 2.3081728266527852, "grad_norm": 0.23628413869646864, "learning_rate": 6.655056378758298e-06, "loss": 0.0498, "step": 4434 }, { "epoch": 2.308693388859969, "grad_norm": 0.2313885805739463, "learning_rate": 6.645514117609616e-06, "loss": 0.0475, "step": 4435 }, { "epoch": 2.3092139510671528, "grad_norm": 0.2233459799496242, "learning_rate": 6.6359776536617096e-06, "loss": 0.0467, "step": 4436 }, { "epoch": 2.309734513274336, "grad_norm": 0.2363879717090792, "learning_rate": 6.626446989926652e-06, "loss": 0.0484, "step": 4437 }, { "epoch": 2.31025507548152, "grad_norm": 0.23027837691324807, "learning_rate": 6.616922129414671e-06, "loss": 0.0462, "step": 4438 }, { "epoch": 2.3107756376887036, "grad_norm": 0.22061521966437242, "learning_rate": 6.6074030751341496e-06, "loss": 0.046, "step": 4439 }, { "epoch": 2.3112961998958874, "grad_norm": 0.236489044723267, "learning_rate": 6.597889830091664e-06, "loss": 0.0491, "step": 4440 }, { "epoch": 2.311816762103071, "grad_norm": 0.2256938496890428, "learning_rate": 6.5883823972919205e-06, "loss": 0.0463, "step": 4441 }, { "epoch": 2.312337324310255, "grad_norm": 0.23195296992115286, "learning_rate": 6.5788807797378196e-06, "loss": 0.0486, "step": 4442 }, { "epoch": 2.3128578865174387, "grad_norm": 0.2274872199904384, "learning_rate": 6.569384980430415e-06, "loss": 0.0466, "step": 4443 }, { "epoch": 2.3133784487246225, "grad_norm": 0.2438659322406287, "learning_rate": 6.559895002368927e-06, "loss": 0.0502, "step": 4444 }, { "epoch": 2.3138990109318063, "grad_norm": 0.24195755478933637, "learning_rate": 6.5504108485507175e-06, "loss": 0.0493, "step": 4445 }, { "epoch": 2.31441957313899, "grad_norm": 0.2312187971962166, "learning_rate": 6.5409325219713325e-06, "loss": 0.0468, "step": 4446 }, { "epoch": 2.314940135346174, "grad_norm": 0.23686834836229043, "learning_rate": 6.531460025624475e-06, "loss": 0.0473, "step": 4447 }, { "epoch": 2.3154606975533576, "grad_norm": 0.22867574310554584, "learning_rate": 6.521993362501988e-06, "loss": 0.0458, "step": 4448 }, { "epoch": 2.3159812597605414, "grad_norm": 0.23825031277590675, "learning_rate": 6.512532535593896e-06, "loss": 0.0474, "step": 4449 }, { "epoch": 2.316501821967725, "grad_norm": 0.23330630762399085, "learning_rate": 6.503077547888353e-06, "loss": 0.047, "step": 4450 }, { "epoch": 2.317022384174909, "grad_norm": 0.22866429601766594, "learning_rate": 6.493628402371693e-06, "loss": 0.047, "step": 4451 }, { "epoch": 2.3175429463820927, "grad_norm": 0.23326929579367334, "learning_rate": 6.484185102028398e-06, "loss": 0.0467, "step": 4452 }, { "epoch": 2.3180635085892765, "grad_norm": 0.21423278359694856, "learning_rate": 6.474747649841103e-06, "loss": 0.0458, "step": 4453 }, { "epoch": 2.3185840707964602, "grad_norm": 0.22292986350540375, "learning_rate": 6.465316048790587e-06, "loss": 0.0473, "step": 4454 }, { "epoch": 2.319104633003644, "grad_norm": 0.24046086305383696, "learning_rate": 6.4558903018557936e-06, "loss": 0.0506, "step": 4455 }, { "epoch": 2.3196251952108278, "grad_norm": 0.23153182365151787, "learning_rate": 6.446470412013817e-06, "loss": 0.0471, "step": 4456 }, { "epoch": 2.3201457574180115, "grad_norm": 0.2256223599408448, "learning_rate": 6.437056382239884e-06, "loss": 0.0475, "step": 4457 }, { "epoch": 2.3206663196251953, "grad_norm": 0.22640346868399505, "learning_rate": 6.427648215507398e-06, "loss": 0.0497, "step": 4458 }, { "epoch": 2.321186881832379, "grad_norm": 0.22663691130270067, "learning_rate": 6.418245914787882e-06, "loss": 0.0451, "step": 4459 }, { "epoch": 2.321707444039563, "grad_norm": 0.22821336766562575, "learning_rate": 6.408849483051024e-06, "loss": 0.0472, "step": 4460 }, { "epoch": 2.3222280062467466, "grad_norm": 0.23385635491282356, "learning_rate": 6.399458923264659e-06, "loss": 0.0482, "step": 4461 }, { "epoch": 2.3227485684539304, "grad_norm": 0.2320806105076779, "learning_rate": 6.3900742383947664e-06, "loss": 0.0459, "step": 4462 }, { "epoch": 2.323269130661114, "grad_norm": 0.2389305017153277, "learning_rate": 6.380695431405453e-06, "loss": 0.0496, "step": 4463 }, { "epoch": 2.323789692868298, "grad_norm": 0.22813071098401094, "learning_rate": 6.371322505258992e-06, "loss": 0.0457, "step": 4464 }, { "epoch": 2.3243102550754813, "grad_norm": 0.24014119920977195, "learning_rate": 6.361955462915795e-06, "loss": 0.0474, "step": 4465 }, { "epoch": 2.324830817282665, "grad_norm": 0.2416415435940098, "learning_rate": 6.352594307334395e-06, "loss": 0.0473, "step": 4466 }, { "epoch": 2.325351379489849, "grad_norm": 0.2656651285253611, "learning_rate": 6.343239041471497e-06, "loss": 0.0479, "step": 4467 }, { "epoch": 2.3258719416970326, "grad_norm": 0.23550173668414345, "learning_rate": 6.333889668281912e-06, "loss": 0.0473, "step": 4468 }, { "epoch": 2.3263925039042164, "grad_norm": 0.23828823064619004, "learning_rate": 6.324546190718614e-06, "loss": 0.0457, "step": 4469 }, { "epoch": 2.3269130661114, "grad_norm": 0.24471560830756434, "learning_rate": 6.3152086117327116e-06, "loss": 0.0488, "step": 4470 }, { "epoch": 2.327433628318584, "grad_norm": 0.22933514872804697, "learning_rate": 6.305876934273452e-06, "loss": 0.0468, "step": 4471 }, { "epoch": 2.3279541905257677, "grad_norm": 0.23426764882810178, "learning_rate": 6.296551161288197e-06, "loss": 0.0485, "step": 4472 }, { "epoch": 2.3284747527329515, "grad_norm": 0.22856682242539877, "learning_rate": 6.28723129572247e-06, "loss": 0.0462, "step": 4473 }, { "epoch": 2.3289953149401352, "grad_norm": 0.23846002941423738, "learning_rate": 6.277917340519918e-06, "loss": 0.0482, "step": 4474 }, { "epoch": 2.329515877147319, "grad_norm": 0.234480561210467, "learning_rate": 6.268609298622327e-06, "loss": 0.0482, "step": 4475 }, { "epoch": 2.3300364393545028, "grad_norm": 0.24047174883310446, "learning_rate": 6.259307172969606e-06, "loss": 0.0477, "step": 4476 }, { "epoch": 2.3305570015616865, "grad_norm": 0.2344548406347454, "learning_rate": 6.250010966499786e-06, "loss": 0.0484, "step": 4477 }, { "epoch": 2.3310775637688703, "grad_norm": 0.22642383076214023, "learning_rate": 6.240720682149054e-06, "loss": 0.0471, "step": 4478 }, { "epoch": 2.331598125976054, "grad_norm": 0.22569085877899356, "learning_rate": 6.231436322851711e-06, "loss": 0.0465, "step": 4479 }, { "epoch": 2.332118688183238, "grad_norm": 0.21706380279472942, "learning_rate": 6.222157891540198e-06, "loss": 0.0465, "step": 4480 }, { "epoch": 2.3326392503904216, "grad_norm": 0.22733120940204257, "learning_rate": 6.21288539114506e-06, "loss": 0.0475, "step": 4481 }, { "epoch": 2.3331598125976054, "grad_norm": 0.22939434669257294, "learning_rate": 6.203618824594995e-06, "loss": 0.0463, "step": 4482 }, { "epoch": 2.333680374804789, "grad_norm": 0.23734452631886388, "learning_rate": 6.194358194816813e-06, "loss": 0.0477, "step": 4483 }, { "epoch": 2.334200937011973, "grad_norm": 0.23411419581489817, "learning_rate": 6.1851035047354595e-06, "loss": 0.0477, "step": 4484 }, { "epoch": 2.3347214992191567, "grad_norm": 0.24002134917837561, "learning_rate": 6.175854757273989e-06, "loss": 0.0481, "step": 4485 }, { "epoch": 2.3352420614263405, "grad_norm": 0.2370166595867462, "learning_rate": 6.166611955353577e-06, "loss": 0.0506, "step": 4486 }, { "epoch": 2.3357626236335243, "grad_norm": 0.23745847434442083, "learning_rate": 6.157375101893543e-06, "loss": 0.047, "step": 4487 }, { "epoch": 2.336283185840708, "grad_norm": 0.24957266727201047, "learning_rate": 6.148144199811309e-06, "loss": 0.0479, "step": 4488 }, { "epoch": 2.336803748047892, "grad_norm": 0.2351547804630439, "learning_rate": 6.138919252022435e-06, "loss": 0.0474, "step": 4489 }, { "epoch": 2.3373243102550756, "grad_norm": 0.24163133481886173, "learning_rate": 6.129700261440574e-06, "loss": 0.0475, "step": 4490 }, { "epoch": 2.3378448724622594, "grad_norm": 0.24059572984473415, "learning_rate": 6.120487230977517e-06, "loss": 0.0502, "step": 4491 }, { "epoch": 2.338365434669443, "grad_norm": 0.2355645081535075, "learning_rate": 6.1112801635431704e-06, "loss": 0.0497, "step": 4492 }, { "epoch": 2.338885996876627, "grad_norm": 0.23939602704815524, "learning_rate": 6.102079062045559e-06, "loss": 0.0479, "step": 4493 }, { "epoch": 2.3394065590838107, "grad_norm": 0.22977889226796178, "learning_rate": 6.092883929390816e-06, "loss": 0.0456, "step": 4494 }, { "epoch": 2.3399271212909944, "grad_norm": 0.23478062109634587, "learning_rate": 6.083694768483181e-06, "loss": 0.0481, "step": 4495 }, { "epoch": 2.340447683498178, "grad_norm": 0.22765522951948453, "learning_rate": 6.074511582225029e-06, "loss": 0.0484, "step": 4496 }, { "epoch": 2.340968245705362, "grad_norm": 0.21743859076143826, "learning_rate": 6.065334373516834e-06, "loss": 0.0471, "step": 4497 }, { "epoch": 2.3414888079125458, "grad_norm": 0.2314187819934621, "learning_rate": 6.056163145257187e-06, "loss": 0.0484, "step": 4498 }, { "epoch": 2.3420093701197295, "grad_norm": 0.23842056215195048, "learning_rate": 6.046997900342796e-06, "loss": 0.048, "step": 4499 }, { "epoch": 2.3425299323269133, "grad_norm": 0.22448513602756287, "learning_rate": 6.037838641668459e-06, "loss": 0.0459, "step": 4500 }, { "epoch": 2.3430504945340966, "grad_norm": 0.2236326297612647, "learning_rate": 6.028685372127099e-06, "loss": 0.0469, "step": 4501 }, { "epoch": 2.3435710567412804, "grad_norm": 0.22213192913695828, "learning_rate": 6.019538094609759e-06, "loss": 0.0466, "step": 4502 }, { "epoch": 2.344091618948464, "grad_norm": 0.22847968405344984, "learning_rate": 6.010396812005553e-06, "loss": 0.0468, "step": 4503 }, { "epoch": 2.344612181155648, "grad_norm": 0.23492086518084526, "learning_rate": 6.00126152720174e-06, "loss": 0.047, "step": 4504 }, { "epoch": 2.3451327433628317, "grad_norm": 0.2394105213441764, "learning_rate": 5.992132243083656e-06, "loss": 0.0496, "step": 4505 }, { "epoch": 2.3456533055700155, "grad_norm": 0.24111441487478028, "learning_rate": 5.983008962534761e-06, "loss": 0.0461, "step": 4506 }, { "epoch": 2.3461738677771993, "grad_norm": 0.23107118946181654, "learning_rate": 5.973891688436608e-06, "loss": 0.0476, "step": 4507 }, { "epoch": 2.346694429984383, "grad_norm": 0.23065663806986544, "learning_rate": 5.964780423668867e-06, "loss": 0.0481, "step": 4508 }, { "epoch": 2.347214992191567, "grad_norm": 0.23820356615082777, "learning_rate": 5.9556751711092844e-06, "loss": 0.0472, "step": 4509 }, { "epoch": 2.3477355543987506, "grad_norm": 0.2365896072704537, "learning_rate": 5.94657593363373e-06, "loss": 0.0473, "step": 4510 }, { "epoch": 2.3482561166059344, "grad_norm": 0.23479232602004582, "learning_rate": 5.9374827141161715e-06, "loss": 0.0489, "step": 4511 }, { "epoch": 2.348776678813118, "grad_norm": 0.23324065251358814, "learning_rate": 5.928395515428656e-06, "loss": 0.0469, "step": 4512 }, { "epoch": 2.349297241020302, "grad_norm": 0.2271112729337206, "learning_rate": 5.919314340441362e-06, "loss": 0.0462, "step": 4513 }, { "epoch": 2.3498178032274857, "grad_norm": 0.23075689196359045, "learning_rate": 5.9102391920225315e-06, "loss": 0.0484, "step": 4514 }, { "epoch": 2.3503383654346695, "grad_norm": 0.22572112690840151, "learning_rate": 5.901170073038523e-06, "loss": 0.0466, "step": 4515 }, { "epoch": 2.3508589276418532, "grad_norm": 0.23256450262742248, "learning_rate": 5.89210698635379e-06, "loss": 0.0461, "step": 4516 }, { "epoch": 2.351379489849037, "grad_norm": 0.220207578677318, "learning_rate": 5.883049934830884e-06, "loss": 0.0465, "step": 4517 }, { "epoch": 2.3519000520562208, "grad_norm": 0.22721249545677813, "learning_rate": 5.873998921330426e-06, "loss": 0.0455, "step": 4518 }, { "epoch": 2.3524206142634045, "grad_norm": 0.2632945933248785, "learning_rate": 5.864953948711155e-06, "loss": 0.0481, "step": 4519 }, { "epoch": 2.3529411764705883, "grad_norm": 0.23515126483913215, "learning_rate": 5.855915019829902e-06, "loss": 0.0469, "step": 4520 }, { "epoch": 2.353461738677772, "grad_norm": 0.22931919368137274, "learning_rate": 5.846882137541571e-06, "loss": 0.0474, "step": 4521 }, { "epoch": 2.353982300884956, "grad_norm": 0.24606325790404943, "learning_rate": 5.837855304699175e-06, "loss": 0.0484, "step": 4522 }, { "epoch": 2.3545028630921396, "grad_norm": 0.23172001395098654, "learning_rate": 5.828834524153795e-06, "loss": 0.0475, "step": 4523 }, { "epoch": 2.3550234252993234, "grad_norm": 0.21936760204415123, "learning_rate": 5.819819798754625e-06, "loss": 0.0458, "step": 4524 }, { "epoch": 2.355543987506507, "grad_norm": 0.2193218125572941, "learning_rate": 5.810811131348929e-06, "loss": 0.0459, "step": 4525 }, { "epoch": 2.356064549713691, "grad_norm": 0.2345064969588428, "learning_rate": 5.801808524782071e-06, "loss": 0.0468, "step": 4526 }, { "epoch": 2.3565851119208747, "grad_norm": 0.24851347312329655, "learning_rate": 5.792811981897484e-06, "loss": 0.0492, "step": 4527 }, { "epoch": 2.3571056741280585, "grad_norm": 0.23050774006520552, "learning_rate": 5.783821505536696e-06, "loss": 0.0467, "step": 4528 }, { "epoch": 2.357626236335242, "grad_norm": 0.23084948710195863, "learning_rate": 5.7748370985393295e-06, "loss": 0.0454, "step": 4529 }, { "epoch": 2.3581467985424256, "grad_norm": 0.24791040020859287, "learning_rate": 5.765858763743062e-06, "loss": 0.0476, "step": 4530 }, { "epoch": 2.3586673607496094, "grad_norm": 0.23318412091442703, "learning_rate": 5.756886503983683e-06, "loss": 0.0475, "step": 4531 }, { "epoch": 2.359187922956793, "grad_norm": 0.23289524027494776, "learning_rate": 5.747920322095035e-06, "loss": 0.0464, "step": 4532 }, { "epoch": 2.359708485163977, "grad_norm": 0.23663155315479364, "learning_rate": 5.738960220909068e-06, "loss": 0.0465, "step": 4533 }, { "epoch": 2.3602290473711607, "grad_norm": 0.2330747611757856, "learning_rate": 5.730006203255792e-06, "loss": 0.0466, "step": 4534 }, { "epoch": 2.3607496095783445, "grad_norm": 0.23854565465214606, "learning_rate": 5.721058271963311e-06, "loss": 0.0466, "step": 4535 }, { "epoch": 2.3612701717855282, "grad_norm": 0.23694164646216895, "learning_rate": 5.712116429857789e-06, "loss": 0.0473, "step": 4536 }, { "epoch": 2.361790733992712, "grad_norm": 0.23150554800278614, "learning_rate": 5.7031806797634755e-06, "loss": 0.0466, "step": 4537 }, { "epoch": 2.3623112961998958, "grad_norm": 0.23630006270170592, "learning_rate": 5.694251024502709e-06, "loss": 0.0484, "step": 4538 }, { "epoch": 2.3628318584070795, "grad_norm": 0.2338320873775774, "learning_rate": 5.685327466895874e-06, "loss": 0.0471, "step": 4539 }, { "epoch": 2.3633524206142633, "grad_norm": 0.2377908975210259, "learning_rate": 5.6764100097614595e-06, "loss": 0.0488, "step": 4540 }, { "epoch": 2.363872982821447, "grad_norm": 0.23304039641551305, "learning_rate": 5.667498655916001e-06, "loss": 0.0462, "step": 4541 }, { "epoch": 2.364393545028631, "grad_norm": 0.23372925814357076, "learning_rate": 5.6585934081741205e-06, "loss": 0.0471, "step": 4542 }, { "epoch": 2.3649141072358146, "grad_norm": 0.23054147188963464, "learning_rate": 5.649694269348516e-06, "loss": 0.0487, "step": 4543 }, { "epoch": 2.3654346694429984, "grad_norm": 0.2295991682759455, "learning_rate": 5.640801242249952e-06, "loss": 0.0466, "step": 4544 }, { "epoch": 2.365955231650182, "grad_norm": 0.23056386741672127, "learning_rate": 5.631914329687249e-06, "loss": 0.0478, "step": 4545 }, { "epoch": 2.366475793857366, "grad_norm": 0.22736615156094703, "learning_rate": 5.623033534467315e-06, "loss": 0.0459, "step": 4546 }, { "epoch": 2.3669963560645497, "grad_norm": 0.23702181937720163, "learning_rate": 5.614158859395122e-06, "loss": 0.0481, "step": 4547 }, { "epoch": 2.3675169182717335, "grad_norm": 0.2266535316694567, "learning_rate": 5.605290307273694e-06, "loss": 0.046, "step": 4548 }, { "epoch": 2.3680374804789173, "grad_norm": 0.23582655875491848, "learning_rate": 5.596427880904148e-06, "loss": 0.0472, "step": 4549 }, { "epoch": 2.368558042686101, "grad_norm": 0.24143542099238016, "learning_rate": 5.587571583085632e-06, "loss": 0.0467, "step": 4550 }, { "epoch": 2.369078604893285, "grad_norm": 0.2307564717560343, "learning_rate": 5.5787214166153875e-06, "loss": 0.0467, "step": 4551 }, { "epoch": 2.3695991671004686, "grad_norm": 0.24506784373841559, "learning_rate": 5.569877384288708e-06, "loss": 0.0474, "step": 4552 }, { "epoch": 2.3701197293076524, "grad_norm": 0.32435382511813377, "learning_rate": 5.5610394888989585e-06, "loss": 0.0476, "step": 4553 }, { "epoch": 2.370640291514836, "grad_norm": 0.2342486669021339, "learning_rate": 5.5522077332375436e-06, "loss": 0.0474, "step": 4554 }, { "epoch": 2.37116085372202, "grad_norm": 0.21410206999675588, "learning_rate": 5.543382120093946e-06, "loss": 0.0442, "step": 4555 }, { "epoch": 2.3716814159292037, "grad_norm": 0.2338596233871188, "learning_rate": 5.5345626522557175e-06, "loss": 0.0455, "step": 4556 }, { "epoch": 2.3722019781363874, "grad_norm": 0.23916410275167593, "learning_rate": 5.525749332508437e-06, "loss": 0.0469, "step": 4557 }, { "epoch": 2.372722540343571, "grad_norm": 0.23457736048555594, "learning_rate": 5.51694216363578e-06, "loss": 0.0456, "step": 4558 }, { "epoch": 2.373243102550755, "grad_norm": 0.25322127032434727, "learning_rate": 5.5081411484194435e-06, "loss": 0.048, "step": 4559 }, { "epoch": 2.3737636647579388, "grad_norm": 0.24019282093163075, "learning_rate": 5.499346289639206e-06, "loss": 0.0466, "step": 4560 }, { "epoch": 2.3742842269651225, "grad_norm": 0.2264918241309705, "learning_rate": 5.490557590072892e-06, "loss": 0.0454, "step": 4561 }, { "epoch": 2.3748047891723063, "grad_norm": 0.2357276481863, "learning_rate": 5.4817750524963904e-06, "loss": 0.047, "step": 4562 }, { "epoch": 2.37532535137949, "grad_norm": 0.23230594226469076, "learning_rate": 5.472998679683619e-06, "loss": 0.0465, "step": 4563 }, { "epoch": 2.375845913586674, "grad_norm": 0.22533858615611319, "learning_rate": 5.4642284744065715e-06, "loss": 0.0468, "step": 4564 }, { "epoch": 2.376366475793857, "grad_norm": 0.22724858506067902, "learning_rate": 5.455464439435299e-06, "loss": 0.0481, "step": 4565 }, { "epoch": 2.376887038001041, "grad_norm": 0.23606749438431365, "learning_rate": 5.446706577537869e-06, "loss": 0.0466, "step": 4566 }, { "epoch": 2.3774076002082247, "grad_norm": 0.2370377994578295, "learning_rate": 5.437954891480443e-06, "loss": 0.0494, "step": 4567 }, { "epoch": 2.3779281624154085, "grad_norm": 0.2240890350805136, "learning_rate": 5.4292093840271955e-06, "loss": 0.0458, "step": 4568 }, { "epoch": 2.3784487246225923, "grad_norm": 0.2289474671965338, "learning_rate": 5.420470057940372e-06, "loss": 0.0467, "step": 4569 }, { "epoch": 2.378969286829776, "grad_norm": 0.2298982841879016, "learning_rate": 5.411736915980253e-06, "loss": 0.049, "step": 4570 }, { "epoch": 2.37948984903696, "grad_norm": 0.2268863902053528, "learning_rate": 5.403009960905178e-06, "loss": 0.0461, "step": 4571 }, { "epoch": 2.3800104112441436, "grad_norm": 0.2436459860162038, "learning_rate": 5.394289195471527e-06, "loss": 0.0487, "step": 4572 }, { "epoch": 2.3805309734513274, "grad_norm": 0.2632454708511737, "learning_rate": 5.385574622433714e-06, "loss": 0.0469, "step": 4573 }, { "epoch": 2.381051535658511, "grad_norm": 0.2301709888777297, "learning_rate": 5.3768662445442204e-06, "loss": 0.0475, "step": 4574 }, { "epoch": 2.381572097865695, "grad_norm": 0.22767022475445184, "learning_rate": 5.368164064553541e-06, "loss": 0.0473, "step": 4575 }, { "epoch": 2.3820926600728787, "grad_norm": 0.22628560424820487, "learning_rate": 5.359468085210237e-06, "loss": 0.045, "step": 4576 }, { "epoch": 2.3826132222800624, "grad_norm": 0.23342340422854826, "learning_rate": 5.3507783092609095e-06, "loss": 0.0486, "step": 4577 }, { "epoch": 2.383133784487246, "grad_norm": 0.2245786257913053, "learning_rate": 5.342094739450179e-06, "loss": 0.0464, "step": 4578 }, { "epoch": 2.38365434669443, "grad_norm": 0.2302258019263189, "learning_rate": 5.333417378520733e-06, "loss": 0.0474, "step": 4579 }, { "epoch": 2.3841749089016138, "grad_norm": 0.2268167472789496, "learning_rate": 5.324746229213282e-06, "loss": 0.0455, "step": 4580 }, { "epoch": 2.3846954711087975, "grad_norm": 0.235065648886395, "learning_rate": 5.316081294266587e-06, "loss": 0.0476, "step": 4581 }, { "epoch": 2.3852160333159813, "grad_norm": 0.23538123207470538, "learning_rate": 5.30742257641742e-06, "loss": 0.0481, "step": 4582 }, { "epoch": 2.385736595523165, "grad_norm": 0.23818619605234898, "learning_rate": 5.298770078400628e-06, "loss": 0.0468, "step": 4583 }, { "epoch": 2.386257157730349, "grad_norm": 0.2320901545098414, "learning_rate": 5.290123802949051e-06, "loss": 0.0451, "step": 4584 }, { "epoch": 2.3867777199375326, "grad_norm": 0.23469735771174544, "learning_rate": 5.2814837527935975e-06, "loss": 0.047, "step": 4585 }, { "epoch": 2.3872982821447164, "grad_norm": 0.23767900812756343, "learning_rate": 5.272849930663204e-06, "loss": 0.0489, "step": 4586 }, { "epoch": 2.3878188443519, "grad_norm": 0.23370065886236566, "learning_rate": 5.264222339284816e-06, "loss": 0.0461, "step": 4587 }, { "epoch": 2.388339406559084, "grad_norm": 0.23170361003181508, "learning_rate": 5.255600981383438e-06, "loss": 0.0476, "step": 4588 }, { "epoch": 2.3888599687662677, "grad_norm": 0.23695214156506306, "learning_rate": 5.246985859682094e-06, "loss": 0.0481, "step": 4589 }, { "epoch": 2.3893805309734515, "grad_norm": 0.22780483346006977, "learning_rate": 5.238376976901849e-06, "loss": 0.0471, "step": 4590 }, { "epoch": 2.3899010931806353, "grad_norm": 0.23083988277274833, "learning_rate": 5.229774335761775e-06, "loss": 0.0459, "step": 4591 }, { "epoch": 2.390421655387819, "grad_norm": 0.22986426177716607, "learning_rate": 5.221177938978999e-06, "loss": 0.0446, "step": 4592 }, { "epoch": 2.3909422175950024, "grad_norm": 0.23342974438965086, "learning_rate": 5.2125877892686496e-06, "loss": 0.0472, "step": 4593 }, { "epoch": 2.391462779802186, "grad_norm": 0.23195959114003814, "learning_rate": 5.204003889343906e-06, "loss": 0.0461, "step": 4594 }, { "epoch": 2.39198334200937, "grad_norm": 0.2216035633570572, "learning_rate": 5.195426241915963e-06, "loss": 0.045, "step": 4595 }, { "epoch": 2.3925039042165537, "grad_norm": 0.2132065666076437, "learning_rate": 5.186854849694034e-06, "loss": 0.0445, "step": 4596 }, { "epoch": 2.3930244664237375, "grad_norm": 0.23129878723213484, "learning_rate": 5.178289715385368e-06, "loss": 0.0469, "step": 4597 }, { "epoch": 2.3935450286309212, "grad_norm": 0.2388488336500406, "learning_rate": 5.169730841695233e-06, "loss": 0.047, "step": 4598 }, { "epoch": 2.394065590838105, "grad_norm": 0.22989393580863302, "learning_rate": 5.161178231326927e-06, "loss": 0.046, "step": 4599 }, { "epoch": 2.3945861530452888, "grad_norm": 0.24387299564354334, "learning_rate": 5.152631886981746e-06, "loss": 0.0478, "step": 4600 }, { "epoch": 2.3951067152524725, "grad_norm": 0.2184126453644583, "learning_rate": 5.144091811359039e-06, "loss": 0.0437, "step": 4601 }, { "epoch": 2.3956272774596563, "grad_norm": 0.23254797708689787, "learning_rate": 5.1355580071561465e-06, "loss": 0.0455, "step": 4602 }, { "epoch": 2.39614783966684, "grad_norm": 0.24362953858535674, "learning_rate": 5.127030477068445e-06, "loss": 0.0462, "step": 4603 }, { "epoch": 2.396668401874024, "grad_norm": 0.23599914959344417, "learning_rate": 5.118509223789336e-06, "loss": 0.0467, "step": 4604 }, { "epoch": 2.3971889640812076, "grad_norm": 0.22043950196000547, "learning_rate": 5.109994250010211e-06, "loss": 0.0439, "step": 4605 }, { "epoch": 2.3977095262883914, "grad_norm": 0.23070921769768574, "learning_rate": 5.101485558420505e-06, "loss": 0.0467, "step": 4606 }, { "epoch": 2.398230088495575, "grad_norm": 0.24197283057250074, "learning_rate": 5.092983151707656e-06, "loss": 0.0481, "step": 4607 }, { "epoch": 2.398750650702759, "grad_norm": 0.22154942089827273, "learning_rate": 5.0844870325571255e-06, "loss": 0.0448, "step": 4608 }, { "epoch": 2.3992712129099427, "grad_norm": 0.24014632166672964, "learning_rate": 5.0759972036523715e-06, "loss": 0.0479, "step": 4609 }, { "epoch": 2.3997917751171265, "grad_norm": 0.22536943970116005, "learning_rate": 5.067513667674892e-06, "loss": 0.0457, "step": 4610 }, { "epoch": 2.4003123373243103, "grad_norm": 0.2250611130271378, "learning_rate": 5.059036427304167e-06, "loss": 0.0447, "step": 4611 }, { "epoch": 2.400832899531494, "grad_norm": 0.2408162294102492, "learning_rate": 5.050565485217712e-06, "loss": 0.0477, "step": 4612 }, { "epoch": 2.401353461738678, "grad_norm": 0.24625623591280466, "learning_rate": 5.04210084409105e-06, "loss": 0.0484, "step": 4613 }, { "epoch": 2.4018740239458616, "grad_norm": 0.2481755981581284, "learning_rate": 5.033642506597694e-06, "loss": 0.048, "step": 4614 }, { "epoch": 2.4023945861530454, "grad_norm": 0.23987593424996412, "learning_rate": 5.025190475409189e-06, "loss": 0.0468, "step": 4615 }, { "epoch": 2.402915148360229, "grad_norm": 0.23074712648149856, "learning_rate": 5.01674475319508e-06, "loss": 0.0473, "step": 4616 }, { "epoch": 2.403435710567413, "grad_norm": 0.22920234691129646, "learning_rate": 5.008305342622923e-06, "loss": 0.0463, "step": 4617 }, { "epoch": 2.4039562727745967, "grad_norm": 0.24431092684106212, "learning_rate": 4.99987224635827e-06, "loss": 0.0486, "step": 4618 }, { "epoch": 2.4044768349817804, "grad_norm": 0.24071342837263257, "learning_rate": 4.99144546706469e-06, "loss": 0.0482, "step": 4619 }, { "epoch": 2.404997397188964, "grad_norm": 0.22487442588291262, "learning_rate": 4.9830250074037435e-06, "loss": 0.0463, "step": 4620 }, { "epoch": 2.405517959396148, "grad_norm": 0.23677496930387812, "learning_rate": 4.97461087003501e-06, "loss": 0.0472, "step": 4621 }, { "epoch": 2.4060385216033318, "grad_norm": 0.2339609503507166, "learning_rate": 4.966203057616073e-06, "loss": 0.0463, "step": 4622 }, { "epoch": 2.4065590838105155, "grad_norm": 0.22648734693696182, "learning_rate": 4.9578015728024955e-06, "loss": 0.0452, "step": 4623 }, { "epoch": 2.4070796460176993, "grad_norm": 0.23140249756070344, "learning_rate": 4.949406418247865e-06, "loss": 0.046, "step": 4624 }, { "epoch": 2.407600208224883, "grad_norm": 0.22814922822292663, "learning_rate": 4.941017596603761e-06, "loss": 0.0468, "step": 4625 }, { "epoch": 2.408120770432067, "grad_norm": 0.24189988700195564, "learning_rate": 4.9326351105197704e-06, "loss": 0.0486, "step": 4626 }, { "epoch": 2.4086413326392506, "grad_norm": 0.2363782391459408, "learning_rate": 4.92425896264346e-06, "loss": 0.0477, "step": 4627 }, { "epoch": 2.4091618948464344, "grad_norm": 0.23153910984513215, "learning_rate": 4.915889155620423e-06, "loss": 0.0483, "step": 4628 }, { "epoch": 2.4096824570536177, "grad_norm": 0.23130021099353434, "learning_rate": 4.907525692094217e-06, "loss": 0.0474, "step": 4629 }, { "epoch": 2.4102030192608015, "grad_norm": 0.25831436035426264, "learning_rate": 4.89916857470642e-06, "loss": 0.0506, "step": 4630 }, { "epoch": 2.4107235814679853, "grad_norm": 0.2321273528575011, "learning_rate": 4.890817806096606e-06, "loss": 0.049, "step": 4631 }, { "epoch": 2.411244143675169, "grad_norm": 0.23553678312517642, "learning_rate": 4.882473388902323e-06, "loss": 0.0473, "step": 4632 }, { "epoch": 2.411764705882353, "grad_norm": 0.23546720700809365, "learning_rate": 4.874135325759133e-06, "loss": 0.0479, "step": 4633 }, { "epoch": 2.4122852680895366, "grad_norm": 0.24113411652212086, "learning_rate": 4.8658036193005855e-06, "loss": 0.047, "step": 4634 }, { "epoch": 2.4128058302967204, "grad_norm": 0.2324247954219874, "learning_rate": 4.857478272158217e-06, "loss": 0.0481, "step": 4635 }, { "epoch": 2.413326392503904, "grad_norm": 0.23429448270749925, "learning_rate": 4.849159286961571e-06, "loss": 0.0474, "step": 4636 }, { "epoch": 2.413846954711088, "grad_norm": 0.23663034771539931, "learning_rate": 4.840846666338161e-06, "loss": 0.047, "step": 4637 }, { "epoch": 2.4143675169182717, "grad_norm": 0.2270730528314231, "learning_rate": 4.8325404129134915e-06, "loss": 0.0461, "step": 4638 }, { "epoch": 2.4148880791254554, "grad_norm": 0.22378060847821732, "learning_rate": 4.824240529311075e-06, "loss": 0.0466, "step": 4639 }, { "epoch": 2.415408641332639, "grad_norm": 0.225573866033014, "learning_rate": 4.815947018152397e-06, "loss": 0.0464, "step": 4640 }, { "epoch": 2.415929203539823, "grad_norm": 0.23586252079175646, "learning_rate": 4.807659882056945e-06, "loss": 0.0466, "step": 4641 }, { "epoch": 2.4164497657470068, "grad_norm": 0.22986325100510877, "learning_rate": 4.799379123642162e-06, "loss": 0.0446, "step": 4642 }, { "epoch": 2.4169703279541905, "grad_norm": 0.23705062124271192, "learning_rate": 4.791104745523509e-06, "loss": 0.0479, "step": 4643 }, { "epoch": 2.4174908901613743, "grad_norm": 0.23732585219476587, "learning_rate": 4.78283675031442e-06, "loss": 0.047, "step": 4644 }, { "epoch": 2.418011452368558, "grad_norm": 0.23331514934441308, "learning_rate": 4.7745751406263165e-06, "loss": 0.0459, "step": 4645 }, { "epoch": 2.418532014575742, "grad_norm": 0.23781982825270778, "learning_rate": 4.766319919068593e-06, "loss": 0.0476, "step": 4646 }, { "epoch": 2.4190525767829256, "grad_norm": 0.24131605817905036, "learning_rate": 4.758071088248628e-06, "loss": 0.0483, "step": 4647 }, { "epoch": 2.4195731389901094, "grad_norm": 0.2351381427413349, "learning_rate": 4.7498286507717895e-06, "loss": 0.047, "step": 4648 }, { "epoch": 2.420093701197293, "grad_norm": 0.23702310918812222, "learning_rate": 4.741592609241427e-06, "loss": 0.0471, "step": 4649 }, { "epoch": 2.420614263404477, "grad_norm": 0.2214982247537357, "learning_rate": 4.733362966258869e-06, "loss": 0.0439, "step": 4650 }, { "epoch": 2.4211348256116607, "grad_norm": 0.23411796590355238, "learning_rate": 4.725139724423411e-06, "loss": 0.0461, "step": 4651 }, { "epoch": 2.4216553878188445, "grad_norm": 0.23041508871761626, "learning_rate": 4.716922886332334e-06, "loss": 0.0449, "step": 4652 }, { "epoch": 2.4221759500260283, "grad_norm": 0.2402349549586395, "learning_rate": 4.7087124545809045e-06, "loss": 0.046, "step": 4653 }, { "epoch": 2.422696512233212, "grad_norm": 0.23216773676851069, "learning_rate": 4.700508431762365e-06, "loss": 0.0465, "step": 4654 }, { "epoch": 2.423217074440396, "grad_norm": 0.2348648584242627, "learning_rate": 4.692310820467919e-06, "loss": 0.0478, "step": 4655 }, { "epoch": 2.4237376366475796, "grad_norm": 0.22464964421666625, "learning_rate": 4.684119623286748e-06, "loss": 0.0445, "step": 4656 }, { "epoch": 2.424258198854763, "grad_norm": 0.2246231882200223, "learning_rate": 4.675934842806018e-06, "loss": 0.0469, "step": 4657 }, { "epoch": 2.4247787610619467, "grad_norm": 0.2288877796143045, "learning_rate": 4.667756481610866e-06, "loss": 0.0465, "step": 4658 }, { "epoch": 2.4252993232691304, "grad_norm": 0.22672281731855962, "learning_rate": 4.6595845422844035e-06, "loss": 0.0461, "step": 4659 }, { "epoch": 2.425819885476314, "grad_norm": 0.23471740148435588, "learning_rate": 4.6514190274076996e-06, "loss": 0.0478, "step": 4660 }, { "epoch": 2.426340447683498, "grad_norm": 0.22362959400916294, "learning_rate": 4.643259939559807e-06, "loss": 0.0458, "step": 4661 }, { "epoch": 2.4268610098906818, "grad_norm": 0.2227801341848505, "learning_rate": 4.6351072813177495e-06, "loss": 0.046, "step": 4662 }, { "epoch": 2.4273815720978655, "grad_norm": 0.22031118417632997, "learning_rate": 4.626961055256515e-06, "loss": 0.0439, "step": 4663 }, { "epoch": 2.4279021343050493, "grad_norm": 0.22982141836032516, "learning_rate": 4.618821263949061e-06, "loss": 0.0462, "step": 4664 }, { "epoch": 2.428422696512233, "grad_norm": 0.23150711698760643, "learning_rate": 4.610687909966304e-06, "loss": 0.0456, "step": 4665 }, { "epoch": 2.428943258719417, "grad_norm": 0.23045867200414713, "learning_rate": 4.602560995877142e-06, "loss": 0.0457, "step": 4666 }, { "epoch": 2.4294638209266006, "grad_norm": 0.23591693108732284, "learning_rate": 4.594440524248431e-06, "loss": 0.0442, "step": 4667 }, { "epoch": 2.4299843831337844, "grad_norm": 0.22810723603892324, "learning_rate": 4.586326497645002e-06, "loss": 0.046, "step": 4668 }, { "epoch": 2.430504945340968, "grad_norm": 0.2399613227524779, "learning_rate": 4.578218918629632e-06, "loss": 0.0476, "step": 4669 }, { "epoch": 2.431025507548152, "grad_norm": 0.2464507638072641, "learning_rate": 4.570117789763073e-06, "loss": 0.0473, "step": 4670 }, { "epoch": 2.4315460697553357, "grad_norm": 0.23144494070866378, "learning_rate": 4.562023113604041e-06, "loss": 0.0461, "step": 4671 }, { "epoch": 2.4320666319625195, "grad_norm": 0.2325135941641269, "learning_rate": 4.553934892709216e-06, "loss": 0.0477, "step": 4672 }, { "epoch": 2.4325871941697033, "grad_norm": 0.22934582369828932, "learning_rate": 4.545853129633226e-06, "loss": 0.0468, "step": 4673 }, { "epoch": 2.433107756376887, "grad_norm": 0.23396484474390528, "learning_rate": 4.5377778269286766e-06, "loss": 0.0468, "step": 4674 }, { "epoch": 2.433628318584071, "grad_norm": 0.23530318910470144, "learning_rate": 4.529708987146114e-06, "loss": 0.0462, "step": 4675 }, { "epoch": 2.4341488807912546, "grad_norm": 0.23344486790079444, "learning_rate": 4.521646612834057e-06, "loss": 0.0464, "step": 4676 }, { "epoch": 2.4346694429984383, "grad_norm": 0.23330292282907472, "learning_rate": 4.513590706538989e-06, "loss": 0.0463, "step": 4677 }, { "epoch": 2.435190005205622, "grad_norm": 0.22437040772272743, "learning_rate": 4.5055412708053245e-06, "loss": 0.0477, "step": 4678 }, { "epoch": 2.435710567412806, "grad_norm": 0.2265909891553176, "learning_rate": 4.497498308175454e-06, "loss": 0.0461, "step": 4679 }, { "epoch": 2.4362311296199897, "grad_norm": 0.23121513283352574, "learning_rate": 4.489461821189725e-06, "loss": 0.0464, "step": 4680 }, { "epoch": 2.4367516918271734, "grad_norm": 0.2528579823174406, "learning_rate": 4.481431812386436e-06, "loss": 0.0495, "step": 4681 }, { "epoch": 2.437272254034357, "grad_norm": 0.23289575073759872, "learning_rate": 4.473408284301825e-06, "loss": 0.0459, "step": 4682 }, { "epoch": 2.437792816241541, "grad_norm": 0.23015881999073623, "learning_rate": 4.465391239470112e-06, "loss": 0.0453, "step": 4683 }, { "epoch": 2.4383133784487248, "grad_norm": 0.22624808663286283, "learning_rate": 4.457380680423434e-06, "loss": 0.0453, "step": 4684 }, { "epoch": 2.4388339406559085, "grad_norm": 0.2296916685174956, "learning_rate": 4.4493766096919136e-06, "loss": 0.0464, "step": 4685 }, { "epoch": 2.4393545028630923, "grad_norm": 0.21738846706883652, "learning_rate": 4.441379029803605e-06, "loss": 0.0425, "step": 4686 }, { "epoch": 2.439875065070276, "grad_norm": 0.2423083039591658, "learning_rate": 4.433387943284511e-06, "loss": 0.0468, "step": 4687 }, { "epoch": 2.44039562727746, "grad_norm": 0.2341956963383124, "learning_rate": 4.425403352658591e-06, "loss": 0.0447, "step": 4688 }, { "epoch": 2.4409161894846436, "grad_norm": 0.24101151048749272, "learning_rate": 4.417425260447753e-06, "loss": 0.0475, "step": 4689 }, { "epoch": 2.4414367516918274, "grad_norm": 0.24063994225163093, "learning_rate": 4.4094536691718505e-06, "loss": 0.0465, "step": 4690 }, { "epoch": 2.441957313899011, "grad_norm": 0.23071169180768644, "learning_rate": 4.401488581348679e-06, "loss": 0.0471, "step": 4691 }, { "epoch": 2.442477876106195, "grad_norm": 0.2355448253070943, "learning_rate": 4.393529999493989e-06, "loss": 0.0471, "step": 4692 }, { "epoch": 2.4429984383133783, "grad_norm": 0.23632196722116713, "learning_rate": 4.385577926121464e-06, "loss": 0.0474, "step": 4693 }, { "epoch": 2.443519000520562, "grad_norm": 0.23534929674479932, "learning_rate": 4.3776323637427395e-06, "loss": 0.0452, "step": 4694 }, { "epoch": 2.444039562727746, "grad_norm": 0.24304563929485506, "learning_rate": 4.369693314867407e-06, "loss": 0.047, "step": 4695 }, { "epoch": 2.4445601249349296, "grad_norm": 0.23188182601997595, "learning_rate": 4.3617607820029686e-06, "loss": 0.045, "step": 4696 }, { "epoch": 2.4450806871421134, "grad_norm": 0.23588489710907637, "learning_rate": 4.353834767654896e-06, "loss": 0.046, "step": 4697 }, { "epoch": 2.445601249349297, "grad_norm": 0.2423879338466431, "learning_rate": 4.345915274326595e-06, "loss": 0.0486, "step": 4698 }, { "epoch": 2.446121811556481, "grad_norm": 0.23190042633660438, "learning_rate": 4.338002304519406e-06, "loss": 0.0476, "step": 4699 }, { "epoch": 2.4466423737636647, "grad_norm": 0.2371416200568294, "learning_rate": 4.330095860732625e-06, "loss": 0.0475, "step": 4700 }, { "epoch": 2.4471629359708484, "grad_norm": 0.233119464425362, "learning_rate": 4.322195945463464e-06, "loss": 0.0453, "step": 4701 }, { "epoch": 2.447683498178032, "grad_norm": 0.23660527240553358, "learning_rate": 4.314302561207079e-06, "loss": 0.047, "step": 4702 }, { "epoch": 2.448204060385216, "grad_norm": 0.2225359791140698, "learning_rate": 4.306415710456577e-06, "loss": 0.0451, "step": 4703 }, { "epoch": 2.4487246225923998, "grad_norm": 0.22352359233272487, "learning_rate": 4.2985353957029876e-06, "loss": 0.0438, "step": 4704 }, { "epoch": 2.4492451847995835, "grad_norm": 0.22659412226171605, "learning_rate": 4.29066161943529e-06, "loss": 0.0453, "step": 4705 }, { "epoch": 2.4497657470067673, "grad_norm": 0.23202678432385068, "learning_rate": 4.282794384140379e-06, "loss": 0.0456, "step": 4706 }, { "epoch": 2.450286309213951, "grad_norm": 0.23436332924353653, "learning_rate": 4.274933692303093e-06, "loss": 0.0467, "step": 4707 }, { "epoch": 2.450806871421135, "grad_norm": 0.23568145553682646, "learning_rate": 4.267079546406211e-06, "loss": 0.0457, "step": 4708 }, { "epoch": 2.4513274336283186, "grad_norm": 0.23688538546245036, "learning_rate": 4.259231948930442e-06, "loss": 0.0465, "step": 4709 }, { "epoch": 2.4518479958355024, "grad_norm": 0.22584612440650176, "learning_rate": 4.251390902354413e-06, "loss": 0.045, "step": 4710 }, { "epoch": 2.452368558042686, "grad_norm": 0.22787648763996438, "learning_rate": 4.243556409154692e-06, "loss": 0.0447, "step": 4711 }, { "epoch": 2.45288912024987, "grad_norm": 0.2308201824461422, "learning_rate": 4.235728471805775e-06, "loss": 0.0452, "step": 4712 }, { "epoch": 2.4534096824570537, "grad_norm": 0.23117909367110293, "learning_rate": 4.227907092780095e-06, "loss": 0.0455, "step": 4713 }, { "epoch": 2.4539302446642375, "grad_norm": 0.22631631172759079, "learning_rate": 4.22009227454801e-06, "loss": 0.0463, "step": 4714 }, { "epoch": 2.4544508068714213, "grad_norm": 0.2344996227202645, "learning_rate": 4.212284019577792e-06, "loss": 0.0469, "step": 4715 }, { "epoch": 2.454971369078605, "grad_norm": 0.22826293638212433, "learning_rate": 4.204482330335657e-06, "loss": 0.046, "step": 4716 }, { "epoch": 2.455491931285789, "grad_norm": 0.24057079364699233, "learning_rate": 4.196687209285744e-06, "loss": 0.0475, "step": 4717 }, { "epoch": 2.4560124934929726, "grad_norm": 0.23451233031179078, "learning_rate": 4.188898658890117e-06, "loss": 0.0462, "step": 4718 }, { "epoch": 2.4565330557001563, "grad_norm": 0.23186747086362713, "learning_rate": 4.1811166816087595e-06, "loss": 0.0464, "step": 4719 }, { "epoch": 2.45705361790734, "grad_norm": 0.22706734857716063, "learning_rate": 4.173341279899576e-06, "loss": 0.0442, "step": 4720 }, { "epoch": 2.4575741801145234, "grad_norm": 0.23846594551848097, "learning_rate": 4.165572456218405e-06, "loss": 0.0492, "step": 4721 }, { "epoch": 2.458094742321707, "grad_norm": 0.22761321129821427, "learning_rate": 4.157810213019003e-06, "loss": 0.0467, "step": 4722 }, { "epoch": 2.458615304528891, "grad_norm": 0.21913183574661646, "learning_rate": 4.150054552753055e-06, "loss": 0.0449, "step": 4723 }, { "epoch": 2.4591358667360748, "grad_norm": 0.2258958824847465, "learning_rate": 4.1423054778701455e-06, "loss": 0.0463, "step": 4724 }, { "epoch": 2.4596564289432585, "grad_norm": 0.2535114295876158, "learning_rate": 4.1345629908178e-06, "loss": 0.0446, "step": 4725 }, { "epoch": 2.4601769911504423, "grad_norm": 0.2188190545994277, "learning_rate": 4.126827094041455e-06, "loss": 0.045, "step": 4726 }, { "epoch": 2.460697553357626, "grad_norm": 0.22451949242641625, "learning_rate": 4.119097789984472e-06, "loss": 0.0474, "step": 4727 }, { "epoch": 2.46121811556481, "grad_norm": 0.23179831019249533, "learning_rate": 4.111375081088123e-06, "loss": 0.0456, "step": 4728 }, { "epoch": 2.4617386777719936, "grad_norm": 0.23401390682247894, "learning_rate": 4.103658969791588e-06, "loss": 0.047, "step": 4729 }, { "epoch": 2.4622592399791774, "grad_norm": 0.29525830006055254, "learning_rate": 4.095949458531984e-06, "loss": 0.045, "step": 4730 }, { "epoch": 2.462779802186361, "grad_norm": 0.2259193500356729, "learning_rate": 4.088246549744331e-06, "loss": 0.0438, "step": 4731 }, { "epoch": 2.463300364393545, "grad_norm": 0.22553481026955677, "learning_rate": 4.0805502458615725e-06, "loss": 0.0441, "step": 4732 }, { "epoch": 2.4638209266007287, "grad_norm": 0.2533883156131254, "learning_rate": 4.07286054931455e-06, "loss": 0.0473, "step": 4733 }, { "epoch": 2.4643414888079125, "grad_norm": 0.23645871037063934, "learning_rate": 4.065177462532027e-06, "loss": 0.0453, "step": 4734 }, { "epoch": 2.4648620510150963, "grad_norm": 0.2390469842852221, "learning_rate": 4.057500987940688e-06, "loss": 0.0467, "step": 4735 }, { "epoch": 2.46538261322228, "grad_norm": 0.24040218439850058, "learning_rate": 4.04983112796512e-06, "loss": 0.0464, "step": 4736 }, { "epoch": 2.465903175429464, "grad_norm": 0.2323783034515998, "learning_rate": 4.04216788502782e-06, "loss": 0.0466, "step": 4737 }, { "epoch": 2.4664237376366476, "grad_norm": 0.2421242614235067, "learning_rate": 4.03451126154919e-06, "loss": 0.0457, "step": 4738 }, { "epoch": 2.4669442998438313, "grad_norm": 0.2241947463119316, "learning_rate": 4.0268612599475534e-06, "loss": 0.0442, "step": 4739 }, { "epoch": 2.467464862051015, "grad_norm": 0.23390631221316124, "learning_rate": 4.019217882639137e-06, "loss": 0.0459, "step": 4740 }, { "epoch": 2.467985424258199, "grad_norm": 0.23604571830215357, "learning_rate": 4.011581132038078e-06, "loss": 0.046, "step": 4741 }, { "epoch": 2.4685059864653827, "grad_norm": 0.23679414599656148, "learning_rate": 4.003951010556412e-06, "loss": 0.0477, "step": 4742 }, { "epoch": 2.4690265486725664, "grad_norm": 0.22466780147244902, "learning_rate": 3.996327520604087e-06, "loss": 0.0453, "step": 4743 }, { "epoch": 2.46954711087975, "grad_norm": 0.23183844848666738, "learning_rate": 3.9887106645889574e-06, "loss": 0.0458, "step": 4744 }, { "epoch": 2.470067673086934, "grad_norm": 0.2258352174710472, "learning_rate": 3.981100444916788e-06, "loss": 0.0446, "step": 4745 }, { "epoch": 2.4705882352941178, "grad_norm": 0.23149767246195083, "learning_rate": 3.973496863991233e-06, "loss": 0.0447, "step": 4746 }, { "epoch": 2.4711087975013015, "grad_norm": 0.23290847900975084, "learning_rate": 3.965899924213851e-06, "loss": 0.0446, "step": 4747 }, { "epoch": 2.4716293597084853, "grad_norm": 0.2249778446324839, "learning_rate": 3.958309627984116e-06, "loss": 0.045, "step": 4748 }, { "epoch": 2.472149921915669, "grad_norm": 0.21861880310195234, "learning_rate": 3.950725977699396e-06, "loss": 0.0443, "step": 4749 }, { "epoch": 2.472670484122853, "grad_norm": 0.23380723096798692, "learning_rate": 3.943148975754968e-06, "loss": 0.0456, "step": 4750 }, { "epoch": 2.4731910463300366, "grad_norm": 0.25812439912963125, "learning_rate": 3.9355786245439896e-06, "loss": 0.0492, "step": 4751 }, { "epoch": 2.4737116085372204, "grad_norm": 0.2346925416518511, "learning_rate": 3.928014926457532e-06, "loss": 0.0469, "step": 4752 }, { "epoch": 2.474232170744404, "grad_norm": 0.2324299060972178, "learning_rate": 3.920457883884571e-06, "loss": 0.0469, "step": 4753 }, { "epoch": 2.474752732951588, "grad_norm": 0.22340130582451653, "learning_rate": 3.9129074992119705e-06, "loss": 0.0452, "step": 4754 }, { "epoch": 2.4752732951587717, "grad_norm": 0.23185760003678735, "learning_rate": 3.905363774824492e-06, "loss": 0.0461, "step": 4755 }, { "epoch": 2.4757938573659555, "grad_norm": 0.22369373177962135, "learning_rate": 3.897826713104786e-06, "loss": 0.0429, "step": 4756 }, { "epoch": 2.476314419573139, "grad_norm": 0.22152244736340082, "learning_rate": 3.8902963164334145e-06, "loss": 0.0454, "step": 4757 }, { "epoch": 2.4768349817803226, "grad_norm": 0.2288794662126069, "learning_rate": 3.882772587188827e-06, "loss": 0.044, "step": 4758 }, { "epoch": 2.4773555439875063, "grad_norm": 0.22786850218271076, "learning_rate": 3.875255527747376e-06, "loss": 0.0455, "step": 4759 }, { "epoch": 2.47787610619469, "grad_norm": 0.24127063199154952, "learning_rate": 3.867745140483281e-06, "loss": 0.0494, "step": 4760 }, { "epoch": 2.478396668401874, "grad_norm": 0.22651816469440886, "learning_rate": 3.860241427768682e-06, "loss": 0.0474, "step": 4761 }, { "epoch": 2.4789172306090577, "grad_norm": 0.23292917754764078, "learning_rate": 3.852744391973601e-06, "loss": 0.0453, "step": 4762 }, { "epoch": 2.4794377928162414, "grad_norm": 0.23481848280563425, "learning_rate": 3.845254035465951e-06, "loss": 0.0462, "step": 4763 }, { "epoch": 2.479958355023425, "grad_norm": 0.23409115715668288, "learning_rate": 3.8377703606115425e-06, "loss": 0.0456, "step": 4764 }, { "epoch": 2.480478917230609, "grad_norm": 0.22688970649832368, "learning_rate": 3.830293369774049e-06, "loss": 0.046, "step": 4765 }, { "epoch": 2.4809994794377928, "grad_norm": 0.24911131431251246, "learning_rate": 3.822823065315062e-06, "loss": 0.0477, "step": 4766 }, { "epoch": 2.4815200416449765, "grad_norm": 0.23613842700910898, "learning_rate": 3.815359449594053e-06, "loss": 0.0464, "step": 4767 }, { "epoch": 2.4820406038521603, "grad_norm": 0.22630425395246193, "learning_rate": 3.8079025249683766e-06, "loss": 0.0454, "step": 4768 }, { "epoch": 2.482561166059344, "grad_norm": 0.23834407534931806, "learning_rate": 3.800452293793283e-06, "loss": 0.0474, "step": 4769 }, { "epoch": 2.483081728266528, "grad_norm": 0.22299016955003192, "learning_rate": 3.7930087584218924e-06, "loss": 0.0447, "step": 4770 }, { "epoch": 2.4836022904737116, "grad_norm": 0.22180738154042506, "learning_rate": 3.785571921205225e-06, "loss": 0.0444, "step": 4771 }, { "epoch": 2.4841228526808954, "grad_norm": 0.23606413057780049, "learning_rate": 3.7781417844921785e-06, "loss": 0.0462, "step": 4772 }, { "epoch": 2.484643414888079, "grad_norm": 0.23226564075490944, "learning_rate": 3.770718350629543e-06, "loss": 0.0471, "step": 4773 }, { "epoch": 2.485163977095263, "grad_norm": 0.222173152299585, "learning_rate": 3.7633016219619786e-06, "loss": 0.044, "step": 4774 }, { "epoch": 2.4856845393024467, "grad_norm": 0.22670387438422338, "learning_rate": 3.755891600832026e-06, "loss": 0.0451, "step": 4775 }, { "epoch": 2.4862051015096305, "grad_norm": 0.24457041056832837, "learning_rate": 3.748488289580124e-06, "loss": 0.0471, "step": 4776 }, { "epoch": 2.4867256637168142, "grad_norm": 0.2292671621830906, "learning_rate": 3.7410916905445763e-06, "loss": 0.0461, "step": 4777 }, { "epoch": 2.487246225923998, "grad_norm": 0.2319982917968305, "learning_rate": 3.7337018060615847e-06, "loss": 0.0455, "step": 4778 }, { "epoch": 2.487766788131182, "grad_norm": 0.22435841846525945, "learning_rate": 3.7263186384652064e-06, "loss": 0.043, "step": 4779 }, { "epoch": 2.4882873503383656, "grad_norm": 0.22949123049534018, "learning_rate": 3.7189421900873905e-06, "loss": 0.045, "step": 4780 }, { "epoch": 2.4888079125455493, "grad_norm": 0.23210355715087153, "learning_rate": 3.71157246325797e-06, "loss": 0.0441, "step": 4781 }, { "epoch": 2.489328474752733, "grad_norm": 0.2260444385170093, "learning_rate": 3.7042094603046473e-06, "loss": 0.0436, "step": 4782 }, { "epoch": 2.489849036959917, "grad_norm": 0.2293597694356391, "learning_rate": 3.696853183552998e-06, "loss": 0.0453, "step": 4783 }, { "epoch": 2.4903695991671007, "grad_norm": 0.22728917524183118, "learning_rate": 3.6895036353264716e-06, "loss": 0.0473, "step": 4784 }, { "epoch": 2.490890161374284, "grad_norm": 0.2258597144539425, "learning_rate": 3.6821608179464006e-06, "loss": 0.0447, "step": 4785 }, { "epoch": 2.4914107235814678, "grad_norm": 0.23529432877833512, "learning_rate": 3.674824733731991e-06, "loss": 0.0454, "step": 4786 }, { "epoch": 2.4919312857886515, "grad_norm": 0.2276587426156354, "learning_rate": 3.6674953850003245e-06, "loss": 0.0457, "step": 4787 }, { "epoch": 2.4924518479958353, "grad_norm": 0.23595370854468917, "learning_rate": 3.6601727740663395e-06, "loss": 0.0466, "step": 4788 }, { "epoch": 2.492972410203019, "grad_norm": 0.23081012893994207, "learning_rate": 3.652856903242863e-06, "loss": 0.0437, "step": 4789 }, { "epoch": 2.493492972410203, "grad_norm": 0.23017085621336414, "learning_rate": 3.6455477748405853e-06, "loss": 0.0445, "step": 4790 }, { "epoch": 2.4940135346173866, "grad_norm": 0.23217068455139228, "learning_rate": 3.638245391168077e-06, "loss": 0.0476, "step": 4791 }, { "epoch": 2.4945340968245704, "grad_norm": 0.2224420779312252, "learning_rate": 3.630949754531765e-06, "loss": 0.0455, "step": 4792 }, { "epoch": 2.495054659031754, "grad_norm": 0.2332204491629984, "learning_rate": 3.623660867235945e-06, "loss": 0.0455, "step": 4793 }, { "epoch": 2.495575221238938, "grad_norm": 0.2302258850640075, "learning_rate": 3.6163787315827894e-06, "loss": 0.0461, "step": 4794 }, { "epoch": 2.4960957834461217, "grad_norm": 0.23170180876721022, "learning_rate": 3.609103349872342e-06, "loss": 0.0462, "step": 4795 }, { "epoch": 2.4966163456533055, "grad_norm": 0.22548808881382684, "learning_rate": 3.6018347244025085e-06, "loss": 0.0456, "step": 4796 }, { "epoch": 2.4971369078604893, "grad_norm": 0.22318432696419793, "learning_rate": 3.5945728574690474e-06, "loss": 0.0451, "step": 4797 }, { "epoch": 2.497657470067673, "grad_norm": 0.2328804121095348, "learning_rate": 3.5873177513655985e-06, "loss": 0.0465, "step": 4798 }, { "epoch": 2.498178032274857, "grad_norm": 0.2362488676580557, "learning_rate": 3.5800694083836685e-06, "loss": 0.0458, "step": 4799 }, { "epoch": 2.4986985944820406, "grad_norm": 0.22505683785956507, "learning_rate": 3.5728278308126195e-06, "loss": 0.0445, "step": 4800 }, { "epoch": 2.4992191566892243, "grad_norm": 0.23832222888676316, "learning_rate": 3.5655930209396783e-06, "loss": 0.0451, "step": 4801 }, { "epoch": 2.499739718896408, "grad_norm": 0.23018716395069688, "learning_rate": 3.5583649810499246e-06, "loss": 0.0442, "step": 4802 }, { "epoch": 2.500260281103592, "grad_norm": 0.22556770408037974, "learning_rate": 3.551143713426319e-06, "loss": 0.0438, "step": 4803 }, { "epoch": 2.5007808433107757, "grad_norm": 0.22278746289847356, "learning_rate": 3.543929220349673e-06, "loss": 0.0452, "step": 4804 }, { "epoch": 2.5013014055179594, "grad_norm": 0.22592179865196593, "learning_rate": 3.536721504098664e-06, "loss": 0.0454, "step": 4805 }, { "epoch": 2.501821967725143, "grad_norm": 0.2191833418264602, "learning_rate": 3.529520566949812e-06, "loss": 0.0429, "step": 4806 }, { "epoch": 2.502342529932327, "grad_norm": 0.23567814923196181, "learning_rate": 3.522326411177515e-06, "loss": 0.0455, "step": 4807 }, { "epoch": 2.5028630921395107, "grad_norm": 0.2329494968156347, "learning_rate": 3.5151390390540245e-06, "loss": 0.0456, "step": 4808 }, { "epoch": 2.5033836543466945, "grad_norm": 0.23201928458314705, "learning_rate": 3.5079584528494497e-06, "loss": 0.0453, "step": 4809 }, { "epoch": 2.5039042165538783, "grad_norm": 0.22257219879809637, "learning_rate": 3.5007846548317487e-06, "loss": 0.0432, "step": 4810 }, { "epoch": 2.504424778761062, "grad_norm": 0.2357881279026739, "learning_rate": 3.4936176472667337e-06, "loss": 0.0455, "step": 4811 }, { "epoch": 2.504945340968246, "grad_norm": 0.22156754335142384, "learning_rate": 3.486457432418089e-06, "loss": 0.0439, "step": 4812 }, { "epoch": 2.5054659031754296, "grad_norm": 0.222223788188508, "learning_rate": 3.479304012547338e-06, "loss": 0.0437, "step": 4813 }, { "epoch": 2.5059864653826134, "grad_norm": 0.23233184259143652, "learning_rate": 3.4721573899138743e-06, "loss": 0.045, "step": 4814 }, { "epoch": 2.506507027589797, "grad_norm": 0.23772916739161976, "learning_rate": 3.4650175667749223e-06, "loss": 0.0466, "step": 4815 }, { "epoch": 2.507027589796981, "grad_norm": 0.2380129235798541, "learning_rate": 3.457884545385573e-06, "loss": 0.0463, "step": 4816 }, { "epoch": 2.5075481520041647, "grad_norm": 0.2428567210468212, "learning_rate": 3.450758327998768e-06, "loss": 0.0455, "step": 4817 }, { "epoch": 2.5080687142113485, "grad_norm": 0.24116904429043992, "learning_rate": 3.4436389168653023e-06, "loss": 0.0441, "step": 4818 }, { "epoch": 2.5085892764185322, "grad_norm": 0.2311244048249909, "learning_rate": 3.436526314233815e-06, "loss": 0.0459, "step": 4819 }, { "epoch": 2.509109838625716, "grad_norm": 0.22619706111589447, "learning_rate": 3.42942052235079e-06, "loss": 0.0452, "step": 4820 }, { "epoch": 2.5096304008329, "grad_norm": 0.22462384968951704, "learning_rate": 3.4223215434605714e-06, "loss": 0.0437, "step": 4821 }, { "epoch": 2.5101509630400836, "grad_norm": 0.23196960288458288, "learning_rate": 3.4152293798053486e-06, "loss": 0.046, "step": 4822 }, { "epoch": 2.510671525247267, "grad_norm": 0.22816810301577573, "learning_rate": 3.408144033625163e-06, "loss": 0.0449, "step": 4823 }, { "epoch": 2.5111920874544507, "grad_norm": 0.22363210681026655, "learning_rate": 3.401065507157883e-06, "loss": 0.0426, "step": 4824 }, { "epoch": 2.5117126496616344, "grad_norm": 0.22254268395086219, "learning_rate": 3.393993802639245e-06, "loss": 0.0445, "step": 4825 }, { "epoch": 2.512233211868818, "grad_norm": 0.222987753608597, "learning_rate": 3.3869289223028204e-06, "loss": 0.0444, "step": 4826 }, { "epoch": 2.512753774076002, "grad_norm": 0.24197883530585623, "learning_rate": 3.3798708683800305e-06, "loss": 0.0446, "step": 4827 }, { "epoch": 2.5132743362831858, "grad_norm": 0.236727669006638, "learning_rate": 3.372819643100139e-06, "loss": 0.0467, "step": 4828 }, { "epoch": 2.5137948984903695, "grad_norm": 0.24252487797178882, "learning_rate": 3.3657752486902396e-06, "loss": 0.046, "step": 4829 }, { "epoch": 2.5143154606975533, "grad_norm": 0.23031235971733746, "learning_rate": 3.3587376873752853e-06, "loss": 0.0456, "step": 4830 }, { "epoch": 2.514836022904737, "grad_norm": 0.22614796596777614, "learning_rate": 3.351706961378068e-06, "loss": 0.0439, "step": 4831 }, { "epoch": 2.515356585111921, "grad_norm": 0.22456575523764083, "learning_rate": 3.344683072919216e-06, "loss": 0.0451, "step": 4832 }, { "epoch": 2.5158771473191046, "grad_norm": 0.22971764854819687, "learning_rate": 3.337666024217209e-06, "loss": 0.0457, "step": 4833 }, { "epoch": 2.5163977095262884, "grad_norm": 0.23225244680033916, "learning_rate": 3.3306558174883427e-06, "loss": 0.0469, "step": 4834 }, { "epoch": 2.516918271733472, "grad_norm": 0.22408742635883183, "learning_rate": 3.323652454946774e-06, "loss": 0.0453, "step": 4835 }, { "epoch": 2.517438833940656, "grad_norm": 0.22183055756851636, "learning_rate": 3.3166559388044945e-06, "loss": 0.0434, "step": 4836 }, { "epoch": 2.5179593961478397, "grad_norm": 0.2260016674787736, "learning_rate": 3.3096662712713224e-06, "loss": 0.0448, "step": 4837 }, { "epoch": 2.5184799583550235, "grad_norm": 0.2301369168176432, "learning_rate": 3.3026834545549252e-06, "loss": 0.0448, "step": 4838 }, { "epoch": 2.5190005205622072, "grad_norm": 0.21883186300554672, "learning_rate": 3.295707490860797e-06, "loss": 0.0436, "step": 4839 }, { "epoch": 2.519521082769391, "grad_norm": 0.22202859390787064, "learning_rate": 3.288738382392273e-06, "loss": 0.0449, "step": 4840 }, { "epoch": 2.520041644976575, "grad_norm": 0.22761610071343236, "learning_rate": 3.2817761313505226e-06, "loss": 0.0468, "step": 4841 }, { "epoch": 2.5205622071837586, "grad_norm": 0.22523059843548607, "learning_rate": 3.2748207399345534e-06, "loss": 0.0431, "step": 4842 }, { "epoch": 2.5210827693909423, "grad_norm": 0.22572775303764772, "learning_rate": 3.267872210341194e-06, "loss": 0.0445, "step": 4843 }, { "epoch": 2.521603331598126, "grad_norm": 0.23141678410475952, "learning_rate": 3.2609305447651145e-06, "loss": 0.0473, "step": 4844 }, { "epoch": 2.52212389380531, "grad_norm": 0.4886518611240305, "learning_rate": 3.2539957453988244e-06, "loss": 0.0473, "step": 4845 }, { "epoch": 2.5226444560124937, "grad_norm": 0.23911782956874156, "learning_rate": 3.2470678144326442e-06, "loss": 0.0469, "step": 4846 }, { "epoch": 2.523165018219677, "grad_norm": 0.22949597789881793, "learning_rate": 3.2401467540547485e-06, "loss": 0.044, "step": 4847 }, { "epoch": 2.5236855804268608, "grad_norm": 0.22907687990668468, "learning_rate": 3.233232566451119e-06, "loss": 0.0448, "step": 4848 }, { "epoch": 2.5242061426340445, "grad_norm": 0.23226555702427493, "learning_rate": 3.2263252538055816e-06, "loss": 0.0457, "step": 4849 }, { "epoch": 2.5247267048412283, "grad_norm": 0.22719820874022031, "learning_rate": 3.2194248182997904e-06, "loss": 0.0446, "step": 4850 }, { "epoch": 2.525247267048412, "grad_norm": 0.2207201168091615, "learning_rate": 3.2125312621132274e-06, "loss": 0.0455, "step": 4851 }, { "epoch": 2.525767829255596, "grad_norm": 0.21985764504983096, "learning_rate": 3.2056445874231873e-06, "loss": 0.0449, "step": 4852 }, { "epoch": 2.5262883914627796, "grad_norm": 0.22836848944195356, "learning_rate": 3.198764796404807e-06, "loss": 0.0449, "step": 4853 }, { "epoch": 2.5268089536699634, "grad_norm": 0.24033077551127982, "learning_rate": 3.191891891231055e-06, "loss": 0.0457, "step": 4854 }, { "epoch": 2.527329515877147, "grad_norm": 0.2412658133769983, "learning_rate": 3.1850258740726975e-06, "loss": 0.0458, "step": 4855 }, { "epoch": 2.527850078084331, "grad_norm": 0.22859774083041914, "learning_rate": 3.178166747098357e-06, "loss": 0.0436, "step": 4856 }, { "epoch": 2.5283706402915147, "grad_norm": 0.23096275204448605, "learning_rate": 3.171314512474452e-06, "loss": 0.0458, "step": 4857 }, { "epoch": 2.5288912024986985, "grad_norm": 0.22706448675029148, "learning_rate": 3.1644691723652448e-06, "loss": 0.0437, "step": 4858 }, { "epoch": 2.5294117647058822, "grad_norm": 0.2378141750228416, "learning_rate": 3.1576307289328117e-06, "loss": 0.0483, "step": 4859 }, { "epoch": 2.529932326913066, "grad_norm": 0.2289339944308606, "learning_rate": 3.1507991843370526e-06, "loss": 0.0437, "step": 4860 }, { "epoch": 2.53045288912025, "grad_norm": 0.23778111816368155, "learning_rate": 3.1439745407356835e-06, "loss": 0.0471, "step": 4861 }, { "epoch": 2.5309734513274336, "grad_norm": 0.2176967688986415, "learning_rate": 3.1371568002842437e-06, "loss": 0.0434, "step": 4862 }, { "epoch": 2.5314940135346173, "grad_norm": 0.22833640651862924, "learning_rate": 3.1303459651361026e-06, "loss": 0.0459, "step": 4863 }, { "epoch": 2.532014575741801, "grad_norm": 0.23296274792690516, "learning_rate": 3.123542037442426e-06, "loss": 0.0448, "step": 4864 }, { "epoch": 2.532535137948985, "grad_norm": 0.23036570209219906, "learning_rate": 3.1167450193522214e-06, "loss": 0.0456, "step": 4865 }, { "epoch": 2.5330557001561687, "grad_norm": 0.22673874717998874, "learning_rate": 3.1099549130122944e-06, "loss": 0.0444, "step": 4866 }, { "epoch": 2.5335762623633524, "grad_norm": 0.23336621523932544, "learning_rate": 3.10317172056728e-06, "loss": 0.0459, "step": 4867 }, { "epoch": 2.534096824570536, "grad_norm": 0.22381256494910945, "learning_rate": 3.0963954441596277e-06, "loss": 0.0426, "step": 4868 }, { "epoch": 2.53461738677772, "grad_norm": 0.22193048669411228, "learning_rate": 3.0896260859296035e-06, "loss": 0.0428, "step": 4869 }, { "epoch": 2.5351379489849037, "grad_norm": 0.233380501372848, "learning_rate": 3.082863648015277e-06, "loss": 0.0448, "step": 4870 }, { "epoch": 2.5356585111920875, "grad_norm": 0.24772658711823495, "learning_rate": 3.076108132552549e-06, "loss": 0.0464, "step": 4871 }, { "epoch": 2.5361790733992713, "grad_norm": 0.2333756921366926, "learning_rate": 3.0693595416751207e-06, "loss": 0.0449, "step": 4872 }, { "epoch": 2.536699635606455, "grad_norm": 0.22180824102144125, "learning_rate": 3.0626178775145175e-06, "loss": 0.0445, "step": 4873 }, { "epoch": 2.537220197813639, "grad_norm": 0.23966984007720837, "learning_rate": 3.0558831422000695e-06, "loss": 0.0465, "step": 4874 }, { "epoch": 2.5377407600208226, "grad_norm": 0.23454749585264367, "learning_rate": 3.0491553378589084e-06, "loss": 0.0446, "step": 4875 }, { "epoch": 2.5382613222280064, "grad_norm": 0.24426571149502913, "learning_rate": 3.042434466615998e-06, "loss": 0.0478, "step": 4876 }, { "epoch": 2.53878188443519, "grad_norm": 0.2350973499571893, "learning_rate": 3.0357205305940993e-06, "loss": 0.0436, "step": 4877 }, { "epoch": 2.539302446642374, "grad_norm": 0.2326275497622743, "learning_rate": 3.0290135319137908e-06, "loss": 0.0465, "step": 4878 }, { "epoch": 2.5398230088495577, "grad_norm": 0.2281455888156863, "learning_rate": 3.0223134726934472e-06, "loss": 0.043, "step": 4879 }, { "epoch": 2.5403435710567415, "grad_norm": 0.24217000636895653, "learning_rate": 3.015620355049262e-06, "loss": 0.0473, "step": 4880 }, { "epoch": 2.5408641332639252, "grad_norm": 0.23048820939944398, "learning_rate": 3.0089341810952327e-06, "loss": 0.0443, "step": 4881 }, { "epoch": 2.541384695471109, "grad_norm": 0.22378119144167463, "learning_rate": 3.0022549529431704e-06, "loss": 0.0438, "step": 4882 }, { "epoch": 2.541905257678293, "grad_norm": 0.2296752533850872, "learning_rate": 2.995582672702679e-06, "loss": 0.0452, "step": 4883 }, { "epoch": 2.5424258198854766, "grad_norm": 0.2243414666365788, "learning_rate": 2.9889173424811735e-06, "loss": 0.0442, "step": 4884 }, { "epoch": 2.5429463820926603, "grad_norm": 0.2304455679727754, "learning_rate": 2.98225896438388e-06, "loss": 0.0443, "step": 4885 }, { "epoch": 2.543466944299844, "grad_norm": 0.23018490529524177, "learning_rate": 2.9756075405138222e-06, "loss": 0.0454, "step": 4886 }, { "epoch": 2.5439875065070274, "grad_norm": 0.24131348157714574, "learning_rate": 2.9689630729718337e-06, "loss": 0.0454, "step": 4887 }, { "epoch": 2.544508068714211, "grad_norm": 0.21657168953862416, "learning_rate": 2.9623255638565384e-06, "loss": 0.0437, "step": 4888 }, { "epoch": 2.545028630921395, "grad_norm": 0.23781682691026923, "learning_rate": 2.9556950152643757e-06, "loss": 0.0456, "step": 4889 }, { "epoch": 2.5455491931285787, "grad_norm": 0.23395997912589891, "learning_rate": 2.9490714292895822e-06, "loss": 0.0451, "step": 4890 }, { "epoch": 2.5460697553357625, "grad_norm": 0.2439448653848847, "learning_rate": 2.9424548080241978e-06, "loss": 0.0458, "step": 4891 }, { "epoch": 2.5465903175429463, "grad_norm": 0.2184786715073653, "learning_rate": 2.9358451535580534e-06, "loss": 0.043, "step": 4892 }, { "epoch": 2.54711087975013, "grad_norm": 0.22699072515393065, "learning_rate": 2.9292424679787824e-06, "loss": 0.0438, "step": 4893 }, { "epoch": 2.547631441957314, "grad_norm": 0.23484480726234366, "learning_rate": 2.9226467533718244e-06, "loss": 0.0445, "step": 4894 }, { "epoch": 2.5481520041644976, "grad_norm": 0.23093997831111027, "learning_rate": 2.9160580118204104e-06, "loss": 0.0452, "step": 4895 }, { "epoch": 2.5486725663716814, "grad_norm": 0.23405936670463198, "learning_rate": 2.9094762454055847e-06, "loss": 0.0452, "step": 4896 }, { "epoch": 2.549193128578865, "grad_norm": 0.23354436325740416, "learning_rate": 2.902901456206156e-06, "loss": 0.0452, "step": 4897 }, { "epoch": 2.549713690786049, "grad_norm": 0.25244543484114845, "learning_rate": 2.89633364629876e-06, "loss": 0.0492, "step": 4898 }, { "epoch": 2.5502342529932327, "grad_norm": 0.22617895203553662, "learning_rate": 2.889772817757813e-06, "loss": 0.0448, "step": 4899 }, { "epoch": 2.5507548152004165, "grad_norm": 0.23586461110043583, "learning_rate": 2.8832189726555383e-06, "loss": 0.0461, "step": 4900 }, { "epoch": 2.5512753774076002, "grad_norm": 0.22350522590122324, "learning_rate": 2.8766721130619315e-06, "loss": 0.0464, "step": 4901 }, { "epoch": 2.551795939614784, "grad_norm": 0.23067382345830922, "learning_rate": 2.8701322410448095e-06, "loss": 0.0446, "step": 4902 }, { "epoch": 2.552316501821968, "grad_norm": 0.23541409906186989, "learning_rate": 2.8635993586697553e-06, "loss": 0.0455, "step": 4903 }, { "epoch": 2.5528370640291516, "grad_norm": 0.2196421849316751, "learning_rate": 2.8570734680001627e-06, "loss": 0.0417, "step": 4904 }, { "epoch": 2.5533576262363353, "grad_norm": 0.21999629966823, "learning_rate": 2.850554571097211e-06, "loss": 0.0435, "step": 4905 }, { "epoch": 2.553878188443519, "grad_norm": 0.22111791678119153, "learning_rate": 2.844042670019878e-06, "loss": 0.0446, "step": 4906 }, { "epoch": 2.554398750650703, "grad_norm": 0.23774001294637379, "learning_rate": 2.837537766824913e-06, "loss": 0.0455, "step": 4907 }, { "epoch": 2.5549193128578866, "grad_norm": 0.22855614328754512, "learning_rate": 2.8310398635668755e-06, "loss": 0.0449, "step": 4908 }, { "epoch": 2.5554398750650704, "grad_norm": 0.22786759432794976, "learning_rate": 2.824548962298107e-06, "loss": 0.0436, "step": 4909 }, { "epoch": 2.555960437272254, "grad_norm": 0.2217342917078937, "learning_rate": 2.8180650650687287e-06, "loss": 0.0432, "step": 4910 }, { "epoch": 2.5564809994794375, "grad_norm": 0.2337668784988823, "learning_rate": 2.811588173926666e-06, "loss": 0.0463, "step": 4911 }, { "epoch": 2.5570015616866213, "grad_norm": 0.23907810436310875, "learning_rate": 2.8051182909176133e-06, "loss": 0.0467, "step": 4912 }, { "epoch": 2.557522123893805, "grad_norm": 0.23815342311596505, "learning_rate": 2.7986554180850665e-06, "loss": 0.0464, "step": 4913 }, { "epoch": 2.558042686100989, "grad_norm": 0.2310077217504325, "learning_rate": 2.7921995574702986e-06, "loss": 0.0449, "step": 4914 }, { "epoch": 2.5585632483081726, "grad_norm": 0.22853140276683548, "learning_rate": 2.7857507111123755e-06, "loss": 0.0456, "step": 4915 }, { "epoch": 2.5590838105153564, "grad_norm": 0.24059012312722752, "learning_rate": 2.779308881048137e-06, "loss": 0.0431, "step": 4916 }, { "epoch": 2.55960437272254, "grad_norm": 0.24488720395307423, "learning_rate": 2.7728740693122147e-06, "loss": 0.0471, "step": 4917 }, { "epoch": 2.560124934929724, "grad_norm": 0.22806652160138638, "learning_rate": 2.7664462779370293e-06, "loss": 0.0449, "step": 4918 }, { "epoch": 2.5606454971369077, "grad_norm": 0.23288075306841058, "learning_rate": 2.7600255089527626e-06, "loss": 0.0447, "step": 4919 }, { "epoch": 2.5611660593440915, "grad_norm": 0.23782264448640814, "learning_rate": 2.7536117643874067e-06, "loss": 0.0473, "step": 4920 }, { "epoch": 2.5616866215512752, "grad_norm": 0.23345197126927117, "learning_rate": 2.747205046266707e-06, "loss": 0.0452, "step": 4921 }, { "epoch": 2.562207183758459, "grad_norm": 0.22295930818431847, "learning_rate": 2.7408053566142124e-06, "loss": 0.0453, "step": 4922 }, { "epoch": 2.562727745965643, "grad_norm": 0.2400982174167174, "learning_rate": 2.734412697451236e-06, "loss": 0.0448, "step": 4923 }, { "epoch": 2.5632483081728266, "grad_norm": 0.2226346510144299, "learning_rate": 2.7280270707968874e-06, "loss": 0.0413, "step": 4924 }, { "epoch": 2.5637688703800103, "grad_norm": 0.23435826768260656, "learning_rate": 2.721648478668032e-06, "loss": 0.044, "step": 4925 }, { "epoch": 2.564289432587194, "grad_norm": 0.22124720780327345, "learning_rate": 2.715276923079335e-06, "loss": 0.0441, "step": 4926 }, { "epoch": 2.564809994794378, "grad_norm": 0.23111451493845334, "learning_rate": 2.708912406043229e-06, "loss": 0.0443, "step": 4927 }, { "epoch": 2.5653305570015617, "grad_norm": 0.22812981752484784, "learning_rate": 2.70255492956992e-06, "loss": 0.0437, "step": 4928 }, { "epoch": 2.5658511192087454, "grad_norm": 0.2242696338157624, "learning_rate": 2.6962044956674035e-06, "loss": 0.044, "step": 4929 }, { "epoch": 2.566371681415929, "grad_norm": 0.2526958000740198, "learning_rate": 2.689861106341432e-06, "loss": 0.0468, "step": 4930 }, { "epoch": 2.566892243623113, "grad_norm": 0.2275462053374598, "learning_rate": 2.6835247635955463e-06, "loss": 0.0447, "step": 4931 }, { "epoch": 2.5674128058302967, "grad_norm": 0.2237994819214662, "learning_rate": 2.6771954694310597e-06, "loss": 0.0438, "step": 4932 }, { "epoch": 2.5679333680374805, "grad_norm": 0.22406574776804847, "learning_rate": 2.670873225847062e-06, "loss": 0.0435, "step": 4933 }, { "epoch": 2.5684539302446643, "grad_norm": 0.2270917489269596, "learning_rate": 2.664558034840403e-06, "loss": 0.0447, "step": 4934 }, { "epoch": 2.568974492451848, "grad_norm": 0.23089342107757108, "learning_rate": 2.658249898405718e-06, "loss": 0.0456, "step": 4935 }, { "epoch": 2.569495054659032, "grad_norm": 0.2313393425399634, "learning_rate": 2.6519488185354157e-06, "loss": 0.0436, "step": 4936 }, { "epoch": 2.5700156168662156, "grad_norm": 0.2221957268696646, "learning_rate": 2.6456547972196625e-06, "loss": 0.0443, "step": 4937 }, { "epoch": 2.5705361790733994, "grad_norm": 0.23170888298200445, "learning_rate": 2.6393678364464074e-06, "loss": 0.0452, "step": 4938 }, { "epoch": 2.571056741280583, "grad_norm": 0.2308739427020387, "learning_rate": 2.6330879382013617e-06, "loss": 0.0466, "step": 4939 }, { "epoch": 2.571577303487767, "grad_norm": 0.23012613337882634, "learning_rate": 2.6268151044680113e-06, "loss": 0.0443, "step": 4940 }, { "epoch": 2.5720978656949507, "grad_norm": 0.23042253316392428, "learning_rate": 2.620549337227607e-06, "loss": 0.0443, "step": 4941 }, { "epoch": 2.5726184279021345, "grad_norm": 0.22649967085410116, "learning_rate": 2.6142906384591798e-06, "loss": 0.0443, "step": 4942 }, { "epoch": 2.5731389901093182, "grad_norm": 0.22980513686165474, "learning_rate": 2.6080390101395043e-06, "loss": 0.0443, "step": 4943 }, { "epoch": 2.573659552316502, "grad_norm": 0.22269196931932628, "learning_rate": 2.6017944542431393e-06, "loss": 0.043, "step": 4944 }, { "epoch": 2.574180114523686, "grad_norm": 0.2220316442992769, "learning_rate": 2.5955569727424163e-06, "loss": 0.043, "step": 4945 }, { "epoch": 2.5747006767308696, "grad_norm": 0.2382111826632995, "learning_rate": 2.589326567607406e-06, "loss": 0.045, "step": 4946 }, { "epoch": 2.5752212389380533, "grad_norm": 0.2300585724875798, "learning_rate": 2.5831032408059758e-06, "loss": 0.0431, "step": 4947 }, { "epoch": 2.575741801145237, "grad_norm": 0.2279139534194501, "learning_rate": 2.576886994303729e-06, "loss": 0.0461, "step": 4948 }, { "epoch": 2.576262363352421, "grad_norm": 0.23023216030690563, "learning_rate": 2.5706778300640527e-06, "loss": 0.0454, "step": 4949 }, { "epoch": 2.5767829255596046, "grad_norm": 0.22354558169408728, "learning_rate": 2.564475750048087e-06, "loss": 0.0452, "step": 4950 }, { "epoch": 2.577303487766788, "grad_norm": 0.23125790378504177, "learning_rate": 2.5582807562147455e-06, "loss": 0.0448, "step": 4951 }, { "epoch": 2.5778240499739717, "grad_norm": 0.22611942246092737, "learning_rate": 2.552092850520682e-06, "loss": 0.0445, "step": 4952 }, { "epoch": 2.5783446121811555, "grad_norm": 0.2241937478852757, "learning_rate": 2.545912034920331e-06, "loss": 0.0434, "step": 4953 }, { "epoch": 2.5788651743883393, "grad_norm": 0.228033785778967, "learning_rate": 2.5397383113658883e-06, "loss": 0.0461, "step": 4954 }, { "epoch": 2.579385736595523, "grad_norm": 0.2248949405146001, "learning_rate": 2.533571681807295e-06, "loss": 0.0429, "step": 4955 }, { "epoch": 2.579906298802707, "grad_norm": 0.231228519351519, "learning_rate": 2.527412148192265e-06, "loss": 0.0463, "step": 4956 }, { "epoch": 2.5804268610098906, "grad_norm": 0.2171525044347602, "learning_rate": 2.5212597124662564e-06, "loss": 0.0439, "step": 4957 }, { "epoch": 2.5809474232170744, "grad_norm": 0.23091260307400774, "learning_rate": 2.5151143765725027e-06, "loss": 0.046, "step": 4958 }, { "epoch": 2.581467985424258, "grad_norm": 0.21624018139865478, "learning_rate": 2.5089761424519853e-06, "loss": 0.0422, "step": 4959 }, { "epoch": 2.581988547631442, "grad_norm": 0.224287761302505, "learning_rate": 2.5028450120434505e-06, "loss": 0.0438, "step": 4960 }, { "epoch": 2.5825091098386257, "grad_norm": 0.2339375258097004, "learning_rate": 2.4967209872833822e-06, "loss": 0.0449, "step": 4961 }, { "epoch": 2.5830296720458095, "grad_norm": 0.2317454115663667, "learning_rate": 2.4906040701060367e-06, "loss": 0.0453, "step": 4962 }, { "epoch": 2.5835502342529932, "grad_norm": 0.224028657841797, "learning_rate": 2.484494262443429e-06, "loss": 0.0427, "step": 4963 }, { "epoch": 2.584070796460177, "grad_norm": 0.22363155592376474, "learning_rate": 2.4783915662253103e-06, "loss": 0.0432, "step": 4964 }, { "epoch": 2.584591358667361, "grad_norm": 0.24028921492138963, "learning_rate": 2.472295983379205e-06, "loss": 0.0463, "step": 4965 }, { "epoch": 2.5851119208745446, "grad_norm": 0.21789704784809844, "learning_rate": 2.466207515830374e-06, "loss": 0.043, "step": 4966 }, { "epoch": 2.5856324830817283, "grad_norm": 0.23331471651872346, "learning_rate": 2.4601261655018385e-06, "loss": 0.045, "step": 4967 }, { "epoch": 2.586153045288912, "grad_norm": 0.23970513589566356, "learning_rate": 2.4540519343143774e-06, "loss": 0.0456, "step": 4968 }, { "epoch": 2.586673607496096, "grad_norm": 0.22417695780216113, "learning_rate": 2.447984824186514e-06, "loss": 0.043, "step": 4969 }, { "epoch": 2.5871941697032796, "grad_norm": 0.22276549554035466, "learning_rate": 2.441924837034529e-06, "loss": 0.0423, "step": 4970 }, { "epoch": 2.5877147319104634, "grad_norm": 0.22293938392333162, "learning_rate": 2.4358719747724356e-06, "loss": 0.0426, "step": 4971 }, { "epoch": 2.588235294117647, "grad_norm": 0.23230080511957873, "learning_rate": 2.429826239312022e-06, "loss": 0.0465, "step": 4972 }, { "epoch": 2.588755856324831, "grad_norm": 0.24276681968930813, "learning_rate": 2.4237876325628017e-06, "loss": 0.0445, "step": 4973 }, { "epoch": 2.5892764185320147, "grad_norm": 0.22442049970967431, "learning_rate": 2.417756156432055e-06, "loss": 0.0432, "step": 4974 }, { "epoch": 2.589796980739198, "grad_norm": 0.23419001522856595, "learning_rate": 2.411731812824808e-06, "loss": 0.0456, "step": 4975 }, { "epoch": 2.590317542946382, "grad_norm": 0.22466863339121676, "learning_rate": 2.4057146036438135e-06, "loss": 0.0439, "step": 4976 }, { "epoch": 2.5908381051535656, "grad_norm": 0.22312298115872314, "learning_rate": 2.3997045307895954e-06, "loss": 0.0432, "step": 4977 }, { "epoch": 2.5913586673607494, "grad_norm": 0.23722999672753922, "learning_rate": 2.3937015961604165e-06, "loss": 0.0461, "step": 4978 }, { "epoch": 2.591879229567933, "grad_norm": 0.22912082388275234, "learning_rate": 2.3877058016522816e-06, "loss": 0.0447, "step": 4979 }, { "epoch": 2.592399791775117, "grad_norm": 0.2355711003414976, "learning_rate": 2.381717149158935e-06, "loss": 0.0458, "step": 4980 }, { "epoch": 2.5929203539823007, "grad_norm": 0.2281375391556885, "learning_rate": 2.3757356405718854e-06, "loss": 0.0441, "step": 4981 }, { "epoch": 2.5934409161894845, "grad_norm": 0.23163425813558036, "learning_rate": 2.369761277780355e-06, "loss": 0.0445, "step": 4982 }, { "epoch": 2.5939614783966682, "grad_norm": 0.24549021694669057, "learning_rate": 2.3637940626713346e-06, "loss": 0.0471, "step": 4983 }, { "epoch": 2.594482040603852, "grad_norm": 0.2406789780817914, "learning_rate": 2.357833997129552e-06, "loss": 0.0463, "step": 4984 }, { "epoch": 2.595002602811036, "grad_norm": 0.23803338315683714, "learning_rate": 2.3518810830374634e-06, "loss": 0.0448, "step": 4985 }, { "epoch": 2.5955231650182196, "grad_norm": 0.2315546330265174, "learning_rate": 2.3459353222752835e-06, "loss": 0.0471, "step": 4986 }, { "epoch": 2.5960437272254033, "grad_norm": 0.21931566525318558, "learning_rate": 2.3399967167209576e-06, "loss": 0.0425, "step": 4987 }, { "epoch": 2.596564289432587, "grad_norm": 0.22188942682547205, "learning_rate": 2.3340652682501767e-06, "loss": 0.0432, "step": 4988 }, { "epoch": 2.597084851639771, "grad_norm": 0.2292900755538919, "learning_rate": 2.328140978736365e-06, "loss": 0.0432, "step": 4989 }, { "epoch": 2.5976054138469546, "grad_norm": 0.23064657071914493, "learning_rate": 2.3222238500506923e-06, "loss": 0.0449, "step": 4990 }, { "epoch": 2.5981259760541384, "grad_norm": 0.22546541475421025, "learning_rate": 2.3163138840620546e-06, "loss": 0.0439, "step": 4991 }, { "epoch": 2.598646538261322, "grad_norm": 0.23350588400038483, "learning_rate": 2.310411082637101e-06, "loss": 0.0465, "step": 4992 }, { "epoch": 2.599167100468506, "grad_norm": 0.2314847500853126, "learning_rate": 2.3045154476402154e-06, "loss": 0.0453, "step": 4993 }, { "epoch": 2.5996876626756897, "grad_norm": 0.23217992293328432, "learning_rate": 2.2986269809335022e-06, "loss": 0.0446, "step": 4994 }, { "epoch": 2.6002082248828735, "grad_norm": 0.21965356479125087, "learning_rate": 2.2927456843768206e-06, "loss": 0.0426, "step": 4995 }, { "epoch": 2.6007287870900573, "grad_norm": 0.2350569114494813, "learning_rate": 2.286871559827758e-06, "loss": 0.0451, "step": 4996 }, { "epoch": 2.601249349297241, "grad_norm": 0.24155775370205634, "learning_rate": 2.2810046091416374e-06, "loss": 0.0479, "step": 4997 }, { "epoch": 2.601769911504425, "grad_norm": 0.21928794475027916, "learning_rate": 2.2751448341715083e-06, "loss": 0.0453, "step": 4998 }, { "epoch": 2.6022904737116086, "grad_norm": 0.2478180725379701, "learning_rate": 2.269292236768167e-06, "loss": 0.0443, "step": 4999 }, { "epoch": 2.6028110359187924, "grad_norm": 0.22140285555828385, "learning_rate": 2.2634468187801296e-06, "loss": 0.0439, "step": 5000 }, { "epoch": 2.603331598125976, "grad_norm": 0.22451869562094484, "learning_rate": 2.257608582053655e-06, "loss": 0.0446, "step": 5001 }, { "epoch": 2.60385216033316, "grad_norm": 0.23546031458995, "learning_rate": 2.251777528432736e-06, "loss": 0.0451, "step": 5002 }, { "epoch": 2.6043727225403437, "grad_norm": 0.2324359389521522, "learning_rate": 2.2459536597590786e-06, "loss": 0.0452, "step": 5003 }, { "epoch": 2.6048932847475275, "grad_norm": 0.2206269507203779, "learning_rate": 2.240136977872137e-06, "loss": 0.0432, "step": 5004 }, { "epoch": 2.6054138469547112, "grad_norm": 0.22420136480404548, "learning_rate": 2.2343274846090918e-06, "loss": 0.0438, "step": 5005 }, { "epoch": 2.605934409161895, "grad_norm": 0.22114323880099848, "learning_rate": 2.228525181804855e-06, "loss": 0.0447, "step": 5006 }, { "epoch": 2.6064549713690788, "grad_norm": 0.22862454975827587, "learning_rate": 2.2227300712920534e-06, "loss": 0.0433, "step": 5007 }, { "epoch": 2.6069755335762625, "grad_norm": 0.22833262874322383, "learning_rate": 2.2169421549010636e-06, "loss": 0.0444, "step": 5008 }, { "epoch": 2.6074960957834463, "grad_norm": 0.23904644665049268, "learning_rate": 2.2111614344599683e-06, "loss": 0.0459, "step": 5009 }, { "epoch": 2.60801665799063, "grad_norm": 0.2305455365922837, "learning_rate": 2.205387911794593e-06, "loss": 0.0453, "step": 5010 }, { "epoch": 2.608537220197814, "grad_norm": 0.2328552560660912, "learning_rate": 2.19962158872849e-06, "loss": 0.0452, "step": 5011 }, { "epoch": 2.6090577824049976, "grad_norm": 0.22388814260008266, "learning_rate": 2.1938624670829218e-06, "loss": 0.045, "step": 5012 }, { "epoch": 2.6095783446121814, "grad_norm": 0.22661115206828292, "learning_rate": 2.1881105486768945e-06, "loss": 0.0447, "step": 5013 }, { "epoch": 2.610098906819365, "grad_norm": 0.21667312091492352, "learning_rate": 2.18236583532713e-06, "loss": 0.0415, "step": 5014 }, { "epoch": 2.6106194690265485, "grad_norm": 0.23591261253651902, "learning_rate": 2.1766283288480793e-06, "loss": 0.0442, "step": 5015 }, { "epoch": 2.6111400312337323, "grad_norm": 0.2345912526719469, "learning_rate": 2.1708980310519045e-06, "loss": 0.0437, "step": 5016 }, { "epoch": 2.611660593440916, "grad_norm": 0.2268571102942271, "learning_rate": 2.165174943748513e-06, "loss": 0.044, "step": 5017 }, { "epoch": 2.6121811556481, "grad_norm": 0.22718616461448765, "learning_rate": 2.159459068745512e-06, "loss": 0.0449, "step": 5018 }, { "epoch": 2.6127017178552836, "grad_norm": 0.22275521228810305, "learning_rate": 2.1537504078482427e-06, "loss": 0.0413, "step": 5019 }, { "epoch": 2.6132222800624674, "grad_norm": 0.23325969759410764, "learning_rate": 2.1480489628597723e-06, "loss": 0.0457, "step": 5020 }, { "epoch": 2.613742842269651, "grad_norm": 0.23677448755403346, "learning_rate": 2.142354735580873e-06, "loss": 0.0458, "step": 5021 }, { "epoch": 2.614263404476835, "grad_norm": 0.2231689376626884, "learning_rate": 2.1366677278100487e-06, "loss": 0.0445, "step": 5022 }, { "epoch": 2.6147839666840187, "grad_norm": 0.22687556800748557, "learning_rate": 2.1309879413435292e-06, "loss": 0.0447, "step": 5023 }, { "epoch": 2.6153045288912025, "grad_norm": 0.23231196286407194, "learning_rate": 2.125315377975251e-06, "loss": 0.0447, "step": 5024 }, { "epoch": 2.6158250910983862, "grad_norm": 0.23049253549431217, "learning_rate": 2.1196500394968678e-06, "loss": 0.0455, "step": 5025 }, { "epoch": 2.61634565330557, "grad_norm": 0.22425972462104576, "learning_rate": 2.113991927697767e-06, "loss": 0.0449, "step": 5026 }, { "epoch": 2.616866215512754, "grad_norm": 0.22468948485303378, "learning_rate": 2.1083410443650365e-06, "loss": 0.0444, "step": 5027 }, { "epoch": 2.6173867777199376, "grad_norm": 0.2384716964753025, "learning_rate": 2.102697391283487e-06, "loss": 0.0461, "step": 5028 }, { "epoch": 2.6179073399271213, "grad_norm": 0.2316329860082873, "learning_rate": 2.0970609702356563e-06, "loss": 0.045, "step": 5029 }, { "epoch": 2.618427902134305, "grad_norm": 0.22706235586772133, "learning_rate": 2.0914317830017764e-06, "loss": 0.0442, "step": 5030 }, { "epoch": 2.618948464341489, "grad_norm": 0.2283217603622717, "learning_rate": 2.085809831359814e-06, "loss": 0.0434, "step": 5031 }, { "epoch": 2.6194690265486726, "grad_norm": 0.23412543514217077, "learning_rate": 2.08019511708544e-06, "loss": 0.044, "step": 5032 }, { "epoch": 2.6199895887558564, "grad_norm": 0.23322446403477373, "learning_rate": 2.0745876419520442e-06, "loss": 0.0454, "step": 5033 }, { "epoch": 2.62051015096304, "grad_norm": 0.2225537808805055, "learning_rate": 2.0689874077307325e-06, "loss": 0.0441, "step": 5034 }, { "epoch": 2.621030713170224, "grad_norm": 0.22326275306638776, "learning_rate": 2.0633944161903144e-06, "loss": 0.0427, "step": 5035 }, { "epoch": 2.6215512753774077, "grad_norm": 0.2242939602375942, "learning_rate": 2.0578086690973135e-06, "loss": 0.045, "step": 5036 }, { "epoch": 2.6220718375845915, "grad_norm": 0.22728481687598673, "learning_rate": 2.052230168215971e-06, "loss": 0.0456, "step": 5037 }, { "epoch": 2.6225923997917753, "grad_norm": 0.22262615082530046, "learning_rate": 2.0466589153082416e-06, "loss": 0.0433, "step": 5038 }, { "epoch": 2.6231129619989586, "grad_norm": 0.22560826041711385, "learning_rate": 2.041094912133784e-06, "loss": 0.0446, "step": 5039 }, { "epoch": 2.6236335242061424, "grad_norm": 0.23114517964831685, "learning_rate": 2.035538160449968e-06, "loss": 0.0466, "step": 5040 }, { "epoch": 2.624154086413326, "grad_norm": 0.2420652950928089, "learning_rate": 2.0299886620118722e-06, "loss": 0.0458, "step": 5041 }, { "epoch": 2.62467464862051, "grad_norm": 0.22783597835324096, "learning_rate": 2.02444641857229e-06, "loss": 0.0446, "step": 5042 }, { "epoch": 2.6251952108276937, "grad_norm": 0.22160692042897737, "learning_rate": 2.018911431881723e-06, "loss": 0.0446, "step": 5043 }, { "epoch": 2.6257157730348775, "grad_norm": 0.2322363544425751, "learning_rate": 2.0133837036883734e-06, "loss": 0.0451, "step": 5044 }, { "epoch": 2.6262363352420612, "grad_norm": 0.22850250345514178, "learning_rate": 2.007863235738153e-06, "loss": 0.0452, "step": 5045 }, { "epoch": 2.626756897449245, "grad_norm": 0.22005963804167653, "learning_rate": 2.0023500297746828e-06, "loss": 0.0439, "step": 5046 }, { "epoch": 2.627277459656429, "grad_norm": 0.2215482537319163, "learning_rate": 1.9968440875392902e-06, "loss": 0.0429, "step": 5047 }, { "epoch": 2.6277980218636126, "grad_norm": 0.22063378797132666, "learning_rate": 1.9913454107710173e-06, "loss": 0.0425, "step": 5048 }, { "epoch": 2.6283185840707963, "grad_norm": 0.2300782059424934, "learning_rate": 1.9858540012065886e-06, "loss": 0.0443, "step": 5049 }, { "epoch": 2.62883914627798, "grad_norm": 0.21421042163884607, "learning_rate": 1.9803698605804497e-06, "loss": 0.0419, "step": 5050 }, { "epoch": 2.629359708485164, "grad_norm": 0.2273905924198931, "learning_rate": 1.974892990624752e-06, "loss": 0.0448, "step": 5051 }, { "epoch": 2.6298802706923476, "grad_norm": 0.23000101131311979, "learning_rate": 1.96942339306935e-06, "loss": 0.0449, "step": 5052 }, { "epoch": 2.6304008328995314, "grad_norm": 0.2306219483746907, "learning_rate": 1.96396106964179e-06, "loss": 0.0434, "step": 5053 }, { "epoch": 2.630921395106715, "grad_norm": 0.2254745000498489, "learning_rate": 1.9585060220673247e-06, "loss": 0.0441, "step": 5054 }, { "epoch": 2.631441957313899, "grad_norm": 0.23129142941993588, "learning_rate": 1.953058252068915e-06, "loss": 0.0448, "step": 5055 }, { "epoch": 2.6319625195210827, "grad_norm": 0.22233175432556077, "learning_rate": 1.9476177613672237e-06, "loss": 0.0438, "step": 5056 }, { "epoch": 2.6324830817282665, "grad_norm": 0.22444580299782396, "learning_rate": 1.942184551680612e-06, "loss": 0.0451, "step": 5057 }, { "epoch": 2.6330036439354503, "grad_norm": 0.22412724223445896, "learning_rate": 1.9367586247251323e-06, "loss": 0.044, "step": 5058 }, { "epoch": 2.633524206142634, "grad_norm": 0.24134427074612705, "learning_rate": 1.93133998221455e-06, "loss": 0.0462, "step": 5059 }, { "epoch": 2.634044768349818, "grad_norm": 0.23306486452552985, "learning_rate": 1.9259286258603263e-06, "loss": 0.0448, "step": 5060 }, { "epoch": 2.6345653305570016, "grad_norm": 0.2357629035963088, "learning_rate": 1.9205245573716197e-06, "loss": 0.0464, "step": 5061 }, { "epoch": 2.6350858927641854, "grad_norm": 0.22558172778330046, "learning_rate": 1.9151277784552864e-06, "loss": 0.0442, "step": 5062 }, { "epoch": 2.635606454971369, "grad_norm": 0.23769331834375482, "learning_rate": 1.9097382908158713e-06, "loss": 0.0448, "step": 5063 }, { "epoch": 2.636127017178553, "grad_norm": 0.22352280580432798, "learning_rate": 1.9043560961556323e-06, "loss": 0.0448, "step": 5064 }, { "epoch": 2.6366475793857367, "grad_norm": 0.24155333261345938, "learning_rate": 1.898981196174518e-06, "loss": 0.0456, "step": 5065 }, { "epoch": 2.6371681415929205, "grad_norm": 0.2153187541406319, "learning_rate": 1.8936135925701732e-06, "loss": 0.0428, "step": 5066 }, { "epoch": 2.6376887038001042, "grad_norm": 0.23310151041795749, "learning_rate": 1.8882532870379331e-06, "loss": 0.0447, "step": 5067 }, { "epoch": 2.638209266007288, "grad_norm": 0.23186210725957584, "learning_rate": 1.8829002812708302e-06, "loss": 0.0456, "step": 5068 }, { "epoch": 2.6387298282144718, "grad_norm": 0.24386297462518527, "learning_rate": 1.8775545769595975e-06, "loss": 0.0454, "step": 5069 }, { "epoch": 2.6392503904216555, "grad_norm": 0.21918598958276325, "learning_rate": 1.8722161757926597e-06, "loss": 0.0433, "step": 5070 }, { "epoch": 2.6397709526288393, "grad_norm": 0.22662770705374358, "learning_rate": 1.866885079456121e-06, "loss": 0.0403, "step": 5071 }, { "epoch": 2.640291514836023, "grad_norm": 0.22853002659541766, "learning_rate": 1.8615612896338036e-06, "loss": 0.0438, "step": 5072 }, { "epoch": 2.640812077043207, "grad_norm": 0.22941524162811514, "learning_rate": 1.856244808007196e-06, "loss": 0.044, "step": 5073 }, { "epoch": 2.6413326392503906, "grad_norm": 0.22276184054208367, "learning_rate": 1.8509356362554963e-06, "loss": 0.0451, "step": 5074 }, { "epoch": 2.6418532014575744, "grad_norm": 0.22604762542260493, "learning_rate": 1.8456337760555915e-06, "loss": 0.0447, "step": 5075 }, { "epoch": 2.642373763664758, "grad_norm": 0.23677583990143997, "learning_rate": 1.840339229082047e-06, "loss": 0.0455, "step": 5076 }, { "epoch": 2.642894325871942, "grad_norm": 0.2286482680320122, "learning_rate": 1.8350519970071312e-06, "loss": 0.0442, "step": 5077 }, { "epoch": 2.6434148880791257, "grad_norm": 0.22736556778997194, "learning_rate": 1.8297720815007969e-06, "loss": 0.0453, "step": 5078 }, { "epoch": 2.643935450286309, "grad_norm": 0.22259072643024455, "learning_rate": 1.824499484230696e-06, "loss": 0.0435, "step": 5079 }, { "epoch": 2.644456012493493, "grad_norm": 0.2199119162247651, "learning_rate": 1.819234206862147e-06, "loss": 0.0417, "step": 5080 }, { "epoch": 2.6449765747006766, "grad_norm": 0.24083225482663506, "learning_rate": 1.8139762510581804e-06, "loss": 0.0451, "step": 5081 }, { "epoch": 2.6454971369078604, "grad_norm": 0.23292508889547486, "learning_rate": 1.8087256184794953e-06, "loss": 0.0463, "step": 5082 }, { "epoch": 2.646017699115044, "grad_norm": 0.2300747118454459, "learning_rate": 1.8034823107844878e-06, "loss": 0.0445, "step": 5083 }, { "epoch": 2.646538261322228, "grad_norm": 0.2363535678951791, "learning_rate": 1.798246329629244e-06, "loss": 0.0431, "step": 5084 }, { "epoch": 2.6470588235294117, "grad_norm": 0.22966034657954063, "learning_rate": 1.793017676667519e-06, "loss": 0.0437, "step": 5085 }, { "epoch": 2.6475793857365955, "grad_norm": 0.22141333009963216, "learning_rate": 1.7877963535507748e-06, "loss": 0.044, "step": 5086 }, { "epoch": 2.6480999479437792, "grad_norm": 0.22871432021572957, "learning_rate": 1.7825823619281452e-06, "loss": 0.0427, "step": 5087 }, { "epoch": 2.648620510150963, "grad_norm": 0.22928264134273937, "learning_rate": 1.7773757034464545e-06, "loss": 0.0458, "step": 5088 }, { "epoch": 2.6491410723581468, "grad_norm": 0.22874179067788555, "learning_rate": 1.7721763797501984e-06, "loss": 0.0448, "step": 5089 }, { "epoch": 2.6496616345653305, "grad_norm": 0.23127911334565235, "learning_rate": 1.766984392481577e-06, "loss": 0.0439, "step": 5090 }, { "epoch": 2.6501821967725143, "grad_norm": 0.22687967610681098, "learning_rate": 1.761799743280454e-06, "loss": 0.0447, "step": 5091 }, { "epoch": 2.650702758979698, "grad_norm": 0.21699331621599646, "learning_rate": 1.75662243378438e-06, "loss": 0.0419, "step": 5092 }, { "epoch": 2.651223321186882, "grad_norm": 0.23305173793665826, "learning_rate": 1.751452465628603e-06, "loss": 0.0435, "step": 5093 }, { "epoch": 2.6517438833940656, "grad_norm": 0.230107160982093, "learning_rate": 1.7462898404460275e-06, "loss": 0.041, "step": 5094 }, { "epoch": 2.6522644456012494, "grad_norm": 0.2298327980073945, "learning_rate": 1.7411345598672523e-06, "loss": 0.044, "step": 5095 }, { "epoch": 2.652785007808433, "grad_norm": 0.2309553511497464, "learning_rate": 1.7359866255205609e-06, "loss": 0.0447, "step": 5096 }, { "epoch": 2.653305570015617, "grad_norm": 0.23228524615867757, "learning_rate": 1.7308460390319025e-06, "loss": 0.0443, "step": 5097 }, { "epoch": 2.6538261322228007, "grad_norm": 0.2323659311301738, "learning_rate": 1.7257128020249258e-06, "loss": 0.0464, "step": 5098 }, { "epoch": 2.6543466944299845, "grad_norm": 0.23831555902873636, "learning_rate": 1.7205869161209365e-06, "loss": 0.0465, "step": 5099 }, { "epoch": 2.6548672566371683, "grad_norm": 0.22428319592649987, "learning_rate": 1.7154683829389284e-06, "loss": 0.0421, "step": 5100 }, { "epoch": 2.655387818844352, "grad_norm": 0.23529402262221027, "learning_rate": 1.7103572040955696e-06, "loss": 0.0442, "step": 5101 }, { "epoch": 2.655908381051536, "grad_norm": 0.23011207860973742, "learning_rate": 1.7052533812052157e-06, "loss": 0.0448, "step": 5102 }, { "epoch": 2.656428943258719, "grad_norm": 0.22964408679947323, "learning_rate": 1.7001569158798914e-06, "loss": 0.0439, "step": 5103 }, { "epoch": 2.656949505465903, "grad_norm": 0.22620381387212382, "learning_rate": 1.6950678097292893e-06, "loss": 0.0432, "step": 5104 }, { "epoch": 2.6574700676730867, "grad_norm": 0.23347058886316926, "learning_rate": 1.6899860643607907e-06, "loss": 0.0458, "step": 5105 }, { "epoch": 2.6579906298802705, "grad_norm": 0.2229703409152849, "learning_rate": 1.6849116813794503e-06, "loss": 0.0436, "step": 5106 }, { "epoch": 2.6585111920874542, "grad_norm": 0.22996346900113476, "learning_rate": 1.6798446623879915e-06, "loss": 0.0448, "step": 5107 }, { "epoch": 2.659031754294638, "grad_norm": 0.22933075899722521, "learning_rate": 1.6747850089868178e-06, "loss": 0.0454, "step": 5108 }, { "epoch": 2.659552316501822, "grad_norm": 0.22374222520186948, "learning_rate": 1.669732722773995e-06, "loss": 0.045, "step": 5109 }, { "epoch": 2.6600728787090056, "grad_norm": 0.23066068965417216, "learning_rate": 1.6646878053452776e-06, "loss": 0.0431, "step": 5110 }, { "epoch": 2.6605934409161893, "grad_norm": 0.224015399002532, "learning_rate": 1.659650258294082e-06, "loss": 0.0422, "step": 5111 }, { "epoch": 2.661114003123373, "grad_norm": 0.23582456384176945, "learning_rate": 1.6546200832115028e-06, "loss": 0.0447, "step": 5112 }, { "epoch": 2.661634565330557, "grad_norm": 0.2261905982992855, "learning_rate": 1.649597281686302e-06, "loss": 0.0414, "step": 5113 }, { "epoch": 2.6621551275377406, "grad_norm": 0.23148944353074696, "learning_rate": 1.644581855304911e-06, "loss": 0.0451, "step": 5114 }, { "epoch": 2.6626756897449244, "grad_norm": 0.22331597314340787, "learning_rate": 1.639573805651437e-06, "loss": 0.0419, "step": 5115 }, { "epoch": 2.663196251952108, "grad_norm": 0.22397006275035072, "learning_rate": 1.6345731343076626e-06, "loss": 0.0437, "step": 5116 }, { "epoch": 2.663716814159292, "grad_norm": 0.2324005116144098, "learning_rate": 1.629579842853024e-06, "loss": 0.0457, "step": 5117 }, { "epoch": 2.6642373763664757, "grad_norm": 0.22967664976854812, "learning_rate": 1.624593932864632e-06, "loss": 0.0445, "step": 5118 }, { "epoch": 2.6647579385736595, "grad_norm": 0.23099217882260017, "learning_rate": 1.6196154059172742e-06, "loss": 0.0437, "step": 5119 }, { "epoch": 2.6652785007808433, "grad_norm": 0.23529689019600636, "learning_rate": 1.6146442635834008e-06, "loss": 0.0452, "step": 5120 }, { "epoch": 2.665799062988027, "grad_norm": 0.24041486589271255, "learning_rate": 1.6096805074331338e-06, "loss": 0.0459, "step": 5121 }, { "epoch": 2.666319625195211, "grad_norm": 0.2268769288085849, "learning_rate": 1.6047241390342498e-06, "loss": 0.0449, "step": 5122 }, { "epoch": 2.6668401874023946, "grad_norm": 0.22397701381379848, "learning_rate": 1.599775159952205e-06, "loss": 0.0448, "step": 5123 }, { "epoch": 2.6673607496095784, "grad_norm": 0.2243649170475879, "learning_rate": 1.5948335717501179e-06, "loss": 0.0422, "step": 5124 }, { "epoch": 2.667881311816762, "grad_norm": 0.23886065101491008, "learning_rate": 1.5898993759887765e-06, "loss": 0.0452, "step": 5125 }, { "epoch": 2.668401874023946, "grad_norm": 0.22837358130534788, "learning_rate": 1.5849725742266231e-06, "loss": 0.043, "step": 5126 }, { "epoch": 2.6689224362311297, "grad_norm": 0.22398559556865122, "learning_rate": 1.5800531680197683e-06, "loss": 0.0443, "step": 5127 }, { "epoch": 2.6694429984383135, "grad_norm": 0.22326877672343834, "learning_rate": 1.5751411589219945e-06, "loss": 0.0415, "step": 5128 }, { "epoch": 2.6699635606454972, "grad_norm": 0.24686809704433404, "learning_rate": 1.570236548484741e-06, "loss": 0.0448, "step": 5129 }, { "epoch": 2.670484122852681, "grad_norm": 0.22496284749102272, "learning_rate": 1.5653393382571158e-06, "loss": 0.0436, "step": 5130 }, { "epoch": 2.6710046850598648, "grad_norm": 0.2216536409582885, "learning_rate": 1.560449529785879e-06, "loss": 0.0427, "step": 5131 }, { "epoch": 2.6715252472670485, "grad_norm": 0.23039373528584386, "learning_rate": 1.5555671246154647e-06, "loss": 0.0456, "step": 5132 }, { "epoch": 2.6720458094742323, "grad_norm": 0.2267257512361847, "learning_rate": 1.5506921242879612e-06, "loss": 0.0438, "step": 5133 }, { "epoch": 2.672566371681416, "grad_norm": 0.22909660719838928, "learning_rate": 1.5458245303431262e-06, "loss": 0.0442, "step": 5134 }, { "epoch": 2.6730869338886, "grad_norm": 0.23207465894660437, "learning_rate": 1.5409643443183658e-06, "loss": 0.0452, "step": 5135 }, { "epoch": 2.6736074960957836, "grad_norm": 0.23464400344667644, "learning_rate": 1.5361115677487548e-06, "loss": 0.0452, "step": 5136 }, { "epoch": 2.6741280583029674, "grad_norm": 0.23955054324523992, "learning_rate": 1.5312662021670227e-06, "loss": 0.0451, "step": 5137 }, { "epoch": 2.674648620510151, "grad_norm": 0.22781012646604124, "learning_rate": 1.5264282491035676e-06, "loss": 0.0427, "step": 5138 }, { "epoch": 2.675169182717335, "grad_norm": 0.22466051885015176, "learning_rate": 1.5215977100864392e-06, "loss": 0.0442, "step": 5139 }, { "epoch": 2.6756897449245187, "grad_norm": 0.223780756891236, "learning_rate": 1.5167745866413424e-06, "loss": 0.0442, "step": 5140 }, { "epoch": 2.6762103071317025, "grad_norm": 0.21773883268442826, "learning_rate": 1.5119588802916445e-06, "loss": 0.0426, "step": 5141 }, { "epoch": 2.6767308693388863, "grad_norm": 0.23066656195993457, "learning_rate": 1.5071505925583735e-06, "loss": 0.044, "step": 5142 }, { "epoch": 2.6772514315460696, "grad_norm": 0.2226523056001122, "learning_rate": 1.5023497249602086e-06, "loss": 0.0438, "step": 5143 }, { "epoch": 2.6777719937532534, "grad_norm": 0.2188974751683572, "learning_rate": 1.49755627901349e-06, "loss": 0.0435, "step": 5144 }, { "epoch": 2.678292555960437, "grad_norm": 0.2229966329377455, "learning_rate": 1.4927702562322037e-06, "loss": 0.0428, "step": 5145 }, { "epoch": 2.678813118167621, "grad_norm": 0.24359513195681573, "learning_rate": 1.4879916581280045e-06, "loss": 0.0463, "step": 5146 }, { "epoch": 2.6793336803748047, "grad_norm": 0.23261019586712137, "learning_rate": 1.4832204862101906e-06, "loss": 0.0429, "step": 5147 }, { "epoch": 2.6798542425819885, "grad_norm": 0.2157186684270539, "learning_rate": 1.4784567419857314e-06, "loss": 0.0425, "step": 5148 }, { "epoch": 2.6803748047891722, "grad_norm": 0.23508661356548638, "learning_rate": 1.4737004269592236e-06, "loss": 0.045, "step": 5149 }, { "epoch": 2.680895366996356, "grad_norm": 0.22272454558258475, "learning_rate": 1.468951542632943e-06, "loss": 0.0437, "step": 5150 }, { "epoch": 2.6814159292035398, "grad_norm": 0.22708962558308424, "learning_rate": 1.4642100905068068e-06, "loss": 0.0451, "step": 5151 }, { "epoch": 2.6819364914107235, "grad_norm": 0.22570865365739282, "learning_rate": 1.4594760720783863e-06, "loss": 0.0426, "step": 5152 }, { "epoch": 2.6824570536179073, "grad_norm": 0.22957967759748524, "learning_rate": 1.4547494888429074e-06, "loss": 0.0432, "step": 5153 }, { "epoch": 2.682977615825091, "grad_norm": 0.22887509990760044, "learning_rate": 1.4500303422932348e-06, "loss": 0.0434, "step": 5154 }, { "epoch": 2.683498178032275, "grad_norm": 0.23098550242907323, "learning_rate": 1.4453186339199037e-06, "loss": 0.0452, "step": 5155 }, { "epoch": 2.6840187402394586, "grad_norm": 0.22547319635189877, "learning_rate": 1.4406143652110875e-06, "loss": 0.0423, "step": 5156 }, { "epoch": 2.6845393024466424, "grad_norm": 0.236534240012221, "learning_rate": 1.4359175376526174e-06, "loss": 0.0469, "step": 5157 }, { "epoch": 2.685059864653826, "grad_norm": 0.22335001771494034, "learning_rate": 1.431228152727962e-06, "loss": 0.0441, "step": 5158 }, { "epoch": 2.68558042686101, "grad_norm": 0.22432122506376032, "learning_rate": 1.4265462119182532e-06, "loss": 0.0421, "step": 5159 }, { "epoch": 2.6861009890681937, "grad_norm": 0.22882569581384093, "learning_rate": 1.4218717167022638e-06, "loss": 0.0432, "step": 5160 }, { "epoch": 2.6866215512753775, "grad_norm": 0.23103308439566217, "learning_rate": 1.4172046685564212e-06, "loss": 0.0447, "step": 5161 }, { "epoch": 2.6871421134825613, "grad_norm": 0.22503412132865003, "learning_rate": 1.412545068954796e-06, "loss": 0.0435, "step": 5162 }, { "epoch": 2.687662675689745, "grad_norm": 0.2228402807168184, "learning_rate": 1.4078929193690998e-06, "loss": 0.0426, "step": 5163 }, { "epoch": 2.688183237896929, "grad_norm": 0.2287986065956918, "learning_rate": 1.4032482212686993e-06, "loss": 0.0434, "step": 5164 }, { "epoch": 2.6887038001041126, "grad_norm": 0.2320753787979263, "learning_rate": 1.3986109761206095e-06, "loss": 0.0441, "step": 5165 }, { "epoch": 2.6892243623112964, "grad_norm": 0.23129496956085294, "learning_rate": 1.3939811853894896e-06, "loss": 0.0456, "step": 5166 }, { "epoch": 2.6897449245184797, "grad_norm": 0.2345935034216518, "learning_rate": 1.389358850537642e-06, "loss": 0.0448, "step": 5167 }, { "epoch": 2.6902654867256635, "grad_norm": 0.23234893177636318, "learning_rate": 1.38474397302501e-06, "loss": 0.0442, "step": 5168 }, { "epoch": 2.6907860489328472, "grad_norm": 0.23634107034074991, "learning_rate": 1.3801365543091916e-06, "loss": 0.0458, "step": 5169 }, { "epoch": 2.691306611140031, "grad_norm": 0.23696395645252785, "learning_rate": 1.3755365958454254e-06, "loss": 0.0451, "step": 5170 }, { "epoch": 2.6918271733472148, "grad_norm": 0.22469488655336936, "learning_rate": 1.3709440990865908e-06, "loss": 0.0447, "step": 5171 }, { "epoch": 2.6923477355543985, "grad_norm": 0.23085933725904473, "learning_rate": 1.366359065483211e-06, "loss": 0.0446, "step": 5172 }, { "epoch": 2.6928682977615823, "grad_norm": 0.23659193340498502, "learning_rate": 1.3617814964834523e-06, "loss": 0.0455, "step": 5173 }, { "epoch": 2.693388859968766, "grad_norm": 0.2189403460488524, "learning_rate": 1.3572113935331226e-06, "loss": 0.044, "step": 5174 }, { "epoch": 2.69390942217595, "grad_norm": 0.2247914833766047, "learning_rate": 1.3526487580756752e-06, "loss": 0.0441, "step": 5175 }, { "epoch": 2.6944299843831336, "grad_norm": 0.21893245384497728, "learning_rate": 1.3480935915522075e-06, "loss": 0.043, "step": 5176 }, { "epoch": 2.6949505465903174, "grad_norm": 0.23444838887341896, "learning_rate": 1.3435458954014463e-06, "loss": 0.0453, "step": 5177 }, { "epoch": 2.695471108797501, "grad_norm": 0.23107549495251808, "learning_rate": 1.3390056710597649e-06, "loss": 0.0442, "step": 5178 }, { "epoch": 2.695991671004685, "grad_norm": 0.22611578165673227, "learning_rate": 1.3344729199611827e-06, "loss": 0.0458, "step": 5179 }, { "epoch": 2.6965122332118687, "grad_norm": 0.22290496338439902, "learning_rate": 1.3299476435373548e-06, "loss": 0.0429, "step": 5180 }, { "epoch": 2.6970327954190525, "grad_norm": 0.21442995257650893, "learning_rate": 1.3254298432175682e-06, "loss": 0.0424, "step": 5181 }, { "epoch": 2.6975533576262363, "grad_norm": 0.21876017392018013, "learning_rate": 1.320919520428754e-06, "loss": 0.0432, "step": 5182 }, { "epoch": 2.69807391983342, "grad_norm": 0.22822179337752255, "learning_rate": 1.3164166765954861e-06, "loss": 0.0454, "step": 5183 }, { "epoch": 2.698594482040604, "grad_norm": 0.23403568093314686, "learning_rate": 1.3119213131399688e-06, "loss": 0.0468, "step": 5184 }, { "epoch": 2.6991150442477876, "grad_norm": 0.22524839908609418, "learning_rate": 1.3074334314820551e-06, "loss": 0.0447, "step": 5185 }, { "epoch": 2.6996356064549714, "grad_norm": 0.22270469667567147, "learning_rate": 1.3029530330392143e-06, "loss": 0.0442, "step": 5186 }, { "epoch": 2.700156168662155, "grad_norm": 0.23012017240768629, "learning_rate": 1.2984801192265749e-06, "loss": 0.0439, "step": 5187 }, { "epoch": 2.700676730869339, "grad_norm": 0.21642941461801396, "learning_rate": 1.2940146914568852e-06, "loss": 0.0438, "step": 5188 }, { "epoch": 2.7011972930765227, "grad_norm": 0.22328673189570775, "learning_rate": 1.2895567511405414e-06, "loss": 0.0439, "step": 5189 }, { "epoch": 2.7017178552837064, "grad_norm": 0.23823980764075436, "learning_rate": 1.285106299685565e-06, "loss": 0.0457, "step": 5190 }, { "epoch": 2.70223841749089, "grad_norm": 0.22391174202738537, "learning_rate": 1.280663338497609e-06, "loss": 0.0448, "step": 5191 }, { "epoch": 2.702758979698074, "grad_norm": 0.22413624516419742, "learning_rate": 1.276227868979976e-06, "loss": 0.0422, "step": 5192 }, { "epoch": 2.7032795419052578, "grad_norm": 0.219580469635659, "learning_rate": 1.2717998925335927e-06, "loss": 0.0425, "step": 5193 }, { "epoch": 2.7038001041124415, "grad_norm": 0.23393120071768486, "learning_rate": 1.267379410557018e-06, "loss": 0.0463, "step": 5194 }, { "epoch": 2.7043206663196253, "grad_norm": 0.22651080633930457, "learning_rate": 1.2629664244464463e-06, "loss": 0.0446, "step": 5195 }, { "epoch": 2.704841228526809, "grad_norm": 0.2345516458739699, "learning_rate": 1.258560935595704e-06, "loss": 0.0457, "step": 5196 }, { "epoch": 2.705361790733993, "grad_norm": 0.21826699278019435, "learning_rate": 1.2541629453962479e-06, "loss": 0.0427, "step": 5197 }, { "epoch": 2.7058823529411766, "grad_norm": 0.21948973250772175, "learning_rate": 1.2497724552371747e-06, "loss": 0.0428, "step": 5198 }, { "epoch": 2.7064029151483604, "grad_norm": 0.23291910481811462, "learning_rate": 1.2453894665052008e-06, "loss": 0.0445, "step": 5199 }, { "epoch": 2.706923477355544, "grad_norm": 0.21755848524659785, "learning_rate": 1.2410139805846738e-06, "loss": 0.0423, "step": 5200 }, { "epoch": 2.707444039562728, "grad_norm": 0.2319847511448208, "learning_rate": 1.2366459988575774e-06, "loss": 0.0438, "step": 5201 }, { "epoch": 2.7079646017699117, "grad_norm": 0.2303536354287684, "learning_rate": 1.2322855227035301e-06, "loss": 0.0439, "step": 5202 }, { "epoch": 2.7084851639770955, "grad_norm": 0.23065828693762852, "learning_rate": 1.2279325534997693e-06, "loss": 0.0446, "step": 5203 }, { "epoch": 2.7090057261842793, "grad_norm": 0.24141647025860344, "learning_rate": 1.2235870926211619e-06, "loss": 0.0457, "step": 5204 }, { "epoch": 2.709526288391463, "grad_norm": 0.2350329888138472, "learning_rate": 1.2192491414402096e-06, "loss": 0.044, "step": 5205 }, { "epoch": 2.710046850598647, "grad_norm": 0.231757509837116, "learning_rate": 1.2149187013270392e-06, "loss": 0.045, "step": 5206 }, { "epoch": 2.71056741280583, "grad_norm": 0.2317717587228352, "learning_rate": 1.2105957736494089e-06, "loss": 0.0435, "step": 5207 }, { "epoch": 2.711087975013014, "grad_norm": 0.22055288340775916, "learning_rate": 1.2062803597726963e-06, "loss": 0.0411, "step": 5208 }, { "epoch": 2.7116085372201977, "grad_norm": 0.2268087132163455, "learning_rate": 1.2019724610599081e-06, "loss": 0.0436, "step": 5209 }, { "epoch": 2.7121290994273815, "grad_norm": 0.2303811370184185, "learning_rate": 1.197672078871681e-06, "loss": 0.044, "step": 5210 }, { "epoch": 2.7126496616345652, "grad_norm": 0.23071049199969476, "learning_rate": 1.193379214566276e-06, "loss": 0.0446, "step": 5211 }, { "epoch": 2.713170223841749, "grad_norm": 0.22811091103571507, "learning_rate": 1.1890938694995829e-06, "loss": 0.0432, "step": 5212 }, { "epoch": 2.7136907860489328, "grad_norm": 0.2379584189281919, "learning_rate": 1.1848160450251083e-06, "loss": 0.0463, "step": 5213 }, { "epoch": 2.7142113482561165, "grad_norm": 0.22764918185318186, "learning_rate": 1.180545742493988e-06, "loss": 0.0442, "step": 5214 }, { "epoch": 2.7147319104633003, "grad_norm": 0.2254826094616572, "learning_rate": 1.1762829632549849e-06, "loss": 0.044, "step": 5215 }, { "epoch": 2.715252472670484, "grad_norm": 0.22681141939496943, "learning_rate": 1.1720277086544857e-06, "loss": 0.0433, "step": 5216 }, { "epoch": 2.715773034877668, "grad_norm": 0.21876872488915144, "learning_rate": 1.1677799800364958e-06, "loss": 0.0429, "step": 5217 }, { "epoch": 2.7162935970848516, "grad_norm": 0.22378659997771677, "learning_rate": 1.1635397787426366e-06, "loss": 0.0438, "step": 5218 }, { "epoch": 2.7168141592920354, "grad_norm": 0.2265493115245886, "learning_rate": 1.159307106112173e-06, "loss": 0.0442, "step": 5219 }, { "epoch": 2.717334721499219, "grad_norm": 0.2298762644808635, "learning_rate": 1.1550819634819743e-06, "loss": 0.0443, "step": 5220 }, { "epoch": 2.717855283706403, "grad_norm": 0.22228468158301753, "learning_rate": 1.1508643521865397e-06, "loss": 0.0429, "step": 5221 }, { "epoch": 2.7183758459135867, "grad_norm": 0.23151020928005936, "learning_rate": 1.1466542735579844e-06, "loss": 0.0427, "step": 5222 }, { "epoch": 2.7188964081207705, "grad_norm": 0.2214334447059201, "learning_rate": 1.14245172892605e-06, "loss": 0.0433, "step": 5223 }, { "epoch": 2.7194169703279543, "grad_norm": 0.2283368759330757, "learning_rate": 1.1382567196180916e-06, "loss": 0.0437, "step": 5224 }, { "epoch": 2.719937532535138, "grad_norm": 0.21320513712505007, "learning_rate": 1.1340692469590964e-06, "loss": 0.0422, "step": 5225 }, { "epoch": 2.720458094742322, "grad_norm": 0.21709414283045186, "learning_rate": 1.1298893122716563e-06, "loss": 0.0438, "step": 5226 }, { "epoch": 2.7209786569495056, "grad_norm": 0.2274072043634905, "learning_rate": 1.125716916875988e-06, "loss": 0.0433, "step": 5227 }, { "epoch": 2.7214992191566894, "grad_norm": 0.2275711914621425, "learning_rate": 1.1215520620899311e-06, "loss": 0.0439, "step": 5228 }, { "epoch": 2.722019781363873, "grad_norm": 0.22221898121836894, "learning_rate": 1.1173947492289395e-06, "loss": 0.0428, "step": 5229 }, { "epoch": 2.722540343571057, "grad_norm": 0.21297199594248953, "learning_rate": 1.1132449796060872e-06, "loss": 0.0416, "step": 5230 }, { "epoch": 2.7230609057782402, "grad_norm": 0.2255176774021255, "learning_rate": 1.1091027545320654e-06, "loss": 0.0426, "step": 5231 }, { "epoch": 2.723581467985424, "grad_norm": 0.22910733482663895, "learning_rate": 1.1049680753151798e-06, "loss": 0.0449, "step": 5232 }, { "epoch": 2.7241020301926078, "grad_norm": 0.22540358774719207, "learning_rate": 1.1008409432613525e-06, "loss": 0.0423, "step": 5233 }, { "epoch": 2.7246225923997915, "grad_norm": 0.22668282262981723, "learning_rate": 1.0967213596741327e-06, "loss": 0.0417, "step": 5234 }, { "epoch": 2.7251431546069753, "grad_norm": 0.22941299415233102, "learning_rate": 1.0926093258546655e-06, "loss": 0.0445, "step": 5235 }, { "epoch": 2.725663716814159, "grad_norm": 0.2460091226274877, "learning_rate": 1.0885048431017313e-06, "loss": 0.0474, "step": 5236 }, { "epoch": 2.726184279021343, "grad_norm": 0.22696800191590433, "learning_rate": 1.0844079127117074e-06, "loss": 0.0441, "step": 5237 }, { "epoch": 2.7267048412285266, "grad_norm": 0.2279206422911676, "learning_rate": 1.0803185359786028e-06, "loss": 0.0435, "step": 5238 }, { "epoch": 2.7272254034357104, "grad_norm": 0.2206185588040236, "learning_rate": 1.0762367141940287e-06, "loss": 0.0439, "step": 5239 }, { "epoch": 2.727745965642894, "grad_norm": 0.22844650094437752, "learning_rate": 1.0721624486472209e-06, "loss": 0.0436, "step": 5240 }, { "epoch": 2.728266527850078, "grad_norm": 0.22100974494576425, "learning_rate": 1.0680957406250135e-06, "loss": 0.0436, "step": 5241 }, { "epoch": 2.7287870900572617, "grad_norm": 0.22075747241403315, "learning_rate": 1.0640365914118682e-06, "loss": 0.0424, "step": 5242 }, { "epoch": 2.7293076522644455, "grad_norm": 0.230695539655403, "learning_rate": 1.0599850022898539e-06, "loss": 0.043, "step": 5243 }, { "epoch": 2.7298282144716293, "grad_norm": 0.23168258489285376, "learning_rate": 1.055940974538641e-06, "loss": 0.0431, "step": 5244 }, { "epoch": 2.730348776678813, "grad_norm": 0.22697802722231222, "learning_rate": 1.0519045094355363e-06, "loss": 0.0432, "step": 5245 }, { "epoch": 2.730869338885997, "grad_norm": 0.2308978573693367, "learning_rate": 1.0478756082554304e-06, "loss": 0.0441, "step": 5246 }, { "epoch": 2.7313899010931806, "grad_norm": 0.2304130228518842, "learning_rate": 1.0438542722708445e-06, "loss": 0.0427, "step": 5247 }, { "epoch": 2.7319104633003644, "grad_norm": 0.2282468794565119, "learning_rate": 1.0398405027519016e-06, "loss": 0.0424, "step": 5248 }, { "epoch": 2.732431025507548, "grad_norm": 0.21865000074849475, "learning_rate": 1.0358343009663428e-06, "loss": 0.0422, "step": 5249 }, { "epoch": 2.732951587714732, "grad_norm": 0.21986811718537888, "learning_rate": 1.0318356681795039e-06, "loss": 0.0409, "step": 5250 }, { "epoch": 2.7334721499219157, "grad_norm": 0.22870095194546886, "learning_rate": 1.0278446056543406e-06, "loss": 0.0434, "step": 5251 }, { "epoch": 2.7339927121290994, "grad_norm": 0.22636557592510134, "learning_rate": 1.0238611146514253e-06, "loss": 0.0429, "step": 5252 }, { "epoch": 2.734513274336283, "grad_norm": 0.21850314176206617, "learning_rate": 1.0198851964289185e-06, "loss": 0.0431, "step": 5253 }, { "epoch": 2.735033836543467, "grad_norm": 0.2186586118194875, "learning_rate": 1.01591685224261e-06, "loss": 0.0418, "step": 5254 }, { "epoch": 2.7355543987506508, "grad_norm": 0.24147015358727544, "learning_rate": 1.0119560833458775e-06, "loss": 0.0444, "step": 5255 }, { "epoch": 2.7360749609578345, "grad_norm": 0.2315952637276368, "learning_rate": 1.0080028909897233e-06, "loss": 0.0463, "step": 5256 }, { "epoch": 2.7365955231650183, "grad_norm": 0.2200895690176689, "learning_rate": 1.0040572764227458e-06, "loss": 0.0432, "step": 5257 }, { "epoch": 2.737116085372202, "grad_norm": 0.22812841777700557, "learning_rate": 1.0001192408911592e-06, "loss": 0.0442, "step": 5258 }, { "epoch": 2.737636647579386, "grad_norm": 0.2224052677158258, "learning_rate": 9.961887856387714e-07, "loss": 0.0426, "step": 5259 }, { "epoch": 2.7381572097865696, "grad_norm": 0.22285780659280513, "learning_rate": 9.92265911907006e-07, "loss": 0.0429, "step": 5260 }, { "epoch": 2.7386777719937534, "grad_norm": 0.22292964049968192, "learning_rate": 9.883506209348914e-07, "loss": 0.0415, "step": 5261 }, { "epoch": 2.739198334200937, "grad_norm": 0.2253363805591684, "learning_rate": 9.84442913959055e-07, "loss": 0.0445, "step": 5262 }, { "epoch": 2.739718896408121, "grad_norm": 0.22914905603731295, "learning_rate": 9.805427922137373e-07, "loss": 0.0436, "step": 5263 }, { "epoch": 2.7402394586153047, "grad_norm": 0.22423612517145042, "learning_rate": 9.766502569307722e-07, "loss": 0.0443, "step": 5264 }, { "epoch": 2.7407600208224885, "grad_norm": 0.22723159502834736, "learning_rate": 9.727653093396044e-07, "loss": 0.0447, "step": 5265 }, { "epoch": 2.7412805830296723, "grad_norm": 0.22822872201192684, "learning_rate": 9.688879506672854e-07, "loss": 0.043, "step": 5266 }, { "epoch": 2.741801145236856, "grad_norm": 0.21393631920738038, "learning_rate": 9.650181821384636e-07, "loss": 0.0412, "step": 5267 }, { "epoch": 2.74232170744404, "grad_norm": 0.22094569188225746, "learning_rate": 9.611560049753915e-07, "loss": 0.0434, "step": 5268 }, { "epoch": 2.7428422696512236, "grad_norm": 0.22305606790036275, "learning_rate": 9.573014203979242e-07, "loss": 0.0425, "step": 5269 }, { "epoch": 2.7433628318584073, "grad_norm": 0.22902027508759243, "learning_rate": 9.534544296235181e-07, "loss": 0.0428, "step": 5270 }, { "epoch": 2.7438833940655907, "grad_norm": 0.23242670385926198, "learning_rate": 9.49615033867235e-07, "loss": 0.0446, "step": 5271 }, { "epoch": 2.7444039562727744, "grad_norm": 0.2347141380798654, "learning_rate": 9.45783234341735e-07, "loss": 0.0425, "step": 5272 }, { "epoch": 2.744924518479958, "grad_norm": 0.22332579684782305, "learning_rate": 9.419590322572725e-07, "loss": 0.0443, "step": 5273 }, { "epoch": 2.745445080687142, "grad_norm": 0.22189445785006218, "learning_rate": 9.381424288217117e-07, "loss": 0.0423, "step": 5274 }, { "epoch": 2.7459656428943258, "grad_norm": 0.2261260353066565, "learning_rate": 9.343334252405133e-07, "loss": 0.0434, "step": 5275 }, { "epoch": 2.7464862051015095, "grad_norm": 0.22674984875383666, "learning_rate": 9.30532022716743e-07, "loss": 0.0434, "step": 5276 }, { "epoch": 2.7470067673086933, "grad_norm": 0.22247669966218622, "learning_rate": 9.26738222451054e-07, "loss": 0.0431, "step": 5277 }, { "epoch": 2.747527329515877, "grad_norm": 0.22557924637748528, "learning_rate": 9.229520256417073e-07, "loss": 0.0456, "step": 5278 }, { "epoch": 2.748047891723061, "grad_norm": 0.22992144412583332, "learning_rate": 9.191734334845603e-07, "loss": 0.0452, "step": 5279 }, { "epoch": 2.7485684539302446, "grad_norm": 0.22300684919494487, "learning_rate": 9.154024471730721e-07, "loss": 0.0438, "step": 5280 }, { "epoch": 2.7490890161374284, "grad_norm": 0.22245433446016094, "learning_rate": 9.1163906789829e-07, "loss": 0.042, "step": 5281 }, { "epoch": 2.749609578344612, "grad_norm": 0.22206770415357324, "learning_rate": 9.078832968488632e-07, "loss": 0.0434, "step": 5282 }, { "epoch": 2.750130140551796, "grad_norm": 0.23376405716262028, "learning_rate": 9.041351352110427e-07, "loss": 0.0431, "step": 5283 }, { "epoch": 2.7506507027589797, "grad_norm": 0.22966704496277124, "learning_rate": 9.003945841686707e-07, "loss": 0.0434, "step": 5284 }, { "epoch": 2.7511712649661635, "grad_norm": 0.22838593078199046, "learning_rate": 8.966616449031906e-07, "loss": 0.0422, "step": 5285 }, { "epoch": 2.7516918271733473, "grad_norm": 0.22176204273902209, "learning_rate": 8.929363185936346e-07, "loss": 0.0443, "step": 5286 }, { "epoch": 2.752212389380531, "grad_norm": 0.22581365238705586, "learning_rate": 8.892186064166335e-07, "loss": 0.0441, "step": 5287 }, { "epoch": 2.752732951587715, "grad_norm": 0.2244870202646647, "learning_rate": 8.855085095464149e-07, "loss": 0.0429, "step": 5288 }, { "epoch": 2.7532535137948986, "grad_norm": 0.22803770453270514, "learning_rate": 8.818060291548053e-07, "loss": 0.0436, "step": 5289 }, { "epoch": 2.7537740760020823, "grad_norm": 0.2250742592276933, "learning_rate": 8.781111664112162e-07, "loss": 0.042, "step": 5290 }, { "epoch": 2.754294638209266, "grad_norm": 0.2253420328172488, "learning_rate": 8.744239224826534e-07, "loss": 0.0419, "step": 5291 }, { "epoch": 2.75481520041645, "grad_norm": 0.2264536838481127, "learning_rate": 8.707442985337239e-07, "loss": 0.0441, "step": 5292 }, { "epoch": 2.7553357626236337, "grad_norm": 0.2202483024468351, "learning_rate": 8.670722957266231e-07, "loss": 0.0443, "step": 5293 }, { "epoch": 2.7558563248308174, "grad_norm": 0.2303490443562307, "learning_rate": 8.634079152211427e-07, "loss": 0.0429, "step": 5294 }, { "epoch": 2.7563768870380008, "grad_norm": 0.22221701362424096, "learning_rate": 8.597511581746626e-07, "loss": 0.0433, "step": 5295 }, { "epoch": 2.7568974492451845, "grad_norm": 0.23951455762675466, "learning_rate": 8.56102025742156e-07, "loss": 0.0447, "step": 5296 }, { "epoch": 2.7574180114523683, "grad_norm": 0.2374125127215411, "learning_rate": 8.524605190761897e-07, "loss": 0.0451, "step": 5297 }, { "epoch": 2.757938573659552, "grad_norm": 0.23192693160940323, "learning_rate": 8.488266393269245e-07, "loss": 0.0432, "step": 5298 }, { "epoch": 2.758459135866736, "grad_norm": 0.2342718094829394, "learning_rate": 8.452003876421033e-07, "loss": 0.0437, "step": 5299 }, { "epoch": 2.7589796980739196, "grad_norm": 0.2250652941295221, "learning_rate": 8.415817651670654e-07, "loss": 0.0437, "step": 5300 }, { "epoch": 2.7595002602811034, "grad_norm": 0.22979466231086934, "learning_rate": 8.379707730447439e-07, "loss": 0.0437, "step": 5301 }, { "epoch": 2.760020822488287, "grad_norm": 0.23361883586709467, "learning_rate": 8.343674124156542e-07, "loss": 0.0443, "step": 5302 }, { "epoch": 2.760541384695471, "grad_norm": 0.22631184004097923, "learning_rate": 8.307716844179081e-07, "loss": 0.0416, "step": 5303 }, { "epoch": 2.7610619469026547, "grad_norm": 0.23184232037276983, "learning_rate": 8.271835901872055e-07, "loss": 0.0439, "step": 5304 }, { "epoch": 2.7615825091098385, "grad_norm": 0.23202421193452336, "learning_rate": 8.236031308568287e-07, "loss": 0.0431, "step": 5305 }, { "epoch": 2.7621030713170223, "grad_norm": 0.22241507265189012, "learning_rate": 8.200303075576565e-07, "loss": 0.0425, "step": 5306 }, { "epoch": 2.762623633524206, "grad_norm": 0.22697877013798187, "learning_rate": 8.164651214181556e-07, "loss": 0.044, "step": 5307 }, { "epoch": 2.76314419573139, "grad_norm": 0.2247465473110427, "learning_rate": 8.129075735643698e-07, "loss": 0.0429, "step": 5308 }, { "epoch": 2.7636647579385736, "grad_norm": 0.22274117285283673, "learning_rate": 8.093576651199447e-07, "loss": 0.0425, "step": 5309 }, { "epoch": 2.7641853201457574, "grad_norm": 0.22736433771242023, "learning_rate": 8.058153972061027e-07, "loss": 0.0415, "step": 5310 }, { "epoch": 2.764705882352941, "grad_norm": 0.22964935642008533, "learning_rate": 8.022807709416575e-07, "loss": 0.0433, "step": 5311 }, { "epoch": 2.765226444560125, "grad_norm": 0.23653451017732308, "learning_rate": 7.987537874430101e-07, "loss": 0.0447, "step": 5312 }, { "epoch": 2.7657470067673087, "grad_norm": 0.22232202569716866, "learning_rate": 7.952344478241503e-07, "loss": 0.042, "step": 5313 }, { "epoch": 2.7662675689744924, "grad_norm": 0.22865924001277593, "learning_rate": 7.917227531966387e-07, "loss": 0.0443, "step": 5314 }, { "epoch": 2.766788131181676, "grad_norm": 0.2199762097546008, "learning_rate": 7.88218704669641e-07, "loss": 0.0416, "step": 5315 }, { "epoch": 2.76730869338886, "grad_norm": 0.22907960763658383, "learning_rate": 7.847223033498968e-07, "loss": 0.0443, "step": 5316 }, { "epoch": 2.7678292555960438, "grad_norm": 0.2224849881577553, "learning_rate": 7.812335503417284e-07, "loss": 0.0434, "step": 5317 }, { "epoch": 2.7683498178032275, "grad_norm": 0.2229443345455532, "learning_rate": 7.777524467470515e-07, "loss": 0.0423, "step": 5318 }, { "epoch": 2.7688703800104113, "grad_norm": 0.22816646258582196, "learning_rate": 7.742789936653561e-07, "loss": 0.0429, "step": 5319 }, { "epoch": 2.769390942217595, "grad_norm": 0.22839763781886158, "learning_rate": 7.708131921937229e-07, "loss": 0.0441, "step": 5320 }, { "epoch": 2.769911504424779, "grad_norm": 0.22727070637222285, "learning_rate": 7.673550434268123e-07, "loss": 0.045, "step": 5321 }, { "epoch": 2.7704320666319626, "grad_norm": 0.22633561873190844, "learning_rate": 7.639045484568702e-07, "loss": 0.045, "step": 5322 }, { "epoch": 2.7709526288391464, "grad_norm": 0.22765311723057183, "learning_rate": 7.60461708373722e-07, "loss": 0.044, "step": 5323 }, { "epoch": 2.77147319104633, "grad_norm": 0.2272959845769333, "learning_rate": 7.570265242647784e-07, "loss": 0.0432, "step": 5324 }, { "epoch": 2.771993753253514, "grad_norm": 0.2181349991575582, "learning_rate": 7.535989972150298e-07, "loss": 0.0426, "step": 5325 }, { "epoch": 2.7725143154606977, "grad_norm": 0.21996140541009873, "learning_rate": 7.501791283070436e-07, "loss": 0.0431, "step": 5326 }, { "epoch": 2.7730348776678815, "grad_norm": 0.22240120810935823, "learning_rate": 7.467669186209836e-07, "loss": 0.0426, "step": 5327 }, { "epoch": 2.7735554398750653, "grad_norm": 0.23526222986344317, "learning_rate": 7.433623692345765e-07, "loss": 0.0428, "step": 5328 }, { "epoch": 2.774076002082249, "grad_norm": 0.23391764152544686, "learning_rate": 7.399654812231399e-07, "loss": 0.0441, "step": 5329 }, { "epoch": 2.774596564289433, "grad_norm": 0.23704308426871976, "learning_rate": 7.365762556595685e-07, "loss": 0.0442, "step": 5330 }, { "epoch": 2.7751171264966166, "grad_norm": 0.22376362085484436, "learning_rate": 7.331946936143392e-07, "loss": 0.0456, "step": 5331 }, { "epoch": 2.7756376887038003, "grad_norm": 0.2265634568261286, "learning_rate": 7.298207961555031e-07, "loss": 0.0443, "step": 5332 }, { "epoch": 2.776158250910984, "grad_norm": 0.23042926713303066, "learning_rate": 7.264545643486997e-07, "loss": 0.0429, "step": 5333 }, { "epoch": 2.776678813118168, "grad_norm": 0.2213224298794227, "learning_rate": 7.230959992571368e-07, "loss": 0.0421, "step": 5334 }, { "epoch": 2.777199375325351, "grad_norm": 0.2210558855748437, "learning_rate": 7.197451019416073e-07, "loss": 0.0425, "step": 5335 }, { "epoch": 2.777719937532535, "grad_norm": 0.2258634646023793, "learning_rate": 7.164018734604816e-07, "loss": 0.0437, "step": 5336 }, { "epoch": 2.7782404997397188, "grad_norm": 0.23229199016420934, "learning_rate": 7.130663148697037e-07, "loss": 0.0464, "step": 5337 }, { "epoch": 2.7787610619469025, "grad_norm": 0.22326405991841328, "learning_rate": 7.097384272228003e-07, "loss": 0.043, "step": 5338 }, { "epoch": 2.7792816241540863, "grad_norm": 0.23391749933423464, "learning_rate": 7.064182115708723e-07, "loss": 0.0445, "step": 5339 }, { "epoch": 2.77980218636127, "grad_norm": 0.23115860660431092, "learning_rate": 7.031056689626031e-07, "loss": 0.0437, "step": 5340 }, { "epoch": 2.780322748568454, "grad_norm": 0.22704336146442725, "learning_rate": 6.998008004442391e-07, "loss": 0.0457, "step": 5341 }, { "epoch": 2.7808433107756376, "grad_norm": 0.2226141103882042, "learning_rate": 6.965036070596175e-07, "loss": 0.0437, "step": 5342 }, { "epoch": 2.7813638729828214, "grad_norm": 0.22972774937475848, "learning_rate": 6.932140898501471e-07, "loss": 0.0437, "step": 5343 }, { "epoch": 2.781884435190005, "grad_norm": 0.22763949291942495, "learning_rate": 6.899322498548022e-07, "loss": 0.0444, "step": 5344 }, { "epoch": 2.782404997397189, "grad_norm": 0.2269364277590439, "learning_rate": 6.866580881101508e-07, "loss": 0.0455, "step": 5345 }, { "epoch": 2.7829255596043727, "grad_norm": 0.22415876770946747, "learning_rate": 6.833916056503187e-07, "loss": 0.0413, "step": 5346 }, { "epoch": 2.7834461218115565, "grad_norm": 0.2169182386149355, "learning_rate": 6.801328035070137e-07, "loss": 0.0411, "step": 5347 }, { "epoch": 2.7839666840187403, "grad_norm": 0.22859639804739612, "learning_rate": 6.768816827095182e-07, "loss": 0.0439, "step": 5348 }, { "epoch": 2.784487246225924, "grad_norm": 0.2252301895314262, "learning_rate": 6.736382442846911e-07, "loss": 0.0435, "step": 5349 }, { "epoch": 2.785007808433108, "grad_norm": 0.23199340249555175, "learning_rate": 6.70402489256955e-07, "loss": 0.0433, "step": 5350 }, { "epoch": 2.7855283706402916, "grad_norm": 0.2217393408623345, "learning_rate": 6.671744186483143e-07, "loss": 0.0427, "step": 5351 }, { "epoch": 2.7860489328474753, "grad_norm": 0.22705878796369014, "learning_rate": 6.639540334783478e-07, "loss": 0.0436, "step": 5352 }, { "epoch": 2.786569495054659, "grad_norm": 0.22046258969292104, "learning_rate": 6.60741334764195e-07, "loss": 0.0408, "step": 5353 }, { "epoch": 2.787090057261843, "grad_norm": 0.22592912598051434, "learning_rate": 6.575363235205856e-07, "loss": 0.0434, "step": 5354 }, { "epoch": 2.7876106194690267, "grad_norm": 0.22293933544483777, "learning_rate": 6.543390007598016e-07, "loss": 0.0433, "step": 5355 }, { "epoch": 2.7881311816762104, "grad_norm": 0.2200500939054284, "learning_rate": 6.511493674917102e-07, "loss": 0.0418, "step": 5356 }, { "epoch": 2.788651743883394, "grad_norm": 0.22913113334418797, "learning_rate": 6.479674247237472e-07, "loss": 0.0443, "step": 5357 }, { "epoch": 2.789172306090578, "grad_norm": 0.23960510693606646, "learning_rate": 6.447931734609197e-07, "loss": 0.0434, "step": 5358 }, { "epoch": 2.7896928682977613, "grad_norm": 0.23198268720084506, "learning_rate": 6.416266147058009e-07, "loss": 0.0436, "step": 5359 }, { "epoch": 2.790213430504945, "grad_norm": 0.23155582454440707, "learning_rate": 6.38467749458535e-07, "loss": 0.0445, "step": 5360 }, { "epoch": 2.790733992712129, "grad_norm": 0.2298256419187941, "learning_rate": 6.353165787168464e-07, "loss": 0.0437, "step": 5361 }, { "epoch": 2.7912545549193126, "grad_norm": 0.2254102181052845, "learning_rate": 6.321731034760164e-07, "loss": 0.0433, "step": 5362 }, { "epoch": 2.7917751171264964, "grad_norm": 0.23436390964542603, "learning_rate": 6.290373247289011e-07, "loss": 0.0458, "step": 5363 }, { "epoch": 2.79229567933368, "grad_norm": 0.21766623446970718, "learning_rate": 6.259092434659247e-07, "loss": 0.0416, "step": 5364 }, { "epoch": 2.792816241540864, "grad_norm": 0.21981768528318218, "learning_rate": 6.227888606750803e-07, "loss": 0.042, "step": 5365 }, { "epoch": 2.7933368037480477, "grad_norm": 0.23061777624720375, "learning_rate": 6.196761773419324e-07, "loss": 0.0445, "step": 5366 }, { "epoch": 2.7938573659552315, "grad_norm": 0.21991675403724478, "learning_rate": 6.165711944496083e-07, "loss": 0.0409, "step": 5367 }, { "epoch": 2.7943779281624153, "grad_norm": 0.23345443481213357, "learning_rate": 6.134739129788125e-07, "loss": 0.0458, "step": 5368 }, { "epoch": 2.794898490369599, "grad_norm": 0.23028111148067149, "learning_rate": 6.103843339078014e-07, "loss": 0.0436, "step": 5369 }, { "epoch": 2.795419052576783, "grad_norm": 0.2302827035468124, "learning_rate": 6.073024582124165e-07, "loss": 0.0435, "step": 5370 }, { "epoch": 2.7959396147839666, "grad_norm": 0.23249897658678184, "learning_rate": 6.042282868660515e-07, "loss": 0.0458, "step": 5371 }, { "epoch": 2.7964601769911503, "grad_norm": 0.22510823189817172, "learning_rate": 6.011618208396768e-07, "loss": 0.0426, "step": 5372 }, { "epoch": 2.796980739198334, "grad_norm": 0.22638808421418086, "learning_rate": 5.981030611018234e-07, "loss": 0.042, "step": 5373 }, { "epoch": 2.797501301405518, "grad_norm": 0.22413378380890733, "learning_rate": 5.950520086185878e-07, "loss": 0.0424, "step": 5374 }, { "epoch": 2.7980218636127017, "grad_norm": 0.2430762884956928, "learning_rate": 5.920086643536354e-07, "loss": 0.047, "step": 5375 }, { "epoch": 2.7985424258198854, "grad_norm": 0.232168583757956, "learning_rate": 5.889730292681972e-07, "loss": 0.0438, "step": 5376 }, { "epoch": 2.799062988027069, "grad_norm": 0.24007634677429804, "learning_rate": 5.859451043210701e-07, "loss": 0.0442, "step": 5377 }, { "epoch": 2.799583550234253, "grad_norm": 0.22672307210185036, "learning_rate": 5.829248904686085e-07, "loss": 0.0433, "step": 5378 }, { "epoch": 2.8001041124414368, "grad_norm": 0.2346606956607048, "learning_rate": 5.799123886647439e-07, "loss": 0.0451, "step": 5379 }, { "epoch": 2.8006246746486205, "grad_norm": 0.22725576849779167, "learning_rate": 5.769075998609569e-07, "loss": 0.0429, "step": 5380 }, { "epoch": 2.8011452368558043, "grad_norm": 0.23830547232482183, "learning_rate": 5.73910525006302e-07, "loss": 0.0442, "step": 5381 }, { "epoch": 2.801665799062988, "grad_norm": 0.22960806546029328, "learning_rate": 5.709211650473972e-07, "loss": 0.0425, "step": 5382 }, { "epoch": 2.802186361270172, "grad_norm": 0.2201304972527062, "learning_rate": 5.679395209284178e-07, "loss": 0.0426, "step": 5383 }, { "epoch": 2.8027069234773556, "grad_norm": 0.22778314103337913, "learning_rate": 5.649655935911075e-07, "loss": 0.0435, "step": 5384 }, { "epoch": 2.8032274856845394, "grad_norm": 0.22560465254179524, "learning_rate": 5.619993839747733e-07, "loss": 0.0432, "step": 5385 }, { "epoch": 2.803748047891723, "grad_norm": 0.23421440146211206, "learning_rate": 5.590408930162799e-07, "loss": 0.0435, "step": 5386 }, { "epoch": 2.804268610098907, "grad_norm": 0.21559388239552502, "learning_rate": 5.560901216500575e-07, "loss": 0.0411, "step": 5387 }, { "epoch": 2.8047891723060907, "grad_norm": 0.22569321419302593, "learning_rate": 5.531470708080965e-07, "loss": 0.0428, "step": 5388 }, { "epoch": 2.8053097345132745, "grad_norm": 0.2256230654004025, "learning_rate": 5.502117414199481e-07, "loss": 0.0432, "step": 5389 }, { "epoch": 2.8058302967204583, "grad_norm": 0.23309729929515796, "learning_rate": 5.472841344127261e-07, "loss": 0.043, "step": 5390 }, { "epoch": 2.806350858927642, "grad_norm": 0.22323967104644515, "learning_rate": 5.443642507111074e-07, "loss": 0.0429, "step": 5391 }, { "epoch": 2.806871421134826, "grad_norm": 0.2200278159127524, "learning_rate": 5.414520912373239e-07, "loss": 0.0415, "step": 5392 }, { "epoch": 2.8073919833420096, "grad_norm": 0.22357673987141546, "learning_rate": 5.38547656911173e-07, "loss": 0.0434, "step": 5393 }, { "epoch": 2.8079125455491933, "grad_norm": 0.23721653321770933, "learning_rate": 5.3565094865001e-07, "loss": 0.0436, "step": 5394 }, { "epoch": 2.808433107756377, "grad_norm": 0.23374671104516043, "learning_rate": 5.327619673687528e-07, "loss": 0.0442, "step": 5395 }, { "epoch": 2.808953669963561, "grad_norm": 0.2243628561143931, "learning_rate": 5.298807139798689e-07, "loss": 0.0434, "step": 5396 }, { "epoch": 2.8094742321707447, "grad_norm": 0.21338440786570576, "learning_rate": 5.270071893934026e-07, "loss": 0.0422, "step": 5397 }, { "epoch": 2.8099947943779284, "grad_norm": 0.21726078260576132, "learning_rate": 5.24141394516936e-07, "loss": 0.0425, "step": 5398 }, { "epoch": 2.8105153565851118, "grad_norm": 0.2357056904995546, "learning_rate": 5.212833302556258e-07, "loss": 0.0441, "step": 5399 }, { "epoch": 2.8110359187922955, "grad_norm": 0.2218114034723653, "learning_rate": 5.184329975121832e-07, "loss": 0.0414, "step": 5400 }, { "epoch": 2.8115564809994793, "grad_norm": 0.21777410706096698, "learning_rate": 5.155903971868742e-07, "loss": 0.0425, "step": 5401 }, { "epoch": 2.812077043206663, "grad_norm": 0.21863538909227456, "learning_rate": 5.127555301775223e-07, "loss": 0.0411, "step": 5402 }, { "epoch": 2.812597605413847, "grad_norm": 0.2272082792608195, "learning_rate": 5.099283973795111e-07, "loss": 0.0438, "step": 5403 }, { "epoch": 2.8131181676210306, "grad_norm": 0.21938578677958842, "learning_rate": 5.071089996857848e-07, "loss": 0.0422, "step": 5404 }, { "epoch": 2.8136387298282144, "grad_norm": 0.22905873049766085, "learning_rate": 5.042973379868365e-07, "loss": 0.0422, "step": 5405 }, { "epoch": 2.814159292035398, "grad_norm": 0.22649704219116712, "learning_rate": 5.014934131707199e-07, "loss": 0.0435, "step": 5406 }, { "epoch": 2.814679854242582, "grad_norm": 0.22467215809657948, "learning_rate": 4.98697226123046e-07, "loss": 0.0418, "step": 5407 }, { "epoch": 2.8152004164497657, "grad_norm": 0.22783791469459005, "learning_rate": 4.959087777269805e-07, "loss": 0.0446, "step": 5408 }, { "epoch": 2.8157209786569495, "grad_norm": 0.2214050794833035, "learning_rate": 4.931280688632467e-07, "loss": 0.0408, "step": 5409 }, { "epoch": 2.8162415408641333, "grad_norm": 0.22088031305505856, "learning_rate": 4.90355100410117e-07, "loss": 0.0416, "step": 5410 }, { "epoch": 2.816762103071317, "grad_norm": 0.22653460185601557, "learning_rate": 4.875898732434298e-07, "loss": 0.0427, "step": 5411 }, { "epoch": 2.817282665278501, "grad_norm": 0.21289858185605157, "learning_rate": 4.848323882365668e-07, "loss": 0.0413, "step": 5412 }, { "epoch": 2.8178032274856846, "grad_norm": 0.2309828690362825, "learning_rate": 4.820826462604788e-07, "loss": 0.0434, "step": 5413 }, { "epoch": 2.8183237896928683, "grad_norm": 0.2374990881444587, "learning_rate": 4.793406481836515e-07, "loss": 0.0427, "step": 5414 }, { "epoch": 2.818844351900052, "grad_norm": 0.23637721721606647, "learning_rate": 4.7660639487214496e-07, "loss": 0.0464, "step": 5415 }, { "epoch": 2.819364914107236, "grad_norm": 0.22388241364899847, "learning_rate": 4.738798871895572e-07, "loss": 0.0434, "step": 5416 }, { "epoch": 2.8198854763144197, "grad_norm": 0.2210864475178364, "learning_rate": 4.7116112599704666e-07, "loss": 0.0435, "step": 5417 }, { "epoch": 2.8204060385216034, "grad_norm": 0.2259240497819733, "learning_rate": 4.6845011215332914e-07, "loss": 0.0427, "step": 5418 }, { "epoch": 2.820926600728787, "grad_norm": 0.219204603117465, "learning_rate": 4.6574684651466415e-07, "loss": 0.0411, "step": 5419 }, { "epoch": 2.821447162935971, "grad_norm": 0.21818043914111, "learning_rate": 4.6305132993487155e-07, "loss": 0.0413, "step": 5420 }, { "epoch": 2.8219677251431547, "grad_norm": 0.21659493110106742, "learning_rate": 4.6036356326532024e-07, "loss": 0.0402, "step": 5421 }, { "epoch": 2.8224882873503385, "grad_norm": 0.22715592904564377, "learning_rate": 4.57683547354934e-07, "loss": 0.0437, "step": 5422 }, { "epoch": 2.823008849557522, "grad_norm": 0.23159567590291835, "learning_rate": 4.5501128305018013e-07, "loss": 0.0432, "step": 5423 }, { "epoch": 2.8235294117647056, "grad_norm": 0.21733811243688106, "learning_rate": 4.523467711950946e-07, "loss": 0.0413, "step": 5424 }, { "epoch": 2.8240499739718894, "grad_norm": 0.23788228121357813, "learning_rate": 4.496900126312431e-07, "loss": 0.0433, "step": 5425 }, { "epoch": 2.824570536179073, "grad_norm": 0.22471884600489062, "learning_rate": 4.4704100819776e-07, "loss": 0.0426, "step": 5426 }, { "epoch": 2.825091098386257, "grad_norm": 0.22556479297052626, "learning_rate": 4.443997587313231e-07, "loss": 0.0436, "step": 5427 }, { "epoch": 2.8256116605934407, "grad_norm": 0.2204685863010288, "learning_rate": 4.4176626506616245e-07, "loss": 0.0426, "step": 5428 }, { "epoch": 2.8261322228006245, "grad_norm": 0.2222480259682644, "learning_rate": 4.391405280340544e-07, "loss": 0.0425, "step": 5429 }, { "epoch": 2.8266527850078083, "grad_norm": 0.223467726694392, "learning_rate": 4.365225484643326e-07, "loss": 0.0435, "step": 5430 }, { "epoch": 2.827173347214992, "grad_norm": 0.22428205507604543, "learning_rate": 4.339123271838746e-07, "loss": 0.0419, "step": 5431 }, { "epoch": 2.827693909422176, "grad_norm": 0.22403734441364156, "learning_rate": 4.3130986501711547e-07, "loss": 0.0418, "step": 5432 }, { "epoch": 2.8282144716293596, "grad_norm": 0.2274253646113262, "learning_rate": 4.2871516278602806e-07, "loss": 0.0439, "step": 5433 }, { "epoch": 2.8287350338365433, "grad_norm": 0.22023423609927847, "learning_rate": 4.2612822131013754e-07, "loss": 0.0431, "step": 5434 }, { "epoch": 2.829255596043727, "grad_norm": 0.23148682456065023, "learning_rate": 4.235490414065263e-07, "loss": 0.0442, "step": 5435 }, { "epoch": 2.829776158250911, "grad_norm": 0.21570556980256503, "learning_rate": 4.2097762388981775e-07, "loss": 0.0404, "step": 5436 }, { "epoch": 2.8302967204580947, "grad_norm": 0.2261675056659339, "learning_rate": 4.1841396957218446e-07, "loss": 0.0434, "step": 5437 }, { "epoch": 2.8308172826652784, "grad_norm": 0.23651027031881064, "learning_rate": 4.158580792633482e-07, "loss": 0.0459, "step": 5438 }, { "epoch": 2.831337844872462, "grad_norm": 0.22013855746990973, "learning_rate": 4.1330995377057703e-07, "loss": 0.0426, "step": 5439 }, { "epoch": 2.831858407079646, "grad_norm": 0.21970369162169295, "learning_rate": 4.107695938986883e-07, "loss": 0.0412, "step": 5440 }, { "epoch": 2.8323789692868298, "grad_norm": 0.23453520708614153, "learning_rate": 4.0823700045004854e-07, "loss": 0.0449, "step": 5441 }, { "epoch": 2.8328995314940135, "grad_norm": 0.22928185978871454, "learning_rate": 4.057121742245651e-07, "loss": 0.0428, "step": 5442 }, { "epoch": 2.8334200937011973, "grad_norm": 0.2378862033367092, "learning_rate": 4.031951160196945e-07, "loss": 0.0438, "step": 5443 }, { "epoch": 2.833940655908381, "grad_norm": 0.22721678410714027, "learning_rate": 4.0068582663044527e-07, "loss": 0.0452, "step": 5444 }, { "epoch": 2.834461218115565, "grad_norm": 0.22411659172958168, "learning_rate": 3.98184306849364e-07, "loss": 0.0434, "step": 5445 }, { "epoch": 2.8349817803227486, "grad_norm": 0.22502599203444407, "learning_rate": 3.9569055746654927e-07, "loss": 0.0441, "step": 5446 }, { "epoch": 2.8355023425299324, "grad_norm": 0.22506720047885997, "learning_rate": 3.9320457926964313e-07, "loss": 0.0434, "step": 5447 }, { "epoch": 2.836022904737116, "grad_norm": 0.2329833809778821, "learning_rate": 3.9072637304383155e-07, "loss": 0.0429, "step": 5448 }, { "epoch": 2.8365434669443, "grad_norm": 0.22427732647548107, "learning_rate": 3.882559395718466e-07, "loss": 0.0416, "step": 5449 }, { "epoch": 2.8370640291514837, "grad_norm": 0.22799511119215582, "learning_rate": 3.857932796339697e-07, "loss": 0.0434, "step": 5450 }, { "epoch": 2.8375845913586675, "grad_norm": 0.22666653298135034, "learning_rate": 3.833383940080232e-07, "loss": 0.0431, "step": 5451 }, { "epoch": 2.8381051535658512, "grad_norm": 0.22778792498675723, "learning_rate": 3.808912834693701e-07, "loss": 0.0424, "step": 5452 }, { "epoch": 2.838625715773035, "grad_norm": 0.22353151517718023, "learning_rate": 3.784519487909255e-07, "loss": 0.0421, "step": 5453 }, { "epoch": 2.839146277980219, "grad_norm": 0.22576570564471596, "learning_rate": 3.7602039074314254e-07, "loss": 0.0432, "step": 5454 }, { "epoch": 2.8396668401874026, "grad_norm": 0.21923761704451078, "learning_rate": 3.7359661009402356e-07, "loss": 0.0408, "step": 5455 }, { "epoch": 2.8401874023945863, "grad_norm": 0.22437305305796867, "learning_rate": 3.71180607609109e-07, "loss": 0.0436, "step": 5456 }, { "epoch": 2.84070796460177, "grad_norm": 0.22396548451251694, "learning_rate": 3.687723840514828e-07, "loss": 0.043, "step": 5457 }, { "epoch": 2.841228526808954, "grad_norm": 0.22949152227259195, "learning_rate": 3.6637194018177556e-07, "loss": 0.0431, "step": 5458 }, { "epoch": 2.8417490890161377, "grad_norm": 0.23054445515120947, "learning_rate": 3.63979276758164e-07, "loss": 0.0438, "step": 5459 }, { "epoch": 2.8422696512233214, "grad_norm": 0.23505182113223505, "learning_rate": 3.6159439453635757e-07, "loss": 0.0446, "step": 5460 }, { "epoch": 2.842790213430505, "grad_norm": 0.2304779734911683, "learning_rate": 3.5921729426961206e-07, "loss": 0.044, "step": 5461 }, { "epoch": 2.843310775637689, "grad_norm": 0.22385791465509375, "learning_rate": 3.568479767087296e-07, "loss": 0.0431, "step": 5462 }, { "epoch": 2.8438313378448723, "grad_norm": 0.2264655835605014, "learning_rate": 3.544864426020478e-07, "loss": 0.0442, "step": 5463 }, { "epoch": 2.844351900052056, "grad_norm": 0.22304778819677265, "learning_rate": 3.521326926954532e-07, "loss": 0.044, "step": 5464 }, { "epoch": 2.84487246225924, "grad_norm": 0.21800081748182232, "learning_rate": 3.497867277323652e-07, "loss": 0.0424, "step": 5465 }, { "epoch": 2.8453930244664236, "grad_norm": 0.23473879191053773, "learning_rate": 3.474485484537521e-07, "loss": 0.045, "step": 5466 }, { "epoch": 2.8459135866736074, "grad_norm": 0.23264235355245735, "learning_rate": 3.451181555981148e-07, "loss": 0.043, "step": 5467 }, { "epoch": 2.846434148880791, "grad_norm": 0.22835863639242554, "learning_rate": 3.42795549901509e-07, "loss": 0.0433, "step": 5468 }, { "epoch": 2.846954711087975, "grad_norm": 0.2296814079782182, "learning_rate": 3.4048073209751175e-07, "loss": 0.044, "step": 5469 }, { "epoch": 2.8474752732951587, "grad_norm": 0.22896223448226471, "learning_rate": 3.381737029172577e-07, "loss": 0.043, "step": 5470 }, { "epoch": 2.8479958355023425, "grad_norm": 0.2229011394279904, "learning_rate": 3.358744630894084e-07, "loss": 0.0433, "step": 5471 }, { "epoch": 2.8485163977095262, "grad_norm": 0.22816466058853094, "learning_rate": 3.335830133401746e-07, "loss": 0.0434, "step": 5472 }, { "epoch": 2.84903695991671, "grad_norm": 0.23157385199630254, "learning_rate": 3.3129935439329963e-07, "loss": 0.0439, "step": 5473 }, { "epoch": 2.849557522123894, "grad_norm": 0.22032768627149715, "learning_rate": 3.290234869700731e-07, "loss": 0.0424, "step": 5474 }, { "epoch": 2.8500780843310776, "grad_norm": 0.23190042597170488, "learning_rate": 3.2675541178931456e-07, "loss": 0.044, "step": 5475 }, { "epoch": 2.8505986465382613, "grad_norm": 0.23176287027892795, "learning_rate": 3.244951295673926e-07, "loss": 0.0429, "step": 5476 }, { "epoch": 2.851119208745445, "grad_norm": 0.2242543810542042, "learning_rate": 3.222426410182111e-07, "loss": 0.0421, "step": 5477 }, { "epoch": 2.851639770952629, "grad_norm": 0.2206514598162686, "learning_rate": 3.199979468532038e-07, "loss": 0.0426, "step": 5478 }, { "epoch": 2.8521603331598127, "grad_norm": 0.23274276038647804, "learning_rate": 3.1776104778135364e-07, "loss": 0.0435, "step": 5479 }, { "epoch": 2.8526808953669964, "grad_norm": 0.22635998598505638, "learning_rate": 3.155319445091787e-07, "loss": 0.043, "step": 5480 }, { "epoch": 2.85320145757418, "grad_norm": 0.2324227117826993, "learning_rate": 3.1331063774072965e-07, "loss": 0.044, "step": 5481 }, { "epoch": 2.853722019781364, "grad_norm": 0.22289276085299753, "learning_rate": 3.1109712817760374e-07, "loss": 0.0431, "step": 5482 }, { "epoch": 2.8542425819885477, "grad_norm": 0.2306524725426029, "learning_rate": 3.0889141651892495e-07, "loss": 0.0439, "step": 5483 }, { "epoch": 2.8547631441957315, "grad_norm": 0.23473790460862667, "learning_rate": 3.0669350346136106e-07, "loss": 0.0446, "step": 5484 }, { "epoch": 2.8552837064029153, "grad_norm": 0.2322283298990947, "learning_rate": 3.045033896991178e-07, "loss": 0.044, "step": 5485 }, { "epoch": 2.855804268610099, "grad_norm": 0.23255630154350312, "learning_rate": 3.0232107592393364e-07, "loss": 0.0436, "step": 5486 }, { "epoch": 2.8563248308172824, "grad_norm": 0.22101058262042608, "learning_rate": 3.001465628250849e-07, "loss": 0.0414, "step": 5487 }, { "epoch": 2.856845393024466, "grad_norm": 0.22455476284506215, "learning_rate": 2.979798510893833e-07, "loss": 0.0418, "step": 5488 }, { "epoch": 2.85736595523165, "grad_norm": 0.2183840949547814, "learning_rate": 2.958209414011759e-07, "loss": 0.042, "step": 5489 }, { "epoch": 2.8578865174388337, "grad_norm": 0.2277491006600283, "learning_rate": 2.936698344423505e-07, "loss": 0.0439, "step": 5490 }, { "epoch": 2.8584070796460175, "grad_norm": 0.22275283037062169, "learning_rate": 2.9152653089232764e-07, "loss": 0.0412, "step": 5491 }, { "epoch": 2.8589276418532013, "grad_norm": 0.23345487136636467, "learning_rate": 2.8939103142805457e-07, "loss": 0.0423, "step": 5492 }, { "epoch": 2.859448204060385, "grad_norm": 0.2319513079584263, "learning_rate": 2.8726333672402796e-07, "loss": 0.0446, "step": 5493 }, { "epoch": 2.859968766267569, "grad_norm": 0.22778088078659994, "learning_rate": 2.8514344745227126e-07, "loss": 0.0428, "step": 5494 }, { "epoch": 2.8604893284747526, "grad_norm": 0.2332741422674495, "learning_rate": 2.8303136428234624e-07, "loss": 0.0434, "step": 5495 }, { "epoch": 2.8610098906819363, "grad_norm": 0.22627378712005314, "learning_rate": 2.809270878813441e-07, "loss": 0.0442, "step": 5496 }, { "epoch": 2.86153045288912, "grad_norm": 0.22239061683349978, "learning_rate": 2.788306189138945e-07, "loss": 0.0428, "step": 5497 }, { "epoch": 2.862051015096304, "grad_norm": 0.2248198343709385, "learning_rate": 2.767419580421593e-07, "loss": 0.0429, "step": 5498 }, { "epoch": 2.8625715773034877, "grad_norm": 0.2304909297159804, "learning_rate": 2.7466110592583585e-07, "loss": 0.0413, "step": 5499 }, { "epoch": 2.8630921395106714, "grad_norm": 0.23465634470891614, "learning_rate": 2.725880632221511e-07, "loss": 0.0442, "step": 5500 }, { "epoch": 2.863612701717855, "grad_norm": 0.22387193157763116, "learning_rate": 2.705228305858731e-07, "loss": 0.0418, "step": 5501 }, { "epoch": 2.864133263925039, "grad_norm": 0.23336695279278744, "learning_rate": 2.684654086692939e-07, "loss": 0.0445, "step": 5502 }, { "epoch": 2.8646538261322227, "grad_norm": 0.21957744263463164, "learning_rate": 2.664157981222437e-07, "loss": 0.0435, "step": 5503 }, { "epoch": 2.8651743883394065, "grad_norm": 0.2170786887345366, "learning_rate": 2.6437399959208797e-07, "loss": 0.0421, "step": 5504 }, { "epoch": 2.8656949505465903, "grad_norm": 0.22627011415059628, "learning_rate": 2.6234001372372194e-07, "loss": 0.0428, "step": 5505 }, { "epoch": 2.866215512753774, "grad_norm": 0.2196527927457996, "learning_rate": 2.603138411595707e-07, "loss": 0.0422, "step": 5506 }, { "epoch": 2.866736074960958, "grad_norm": 0.2273020912046598, "learning_rate": 2.5829548253959445e-07, "loss": 0.043, "step": 5507 }, { "epoch": 2.8672566371681416, "grad_norm": 0.22215073225966223, "learning_rate": 2.5628493850128334e-07, "loss": 0.0425, "step": 5508 }, { "epoch": 2.8677771993753254, "grad_norm": 0.21964426631376047, "learning_rate": 2.5428220967965986e-07, "loss": 0.0419, "step": 5509 }, { "epoch": 2.868297761582509, "grad_norm": 0.23320229447875457, "learning_rate": 2.5228729670728477e-07, "loss": 0.0438, "step": 5510 }, { "epoch": 2.868818323789693, "grad_norm": 0.22196795916795412, "learning_rate": 2.503002002142374e-07, "loss": 0.0422, "step": 5511 }, { "epoch": 2.8693388859968767, "grad_norm": 0.22835206156602514, "learning_rate": 2.483209208281406e-07, "loss": 0.0431, "step": 5512 }, { "epoch": 2.8698594482040605, "grad_norm": 0.22386662881380173, "learning_rate": 2.4634945917414164e-07, "loss": 0.0421, "step": 5513 }, { "epoch": 2.8703800104112442, "grad_norm": 0.2352898314338022, "learning_rate": 2.4438581587491737e-07, "loss": 0.0445, "step": 5514 }, { "epoch": 2.870900572618428, "grad_norm": 0.23787340624984157, "learning_rate": 2.424299915506828e-07, "loss": 0.0447, "step": 5515 }, { "epoch": 2.871421134825612, "grad_norm": 0.2205758396912121, "learning_rate": 2.4048198681917154e-07, "loss": 0.0432, "step": 5516 }, { "epoch": 2.8719416970327956, "grad_norm": 0.21752952294602812, "learning_rate": 2.3854180229565816e-07, "loss": 0.0416, "step": 5517 }, { "epoch": 2.8724622592399793, "grad_norm": 0.22801262969470634, "learning_rate": 2.3660943859294127e-07, "loss": 0.0438, "step": 5518 }, { "epoch": 2.872982821447163, "grad_norm": 0.23350404962787336, "learning_rate": 2.3468489632135772e-07, "loss": 0.044, "step": 5519 }, { "epoch": 2.873503383654347, "grad_norm": 0.23598923493996454, "learning_rate": 2.3276817608875734e-07, "loss": 0.0442, "step": 5520 }, { "epoch": 2.8740239458615306, "grad_norm": 0.2246003814846989, "learning_rate": 2.3085927850053646e-07, "loss": 0.0423, "step": 5521 }, { "epoch": 2.8745445080687144, "grad_norm": 0.22646231927768828, "learning_rate": 2.2895820415961e-07, "loss": 0.0444, "step": 5522 }, { "epoch": 2.875065070275898, "grad_norm": 0.22076704915255055, "learning_rate": 2.2706495366643098e-07, "loss": 0.0422, "step": 5523 }, { "epoch": 2.875585632483082, "grad_norm": 0.22233921973664836, "learning_rate": 2.2517952761897387e-07, "loss": 0.0406, "step": 5524 }, { "epoch": 2.8761061946902657, "grad_norm": 0.22865295080762318, "learning_rate": 2.233019266127373e-07, "loss": 0.0421, "step": 5525 }, { "epoch": 2.8766267568974495, "grad_norm": 0.22596505815503942, "learning_rate": 2.2143215124076354e-07, "loss": 0.0429, "step": 5526 }, { "epoch": 2.877147319104633, "grad_norm": 0.22657611067678246, "learning_rate": 2.195702020936108e-07, "loss": 0.0425, "step": 5527 }, { "epoch": 2.8776678813118166, "grad_norm": 0.22817410965615426, "learning_rate": 2.1771607975937246e-07, "loss": 0.0432, "step": 5528 }, { "epoch": 2.8781884435190004, "grad_norm": 0.22318595175463884, "learning_rate": 2.158697848236607e-07, "loss": 0.0425, "step": 5529 }, { "epoch": 2.878709005726184, "grad_norm": 0.23144548762355127, "learning_rate": 2.1403131786962848e-07, "loss": 0.043, "step": 5530 }, { "epoch": 2.879229567933368, "grad_norm": 0.22008731677973759, "learning_rate": 2.1220067947794187e-07, "loss": 0.0423, "step": 5531 }, { "epoch": 2.8797501301405517, "grad_norm": 0.22081632722175687, "learning_rate": 2.1037787022681055e-07, "loss": 0.0411, "step": 5532 }, { "epoch": 2.8802706923477355, "grad_norm": 0.2261541595421086, "learning_rate": 2.0856289069195457e-07, "loss": 0.0435, "step": 5533 }, { "epoch": 2.8807912545549192, "grad_norm": 0.2325478049396348, "learning_rate": 2.0675574144663202e-07, "loss": 0.0455, "step": 5534 }, { "epoch": 2.881311816762103, "grad_norm": 0.22779678676819293, "learning_rate": 2.0495642306162244e-07, "loss": 0.0423, "step": 5535 }, { "epoch": 2.881832378969287, "grad_norm": 0.2708994243214654, "learning_rate": 2.031649361052379e-07, "loss": 0.0435, "step": 5536 }, { "epoch": 2.8823529411764706, "grad_norm": 0.225376654268015, "learning_rate": 2.0138128114331467e-07, "loss": 0.0425, "step": 5537 }, { "epoch": 2.8828735033836543, "grad_norm": 0.21849928271688246, "learning_rate": 1.9960545873920765e-07, "loss": 0.0422, "step": 5538 }, { "epoch": 2.883394065590838, "grad_norm": 0.2240071596809921, "learning_rate": 1.978374694538071e-07, "loss": 0.0446, "step": 5539 }, { "epoch": 2.883914627798022, "grad_norm": 0.21762364229612807, "learning_rate": 1.960773138455302e-07, "loss": 0.0424, "step": 5540 }, { "epoch": 2.8844351900052057, "grad_norm": 0.22194985729963923, "learning_rate": 1.9432499247031e-07, "loss": 0.0419, "step": 5541 }, { "epoch": 2.8849557522123894, "grad_norm": 0.2192694312980397, "learning_rate": 1.925805058816177e-07, "loss": 0.0414, "step": 5542 }, { "epoch": 2.885476314419573, "grad_norm": 0.2458743689393578, "learning_rate": 1.9084385463043475e-07, "loss": 0.0451, "step": 5543 }, { "epoch": 2.885996876626757, "grad_norm": 0.22425721644334612, "learning_rate": 1.891150392652835e-07, "loss": 0.0417, "step": 5544 }, { "epoch": 2.8865174388339407, "grad_norm": 0.2275200585924636, "learning_rate": 1.873940603322022e-07, "loss": 0.0437, "step": 5545 }, { "epoch": 2.8870380010411245, "grad_norm": 0.22808079161183273, "learning_rate": 1.85680918374756e-07, "loss": 0.0427, "step": 5546 }, { "epoch": 2.8875585632483083, "grad_norm": 0.2232387790940575, "learning_rate": 1.8397561393403427e-07, "loss": 0.0443, "step": 5547 }, { "epoch": 2.888079125455492, "grad_norm": 0.22740682941731324, "learning_rate": 1.8227814754865068e-07, "loss": 0.0444, "step": 5548 }, { "epoch": 2.888599687662676, "grad_norm": 0.2304311796701836, "learning_rate": 1.8058851975474577e-07, "loss": 0.0446, "step": 5549 }, { "epoch": 2.8891202498698596, "grad_norm": 0.2347568933658674, "learning_rate": 1.7890673108598433e-07, "loss": 0.0424, "step": 5550 }, { "epoch": 2.889640812077043, "grad_norm": 0.22430118829279044, "learning_rate": 1.7723278207354976e-07, "loss": 0.0433, "step": 5551 }, { "epoch": 2.8901613742842267, "grad_norm": 0.22066156804762518, "learning_rate": 1.7556667324615527e-07, "loss": 0.0417, "step": 5552 }, { "epoch": 2.8906819364914105, "grad_norm": 0.22032601369478175, "learning_rate": 1.7390840513003548e-07, "loss": 0.0415, "step": 5553 }, { "epoch": 2.8912024986985942, "grad_norm": 0.23669903664743355, "learning_rate": 1.722579782489464e-07, "loss": 0.0443, "step": 5554 }, { "epoch": 2.891723060905778, "grad_norm": 0.23261439457929947, "learning_rate": 1.7061539312417108e-07, "loss": 0.0434, "step": 5555 }, { "epoch": 2.892243623112962, "grad_norm": 0.22614962993197088, "learning_rate": 1.689806502745167e-07, "loss": 0.042, "step": 5556 }, { "epoch": 2.8927641853201456, "grad_norm": 0.22566726168640677, "learning_rate": 1.6735375021630916e-07, "loss": 0.0437, "step": 5557 }, { "epoch": 2.8932847475273293, "grad_norm": 0.23008423849366555, "learning_rate": 1.6573469346339576e-07, "loss": 0.0432, "step": 5558 }, { "epoch": 2.893805309734513, "grad_norm": 0.22692870640049476, "learning_rate": 1.6412348052715632e-07, "loss": 0.0448, "step": 5559 }, { "epoch": 2.894325871941697, "grad_norm": 0.22253713385008417, "learning_rate": 1.6252011191648653e-07, "loss": 0.0418, "step": 5560 }, { "epoch": 2.8948464341488807, "grad_norm": 0.24561203289796596, "learning_rate": 1.6092458813779797e-07, "loss": 0.043, "step": 5561 }, { "epoch": 2.8953669963560644, "grad_norm": 0.2272264787299964, "learning_rate": 1.593369096950348e-07, "loss": 0.0422, "step": 5562 }, { "epoch": 2.895887558563248, "grad_norm": 0.21728188819033073, "learning_rate": 1.5775707708966248e-07, "loss": 0.0418, "step": 5563 }, { "epoch": 2.896408120770432, "grad_norm": 0.23004082744114976, "learning_rate": 1.5618509082066246e-07, "loss": 0.0437, "step": 5564 }, { "epoch": 2.8969286829776157, "grad_norm": 0.24070902532286687, "learning_rate": 1.5462095138454314e-07, "loss": 0.043, "step": 5565 }, { "epoch": 2.8974492451847995, "grad_norm": 0.2340610425537053, "learning_rate": 1.5306465927533154e-07, "loss": 0.0439, "step": 5566 }, { "epoch": 2.8979698073919833, "grad_norm": 0.2266359975592615, "learning_rate": 1.515162149845789e-07, "loss": 0.0427, "step": 5567 }, { "epoch": 2.898490369599167, "grad_norm": 0.22807262882625087, "learning_rate": 1.4997561900135238e-07, "loss": 0.0428, "step": 5568 }, { "epoch": 2.899010931806351, "grad_norm": 0.22716325142678023, "learning_rate": 1.4844287181224603e-07, "loss": 0.0439, "step": 5569 }, { "epoch": 2.8995314940135346, "grad_norm": 0.22620114345008535, "learning_rate": 1.469179739013754e-07, "loss": 0.0443, "step": 5570 }, { "epoch": 2.9000520562207184, "grad_norm": 0.22824850058053361, "learning_rate": 1.4540092575036636e-07, "loss": 0.043, "step": 5571 }, { "epoch": 2.900572618427902, "grad_norm": 0.23480974832174725, "learning_rate": 1.4389172783838277e-07, "loss": 0.0467, "step": 5572 }, { "epoch": 2.901093180635086, "grad_norm": 0.22686928685216148, "learning_rate": 1.4239038064209343e-07, "loss": 0.0426, "step": 5573 }, { "epoch": 2.9016137428422697, "grad_norm": 0.2185130750861107, "learning_rate": 1.4089688463569394e-07, "loss": 0.0412, "step": 5574 }, { "epoch": 2.9021343050494535, "grad_norm": 0.22340460528624712, "learning_rate": 1.3941124029090425e-07, "loss": 0.0427, "step": 5575 }, { "epoch": 2.9026548672566372, "grad_norm": 0.24387629963245097, "learning_rate": 1.379334480769545e-07, "loss": 0.046, "step": 5576 }, { "epoch": 2.903175429463821, "grad_norm": 0.22899996540170645, "learning_rate": 1.3646350846060197e-07, "loss": 0.0453, "step": 5577 }, { "epoch": 2.903695991671005, "grad_norm": 0.22580749048473495, "learning_rate": 1.3500142190612797e-07, "loss": 0.043, "step": 5578 }, { "epoch": 2.9042165538781886, "grad_norm": 0.22259129283476642, "learning_rate": 1.335471888753187e-07, "loss": 0.0431, "step": 5579 }, { "epoch": 2.9047371160853723, "grad_norm": 0.2222893681291658, "learning_rate": 1.3210080982749284e-07, "loss": 0.0424, "step": 5580 }, { "epoch": 2.905257678292556, "grad_norm": 0.21864221805966788, "learning_rate": 1.306622852194822e-07, "loss": 0.0413, "step": 5581 }, { "epoch": 2.90577824049974, "grad_norm": 0.2293590339534558, "learning_rate": 1.2923161550564278e-07, "loss": 0.0436, "step": 5582 }, { "epoch": 2.9062988027069236, "grad_norm": 0.228497915949144, "learning_rate": 1.2780880113784365e-07, "loss": 0.0449, "step": 5583 }, { "epoch": 2.9068193649141074, "grad_norm": 0.21641699537000803, "learning_rate": 1.2639384256547816e-07, "loss": 0.0413, "step": 5584 }, { "epoch": 2.907339927121291, "grad_norm": 0.21723573180469424, "learning_rate": 1.2498674023545543e-07, "loss": 0.042, "step": 5585 }, { "epoch": 2.907860489328475, "grad_norm": 0.2299450832808966, "learning_rate": 1.2358749459220332e-07, "loss": 0.0431, "step": 5586 }, { "epoch": 2.9083810515356587, "grad_norm": 0.23308971409684184, "learning_rate": 1.221961060776683e-07, "loss": 0.0439, "step": 5587 }, { "epoch": 2.9089016137428425, "grad_norm": 0.23005044711744224, "learning_rate": 1.2081257513131828e-07, "loss": 0.0447, "step": 5588 }, { "epoch": 2.9094221759500263, "grad_norm": 0.22157757520601382, "learning_rate": 1.1943690219013148e-07, "loss": 0.0424, "step": 5589 }, { "epoch": 2.90994273815721, "grad_norm": 0.22673099799671645, "learning_rate": 1.1806908768861314e-07, "loss": 0.0427, "step": 5590 }, { "epoch": 2.9104633003643934, "grad_norm": 0.22789349189024055, "learning_rate": 1.167091320587843e-07, "loss": 0.0433, "step": 5591 }, { "epoch": 2.910983862571577, "grad_norm": 0.221483966989294, "learning_rate": 1.1535703573017919e-07, "loss": 0.0421, "step": 5592 }, { "epoch": 2.911504424778761, "grad_norm": 0.22112758106997762, "learning_rate": 1.1401279912985341e-07, "loss": 0.0426, "step": 5593 }, { "epoch": 2.9120249869859447, "grad_norm": 0.22109349725590008, "learning_rate": 1.1267642268238121e-07, "loss": 0.0426, "step": 5594 }, { "epoch": 2.9125455491931285, "grad_norm": 0.22080664506069955, "learning_rate": 1.1134790680984996e-07, "loss": 0.0418, "step": 5595 }, { "epoch": 2.9130661114003122, "grad_norm": 0.22213280394837107, "learning_rate": 1.1002725193186847e-07, "loss": 0.0414, "step": 5596 }, { "epoch": 2.913586673607496, "grad_norm": 0.230005852903613, "learning_rate": 1.0871445846555861e-07, "loss": 0.0451, "step": 5597 }, { "epoch": 2.91410723581468, "grad_norm": 0.22684379339260194, "learning_rate": 1.0740952682556371e-07, "loss": 0.0439, "step": 5598 }, { "epoch": 2.9146277980218636, "grad_norm": 0.2309550050238035, "learning_rate": 1.0611245742404297e-07, "loss": 0.0443, "step": 5599 }, { "epoch": 2.9151483602290473, "grad_norm": 0.22694425419899925, "learning_rate": 1.0482325067066868e-07, "loss": 0.0429, "step": 5600 }, { "epoch": 2.915668922436231, "grad_norm": 0.2355942815015955, "learning_rate": 1.0354190697263454e-07, "loss": 0.0416, "step": 5601 }, { "epoch": 2.916189484643415, "grad_norm": 0.22915067675452885, "learning_rate": 1.0226842673464742e-07, "loss": 0.0431, "step": 5602 }, { "epoch": 2.9167100468505986, "grad_norm": 0.22859882687372848, "learning_rate": 1.0100281035893277e-07, "loss": 0.0421, "step": 5603 }, { "epoch": 2.9172306090577824, "grad_norm": 0.22612284310863184, "learning_rate": 9.97450582452264e-08, "loss": 0.0432, "step": 5604 }, { "epoch": 2.917751171264966, "grad_norm": 0.2371748286947038, "learning_rate": 9.849517079079384e-08, "loss": 0.0442, "step": 5605 }, { "epoch": 2.91827173347215, "grad_norm": 0.23429013057909479, "learning_rate": 9.725314839039989e-08, "loss": 0.0456, "step": 5606 }, { "epoch": 2.9187922956793337, "grad_norm": 0.2252893473582409, "learning_rate": 9.60189914363363e-08, "loss": 0.0435, "step": 5607 }, { "epoch": 2.9193128578865175, "grad_norm": 0.22061594822107208, "learning_rate": 9.479270031840514e-08, "loss": 0.0426, "step": 5608 }, { "epoch": 2.9198334200937013, "grad_norm": 0.22394052877244155, "learning_rate": 9.357427542392716e-08, "loss": 0.0443, "step": 5609 }, { "epoch": 2.920353982300885, "grad_norm": 0.22924733216319026, "learning_rate": 9.236371713774172e-08, "loss": 0.0432, "step": 5610 }, { "epoch": 2.920874544508069, "grad_norm": 0.22008904881055802, "learning_rate": 9.116102584219299e-08, "loss": 0.0411, "step": 5611 }, { "epoch": 2.9213951067152526, "grad_norm": 0.23123570570029328, "learning_rate": 8.996620191714933e-08, "loss": 0.0448, "step": 5612 }, { "epoch": 2.9219156689224364, "grad_norm": 0.22502509485305863, "learning_rate": 8.877924573999496e-08, "loss": 0.0435, "step": 5613 }, { "epoch": 2.92243623112962, "grad_norm": 0.22614275559557945, "learning_rate": 8.760015768562169e-08, "loss": 0.0431, "step": 5614 }, { "epoch": 2.9229567933368035, "grad_norm": 0.23309402951892166, "learning_rate": 8.642893812644271e-08, "loss": 0.043, "step": 5615 }, { "epoch": 2.9234773555439872, "grad_norm": 0.23455703296115385, "learning_rate": 8.526558743238156e-08, "loss": 0.0459, "step": 5616 }, { "epoch": 2.923997917751171, "grad_norm": 0.22213718683676725, "learning_rate": 8.411010597088043e-08, "loss": 0.0411, "step": 5617 }, { "epoch": 2.924518479958355, "grad_norm": 0.2309034800189144, "learning_rate": 8.296249410689461e-08, "loss": 0.0446, "step": 5618 }, { "epoch": 2.9250390421655386, "grad_norm": 0.22884832459468085, "learning_rate": 8.182275220288971e-08, "loss": 0.0437, "step": 5619 }, { "epoch": 2.9255596043727223, "grad_norm": 0.22329303957735033, "learning_rate": 8.069088061885277e-08, "loss": 0.0421, "step": 5620 }, { "epoch": 2.926080166579906, "grad_norm": 0.22435831062356618, "learning_rate": 7.956687971228116e-08, "loss": 0.0445, "step": 5621 }, { "epoch": 2.92660072878709, "grad_norm": 0.22004471253385186, "learning_rate": 7.845074983818534e-08, "loss": 0.0423, "step": 5622 }, { "epoch": 2.9271212909942737, "grad_norm": 0.2281975415710203, "learning_rate": 7.734249134909166e-08, "loss": 0.0443, "step": 5623 }, { "epoch": 2.9276418532014574, "grad_norm": 0.2347633299429142, "learning_rate": 7.624210459504233e-08, "loss": 0.0468, "step": 5624 }, { "epoch": 2.928162415408641, "grad_norm": 0.2244782906812353, "learning_rate": 7.514958992358434e-08, "loss": 0.0429, "step": 5625 }, { "epoch": 2.928682977615825, "grad_norm": 0.22082459228471335, "learning_rate": 7.406494767979167e-08, "loss": 0.0404, "step": 5626 }, { "epoch": 2.9292035398230087, "grad_norm": 0.23891153241426735, "learning_rate": 7.298817820623749e-08, "loss": 0.0452, "step": 5627 }, { "epoch": 2.9297241020301925, "grad_norm": 0.2285958330718903, "learning_rate": 7.191928184302477e-08, "loss": 0.0424, "step": 5628 }, { "epoch": 2.9302446642373763, "grad_norm": 0.21899788706540083, "learning_rate": 7.085825892775288e-08, "loss": 0.0439, "step": 5629 }, { "epoch": 2.93076522644456, "grad_norm": 0.22669475507459996, "learning_rate": 6.980510979554545e-08, "loss": 0.0419, "step": 5630 }, { "epoch": 2.931285788651744, "grad_norm": 0.23827004869207266, "learning_rate": 6.875983477903635e-08, "loss": 0.0448, "step": 5631 }, { "epoch": 2.9318063508589276, "grad_norm": 0.23319352811948646, "learning_rate": 6.772243420836988e-08, "loss": 0.0425, "step": 5632 }, { "epoch": 2.9323269130661114, "grad_norm": 0.21762653350885441, "learning_rate": 6.66929084112089e-08, "loss": 0.0409, "step": 5633 }, { "epoch": 2.932847475273295, "grad_norm": 0.2387237574668359, "learning_rate": 6.567125771272387e-08, "loss": 0.0452, "step": 5634 }, { "epoch": 2.933368037480479, "grad_norm": 0.23525862361486116, "learning_rate": 6.465748243559556e-08, "loss": 0.0429, "step": 5635 }, { "epoch": 2.9338885996876627, "grad_norm": 0.23174264458375252, "learning_rate": 6.365158290002893e-08, "loss": 0.0442, "step": 5636 }, { "epoch": 2.9344091618948465, "grad_norm": 0.2253382202261745, "learning_rate": 6.26535594237282e-08, "loss": 0.0426, "step": 5637 }, { "epoch": 2.9349297241020302, "grad_norm": 0.23139331028310633, "learning_rate": 6.166341232191896e-08, "loss": 0.0437, "step": 5638 }, { "epoch": 2.935450286309214, "grad_norm": 0.22693372243517235, "learning_rate": 6.068114190733442e-08, "loss": 0.0429, "step": 5639 }, { "epoch": 2.935970848516398, "grad_norm": 0.2286453647937163, "learning_rate": 5.970674849022084e-08, "loss": 0.0427, "step": 5640 }, { "epoch": 2.9364914107235816, "grad_norm": 0.21710776971071516, "learning_rate": 5.874023237833759e-08, "loss": 0.0422, "step": 5641 }, { "epoch": 2.9370119729307653, "grad_norm": 0.22418298046187546, "learning_rate": 5.7781593876954366e-08, "loss": 0.0432, "step": 5642 }, { "epoch": 2.937532535137949, "grad_norm": 0.2227493313624406, "learning_rate": 5.683083328885397e-08, "loss": 0.0413, "step": 5643 }, { "epoch": 2.938053097345133, "grad_norm": 0.22433420679943167, "learning_rate": 5.58879509143323e-08, "loss": 0.0422, "step": 5644 }, { "epoch": 2.9385736595523166, "grad_norm": 0.2201777114475942, "learning_rate": 5.495294705119558e-08, "loss": 0.0427, "step": 5645 }, { "epoch": 2.9390942217595004, "grad_norm": 0.2272931098330437, "learning_rate": 5.402582199476036e-08, "loss": 0.0427, "step": 5646 }, { "epoch": 2.939614783966684, "grad_norm": 0.21987207104156062, "learning_rate": 5.3106576037856295e-08, "loss": 0.0416, "step": 5647 }, { "epoch": 2.940135346173868, "grad_norm": 0.23629433671926608, "learning_rate": 5.2195209470823345e-08, "loss": 0.0434, "step": 5648 }, { "epoch": 2.9406559083810517, "grad_norm": 0.226357025922961, "learning_rate": 5.1291722581511826e-08, "loss": 0.0437, "step": 5649 }, { "epoch": 2.9411764705882355, "grad_norm": 0.22887056354520308, "learning_rate": 5.039611565529068e-08, "loss": 0.0424, "step": 5650 }, { "epoch": 2.9416970327954193, "grad_norm": 0.22765259780951763, "learning_rate": 4.950838897503085e-08, "loss": 0.0434, "step": 5651 }, { "epoch": 2.942217595002603, "grad_norm": 0.22353601786311908, "learning_rate": 4.86285428211164e-08, "loss": 0.0419, "step": 5652 }, { "epoch": 2.942738157209787, "grad_norm": 0.22628985062359203, "learning_rate": 4.7756577471444466e-08, "loss": 0.0427, "step": 5653 }, { "epoch": 2.9432587194169706, "grad_norm": 0.2334402923764293, "learning_rate": 4.6892493201422525e-08, "loss": 0.0443, "step": 5654 }, { "epoch": 2.943779281624154, "grad_norm": 0.2213843214467801, "learning_rate": 4.603629028397116e-08, "loss": 0.0411, "step": 5655 }, { "epoch": 2.9442998438313377, "grad_norm": 0.22400947744966332, "learning_rate": 4.518796898951572e-08, "loss": 0.0424, "step": 5656 }, { "epoch": 2.9448204060385215, "grad_norm": 0.2205563956277841, "learning_rate": 4.434752958599464e-08, "loss": 0.0419, "step": 5657 }, { "epoch": 2.9453409682457052, "grad_norm": 0.2218373906484088, "learning_rate": 4.351497233886226e-08, "loss": 0.0423, "step": 5658 }, { "epoch": 2.945861530452889, "grad_norm": 0.2257691605384441, "learning_rate": 4.26902975110749e-08, "loss": 0.0434, "step": 5659 }, { "epoch": 2.946382092660073, "grad_norm": 0.21762331110270125, "learning_rate": 4.1873505363104746e-08, "loss": 0.0414, "step": 5660 }, { "epoch": 2.9469026548672566, "grad_norm": 0.2270394287344173, "learning_rate": 4.106459615293434e-08, "loss": 0.0434, "step": 5661 }, { "epoch": 2.9474232170744403, "grad_norm": 0.22235806351067547, "learning_rate": 4.026357013605098e-08, "loss": 0.0432, "step": 5662 }, { "epoch": 2.947943779281624, "grad_norm": 0.20548692652481407, "learning_rate": 3.9470427565460596e-08, "loss": 0.0413, "step": 5663 }, { "epoch": 2.948464341488808, "grad_norm": 0.2279626929922583, "learning_rate": 3.8685168691671156e-08, "loss": 0.042, "step": 5664 }, { "epoch": 2.9489849036959916, "grad_norm": 0.2257806223440966, "learning_rate": 3.7907793762703705e-08, "loss": 0.0418, "step": 5665 }, { "epoch": 2.9495054659031754, "grad_norm": 0.2270748445679294, "learning_rate": 3.713830302409238e-08, "loss": 0.0433, "step": 5666 }, { "epoch": 2.950026028110359, "grad_norm": 0.22232084572422514, "learning_rate": 3.637669671887611e-08, "loss": 0.0418, "step": 5667 }, { "epoch": 2.950546590317543, "grad_norm": 0.22207686160720008, "learning_rate": 3.562297508760415e-08, "loss": 0.042, "step": 5668 }, { "epoch": 2.9510671525247267, "grad_norm": 0.2335518580519873, "learning_rate": 3.4877138368341614e-08, "loss": 0.0447, "step": 5669 }, { "epoch": 2.9515877147319105, "grad_norm": 0.23000775212512042, "learning_rate": 3.413918679665284e-08, "loss": 0.0434, "step": 5670 }, { "epoch": 2.9521082769390943, "grad_norm": 0.2200961120523701, "learning_rate": 3.3409120605623624e-08, "loss": 0.0416, "step": 5671 }, { "epoch": 2.952628839146278, "grad_norm": 0.2211023507432535, "learning_rate": 3.268694002583617e-08, "loss": 0.0421, "step": 5672 }, { "epoch": 2.953149401353462, "grad_norm": 0.23087187119179148, "learning_rate": 3.197264528539135e-08, "loss": 0.0447, "step": 5673 }, { "epoch": 2.9536699635606456, "grad_norm": 0.22974934193368118, "learning_rate": 3.1266236609900356e-08, "loss": 0.0436, "step": 5674 }, { "epoch": 2.9541905257678294, "grad_norm": 0.23183069263185715, "learning_rate": 3.056771422247362e-08, "loss": 0.0428, "step": 5675 }, { "epoch": 2.954711087975013, "grad_norm": 0.22537547711234884, "learning_rate": 2.9877078343740206e-08, "loss": 0.0435, "step": 5676 }, { "epoch": 2.955231650182197, "grad_norm": 0.22654670232931923, "learning_rate": 2.919432919183396e-08, "loss": 0.042, "step": 5677 }, { "epoch": 2.9557522123893807, "grad_norm": 0.22542875112228397, "learning_rate": 2.851946698240182e-08, "loss": 0.0431, "step": 5678 }, { "epoch": 2.956272774596564, "grad_norm": 0.22216150114544114, "learning_rate": 2.7852491928595514e-08, "loss": 0.0407, "step": 5679 }, { "epoch": 2.956793336803748, "grad_norm": 0.22284731710279135, "learning_rate": 2.7193404241074306e-08, "loss": 0.0428, "step": 5680 }, { "epoch": 2.9573138990109316, "grad_norm": 0.22243490757474335, "learning_rate": 2.654220412801056e-08, "loss": 0.0416, "step": 5681 }, { "epoch": 2.9578344612181153, "grad_norm": 0.22588973540324708, "learning_rate": 2.5898891795084202e-08, "loss": 0.0429, "step": 5682 }, { "epoch": 2.958355023425299, "grad_norm": 0.2378710459634585, "learning_rate": 2.5263467445479915e-08, "loss": 0.0448, "step": 5683 }, { "epoch": 2.958875585632483, "grad_norm": 0.23245001982292812, "learning_rate": 2.4635931279898273e-08, "loss": 0.043, "step": 5684 }, { "epoch": 2.9593961478396666, "grad_norm": 0.2277675922701488, "learning_rate": 2.4016283496544613e-08, "loss": 0.043, "step": 5685 }, { "epoch": 2.9599167100468504, "grad_norm": 0.21835077756145146, "learning_rate": 2.340452429112905e-08, "loss": 0.0416, "step": 5686 }, { "epoch": 2.960437272254034, "grad_norm": 0.20768373110222885, "learning_rate": 2.2800653856874797e-08, "loss": 0.0403, "step": 5687 }, { "epoch": 2.960957834461218, "grad_norm": 0.22352062317713317, "learning_rate": 2.2204672384512625e-08, "loss": 0.0415, "step": 5688 }, { "epoch": 2.9614783966684017, "grad_norm": 0.22503022396452138, "learning_rate": 2.161658006228362e-08, "loss": 0.0422, "step": 5689 }, { "epoch": 2.9619989588755855, "grad_norm": 0.22166037598844507, "learning_rate": 2.1036377075930867e-08, "loss": 0.0415, "step": 5690 }, { "epoch": 2.9625195210827693, "grad_norm": 0.22650211899906575, "learning_rate": 2.046406360871056e-08, "loss": 0.0425, "step": 5691 }, { "epoch": 2.963040083289953, "grad_norm": 0.22401814975398185, "learning_rate": 1.989963984138643e-08, "loss": 0.0444, "step": 5692 }, { "epoch": 2.963560645497137, "grad_norm": 0.22589420895588494, "learning_rate": 1.9343105952229768e-08, "loss": 0.0425, "step": 5693 }, { "epoch": 2.9640812077043206, "grad_norm": 0.2303703387090916, "learning_rate": 1.8794462117022182e-08, "loss": 0.0437, "step": 5694 }, { "epoch": 2.9646017699115044, "grad_norm": 0.22853971241469895, "learning_rate": 1.8253708509047285e-08, "loss": 0.0427, "step": 5695 }, { "epoch": 2.965122332118688, "grad_norm": 0.22039445211667255, "learning_rate": 1.7720845299101783e-08, "loss": 0.0427, "step": 5696 }, { "epoch": 2.965642894325872, "grad_norm": 0.2267933732428841, "learning_rate": 1.7195872655487166e-08, "loss": 0.0439, "step": 5697 }, { "epoch": 2.9661634565330557, "grad_norm": 0.22063737426257915, "learning_rate": 1.6678790744015238e-08, "loss": 0.0427, "step": 5698 }, { "epoch": 2.9666840187402395, "grad_norm": 0.22278000386951152, "learning_rate": 1.616959972800536e-08, "loss": 0.0429, "step": 5699 }, { "epoch": 2.9672045809474232, "grad_norm": 0.2207802651418027, "learning_rate": 1.5668299768284434e-08, "loss": 0.0415, "step": 5700 }, { "epoch": 2.967725143154607, "grad_norm": 0.22385079884702969, "learning_rate": 1.5174891023184146e-08, "loss": 0.044, "step": 5701 }, { "epoch": 2.9682457053617908, "grad_norm": 0.2247189896311398, "learning_rate": 1.4689373648549277e-08, "loss": 0.0434, "step": 5702 }, { "epoch": 2.9687662675689745, "grad_norm": 0.2352896279762599, "learning_rate": 1.421174779772383e-08, "loss": 0.0438, "step": 5703 }, { "epoch": 2.9692868297761583, "grad_norm": 0.21791843470524855, "learning_rate": 1.3742013621564909e-08, "loss": 0.0411, "step": 5704 }, { "epoch": 2.969807391983342, "grad_norm": 0.21979737042742506, "learning_rate": 1.3280171268442721e-08, "loss": 0.0421, "step": 5705 }, { "epoch": 2.970327954190526, "grad_norm": 0.22711419377584297, "learning_rate": 1.282622088422114e-08, "loss": 0.0422, "step": 5706 }, { "epoch": 2.9708485163977096, "grad_norm": 0.2214357911141881, "learning_rate": 1.2380162612282697e-08, "loss": 0.0423, "step": 5707 }, { "epoch": 2.9713690786048934, "grad_norm": 0.23079498217049907, "learning_rate": 1.1941996593514693e-08, "loss": 0.044, "step": 5708 }, { "epoch": 2.971889640812077, "grad_norm": 0.2328227144923505, "learning_rate": 1.1511722966306426e-08, "loss": 0.0442, "step": 5709 }, { "epoch": 2.972410203019261, "grad_norm": 0.23439895371290795, "learning_rate": 1.10893418665603e-08, "loss": 0.043, "step": 5710 }, { "epoch": 2.9729307652264447, "grad_norm": 0.22428047997601785, "learning_rate": 1.0674853427683484e-08, "loss": 0.0444, "step": 5711 }, { "epoch": 2.9734513274336285, "grad_norm": 0.22842869772809338, "learning_rate": 1.0268257780590707e-08, "loss": 0.0417, "step": 5712 }, { "epoch": 2.9739718896408123, "grad_norm": 0.21987057018297476, "learning_rate": 9.869555053704239e-09, "loss": 0.0413, "step": 5713 }, { "epoch": 2.974492451847996, "grad_norm": 0.2298608691054948, "learning_rate": 9.4787453729539e-09, "loss": 0.0437, "step": 5714 }, { "epoch": 2.97501301405518, "grad_norm": 0.2253370055551647, "learning_rate": 9.095828861771516e-09, "loss": 0.0432, "step": 5715 }, { "epoch": 2.9755335762623636, "grad_norm": 0.2278149249962827, "learning_rate": 8.720805641104779e-09, "loss": 0.0434, "step": 5716 }, { "epoch": 2.9760541384695474, "grad_norm": 0.2314267428074255, "learning_rate": 8.353675829403385e-09, "loss": 0.0443, "step": 5717 }, { "epoch": 2.976574700676731, "grad_norm": 0.22400674704669293, "learning_rate": 7.994439542619025e-09, "loss": 0.0425, "step": 5718 }, { "epoch": 2.9770952628839145, "grad_norm": 0.22281537791883546, "learning_rate": 7.643096894222046e-09, "loss": 0.0427, "step": 5719 }, { "epoch": 2.9776158250910982, "grad_norm": 0.22059637224805248, "learning_rate": 7.299647995176462e-09, "loss": 0.0427, "step": 5720 }, { "epoch": 2.978136387298282, "grad_norm": 0.22754394710380124, "learning_rate": 6.964092953962165e-09, "loss": 0.0436, "step": 5721 }, { "epoch": 2.978656949505466, "grad_norm": 0.23398045942704954, "learning_rate": 6.63643187656382e-09, "loss": 0.0447, "step": 5722 }, { "epoch": 2.9791775117126496, "grad_norm": 0.22256790802648144, "learning_rate": 6.316664866470867e-09, "loss": 0.0434, "step": 5723 }, { "epoch": 2.9796980739198333, "grad_norm": 0.22504707512425381, "learning_rate": 6.004792024680295e-09, "loss": 0.0433, "step": 5724 }, { "epoch": 2.980218636127017, "grad_norm": 0.2172897922400504, "learning_rate": 5.700813449699416e-09, "loss": 0.043, "step": 5725 }, { "epoch": 2.980739198334201, "grad_norm": 0.23460775231227962, "learning_rate": 5.404729237531991e-09, "loss": 0.045, "step": 5726 }, { "epoch": 2.9812597605413846, "grad_norm": 0.2093848412957586, "learning_rate": 5.116539481703208e-09, "loss": 0.0402, "step": 5727 }, { "epoch": 2.9817803227485684, "grad_norm": 0.22035954463083757, "learning_rate": 4.836244273231927e-09, "loss": 0.0423, "step": 5728 }, { "epoch": 2.982300884955752, "grad_norm": 0.24018779005516844, "learning_rate": 4.56384370064733e-09, "loss": 0.0433, "step": 5729 }, { "epoch": 2.982821447162936, "grad_norm": 0.22737460452274955, "learning_rate": 4.299337849991703e-09, "loss": 0.0441, "step": 5730 }, { "epoch": 2.9833420093701197, "grad_norm": 0.219433427420989, "learning_rate": 4.042726804801e-09, "loss": 0.0412, "step": 5731 }, { "epoch": 2.9838625715773035, "grad_norm": 0.22548984426888996, "learning_rate": 3.794010646132606e-09, "loss": 0.0428, "step": 5732 }, { "epoch": 2.9843831337844873, "grad_norm": 0.2231171978367076, "learning_rate": 3.553189452537575e-09, "loss": 0.0435, "step": 5733 }, { "epoch": 2.984903695991671, "grad_norm": 0.2261689918270949, "learning_rate": 3.3202633000772865e-09, "loss": 0.0423, "step": 5734 }, { "epoch": 2.985424258198855, "grad_norm": 0.24047982640447405, "learning_rate": 3.0952322623262197e-09, "loss": 0.0442, "step": 5735 }, { "epoch": 2.9859448204060386, "grad_norm": 0.2198384025823191, "learning_rate": 2.878096410355302e-09, "loss": 0.0421, "step": 5736 }, { "epoch": 2.9864653826132224, "grad_norm": 0.2205868075205334, "learning_rate": 2.6688558127485607e-09, "loss": 0.0414, "step": 5737 }, { "epoch": 2.986985944820406, "grad_norm": 0.23578679799030974, "learning_rate": 2.4675105355920213e-09, "loss": 0.0445, "step": 5738 }, { "epoch": 2.98750650702759, "grad_norm": 0.22520079740469365, "learning_rate": 2.2740606424792587e-09, "loss": 0.0416, "step": 5739 }, { "epoch": 2.9880270692347737, "grad_norm": 0.23337576964018655, "learning_rate": 2.088506194514173e-09, "loss": 0.043, "step": 5740 }, { "epoch": 2.9885476314419575, "grad_norm": 0.23186028702019476, "learning_rate": 1.9108472502998855e-09, "loss": 0.0432, "step": 5741 }, { "epoch": 2.9890681936491412, "grad_norm": 0.2184071173157055, "learning_rate": 1.7410838659498442e-09, "loss": 0.0418, "step": 5742 }, { "epoch": 2.9895887558563246, "grad_norm": 0.2199140316024418, "learning_rate": 1.579216095087821e-09, "loss": 0.0428, "step": 5743 }, { "epoch": 2.9901093180635083, "grad_norm": 0.22353723184238114, "learning_rate": 1.4252439888340353e-09, "loss": 0.0435, "step": 5744 }, { "epoch": 2.990629880270692, "grad_norm": 0.22673756486431215, "learning_rate": 1.2791675958218064e-09, "loss": 0.0439, "step": 5745 }, { "epoch": 2.991150442477876, "grad_norm": 0.22545208764365354, "learning_rate": 1.140986962186452e-09, "loss": 0.0438, "step": 5746 }, { "epoch": 2.9916710046850596, "grad_norm": 0.23688504795341422, "learning_rate": 1.010702131576391e-09, "loss": 0.0442, "step": 5747 }, { "epoch": 2.9921915668922434, "grad_norm": 0.23253053657185024, "learning_rate": 8.883131451392635e-10, "loss": 0.044, "step": 5748 }, { "epoch": 2.992712129099427, "grad_norm": 0.2379163810816669, "learning_rate": 7.738200415302599e-10, "loss": 0.0452, "step": 5749 }, { "epoch": 2.993232691306611, "grad_norm": 0.21509722288903138, "learning_rate": 6.672228569148953e-10, "loss": 0.0425, "step": 5750 }, { "epoch": 2.9937532535137947, "grad_norm": 0.22479935136282975, "learning_rate": 5.685216249579073e-10, "loss": 0.0421, "step": 5751 }, { "epoch": 2.9942738157209785, "grad_norm": 0.2329527890243757, "learning_rate": 4.777163768343585e-10, "loss": 0.0431, "step": 5752 }, { "epoch": 2.9947943779281623, "grad_norm": 0.21818439935194653, "learning_rate": 3.94807141224085e-10, "loss": 0.0418, "step": 5753 }, { "epoch": 2.995314940135346, "grad_norm": 0.23421104271035925, "learning_rate": 3.197939443172482e-10, "loss": 0.0437, "step": 5754 }, { "epoch": 2.99583550234253, "grad_norm": 0.21714292300858615, "learning_rate": 2.526768098060073e-10, "loss": 0.0426, "step": 5755 }, { "epoch": 2.9963560645497136, "grad_norm": 0.23045424040900458, "learning_rate": 1.9345575888451983e-10, "loss": 0.0436, "step": 5756 }, { "epoch": 2.9968766267568974, "grad_norm": 0.22483821530138692, "learning_rate": 1.421308102628194e-10, "loss": 0.0433, "step": 5757 }, { "epoch": 2.997397188964081, "grad_norm": 0.22259646441449232, "learning_rate": 9.870198014738652e-11, "loss": 0.0449, "step": 5758 }, { "epoch": 2.997917751171265, "grad_norm": 0.22244406662721328, "learning_rate": 6.316928225780228e-11, "loss": 0.0437, "step": 5759 }, { "epoch": 2.9984383133784487, "grad_norm": 0.22721923240497144, "learning_rate": 3.553272781842143e-11, "loss": 0.0434, "step": 5760 }, { "epoch": 2.9989588755856325, "grad_norm": 0.22864277231584557, "learning_rate": 1.5792325552821398e-11, "loss": 0.0435, "step": 5761 }, { "epoch": 2.9994794377928162, "grad_norm": 0.22668136835192348, "learning_rate": 3.948081700455575e-12, "loss": 0.0439, "step": 5762 }, { "epoch": 3.0, "grad_norm": 0.21769778535390255, "learning_rate": 0.0, "loss": 0.042, "step": 5763 } ], "logging_steps": 1.0, "max_steps": 5763, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 240, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2413100531712000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }