diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,40374 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 5763, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005205622071837585, + "grad_norm": 6.376532092079771, + "learning_rate": 2.890173410404624e-07, + "loss": 0.5233, + "step": 1 + }, + { + "epoch": 0.001041124414367517, + "grad_norm": 6.4124152514036075, + "learning_rate": 5.780346820809248e-07, + "loss": 0.5204, + "step": 2 + }, + { + "epoch": 0.0015616866215512754, + "grad_norm": 6.458779162822453, + "learning_rate": 8.670520231213873e-07, + "loss": 0.529, + "step": 3 + }, + { + "epoch": 0.002082248828735034, + "grad_norm": 6.285519837284632, + "learning_rate": 1.1560693641618497e-06, + "loss": 0.5214, + "step": 4 + }, + { + "epoch": 0.0026028110359187923, + "grad_norm": 5.796645692951652, + "learning_rate": 1.4450867052023122e-06, + "loss": 0.5185, + "step": 5 + }, + { + "epoch": 0.0031233732431025507, + "grad_norm": 3.9879840763701693, + "learning_rate": 1.7341040462427746e-06, + "loss": 0.4823, + "step": 6 + }, + { + "epoch": 0.003643935450286309, + "grad_norm": 3.6346238149034646, + "learning_rate": 2.023121387283237e-06, + "loss": 0.4531, + "step": 7 + }, + { + "epoch": 0.004164497657470068, + "grad_norm": 3.3536127160309777, + "learning_rate": 2.3121387283236993e-06, + "loss": 0.427, + "step": 8 + }, + { + "epoch": 0.004685059864653826, + "grad_norm": 2.7703984048593324, + "learning_rate": 2.601156069364162e-06, + "loss": 0.3954, + "step": 9 + }, + { + "epoch": 0.0052056220718375845, + "grad_norm": 2.6269808449331644, + "learning_rate": 2.8901734104046244e-06, + "loss": 0.3784, + "step": 10 + }, + { + "epoch": 0.005726184279021343, + "grad_norm": 1.7055430768828044, + "learning_rate": 3.1791907514450866e-06, + "loss": 0.3593, + "step": 11 + }, + { + "epoch": 0.006246746486205101, + "grad_norm": 2.517105495712176, + "learning_rate": 3.468208092485549e-06, + "loss": 0.3521, + "step": 12 + }, + { + "epoch": 0.00676730869338886, + "grad_norm": 3.195572501235165, + "learning_rate": 3.757225433526012e-06, + "loss": 0.3539, + "step": 13 + }, + { + "epoch": 0.007287870900572618, + "grad_norm": 2.323008206835536, + "learning_rate": 4.046242774566474e-06, + "loss": 0.3425, + "step": 14 + }, + { + "epoch": 0.007808433107756377, + "grad_norm": 1.5310630307639206, + "learning_rate": 4.3352601156069365e-06, + "loss": 0.3317, + "step": 15 + }, + { + "epoch": 0.008328995314940135, + "grad_norm": 1.365492849675652, + "learning_rate": 4.624277456647399e-06, + "loss": 0.3148, + "step": 16 + }, + { + "epoch": 0.008849557522123894, + "grad_norm": 1.4095650207655759, + "learning_rate": 4.913294797687862e-06, + "loss": 0.3157, + "step": 17 + }, + { + "epoch": 0.009370119729307652, + "grad_norm": 1.5215580885689424, + "learning_rate": 5.202312138728324e-06, + "loss": 0.2976, + "step": 18 + }, + { + "epoch": 0.00989068193649141, + "grad_norm": 1.2318061379322545, + "learning_rate": 5.491329479768787e-06, + "loss": 0.3052, + "step": 19 + }, + { + "epoch": 0.010411244143675169, + "grad_norm": 1.1045087369383553, + "learning_rate": 5.780346820809249e-06, + "loss": 0.2965, + "step": 20 + }, + { + "epoch": 0.010931806350858927, + "grad_norm": 1.1036987329271282, + "learning_rate": 6.069364161849711e-06, + "loss": 0.2897, + "step": 21 + }, + { + "epoch": 0.011452368558042686, + "grad_norm": 1.0423572850985787, + "learning_rate": 6.358381502890173e-06, + "loss": 0.2845, + "step": 22 + }, + { + "epoch": 0.011972930765226444, + "grad_norm": 0.9845462775484162, + "learning_rate": 6.647398843930635e-06, + "loss": 0.268, + "step": 23 + }, + { + "epoch": 0.012493492972410203, + "grad_norm": 0.9205758761394418, + "learning_rate": 6.936416184971098e-06, + "loss": 0.2728, + "step": 24 + }, + { + "epoch": 0.013014055179593961, + "grad_norm": 0.936483616234927, + "learning_rate": 7.225433526011561e-06, + "loss": 0.2594, + "step": 25 + }, + { + "epoch": 0.01353461738677772, + "grad_norm": 0.8503591857964459, + "learning_rate": 7.514450867052024e-06, + "loss": 0.2628, + "step": 26 + }, + { + "epoch": 0.014055179593961478, + "grad_norm": 0.8672336072786627, + "learning_rate": 7.803468208092486e-06, + "loss": 0.2555, + "step": 27 + }, + { + "epoch": 0.014575741801145237, + "grad_norm": 0.9212778120486387, + "learning_rate": 8.092485549132949e-06, + "loss": 0.2547, + "step": 28 + }, + { + "epoch": 0.015096304008328995, + "grad_norm": 1.0623750925591287, + "learning_rate": 8.38150289017341e-06, + "loss": 0.2518, + "step": 29 + }, + { + "epoch": 0.015616866215512754, + "grad_norm": 1.0990772755341645, + "learning_rate": 8.670520231213873e-06, + "loss": 0.2542, + "step": 30 + }, + { + "epoch": 0.016137428422696512, + "grad_norm": 0.7201508676237249, + "learning_rate": 8.959537572254335e-06, + "loss": 0.2411, + "step": 31 + }, + { + "epoch": 0.01665799062988027, + "grad_norm": 0.8464593786312, + "learning_rate": 9.248554913294797e-06, + "loss": 0.2453, + "step": 32 + }, + { + "epoch": 0.01717855283706403, + "grad_norm": 0.8038877087534331, + "learning_rate": 9.53757225433526e-06, + "loss": 0.2358, + "step": 33 + }, + { + "epoch": 0.017699115044247787, + "grad_norm": 1.0702814348205445, + "learning_rate": 9.826589595375723e-06, + "loss": 0.2375, + "step": 34 + }, + { + "epoch": 0.018219677251431546, + "grad_norm": 1.425305836752578, + "learning_rate": 1.0115606936416185e-05, + "loss": 0.2344, + "step": 35 + }, + { + "epoch": 0.018740239458615304, + "grad_norm": 0.9940426565737253, + "learning_rate": 1.0404624277456647e-05, + "loss": 0.2388, + "step": 36 + }, + { + "epoch": 0.019260801665799063, + "grad_norm": 0.7934472038238055, + "learning_rate": 1.0693641618497111e-05, + "loss": 0.2352, + "step": 37 + }, + { + "epoch": 0.01978136387298282, + "grad_norm": 1.06277101439209, + "learning_rate": 1.0982658959537573e-05, + "loss": 0.2382, + "step": 38 + }, + { + "epoch": 0.02030192608016658, + "grad_norm": 1.2617518893593866, + "learning_rate": 1.1271676300578036e-05, + "loss": 0.2299, + "step": 39 + }, + { + "epoch": 0.020822488287350338, + "grad_norm": 1.0241590169095665, + "learning_rate": 1.1560693641618498e-05, + "loss": 0.2341, + "step": 40 + }, + { + "epoch": 0.021343050494534097, + "grad_norm": 0.7725404532271828, + "learning_rate": 1.184971098265896e-05, + "loss": 0.2361, + "step": 41 + }, + { + "epoch": 0.021863612701717855, + "grad_norm": 0.8376692345275107, + "learning_rate": 1.2138728323699422e-05, + "loss": 0.2362, + "step": 42 + }, + { + "epoch": 0.022384174908901613, + "grad_norm": 1.0428393775559166, + "learning_rate": 1.2427745664739884e-05, + "loss": 0.2338, + "step": 43 + }, + { + "epoch": 0.022904737116085372, + "grad_norm": 0.7784715293287426, + "learning_rate": 1.2716763005780346e-05, + "loss": 0.2316, + "step": 44 + }, + { + "epoch": 0.02342529932326913, + "grad_norm": 0.9048629801576277, + "learning_rate": 1.3005780346820809e-05, + "loss": 0.2253, + "step": 45 + }, + { + "epoch": 0.02394586153045289, + "grad_norm": 1.1048673864810445, + "learning_rate": 1.329479768786127e-05, + "loss": 0.229, + "step": 46 + }, + { + "epoch": 0.024466423737636647, + "grad_norm": 0.7942075601745159, + "learning_rate": 1.3583815028901733e-05, + "loss": 0.2285, + "step": 47 + }, + { + "epoch": 0.024986985944820406, + "grad_norm": 0.9826389811914579, + "learning_rate": 1.3872832369942197e-05, + "loss": 0.2276, + "step": 48 + }, + { + "epoch": 0.025507548152004164, + "grad_norm": 1.6396773861852498, + "learning_rate": 1.416184971098266e-05, + "loss": 0.2242, + "step": 49 + }, + { + "epoch": 0.026028110359187923, + "grad_norm": 1.0452409902447666, + "learning_rate": 1.4450867052023123e-05, + "loss": 0.2281, + "step": 50 + }, + { + "epoch": 0.02654867256637168, + "grad_norm": 1.0756275987207078, + "learning_rate": 1.4739884393063585e-05, + "loss": 0.2277, + "step": 51 + }, + { + "epoch": 0.02706923477355544, + "grad_norm": 0.8991098023314276, + "learning_rate": 1.5028901734104049e-05, + "loss": 0.2172, + "step": 52 + }, + { + "epoch": 0.027589796980739198, + "grad_norm": 0.9582658807979682, + "learning_rate": 1.531791907514451e-05, + "loss": 0.2178, + "step": 53 + }, + { + "epoch": 0.028110359187922956, + "grad_norm": 0.9911148408927335, + "learning_rate": 1.5606936416184973e-05, + "loss": 0.2341, + "step": 54 + }, + { + "epoch": 0.028630921395106715, + "grad_norm": 1.3180692766328292, + "learning_rate": 1.5895953757225435e-05, + "loss": 0.2195, + "step": 55 + }, + { + "epoch": 0.029151483602290473, + "grad_norm": 1.486871730799441, + "learning_rate": 1.6184971098265897e-05, + "loss": 0.2247, + "step": 56 + }, + { + "epoch": 0.029672045809474232, + "grad_norm": 0.8943418358679089, + "learning_rate": 1.647398843930636e-05, + "loss": 0.2208, + "step": 57 + }, + { + "epoch": 0.03019260801665799, + "grad_norm": 0.9549840398600492, + "learning_rate": 1.676300578034682e-05, + "loss": 0.2303, + "step": 58 + }, + { + "epoch": 0.03071317022384175, + "grad_norm": 1.6440802970143786, + "learning_rate": 1.7052023121387284e-05, + "loss": 0.2189, + "step": 59 + }, + { + "epoch": 0.031233732431025507, + "grad_norm": 0.8692681682580999, + "learning_rate": 1.7341040462427746e-05, + "loss": 0.2242, + "step": 60 + }, + { + "epoch": 0.031754294638209266, + "grad_norm": 1.0685256597249488, + "learning_rate": 1.7630057803468208e-05, + "loss": 0.2124, + "step": 61 + }, + { + "epoch": 0.032274856845393024, + "grad_norm": 1.401339524265495, + "learning_rate": 1.791907514450867e-05, + "loss": 0.222, + "step": 62 + }, + { + "epoch": 0.03279541905257678, + "grad_norm": 0.8600585202663065, + "learning_rate": 1.8208092485549132e-05, + "loss": 0.2138, + "step": 63 + }, + { + "epoch": 0.03331598125976054, + "grad_norm": 1.2477374949025173, + "learning_rate": 1.8497109826589594e-05, + "loss": 0.2282, + "step": 64 + }, + { + "epoch": 0.0338365434669443, + "grad_norm": 1.097660347997359, + "learning_rate": 1.8786127167630057e-05, + "loss": 0.217, + "step": 65 + }, + { + "epoch": 0.03435710567412806, + "grad_norm": 1.2951250009085689, + "learning_rate": 1.907514450867052e-05, + "loss": 0.2199, + "step": 66 + }, + { + "epoch": 0.034877667881311816, + "grad_norm": 0.826872263396979, + "learning_rate": 1.936416184971098e-05, + "loss": 0.2134, + "step": 67 + }, + { + "epoch": 0.035398230088495575, + "grad_norm": 1.326379381993655, + "learning_rate": 1.9653179190751446e-05, + "loss": 0.2253, + "step": 68 + }, + { + "epoch": 0.03591879229567933, + "grad_norm": 0.8549714338599687, + "learning_rate": 1.994219653179191e-05, + "loss": 0.2118, + "step": 69 + }, + { + "epoch": 0.03643935450286309, + "grad_norm": 1.1468222920831441, + "learning_rate": 2.023121387283237e-05, + "loss": 0.225, + "step": 70 + }, + { + "epoch": 0.03695991671004685, + "grad_norm": 0.8454115307212339, + "learning_rate": 2.0520231213872833e-05, + "loss": 0.2165, + "step": 71 + }, + { + "epoch": 0.03748047891723061, + "grad_norm": 0.8498885949939733, + "learning_rate": 2.0809248554913295e-05, + "loss": 0.2098, + "step": 72 + }, + { + "epoch": 0.03800104112441437, + "grad_norm": 0.8214398654369537, + "learning_rate": 2.1098265895953757e-05, + "loss": 0.2183, + "step": 73 + }, + { + "epoch": 0.038521603331598125, + "grad_norm": 1.0466964698294483, + "learning_rate": 2.1387283236994223e-05, + "loss": 0.2193, + "step": 74 + }, + { + "epoch": 0.039042165538781884, + "grad_norm": 1.1184070427085229, + "learning_rate": 2.1676300578034685e-05, + "loss": 0.2187, + "step": 75 + }, + { + "epoch": 0.03956272774596564, + "grad_norm": 1.040775059918436, + "learning_rate": 2.1965317919075147e-05, + "loss": 0.2194, + "step": 76 + }, + { + "epoch": 0.0400832899531494, + "grad_norm": 1.2038697374050564, + "learning_rate": 2.225433526011561e-05, + "loss": 0.2184, + "step": 77 + }, + { + "epoch": 0.04060385216033316, + "grad_norm": 0.8663131100942328, + "learning_rate": 2.254335260115607e-05, + "loss": 0.2107, + "step": 78 + }, + { + "epoch": 0.04112441436751692, + "grad_norm": 0.7771080010057934, + "learning_rate": 2.2832369942196533e-05, + "loss": 0.2091, + "step": 79 + }, + { + "epoch": 0.041644976574700676, + "grad_norm": 0.9075557788861287, + "learning_rate": 2.3121387283236996e-05, + "loss": 0.213, + "step": 80 + }, + { + "epoch": 0.042165538781884435, + "grad_norm": 1.1814000831436555, + "learning_rate": 2.3410404624277458e-05, + "loss": 0.2111, + "step": 81 + }, + { + "epoch": 0.04268610098906819, + "grad_norm": 0.808697178051325, + "learning_rate": 2.369942196531792e-05, + "loss": 0.2079, + "step": 82 + }, + { + "epoch": 0.04320666319625195, + "grad_norm": 0.9213395387224003, + "learning_rate": 2.3988439306358382e-05, + "loss": 0.213, + "step": 83 + }, + { + "epoch": 0.04372722540343571, + "grad_norm": 0.8345054388332048, + "learning_rate": 2.4277456647398844e-05, + "loss": 0.2099, + "step": 84 + }, + { + "epoch": 0.04424778761061947, + "grad_norm": 0.8849593943677236, + "learning_rate": 2.4566473988439306e-05, + "loss": 0.222, + "step": 85 + }, + { + "epoch": 0.04476834981780323, + "grad_norm": 0.8478338478744712, + "learning_rate": 2.485549132947977e-05, + "loss": 0.2203, + "step": 86 + }, + { + "epoch": 0.045288912024986985, + "grad_norm": 0.9085767079232848, + "learning_rate": 2.5144508670520234e-05, + "loss": 0.2134, + "step": 87 + }, + { + "epoch": 0.045809474232170744, + "grad_norm": 0.9152260021422175, + "learning_rate": 2.5433526011560693e-05, + "loss": 0.2212, + "step": 88 + }, + { + "epoch": 0.0463300364393545, + "grad_norm": 0.7878287625575658, + "learning_rate": 2.5722543352601158e-05, + "loss": 0.2088, + "step": 89 + }, + { + "epoch": 0.04685059864653826, + "grad_norm": 0.9443429001103073, + "learning_rate": 2.6011560693641617e-05, + "loss": 0.2126, + "step": 90 + }, + { + "epoch": 0.04737116085372202, + "grad_norm": 1.0473598485121471, + "learning_rate": 2.6300578034682083e-05, + "loss": 0.2211, + "step": 91 + }, + { + "epoch": 0.04789172306090578, + "grad_norm": 0.8361641199343858, + "learning_rate": 2.658959537572254e-05, + "loss": 0.2128, + "step": 92 + }, + { + "epoch": 0.048412285268089536, + "grad_norm": 0.7931786259368718, + "learning_rate": 2.6878612716763007e-05, + "loss": 0.2166, + "step": 93 + }, + { + "epoch": 0.048932847475273294, + "grad_norm": 0.8124807154897088, + "learning_rate": 2.7167630057803466e-05, + "loss": 0.2122, + "step": 94 + }, + { + "epoch": 0.04945340968245705, + "grad_norm": 0.7637460807323695, + "learning_rate": 2.745664739884393e-05, + "loss": 0.2157, + "step": 95 + }, + { + "epoch": 0.04997397188964081, + "grad_norm": 1.0197675734431848, + "learning_rate": 2.7745664739884393e-05, + "loss": 0.2202, + "step": 96 + }, + { + "epoch": 0.05049453409682457, + "grad_norm": 0.7718672139219506, + "learning_rate": 2.8034682080924855e-05, + "loss": 0.2157, + "step": 97 + }, + { + "epoch": 0.05101509630400833, + "grad_norm": 0.6967370402605544, + "learning_rate": 2.832369942196532e-05, + "loss": 0.2115, + "step": 98 + }, + { + "epoch": 0.05153565851119209, + "grad_norm": 0.8860212093442391, + "learning_rate": 2.861271676300578e-05, + "loss": 0.2164, + "step": 99 + }, + { + "epoch": 0.052056220718375845, + "grad_norm": 0.806336638090273, + "learning_rate": 2.8901734104046245e-05, + "loss": 0.2066, + "step": 100 + }, + { + "epoch": 0.052576782925559604, + "grad_norm": 0.8839202966628119, + "learning_rate": 2.9190751445086707e-05, + "loss": 0.2133, + "step": 101 + }, + { + "epoch": 0.05309734513274336, + "grad_norm": 1.0715053048576961, + "learning_rate": 2.947976878612717e-05, + "loss": 0.2094, + "step": 102 + }, + { + "epoch": 0.05361790733992712, + "grad_norm": 0.9923778641494126, + "learning_rate": 2.9768786127167632e-05, + "loss": 0.2147, + "step": 103 + }, + { + "epoch": 0.05413846954711088, + "grad_norm": 0.996319311639693, + "learning_rate": 3.0057803468208097e-05, + "loss": 0.2119, + "step": 104 + }, + { + "epoch": 0.05465903175429464, + "grad_norm": 0.8155226884859121, + "learning_rate": 3.0346820809248556e-05, + "loss": 0.2078, + "step": 105 + }, + { + "epoch": 0.055179593961478396, + "grad_norm": 0.7905094789778886, + "learning_rate": 3.063583815028902e-05, + "loss": 0.2096, + "step": 106 + }, + { + "epoch": 0.055700156168662154, + "grad_norm": 0.8773860436858838, + "learning_rate": 3.092485549132948e-05, + "loss": 0.2167, + "step": 107 + }, + { + "epoch": 0.05622071837584591, + "grad_norm": 0.8873612182478008, + "learning_rate": 3.1213872832369946e-05, + "loss": 0.2147, + "step": 108 + }, + { + "epoch": 0.05674128058302967, + "grad_norm": 0.8093261489985791, + "learning_rate": 3.1502890173410405e-05, + "loss": 0.2147, + "step": 109 + }, + { + "epoch": 0.05726184279021343, + "grad_norm": 0.7069615535722936, + "learning_rate": 3.179190751445087e-05, + "loss": 0.2175, + "step": 110 + }, + { + "epoch": 0.05778240499739719, + "grad_norm": 0.7496643468031777, + "learning_rate": 3.208092485549133e-05, + "loss": 0.2133, + "step": 111 + }, + { + "epoch": 0.05830296720458095, + "grad_norm": 0.6778780641394953, + "learning_rate": 3.2369942196531794e-05, + "loss": 0.2142, + "step": 112 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 0.8038377581955503, + "learning_rate": 3.265895953757225e-05, + "loss": 0.2081, + "step": 113 + }, + { + "epoch": 0.059344091618948464, + "grad_norm": 0.8262906806241153, + "learning_rate": 3.294797687861272e-05, + "loss": 0.2198, + "step": 114 + }, + { + "epoch": 0.05986465382613222, + "grad_norm": 0.8225060449744769, + "learning_rate": 3.323699421965318e-05, + "loss": 0.2085, + "step": 115 + }, + { + "epoch": 0.06038521603331598, + "grad_norm": 0.9474684268193776, + "learning_rate": 3.352601156069364e-05, + "loss": 0.2141, + "step": 116 + }, + { + "epoch": 0.06090577824049974, + "grad_norm": 0.9490625592067119, + "learning_rate": 3.381502890173411e-05, + "loss": 0.2116, + "step": 117 + }, + { + "epoch": 0.0614263404476835, + "grad_norm": 1.0111096680389842, + "learning_rate": 3.410404624277457e-05, + "loss": 0.2199, + "step": 118 + }, + { + "epoch": 0.061946902654867256, + "grad_norm": 0.9650260633005836, + "learning_rate": 3.439306358381503e-05, + "loss": 0.2126, + "step": 119 + }, + { + "epoch": 0.062467464862051014, + "grad_norm": 0.6980785653915276, + "learning_rate": 3.468208092485549e-05, + "loss": 0.2099, + "step": 120 + }, + { + "epoch": 0.06298802706923477, + "grad_norm": 0.8534007635131398, + "learning_rate": 3.497109826589596e-05, + "loss": 0.2193, + "step": 121 + }, + { + "epoch": 0.06350858927641853, + "grad_norm": 0.8531206959278465, + "learning_rate": 3.5260115606936416e-05, + "loss": 0.2121, + "step": 122 + }, + { + "epoch": 0.06402915148360229, + "grad_norm": 0.8548017488318986, + "learning_rate": 3.554913294797688e-05, + "loss": 0.2228, + "step": 123 + }, + { + "epoch": 0.06454971369078605, + "grad_norm": 0.6612943150792601, + "learning_rate": 3.583815028901734e-05, + "loss": 0.2103, + "step": 124 + }, + { + "epoch": 0.0650702758979698, + "grad_norm": 0.7715488990577469, + "learning_rate": 3.6127167630057806e-05, + "loss": 0.2035, + "step": 125 + }, + { + "epoch": 0.06559083810515356, + "grad_norm": 0.8671638801555337, + "learning_rate": 3.6416184971098265e-05, + "loss": 0.2152, + "step": 126 + }, + { + "epoch": 0.06611140031233732, + "grad_norm": 0.9914635629797826, + "learning_rate": 3.670520231213873e-05, + "loss": 0.2162, + "step": 127 + }, + { + "epoch": 0.06663196251952108, + "grad_norm": 0.979896089844123, + "learning_rate": 3.699421965317919e-05, + "loss": 0.2161, + "step": 128 + }, + { + "epoch": 0.06715252472670484, + "grad_norm": 0.6633992741028703, + "learning_rate": 3.7283236994219654e-05, + "loss": 0.2121, + "step": 129 + }, + { + "epoch": 0.0676730869338886, + "grad_norm": 0.7005104328408702, + "learning_rate": 3.757225433526011e-05, + "loss": 0.2068, + "step": 130 + }, + { + "epoch": 0.06819364914107236, + "grad_norm": 0.7770874448510542, + "learning_rate": 3.786127167630058e-05, + "loss": 0.2148, + "step": 131 + }, + { + "epoch": 0.06871421134825612, + "grad_norm": 0.8338865623089665, + "learning_rate": 3.815028901734104e-05, + "loss": 0.2143, + "step": 132 + }, + { + "epoch": 0.06923477355543987, + "grad_norm": 0.7816413255198202, + "learning_rate": 3.84393063583815e-05, + "loss": 0.2189, + "step": 133 + }, + { + "epoch": 0.06975533576262363, + "grad_norm": 0.65556462363938, + "learning_rate": 3.872832369942196e-05, + "loss": 0.2079, + "step": 134 + }, + { + "epoch": 0.07027589796980739, + "grad_norm": 0.7701568466848703, + "learning_rate": 3.901734104046243e-05, + "loss": 0.2093, + "step": 135 + }, + { + "epoch": 0.07079646017699115, + "grad_norm": 0.832197605955844, + "learning_rate": 3.930635838150289e-05, + "loss": 0.2131, + "step": 136 + }, + { + "epoch": 0.07131702238417491, + "grad_norm": 0.8404782199225694, + "learning_rate": 3.959537572254335e-05, + "loss": 0.2183, + "step": 137 + }, + { + "epoch": 0.07183758459135867, + "grad_norm": 0.6621631106005148, + "learning_rate": 3.988439306358382e-05, + "loss": 0.2176, + "step": 138 + }, + { + "epoch": 0.07235814679854242, + "grad_norm": 0.800987879800019, + "learning_rate": 4.0173410404624276e-05, + "loss": 0.2068, + "step": 139 + }, + { + "epoch": 0.07287870900572618, + "grad_norm": 0.8086309231367268, + "learning_rate": 4.046242774566474e-05, + "loss": 0.2149, + "step": 140 + }, + { + "epoch": 0.07339927121290994, + "grad_norm": 0.6390918137014469, + "learning_rate": 4.07514450867052e-05, + "loss": 0.2067, + "step": 141 + }, + { + "epoch": 0.0739198334200937, + "grad_norm": 0.6208193264026112, + "learning_rate": 4.1040462427745666e-05, + "loss": 0.2146, + "step": 142 + }, + { + "epoch": 0.07444039562727746, + "grad_norm": 0.6102728010873583, + "learning_rate": 4.132947976878613e-05, + "loss": 0.2136, + "step": 143 + }, + { + "epoch": 0.07496095783446122, + "grad_norm": 0.7098434352977124, + "learning_rate": 4.161849710982659e-05, + "loss": 0.2124, + "step": 144 + }, + { + "epoch": 0.07548152004164498, + "grad_norm": 0.7136938829382273, + "learning_rate": 4.1907514450867055e-05, + "loss": 0.2176, + "step": 145 + }, + { + "epoch": 0.07600208224882873, + "grad_norm": 0.6206135947118576, + "learning_rate": 4.2196531791907514e-05, + "loss": 0.2128, + "step": 146 + }, + { + "epoch": 0.07652264445601249, + "grad_norm": 0.6662245815894154, + "learning_rate": 4.248554913294798e-05, + "loss": 0.2096, + "step": 147 + }, + { + "epoch": 0.07704320666319625, + "grad_norm": 0.6047404577832085, + "learning_rate": 4.2774566473988445e-05, + "loss": 0.2067, + "step": 148 + }, + { + "epoch": 0.07756376887038001, + "grad_norm": 0.6072140153823105, + "learning_rate": 4.3063583815028904e-05, + "loss": 0.2173, + "step": 149 + }, + { + "epoch": 0.07808433107756377, + "grad_norm": 0.6556745346884483, + "learning_rate": 4.335260115606937e-05, + "loss": 0.214, + "step": 150 + }, + { + "epoch": 0.07860489328474753, + "grad_norm": 0.593639230209489, + "learning_rate": 4.364161849710983e-05, + "loss": 0.2088, + "step": 151 + }, + { + "epoch": 0.07912545549193128, + "grad_norm": 0.6371994241596302, + "learning_rate": 4.3930635838150294e-05, + "loss": 0.2046, + "step": 152 + }, + { + "epoch": 0.07964601769911504, + "grad_norm": 0.638999242172759, + "learning_rate": 4.421965317919075e-05, + "loss": 0.2161, + "step": 153 + }, + { + "epoch": 0.0801665799062988, + "grad_norm": 0.5754915699358542, + "learning_rate": 4.450867052023122e-05, + "loss": 0.2057, + "step": 154 + }, + { + "epoch": 0.08068714211348256, + "grad_norm": 0.5766300270893668, + "learning_rate": 4.4797687861271684e-05, + "loss": 0.2087, + "step": 155 + }, + { + "epoch": 0.08120770432066632, + "grad_norm": 0.6651018920554423, + "learning_rate": 4.508670520231214e-05, + "loss": 0.2111, + "step": 156 + }, + { + "epoch": 0.08172826652785008, + "grad_norm": 0.6875773025621368, + "learning_rate": 4.537572254335261e-05, + "loss": 0.2168, + "step": 157 + }, + { + "epoch": 0.08224882873503384, + "grad_norm": 0.6150201790012484, + "learning_rate": 4.566473988439307e-05, + "loss": 0.2141, + "step": 158 + }, + { + "epoch": 0.0827693909422176, + "grad_norm": 0.6599147042410466, + "learning_rate": 4.595375722543353e-05, + "loss": 0.2153, + "step": 159 + }, + { + "epoch": 0.08328995314940135, + "grad_norm": 0.7633380008679602, + "learning_rate": 4.624277456647399e-05, + "loss": 0.2118, + "step": 160 + }, + { + "epoch": 0.08381051535658511, + "grad_norm": 0.7353603486570877, + "learning_rate": 4.653179190751446e-05, + "loss": 0.2125, + "step": 161 + }, + { + "epoch": 0.08433107756376887, + "grad_norm": 0.7963914788392694, + "learning_rate": 4.6820809248554915e-05, + "loss": 0.2177, + "step": 162 + }, + { + "epoch": 0.08485163977095263, + "grad_norm": 0.7816123652355957, + "learning_rate": 4.710982658959538e-05, + "loss": 0.2113, + "step": 163 + }, + { + "epoch": 0.08537220197813639, + "grad_norm": 0.673743870011103, + "learning_rate": 4.739884393063584e-05, + "loss": 0.215, + "step": 164 + }, + { + "epoch": 0.08589276418532014, + "grad_norm": 0.5227320089849515, + "learning_rate": 4.7687861271676305e-05, + "loss": 0.2103, + "step": 165 + }, + { + "epoch": 0.0864133263925039, + "grad_norm": 0.678198278022982, + "learning_rate": 4.7976878612716764e-05, + "loss": 0.2193, + "step": 166 + }, + { + "epoch": 0.08693388859968766, + "grad_norm": 0.6374039379559228, + "learning_rate": 4.826589595375723e-05, + "loss": 0.2167, + "step": 167 + }, + { + "epoch": 0.08745445080687142, + "grad_norm": 0.5577509103478314, + "learning_rate": 4.855491329479769e-05, + "loss": 0.2047, + "step": 168 + }, + { + "epoch": 0.08797501301405518, + "grad_norm": 0.6071594590963697, + "learning_rate": 4.8843930635838154e-05, + "loss": 0.2129, + "step": 169 + }, + { + "epoch": 0.08849557522123894, + "grad_norm": 0.5535849448502077, + "learning_rate": 4.913294797687861e-05, + "loss": 0.2112, + "step": 170 + }, + { + "epoch": 0.0890161374284227, + "grad_norm": 0.5570160624184318, + "learning_rate": 4.942196531791908e-05, + "loss": 0.2249, + "step": 171 + }, + { + "epoch": 0.08953669963560645, + "grad_norm": 0.5926801003230638, + "learning_rate": 4.971098265895954e-05, + "loss": 0.2183, + "step": 172 + }, + { + "epoch": 0.09005726184279021, + "grad_norm": 0.6340827920564475, + "learning_rate": 5e-05, + "loss": 0.2134, + "step": 173 + }, + { + "epoch": 0.09057782404997397, + "grad_norm": 0.5566326852792648, + "learning_rate": 4.99999960519183e-05, + "loss": 0.2109, + "step": 174 + }, + { + "epoch": 0.09109838625715773, + "grad_norm": 0.6364150749071997, + "learning_rate": 4.999998420767445e-05, + "loss": 0.2207, + "step": 175 + }, + { + "epoch": 0.09161894846434149, + "grad_norm": 0.5946977297441665, + "learning_rate": 4.999996446727219e-05, + "loss": 0.22, + "step": 176 + }, + { + "epoch": 0.09213951067152525, + "grad_norm": 0.5756761019081571, + "learning_rate": 4.9999936830717745e-05, + "loss": 0.2157, + "step": 177 + }, + { + "epoch": 0.092660072878709, + "grad_norm": 0.755144779893014, + "learning_rate": 4.999990129801986e-05, + "loss": 0.2149, + "step": 178 + }, + { + "epoch": 0.09318063508589276, + "grad_norm": 0.8329885805863328, + "learning_rate": 4.9999857869189735e-05, + "loss": 0.2104, + "step": 179 + }, + { + "epoch": 0.09370119729307652, + "grad_norm": 0.7690781702453137, + "learning_rate": 4.999980654424112e-05, + "loss": 0.2078, + "step": 180 + }, + { + "epoch": 0.09422175950026028, + "grad_norm": 0.6308760503667642, + "learning_rate": 4.9999747323190195e-05, + "loss": 0.2197, + "step": 181 + }, + { + "epoch": 0.09474232170744404, + "grad_norm": 0.6148255176499823, + "learning_rate": 4.9999680206055686e-05, + "loss": 0.213, + "step": 182 + }, + { + "epoch": 0.0952628839146278, + "grad_norm": 0.6734184929964855, + "learning_rate": 4.999960519285878e-05, + "loss": 0.203, + "step": 183 + }, + { + "epoch": 0.09578344612181156, + "grad_norm": 0.5406620690992346, + "learning_rate": 4.999952228362317e-05, + "loss": 0.2051, + "step": 184 + }, + { + "epoch": 0.09630400832899531, + "grad_norm": 0.5805819703164237, + "learning_rate": 4.999943147837505e-05, + "loss": 0.2243, + "step": 185 + }, + { + "epoch": 0.09682457053617907, + "grad_norm": 0.6790309432200481, + "learning_rate": 4.999933277714309e-05, + "loss": 0.223, + "step": 186 + }, + { + "epoch": 0.09734513274336283, + "grad_norm": 0.5725871196289053, + "learning_rate": 4.999922617995847e-05, + "loss": 0.2137, + "step": 187 + }, + { + "epoch": 0.09786569495054659, + "grad_norm": 0.5475251087174847, + "learning_rate": 4.999911168685486e-05, + "loss": 0.2151, + "step": 188 + }, + { + "epoch": 0.09838625715773035, + "grad_norm": 0.6035996664978388, + "learning_rate": 4.999898929786842e-05, + "loss": 0.2073, + "step": 189 + }, + { + "epoch": 0.0989068193649141, + "grad_norm": 0.6086080086431157, + "learning_rate": 4.999885901303781e-05, + "loss": 0.2183, + "step": 190 + }, + { + "epoch": 0.09942738157209786, + "grad_norm": 0.4937337226206942, + "learning_rate": 4.999872083240418e-05, + "loss": 0.2102, + "step": 191 + }, + { + "epoch": 0.09994794377928162, + "grad_norm": 0.725349945989928, + "learning_rate": 4.999857475601117e-05, + "loss": 0.2089, + "step": 192 + }, + { + "epoch": 0.10046850598646538, + "grad_norm": 0.7612664949980464, + "learning_rate": 4.999842078390492e-05, + "loss": 0.1991, + "step": 193 + }, + { + "epoch": 0.10098906819364914, + "grad_norm": 0.5707299858159967, + "learning_rate": 4.9998258916134055e-05, + "loss": 0.2126, + "step": 194 + }, + { + "epoch": 0.1015096304008329, + "grad_norm": 0.5643415112854695, + "learning_rate": 4.999808915274971e-05, + "loss": 0.2069, + "step": 195 + }, + { + "epoch": 0.10203019260801666, + "grad_norm": 0.5633442640544284, + "learning_rate": 4.999791149380549e-05, + "loss": 0.2132, + "step": 196 + }, + { + "epoch": 0.10255075481520042, + "grad_norm": 0.5614129286121543, + "learning_rate": 4.999772593935752e-05, + "loss": 0.2054, + "step": 197 + }, + { + "epoch": 0.10307131702238417, + "grad_norm": 0.5646241630980633, + "learning_rate": 4.999753248946441e-05, + "loss": 0.21, + "step": 198 + }, + { + "epoch": 0.10359187922956793, + "grad_norm": 0.5425474957654737, + "learning_rate": 4.9997331144187255e-05, + "loss": 0.2187, + "step": 199 + }, + { + "epoch": 0.10411244143675169, + "grad_norm": 0.6177258752993052, + "learning_rate": 4.999712190358965e-05, + "loss": 0.2082, + "step": 200 + }, + { + "epoch": 0.10463300364393545, + "grad_norm": 0.6029817403963842, + "learning_rate": 4.999690476773767e-05, + "loss": 0.212, + "step": 201 + }, + { + "epoch": 0.10515356585111921, + "grad_norm": 0.48878469751460535, + "learning_rate": 4.9996679736699924e-05, + "loss": 0.2024, + "step": 202 + }, + { + "epoch": 0.10567412805830297, + "grad_norm": 0.5654845375393931, + "learning_rate": 4.9996446810547464e-05, + "loss": 0.2107, + "step": 203 + }, + { + "epoch": 0.10619469026548672, + "grad_norm": 0.5149953607057386, + "learning_rate": 4.999620598935387e-05, + "loss": 0.1983, + "step": 204 + }, + { + "epoch": 0.10671525247267048, + "grad_norm": 0.4962347495654175, + "learning_rate": 4.9995957273195206e-05, + "loss": 0.2095, + "step": 205 + }, + { + "epoch": 0.10723581467985424, + "grad_norm": 0.48397875587172107, + "learning_rate": 4.9995700662150015e-05, + "loss": 0.2062, + "step": 206 + }, + { + "epoch": 0.107756376887038, + "grad_norm": 0.49662456882149275, + "learning_rate": 4.9995436156299355e-05, + "loss": 0.2186, + "step": 207 + }, + { + "epoch": 0.10827693909422176, + "grad_norm": 0.5132623063501726, + "learning_rate": 4.999516375572677e-05, + "loss": 0.2106, + "step": 208 + }, + { + "epoch": 0.10879750130140552, + "grad_norm": 0.48217335031801317, + "learning_rate": 4.99948834605183e-05, + "loss": 0.2196, + "step": 209 + }, + { + "epoch": 0.10931806350858927, + "grad_norm": 0.5083613448407168, + "learning_rate": 4.999459527076247e-05, + "loss": 0.2092, + "step": 210 + }, + { + "epoch": 0.10983862571577303, + "grad_norm": 0.45169614742277503, + "learning_rate": 4.99942991865503e-05, + "loss": 0.1936, + "step": 211 + }, + { + "epoch": 0.11035918792295679, + "grad_norm": 0.5475430570698664, + "learning_rate": 4.999399520797532e-05, + "loss": 0.2089, + "step": 212 + }, + { + "epoch": 0.11087975013014055, + "grad_norm": 0.4605893449333839, + "learning_rate": 4.9993683335133535e-05, + "loss": 0.2023, + "step": 213 + }, + { + "epoch": 0.11140031233732431, + "grad_norm": 0.48657373710725793, + "learning_rate": 4.999336356812344e-05, + "loss": 0.2141, + "step": 214 + }, + { + "epoch": 0.11192087454450807, + "grad_norm": 0.5120193381886232, + "learning_rate": 4.9993035907046034e-05, + "loss": 0.2026, + "step": 215 + }, + { + "epoch": 0.11244143675169183, + "grad_norm": 0.49871162631911875, + "learning_rate": 4.999270035200483e-05, + "loss": 0.2207, + "step": 216 + }, + { + "epoch": 0.11296199895887558, + "grad_norm": 0.4917375628104701, + "learning_rate": 4.999235690310578e-05, + "loss": 0.2045, + "step": 217 + }, + { + "epoch": 0.11348256116605934, + "grad_norm": 0.5254314690487704, + "learning_rate": 4.999200556045739e-05, + "loss": 0.2068, + "step": 218 + }, + { + "epoch": 0.1140031233732431, + "grad_norm": 0.5379406712902184, + "learning_rate": 4.99916463241706e-05, + "loss": 0.1999, + "step": 219 + }, + { + "epoch": 0.11452368558042686, + "grad_norm": 0.5497570174948546, + "learning_rate": 4.99912791943589e-05, + "loss": 0.2174, + "step": 220 + }, + { + "epoch": 0.11504424778761062, + "grad_norm": 0.5159970185644596, + "learning_rate": 4.999090417113823e-05, + "loss": 0.2121, + "step": 221 + }, + { + "epoch": 0.11556480999479438, + "grad_norm": 0.5255410280335246, + "learning_rate": 4.999052125462705e-05, + "loss": 0.2005, + "step": 222 + }, + { + "epoch": 0.11608537220197813, + "grad_norm": 0.4370433291491418, + "learning_rate": 4.9990130444946295e-05, + "loss": 0.2062, + "step": 223 + }, + { + "epoch": 0.1166059344091619, + "grad_norm": 0.5368330251522332, + "learning_rate": 4.9989731742219415e-05, + "loss": 0.2053, + "step": 224 + }, + { + "epoch": 0.11712649661634565, + "grad_norm": 0.4997030944367229, + "learning_rate": 4.998932514657232e-05, + "loss": 0.2035, + "step": 225 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 0.48475596745975486, + "learning_rate": 4.9988910658133445e-05, + "loss": 0.2066, + "step": 226 + }, + { + "epoch": 0.11816762103071317, + "grad_norm": 0.4549147383812301, + "learning_rate": 4.99884882770337e-05, + "loss": 0.1947, + "step": 227 + }, + { + "epoch": 0.11868818323789693, + "grad_norm": 0.4829688354526709, + "learning_rate": 4.998805800340649e-05, + "loss": 0.2043, + "step": 228 + }, + { + "epoch": 0.11920874544508069, + "grad_norm": 0.49782219243743464, + "learning_rate": 4.998761983738772e-05, + "loss": 0.2051, + "step": 229 + }, + { + "epoch": 0.11972930765226444, + "grad_norm": 0.502253063495323, + "learning_rate": 4.998717377911578e-05, + "loss": 0.2169, + "step": 230 + }, + { + "epoch": 0.1202498698594482, + "grad_norm": 0.5061752506717145, + "learning_rate": 4.998671982873156e-05, + "loss": 0.2035, + "step": 231 + }, + { + "epoch": 0.12077043206663196, + "grad_norm": 0.511314601070742, + "learning_rate": 4.9986257986378434e-05, + "loss": 0.2034, + "step": 232 + }, + { + "epoch": 0.12129099427381572, + "grad_norm": 0.4988500109798501, + "learning_rate": 4.9985788252202284e-05, + "loss": 0.2007, + "step": 233 + }, + { + "epoch": 0.12181155648099948, + "grad_norm": 0.47625297037289466, + "learning_rate": 4.9985310626351453e-05, + "loss": 0.2065, + "step": 234 + }, + { + "epoch": 0.12233211868818324, + "grad_norm": 0.5281298227193022, + "learning_rate": 4.998482510897682e-05, + "loss": 0.2063, + "step": 235 + }, + { + "epoch": 0.122852680895367, + "grad_norm": 0.4762927778180656, + "learning_rate": 4.9984331700231716e-05, + "loss": 0.2091, + "step": 236 + }, + { + "epoch": 0.12337324310255075, + "grad_norm": 0.5474311482218877, + "learning_rate": 4.9983830400271995e-05, + "loss": 0.2061, + "step": 237 + }, + { + "epoch": 0.12389380530973451, + "grad_norm": 0.4774773588042615, + "learning_rate": 4.998332120925598e-05, + "loss": 0.2002, + "step": 238 + }, + { + "epoch": 0.12441436751691827, + "grad_norm": 0.48763373747314165, + "learning_rate": 4.9982804127344515e-05, + "loss": 0.2012, + "step": 239 + }, + { + "epoch": 0.12493492972410203, + "grad_norm": 0.5185578830596879, + "learning_rate": 4.9982279154700905e-05, + "loss": 0.2003, + "step": 240 + }, + { + "epoch": 0.1254554919312858, + "grad_norm": 0.5024445332518108, + "learning_rate": 4.9981746291490955e-05, + "loss": 0.1951, + "step": 241 + }, + { + "epoch": 0.12597605413846955, + "grad_norm": 0.5449525262128915, + "learning_rate": 4.998120553788298e-05, + "loss": 0.2032, + "step": 242 + }, + { + "epoch": 0.12649661634565332, + "grad_norm": 0.4665771668363775, + "learning_rate": 4.9980656894047776e-05, + "loss": 0.2039, + "step": 243 + }, + { + "epoch": 0.12701717855283706, + "grad_norm": 0.4372817573132417, + "learning_rate": 4.998010036015862e-05, + "loss": 0.2051, + "step": 244 + }, + { + "epoch": 0.12753774076002083, + "grad_norm": 0.43781674957374894, + "learning_rate": 4.997953593639129e-05, + "loss": 0.2054, + "step": 245 + }, + { + "epoch": 0.12805830296720458, + "grad_norm": 0.4541311939721181, + "learning_rate": 4.997896362292407e-05, + "loss": 0.1971, + "step": 246 + }, + { + "epoch": 0.12857886517438835, + "grad_norm": 0.4384524770223584, + "learning_rate": 4.997838341993772e-05, + "loss": 0.198, + "step": 247 + }, + { + "epoch": 0.1290994273815721, + "grad_norm": 0.4471256807532643, + "learning_rate": 4.997779532761549e-05, + "loss": 0.1986, + "step": 248 + }, + { + "epoch": 0.12961998958875587, + "grad_norm": 0.5163351668297698, + "learning_rate": 4.997719934614313e-05, + "loss": 0.2006, + "step": 249 + }, + { + "epoch": 0.1301405517959396, + "grad_norm": 0.5102778772697563, + "learning_rate": 4.9976595475708873e-05, + "loss": 0.2016, + "step": 250 + }, + { + "epoch": 0.13066111400312339, + "grad_norm": 0.4253852174219019, + "learning_rate": 4.997598371650346e-05, + "loss": 0.1965, + "step": 251 + }, + { + "epoch": 0.13118167621030713, + "grad_norm": 0.46645746563329804, + "learning_rate": 4.9975364068720106e-05, + "loss": 0.2031, + "step": 252 + }, + { + "epoch": 0.1317022384174909, + "grad_norm": 0.43965296000744936, + "learning_rate": 4.997473653255452e-05, + "loss": 0.1995, + "step": 253 + }, + { + "epoch": 0.13222280062467465, + "grad_norm": 0.44221288800503233, + "learning_rate": 4.997410110820492e-05, + "loss": 0.1982, + "step": 254 + }, + { + "epoch": 0.13274336283185842, + "grad_norm": 0.4370059916055007, + "learning_rate": 4.997345779587199e-05, + "loss": 0.1939, + "step": 255 + }, + { + "epoch": 0.13326392503904216, + "grad_norm": 0.5211107612902853, + "learning_rate": 4.997280659575892e-05, + "loss": 0.2026, + "step": 256 + }, + { + "epoch": 0.13378448724622594, + "grad_norm": 0.488605855839782, + "learning_rate": 4.997214750807141e-05, + "loss": 0.2005, + "step": 257 + }, + { + "epoch": 0.13430504945340968, + "grad_norm": 0.4932959365760645, + "learning_rate": 4.99714805330176e-05, + "loss": 0.2019, + "step": 258 + }, + { + "epoch": 0.13482561166059345, + "grad_norm": 0.5546717733269304, + "learning_rate": 4.997080567080817e-05, + "loss": 0.1948, + "step": 259 + }, + { + "epoch": 0.1353461738677772, + "grad_norm": 0.48295394900788036, + "learning_rate": 4.9970122921656257e-05, + "loss": 0.1955, + "step": 260 + }, + { + "epoch": 0.13586673607496097, + "grad_norm": 0.5105683280432859, + "learning_rate": 4.996943228577753e-05, + "loss": 0.2075, + "step": 261 + }, + { + "epoch": 0.13638729828214471, + "grad_norm": 0.5759006373431579, + "learning_rate": 4.996873376339011e-05, + "loss": 0.201, + "step": 262 + }, + { + "epoch": 0.1369078604893285, + "grad_norm": 0.483482317837608, + "learning_rate": 4.996802735471461e-05, + "loss": 0.1938, + "step": 263 + }, + { + "epoch": 0.13742842269651223, + "grad_norm": 0.48756843106965886, + "learning_rate": 4.996731305997416e-05, + "loss": 0.2, + "step": 264 + }, + { + "epoch": 0.137948984903696, + "grad_norm": 0.5231891365741146, + "learning_rate": 4.996659087939438e-05, + "loss": 0.2009, + "step": 265 + }, + { + "epoch": 0.13846954711087975, + "grad_norm": 0.518441009753121, + "learning_rate": 4.9965860813203345e-05, + "loss": 0.1831, + "step": 266 + }, + { + "epoch": 0.13899010931806352, + "grad_norm": 0.45224708236227923, + "learning_rate": 4.996512286163166e-05, + "loss": 0.2059, + "step": 267 + }, + { + "epoch": 0.13951067152524727, + "grad_norm": 0.5456839005168297, + "learning_rate": 4.99643770249124e-05, + "loss": 0.205, + "step": 268 + }, + { + "epoch": 0.14003123373243104, + "grad_norm": 0.4891910966189773, + "learning_rate": 4.996362330328113e-05, + "loss": 0.2, + "step": 269 + }, + { + "epoch": 0.14055179593961478, + "grad_norm": 0.5478174066545909, + "learning_rate": 4.996286169697591e-05, + "loss": 0.2004, + "step": 270 + }, + { + "epoch": 0.14107235814679855, + "grad_norm": 0.5582267970906812, + "learning_rate": 4.99620922062373e-05, + "loss": 0.203, + "step": 271 + }, + { + "epoch": 0.1415929203539823, + "grad_norm": 0.5105647459053501, + "learning_rate": 4.996131483130833e-05, + "loss": 0.2029, + "step": 272 + }, + { + "epoch": 0.14211348256116607, + "grad_norm": 0.48249484447771235, + "learning_rate": 4.9960529572434545e-05, + "loss": 0.2015, + "step": 273 + }, + { + "epoch": 0.14263404476834982, + "grad_norm": 0.5235774115905943, + "learning_rate": 4.995973642986395e-05, + "loss": 0.2036, + "step": 274 + }, + { + "epoch": 0.1431546069755336, + "grad_norm": 0.47923742023520904, + "learning_rate": 4.995893540384707e-05, + "loss": 0.2017, + "step": 275 + }, + { + "epoch": 0.14367516918271733, + "grad_norm": 0.43341131129494787, + "learning_rate": 4.99581264946369e-05, + "loss": 0.1995, + "step": 276 + }, + { + "epoch": 0.1441957313899011, + "grad_norm": 0.5535433768591606, + "learning_rate": 4.995730970248893e-05, + "loss": 0.2048, + "step": 277 + }, + { + "epoch": 0.14471629359708485, + "grad_norm": 0.3972052460404149, + "learning_rate": 4.9956485027661136e-05, + "loss": 0.2039, + "step": 278 + }, + { + "epoch": 0.14523685580426862, + "grad_norm": 0.594647280576585, + "learning_rate": 4.995565247041401e-05, + "loss": 0.1998, + "step": 279 + }, + { + "epoch": 0.14575741801145237, + "grad_norm": 0.4962198874500988, + "learning_rate": 4.995481203101049e-05, + "loss": 0.2018, + "step": 280 + }, + { + "epoch": 0.14627798021863614, + "grad_norm": 0.4301303739409537, + "learning_rate": 4.9953963709716034e-05, + "loss": 0.2053, + "step": 281 + }, + { + "epoch": 0.14679854242581988, + "grad_norm": 0.5069059225942892, + "learning_rate": 4.995310750679858e-05, + "loss": 0.1979, + "step": 282 + }, + { + "epoch": 0.14731910463300366, + "grad_norm": 0.5279838955451033, + "learning_rate": 4.995224342252855e-05, + "loss": 0.2017, + "step": 283 + }, + { + "epoch": 0.1478396668401874, + "grad_norm": 0.3970535952201932, + "learning_rate": 4.995137145717889e-05, + "loss": 0.2025, + "step": 284 + }, + { + "epoch": 0.14836022904737117, + "grad_norm": 0.4049155457791142, + "learning_rate": 4.9950491611024975e-05, + "loss": 0.1893, + "step": 285 + }, + { + "epoch": 0.14888079125455492, + "grad_norm": 0.4413501454399157, + "learning_rate": 4.994960388434471e-05, + "loss": 0.2012, + "step": 286 + }, + { + "epoch": 0.1494013534617387, + "grad_norm": 0.39118558952416227, + "learning_rate": 4.994870827741849e-05, + "loss": 0.194, + "step": 287 + }, + { + "epoch": 0.14992191566892243, + "grad_norm": 0.4209076336573591, + "learning_rate": 4.9947804790529176e-05, + "loss": 0.1923, + "step": 288 + }, + { + "epoch": 0.1504424778761062, + "grad_norm": 0.4149330323573577, + "learning_rate": 4.994689342396215e-05, + "loss": 0.1944, + "step": 289 + }, + { + "epoch": 0.15096304008328995, + "grad_norm": 0.41004710305075126, + "learning_rate": 4.994597417800524e-05, + "loss": 0.1993, + "step": 290 + }, + { + "epoch": 0.15148360229047372, + "grad_norm": 0.41295205841010674, + "learning_rate": 4.994504705294881e-05, + "loss": 0.2048, + "step": 291 + }, + { + "epoch": 0.15200416449765747, + "grad_norm": 0.3865464642966178, + "learning_rate": 4.994411204908567e-05, + "loss": 0.1952, + "step": 292 + }, + { + "epoch": 0.15252472670484124, + "grad_norm": 0.4033991442815395, + "learning_rate": 4.994316916671114e-05, + "loss": 0.1945, + "step": 293 + }, + { + "epoch": 0.15304528891202498, + "grad_norm": 0.37826488837927863, + "learning_rate": 4.9942218406123045e-05, + "loss": 0.1989, + "step": 294 + }, + { + "epoch": 0.15356585111920876, + "grad_norm": 0.3925013641735211, + "learning_rate": 4.994125976762167e-05, + "loss": 0.1948, + "step": 295 + }, + { + "epoch": 0.1540864133263925, + "grad_norm": 0.4297598640191415, + "learning_rate": 4.9940293251509786e-05, + "loss": 0.199, + "step": 296 + }, + { + "epoch": 0.15460697553357627, + "grad_norm": 0.39427507343176355, + "learning_rate": 4.9939318858092664e-05, + "loss": 0.2038, + "step": 297 + }, + { + "epoch": 0.15512753774076002, + "grad_norm": 0.41295108211994785, + "learning_rate": 4.993833658767808e-05, + "loss": 0.1855, + "step": 298 + }, + { + "epoch": 0.1556480999479438, + "grad_norm": 0.428783102563254, + "learning_rate": 4.993734644057627e-05, + "loss": 0.1925, + "step": 299 + }, + { + "epoch": 0.15616866215512754, + "grad_norm": 0.39643845709605496, + "learning_rate": 4.993634841709998e-05, + "loss": 0.1953, + "step": 300 + }, + { + "epoch": 0.1566892243623113, + "grad_norm": 0.40315780437097026, + "learning_rate": 4.993534251756441e-05, + "loss": 0.197, + "step": 301 + }, + { + "epoch": 0.15720978656949505, + "grad_norm": 0.42325725150557625, + "learning_rate": 4.9934328742287285e-05, + "loss": 0.1993, + "step": 302 + }, + { + "epoch": 0.15773034877667882, + "grad_norm": 0.4080909081022836, + "learning_rate": 4.9933307091588796e-05, + "loss": 0.1872, + "step": 303 + }, + { + "epoch": 0.15825091098386257, + "grad_norm": 0.4403757079324371, + "learning_rate": 4.993227756579163e-05, + "loss": 0.196, + "step": 304 + }, + { + "epoch": 0.15877147319104634, + "grad_norm": 0.4315054835172639, + "learning_rate": 4.993124016522097e-05, + "loss": 0.1988, + "step": 305 + }, + { + "epoch": 0.1592920353982301, + "grad_norm": 0.4181529747933808, + "learning_rate": 4.993019489020446e-05, + "loss": 0.1874, + "step": 306 + }, + { + "epoch": 0.15981259760541386, + "grad_norm": 0.38805878488629814, + "learning_rate": 4.992914174107225e-05, + "loss": 0.1885, + "step": 307 + }, + { + "epoch": 0.1603331598125976, + "grad_norm": 0.4442375624813517, + "learning_rate": 4.992808071815698e-05, + "loss": 0.1918, + "step": 308 + }, + { + "epoch": 0.16085372201978138, + "grad_norm": 0.3809208737314787, + "learning_rate": 4.9927011821793766e-05, + "loss": 0.1997, + "step": 309 + }, + { + "epoch": 0.16137428422696512, + "grad_norm": 0.4138404674084266, + "learning_rate": 4.9925935052320214e-05, + "loss": 0.1959, + "step": 310 + }, + { + "epoch": 0.1618948464341489, + "grad_norm": 0.41341096380173886, + "learning_rate": 4.9924850410076416e-05, + "loss": 0.2007, + "step": 311 + }, + { + "epoch": 0.16241540864133264, + "grad_norm": 0.43053394989910876, + "learning_rate": 4.9923757895404966e-05, + "loss": 0.1898, + "step": 312 + }, + { + "epoch": 0.1629359708485164, + "grad_norm": 0.40770200417265084, + "learning_rate": 4.992265750865091e-05, + "loss": 0.1961, + "step": 313 + }, + { + "epoch": 0.16345653305570015, + "grad_norm": 0.44481929282099114, + "learning_rate": 4.9921549250161817e-05, + "loss": 0.1903, + "step": 314 + }, + { + "epoch": 0.16397709526288393, + "grad_norm": 0.4214318597046138, + "learning_rate": 4.9920433120287726e-05, + "loss": 0.1978, + "step": 315 + }, + { + "epoch": 0.16449765747006767, + "grad_norm": 0.42351713283633147, + "learning_rate": 4.9919309119381155e-05, + "loss": 0.1913, + "step": 316 + }, + { + "epoch": 0.16501821967725144, + "grad_norm": 0.40073423145965437, + "learning_rate": 4.991817724779711e-05, + "loss": 0.201, + "step": 317 + }, + { + "epoch": 0.1655387818844352, + "grad_norm": 0.46802701541194947, + "learning_rate": 4.99170375058931e-05, + "loss": 0.1984, + "step": 318 + }, + { + "epoch": 0.16605934409161896, + "grad_norm": 0.4391832742508613, + "learning_rate": 4.9915889894029124e-05, + "loss": 0.1896, + "step": 319 + }, + { + "epoch": 0.1665799062988027, + "grad_norm": 0.4276877940360142, + "learning_rate": 4.991473441256762e-05, + "loss": 0.1883, + "step": 320 + }, + { + "epoch": 0.16710046850598648, + "grad_norm": 0.4783858273158986, + "learning_rate": 4.991357106187356e-05, + "loss": 0.1914, + "step": 321 + }, + { + "epoch": 0.16762103071317022, + "grad_norm": 0.37335690024180623, + "learning_rate": 4.991239984231438e-05, + "loss": 0.1861, + "step": 322 + }, + { + "epoch": 0.168141592920354, + "grad_norm": 0.5571380480881387, + "learning_rate": 4.991122075426001e-05, + "loss": 0.1938, + "step": 323 + }, + { + "epoch": 0.16866215512753774, + "grad_norm": 0.4673856871456706, + "learning_rate": 4.991003379808286e-05, + "loss": 0.1956, + "step": 324 + }, + { + "epoch": 0.1691827173347215, + "grad_norm": 0.6907331185852513, + "learning_rate": 4.990883897415781e-05, + "loss": 0.1946, + "step": 325 + }, + { + "epoch": 0.16970327954190526, + "grad_norm": 0.4420336300041284, + "learning_rate": 4.9907636282862256e-05, + "loss": 0.1917, + "step": 326 + }, + { + "epoch": 0.17022384174908903, + "grad_norm": 0.4673927210202748, + "learning_rate": 4.9906425724576075e-05, + "loss": 0.1868, + "step": 327 + }, + { + "epoch": 0.17074440395627277, + "grad_norm": 0.39164253998754844, + "learning_rate": 4.99052072996816e-05, + "loss": 0.1811, + "step": 328 + }, + { + "epoch": 0.17126496616345654, + "grad_norm": 0.44534231109604294, + "learning_rate": 4.990398100856367e-05, + "loss": 0.1938, + "step": 329 + }, + { + "epoch": 0.1717855283706403, + "grad_norm": 0.46887801231015935, + "learning_rate": 4.990274685160961e-05, + "loss": 0.1903, + "step": 330 + }, + { + "epoch": 0.17230609057782406, + "grad_norm": 0.3968509774573516, + "learning_rate": 4.990150482920921e-05, + "loss": 0.1908, + "step": 331 + }, + { + "epoch": 0.1728266527850078, + "grad_norm": 0.5263341947271295, + "learning_rate": 4.990025494175477e-05, + "loss": 0.1922, + "step": 332 + }, + { + "epoch": 0.17334721499219158, + "grad_norm": 0.4775537352423801, + "learning_rate": 4.989899718964107e-05, + "loss": 0.1841, + "step": 333 + }, + { + "epoch": 0.17386777719937532, + "grad_norm": 0.41924991923989763, + "learning_rate": 4.989773157326535e-05, + "loss": 0.2015, + "step": 334 + }, + { + "epoch": 0.1743883394065591, + "grad_norm": 0.5252465757639925, + "learning_rate": 4.989645809302736e-05, + "loss": 0.1917, + "step": 335 + }, + { + "epoch": 0.17490890161374284, + "grad_norm": 0.5169440105481551, + "learning_rate": 4.9895176749329334e-05, + "loss": 0.1917, + "step": 336 + }, + { + "epoch": 0.1754294638209266, + "grad_norm": 0.42246102114852874, + "learning_rate": 4.989388754257596e-05, + "loss": 0.2005, + "step": 337 + }, + { + "epoch": 0.17595002602811036, + "grad_norm": 0.5029332963068417, + "learning_rate": 4.989259047317444e-05, + "loss": 0.1903, + "step": 338 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 0.41379461152486186, + "learning_rate": 4.989128554153444e-05, + "loss": 0.1844, + "step": 339 + }, + { + "epoch": 0.17699115044247787, + "grad_norm": 0.45322329232876984, + "learning_rate": 4.9889972748068134e-05, + "loss": 0.188, + "step": 340 + }, + { + "epoch": 0.17751171264966165, + "grad_norm": 0.48168712385017154, + "learning_rate": 4.988865209319015e-05, + "loss": 0.1981, + "step": 341 + }, + { + "epoch": 0.1780322748568454, + "grad_norm": 0.41086331407455057, + "learning_rate": 4.988732357731762e-05, + "loss": 0.1942, + "step": 342 + }, + { + "epoch": 0.17855283706402916, + "grad_norm": 0.540497000624346, + "learning_rate": 4.988598720087015e-05, + "loss": 0.1907, + "step": 343 + }, + { + "epoch": 0.1790733992712129, + "grad_norm": 0.38794214622006457, + "learning_rate": 4.9884642964269824e-05, + "loss": 0.1874, + "step": 344 + }, + { + "epoch": 0.17959396147839668, + "grad_norm": 0.5258434187192316, + "learning_rate": 4.988329086794122e-05, + "loss": 0.1877, + "step": 345 + }, + { + "epoch": 0.18011452368558042, + "grad_norm": 0.5353658539595278, + "learning_rate": 4.9881930912311394e-05, + "loss": 0.194, + "step": 346 + }, + { + "epoch": 0.1806350858927642, + "grad_norm": 0.4392015964672634, + "learning_rate": 4.988056309780987e-05, + "loss": 0.1904, + "step": 347 + }, + { + "epoch": 0.18115564809994794, + "grad_norm": 0.5430979821127188, + "learning_rate": 4.987918742486869e-05, + "loss": 0.2045, + "step": 348 + }, + { + "epoch": 0.1816762103071317, + "grad_norm": 0.47746127042266084, + "learning_rate": 4.987780389392234e-05, + "loss": 0.1877, + "step": 349 + }, + { + "epoch": 0.18219677251431546, + "grad_norm": 0.48616627754776087, + "learning_rate": 4.98764125054078e-05, + "loss": 0.1856, + "step": 350 + }, + { + "epoch": 0.18271733472149923, + "grad_norm": 0.4137932189199551, + "learning_rate": 4.987501325976455e-05, + "loss": 0.1882, + "step": 351 + }, + { + "epoch": 0.18323789692868298, + "grad_norm": 0.48916635581424217, + "learning_rate": 4.987360615743453e-05, + "loss": 0.1872, + "step": 352 + }, + { + "epoch": 0.18375845913586675, + "grad_norm": 0.45140194045768417, + "learning_rate": 4.987219119886216e-05, + "loss": 0.1974, + "step": 353 + }, + { + "epoch": 0.1842790213430505, + "grad_norm": 0.4683554765097198, + "learning_rate": 4.987076838449436e-05, + "loss": 0.1887, + "step": 354 + }, + { + "epoch": 0.18479958355023426, + "grad_norm": 0.45043420134623535, + "learning_rate": 4.986933771478052e-05, + "loss": 0.1953, + "step": 355 + }, + { + "epoch": 0.185320145757418, + "grad_norm": 0.38632079781772455, + "learning_rate": 4.9867899190172505e-05, + "loss": 0.1944, + "step": 356 + }, + { + "epoch": 0.18584070796460178, + "grad_norm": 0.4502927446904294, + "learning_rate": 4.986645281112469e-05, + "loss": 0.1993, + "step": 357 + }, + { + "epoch": 0.18636127017178553, + "grad_norm": 0.35109988684784443, + "learning_rate": 4.986499857809387e-05, + "loss": 0.1832, + "step": 358 + }, + { + "epoch": 0.1868818323789693, + "grad_norm": 0.4855353635745786, + "learning_rate": 4.98635364915394e-05, + "loss": 0.1928, + "step": 359 + }, + { + "epoch": 0.18740239458615304, + "grad_norm": 0.3907280285159689, + "learning_rate": 4.986206655192305e-05, + "loss": 0.1876, + "step": 360 + }, + { + "epoch": 0.18792295679333682, + "grad_norm": 0.42227390242070895, + "learning_rate": 4.98605887597091e-05, + "loss": 0.193, + "step": 361 + }, + { + "epoch": 0.18844351900052056, + "grad_norm": 0.39584572126339823, + "learning_rate": 4.985910311536431e-05, + "loss": 0.184, + "step": 362 + }, + { + "epoch": 0.18896408120770433, + "grad_norm": 0.3990052626089986, + "learning_rate": 4.985760961935791e-05, + "loss": 0.1881, + "step": 363 + }, + { + "epoch": 0.18948464341488808, + "grad_norm": 0.42636340213787094, + "learning_rate": 4.9856108272161614e-05, + "loss": 0.1926, + "step": 364 + }, + { + "epoch": 0.19000520562207185, + "grad_norm": 0.42577564707501814, + "learning_rate": 4.9854599074249633e-05, + "loss": 0.195, + "step": 365 + }, + { + "epoch": 0.1905257678292556, + "grad_norm": 0.41616380256414365, + "learning_rate": 4.985308202609863e-05, + "loss": 0.1885, + "step": 366 + }, + { + "epoch": 0.19104633003643937, + "grad_norm": 0.46495409518003894, + "learning_rate": 4.9851557128187755e-05, + "loss": 0.1908, + "step": 367 + }, + { + "epoch": 0.1915668922436231, + "grad_norm": 0.41526376329857984, + "learning_rate": 4.985002438099865e-05, + "loss": 0.188, + "step": 368 + }, + { + "epoch": 0.19208745445080688, + "grad_norm": 0.3935066236122138, + "learning_rate": 4.984848378501542e-05, + "loss": 0.1885, + "step": 369 + }, + { + "epoch": 0.19260801665799063, + "grad_norm": 0.4088622746642285, + "learning_rate": 4.984693534072467e-05, + "loss": 0.1918, + "step": 370 + }, + { + "epoch": 0.1931285788651744, + "grad_norm": 0.3809983710951146, + "learning_rate": 4.984537904861546e-05, + "loss": 0.1826, + "step": 371 + }, + { + "epoch": 0.19364914107235814, + "grad_norm": 0.40658651643232196, + "learning_rate": 4.9843814909179345e-05, + "loss": 0.19, + "step": 372 + }, + { + "epoch": 0.19416970327954192, + "grad_norm": 0.43969698475310975, + "learning_rate": 4.9842242922910345e-05, + "loss": 0.1894, + "step": 373 + }, + { + "epoch": 0.19469026548672566, + "grad_norm": 0.3862559796395953, + "learning_rate": 4.9840663090304965e-05, + "loss": 0.1914, + "step": 374 + }, + { + "epoch": 0.19521082769390943, + "grad_norm": 0.5347847427366343, + "learning_rate": 4.983907541186221e-05, + "loss": 0.1917, + "step": 375 + }, + { + "epoch": 0.19573138990109318, + "grad_norm": 0.3974569413531495, + "learning_rate": 4.983747988808352e-05, + "loss": 0.184, + "step": 376 + }, + { + "epoch": 0.19625195210827695, + "grad_norm": 0.4412640683782702, + "learning_rate": 4.983587651947285e-05, + "loss": 0.1855, + "step": 377 + }, + { + "epoch": 0.1967725143154607, + "grad_norm": 0.4423096342665371, + "learning_rate": 4.983426530653661e-05, + "loss": 0.1923, + "step": 378 + }, + { + "epoch": 0.19729307652264447, + "grad_norm": 0.43821702301793497, + "learning_rate": 4.9832646249783694e-05, + "loss": 0.189, + "step": 379 + }, + { + "epoch": 0.1978136387298282, + "grad_norm": 0.3924545030570308, + "learning_rate": 4.983101934972548e-05, + "loss": 0.1891, + "step": 380 + }, + { + "epoch": 0.19833420093701198, + "grad_norm": 0.42299618371561903, + "learning_rate": 4.982938460687583e-05, + "loss": 0.1903, + "step": 381 + }, + { + "epoch": 0.19885476314419573, + "grad_norm": 0.4001882492331783, + "learning_rate": 4.982774202175105e-05, + "loss": 0.1803, + "step": 382 + }, + { + "epoch": 0.1993753253513795, + "grad_norm": 0.49011114003395145, + "learning_rate": 4.9826091594869974e-05, + "loss": 0.1813, + "step": 383 + }, + { + "epoch": 0.19989588755856325, + "grad_norm": 0.37442432473888976, + "learning_rate": 4.982443332675385e-05, + "loss": 0.1802, + "step": 384 + }, + { + "epoch": 0.20041644976574702, + "grad_norm": 0.48051842908306147, + "learning_rate": 4.9822767217926456e-05, + "loss": 0.1947, + "step": 385 + }, + { + "epoch": 0.20093701197293076, + "grad_norm": 0.4257153401498972, + "learning_rate": 4.982109326891402e-05, + "loss": 0.1884, + "step": 386 + }, + { + "epoch": 0.20145757418011453, + "grad_norm": 0.47752005156773103, + "learning_rate": 4.981941148024526e-05, + "loss": 0.1857, + "step": 387 + }, + { + "epoch": 0.20197813638729828, + "grad_norm": 0.40745960509516727, + "learning_rate": 4.981772185245135e-05, + "loss": 0.187, + "step": 388 + }, + { + "epoch": 0.20249869859448205, + "grad_norm": 0.4359497459483064, + "learning_rate": 4.9816024386065973e-05, + "loss": 0.1871, + "step": 389 + }, + { + "epoch": 0.2030192608016658, + "grad_norm": 0.3892740872530919, + "learning_rate": 4.981431908162525e-05, + "loss": 0.188, + "step": 390 + }, + { + "epoch": 0.20353982300884957, + "grad_norm": 0.420465792965565, + "learning_rate": 4.98126059396678e-05, + "loss": 0.1936, + "step": 391 + }, + { + "epoch": 0.2040603852160333, + "grad_norm": 0.3773127318291861, + "learning_rate": 4.981088496073472e-05, + "loss": 0.1875, + "step": 392 + }, + { + "epoch": 0.20458094742321709, + "grad_norm": 0.4041436379650743, + "learning_rate": 4.980915614536957e-05, + "loss": 0.1915, + "step": 393 + }, + { + "epoch": 0.20510150963040083, + "grad_norm": 0.3808004928284413, + "learning_rate": 4.980741949411839e-05, + "loss": 0.1886, + "step": 394 + }, + { + "epoch": 0.2056220718375846, + "grad_norm": 0.39329042034751993, + "learning_rate": 4.98056750075297e-05, + "loss": 0.189, + "step": 395 + }, + { + "epoch": 0.20614263404476835, + "grad_norm": 0.3950097377987076, + "learning_rate": 4.980392268615447e-05, + "loss": 0.1922, + "step": 396 + }, + { + "epoch": 0.20666319625195212, + "grad_norm": 0.3768193415771947, + "learning_rate": 4.980216253054619e-05, + "loss": 0.1863, + "step": 397 + }, + { + "epoch": 0.20718375845913586, + "grad_norm": 0.40683290447145076, + "learning_rate": 4.98003945412608e-05, + "loss": 0.1888, + "step": 398 + }, + { + "epoch": 0.20770432066631964, + "grad_norm": 0.3547903108580832, + "learning_rate": 4.979861871885669e-05, + "loss": 0.1916, + "step": 399 + }, + { + "epoch": 0.20822488287350338, + "grad_norm": 0.36494431710070346, + "learning_rate": 4.9796835063894765e-05, + "loss": 0.1806, + "step": 400 + }, + { + "epoch": 0.20874544508068715, + "grad_norm": 0.36485565181417856, + "learning_rate": 4.9795043576938384e-05, + "loss": 0.1833, + "step": 401 + }, + { + "epoch": 0.2092660072878709, + "grad_norm": 0.3632899653463087, + "learning_rate": 4.9793244258553375e-05, + "loss": 0.185, + "step": 402 + }, + { + "epoch": 0.20978656949505467, + "grad_norm": 0.35512187990656785, + "learning_rate": 4.979143710930805e-05, + "loss": 0.1817, + "step": 403 + }, + { + "epoch": 0.21030713170223841, + "grad_norm": 0.4054160927254296, + "learning_rate": 4.9789622129773195e-05, + "loss": 0.1807, + "step": 404 + }, + { + "epoch": 0.2108276939094222, + "grad_norm": 0.348222334286123, + "learning_rate": 4.978779932052206e-05, + "loss": 0.1802, + "step": 405 + }, + { + "epoch": 0.21134825611660593, + "grad_norm": 0.35217898427995553, + "learning_rate": 4.978596868213037e-05, + "loss": 0.1876, + "step": 406 + }, + { + "epoch": 0.2118688183237897, + "grad_norm": 0.3399990298458504, + "learning_rate": 4.978413021517634e-05, + "loss": 0.1824, + "step": 407 + }, + { + "epoch": 0.21238938053097345, + "grad_norm": 0.3914421983501113, + "learning_rate": 4.978228392024063e-05, + "loss": 0.1832, + "step": 408 + }, + { + "epoch": 0.21290994273815722, + "grad_norm": 0.3599097944411457, + "learning_rate": 4.978042979790639e-05, + "loss": 0.1857, + "step": 409 + }, + { + "epoch": 0.21343050494534097, + "grad_norm": 0.3764468760970812, + "learning_rate": 4.977856784875924e-05, + "loss": 0.1874, + "step": 410 + }, + { + "epoch": 0.21395106715252474, + "grad_norm": 0.36370332252391396, + "learning_rate": 4.977669807338726e-05, + "loss": 0.1909, + "step": 411 + }, + { + "epoch": 0.21447162935970848, + "grad_norm": 0.3590343415362341, + "learning_rate": 4.9774820472381037e-05, + "loss": 0.1808, + "step": 412 + }, + { + "epoch": 0.21499219156689225, + "grad_norm": 0.3964803489327254, + "learning_rate": 4.977293504633357e-05, + "loss": 0.1993, + "step": 413 + }, + { + "epoch": 0.215512753774076, + "grad_norm": 0.409647084566613, + "learning_rate": 4.977104179584039e-05, + "loss": 0.1828, + "step": 414 + }, + { + "epoch": 0.21603331598125977, + "grad_norm": 0.37684217160913236, + "learning_rate": 4.9769140721499466e-05, + "loss": 0.1903, + "step": 415 + }, + { + "epoch": 0.21655387818844352, + "grad_norm": 0.40627101878375, + "learning_rate": 4.976723182391124e-05, + "loss": 0.1947, + "step": 416 + }, + { + "epoch": 0.2170744403956273, + "grad_norm": 0.4293268173755893, + "learning_rate": 4.9765315103678646e-05, + "loss": 0.1928, + "step": 417 + }, + { + "epoch": 0.21759500260281103, + "grad_norm": 0.363143103690957, + "learning_rate": 4.976339056140706e-05, + "loss": 0.1859, + "step": 418 + }, + { + "epoch": 0.2181155648099948, + "grad_norm": 0.3801901908980379, + "learning_rate": 4.976145819770435e-05, + "loss": 0.1839, + "step": 419 + }, + { + "epoch": 0.21863612701717855, + "grad_norm": 0.3728199137683034, + "learning_rate": 4.975951801318083e-05, + "loss": 0.1836, + "step": 420 + }, + { + "epoch": 0.21915668922436232, + "grad_norm": 0.39399419625104315, + "learning_rate": 4.975757000844932e-05, + "loss": 0.1869, + "step": 421 + }, + { + "epoch": 0.21967725143154607, + "grad_norm": 0.352068817370931, + "learning_rate": 4.975561418412509e-05, + "loss": 0.1836, + "step": 422 + }, + { + "epoch": 0.22019781363872984, + "grad_norm": 0.38356796463195586, + "learning_rate": 4.9753650540825855e-05, + "loss": 0.1813, + "step": 423 + }, + { + "epoch": 0.22071837584591358, + "grad_norm": 0.3610859855606606, + "learning_rate": 4.975167907917187e-05, + "loss": 0.1862, + "step": 424 + }, + { + "epoch": 0.22123893805309736, + "grad_norm": 0.35330258565992684, + "learning_rate": 4.974969979978577e-05, + "loss": 0.1769, + "step": 425 + }, + { + "epoch": 0.2217595002602811, + "grad_norm": 0.384329198269658, + "learning_rate": 4.9747712703292714e-05, + "loss": 0.1807, + "step": 426 + }, + { + "epoch": 0.22228006246746487, + "grad_norm": 0.34746642781914455, + "learning_rate": 4.9745717790320344e-05, + "loss": 0.1809, + "step": 427 + }, + { + "epoch": 0.22280062467464862, + "grad_norm": 0.3880165680244598, + "learning_rate": 4.9743715061498716e-05, + "loss": 0.1867, + "step": 428 + }, + { + "epoch": 0.2233211868818324, + "grad_norm": 0.351470867292313, + "learning_rate": 4.9741704517460406e-05, + "loss": 0.1894, + "step": 429 + }, + { + "epoch": 0.22384174908901613, + "grad_norm": 0.3645151488001888, + "learning_rate": 4.973968615884043e-05, + "loss": 0.1788, + "step": 430 + }, + { + "epoch": 0.2243623112961999, + "grad_norm": 0.3625527069125457, + "learning_rate": 4.973765998627628e-05, + "loss": 0.1914, + "step": 431 + }, + { + "epoch": 0.22488287350338365, + "grad_norm": 0.343361514535999, + "learning_rate": 4.973562600040791e-05, + "loss": 0.1848, + "step": 432 + }, + { + "epoch": 0.22540343571056742, + "grad_norm": 0.37503966422043744, + "learning_rate": 4.973358420187776e-05, + "loss": 0.1879, + "step": 433 + }, + { + "epoch": 0.22592399791775117, + "grad_norm": 0.3474973215881361, + "learning_rate": 4.973153459133071e-05, + "loss": 0.1894, + "step": 434 + }, + { + "epoch": 0.22644456012493494, + "grad_norm": 0.3508809916689539, + "learning_rate": 4.972947716941413e-05, + "loss": 0.1895, + "step": 435 + }, + { + "epoch": 0.22696512233211869, + "grad_norm": 0.3643384265149809, + "learning_rate": 4.9727411936777854e-05, + "loss": 0.1829, + "step": 436 + }, + { + "epoch": 0.22748568453930246, + "grad_norm": 0.36350332295792714, + "learning_rate": 4.972533889407417e-05, + "loss": 0.1813, + "step": 437 + }, + { + "epoch": 0.2280062467464862, + "grad_norm": 0.3539689125746054, + "learning_rate": 4.972325804195784e-05, + "loss": 0.1842, + "step": 438 + }, + { + "epoch": 0.22852680895366997, + "grad_norm": 0.3822911141420241, + "learning_rate": 4.972116938108611e-05, + "loss": 0.193, + "step": 439 + }, + { + "epoch": 0.22904737116085372, + "grad_norm": 0.38773043508499666, + "learning_rate": 4.971907291211866e-05, + "loss": 0.188, + "step": 440 + }, + { + "epoch": 0.2295679333680375, + "grad_norm": 0.35091644126465377, + "learning_rate": 4.971696863571765e-05, + "loss": 0.1824, + "step": 441 + }, + { + "epoch": 0.23008849557522124, + "grad_norm": 0.34427733124831933, + "learning_rate": 4.971485655254773e-05, + "loss": 0.175, + "step": 442 + }, + { + "epoch": 0.230609057782405, + "grad_norm": 0.3543495011979818, + "learning_rate": 4.9712736663275974e-05, + "loss": 0.1849, + "step": 443 + }, + { + "epoch": 0.23112961998958875, + "grad_norm": 0.36206158086227636, + "learning_rate": 4.971060896857195e-05, + "loss": 0.1891, + "step": 444 + }, + { + "epoch": 0.23165018219677252, + "grad_norm": 0.3599367526208678, + "learning_rate": 4.9708473469107676e-05, + "loss": 0.1853, + "step": 445 + }, + { + "epoch": 0.23217074440395627, + "grad_norm": 0.34979470112428146, + "learning_rate": 4.970633016555765e-05, + "loss": 0.1761, + "step": 446 + }, + { + "epoch": 0.23269130661114004, + "grad_norm": 0.4092021184005037, + "learning_rate": 4.9704179058598824e-05, + "loss": 0.1842, + "step": 447 + }, + { + "epoch": 0.2332118688183238, + "grad_norm": 0.3745830889783029, + "learning_rate": 4.970202014891062e-05, + "loss": 0.1854, + "step": 448 + }, + { + "epoch": 0.23373243102550756, + "grad_norm": 0.3787251823910667, + "learning_rate": 4.969985343717492e-05, + "loss": 0.1882, + "step": 449 + }, + { + "epoch": 0.2342529932326913, + "grad_norm": 0.3986619843101879, + "learning_rate": 4.9697678924076066e-05, + "loss": 0.1856, + "step": 450 + }, + { + "epoch": 0.23477355543987508, + "grad_norm": 0.3430314984382981, + "learning_rate": 4.969549661030089e-05, + "loss": 0.1839, + "step": 451 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.3621352775817525, + "learning_rate": 4.969330649653864e-05, + "loss": 0.1848, + "step": 452 + }, + { + "epoch": 0.2358146798542426, + "grad_norm": 0.35888862276629535, + "learning_rate": 4.969110858348108e-05, + "loss": 0.1874, + "step": 453 + }, + { + "epoch": 0.23633524206142634, + "grad_norm": 0.3942881061832858, + "learning_rate": 4.96889028718224e-05, + "loss": 0.178, + "step": 454 + }, + { + "epoch": 0.2368558042686101, + "grad_norm": 0.33338571503996683, + "learning_rate": 4.968668936225928e-05, + "loss": 0.1915, + "step": 455 + }, + { + "epoch": 0.23737636647579385, + "grad_norm": 0.3789691197396563, + "learning_rate": 4.968446805549082e-05, + "loss": 0.1814, + "step": 456 + }, + { + "epoch": 0.23789692868297763, + "grad_norm": 0.36459554389496945, + "learning_rate": 4.968223895221865e-05, + "loss": 0.1874, + "step": 457 + }, + { + "epoch": 0.23841749089016137, + "grad_norm": 0.40802558988494547, + "learning_rate": 4.96800020531468e-05, + "loss": 0.1876, + "step": 458 + }, + { + "epoch": 0.23893805309734514, + "grad_norm": 0.35893156732538567, + "learning_rate": 4.967775735898179e-05, + "loss": 0.1825, + "step": 459 + }, + { + "epoch": 0.2394586153045289, + "grad_norm": 0.3651439723126297, + "learning_rate": 4.967550487043261e-05, + "loss": 0.1853, + "step": 460 + }, + { + "epoch": 0.23997917751171266, + "grad_norm": 0.3624747101403952, + "learning_rate": 4.9673244588210684e-05, + "loss": 0.1783, + "step": 461 + }, + { + "epoch": 0.2404997397188964, + "grad_norm": 0.3360578895946992, + "learning_rate": 4.967097651302993e-05, + "loss": 0.1786, + "step": 462 + }, + { + "epoch": 0.24102030192608018, + "grad_norm": 0.3929754036367695, + "learning_rate": 4.9668700645606704e-05, + "loss": 0.188, + "step": 463 + }, + { + "epoch": 0.24154086413326392, + "grad_norm": 0.34507860442461075, + "learning_rate": 4.966641698665982e-05, + "loss": 0.183, + "step": 464 + }, + { + "epoch": 0.2420614263404477, + "grad_norm": 0.363947787934868, + "learning_rate": 4.9664125536910597e-05, + "loss": 0.179, + "step": 465 + }, + { + "epoch": 0.24258198854763144, + "grad_norm": 0.3444630195014257, + "learning_rate": 4.966182629708275e-05, + "loss": 0.1815, + "step": 466 + }, + { + "epoch": 0.2431025507548152, + "grad_norm": 0.35943929861594537, + "learning_rate": 4.965951926790249e-05, + "loss": 0.1873, + "step": 467 + }, + { + "epoch": 0.24362311296199896, + "grad_norm": 0.3629355287385724, + "learning_rate": 4.9657204450098496e-05, + "loss": 0.1857, + "step": 468 + }, + { + "epoch": 0.24414367516918273, + "grad_norm": 0.32513099997931477, + "learning_rate": 4.9654881844401886e-05, + "loss": 0.1845, + "step": 469 + }, + { + "epoch": 0.24466423737636647, + "grad_norm": 0.375672815309898, + "learning_rate": 4.965255145154625e-05, + "loss": 0.1825, + "step": 470 + }, + { + "epoch": 0.24518479958355024, + "grad_norm": 0.32902747038638014, + "learning_rate": 4.965021327226764e-05, + "loss": 0.1831, + "step": 471 + }, + { + "epoch": 0.245705361790734, + "grad_norm": 0.3723575912403685, + "learning_rate": 4.964786730730455e-05, + "loss": 0.1828, + "step": 472 + }, + { + "epoch": 0.24622592399791776, + "grad_norm": 0.3291696338939864, + "learning_rate": 4.964551355739795e-05, + "loss": 0.1825, + "step": 473 + }, + { + "epoch": 0.2467464862051015, + "grad_norm": 0.40037826390407993, + "learning_rate": 4.964315202329127e-05, + "loss": 0.1948, + "step": 474 + }, + { + "epoch": 0.24726704841228528, + "grad_norm": 0.3980798213979412, + "learning_rate": 4.9640782705730394e-05, + "loss": 0.1818, + "step": 475 + }, + { + "epoch": 0.24778761061946902, + "grad_norm": 0.3307799216619339, + "learning_rate": 4.9638405605463646e-05, + "loss": 0.177, + "step": 476 + }, + { + "epoch": 0.2483081728266528, + "grad_norm": 0.37858138177738815, + "learning_rate": 4.963602072324184e-05, + "loss": 0.1829, + "step": 477 + }, + { + "epoch": 0.24882873503383654, + "grad_norm": 0.34060237598449605, + "learning_rate": 4.963362805981823e-05, + "loss": 0.1872, + "step": 478 + }, + { + "epoch": 0.2493492972410203, + "grad_norm": 0.3491413889386904, + "learning_rate": 4.9631227615948516e-05, + "loss": 0.1719, + "step": 479 + }, + { + "epoch": 0.24986985944820406, + "grad_norm": 0.3693858175983402, + "learning_rate": 4.962881939239089e-05, + "loss": 0.1855, + "step": 480 + }, + { + "epoch": 0.25039042165538783, + "grad_norm": 0.329979856271684, + "learning_rate": 4.962640338990598e-05, + "loss": 0.1885, + "step": 481 + }, + { + "epoch": 0.2509109838625716, + "grad_norm": 0.3796178436862751, + "learning_rate": 4.962397960925686e-05, + "loss": 0.1896, + "step": 482 + }, + { + "epoch": 0.2514315460697553, + "grad_norm": 0.34129494053229703, + "learning_rate": 4.9621548051209075e-05, + "loss": 0.1772, + "step": 483 + }, + { + "epoch": 0.2519521082769391, + "grad_norm": 0.33923486612711795, + "learning_rate": 4.961910871653063e-05, + "loss": 0.1809, + "step": 484 + }, + { + "epoch": 0.25247267048412286, + "grad_norm": 0.32895841536481185, + "learning_rate": 4.961666160599198e-05, + "loss": 0.1801, + "step": 485 + }, + { + "epoch": 0.25299323269130664, + "grad_norm": 0.34829345184997373, + "learning_rate": 4.961420672036603e-05, + "loss": 0.1783, + "step": 486 + }, + { + "epoch": 0.25351379489849035, + "grad_norm": 0.36383868089193916, + "learning_rate": 4.9611744060428156e-05, + "loss": 0.1822, + "step": 487 + }, + { + "epoch": 0.2540343571056741, + "grad_norm": 0.36477630270026584, + "learning_rate": 4.960927362695617e-05, + "loss": 0.1808, + "step": 488 + }, + { + "epoch": 0.2545549193128579, + "grad_norm": 0.39127337500191006, + "learning_rate": 4.960679542073036e-05, + "loss": 0.1889, + "step": 489 + }, + { + "epoch": 0.25507548152004167, + "grad_norm": 0.364605809032773, + "learning_rate": 4.9604309442533454e-05, + "loss": 0.1837, + "step": 490 + }, + { + "epoch": 0.2555960437272254, + "grad_norm": 0.387563172322994, + "learning_rate": 4.960181569315064e-05, + "loss": 0.1813, + "step": 491 + }, + { + "epoch": 0.25611660593440916, + "grad_norm": 0.3574697533499239, + "learning_rate": 4.959931417336956e-05, + "loss": 0.1778, + "step": 492 + }, + { + "epoch": 0.25663716814159293, + "grad_norm": 0.3772487104771915, + "learning_rate": 4.959680488398031e-05, + "loss": 0.1852, + "step": 493 + }, + { + "epoch": 0.2571577303487767, + "grad_norm": 0.37856588039150824, + "learning_rate": 4.959428782577544e-05, + "loss": 0.1852, + "step": 494 + }, + { + "epoch": 0.2576782925559604, + "grad_norm": 0.3473512333204632, + "learning_rate": 4.959176299954995e-05, + "loss": 0.1883, + "step": 495 + }, + { + "epoch": 0.2581988547631442, + "grad_norm": 0.3717884159305744, + "learning_rate": 4.958923040610132e-05, + "loss": 0.1901, + "step": 496 + }, + { + "epoch": 0.25871941697032796, + "grad_norm": 0.3333259893749479, + "learning_rate": 4.958669004622942e-05, + "loss": 0.1755, + "step": 497 + }, + { + "epoch": 0.25923997917751174, + "grad_norm": 0.352187312912872, + "learning_rate": 4.9584141920736656e-05, + "loss": 0.1814, + "step": 498 + }, + { + "epoch": 0.25976054138469545, + "grad_norm": 0.35402911428193723, + "learning_rate": 4.958158603042782e-05, + "loss": 0.1862, + "step": 499 + }, + { + "epoch": 0.2602811035918792, + "grad_norm": 0.351609210599415, + "learning_rate": 4.957902237611018e-05, + "loss": 0.1853, + "step": 500 + }, + { + "epoch": 0.260801665799063, + "grad_norm": 0.3961682023708728, + "learning_rate": 4.957645095859348e-05, + "loss": 0.1804, + "step": 501 + }, + { + "epoch": 0.26132222800624677, + "grad_norm": 0.3432093967728205, + "learning_rate": 4.957387177868986e-05, + "loss": 0.1777, + "step": 502 + }, + { + "epoch": 0.2618427902134305, + "grad_norm": 0.43841871784060643, + "learning_rate": 4.957128483721398e-05, + "loss": 0.1967, + "step": 503 + }, + { + "epoch": 0.26236335242061426, + "grad_norm": 0.32944557567419036, + "learning_rate": 4.9568690134982884e-05, + "loss": 0.1847, + "step": 504 + }, + { + "epoch": 0.26288391462779803, + "grad_norm": 0.4072278216406123, + "learning_rate": 4.956608767281612e-05, + "loss": 0.1916, + "step": 505 + }, + { + "epoch": 0.2634044768349818, + "grad_norm": 0.3470914106501254, + "learning_rate": 4.9563477451535664e-05, + "loss": 0.1686, + "step": 506 + }, + { + "epoch": 0.2639250390421655, + "grad_norm": 0.3431768907003393, + "learning_rate": 4.956085947196595e-05, + "loss": 0.1844, + "step": 507 + }, + { + "epoch": 0.2644456012493493, + "grad_norm": 0.373990259509472, + "learning_rate": 4.955823373493385e-05, + "loss": 0.1866, + "step": 508 + }, + { + "epoch": 0.26496616345653307, + "grad_norm": 0.36035756528381147, + "learning_rate": 4.955560024126868e-05, + "loss": 0.1828, + "step": 509 + }, + { + "epoch": 0.26548672566371684, + "grad_norm": 0.37277974712871015, + "learning_rate": 4.9552958991802245e-05, + "loss": 0.1877, + "step": 510 + }, + { + "epoch": 0.26600728787090056, + "grad_norm": 0.3791475893537772, + "learning_rate": 4.955030998736876e-05, + "loss": 0.1787, + "step": 511 + }, + { + "epoch": 0.2665278500780843, + "grad_norm": 0.34601195974476934, + "learning_rate": 4.9547653228804915e-05, + "loss": 0.178, + "step": 512 + }, + { + "epoch": 0.2670484122852681, + "grad_norm": 0.3860891277749901, + "learning_rate": 4.954498871694982e-05, + "loss": 0.1786, + "step": 513 + }, + { + "epoch": 0.26756897449245187, + "grad_norm": 0.36197045930962224, + "learning_rate": 4.954231645264507e-05, + "loss": 0.181, + "step": 514 + }, + { + "epoch": 0.2680895366996356, + "grad_norm": 0.35660823996500324, + "learning_rate": 4.953963643673468e-05, + "loss": 0.1816, + "step": 515 + }, + { + "epoch": 0.26861009890681936, + "grad_norm": 0.37899933398158164, + "learning_rate": 4.953694867006513e-05, + "loss": 0.1794, + "step": 516 + }, + { + "epoch": 0.26913066111400313, + "grad_norm": 0.37134644471067046, + "learning_rate": 4.953425315348534e-05, + "loss": 0.1829, + "step": 517 + }, + { + "epoch": 0.2696512233211869, + "grad_norm": 0.35629118982864444, + "learning_rate": 4.953154988784667e-05, + "loss": 0.1825, + "step": 518 + }, + { + "epoch": 0.2701717855283706, + "grad_norm": 0.3569372189926499, + "learning_rate": 4.952883887400296e-05, + "loss": 0.1852, + "step": 519 + }, + { + "epoch": 0.2706923477355544, + "grad_norm": 0.3784207348633246, + "learning_rate": 4.9526120112810445e-05, + "loss": 0.1792, + "step": 520 + }, + { + "epoch": 0.27121290994273817, + "grad_norm": 0.3571363618294585, + "learning_rate": 4.952339360512786e-05, + "loss": 0.1829, + "step": 521 + }, + { + "epoch": 0.27173347214992194, + "grad_norm": 0.37323568055590195, + "learning_rate": 4.952065935181635e-05, + "loss": 0.1844, + "step": 522 + }, + { + "epoch": 0.27225403435710566, + "grad_norm": 0.3405402620718736, + "learning_rate": 4.951791735373953e-05, + "loss": 0.1817, + "step": 523 + }, + { + "epoch": 0.27277459656428943, + "grad_norm": 0.3604521476507971, + "learning_rate": 4.9515167611763434e-05, + "loss": 0.1897, + "step": 524 + }, + { + "epoch": 0.2732951587714732, + "grad_norm": 0.33208621053766485, + "learning_rate": 4.951241012675657e-05, + "loss": 0.1792, + "step": 525 + }, + { + "epoch": 0.273815720978657, + "grad_norm": 0.3839270762469873, + "learning_rate": 4.950964489958988e-05, + "loss": 0.1841, + "step": 526 + }, + { + "epoch": 0.2743362831858407, + "grad_norm": 0.31535848820876206, + "learning_rate": 4.950687193113676e-05, + "loss": 0.1743, + "step": 527 + }, + { + "epoch": 0.27485684539302446, + "grad_norm": 0.37827478667026426, + "learning_rate": 4.950409122227302e-05, + "loss": 0.1883, + "step": 528 + }, + { + "epoch": 0.27537740760020823, + "grad_norm": 0.3582790920870559, + "learning_rate": 4.950130277387695e-05, + "loss": 0.1772, + "step": 529 + }, + { + "epoch": 0.275897969807392, + "grad_norm": 0.3584180526860497, + "learning_rate": 4.949850658682929e-05, + "loss": 0.1876, + "step": 530 + }, + { + "epoch": 0.2764185320145757, + "grad_norm": 0.5738445813183063, + "learning_rate": 4.949570266201317e-05, + "loss": 0.178, + "step": 531 + }, + { + "epoch": 0.2769390942217595, + "grad_norm": 0.3744948980693089, + "learning_rate": 4.949289100031422e-05, + "loss": 0.1823, + "step": 532 + }, + { + "epoch": 0.27745965642894327, + "grad_norm": 0.3379331374106325, + "learning_rate": 4.949007160262049e-05, + "loss": 0.1768, + "step": 533 + }, + { + "epoch": 0.27798021863612704, + "grad_norm": 0.3480919801415185, + "learning_rate": 4.948724446982248e-05, + "loss": 0.1859, + "step": 534 + }, + { + "epoch": 0.27850078084331076, + "grad_norm": 0.3842609731913717, + "learning_rate": 4.948440960281313e-05, + "loss": 0.1753, + "step": 535 + }, + { + "epoch": 0.27902134305049453, + "grad_norm": 0.37532802640166946, + "learning_rate": 4.948156700248782e-05, + "loss": 0.1837, + "step": 536 + }, + { + "epoch": 0.2795419052576783, + "grad_norm": 0.37959061595323806, + "learning_rate": 4.947871666974437e-05, + "loss": 0.1833, + "step": 537 + }, + { + "epoch": 0.2800624674648621, + "grad_norm": 0.4342019133871026, + "learning_rate": 4.9475858605483074e-05, + "loss": 0.1765, + "step": 538 + }, + { + "epoch": 0.2805830296720458, + "grad_norm": 0.3600696888138904, + "learning_rate": 4.94729928106066e-05, + "loss": 0.1839, + "step": 539 + }, + { + "epoch": 0.28110359187922956, + "grad_norm": 0.4075659366581729, + "learning_rate": 4.9470119286020134e-05, + "loss": 0.178, + "step": 540 + }, + { + "epoch": 0.28162415408641334, + "grad_norm": 0.3288846470475826, + "learning_rate": 4.946723803263125e-05, + "loss": 0.1822, + "step": 541 + }, + { + "epoch": 0.2821447162935971, + "grad_norm": 0.4265164240412001, + "learning_rate": 4.946434905134999e-05, + "loss": 0.1857, + "step": 542 + }, + { + "epoch": 0.2826652785007808, + "grad_norm": 0.3490790807245664, + "learning_rate": 4.9461452343088835e-05, + "loss": 0.1792, + "step": 543 + }, + { + "epoch": 0.2831858407079646, + "grad_norm": 0.3906850709137392, + "learning_rate": 4.945854790876268e-05, + "loss": 0.183, + "step": 544 + }, + { + "epoch": 0.28370640291514837, + "grad_norm": 0.3673184770495506, + "learning_rate": 4.94556357492889e-05, + "loss": 0.1791, + "step": 545 + }, + { + "epoch": 0.28422696512233214, + "grad_norm": 0.3435991187690188, + "learning_rate": 4.9452715865587274e-05, + "loss": 0.1799, + "step": 546 + }, + { + "epoch": 0.28474752732951586, + "grad_norm": 0.33557383372158994, + "learning_rate": 4.944978825858005e-05, + "loss": 0.1754, + "step": 547 + }, + { + "epoch": 0.28526808953669963, + "grad_norm": 0.3472232615973859, + "learning_rate": 4.944685292919191e-05, + "loss": 0.1829, + "step": 548 + }, + { + "epoch": 0.2857886517438834, + "grad_norm": 0.34437324485625154, + "learning_rate": 4.9443909878349945e-05, + "loss": 0.1849, + "step": 549 + }, + { + "epoch": 0.2863092139510672, + "grad_norm": 0.3366544790820249, + "learning_rate": 4.944095910698372e-05, + "loss": 0.1732, + "step": 550 + }, + { + "epoch": 0.2868297761582509, + "grad_norm": 0.3249404378916035, + "learning_rate": 4.9438000616025226e-05, + "loss": 0.1756, + "step": 551 + }, + { + "epoch": 0.28735033836543467, + "grad_norm": 0.3849346781758215, + "learning_rate": 4.94350344064089e-05, + "loss": 0.1856, + "step": 552 + }, + { + "epoch": 0.28787090057261844, + "grad_norm": 0.3243739567095206, + "learning_rate": 4.9432060479071584e-05, + "loss": 0.183, + "step": 553 + }, + { + "epoch": 0.2883914627798022, + "grad_norm": 0.3612277561023643, + "learning_rate": 4.942907883495261e-05, + "loss": 0.1786, + "step": 554 + }, + { + "epoch": 0.2889120249869859, + "grad_norm": 0.3323067953459406, + "learning_rate": 4.9426089474993696e-05, + "loss": 0.1816, + "step": 555 + }, + { + "epoch": 0.2894325871941697, + "grad_norm": 0.3316679874226504, + "learning_rate": 4.942309240013905e-05, + "loss": 0.1787, + "step": 556 + }, + { + "epoch": 0.28995314940135347, + "grad_norm": 0.36297638930718357, + "learning_rate": 4.9420087611335265e-05, + "loss": 0.1895, + "step": 557 + }, + { + "epoch": 0.29047371160853724, + "grad_norm": 0.34447907177875525, + "learning_rate": 4.94170751095314e-05, + "loss": 0.1831, + "step": 558 + }, + { + "epoch": 0.29099427381572096, + "grad_norm": 0.37466310329246094, + "learning_rate": 4.941405489567893e-05, + "loss": 0.18, + "step": 559 + }, + { + "epoch": 0.29151483602290473, + "grad_norm": 0.3240041218848788, + "learning_rate": 4.9411026970731805e-05, + "loss": 0.1695, + "step": 560 + }, + { + "epoch": 0.2920353982300885, + "grad_norm": 0.34612245370701383, + "learning_rate": 4.940799133564637e-05, + "loss": 0.1722, + "step": 561 + }, + { + "epoch": 0.2925559604372723, + "grad_norm": 0.35368918014173173, + "learning_rate": 4.9404947991381416e-05, + "loss": 0.1845, + "step": 562 + }, + { + "epoch": 0.293076522644456, + "grad_norm": 0.3048992546896397, + "learning_rate": 4.9401896938898185e-05, + "loss": 0.1764, + "step": 563 + }, + { + "epoch": 0.29359708485163977, + "grad_norm": 0.3780353682612039, + "learning_rate": 4.9398838179160326e-05, + "loss": 0.1763, + "step": 564 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 0.33431520426027345, + "learning_rate": 4.939577171313395e-05, + "loss": 0.1738, + "step": 565 + }, + { + "epoch": 0.2946382092660073, + "grad_norm": 0.3574740739801916, + "learning_rate": 4.9392697541787587e-05, + "loss": 0.1864, + "step": 566 + }, + { + "epoch": 0.29515877147319103, + "grad_norm": 0.3080198068295583, + "learning_rate": 4.93896156660922e-05, + "loss": 0.1758, + "step": 567 + }, + { + "epoch": 0.2956793336803748, + "grad_norm": 0.34221401560536524, + "learning_rate": 4.938652608702119e-05, + "loss": 0.1787, + "step": 568 + }, + { + "epoch": 0.2961998958875586, + "grad_norm": 0.3422289356320483, + "learning_rate": 4.938342880555039e-05, + "loss": 0.1791, + "step": 569 + }, + { + "epoch": 0.29672045809474235, + "grad_norm": 0.32294407935462033, + "learning_rate": 4.938032382265807e-05, + "loss": 0.1804, + "step": 570 + }, + { + "epoch": 0.29724102030192606, + "grad_norm": 0.32293386743920566, + "learning_rate": 4.937721113932493e-05, + "loss": 0.1745, + "step": 571 + }, + { + "epoch": 0.29776158250910983, + "grad_norm": 0.3339982925147738, + "learning_rate": 4.937409075653408e-05, + "loss": 0.1855, + "step": 572 + }, + { + "epoch": 0.2982821447162936, + "grad_norm": 0.33201821976883394, + "learning_rate": 4.9370962675271106e-05, + "loss": 0.1867, + "step": 573 + }, + { + "epoch": 0.2988027069234774, + "grad_norm": 0.36333200193928655, + "learning_rate": 4.936782689652399e-05, + "loss": 0.1798, + "step": 574 + }, + { + "epoch": 0.2993232691306611, + "grad_norm": 0.3170978099956422, + "learning_rate": 4.936468342128315e-05, + "loss": 0.1734, + "step": 575 + }, + { + "epoch": 0.29984383133784487, + "grad_norm": 0.3809200990521368, + "learning_rate": 4.936153225054146e-05, + "loss": 0.1731, + "step": 576 + }, + { + "epoch": 0.30036439354502864, + "grad_norm": 0.32675460808837287, + "learning_rate": 4.93583733852942e-05, + "loss": 0.1796, + "step": 577 + }, + { + "epoch": 0.3008849557522124, + "grad_norm": 0.34942233525996, + "learning_rate": 4.935520682653908e-05, + "loss": 0.182, + "step": 578 + }, + { + "epoch": 0.30140551795939613, + "grad_norm": 0.34564982764778135, + "learning_rate": 4.9352032575276255e-05, + "loss": 0.1778, + "step": 579 + }, + { + "epoch": 0.3019260801665799, + "grad_norm": 0.3716440462553411, + "learning_rate": 4.9348850632508295e-05, + "loss": 0.1793, + "step": 580 + }, + { + "epoch": 0.3024466423737637, + "grad_norm": 0.38121896546844525, + "learning_rate": 4.934566099924021e-05, + "loss": 0.1846, + "step": 581 + }, + { + "epoch": 0.30296720458094745, + "grad_norm": 0.34320281278816944, + "learning_rate": 4.9342463676479424e-05, + "loss": 0.1877, + "step": 582 + }, + { + "epoch": 0.30348776678813116, + "grad_norm": 0.356611150010106, + "learning_rate": 4.933925866523581e-05, + "loss": 0.1787, + "step": 583 + }, + { + "epoch": 0.30400832899531494, + "grad_norm": 0.3535551866752405, + "learning_rate": 4.933604596652166e-05, + "loss": 0.1753, + "step": 584 + }, + { + "epoch": 0.3045288912024987, + "grad_norm": 0.3587966333569922, + "learning_rate": 4.933282558135169e-05, + "loss": 0.1767, + "step": 585 + }, + { + "epoch": 0.3050494534096825, + "grad_norm": 0.32820492797103323, + "learning_rate": 4.932959751074305e-05, + "loss": 0.1765, + "step": 586 + }, + { + "epoch": 0.3055700156168662, + "grad_norm": 0.3668795974443606, + "learning_rate": 4.932636175571531e-05, + "loss": 0.1847, + "step": 587 + }, + { + "epoch": 0.30609057782404997, + "grad_norm": 0.3893815535302477, + "learning_rate": 4.932311831729048e-05, + "loss": 0.1769, + "step": 588 + }, + { + "epoch": 0.30661114003123374, + "grad_norm": 0.32745455373903526, + "learning_rate": 4.931986719649299e-05, + "loss": 0.1803, + "step": 589 + }, + { + "epoch": 0.3071317022384175, + "grad_norm": 0.33500065348948266, + "learning_rate": 4.9316608394349684e-05, + "loss": 0.1759, + "step": 590 + }, + { + "epoch": 0.30765226444560123, + "grad_norm": 0.34040682897700947, + "learning_rate": 4.931334191188985e-05, + "loss": 0.1809, + "step": 591 + }, + { + "epoch": 0.308172826652785, + "grad_norm": 0.30393987030284975, + "learning_rate": 4.93100677501452e-05, + "loss": 0.1707, + "step": 592 + }, + { + "epoch": 0.3086933888599688, + "grad_norm": 0.3351048511454525, + "learning_rate": 4.930678591014986e-05, + "loss": 0.1792, + "step": 593 + }, + { + "epoch": 0.30921395106715255, + "grad_norm": 0.36537436869160295, + "learning_rate": 4.930349639294038e-05, + "loss": 0.1808, + "step": 594 + }, + { + "epoch": 0.30973451327433627, + "grad_norm": 0.32634199146147447, + "learning_rate": 4.930019919955576e-05, + "loss": 0.1736, + "step": 595 + }, + { + "epoch": 0.31025507548152004, + "grad_norm": 0.3299845870559719, + "learning_rate": 4.9296894331037405e-05, + "loss": 0.1793, + "step": 596 + }, + { + "epoch": 0.3107756376887038, + "grad_norm": 0.34165209773973343, + "learning_rate": 4.9293581788429136e-05, + "loss": 0.1814, + "step": 597 + }, + { + "epoch": 0.3112961998958876, + "grad_norm": 0.3453422147378798, + "learning_rate": 4.92902615727772e-05, + "loss": 0.177, + "step": 598 + }, + { + "epoch": 0.3118167621030713, + "grad_norm": 0.32274993522205253, + "learning_rate": 4.92869336851303e-05, + "loss": 0.1708, + "step": 599 + }, + { + "epoch": 0.31233732431025507, + "grad_norm": 0.32030522939856365, + "learning_rate": 4.9283598126539524e-05, + "loss": 0.173, + "step": 600 + }, + { + "epoch": 0.31285788651743884, + "grad_norm": 0.3482608585554554, + "learning_rate": 4.92802548980584e-05, + "loss": 0.179, + "step": 601 + }, + { + "epoch": 0.3133784487246226, + "grad_norm": 0.3692046388241526, + "learning_rate": 4.927690400074286e-05, + "loss": 0.1812, + "step": 602 + }, + { + "epoch": 0.31389901093180633, + "grad_norm": 0.33322086716168214, + "learning_rate": 4.92735454356513e-05, + "loss": 0.1852, + "step": 603 + }, + { + "epoch": 0.3144195731389901, + "grad_norm": 0.3715077169273792, + "learning_rate": 4.92701792038445e-05, + "loss": 0.1767, + "step": 604 + }, + { + "epoch": 0.3149401353461739, + "grad_norm": 0.33348425102156853, + "learning_rate": 4.926680530638567e-05, + "loss": 0.179, + "step": 605 + }, + { + "epoch": 0.31546069755335765, + "grad_norm": 0.34957898455987335, + "learning_rate": 4.926342374434043e-05, + "loss": 0.1824, + "step": 606 + }, + { + "epoch": 0.31598125976054137, + "grad_norm": 0.35273594539974945, + "learning_rate": 4.926003451877687e-05, + "loss": 0.1784, + "step": 607 + }, + { + "epoch": 0.31650182196772514, + "grad_norm": 0.32859387991060474, + "learning_rate": 4.9256637630765425e-05, + "loss": 0.1761, + "step": 608 + }, + { + "epoch": 0.3170223841749089, + "grad_norm": 0.3364281217569474, + "learning_rate": 4.9253233081379024e-05, + "loss": 0.1757, + "step": 609 + }, + { + "epoch": 0.3175429463820927, + "grad_norm": 0.3180815183990718, + "learning_rate": 4.924982087169296e-05, + "loss": 0.1688, + "step": 610 + }, + { + "epoch": 0.3180635085892764, + "grad_norm": 0.3265120523959636, + "learning_rate": 4.9246401002784976e-05, + "loss": 0.1794, + "step": 611 + }, + { + "epoch": 0.3185840707964602, + "grad_norm": 0.3435491116451335, + "learning_rate": 4.9242973475735224e-05, + "loss": 0.1807, + "step": 612 + }, + { + "epoch": 0.31910463300364394, + "grad_norm": 0.3601397832581144, + "learning_rate": 4.923953829162628e-05, + "loss": 0.1844, + "step": 613 + }, + { + "epoch": 0.3196251952108277, + "grad_norm": 0.33676918035319736, + "learning_rate": 4.923609545154313e-05, + "loss": 0.1794, + "step": 614 + }, + { + "epoch": 0.32014575741801143, + "grad_norm": 0.35847840458223534, + "learning_rate": 4.923264495657319e-05, + "loss": 0.1796, + "step": 615 + }, + { + "epoch": 0.3206663196251952, + "grad_norm": 0.3517065818342989, + "learning_rate": 4.9229186807806284e-05, + "loss": 0.1772, + "step": 616 + }, + { + "epoch": 0.321186881832379, + "grad_norm": 0.32840575469268135, + "learning_rate": 4.9225721006334644e-05, + "loss": 0.183, + "step": 617 + }, + { + "epoch": 0.32170744403956275, + "grad_norm": 0.3304656690924349, + "learning_rate": 4.922224755325295e-05, + "loss": 0.1826, + "step": 618 + }, + { + "epoch": 0.32222800624674647, + "grad_norm": 0.3365642578033948, + "learning_rate": 4.921876644965827e-05, + "loss": 0.1849, + "step": 619 + }, + { + "epoch": 0.32274856845393024, + "grad_norm": 0.321873293657072, + "learning_rate": 4.921527769665011e-05, + "loss": 0.1842, + "step": 620 + }, + { + "epoch": 0.323269130661114, + "grad_norm": 0.3304218237484672, + "learning_rate": 4.921178129533036e-05, + "loss": 0.1766, + "step": 621 + }, + { + "epoch": 0.3237896928682978, + "grad_norm": 0.36159820669310355, + "learning_rate": 4.920827724680336e-05, + "loss": 0.1833, + "step": 622 + }, + { + "epoch": 0.3243102550754815, + "grad_norm": 0.3125115981551096, + "learning_rate": 4.9204765552175857e-05, + "loss": 0.1687, + "step": 623 + }, + { + "epoch": 0.3248308172826653, + "grad_norm": 0.355646049290069, + "learning_rate": 4.920124621255699e-05, + "loss": 0.1831, + "step": 624 + }, + { + "epoch": 0.32535137948984905, + "grad_norm": 0.31743405014825066, + "learning_rate": 4.9197719229058346e-05, + "loss": 0.1719, + "step": 625 + }, + { + "epoch": 0.3258719416970328, + "grad_norm": 0.3320164493908055, + "learning_rate": 4.9194184602793904e-05, + "loss": 0.1779, + "step": 626 + }, + { + "epoch": 0.32639250390421654, + "grad_norm": 0.3569014079597437, + "learning_rate": 4.919064233488006e-05, + "loss": 0.1792, + "step": 627 + }, + { + "epoch": 0.3269130661114003, + "grad_norm": 0.3422394188289934, + "learning_rate": 4.9187092426435634e-05, + "loss": 0.1853, + "step": 628 + }, + { + "epoch": 0.3274336283185841, + "grad_norm": 0.45779658269843443, + "learning_rate": 4.918353487858185e-05, + "loss": 0.1865, + "step": 629 + }, + { + "epoch": 0.32795419052576785, + "grad_norm": 0.36967762282280137, + "learning_rate": 4.917996969244235e-05, + "loss": 0.1842, + "step": 630 + }, + { + "epoch": 0.32847475273295157, + "grad_norm": 0.3383443636578056, + "learning_rate": 4.917639686914317e-05, + "loss": 0.177, + "step": 631 + }, + { + "epoch": 0.32899531494013534, + "grad_norm": 0.34576867722134114, + "learning_rate": 4.91728164098128e-05, + "loss": 0.1741, + "step": 632 + }, + { + "epoch": 0.3295158771473191, + "grad_norm": 0.33889725319396574, + "learning_rate": 4.9169228315582094e-05, + "loss": 0.1733, + "step": 633 + }, + { + "epoch": 0.3300364393545029, + "grad_norm": 0.3181574991844868, + "learning_rate": 4.9165632587584346e-05, + "loss": 0.1712, + "step": 634 + }, + { + "epoch": 0.3305570015616866, + "grad_norm": 0.3150178248046848, + "learning_rate": 4.916202922695526e-05, + "loss": 0.1769, + "step": 635 + }, + { + "epoch": 0.3310775637688704, + "grad_norm": 0.3231916056988664, + "learning_rate": 4.9158418234832935e-05, + "loss": 0.1711, + "step": 636 + }, + { + "epoch": 0.33159812597605415, + "grad_norm": 0.3219246488562882, + "learning_rate": 4.9154799612357905e-05, + "loss": 0.1742, + "step": 637 + }, + { + "epoch": 0.3321186881832379, + "grad_norm": 0.3490019970784047, + "learning_rate": 4.915117336067308e-05, + "loss": 0.187, + "step": 638 + }, + { + "epoch": 0.33263925039042164, + "grad_norm": 0.3404922017535072, + "learning_rate": 4.914753948092381e-05, + "loss": 0.1814, + "step": 639 + }, + { + "epoch": 0.3331598125976054, + "grad_norm": 0.33038580290518343, + "learning_rate": 4.9143897974257845e-05, + "loss": 0.1836, + "step": 640 + }, + { + "epoch": 0.3336803748047892, + "grad_norm": 0.48447675580050537, + "learning_rate": 4.914024884182534e-05, + "loss": 0.1723, + "step": 641 + }, + { + "epoch": 0.33420093701197295, + "grad_norm": 0.31792440208506284, + "learning_rate": 4.913659208477886e-05, + "loss": 0.1744, + "step": 642 + }, + { + "epoch": 0.33472149921915667, + "grad_norm": 1.1187782722485253, + "learning_rate": 4.9132927704273376e-05, + "loss": 0.1782, + "step": 643 + }, + { + "epoch": 0.33524206142634044, + "grad_norm": 0.46784530940515706, + "learning_rate": 4.9129255701466284e-05, + "loss": 0.1785, + "step": 644 + }, + { + "epoch": 0.3357626236335242, + "grad_norm": 3.7910679408191483, + "learning_rate": 4.9125576077517356e-05, + "loss": 0.19, + "step": 645 + }, + { + "epoch": 0.336283185840708, + "grad_norm": 0.5244754308415, + "learning_rate": 4.9121888833588795e-05, + "loss": 0.179, + "step": 646 + }, + { + "epoch": 0.3368037480478917, + "grad_norm": 1.3332280542365016, + "learning_rate": 4.91181939708452e-05, + "loss": 0.1785, + "step": 647 + }, + { + "epoch": 0.3373243102550755, + "grad_norm": 0.6152204952369942, + "learning_rate": 4.9114491490453585e-05, + "loss": 0.1817, + "step": 648 + }, + { + "epoch": 0.33784487246225925, + "grad_norm": 0.6322972902919656, + "learning_rate": 4.911078139358337e-05, + "loss": 0.1834, + "step": 649 + }, + { + "epoch": 0.338365434669443, + "grad_norm": 0.4481338546785872, + "learning_rate": 4.9107063681406376e-05, + "loss": 0.1825, + "step": 650 + }, + { + "epoch": 0.33888599687662674, + "grad_norm": 0.9859952621710755, + "learning_rate": 4.910333835509682e-05, + "loss": 0.1809, + "step": 651 + }, + { + "epoch": 0.3394065590838105, + "grad_norm": 0.4410398257105147, + "learning_rate": 4.909960541583133e-05, + "loss": 0.1758, + "step": 652 + }, + { + "epoch": 0.3399271212909943, + "grad_norm": 0.46192330205640203, + "learning_rate": 4.909586486478897e-05, + "loss": 0.1745, + "step": 653 + }, + { + "epoch": 0.34044768349817806, + "grad_norm": 0.4347198500546606, + "learning_rate": 4.909211670315114e-05, + "loss": 0.1763, + "step": 654 + }, + { + "epoch": 0.34096824570536177, + "grad_norm": 0.47923777050652683, + "learning_rate": 4.908836093210172e-05, + "loss": 0.186, + "step": 655 + }, + { + "epoch": 0.34148880791254554, + "grad_norm": 0.3778493845758958, + "learning_rate": 4.9084597552826935e-05, + "loss": 0.1778, + "step": 656 + }, + { + "epoch": 0.3420093701197293, + "grad_norm": 0.4629741259881899, + "learning_rate": 4.908082656651544e-05, + "loss": 0.1835, + "step": 657 + }, + { + "epoch": 0.3425299323269131, + "grad_norm": 0.43536744466295096, + "learning_rate": 4.907704797435829e-05, + "loss": 0.1836, + "step": 658 + }, + { + "epoch": 0.3430504945340968, + "grad_norm": 0.390115416460036, + "learning_rate": 4.9073261777548954e-05, + "loss": 0.1835, + "step": 659 + }, + { + "epoch": 0.3435710567412806, + "grad_norm": 0.46457793969633604, + "learning_rate": 4.9069467977283255e-05, + "loss": 0.1771, + "step": 660 + }, + { + "epoch": 0.34409161894846435, + "grad_norm": 0.3963393707607232, + "learning_rate": 4.906566657475949e-05, + "loss": 0.1769, + "step": 661 + }, + { + "epoch": 0.3446121811556481, + "grad_norm": 0.36870322720760557, + "learning_rate": 4.906185757117829e-05, + "loss": 0.1781, + "step": 662 + }, + { + "epoch": 0.34513274336283184, + "grad_norm": 0.3979707755147091, + "learning_rate": 4.905804096774274e-05, + "loss": 0.1801, + "step": 663 + }, + { + "epoch": 0.3456533055700156, + "grad_norm": 0.36701302139429726, + "learning_rate": 4.905421676565827e-05, + "loss": 0.181, + "step": 664 + }, + { + "epoch": 0.3461738677771994, + "grad_norm": 0.3698838374588819, + "learning_rate": 4.905038496613277e-05, + "loss": 0.1877, + "step": 665 + }, + { + "epoch": 0.34669442998438316, + "grad_norm": 0.352717338392521, + "learning_rate": 4.9046545570376484e-05, + "loss": 0.1789, + "step": 666 + }, + { + "epoch": 0.3472149921915669, + "grad_norm": 0.3809906085196877, + "learning_rate": 4.904269857960207e-05, + "loss": 0.1779, + "step": 667 + }, + { + "epoch": 0.34773555439875065, + "grad_norm": 0.3717632611173711, + "learning_rate": 4.9038843995024606e-05, + "loss": 0.1841, + "step": 668 + }, + { + "epoch": 0.3482561166059344, + "grad_norm": 0.3616057869533888, + "learning_rate": 4.9034981817861534e-05, + "loss": 0.1832, + "step": 669 + }, + { + "epoch": 0.3487766788131182, + "grad_norm": 0.3720170377297904, + "learning_rate": 4.9031112049332715e-05, + "loss": 0.1773, + "step": 670 + }, + { + "epoch": 0.3492972410203019, + "grad_norm": 0.38176483972868525, + "learning_rate": 4.9027234690660396e-05, + "loss": 0.1795, + "step": 671 + }, + { + "epoch": 0.3498178032274857, + "grad_norm": 0.3971475485254792, + "learning_rate": 4.902334974306924e-05, + "loss": 0.187, + "step": 672 + }, + { + "epoch": 0.35033836543466945, + "grad_norm": 0.33041713668617295, + "learning_rate": 4.9019457207786265e-05, + "loss": 0.168, + "step": 673 + }, + { + "epoch": 0.3508589276418532, + "grad_norm": 0.3660144363430606, + "learning_rate": 4.901555708604095e-05, + "loss": 0.1856, + "step": 674 + }, + { + "epoch": 0.35137948984903694, + "grad_norm": 0.3766530978916359, + "learning_rate": 4.901164937906511e-05, + "loss": 0.1794, + "step": 675 + }, + { + "epoch": 0.3519000520562207, + "grad_norm": 0.3378578696974802, + "learning_rate": 4.900773408809299e-05, + "loss": 0.1757, + "step": 676 + }, + { + "epoch": 0.3524206142634045, + "grad_norm": 0.421166687268464, + "learning_rate": 4.900381121436123e-05, + "loss": 0.1824, + "step": 677 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.36026982081841086, + "learning_rate": 4.8999880759108844e-05, + "loss": 0.1811, + "step": 678 + }, + { + "epoch": 0.353461738677772, + "grad_norm": 0.3655859893200682, + "learning_rate": 4.899594272357726e-05, + "loss": 0.1739, + "step": 679 + }, + { + "epoch": 0.35398230088495575, + "grad_norm": 0.3821275209491641, + "learning_rate": 4.899199710901028e-05, + "loss": 0.1783, + "step": 680 + }, + { + "epoch": 0.3545028630921395, + "grad_norm": 0.3409239426235484, + "learning_rate": 4.8988043916654126e-05, + "loss": 0.1782, + "step": 681 + }, + { + "epoch": 0.3550234252993233, + "grad_norm": 0.37076066161212423, + "learning_rate": 4.89840831477574e-05, + "loss": 0.1732, + "step": 682 + }, + { + "epoch": 0.355543987506507, + "grad_norm": 0.3742846141003235, + "learning_rate": 4.8980114803571084e-05, + "loss": 0.1874, + "step": 683 + }, + { + "epoch": 0.3560645497136908, + "grad_norm": 0.3537697722706192, + "learning_rate": 4.8976138885348575e-05, + "loss": 0.1813, + "step": 684 + }, + { + "epoch": 0.35658511192087455, + "grad_norm": 0.34743989109554174, + "learning_rate": 4.897215539434566e-05, + "loss": 0.1759, + "step": 685 + }, + { + "epoch": 0.3571056741280583, + "grad_norm": 0.3591186392260258, + "learning_rate": 4.89681643318205e-05, + "loss": 0.1677, + "step": 686 + }, + { + "epoch": 0.35762623633524204, + "grad_norm": 0.35400264664937536, + "learning_rate": 4.896416569903366e-05, + "loss": 0.1755, + "step": 687 + }, + { + "epoch": 0.3581467985424258, + "grad_norm": 0.36086365972267287, + "learning_rate": 4.89601594972481e-05, + "loss": 0.1738, + "step": 688 + }, + { + "epoch": 0.3586673607496096, + "grad_norm": 0.34105586937400784, + "learning_rate": 4.8956145727729156e-05, + "loss": 0.1743, + "step": 689 + }, + { + "epoch": 0.35918792295679336, + "grad_norm": 0.3373349436933188, + "learning_rate": 4.895212439174457e-05, + "loss": 0.1733, + "step": 690 + }, + { + "epoch": 0.3597084851639771, + "grad_norm": 0.3503292696755782, + "learning_rate": 4.894809549056447e-05, + "loss": 0.1727, + "step": 691 + }, + { + "epoch": 0.36022904737116085, + "grad_norm": 0.35086053297097963, + "learning_rate": 4.894405902546136e-05, + "loss": 0.1863, + "step": 692 + }, + { + "epoch": 0.3607496095783446, + "grad_norm": 0.3839353193339784, + "learning_rate": 4.894001499771015e-05, + "loss": 0.183, + "step": 693 + }, + { + "epoch": 0.3612701717855284, + "grad_norm": 0.32399392427850626, + "learning_rate": 4.8935963408588134e-05, + "loss": 0.1748, + "step": 694 + }, + { + "epoch": 0.3617907339927121, + "grad_norm": 0.3585859771864591, + "learning_rate": 4.893190425937499e-05, + "loss": 0.1811, + "step": 695 + }, + { + "epoch": 0.3623112961998959, + "grad_norm": 0.3227714956193789, + "learning_rate": 4.8927837551352784e-05, + "loss": 0.1782, + "step": 696 + }, + { + "epoch": 0.36283185840707965, + "grad_norm": 0.3474440141200088, + "learning_rate": 4.8923763285805965e-05, + "loss": 0.1754, + "step": 697 + }, + { + "epoch": 0.3633524206142634, + "grad_norm": 0.3322694368519195, + "learning_rate": 4.89196814640214e-05, + "loss": 0.1774, + "step": 698 + }, + { + "epoch": 0.36387298282144714, + "grad_norm": 0.34820159808484086, + "learning_rate": 4.89155920872883e-05, + "loss": 0.177, + "step": 699 + }, + { + "epoch": 0.3643935450286309, + "grad_norm": 0.3031746341954232, + "learning_rate": 4.891149515689827e-05, + "loss": 0.1736, + "step": 700 + }, + { + "epoch": 0.3649141072358147, + "grad_norm": 0.36173867640457913, + "learning_rate": 4.8907390674145335e-05, + "loss": 0.1861, + "step": 701 + }, + { + "epoch": 0.36543466944299846, + "grad_norm": 0.30797603475460056, + "learning_rate": 4.890327864032587e-05, + "loss": 0.1766, + "step": 702 + }, + { + "epoch": 0.3659552316501822, + "grad_norm": 0.4370440494274063, + "learning_rate": 4.8899159056738646e-05, + "loss": 0.1791, + "step": 703 + }, + { + "epoch": 0.36647579385736595, + "grad_norm": 0.33055435550054857, + "learning_rate": 4.889503192468482e-05, + "loss": 0.1794, + "step": 704 + }, + { + "epoch": 0.3669963560645497, + "grad_norm": 0.3232689437227807, + "learning_rate": 4.8890897245467934e-05, + "loss": 0.1692, + "step": 705 + }, + { + "epoch": 0.3675169182717335, + "grad_norm": 0.32294570761168684, + "learning_rate": 4.8886755020393915e-05, + "loss": 0.1815, + "step": 706 + }, + { + "epoch": 0.3680374804789172, + "grad_norm": 0.34062024576611194, + "learning_rate": 4.888260525077106e-05, + "loss": 0.1702, + "step": 707 + }, + { + "epoch": 0.368558042686101, + "grad_norm": 0.30465620374884, + "learning_rate": 4.887844793791008e-05, + "loss": 0.1751, + "step": 708 + }, + { + "epoch": 0.36907860489328476, + "grad_norm": 0.3420067292282057, + "learning_rate": 4.887428308312402e-05, + "loss": 0.1787, + "step": 709 + }, + { + "epoch": 0.36959916710046853, + "grad_norm": 0.32157102458891257, + "learning_rate": 4.887011068772835e-05, + "loss": 0.1757, + "step": 710 + }, + { + "epoch": 0.37011972930765225, + "grad_norm": 0.3128376282076131, + "learning_rate": 4.886593075304091e-05, + "loss": 0.1743, + "step": 711 + }, + { + "epoch": 0.370640291514836, + "grad_norm": 0.3401322277124052, + "learning_rate": 4.886174328038191e-05, + "loss": 0.1777, + "step": 712 + }, + { + "epoch": 0.3711608537220198, + "grad_norm": 0.33579339844499495, + "learning_rate": 4.885754827107395e-05, + "loss": 0.1772, + "step": 713 + }, + { + "epoch": 0.37168141592920356, + "grad_norm": 0.3153726122024893, + "learning_rate": 4.885334572644202e-05, + "loss": 0.1789, + "step": 714 + }, + { + "epoch": 0.3722019781363873, + "grad_norm": 0.32118752532405626, + "learning_rate": 4.884913564781346e-05, + "loss": 0.1719, + "step": 715 + }, + { + "epoch": 0.37272254034357105, + "grad_norm": 0.3213935784665974, + "learning_rate": 4.884491803651803e-05, + "loss": 0.169, + "step": 716 + }, + { + "epoch": 0.3732431025507548, + "grad_norm": 0.33814274890520907, + "learning_rate": 4.884069289388783e-05, + "loss": 0.1781, + "step": 717 + }, + { + "epoch": 0.3737636647579386, + "grad_norm": 0.30359841482721706, + "learning_rate": 4.883646022125736e-05, + "loss": 0.1733, + "step": 718 + }, + { + "epoch": 0.3742842269651223, + "grad_norm": 0.30429885008494284, + "learning_rate": 4.8832220019963514e-05, + "loss": 0.1693, + "step": 719 + }, + { + "epoch": 0.3748047891723061, + "grad_norm": 0.331016348005072, + "learning_rate": 4.882797229134551e-05, + "loss": 0.1771, + "step": 720 + }, + { + "epoch": 0.37532535137948986, + "grad_norm": 0.3339222828334093, + "learning_rate": 4.882371703674501e-05, + "loss": 0.1782, + "step": 721 + }, + { + "epoch": 0.37584591358667363, + "grad_norm": 0.2991710011140988, + "learning_rate": 4.8819454257506015e-05, + "loss": 0.1672, + "step": 722 + }, + { + "epoch": 0.37636647579385735, + "grad_norm": 0.36396331898025386, + "learning_rate": 4.8815183954974896e-05, + "loss": 0.1715, + "step": 723 + }, + { + "epoch": 0.3768870380010411, + "grad_norm": 0.33416943073891897, + "learning_rate": 4.881090613050042e-05, + "loss": 0.1771, + "step": 724 + }, + { + "epoch": 0.3774076002082249, + "grad_norm": 0.36523650639258104, + "learning_rate": 4.8806620785433726e-05, + "loss": 0.164, + "step": 725 + }, + { + "epoch": 0.37792816241540866, + "grad_norm": 0.32073950251793465, + "learning_rate": 4.880232792112832e-05, + "loss": 0.171, + "step": 726 + }, + { + "epoch": 0.3784487246225924, + "grad_norm": 0.3484933060375919, + "learning_rate": 4.879802753894009e-05, + "loss": 0.1803, + "step": 727 + }, + { + "epoch": 0.37896928682977615, + "grad_norm": 0.33643384179749536, + "learning_rate": 4.879371964022731e-05, + "loss": 0.1762, + "step": 728 + }, + { + "epoch": 0.3794898490369599, + "grad_norm": 0.3844044688078634, + "learning_rate": 4.878940422635059e-05, + "loss": 0.1757, + "step": 729 + }, + { + "epoch": 0.3800104112441437, + "grad_norm": 0.31212059732422176, + "learning_rate": 4.878508129867296e-05, + "loss": 0.1744, + "step": 730 + }, + { + "epoch": 0.3805309734513274, + "grad_norm": 0.37952982057783435, + "learning_rate": 4.8780750858559794e-05, + "loss": 0.1873, + "step": 731 + }, + { + "epoch": 0.3810515356585112, + "grad_norm": 0.36193360687044024, + "learning_rate": 4.877641290737884e-05, + "loss": 0.1746, + "step": 732 + }, + { + "epoch": 0.38157209786569496, + "grad_norm": 0.3415108925924172, + "learning_rate": 4.8772067446500235e-05, + "loss": 0.176, + "step": 733 + }, + { + "epoch": 0.38209266007287873, + "grad_norm": 0.35739113924155885, + "learning_rate": 4.8767714477296475e-05, + "loss": 0.1773, + "step": 734 + }, + { + "epoch": 0.38261322228006245, + "grad_norm": 0.299683012812736, + "learning_rate": 4.8763354001142426e-05, + "loss": 0.1742, + "step": 735 + }, + { + "epoch": 0.3831337844872462, + "grad_norm": 0.3281156150608031, + "learning_rate": 4.875898601941533e-05, + "loss": 0.1773, + "step": 736 + }, + { + "epoch": 0.38365434669443, + "grad_norm": 0.29779263356766955, + "learning_rate": 4.875461053349481e-05, + "loss": 0.1754, + "step": 737 + }, + { + "epoch": 0.38417490890161377, + "grad_norm": 0.36739569269258815, + "learning_rate": 4.875022754476283e-05, + "loss": 0.1768, + "step": 738 + }, + { + "epoch": 0.3846954711087975, + "grad_norm": 0.3305403555222801, + "learning_rate": 4.8745837054603746e-05, + "loss": 0.1782, + "step": 739 + }, + { + "epoch": 0.38521603331598125, + "grad_norm": 0.3461843255219805, + "learning_rate": 4.87414390644043e-05, + "loss": 0.1761, + "step": 740 + }, + { + "epoch": 0.385736595523165, + "grad_norm": 0.34024915037224157, + "learning_rate": 4.8737033575553556e-05, + "loss": 0.1783, + "step": 741 + }, + { + "epoch": 0.3862571577303488, + "grad_norm": 0.36955073863687815, + "learning_rate": 4.873262058944299e-05, + "loss": 0.1801, + "step": 742 + }, + { + "epoch": 0.3867777199375325, + "grad_norm": 0.33810479957816336, + "learning_rate": 4.872820010746641e-05, + "loss": 0.1781, + "step": 743 + }, + { + "epoch": 0.3872982821447163, + "grad_norm": 0.31816896200979733, + "learning_rate": 4.872377213102003e-05, + "loss": 0.1701, + "step": 744 + }, + { + "epoch": 0.38781884435190006, + "grad_norm": 0.2994338543173188, + "learning_rate": 4.871933666150239e-05, + "loss": 0.1768, + "step": 745 + }, + { + "epoch": 0.38833940655908383, + "grad_norm": 0.3143457501312435, + "learning_rate": 4.8714893700314445e-05, + "loss": 0.1745, + "step": 746 + }, + { + "epoch": 0.38885996876626755, + "grad_norm": 0.3323503389224882, + "learning_rate": 4.8710443248859464e-05, + "loss": 0.1747, + "step": 747 + }, + { + "epoch": 0.3893805309734513, + "grad_norm": 0.3144112661589175, + "learning_rate": 4.870598530854312e-05, + "loss": 0.1722, + "step": 748 + }, + { + "epoch": 0.3899010931806351, + "grad_norm": 0.3299614127535381, + "learning_rate": 4.870151988077343e-05, + "loss": 0.177, + "step": 749 + }, + { + "epoch": 0.39042165538781887, + "grad_norm": 0.3303857038262982, + "learning_rate": 4.869704696696079e-05, + "loss": 0.1686, + "step": 750 + }, + { + "epoch": 0.3909422175950026, + "grad_norm": 0.30943696088141864, + "learning_rate": 4.869256656851795e-05, + "loss": 0.1748, + "step": 751 + }, + { + "epoch": 0.39146277980218636, + "grad_norm": 0.3347597384412751, + "learning_rate": 4.8688078686860025e-05, + "loss": 0.1848, + "step": 752 + }, + { + "epoch": 0.39198334200937013, + "grad_norm": 0.3357985035932315, + "learning_rate": 4.8683583323404514e-05, + "loss": 0.1722, + "step": 753 + }, + { + "epoch": 0.3925039042165539, + "grad_norm": 0.3229921181970955, + "learning_rate": 4.867908047957125e-05, + "loss": 0.1716, + "step": 754 + }, + { + "epoch": 0.3930244664237376, + "grad_norm": 0.4022384622310971, + "learning_rate": 4.867457015678244e-05, + "loss": 0.1786, + "step": 755 + }, + { + "epoch": 0.3935450286309214, + "grad_norm": 0.3049771099599678, + "learning_rate": 4.867005235646265e-05, + "loss": 0.1793, + "step": 756 + }, + { + "epoch": 0.39406559083810516, + "grad_norm": 0.4173130163682277, + "learning_rate": 4.866552708003882e-05, + "loss": 0.1778, + "step": 757 + }, + { + "epoch": 0.39458615304528893, + "grad_norm": 0.3313304796655795, + "learning_rate": 4.8660994328940235e-05, + "loss": 0.179, + "step": 758 + }, + { + "epoch": 0.39510671525247265, + "grad_norm": 0.3218402610994695, + "learning_rate": 4.865645410459856e-05, + "loss": 0.1628, + "step": 759 + }, + { + "epoch": 0.3956272774596564, + "grad_norm": 0.32361763518417075, + "learning_rate": 4.8651906408447795e-05, + "loss": 0.1749, + "step": 760 + }, + { + "epoch": 0.3961478396668402, + "grad_norm": 0.33838851288907523, + "learning_rate": 4.864735124192432e-05, + "loss": 0.1731, + "step": 761 + }, + { + "epoch": 0.39666840187402397, + "grad_norm": 0.30076334190054943, + "learning_rate": 4.8642788606466884e-05, + "loss": 0.1697, + "step": 762 + }, + { + "epoch": 0.3971889640812077, + "grad_norm": 0.2972937365361002, + "learning_rate": 4.8638218503516554e-05, + "loss": 0.1732, + "step": 763 + }, + { + "epoch": 0.39770952628839146, + "grad_norm": 0.2930332941508139, + "learning_rate": 4.863364093451679e-05, + "loss": 0.1767, + "step": 764 + }, + { + "epoch": 0.39823008849557523, + "grad_norm": 0.29879453433882425, + "learning_rate": 4.862905590091341e-05, + "loss": 0.1649, + "step": 765 + }, + { + "epoch": 0.398750650702759, + "grad_norm": 0.30522662679820955, + "learning_rate": 4.8624463404154575e-05, + "loss": 0.1779, + "step": 766 + }, + { + "epoch": 0.3992712129099427, + "grad_norm": 0.29429201663178467, + "learning_rate": 4.8619863445690804e-05, + "loss": 0.1742, + "step": 767 + }, + { + "epoch": 0.3997917751171265, + "grad_norm": 0.2828872044198585, + "learning_rate": 4.861525602697499e-05, + "loss": 0.169, + "step": 768 + }, + { + "epoch": 0.40031233732431026, + "grad_norm": 0.31056313318997847, + "learning_rate": 4.861064114946236e-05, + "loss": 0.1728, + "step": 769 + }, + { + "epoch": 0.40083289953149404, + "grad_norm": 0.28343310391927706, + "learning_rate": 4.860601881461051e-05, + "loss": 0.171, + "step": 770 + }, + { + "epoch": 0.40135346173867775, + "grad_norm": 0.31511791024068336, + "learning_rate": 4.86013890238794e-05, + "loss": 0.1754, + "step": 771 + }, + { + "epoch": 0.4018740239458615, + "grad_norm": 0.3106130269067802, + "learning_rate": 4.85967517787313e-05, + "loss": 0.1755, + "step": 772 + }, + { + "epoch": 0.4023945861530453, + "grad_norm": 0.3216761568260499, + "learning_rate": 4.859210708063091e-05, + "loss": 0.1709, + "step": 773 + }, + { + "epoch": 0.40291514836022907, + "grad_norm": 0.3169473466081924, + "learning_rate": 4.858745493104521e-05, + "loss": 0.1677, + "step": 774 + }, + { + "epoch": 0.4034357105674128, + "grad_norm": 0.2858364547750457, + "learning_rate": 4.858279533144358e-05, + "loss": 0.173, + "step": 775 + }, + { + "epoch": 0.40395627277459656, + "grad_norm": 0.31534006666184045, + "learning_rate": 4.8578128283297736e-05, + "loss": 0.1701, + "step": 776 + }, + { + "epoch": 0.40447683498178033, + "grad_norm": 0.2978277533533381, + "learning_rate": 4.857345378808175e-05, + "loss": 0.1642, + "step": 777 + }, + { + "epoch": 0.4049973971889641, + "grad_norm": 0.30248488777067317, + "learning_rate": 4.856877184727204e-05, + "loss": 0.1763, + "step": 778 + }, + { + "epoch": 0.4055179593961478, + "grad_norm": 0.31081161780013244, + "learning_rate": 4.856408246234739e-05, + "loss": 0.1672, + "step": 779 + }, + { + "epoch": 0.4060385216033316, + "grad_norm": 0.33291363782085454, + "learning_rate": 4.855938563478892e-05, + "loss": 0.1773, + "step": 780 + }, + { + "epoch": 0.40655908381051536, + "grad_norm": 0.3108957897270069, + "learning_rate": 4.8554681366080104e-05, + "loss": 0.1693, + "step": 781 + }, + { + "epoch": 0.40707964601769914, + "grad_norm": 0.32908383574112354, + "learning_rate": 4.8549969657706775e-05, + "loss": 0.1708, + "step": 782 + }, + { + "epoch": 0.40760020822488285, + "grad_norm": 0.3107460244869553, + "learning_rate": 4.85452505111571e-05, + "loss": 0.1683, + "step": 783 + }, + { + "epoch": 0.4081207704320666, + "grad_norm": 0.31853260828349084, + "learning_rate": 4.8540523927921616e-05, + "loss": 0.1687, + "step": 784 + }, + { + "epoch": 0.4086413326392504, + "grad_norm": 0.3316073002641256, + "learning_rate": 4.85357899094932e-05, + "loss": 0.1684, + "step": 785 + }, + { + "epoch": 0.40916189484643417, + "grad_norm": 0.3027467331091613, + "learning_rate": 4.853104845736706e-05, + "loss": 0.1715, + "step": 786 + }, + { + "epoch": 0.4096824570536179, + "grad_norm": 0.3278257429615449, + "learning_rate": 4.852629957304078e-05, + "loss": 0.1773, + "step": 787 + }, + { + "epoch": 0.41020301926080166, + "grad_norm": 0.291153268830071, + "learning_rate": 4.8521543258014276e-05, + "loss": 0.1681, + "step": 788 + }, + { + "epoch": 0.41072358146798543, + "grad_norm": 0.3559444910615816, + "learning_rate": 4.8516779513789815e-05, + "loss": 0.1807, + "step": 789 + }, + { + "epoch": 0.4112441436751692, + "grad_norm": 0.3085670500497958, + "learning_rate": 4.8512008341872e-05, + "loss": 0.1719, + "step": 790 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 0.35264383435574276, + "learning_rate": 4.85072297437678e-05, + "loss": 0.1794, + "step": 791 + }, + { + "epoch": 0.4122852680895367, + "grad_norm": 0.3356960894753313, + "learning_rate": 4.850244372098651e-05, + "loss": 0.1769, + "step": 792 + }, + { + "epoch": 0.41280583029672047, + "grad_norm": 0.32740161142053137, + "learning_rate": 4.8497650275039795e-05, + "loss": 0.1783, + "step": 793 + }, + { + "epoch": 0.41332639250390424, + "grad_norm": 0.3351198703316088, + "learning_rate": 4.849284940744163e-05, + "loss": 0.1733, + "step": 794 + }, + { + "epoch": 0.41384695471108796, + "grad_norm": 0.312070876021951, + "learning_rate": 4.848804111970836e-05, + "loss": 0.1866, + "step": 795 + }, + { + "epoch": 0.4143675169182717, + "grad_norm": 0.3282848492692099, + "learning_rate": 4.8483225413358663e-05, + "loss": 0.1789, + "step": 796 + }, + { + "epoch": 0.4148880791254555, + "grad_norm": 0.286266024247938, + "learning_rate": 4.8478402289913566e-05, + "loss": 0.1698, + "step": 797 + }, + { + "epoch": 0.4154086413326393, + "grad_norm": 0.33058577040283377, + "learning_rate": 4.847357175089643e-05, + "loss": 0.1684, + "step": 798 + }, + { + "epoch": 0.415929203539823, + "grad_norm": 0.2934521521206386, + "learning_rate": 4.846873379783298e-05, + "loss": 0.1753, + "step": 799 + }, + { + "epoch": 0.41644976574700676, + "grad_norm": 0.31483778450446187, + "learning_rate": 4.846388843225125e-05, + "loss": 0.1665, + "step": 800 + }, + { + "epoch": 0.41697032795419053, + "grad_norm": 0.30494366672931933, + "learning_rate": 4.845903565568164e-05, + "loss": 0.1791, + "step": 801 + }, + { + "epoch": 0.4174908901613743, + "grad_norm": 0.2737424192822608, + "learning_rate": 4.845417546965688e-05, + "loss": 0.1689, + "step": 802 + }, + { + "epoch": 0.418011452368558, + "grad_norm": 0.3176283439520924, + "learning_rate": 4.844930787571204e-05, + "loss": 0.1728, + "step": 803 + }, + { + "epoch": 0.4185320145757418, + "grad_norm": 0.3013277659758763, + "learning_rate": 4.844443287538454e-05, + "loss": 0.1793, + "step": 804 + }, + { + "epoch": 0.41905257678292557, + "grad_norm": 0.31841896666187924, + "learning_rate": 4.8439550470214124e-05, + "loss": 0.1725, + "step": 805 + }, + { + "epoch": 0.41957313899010934, + "grad_norm": 0.32537384441510847, + "learning_rate": 4.8434660661742894e-05, + "loss": 0.1726, + "step": 806 + }, + { + "epoch": 0.42009370119729306, + "grad_norm": 0.2918667124673871, + "learning_rate": 4.8429763451515263e-05, + "loss": 0.1713, + "step": 807 + }, + { + "epoch": 0.42061426340447683, + "grad_norm": 0.3068100516789931, + "learning_rate": 4.842485884107801e-05, + "loss": 0.177, + "step": 808 + }, + { + "epoch": 0.4211348256116606, + "grad_norm": 0.32460789829684644, + "learning_rate": 4.8419946831980236e-05, + "loss": 0.1742, + "step": 809 + }, + { + "epoch": 0.4216553878188444, + "grad_norm": 0.2946290268979509, + "learning_rate": 4.8415027425773386e-05, + "loss": 0.1681, + "step": 810 + }, + { + "epoch": 0.4221759500260281, + "grad_norm": 0.3258905754569639, + "learning_rate": 4.841010062401123e-05, + "loss": 0.1741, + "step": 811 + }, + { + "epoch": 0.42269651223321186, + "grad_norm": 0.29790984995596925, + "learning_rate": 4.840516642824988e-05, + "loss": 0.1694, + "step": 812 + }, + { + "epoch": 0.42321707444039564, + "grad_norm": 0.31430235551801683, + "learning_rate": 4.8400224840047795e-05, + "loss": 0.1692, + "step": 813 + }, + { + "epoch": 0.4237376366475794, + "grad_norm": 0.28497704460198, + "learning_rate": 4.839527586096575e-05, + "loss": 0.1692, + "step": 814 + }, + { + "epoch": 0.4242581988547631, + "grad_norm": 0.3251548281082162, + "learning_rate": 4.839031949256687e-05, + "loss": 0.1786, + "step": 815 + }, + { + "epoch": 0.4247787610619469, + "grad_norm": 0.3037172282531429, + "learning_rate": 4.838535573641661e-05, + "loss": 0.1784, + "step": 816 + }, + { + "epoch": 0.42529932326913067, + "grad_norm": 0.2921332397976667, + "learning_rate": 4.838038459408273e-05, + "loss": 0.1627, + "step": 817 + }, + { + "epoch": 0.42581988547631444, + "grad_norm": 0.2871659618387349, + "learning_rate": 4.837540606713538e-05, + "loss": 0.1669, + "step": 818 + }, + { + "epoch": 0.42634044768349816, + "grad_norm": 0.3022037954071766, + "learning_rate": 4.837042015714698e-05, + "loss": 0.1737, + "step": 819 + }, + { + "epoch": 0.42686100989068193, + "grad_norm": 0.31565270517728566, + "learning_rate": 4.8365426865692345e-05, + "loss": 0.1787, + "step": 820 + }, + { + "epoch": 0.4273815720978657, + "grad_norm": 0.32793352347608645, + "learning_rate": 4.836042619434856e-05, + "loss": 0.1748, + "step": 821 + }, + { + "epoch": 0.4279021343050495, + "grad_norm": 0.317161732106535, + "learning_rate": 4.835541814469509e-05, + "loss": 0.1782, + "step": 822 + }, + { + "epoch": 0.4284226965122332, + "grad_norm": 0.3049128671196971, + "learning_rate": 4.83504027183137e-05, + "loss": 0.1816, + "step": 823 + }, + { + "epoch": 0.42894325871941696, + "grad_norm": 0.28919765868015046, + "learning_rate": 4.8345379916788505e-05, + "loss": 0.1672, + "step": 824 + }, + { + "epoch": 0.42946382092660074, + "grad_norm": 0.33488488955581175, + "learning_rate": 4.834034974170592e-05, + "loss": 0.1688, + "step": 825 + }, + { + "epoch": 0.4299843831337845, + "grad_norm": 0.31612472215220894, + "learning_rate": 4.833531219465473e-05, + "loss": 0.1639, + "step": 826 + }, + { + "epoch": 0.4305049453409682, + "grad_norm": 0.30731831334383447, + "learning_rate": 4.8330267277226006e-05, + "loss": 0.1712, + "step": 827 + }, + { + "epoch": 0.431025507548152, + "grad_norm": 0.31217223215929757, + "learning_rate": 4.832521499101319e-05, + "loss": 0.176, + "step": 828 + }, + { + "epoch": 0.43154606975533577, + "grad_norm": 0.2795778724288852, + "learning_rate": 4.8320155337612014e-05, + "loss": 0.1703, + "step": 829 + }, + { + "epoch": 0.43206663196251954, + "grad_norm": 0.2970259430025653, + "learning_rate": 4.831508831862055e-05, + "loss": 0.1693, + "step": 830 + }, + { + "epoch": 0.43258719416970326, + "grad_norm": 0.31169918235532607, + "learning_rate": 4.8310013935639206e-05, + "loss": 0.1758, + "step": 831 + }, + { + "epoch": 0.43310775637688703, + "grad_norm": 0.3282580950489392, + "learning_rate": 4.830493219027071e-05, + "loss": 0.1701, + "step": 832 + }, + { + "epoch": 0.4336283185840708, + "grad_norm": 0.3035542216950865, + "learning_rate": 4.829984308412011e-05, + "loss": 0.1759, + "step": 833 + }, + { + "epoch": 0.4341488807912546, + "grad_norm": 0.3170042392985621, + "learning_rate": 4.8294746618794786e-05, + "loss": 0.1714, + "step": 834 + }, + { + "epoch": 0.4346694429984383, + "grad_norm": 0.30492935178098757, + "learning_rate": 4.8289642795904433e-05, + "loss": 0.1769, + "step": 835 + }, + { + "epoch": 0.43519000520562207, + "grad_norm": 0.3023169478849347, + "learning_rate": 4.828453161706108e-05, + "loss": 0.1684, + "step": 836 + }, + { + "epoch": 0.43571056741280584, + "grad_norm": 0.2934309140121328, + "learning_rate": 4.8279413083879063e-05, + "loss": 0.1726, + "step": 837 + }, + { + "epoch": 0.4362311296199896, + "grad_norm": 0.33964067234560835, + "learning_rate": 4.827428719797508e-05, + "loss": 0.1811, + "step": 838 + }, + { + "epoch": 0.4367516918271733, + "grad_norm": 0.3239469204735507, + "learning_rate": 4.8269153960968094e-05, + "loss": 0.1801, + "step": 839 + }, + { + "epoch": 0.4372722540343571, + "grad_norm": 0.34044634387274025, + "learning_rate": 4.8264013374479446e-05, + "loss": 0.1734, + "step": 840 + }, + { + "epoch": 0.43779281624154087, + "grad_norm": 0.3143167652108177, + "learning_rate": 4.825886544013275e-05, + "loss": 0.1742, + "step": 841 + }, + { + "epoch": 0.43831337844872464, + "grad_norm": 0.2944577153853259, + "learning_rate": 4.825371015955398e-05, + "loss": 0.1653, + "step": 842 + }, + { + "epoch": 0.43883394065590836, + "grad_norm": 0.3052896712979636, + "learning_rate": 4.82485475343714e-05, + "loss": 0.1704, + "step": 843 + }, + { + "epoch": 0.43935450286309213, + "grad_norm": 0.3177119837657539, + "learning_rate": 4.8243377566215616e-05, + "loss": 0.1719, + "step": 844 + }, + { + "epoch": 0.4398750650702759, + "grad_norm": 0.30859779939404325, + "learning_rate": 4.8238200256719554e-05, + "loss": 0.1779, + "step": 845 + }, + { + "epoch": 0.4403956272774597, + "grad_norm": 0.3226603688721493, + "learning_rate": 4.823301560751843e-05, + "loss": 0.1686, + "step": 846 + }, + { + "epoch": 0.4409161894846434, + "grad_norm": 0.29317543800324647, + "learning_rate": 4.82278236202498e-05, + "loss": 0.1673, + "step": 847 + }, + { + "epoch": 0.44143675169182717, + "grad_norm": 0.31460096487006833, + "learning_rate": 4.8222624296553554e-05, + "loss": 0.1753, + "step": 848 + }, + { + "epoch": 0.44195731389901094, + "grad_norm": 0.2919013066978429, + "learning_rate": 4.821741763807186e-05, + "loss": 0.1694, + "step": 849 + }, + { + "epoch": 0.4424778761061947, + "grad_norm": 0.33306663050219115, + "learning_rate": 4.821220364644923e-05, + "loss": 0.1725, + "step": 850 + }, + { + "epoch": 0.44299843831337843, + "grad_norm": 0.32508594947874425, + "learning_rate": 4.8206982323332485e-05, + "loss": 0.1703, + "step": 851 + }, + { + "epoch": 0.4435190005205622, + "grad_norm": 0.312739185199514, + "learning_rate": 4.820175367037076e-05, + "loss": 0.1747, + "step": 852 + }, + { + "epoch": 0.444039562727746, + "grad_norm": 0.3139934324616478, + "learning_rate": 4.8196517689215515e-05, + "loss": 0.1697, + "step": 853 + }, + { + "epoch": 0.44456012493492975, + "grad_norm": 0.28334748849023433, + "learning_rate": 4.8191274381520515e-05, + "loss": 0.1701, + "step": 854 + }, + { + "epoch": 0.44508068714211346, + "grad_norm": 0.31522836280773053, + "learning_rate": 4.818602374894182e-05, + "loss": 0.1724, + "step": 855 + }, + { + "epoch": 0.44560124934929723, + "grad_norm": 0.28857799909498244, + "learning_rate": 4.8180765793137856e-05, + "loss": 0.1748, + "step": 856 + }, + { + "epoch": 0.446121811556481, + "grad_norm": 0.2972500213315476, + "learning_rate": 4.817550051576931e-05, + "loss": 0.1653, + "step": 857 + }, + { + "epoch": 0.4466423737636648, + "grad_norm": 0.29255400412368143, + "learning_rate": 4.81702279184992e-05, + "loss": 0.1615, + "step": 858 + }, + { + "epoch": 0.4471629359708485, + "grad_norm": 0.2886655749254805, + "learning_rate": 4.8164948002992874e-05, + "loss": 0.1634, + "step": 859 + }, + { + "epoch": 0.44768349817803227, + "grad_norm": 0.31437191386746677, + "learning_rate": 4.815966077091796e-05, + "loss": 0.1749, + "step": 860 + }, + { + "epoch": 0.44820406038521604, + "grad_norm": 0.29304891186896304, + "learning_rate": 4.815436622394441e-05, + "loss": 0.1708, + "step": 861 + }, + { + "epoch": 0.4487246225923998, + "grad_norm": 0.31745839082325317, + "learning_rate": 4.814906436374451e-05, + "loss": 0.1686, + "step": 862 + }, + { + "epoch": 0.44924518479958353, + "grad_norm": 0.3046376488171113, + "learning_rate": 4.814375519199281e-05, + "loss": 0.1654, + "step": 863 + }, + { + "epoch": 0.4497657470067673, + "grad_norm": 0.30408750750055136, + "learning_rate": 4.8138438710366204e-05, + "loss": 0.1692, + "step": 864 + }, + { + "epoch": 0.4502863092139511, + "grad_norm": 0.2900405154788453, + "learning_rate": 4.813311492054388e-05, + "loss": 0.1719, + "step": 865 + }, + { + "epoch": 0.45080687142113485, + "grad_norm": 0.307651302837432, + "learning_rate": 4.8127783824207344e-05, + "loss": 0.1768, + "step": 866 + }, + { + "epoch": 0.45132743362831856, + "grad_norm": 0.3003033631697678, + "learning_rate": 4.812244542304041e-05, + "loss": 0.1687, + "step": 867 + }, + { + "epoch": 0.45184799583550234, + "grad_norm": 0.3226224029065988, + "learning_rate": 4.811709971872918e-05, + "loss": 0.1737, + "step": 868 + }, + { + "epoch": 0.4523685580426861, + "grad_norm": 0.3431087957838699, + "learning_rate": 4.8111746712962066e-05, + "loss": 0.1792, + "step": 869 + }, + { + "epoch": 0.4528891202498699, + "grad_norm": 0.3241044954870311, + "learning_rate": 4.810638640742983e-05, + "loss": 0.1747, + "step": 870 + }, + { + "epoch": 0.4534096824570536, + "grad_norm": 0.29640273607618955, + "learning_rate": 4.810101880382548e-05, + "loss": 0.165, + "step": 871 + }, + { + "epoch": 0.45393024466423737, + "grad_norm": 0.2992144530417321, + "learning_rate": 4.809564390384437e-05, + "loss": 0.1734, + "step": 872 + }, + { + "epoch": 0.45445080687142114, + "grad_norm": 0.33183641077540893, + "learning_rate": 4.809026170918414e-05, + "loss": 0.1737, + "step": 873 + }, + { + "epoch": 0.4549713690786049, + "grad_norm": 0.28653542372832375, + "learning_rate": 4.808487222154472e-05, + "loss": 0.1634, + "step": 874 + }, + { + "epoch": 0.45549193128578863, + "grad_norm": 0.32334785309979097, + "learning_rate": 4.807947544262839e-05, + "loss": 0.1733, + "step": 875 + }, + { + "epoch": 0.4560124934929724, + "grad_norm": 0.3030305396087403, + "learning_rate": 4.807407137413967e-05, + "loss": 0.1658, + "step": 876 + }, + { + "epoch": 0.4565330557001562, + "grad_norm": 0.296769998908498, + "learning_rate": 4.806866001778545e-05, + "loss": 0.1767, + "step": 877 + }, + { + "epoch": 0.45705361790733995, + "grad_norm": 0.31049567683022117, + "learning_rate": 4.806324137527487e-05, + "loss": 0.1797, + "step": 878 + }, + { + "epoch": 0.45757418011452367, + "grad_norm": 0.30534013515879393, + "learning_rate": 4.8057815448319394e-05, + "loss": 0.1691, + "step": 879 + }, + { + "epoch": 0.45809474232170744, + "grad_norm": 0.31938583186414876, + "learning_rate": 4.8052382238632774e-05, + "loss": 0.1729, + "step": 880 + }, + { + "epoch": 0.4586153045288912, + "grad_norm": 0.3046756327864025, + "learning_rate": 4.804694174793108e-05, + "loss": 0.1623, + "step": 881 + }, + { + "epoch": 0.459135866736075, + "grad_norm": 0.30819124791549257, + "learning_rate": 4.8041493977932685e-05, + "loss": 0.1739, + "step": 882 + }, + { + "epoch": 0.4596564289432587, + "grad_norm": 0.29113272544105995, + "learning_rate": 4.803603893035822e-05, + "loss": 0.1635, + "step": 883 + }, + { + "epoch": 0.46017699115044247, + "grad_norm": 0.2908354554872441, + "learning_rate": 4.803057660693065e-05, + "loss": 0.1642, + "step": 884 + }, + { + "epoch": 0.46069755335762624, + "grad_norm": 0.3228709959167957, + "learning_rate": 4.8025107009375246e-05, + "loss": 0.1687, + "step": 885 + }, + { + "epoch": 0.46121811556481, + "grad_norm": 0.2875173089594343, + "learning_rate": 4.8019630139419555e-05, + "loss": 0.1779, + "step": 886 + }, + { + "epoch": 0.46173867777199373, + "grad_norm": 0.33584341705384174, + "learning_rate": 4.8014145998793416e-05, + "loss": 0.1733, + "step": 887 + }, + { + "epoch": 0.4622592399791775, + "grad_norm": 0.2896189188284826, + "learning_rate": 4.8008654589228984e-05, + "loss": 0.1684, + "step": 888 + }, + { + "epoch": 0.4627798021863613, + "grad_norm": 0.3260162748091208, + "learning_rate": 4.800315591246071e-05, + "loss": 0.1679, + "step": 889 + }, + { + "epoch": 0.46330036439354505, + "grad_norm": 0.28906298710463635, + "learning_rate": 4.799764997022532e-05, + "loss": 0.1748, + "step": 890 + }, + { + "epoch": 0.46382092660072877, + "grad_norm": 0.28621158344042275, + "learning_rate": 4.799213676426185e-05, + "loss": 0.1745, + "step": 891 + }, + { + "epoch": 0.46434148880791254, + "grad_norm": 0.309418801701727, + "learning_rate": 4.798661629631163e-05, + "loss": 0.168, + "step": 892 + }, + { + "epoch": 0.4648620510150963, + "grad_norm": 0.2872933837593369, + "learning_rate": 4.798108856811828e-05, + "loss": 0.1695, + "step": 893 + }, + { + "epoch": 0.4653826132222801, + "grad_norm": 0.31032860424563535, + "learning_rate": 4.7975553581427715e-05, + "loss": 0.164, + "step": 894 + }, + { + "epoch": 0.4659031754294638, + "grad_norm": 0.2944281461376994, + "learning_rate": 4.797001133798813e-05, + "loss": 0.1715, + "step": 895 + }, + { + "epoch": 0.4664237376366476, + "grad_norm": 0.3185115115776022, + "learning_rate": 4.796446183955003e-05, + "loss": 0.1664, + "step": 896 + }, + { + "epoch": 0.46694429984383135, + "grad_norm": 0.2898044563512308, + "learning_rate": 4.795890508786622e-05, + "loss": 0.1681, + "step": 897 + }, + { + "epoch": 0.4674648620510151, + "grad_norm": 0.30483790574973163, + "learning_rate": 4.795334108469176e-05, + "loss": 0.1687, + "step": 898 + }, + { + "epoch": 0.46798542425819883, + "grad_norm": 0.30599381056013947, + "learning_rate": 4.794776983178403e-05, + "loss": 0.1711, + "step": 899 + }, + { + "epoch": 0.4685059864653826, + "grad_norm": 0.3093671697842366, + "learning_rate": 4.794219133090269e-05, + "loss": 0.1696, + "step": 900 + }, + { + "epoch": 0.4690265486725664, + "grad_norm": 0.3000416412774675, + "learning_rate": 4.793660558380969e-05, + "loss": 0.1748, + "step": 901 + }, + { + "epoch": 0.46954711087975015, + "grad_norm": 0.3242303350202561, + "learning_rate": 4.793101259226927e-05, + "loss": 0.1775, + "step": 902 + }, + { + "epoch": 0.47006767308693387, + "grad_norm": 0.31647313130174715, + "learning_rate": 4.792541235804796e-05, + "loss": 0.167, + "step": 903 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.32293924392398315, + "learning_rate": 4.791980488291456e-05, + "loss": 0.1763, + "step": 904 + }, + { + "epoch": 0.4711087975013014, + "grad_norm": 0.29224165341381086, + "learning_rate": 4.7914190168640196e-05, + "loss": 0.1678, + "step": 905 + }, + { + "epoch": 0.4716293597084852, + "grad_norm": 0.30657161857494225, + "learning_rate": 4.790856821699823e-05, + "loss": 0.1739, + "step": 906 + }, + { + "epoch": 0.4721499219156689, + "grad_norm": 0.29572827932510726, + "learning_rate": 4.790293902976435e-05, + "loss": 0.1635, + "step": 907 + }, + { + "epoch": 0.4726704841228527, + "grad_norm": 0.3140759869076264, + "learning_rate": 4.789730260871651e-05, + "loss": 0.169, + "step": 908 + }, + { + "epoch": 0.47319104633003645, + "grad_norm": 0.30710825268098163, + "learning_rate": 4.7891658955634964e-05, + "loss": 0.1658, + "step": 909 + }, + { + "epoch": 0.4737116085372202, + "grad_norm": 0.31793790159265933, + "learning_rate": 4.7886008072302235e-05, + "loss": 0.1753, + "step": 910 + }, + { + "epoch": 0.47423217074440394, + "grad_norm": 0.3014852750545304, + "learning_rate": 4.788034996050314e-05, + "loss": 0.1709, + "step": 911 + }, + { + "epoch": 0.4747527329515877, + "grad_norm": 0.30747571574129023, + "learning_rate": 4.787468462202476e-05, + "loss": 0.1612, + "step": 912 + }, + { + "epoch": 0.4752732951587715, + "grad_norm": 0.3065142379561827, + "learning_rate": 4.786901205865647e-05, + "loss": 0.1621, + "step": 913 + }, + { + "epoch": 0.47579385736595525, + "grad_norm": 0.3017055320367006, + "learning_rate": 4.786333227218995e-05, + "loss": 0.1759, + "step": 914 + }, + { + "epoch": 0.47631441957313897, + "grad_norm": 0.3063810493732478, + "learning_rate": 4.785764526441913e-05, + "loss": 0.1766, + "step": 915 + }, + { + "epoch": 0.47683498178032274, + "grad_norm": 0.3139212066831126, + "learning_rate": 4.7851951037140234e-05, + "loss": 0.1686, + "step": 916 + }, + { + "epoch": 0.4773555439875065, + "grad_norm": 0.2976317597214064, + "learning_rate": 4.784624959215176e-05, + "loss": 0.169, + "step": 917 + }, + { + "epoch": 0.4778761061946903, + "grad_norm": 0.28710602424260284, + "learning_rate": 4.78405409312545e-05, + "loss": 0.1653, + "step": 918 + }, + { + "epoch": 0.478396668401874, + "grad_norm": 0.30826508978927336, + "learning_rate": 4.783482505625149e-05, + "loss": 0.1716, + "step": 919 + }, + { + "epoch": 0.4789172306090578, + "grad_norm": 0.31724938862767077, + "learning_rate": 4.7829101968948095e-05, + "loss": 0.1682, + "step": 920 + }, + { + "epoch": 0.47943779281624155, + "grad_norm": 0.30073475133813604, + "learning_rate": 4.782337167115193e-05, + "loss": 0.1666, + "step": 921 + }, + { + "epoch": 0.4799583550234253, + "grad_norm": 0.2941398432084064, + "learning_rate": 4.7817634164672875e-05, + "loss": 0.1654, + "step": 922 + }, + { + "epoch": 0.48047891723060904, + "grad_norm": 0.30183359482377575, + "learning_rate": 4.7811889451323114e-05, + "loss": 0.1704, + "step": 923 + }, + { + "epoch": 0.4809994794377928, + "grad_norm": 0.27257120459980766, + "learning_rate": 4.7806137532917085e-05, + "loss": 0.1588, + "step": 924 + }, + { + "epoch": 0.4815200416449766, + "grad_norm": 0.3017092170804171, + "learning_rate": 4.780037841127152e-05, + "loss": 0.1632, + "step": 925 + }, + { + "epoch": 0.48204060385216035, + "grad_norm": 0.31115577416098184, + "learning_rate": 4.779461208820541e-05, + "loss": 0.1723, + "step": 926 + }, + { + "epoch": 0.48256116605934407, + "grad_norm": 0.294393697239707, + "learning_rate": 4.778883856554004e-05, + "loss": 0.1709, + "step": 927 + }, + { + "epoch": 0.48308172826652784, + "grad_norm": 0.31179094440237864, + "learning_rate": 4.778305784509894e-05, + "loss": 0.1837, + "step": 928 + }, + { + "epoch": 0.4836022904737116, + "grad_norm": 0.29561645952616644, + "learning_rate": 4.7777269928707946e-05, + "loss": 0.1679, + "step": 929 + }, + { + "epoch": 0.4841228526808954, + "grad_norm": 0.31643861362801334, + "learning_rate": 4.777147481819515e-05, + "loss": 0.1652, + "step": 930 + }, + { + "epoch": 0.4846434148880791, + "grad_norm": 0.2990057285412248, + "learning_rate": 4.776567251539091e-05, + "loss": 0.1652, + "step": 931 + }, + { + "epoch": 0.4851639770952629, + "grad_norm": 0.31236323289513435, + "learning_rate": 4.7759863022127864e-05, + "loss": 0.1704, + "step": 932 + }, + { + "epoch": 0.48568453930244665, + "grad_norm": 0.3453944960772348, + "learning_rate": 4.775404634024093e-05, + "loss": 0.1748, + "step": 933 + }, + { + "epoch": 0.4862051015096304, + "grad_norm": 0.3629879836622649, + "learning_rate": 4.7748222471567275e-05, + "loss": 0.177, + "step": 934 + }, + { + "epoch": 0.48672566371681414, + "grad_norm": 0.3120022773445736, + "learning_rate": 4.7742391417946345e-05, + "loss": 0.1741, + "step": 935 + }, + { + "epoch": 0.4872462259239979, + "grad_norm": 0.3335724998214946, + "learning_rate": 4.773655318121987e-05, + "loss": 0.1748, + "step": 936 + }, + { + "epoch": 0.4877667881311817, + "grad_norm": 0.34481706739223045, + "learning_rate": 4.7730707763231844e-05, + "loss": 0.1764, + "step": 937 + }, + { + "epoch": 0.48828735033836546, + "grad_norm": 0.32529182532452, + "learning_rate": 4.7724855165828497e-05, + "loss": 0.1689, + "step": 938 + }, + { + "epoch": 0.4888079125455492, + "grad_norm": 0.32960723240412076, + "learning_rate": 4.771899539085837e-05, + "loss": 0.1706, + "step": 939 + }, + { + "epoch": 0.48932847475273294, + "grad_norm": 0.331766989087599, + "learning_rate": 4.7713128440172244e-05, + "loss": 0.1691, + "step": 940 + }, + { + "epoch": 0.4898490369599167, + "grad_norm": 0.32802357802208776, + "learning_rate": 4.770725431562318e-05, + "loss": 0.1753, + "step": 941 + }, + { + "epoch": 0.4903695991671005, + "grad_norm": 0.30860493564486086, + "learning_rate": 4.77013730190665e-05, + "loss": 0.1728, + "step": 942 + }, + { + "epoch": 0.4908901613742842, + "grad_norm": 0.3022595357659953, + "learning_rate": 4.7695484552359794e-05, + "loss": 0.1722, + "step": 943 + }, + { + "epoch": 0.491410723581468, + "grad_norm": 0.2934783342140039, + "learning_rate": 4.7689588917362905e-05, + "loss": 0.1608, + "step": 944 + }, + { + "epoch": 0.49193128578865175, + "grad_norm": 0.28487899547455736, + "learning_rate": 4.768368611593795e-05, + "loss": 0.167, + "step": 945 + }, + { + "epoch": 0.4924518479958355, + "grad_norm": 0.2990203794907809, + "learning_rate": 4.7677776149949315e-05, + "loss": 0.1651, + "step": 946 + }, + { + "epoch": 0.49297241020301924, + "grad_norm": 0.32061252229163467, + "learning_rate": 4.767185902126364e-05, + "loss": 0.1626, + "step": 947 + }, + { + "epoch": 0.493492972410203, + "grad_norm": 0.2807651296414107, + "learning_rate": 4.7665934731749825e-05, + "loss": 0.1668, + "step": 948 + }, + { + "epoch": 0.4940135346173868, + "grad_norm": 0.2967222786779605, + "learning_rate": 4.7660003283279045e-05, + "loss": 0.1721, + "step": 949 + }, + { + "epoch": 0.49453409682457056, + "grad_norm": 0.29814117868358586, + "learning_rate": 4.765406467772472e-05, + "loss": 0.1778, + "step": 950 + }, + { + "epoch": 0.4950546590317543, + "grad_norm": 0.2788975714416808, + "learning_rate": 4.7648118916962535e-05, + "loss": 0.1681, + "step": 951 + }, + { + "epoch": 0.49557522123893805, + "grad_norm": 0.28418700831999905, + "learning_rate": 4.7642166002870455e-05, + "loss": 0.1727, + "step": 952 + }, + { + "epoch": 0.4960957834461218, + "grad_norm": 0.29906644459887316, + "learning_rate": 4.763620593732867e-05, + "loss": 0.1623, + "step": 953 + }, + { + "epoch": 0.4966163456533056, + "grad_norm": 0.26491394611771796, + "learning_rate": 4.763023872221965e-05, + "loss": 0.1677, + "step": 954 + }, + { + "epoch": 0.4971369078604893, + "grad_norm": 0.27853554570105976, + "learning_rate": 4.762426435942812e-05, + "loss": 0.1738, + "step": 955 + }, + { + "epoch": 0.4976574700676731, + "grad_norm": 0.3177673300895612, + "learning_rate": 4.761828285084107e-05, + "loss": 0.1758, + "step": 956 + }, + { + "epoch": 0.49817803227485685, + "grad_norm": 0.2822714144980152, + "learning_rate": 4.761229419834772e-05, + "loss": 0.1688, + "step": 957 + }, + { + "epoch": 0.4986985944820406, + "grad_norm": 0.2931606790760609, + "learning_rate": 4.7606298403839586e-05, + "loss": 0.1714, + "step": 958 + }, + { + "epoch": 0.49921915668922434, + "grad_norm": 0.3067169511093501, + "learning_rate": 4.760029546921041e-05, + "loss": 0.1707, + "step": 959 + }, + { + "epoch": 0.4997397188964081, + "grad_norm": 0.28137140614635753, + "learning_rate": 4.7594285396356184e-05, + "loss": 0.1761, + "step": 960 + }, + { + "epoch": 0.5002602811035919, + "grad_norm": 0.28844189796145525, + "learning_rate": 4.75882681871752e-05, + "loss": 0.1666, + "step": 961 + }, + { + "epoch": 0.5007808433107757, + "grad_norm": 0.2878826071675348, + "learning_rate": 4.758224384356795e-05, + "loss": 0.1683, + "step": 962 + }, + { + "epoch": 0.5013014055179594, + "grad_norm": 0.2998487365957084, + "learning_rate": 4.75762123674372e-05, + "loss": 0.1686, + "step": 963 + }, + { + "epoch": 0.5018219677251432, + "grad_norm": 0.27860322662131287, + "learning_rate": 4.757017376068799e-05, + "loss": 0.1729, + "step": 964 + }, + { + "epoch": 0.5023425299323269, + "grad_norm": 0.27665779300643556, + "learning_rate": 4.7564128025227566e-05, + "loss": 0.1669, + "step": 965 + }, + { + "epoch": 0.5028630921395106, + "grad_norm": 0.2961362102520933, + "learning_rate": 4.755807516296548e-05, + "loss": 0.1731, + "step": 966 + }, + { + "epoch": 0.5033836543466944, + "grad_norm": 0.28373948335406174, + "learning_rate": 4.755201517581349e-05, + "loss": 0.1647, + "step": 967 + }, + { + "epoch": 0.5039042165538782, + "grad_norm": 0.3027972283519984, + "learning_rate": 4.754594806568562e-05, + "loss": 0.1744, + "step": 968 + }, + { + "epoch": 0.504424778761062, + "grad_norm": 0.32929413135285035, + "learning_rate": 4.753987383449816e-05, + "loss": 0.1677, + "step": 969 + }, + { + "epoch": 0.5049453409682457, + "grad_norm": 0.29541171416091216, + "learning_rate": 4.753379248416963e-05, + "loss": 0.1718, + "step": 970 + }, + { + "epoch": 0.5054659031754295, + "grad_norm": 0.3445502102948572, + "learning_rate": 4.75277040166208e-05, + "loss": 0.1753, + "step": 971 + }, + { + "epoch": 0.5059864653826133, + "grad_norm": 0.2832624623321996, + "learning_rate": 4.752160843377469e-05, + "loss": 0.1674, + "step": 972 + }, + { + "epoch": 0.5065070275897969, + "grad_norm": 0.2837250649227678, + "learning_rate": 4.751550573755658e-05, + "loss": 0.1648, + "step": 973 + }, + { + "epoch": 0.5070275897969807, + "grad_norm": 0.30259169690721716, + "learning_rate": 4.750939592989396e-05, + "loss": 0.1715, + "step": 974 + }, + { + "epoch": 0.5075481520041645, + "grad_norm": 0.29554573173203236, + "learning_rate": 4.750327901271662e-05, + "loss": 0.1767, + "step": 975 + }, + { + "epoch": 0.5080687142113482, + "grad_norm": 0.3048549802014968, + "learning_rate": 4.7497154987956554e-05, + "loss": 0.1736, + "step": 976 + }, + { + "epoch": 0.508589276418532, + "grad_norm": 0.28859818824566424, + "learning_rate": 4.749102385754802e-05, + "loss": 0.1762, + "step": 977 + }, + { + "epoch": 0.5091098386257158, + "grad_norm": 0.2907661277669279, + "learning_rate": 4.74848856234275e-05, + "loss": 0.169, + "step": 978 + }, + { + "epoch": 0.5096304008328996, + "grad_norm": 0.3020314402188709, + "learning_rate": 4.747874028753375e-05, + "loss": 0.1705, + "step": 979 + }, + { + "epoch": 0.5101509630400833, + "grad_norm": 0.28916453332411907, + "learning_rate": 4.747258785180774e-05, + "loss": 0.169, + "step": 980 + }, + { + "epoch": 0.510671525247267, + "grad_norm": 0.27306730871455454, + "learning_rate": 4.746642831819271e-05, + "loss": 0.1712, + "step": 981 + }, + { + "epoch": 0.5111920874544508, + "grad_norm": 0.3577975221682602, + "learning_rate": 4.746026168863412e-05, + "loss": 0.1714, + "step": 982 + }, + { + "epoch": 0.5117126496616345, + "grad_norm": 0.28463660820354614, + "learning_rate": 4.7454087965079675e-05, + "loss": 0.1622, + "step": 983 + }, + { + "epoch": 0.5122332118688183, + "grad_norm": 0.31003657876962476, + "learning_rate": 4.744790714947932e-05, + "loss": 0.1695, + "step": 984 + }, + { + "epoch": 0.5127537740760021, + "grad_norm": 0.27961769726505104, + "learning_rate": 4.744171924378526e-05, + "loss": 0.163, + "step": 985 + }, + { + "epoch": 0.5132743362831859, + "grad_norm": 0.33088943157458217, + "learning_rate": 4.743552424995191e-05, + "loss": 0.17, + "step": 986 + }, + { + "epoch": 0.5137948984903696, + "grad_norm": 0.3056859101086278, + "learning_rate": 4.7429322169935955e-05, + "loss": 0.1697, + "step": 987 + }, + { + "epoch": 0.5143154606975534, + "grad_norm": 0.31119710051950294, + "learning_rate": 4.7423113005696275e-05, + "loss": 0.1671, + "step": 988 + }, + { + "epoch": 0.5148360229047371, + "grad_norm": 0.3143711776010868, + "learning_rate": 4.741689675919403e-05, + "loss": 0.1698, + "step": 989 + }, + { + "epoch": 0.5153565851119208, + "grad_norm": 0.3061616967117495, + "learning_rate": 4.7410673432392596e-05, + "loss": 0.1842, + "step": 990 + }, + { + "epoch": 0.5158771473191046, + "grad_norm": 0.29278315778967423, + "learning_rate": 4.740444302725759e-05, + "loss": 0.1678, + "step": 991 + }, + { + "epoch": 0.5163977095262884, + "grad_norm": 0.2957634913771008, + "learning_rate": 4.7398205545756863e-05, + "loss": 0.1751, + "step": 992 + }, + { + "epoch": 0.5169182717334722, + "grad_norm": 0.279576531903814, + "learning_rate": 4.7391960989860504e-05, + "loss": 0.1633, + "step": 993 + }, + { + "epoch": 0.5174388339406559, + "grad_norm": 0.3073973561502015, + "learning_rate": 4.738570936154083e-05, + "loss": 0.1645, + "step": 994 + }, + { + "epoch": 0.5179593961478397, + "grad_norm": 0.29041369434808123, + "learning_rate": 4.7379450662772394e-05, + "loss": 0.1626, + "step": 995 + }, + { + "epoch": 0.5184799583550235, + "grad_norm": 0.30177127364477313, + "learning_rate": 4.737318489553199e-05, + "loss": 0.175, + "step": 996 + }, + { + "epoch": 0.5190005205622071, + "grad_norm": 0.29152000130899824, + "learning_rate": 4.736691206179864e-05, + "loss": 0.1713, + "step": 997 + }, + { + "epoch": 0.5195210827693909, + "grad_norm": 0.3104231023579747, + "learning_rate": 4.7360632163553595e-05, + "loss": 0.1703, + "step": 998 + }, + { + "epoch": 0.5200416449765747, + "grad_norm": 0.28598295804809104, + "learning_rate": 4.735434520278034e-05, + "loss": 0.1685, + "step": 999 + }, + { + "epoch": 0.5205622071837585, + "grad_norm": 0.31603631518913927, + "learning_rate": 4.734805118146459e-05, + "loss": 0.1786, + "step": 1000 + }, + { + "epoch": 0.5210827693909422, + "grad_norm": 0.3129741436841726, + "learning_rate": 4.734175010159428e-05, + "loss": 0.1732, + "step": 1001 + }, + { + "epoch": 0.521603331598126, + "grad_norm": 0.3176913349121328, + "learning_rate": 4.73354419651596e-05, + "loss": 0.1678, + "step": 1002 + }, + { + "epoch": 0.5221238938053098, + "grad_norm": 0.33041896990240904, + "learning_rate": 4.732912677415294e-05, + "loss": 0.1645, + "step": 1003 + }, + { + "epoch": 0.5226444560124935, + "grad_norm": 0.2830876561876967, + "learning_rate": 4.732280453056894e-05, + "loss": 0.1679, + "step": 1004 + }, + { + "epoch": 0.5231650182196772, + "grad_norm": 0.34261613836366955, + "learning_rate": 4.7316475236404454e-05, + "loss": 0.1737, + "step": 1005 + }, + { + "epoch": 0.523685580426861, + "grad_norm": 0.2760763443773014, + "learning_rate": 4.731013889365857e-05, + "loss": 0.1709, + "step": 1006 + }, + { + "epoch": 0.5242061426340447, + "grad_norm": 0.28543357092624544, + "learning_rate": 4.7303795504332604e-05, + "loss": 0.1694, + "step": 1007 + }, + { + "epoch": 0.5247267048412285, + "grad_norm": 0.2968860076469517, + "learning_rate": 4.729744507043008e-05, + "loss": 0.1686, + "step": 1008 + }, + { + "epoch": 0.5252472670484123, + "grad_norm": 0.2955425625928012, + "learning_rate": 4.729108759395677e-05, + "loss": 0.1668, + "step": 1009 + }, + { + "epoch": 0.5257678292555961, + "grad_norm": 0.3064862906090259, + "learning_rate": 4.728472307692067e-05, + "loss": 0.1717, + "step": 1010 + }, + { + "epoch": 0.5262883914627798, + "grad_norm": 0.2944863868003504, + "learning_rate": 4.727835152133197e-05, + "loss": 0.1658, + "step": 1011 + }, + { + "epoch": 0.5268089536699636, + "grad_norm": 0.29581781869973445, + "learning_rate": 4.727197292920312e-05, + "loss": 0.1664, + "step": 1012 + }, + { + "epoch": 0.5273295158771473, + "grad_norm": 0.2655727030333062, + "learning_rate": 4.7265587302548766e-05, + "loss": 0.1588, + "step": 1013 + }, + { + "epoch": 0.527850078084331, + "grad_norm": 0.32401890632087926, + "learning_rate": 4.7259194643385796e-05, + "loss": 0.1689, + "step": 1014 + }, + { + "epoch": 0.5283706402915148, + "grad_norm": 0.31117831297374177, + "learning_rate": 4.7252794953733294e-05, + "loss": 0.1776, + "step": 1015 + }, + { + "epoch": 0.5288912024986986, + "grad_norm": 0.3060852587762897, + "learning_rate": 4.72463882356126e-05, + "loss": 0.1671, + "step": 1016 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 0.32556487990025834, + "learning_rate": 4.7239974491047236e-05, + "loss": 0.1775, + "step": 1017 + }, + { + "epoch": 0.5299323269130661, + "grad_norm": 0.2799847281876232, + "learning_rate": 4.723355372206297e-05, + "loss": 0.1675, + "step": 1018 + }, + { + "epoch": 0.5304528891202499, + "grad_norm": 0.2821768720138456, + "learning_rate": 4.722712593068779e-05, + "loss": 0.1664, + "step": 1019 + }, + { + "epoch": 0.5309734513274337, + "grad_norm": 0.3123419119344323, + "learning_rate": 4.722069111895187e-05, + "loss": 0.1733, + "step": 1020 + }, + { + "epoch": 0.5314940135346173, + "grad_norm": 0.30697486370866406, + "learning_rate": 4.721424928888763e-05, + "loss": 0.1722, + "step": 1021 + }, + { + "epoch": 0.5320145757418011, + "grad_norm": 0.2895747411066602, + "learning_rate": 4.7207800442529706e-05, + "loss": 0.1647, + "step": 1022 + }, + { + "epoch": 0.5325351379489849, + "grad_norm": 0.28336131313295926, + "learning_rate": 4.720134458191494e-05, + "loss": 0.1657, + "step": 1023 + }, + { + "epoch": 0.5330557001561687, + "grad_norm": 0.2902463741761351, + "learning_rate": 4.719488170908239e-05, + "loss": 0.1579, + "step": 1024 + }, + { + "epoch": 0.5335762623633524, + "grad_norm": 0.295514891496485, + "learning_rate": 4.718841182607334e-05, + "loss": 0.1716, + "step": 1025 + }, + { + "epoch": 0.5340968245705362, + "grad_norm": 0.32915905626702163, + "learning_rate": 4.718193493493127e-05, + "loss": 0.1671, + "step": 1026 + }, + { + "epoch": 0.53461738677772, + "grad_norm": 0.3026674578937171, + "learning_rate": 4.717545103770189e-05, + "loss": 0.158, + "step": 1027 + }, + { + "epoch": 0.5351379489849037, + "grad_norm": 0.28381405365121987, + "learning_rate": 4.716896013643313e-05, + "loss": 0.1677, + "step": 1028 + }, + { + "epoch": 0.5356585111920874, + "grad_norm": 0.29824689910486607, + "learning_rate": 4.716246223317509e-05, + "loss": 0.1703, + "step": 1029 + }, + { + "epoch": 0.5361790733992712, + "grad_norm": 0.33329704323409864, + "learning_rate": 4.7155957329980126e-05, + "loss": 0.1698, + "step": 1030 + }, + { + "epoch": 0.536699635606455, + "grad_norm": 0.3004672079802441, + "learning_rate": 4.7149445428902786e-05, + "loss": 0.1589, + "step": 1031 + }, + { + "epoch": 0.5372201978136387, + "grad_norm": 0.31860987261597545, + "learning_rate": 4.714292653199984e-05, + "loss": 0.1672, + "step": 1032 + }, + { + "epoch": 0.5377407600208225, + "grad_norm": 0.3442296126297135, + "learning_rate": 4.713640064133025e-05, + "loss": 0.1632, + "step": 1033 + }, + { + "epoch": 0.5382613222280063, + "grad_norm": 0.3064387531445744, + "learning_rate": 4.7129867758955196e-05, + "loss": 0.1635, + "step": 1034 + }, + { + "epoch": 0.53878188443519, + "grad_norm": 0.3123991398030029, + "learning_rate": 4.7123327886938076e-05, + "loss": 0.1664, + "step": 1035 + }, + { + "epoch": 0.5393024466423738, + "grad_norm": 0.2900379032500118, + "learning_rate": 4.711678102734447e-05, + "loss": 0.1699, + "step": 1036 + }, + { + "epoch": 0.5398230088495575, + "grad_norm": 0.3101402740827743, + "learning_rate": 4.711022718224218e-05, + "loss": 0.1674, + "step": 1037 + }, + { + "epoch": 0.5403435710567412, + "grad_norm": 0.2978308638116788, + "learning_rate": 4.710366635370124e-05, + "loss": 0.1629, + "step": 1038 + }, + { + "epoch": 0.540864133263925, + "grad_norm": 0.2934026273877542, + "learning_rate": 4.709709854379385e-05, + "loss": 0.1557, + "step": 1039 + }, + { + "epoch": 0.5413846954711088, + "grad_norm": 0.29754336522736463, + "learning_rate": 4.709052375459442e-05, + "loss": 0.1708, + "step": 1040 + }, + { + "epoch": 0.5419052576782926, + "grad_norm": 0.2941537759903196, + "learning_rate": 4.7083941988179594e-05, + "loss": 0.1688, + "step": 1041 + }, + { + "epoch": 0.5424258198854763, + "grad_norm": 0.3197429755584853, + "learning_rate": 4.707735324662818e-05, + "loss": 0.1711, + "step": 1042 + }, + { + "epoch": 0.5429463820926601, + "grad_norm": 0.29878401691495904, + "learning_rate": 4.7070757532021224e-05, + "loss": 0.1667, + "step": 1043 + }, + { + "epoch": 0.5434669442998439, + "grad_norm": 0.34624169271494265, + "learning_rate": 4.706415484644195e-05, + "loss": 0.1719, + "step": 1044 + }, + { + "epoch": 0.5439875065070275, + "grad_norm": 0.2825166282554898, + "learning_rate": 4.705754519197581e-05, + "loss": 0.163, + "step": 1045 + }, + { + "epoch": 0.5445080687142113, + "grad_norm": 0.3328465767542314, + "learning_rate": 4.705092857071042e-05, + "loss": 0.1655, + "step": 1046 + }, + { + "epoch": 0.5450286309213951, + "grad_norm": 0.3231665017996849, + "learning_rate": 4.704430498473562e-05, + "loss": 0.1666, + "step": 1047 + }, + { + "epoch": 0.5455491931285789, + "grad_norm": 0.33401260004138217, + "learning_rate": 4.7037674436143466e-05, + "loss": 0.1704, + "step": 1048 + }, + { + "epoch": 0.5460697553357626, + "grad_norm": 0.30948523577299875, + "learning_rate": 4.703103692702817e-05, + "loss": 0.1682, + "step": 1049 + }, + { + "epoch": 0.5465903175429464, + "grad_norm": 0.33513343731742634, + "learning_rate": 4.7024392459486176e-05, + "loss": 0.1719, + "step": 1050 + }, + { + "epoch": 0.5471108797501302, + "grad_norm": 0.29920676443214966, + "learning_rate": 4.7017741035616124e-05, + "loss": 0.1624, + "step": 1051 + }, + { + "epoch": 0.547631441957314, + "grad_norm": 0.3063741016132584, + "learning_rate": 4.701108265751884e-05, + "loss": 0.1651, + "step": 1052 + }, + { + "epoch": 0.5481520041644976, + "grad_norm": 0.2934234878431351, + "learning_rate": 4.7004417327297325e-05, + "loss": 0.1648, + "step": 1053 + }, + { + "epoch": 0.5486725663716814, + "grad_norm": 0.30182853291753425, + "learning_rate": 4.6997745047056836e-05, + "loss": 0.1626, + "step": 1054 + }, + { + "epoch": 0.5491931285788652, + "grad_norm": 0.3175333083074021, + "learning_rate": 4.699106581890477e-05, + "loss": 0.1719, + "step": 1055 + }, + { + "epoch": 0.5497136907860489, + "grad_norm": 0.31314464288948607, + "learning_rate": 4.698437964495074e-05, + "loss": 0.1598, + "step": 1056 + }, + { + "epoch": 0.5502342529932327, + "grad_norm": 0.32599179752881036, + "learning_rate": 4.6977686527306556e-05, + "loss": 0.1631, + "step": 1057 + }, + { + "epoch": 0.5507548152004165, + "grad_norm": 0.3308531331011867, + "learning_rate": 4.697098646808621e-05, + "loss": 0.17, + "step": 1058 + }, + { + "epoch": 0.5512753774076002, + "grad_norm": 0.2982979929430078, + "learning_rate": 4.69642794694059e-05, + "loss": 0.1708, + "step": 1059 + }, + { + "epoch": 0.551795939614784, + "grad_norm": 0.32955914457005925, + "learning_rate": 4.695756553338401e-05, + "loss": 0.1671, + "step": 1060 + }, + { + "epoch": 0.5523165018219677, + "grad_norm": 0.3070588254800077, + "learning_rate": 4.6950844662141096e-05, + "loss": 0.171, + "step": 1061 + }, + { + "epoch": 0.5528370640291514, + "grad_norm": 0.31217947398655915, + "learning_rate": 4.6944116857799936e-05, + "loss": 0.1697, + "step": 1062 + }, + { + "epoch": 0.5533576262363352, + "grad_norm": 0.2934020560226863, + "learning_rate": 4.6937382122485484e-05, + "loss": 0.1744, + "step": 1063 + }, + { + "epoch": 0.553878188443519, + "grad_norm": 0.2854842827185457, + "learning_rate": 4.693064045832488e-05, + "loss": 0.161, + "step": 1064 + }, + { + "epoch": 0.5543987506507028, + "grad_norm": 0.2815405598740039, + "learning_rate": 4.692389186744745e-05, + "loss": 0.1621, + "step": 1065 + }, + { + "epoch": 0.5549193128578865, + "grad_norm": 0.28832423549074593, + "learning_rate": 4.691713635198473e-05, + "loss": 0.1621, + "step": 1066 + }, + { + "epoch": 0.5554398750650703, + "grad_norm": 0.29233220575342456, + "learning_rate": 4.6910373914070404e-05, + "loss": 0.1664, + "step": 1067 + }, + { + "epoch": 0.5559604372722541, + "grad_norm": 0.2904867981234466, + "learning_rate": 4.6903604555840374e-05, + "loss": 0.1709, + "step": 1068 + }, + { + "epoch": 0.5564809994794377, + "grad_norm": 0.33583477749715884, + "learning_rate": 4.6896828279432725e-05, + "loss": 0.1592, + "step": 1069 + }, + { + "epoch": 0.5570015616866215, + "grad_norm": 0.2809477801971266, + "learning_rate": 4.689004508698771e-05, + "loss": 0.1676, + "step": 1070 + }, + { + "epoch": 0.5575221238938053, + "grad_norm": 0.32697745725327254, + "learning_rate": 4.6883254980647787e-05, + "loss": 0.1671, + "step": 1071 + }, + { + "epoch": 0.5580426861009891, + "grad_norm": 0.28644207450920894, + "learning_rate": 4.6876457962557575e-05, + "loss": 0.1704, + "step": 1072 + }, + { + "epoch": 0.5585632483081728, + "grad_norm": 0.3293642883398715, + "learning_rate": 4.68696540348639e-05, + "loss": 0.1722, + "step": 1073 + }, + { + "epoch": 0.5590838105153566, + "grad_norm": 0.2844767472247252, + "learning_rate": 4.686284319971576e-05, + "loss": 0.166, + "step": 1074 + }, + { + "epoch": 0.5596043727225404, + "grad_norm": 0.30371406025926573, + "learning_rate": 4.685602545926432e-05, + "loss": 0.1718, + "step": 1075 + }, + { + "epoch": 0.5601249349297241, + "grad_norm": 0.303036966019252, + "learning_rate": 4.684920081566295e-05, + "loss": 0.1696, + "step": 1076 + }, + { + "epoch": 0.5606454971369078, + "grad_norm": 0.2771701255154809, + "learning_rate": 4.6842369271067185e-05, + "loss": 0.1606, + "step": 1077 + }, + { + "epoch": 0.5611660593440916, + "grad_norm": 0.3233636024191239, + "learning_rate": 4.683553082763475e-05, + "loss": 0.1699, + "step": 1078 + }, + { + "epoch": 0.5616866215512754, + "grad_norm": 0.28548453667968193, + "learning_rate": 4.6828685487525554e-05, + "loss": 0.1621, + "step": 1079 + }, + { + "epoch": 0.5622071837584591, + "grad_norm": 0.33431595584526136, + "learning_rate": 4.6821833252901646e-05, + "loss": 0.1579, + "step": 1080 + }, + { + "epoch": 0.5627277459656429, + "grad_norm": 0.29278248350889036, + "learning_rate": 4.6814974125927304e-05, + "loss": 0.1622, + "step": 1081 + }, + { + "epoch": 0.5632483081728267, + "grad_norm": 0.31082966905234066, + "learning_rate": 4.680810810876895e-05, + "loss": 0.1745, + "step": 1082 + }, + { + "epoch": 0.5637688703800104, + "grad_norm": 0.304493159369903, + "learning_rate": 4.6801235203595195e-05, + "loss": 0.1657, + "step": 1083 + }, + { + "epoch": 0.5642894325871942, + "grad_norm": 0.2949622274273174, + "learning_rate": 4.679435541257682e-05, + "loss": 0.1608, + "step": 1084 + }, + { + "epoch": 0.5648099947943779, + "grad_norm": 0.27199415580975606, + "learning_rate": 4.678746873788677e-05, + "loss": 0.1626, + "step": 1085 + }, + { + "epoch": 0.5653305570015617, + "grad_norm": 0.30641660626180384, + "learning_rate": 4.678057518170021e-05, + "loss": 0.1705, + "step": 1086 + }, + { + "epoch": 0.5658511192087454, + "grad_norm": 0.3045178599347487, + "learning_rate": 4.677367474619442e-05, + "loss": 0.164, + "step": 1087 + }, + { + "epoch": 0.5663716814159292, + "grad_norm": 0.2942177088983781, + "learning_rate": 4.6766767433548885e-05, + "loss": 0.1702, + "step": 1088 + }, + { + "epoch": 0.566892243623113, + "grad_norm": 0.3112050068584472, + "learning_rate": 4.6759853245945256e-05, + "loss": 0.1727, + "step": 1089 + }, + { + "epoch": 0.5674128058302967, + "grad_norm": 0.2968014199642348, + "learning_rate": 4.675293218556735e-05, + "loss": 0.1605, + "step": 1090 + }, + { + "epoch": 0.5679333680374805, + "grad_norm": 0.30379202567617314, + "learning_rate": 4.6746004254601184e-05, + "loss": 0.1665, + "step": 1091 + }, + { + "epoch": 0.5684539302446643, + "grad_norm": 0.4356850886928418, + "learning_rate": 4.6739069455234886e-05, + "loss": 0.1737, + "step": 1092 + }, + { + "epoch": 0.568974492451848, + "grad_norm": 0.2873286927122915, + "learning_rate": 4.673212778965881e-05, + "loss": 0.1671, + "step": 1093 + }, + { + "epoch": 0.5694950546590317, + "grad_norm": 0.307373163176704, + "learning_rate": 4.672517926006545e-05, + "loss": 0.1696, + "step": 1094 + }, + { + "epoch": 0.5700156168662155, + "grad_norm": 0.284743433783671, + "learning_rate": 4.671822386864948e-05, + "loss": 0.1603, + "step": 1095 + }, + { + "epoch": 0.5705361790733993, + "grad_norm": 0.3019989838556474, + "learning_rate": 4.6711261617607725e-05, + "loss": 0.1735, + "step": 1096 + }, + { + "epoch": 0.571056741280583, + "grad_norm": 0.3033519694233394, + "learning_rate": 4.670429250913921e-05, + "loss": 0.1639, + "step": 1097 + }, + { + "epoch": 0.5715773034877668, + "grad_norm": 0.31363574509134545, + "learning_rate": 4.669731654544508e-05, + "loss": 0.161, + "step": 1098 + }, + { + "epoch": 0.5720978656949506, + "grad_norm": 0.28285834283513167, + "learning_rate": 4.669033372872868e-05, + "loss": 0.1665, + "step": 1099 + }, + { + "epoch": 0.5726184279021344, + "grad_norm": 0.2628857284685492, + "learning_rate": 4.668334406119551e-05, + "loss": 0.1566, + "step": 1100 + }, + { + "epoch": 0.573138990109318, + "grad_norm": 0.2655054843532405, + "learning_rate": 4.667634754505323e-05, + "loss": 0.1604, + "step": 1101 + }, + { + "epoch": 0.5736595523165018, + "grad_norm": 0.26626184972682054, + "learning_rate": 4.666934418251166e-05, + "loss": 0.1554, + "step": 1102 + }, + { + "epoch": 0.5741801145236856, + "grad_norm": 0.28305815343523066, + "learning_rate": 4.6662333975782795e-05, + "loss": 0.1698, + "step": 1103 + }, + { + "epoch": 0.5747006767308693, + "grad_norm": 0.2707887337606054, + "learning_rate": 4.6655316927080784e-05, + "loss": 0.165, + "step": 1104 + }, + { + "epoch": 0.5752212389380531, + "grad_norm": 0.2917568726342999, + "learning_rate": 4.664829303862194e-05, + "loss": 0.1686, + "step": 1105 + }, + { + "epoch": 0.5757418011452369, + "grad_norm": 0.26069329637196703, + "learning_rate": 4.664126231262472e-05, + "loss": 0.1625, + "step": 1106 + }, + { + "epoch": 0.5762623633524206, + "grad_norm": 0.27239030458009167, + "learning_rate": 4.663422475130977e-05, + "loss": 0.1537, + "step": 1107 + }, + { + "epoch": 0.5767829255596044, + "grad_norm": 0.2575395498508633, + "learning_rate": 4.662718035689987e-05, + "loss": 0.1617, + "step": 1108 + }, + { + "epoch": 0.5773034877667881, + "grad_norm": 0.27545374757966323, + "learning_rate": 4.662012913161997e-05, + "loss": 0.1616, + "step": 1109 + }, + { + "epoch": 0.5778240499739719, + "grad_norm": 0.28448600250229067, + "learning_rate": 4.661307107769718e-05, + "loss": 0.1646, + "step": 1110 + }, + { + "epoch": 0.5783446121811556, + "grad_norm": 0.27298548291298, + "learning_rate": 4.660600619736076e-05, + "loss": 0.1628, + "step": 1111 + }, + { + "epoch": 0.5788651743883394, + "grad_norm": 0.3145003842578059, + "learning_rate": 4.6598934492842114e-05, + "loss": 0.1737, + "step": 1112 + }, + { + "epoch": 0.5793857365955232, + "grad_norm": 0.27194978692066746, + "learning_rate": 4.659185596637484e-05, + "loss": 0.1621, + "step": 1113 + }, + { + "epoch": 0.5799062988027069, + "grad_norm": 0.28956670097111964, + "learning_rate": 4.658477062019465e-05, + "loss": 0.1722, + "step": 1114 + }, + { + "epoch": 0.5804268610098907, + "grad_norm": 0.2826807429961728, + "learning_rate": 4.657767845653943e-05, + "loss": 0.1688, + "step": 1115 + }, + { + "epoch": 0.5809474232170745, + "grad_norm": 0.28708843741623624, + "learning_rate": 4.657057947764922e-05, + "loss": 0.1649, + "step": 1116 + }, + { + "epoch": 0.5814679854242581, + "grad_norm": 0.29098772804179435, + "learning_rate": 4.656347368576619e-05, + "loss": 0.169, + "step": 1117 + }, + { + "epoch": 0.5819885476314419, + "grad_norm": 0.3050742875550255, + "learning_rate": 4.6556361083134705e-05, + "loss": 0.1736, + "step": 1118 + }, + { + "epoch": 0.5825091098386257, + "grad_norm": 0.2883430860003268, + "learning_rate": 4.654924167200123e-05, + "loss": 0.1636, + "step": 1119 + }, + { + "epoch": 0.5830296720458095, + "grad_norm": 0.2694576550703567, + "learning_rate": 4.654211545461443e-05, + "loss": 0.1596, + "step": 1120 + }, + { + "epoch": 0.5835502342529932, + "grad_norm": 0.29077979216048633, + "learning_rate": 4.653498243322508e-05, + "loss": 0.1614, + "step": 1121 + }, + { + "epoch": 0.584070796460177, + "grad_norm": 0.30454032706720835, + "learning_rate": 4.652784261008613e-05, + "loss": 0.1657, + "step": 1122 + }, + { + "epoch": 0.5845913586673608, + "grad_norm": 0.2809170140826117, + "learning_rate": 4.652069598745267e-05, + "loss": 0.1674, + "step": 1123 + }, + { + "epoch": 0.5851119208745446, + "grad_norm": 0.2998875317771076, + "learning_rate": 4.6513542567581914e-05, + "loss": 0.1745, + "step": 1124 + }, + { + "epoch": 0.5856324830817282, + "grad_norm": 0.2907341448806623, + "learning_rate": 4.650638235273327e-05, + "loss": 0.1722, + "step": 1125 + }, + { + "epoch": 0.586153045288912, + "grad_norm": 0.3240717556080497, + "learning_rate": 4.6499215345168255e-05, + "loss": 0.1715, + "step": 1126 + }, + { + "epoch": 0.5866736074960958, + "grad_norm": 0.2856383542388814, + "learning_rate": 4.6492041547150555e-05, + "loss": 0.1704, + "step": 1127 + }, + { + "epoch": 0.5871941697032795, + "grad_norm": 0.29395504573057224, + "learning_rate": 4.648486096094597e-05, + "loss": 0.1644, + "step": 1128 + }, + { + "epoch": 0.5877147319104633, + "grad_norm": 0.29455509913506334, + "learning_rate": 4.647767358882249e-05, + "loss": 0.1637, + "step": 1129 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.27422990925352025, + "learning_rate": 4.647047943305019e-05, + "loss": 0.1581, + "step": 1130 + }, + { + "epoch": 0.5887558563248309, + "grad_norm": 0.28624654561233115, + "learning_rate": 4.646327849590134e-05, + "loss": 0.1659, + "step": 1131 + }, + { + "epoch": 0.5892764185320146, + "grad_norm": 0.2589094816834805, + "learning_rate": 4.6456070779650326e-05, + "loss": 0.1545, + "step": 1132 + }, + { + "epoch": 0.5897969807391983, + "grad_norm": 0.28489485708163403, + "learning_rate": 4.6448856286573684e-05, + "loss": 0.1669, + "step": 1133 + }, + { + "epoch": 0.5903175429463821, + "grad_norm": 0.27991494765414887, + "learning_rate": 4.644163501895008e-05, + "loss": 0.1692, + "step": 1134 + }, + { + "epoch": 0.5908381051535658, + "grad_norm": 0.271863121129334, + "learning_rate": 4.643440697906033e-05, + "loss": 0.1703, + "step": 1135 + }, + { + "epoch": 0.5913586673607496, + "grad_norm": 0.261056414536902, + "learning_rate": 4.642717216918738e-05, + "loss": 0.1621, + "step": 1136 + }, + { + "epoch": 0.5918792295679334, + "grad_norm": 0.2929328942515898, + "learning_rate": 4.6419930591616336e-05, + "loss": 0.1629, + "step": 1137 + }, + { + "epoch": 0.5923997917751171, + "grad_norm": 0.2606767168924215, + "learning_rate": 4.641268224863441e-05, + "loss": 0.1654, + "step": 1138 + }, + { + "epoch": 0.5929203539823009, + "grad_norm": 0.2908040086603664, + "learning_rate": 4.6405427142530954e-05, + "loss": 0.1588, + "step": 1139 + }, + { + "epoch": 0.5934409161894847, + "grad_norm": 0.2611032398127685, + "learning_rate": 4.6398165275597494e-05, + "loss": 0.1616, + "step": 1140 + }, + { + "epoch": 0.5939614783966684, + "grad_norm": 0.27150334105117246, + "learning_rate": 4.6390896650127656e-05, + "loss": 0.1693, + "step": 1141 + }, + { + "epoch": 0.5944820406038521, + "grad_norm": 0.27540568978983837, + "learning_rate": 4.638362126841721e-05, + "loss": 0.167, + "step": 1142 + }, + { + "epoch": 0.5950026028110359, + "grad_norm": 0.2676573123405451, + "learning_rate": 4.637633913276406e-05, + "loss": 0.1643, + "step": 1143 + }, + { + "epoch": 0.5955231650182197, + "grad_norm": 0.2607951523773393, + "learning_rate": 4.6369050245468243e-05, + "loss": 0.1672, + "step": 1144 + }, + { + "epoch": 0.5960437272254034, + "grad_norm": 0.28522127225344346, + "learning_rate": 4.636175460883193e-05, + "loss": 0.1689, + "step": 1145 + }, + { + "epoch": 0.5965642894325872, + "grad_norm": 0.28313291478789127, + "learning_rate": 4.6354452225159416e-05, + "loss": 0.1678, + "step": 1146 + }, + { + "epoch": 0.597084851639771, + "grad_norm": 0.2781466635540155, + "learning_rate": 4.634714309675714e-05, + "loss": 0.1693, + "step": 1147 + }, + { + "epoch": 0.5976054138469548, + "grad_norm": 0.2993701779876083, + "learning_rate": 4.6339827225933665e-05, + "loss": 0.178, + "step": 1148 + }, + { + "epoch": 0.5981259760541384, + "grad_norm": 0.2757353899839654, + "learning_rate": 4.6332504614999684e-05, + "loss": 0.1686, + "step": 1149 + }, + { + "epoch": 0.5986465382613222, + "grad_norm": 0.2893083043239989, + "learning_rate": 4.6325175266268005e-05, + "loss": 0.1613, + "step": 1150 + }, + { + "epoch": 0.599167100468506, + "grad_norm": 0.28169522960512217, + "learning_rate": 4.6317839182053603e-05, + "loss": 0.1612, + "step": 1151 + }, + { + "epoch": 0.5996876626756897, + "grad_norm": 0.32089924116837154, + "learning_rate": 4.6310496364673534e-05, + "loss": 0.1688, + "step": 1152 + }, + { + "epoch": 0.6002082248828735, + "grad_norm": 0.259177259755011, + "learning_rate": 4.630314681644701e-05, + "loss": 0.159, + "step": 1153 + }, + { + "epoch": 0.6007287870900573, + "grad_norm": 0.3124787935064602, + "learning_rate": 4.6295790539695354e-05, + "loss": 0.1642, + "step": 1154 + }, + { + "epoch": 0.601249349297241, + "grad_norm": 0.297935731727392, + "learning_rate": 4.628842753674203e-05, + "loss": 0.1592, + "step": 1155 + }, + { + "epoch": 0.6017699115044248, + "grad_norm": 0.2917605841000676, + "learning_rate": 4.628105780991261e-05, + "loss": 0.1738, + "step": 1156 + }, + { + "epoch": 0.6022904737116085, + "grad_norm": 0.2742189116346638, + "learning_rate": 4.6273681361534796e-05, + "loss": 0.1676, + "step": 1157 + }, + { + "epoch": 0.6028110359187923, + "grad_norm": 0.2831911499474275, + "learning_rate": 4.626629819393842e-05, + "loss": 0.1659, + "step": 1158 + }, + { + "epoch": 0.603331598125976, + "grad_norm": 0.30018675422590946, + "learning_rate": 4.6258908309455424e-05, + "loss": 0.1624, + "step": 1159 + }, + { + "epoch": 0.6038521603331598, + "grad_norm": 0.29631188813134685, + "learning_rate": 4.625151171041988e-05, + "loss": 0.1671, + "step": 1160 + }, + { + "epoch": 0.6043727225403436, + "grad_norm": 0.29206096012142957, + "learning_rate": 4.624410839916798e-05, + "loss": 0.1755, + "step": 1161 + }, + { + "epoch": 0.6048932847475273, + "grad_norm": 0.29805577625196694, + "learning_rate": 4.6236698378038026e-05, + "loss": 0.1674, + "step": 1162 + }, + { + "epoch": 0.6054138469547111, + "grad_norm": 0.284263114326214, + "learning_rate": 4.622928164937046e-05, + "loss": 0.1611, + "step": 1163 + }, + { + "epoch": 0.6059344091618949, + "grad_norm": 0.29763872316389595, + "learning_rate": 4.622185821550782e-05, + "loss": 0.1753, + "step": 1164 + }, + { + "epoch": 0.6064549713690786, + "grad_norm": 0.2869777890373756, + "learning_rate": 4.621442807879477e-05, + "loss": 0.1554, + "step": 1165 + }, + { + "epoch": 0.6069755335762623, + "grad_norm": 0.2743988330225703, + "learning_rate": 4.6206991241578115e-05, + "loss": 0.1662, + "step": 1166 + }, + { + "epoch": 0.6074960957834461, + "grad_norm": 0.27540743834473974, + "learning_rate": 4.6199547706206726e-05, + "loss": 0.1632, + "step": 1167 + }, + { + "epoch": 0.6080166579906299, + "grad_norm": 0.28312699949690634, + "learning_rate": 4.619209747503163e-05, + "loss": 0.1636, + "step": 1168 + }, + { + "epoch": 0.6085372201978136, + "grad_norm": 0.2637175185952636, + "learning_rate": 4.618464055040595e-05, + "loss": 0.1649, + "step": 1169 + }, + { + "epoch": 0.6090577824049974, + "grad_norm": 0.2907917728207116, + "learning_rate": 4.617717693468494e-05, + "loss": 0.1688, + "step": 1170 + }, + { + "epoch": 0.6095783446121812, + "grad_norm": 0.2726681176203265, + "learning_rate": 4.616970663022596e-05, + "loss": 0.1695, + "step": 1171 + }, + { + "epoch": 0.610098906819365, + "grad_norm": 0.3021128819540319, + "learning_rate": 4.616222963938847e-05, + "loss": 0.1689, + "step": 1172 + }, + { + "epoch": 0.6106194690265486, + "grad_norm": 0.2611672657701669, + "learning_rate": 4.615474596453405e-05, + "loss": 0.1601, + "step": 1173 + }, + { + "epoch": 0.6111400312337324, + "grad_norm": 0.27250524535283555, + "learning_rate": 4.6147255608026394e-05, + "loss": 0.1595, + "step": 1174 + }, + { + "epoch": 0.6116605934409162, + "grad_norm": 0.2780403650376052, + "learning_rate": 4.6139758572231315e-05, + "loss": 0.1613, + "step": 1175 + }, + { + "epoch": 0.6121811556480999, + "grad_norm": 0.28819496612165146, + "learning_rate": 4.613225485951672e-05, + "loss": 0.1701, + "step": 1176 + }, + { + "epoch": 0.6127017178552837, + "grad_norm": 0.289358937511583, + "learning_rate": 4.612474447225263e-05, + "loss": 0.1663, + "step": 1177 + }, + { + "epoch": 0.6132222800624675, + "grad_norm": 0.2720168641397797, + "learning_rate": 4.611722741281118e-05, + "loss": 0.1637, + "step": 1178 + }, + { + "epoch": 0.6137428422696513, + "grad_norm": 0.278359207879303, + "learning_rate": 4.610970368356659e-05, + "loss": 0.1644, + "step": 1179 + }, + { + "epoch": 0.614263404476835, + "grad_norm": 0.26805244317170895, + "learning_rate": 4.610217328689522e-05, + "loss": 0.1681, + "step": 1180 + }, + { + "epoch": 0.6147839666840187, + "grad_norm": 0.28205230211004056, + "learning_rate": 4.609463622517551e-05, + "loss": 0.1669, + "step": 1181 + }, + { + "epoch": 0.6153045288912025, + "grad_norm": 0.269236517123811, + "learning_rate": 4.608709250078803e-05, + "loss": 0.1619, + "step": 1182 + }, + { + "epoch": 0.6158250910983862, + "grad_norm": 0.29722253174417096, + "learning_rate": 4.607954211611543e-05, + "loss": 0.1677, + "step": 1183 + }, + { + "epoch": 0.61634565330557, + "grad_norm": 0.2726195453326547, + "learning_rate": 4.6071985073542464e-05, + "loss": 0.1656, + "step": 1184 + }, + { + "epoch": 0.6168662155127538, + "grad_norm": 0.2945066289976058, + "learning_rate": 4.606442137545602e-05, + "loss": 0.1577, + "step": 1185 + }, + { + "epoch": 0.6173867777199376, + "grad_norm": 0.3225752803253219, + "learning_rate": 4.605685102424504e-05, + "loss": 0.1711, + "step": 1186 + }, + { + "epoch": 0.6179073399271213, + "grad_norm": 0.3155533738497284, + "learning_rate": 4.6049274022300604e-05, + "loss": 0.1711, + "step": 1187 + }, + { + "epoch": 0.6184279021343051, + "grad_norm": 0.29404463902874495, + "learning_rate": 4.604169037201589e-05, + "loss": 0.1644, + "step": 1188 + }, + { + "epoch": 0.6189484643414888, + "grad_norm": 0.28530333471749375, + "learning_rate": 4.603410007578616e-05, + "loss": 0.1611, + "step": 1189 + }, + { + "epoch": 0.6194690265486725, + "grad_norm": 0.2841758773756127, + "learning_rate": 4.602650313600878e-05, + "loss": 0.1655, + "step": 1190 + }, + { + "epoch": 0.6199895887558563, + "grad_norm": 0.29874212816673795, + "learning_rate": 4.601889955508322e-05, + "loss": 0.1678, + "step": 1191 + }, + { + "epoch": 0.6205101509630401, + "grad_norm": 0.2790940324283885, + "learning_rate": 4.601128933541105e-05, + "loss": 0.1669, + "step": 1192 + }, + { + "epoch": 0.6210307131702238, + "grad_norm": 0.2819144485571506, + "learning_rate": 4.600367247939591e-05, + "loss": 0.1594, + "step": 1193 + }, + { + "epoch": 0.6215512753774076, + "grad_norm": 0.2789287724673938, + "learning_rate": 4.5996048989443597e-05, + "loss": 0.163, + "step": 1194 + }, + { + "epoch": 0.6220718375845914, + "grad_norm": 0.2862188445057931, + "learning_rate": 4.598841886796192e-05, + "loss": 0.1752, + "step": 1195 + }, + { + "epoch": 0.6225923997917752, + "grad_norm": 0.3007115626630004, + "learning_rate": 4.598078211736086e-05, + "loss": 0.1795, + "step": 1196 + }, + { + "epoch": 0.6231129619989588, + "grad_norm": 0.2655000681844505, + "learning_rate": 4.5973138740052455e-05, + "loss": 0.1652, + "step": 1197 + }, + { + "epoch": 0.6236335242061426, + "grad_norm": 0.3051733991758238, + "learning_rate": 4.596548873845081e-05, + "loss": 0.1638, + "step": 1198 + }, + { + "epoch": 0.6241540864133264, + "grad_norm": 0.2928160917194925, + "learning_rate": 4.595783211497219e-05, + "loss": 0.169, + "step": 1199 + }, + { + "epoch": 0.6246746486205101, + "grad_norm": 0.29112623504997737, + "learning_rate": 4.5950168872034885e-05, + "loss": 0.163, + "step": 1200 + }, + { + "epoch": 0.6251952108276939, + "grad_norm": 0.2675061785401877, + "learning_rate": 4.5942499012059316e-05, + "loss": 0.1657, + "step": 1201 + }, + { + "epoch": 0.6257157730348777, + "grad_norm": 0.3040715761688371, + "learning_rate": 4.593482253746798e-05, + "loss": 0.1623, + "step": 1202 + }, + { + "epoch": 0.6262363352420615, + "grad_norm": 0.29614925372633366, + "learning_rate": 4.592713945068545e-05, + "loss": 0.1643, + "step": 1203 + }, + { + "epoch": 0.6267568974492452, + "grad_norm": 0.2638537701916355, + "learning_rate": 4.591944975413843e-05, + "loss": 0.1618, + "step": 1204 + }, + { + "epoch": 0.6272774596564289, + "grad_norm": 0.3005257646399674, + "learning_rate": 4.5911753450255665e-05, + "loss": 0.1598, + "step": 1205 + }, + { + "epoch": 0.6277980218636127, + "grad_norm": 0.29494144677802137, + "learning_rate": 4.590405054146802e-05, + "loss": 0.1718, + "step": 1206 + }, + { + "epoch": 0.6283185840707964, + "grad_norm": 0.29636762114195014, + "learning_rate": 4.5896341030208415e-05, + "loss": 0.1637, + "step": 1207 + }, + { + "epoch": 0.6288391462779802, + "grad_norm": 0.2934192989066668, + "learning_rate": 4.5888624918911884e-05, + "loss": 0.158, + "step": 1208 + }, + { + "epoch": 0.629359708485164, + "grad_norm": 0.3124230206823834, + "learning_rate": 4.588090221001553e-05, + "loss": 0.1672, + "step": 1209 + }, + { + "epoch": 0.6298802706923478, + "grad_norm": 0.30615588299221824, + "learning_rate": 4.587317290595855e-05, + "loss": 0.1629, + "step": 1210 + }, + { + "epoch": 0.6304008328995315, + "grad_norm": 0.2988825912755345, + "learning_rate": 4.586543700918221e-05, + "loss": 0.1723, + "step": 1211 + }, + { + "epoch": 0.6309213951067153, + "grad_norm": 0.29738980460994413, + "learning_rate": 4.5857694522129855e-05, + "loss": 0.1573, + "step": 1212 + }, + { + "epoch": 0.631441957313899, + "grad_norm": 0.2797032464087098, + "learning_rate": 4.584994544724695e-05, + "loss": 0.1678, + "step": 1213 + }, + { + "epoch": 0.6319625195210827, + "grad_norm": 0.26491065606515474, + "learning_rate": 4.584218978698099e-05, + "loss": 0.1626, + "step": 1214 + }, + { + "epoch": 0.6324830817282665, + "grad_norm": 0.2652322894153915, + "learning_rate": 4.5834427543781596e-05, + "loss": 0.1666, + "step": 1215 + }, + { + "epoch": 0.6330036439354503, + "grad_norm": 0.264807354619328, + "learning_rate": 4.582665872010043e-05, + "loss": 0.1641, + "step": 1216 + }, + { + "epoch": 0.633524206142634, + "grad_norm": 0.2738204236926736, + "learning_rate": 4.581888331839125e-05, + "loss": 0.1703, + "step": 1217 + }, + { + "epoch": 0.6340447683498178, + "grad_norm": 0.299863744726911, + "learning_rate": 4.581110134110989e-05, + "loss": 0.1667, + "step": 1218 + }, + { + "epoch": 0.6345653305570016, + "grad_norm": 0.2956315906472553, + "learning_rate": 4.580331279071426e-05, + "loss": 0.1685, + "step": 1219 + }, + { + "epoch": 0.6350858927641854, + "grad_norm": 0.28753743007877214, + "learning_rate": 4.579551766966435e-05, + "loss": 0.1608, + "step": 1220 + }, + { + "epoch": 0.635606454971369, + "grad_norm": 0.28657496675188177, + "learning_rate": 4.578771598042221e-05, + "loss": 0.1692, + "step": 1221 + }, + { + "epoch": 0.6361270171785528, + "grad_norm": 0.27201718233205613, + "learning_rate": 4.5779907725452e-05, + "loss": 0.1592, + "step": 1222 + }, + { + "epoch": 0.6366475793857366, + "grad_norm": 0.26136498905384203, + "learning_rate": 4.577209290721991e-05, + "loss": 0.1625, + "step": 1223 + }, + { + "epoch": 0.6371681415929203, + "grad_norm": 0.2898833505811805, + "learning_rate": 4.576427152819423e-05, + "loss": 0.1699, + "step": 1224 + }, + { + "epoch": 0.6376887038001041, + "grad_norm": 0.2668251937477724, + "learning_rate": 4.575644359084532e-05, + "loss": 0.1591, + "step": 1225 + }, + { + "epoch": 0.6382092660072879, + "grad_norm": 0.3003314921218274, + "learning_rate": 4.5748609097645595e-05, + "loss": 0.1673, + "step": 1226 + }, + { + "epoch": 0.6387298282144717, + "grad_norm": 0.2684194237776865, + "learning_rate": 4.574076805106956e-05, + "loss": 0.1587, + "step": 1227 + }, + { + "epoch": 0.6392503904216554, + "grad_norm": 0.2845756731306716, + "learning_rate": 4.5732920453593785e-05, + "loss": 0.1642, + "step": 1228 + }, + { + "epoch": 0.6397709526288391, + "grad_norm": 0.27739696831325483, + "learning_rate": 4.572506630769691e-05, + "loss": 0.1623, + "step": 1229 + }, + { + "epoch": 0.6402915148360229, + "grad_norm": 0.27961191274533287, + "learning_rate": 4.571720561585963e-05, + "loss": 0.155, + "step": 1230 + }, + { + "epoch": 0.6408120770432066, + "grad_norm": 0.2807787449921395, + "learning_rate": 4.570933838056472e-05, + "loss": 0.1569, + "step": 1231 + }, + { + "epoch": 0.6413326392503904, + "grad_norm": 0.27535967325114985, + "learning_rate": 4.570146460429701e-05, + "loss": 0.1534, + "step": 1232 + }, + { + "epoch": 0.6418532014575742, + "grad_norm": 0.31053243876551606, + "learning_rate": 4.569358428954343e-05, + "loss": 0.1687, + "step": 1233 + }, + { + "epoch": 0.642373763664758, + "grad_norm": 0.2735613339054094, + "learning_rate": 4.568569743879293e-05, + "loss": 0.1619, + "step": 1234 + }, + { + "epoch": 0.6428943258719417, + "grad_norm": 0.31025137855515883, + "learning_rate": 4.5677804054536544e-05, + "loss": 0.1669, + "step": 1235 + }, + { + "epoch": 0.6434148880791255, + "grad_norm": 0.3108370328370114, + "learning_rate": 4.566990413926738e-05, + "loss": 0.1616, + "step": 1236 + }, + { + "epoch": 0.6439354502863092, + "grad_norm": 0.3339548204834231, + "learning_rate": 4.5661997695480595e-05, + "loss": 0.1616, + "step": 1237 + }, + { + "epoch": 0.6444560124934929, + "grad_norm": 0.3056231219128003, + "learning_rate": 4.5654084725673404e-05, + "loss": 0.1654, + "step": 1238 + }, + { + "epoch": 0.6449765747006767, + "grad_norm": 0.29210355340648536, + "learning_rate": 4.564616523234511e-05, + "loss": 0.1686, + "step": 1239 + }, + { + "epoch": 0.6454971369078605, + "grad_norm": 0.29213232167646325, + "learning_rate": 4.5638239217997034e-05, + "loss": 0.1621, + "step": 1240 + }, + { + "epoch": 0.6460176991150443, + "grad_norm": 0.28705338754520415, + "learning_rate": 4.56303066851326e-05, + "loss": 0.166, + "step": 1241 + }, + { + "epoch": 0.646538261322228, + "grad_norm": 0.3057168972516095, + "learning_rate": 4.5622367636257264e-05, + "loss": 0.161, + "step": 1242 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 0.2716038210905968, + "learning_rate": 4.561442207387854e-05, + "loss": 0.1628, + "step": 1243 + }, + { + "epoch": 0.6475793857365956, + "grad_norm": 0.3359502419591334, + "learning_rate": 4.560647000050602e-05, + "loss": 0.1641, + "step": 1244 + }, + { + "epoch": 0.6480999479437792, + "grad_norm": 0.2677924387144485, + "learning_rate": 4.5598511418651324e-05, + "loss": 0.1692, + "step": 1245 + }, + { + "epoch": 0.648620510150963, + "grad_norm": 0.2687659769423282, + "learning_rate": 4.5590546330828154e-05, + "loss": 0.1652, + "step": 1246 + }, + { + "epoch": 0.6491410723581468, + "grad_norm": 0.27911537151113053, + "learning_rate": 4.5582574739552254e-05, + "loss": 0.1681, + "step": 1247 + }, + { + "epoch": 0.6496616345653305, + "grad_norm": 0.258050298522539, + "learning_rate": 4.557459664734141e-05, + "loss": 0.1632, + "step": 1248 + }, + { + "epoch": 0.6501821967725143, + "grad_norm": 0.2714387670557213, + "learning_rate": 4.5566612056715494e-05, + "loss": 0.1602, + "step": 1249 + }, + { + "epoch": 0.6507027589796981, + "grad_norm": 0.3007289110925111, + "learning_rate": 4.5558620970196406e-05, + "loss": 0.1603, + "step": 1250 + }, + { + "epoch": 0.6512233211868819, + "grad_norm": 0.2751562123537327, + "learning_rate": 4.5550623390308086e-05, + "loss": 0.1637, + "step": 1251 + }, + { + "epoch": 0.6517438833940656, + "grad_norm": 0.26206723907101465, + "learning_rate": 4.554261931957657e-05, + "loss": 0.1602, + "step": 1252 + }, + { + "epoch": 0.6522644456012493, + "grad_norm": 0.29168906928554644, + "learning_rate": 4.5534608760529895e-05, + "loss": 0.1655, + "step": 1253 + }, + { + "epoch": 0.6527850078084331, + "grad_norm": 0.272148724671491, + "learning_rate": 4.552659171569817e-05, + "loss": 0.1567, + "step": 1254 + }, + { + "epoch": 0.6533055700156168, + "grad_norm": 0.2918024931656026, + "learning_rate": 4.551856818761357e-05, + "loss": 0.1656, + "step": 1255 + }, + { + "epoch": 0.6538261322228006, + "grad_norm": 0.2897642359961369, + "learning_rate": 4.551053817881028e-05, + "loss": 0.1635, + "step": 1256 + }, + { + "epoch": 0.6543466944299844, + "grad_norm": 0.2675950487049928, + "learning_rate": 4.550250169182455e-05, + "loss": 0.1571, + "step": 1257 + }, + { + "epoch": 0.6548672566371682, + "grad_norm": 0.28801786916053895, + "learning_rate": 4.549445872919468e-05, + "loss": 0.1616, + "step": 1258 + }, + { + "epoch": 0.6553878188443519, + "grad_norm": 0.2602745738457694, + "learning_rate": 4.548640929346102e-05, + "loss": 0.1629, + "step": 1259 + }, + { + "epoch": 0.6559083810515357, + "grad_norm": 0.2876893050906943, + "learning_rate": 4.5478353387165946e-05, + "loss": 0.1575, + "step": 1260 + }, + { + "epoch": 0.6564289432587194, + "grad_norm": 0.2881481837684052, + "learning_rate": 4.547029101285389e-05, + "loss": 0.1625, + "step": 1261 + }, + { + "epoch": 0.6569495054659031, + "grad_norm": 0.2612428197323997, + "learning_rate": 4.5462222173071335e-05, + "loss": 0.1652, + "step": 1262 + }, + { + "epoch": 0.6574700676730869, + "grad_norm": 0.2701617714724092, + "learning_rate": 4.5454146870366775e-05, + "loss": 0.1652, + "step": 1263 + }, + { + "epoch": 0.6579906298802707, + "grad_norm": 0.28509359219297975, + "learning_rate": 4.5446065107290786e-05, + "loss": 0.164, + "step": 1264 + }, + { + "epoch": 0.6585111920874545, + "grad_norm": 0.26256618912001867, + "learning_rate": 4.543797688639596e-05, + "loss": 0.1554, + "step": 1265 + }, + { + "epoch": 0.6590317542946382, + "grad_norm": 0.27155835804114575, + "learning_rate": 4.5429882210236926e-05, + "loss": 0.1613, + "step": 1266 + }, + { + "epoch": 0.659552316501822, + "grad_norm": 0.2918933060230762, + "learning_rate": 4.542178108137038e-05, + "loss": 0.1656, + "step": 1267 + }, + { + "epoch": 0.6600728787090058, + "grad_norm": 0.26850912684301664, + "learning_rate": 4.5413673502355e-05, + "loss": 0.1658, + "step": 1268 + }, + { + "epoch": 0.6605934409161894, + "grad_norm": 0.2555610557403195, + "learning_rate": 4.540555947575157e-05, + "loss": 0.1555, + "step": 1269 + }, + { + "epoch": 0.6611140031233732, + "grad_norm": 0.492044807765271, + "learning_rate": 4.539743900412287e-05, + "loss": 0.1668, + "step": 1270 + }, + { + "epoch": 0.661634565330557, + "grad_norm": 0.2749406988327086, + "learning_rate": 4.53893120900337e-05, + "loss": 0.1571, + "step": 1271 + }, + { + "epoch": 0.6621551275377408, + "grad_norm": 0.2755114222031435, + "learning_rate": 4.538117873605094e-05, + "loss": 0.1604, + "step": 1272 + }, + { + "epoch": 0.6626756897449245, + "grad_norm": 0.27347742130970076, + "learning_rate": 4.537303894474349e-05, + "loss": 0.1628, + "step": 1273 + }, + { + "epoch": 0.6631962519521083, + "grad_norm": 0.2859200379145871, + "learning_rate": 4.536489271868225e-05, + "loss": 0.1663, + "step": 1274 + }, + { + "epoch": 0.6637168141592921, + "grad_norm": 0.2855335291443677, + "learning_rate": 4.5356740060440194e-05, + "loss": 0.1699, + "step": 1275 + }, + { + "epoch": 0.6642373763664758, + "grad_norm": 0.26154237753790416, + "learning_rate": 4.53485809725923e-05, + "loss": 0.159, + "step": 1276 + }, + { + "epoch": 0.6647579385736595, + "grad_norm": 0.2993939894006488, + "learning_rate": 4.53404154577156e-05, + "loss": 0.166, + "step": 1277 + }, + { + "epoch": 0.6652785007808433, + "grad_norm": 0.2747912605662342, + "learning_rate": 4.533224351838914e-05, + "loss": 0.1515, + "step": 1278 + }, + { + "epoch": 0.665799062988027, + "grad_norm": 0.2736057262584086, + "learning_rate": 4.532406515719399e-05, + "loss": 0.1642, + "step": 1279 + }, + { + "epoch": 0.6663196251952108, + "grad_norm": 0.2616489484367182, + "learning_rate": 4.531588037671326e-05, + "loss": 0.161, + "step": 1280 + }, + { + "epoch": 0.6668401874023946, + "grad_norm": 0.26322733292018735, + "learning_rate": 4.5307689179532085e-05, + "loss": 0.1574, + "step": 1281 + }, + { + "epoch": 0.6673607496095784, + "grad_norm": 0.2522456597339631, + "learning_rate": 4.529949156823764e-05, + "loss": 0.1615, + "step": 1282 + }, + { + "epoch": 0.6678813118167621, + "grad_norm": 0.2588976061704534, + "learning_rate": 4.529128754541909e-05, + "loss": 0.155, + "step": 1283 + }, + { + "epoch": 0.6684018740239459, + "grad_norm": 0.27256034050552175, + "learning_rate": 4.528307711366767e-05, + "loss": 0.171, + "step": 1284 + }, + { + "epoch": 0.6689224362311296, + "grad_norm": 0.2538123767756482, + "learning_rate": 4.527486027557659e-05, + "loss": 0.1617, + "step": 1285 + }, + { + "epoch": 0.6694429984383133, + "grad_norm": 0.27020857968681916, + "learning_rate": 4.526663703374113e-05, + "loss": 0.1648, + "step": 1286 + }, + { + "epoch": 0.6699635606454971, + "grad_norm": 0.2772645799194081, + "learning_rate": 4.525840739075857e-05, + "loss": 0.1679, + "step": 1287 + }, + { + "epoch": 0.6704841228526809, + "grad_norm": 0.2842658573717009, + "learning_rate": 4.525017134922821e-05, + "loss": 0.1704, + "step": 1288 + }, + { + "epoch": 0.6710046850598647, + "grad_norm": 0.2695068859454936, + "learning_rate": 4.524192891175138e-05, + "loss": 0.1685, + "step": 1289 + }, + { + "epoch": 0.6715252472670484, + "grad_norm": 0.2563249875383298, + "learning_rate": 4.5233680080931415e-05, + "loss": 0.1575, + "step": 1290 + }, + { + "epoch": 0.6720458094742322, + "grad_norm": 0.25282111526223566, + "learning_rate": 4.522542485937369e-05, + "loss": 0.1581, + "step": 1291 + }, + { + "epoch": 0.672566371681416, + "grad_norm": 0.27360763527857895, + "learning_rate": 4.521716324968558e-05, + "loss": 0.1611, + "step": 1292 + }, + { + "epoch": 0.6730869338885996, + "grad_norm": 0.2635465269224253, + "learning_rate": 4.520889525447649e-05, + "loss": 0.1613, + "step": 1293 + }, + { + "epoch": 0.6736074960957834, + "grad_norm": 0.2826648530069749, + "learning_rate": 4.520062087635784e-05, + "loss": 0.1597, + "step": 1294 + }, + { + "epoch": 0.6741280583029672, + "grad_norm": 0.27562512183656485, + "learning_rate": 4.5192340117943063e-05, + "loss": 0.1606, + "step": 1295 + }, + { + "epoch": 0.674648620510151, + "grad_norm": 0.25422173470452253, + "learning_rate": 4.51840529818476e-05, + "loss": 0.1591, + "step": 1296 + }, + { + "epoch": 0.6751691827173347, + "grad_norm": 0.28058709543239363, + "learning_rate": 4.517575947068893e-05, + "loss": 0.1595, + "step": 1297 + }, + { + "epoch": 0.6756897449245185, + "grad_norm": 0.26381435173704143, + "learning_rate": 4.516745958708652e-05, + "loss": 0.1613, + "step": 1298 + }, + { + "epoch": 0.6762103071317023, + "grad_norm": 0.28533524054694864, + "learning_rate": 4.5159153333661854e-05, + "loss": 0.1611, + "step": 1299 + }, + { + "epoch": 0.676730869338886, + "grad_norm": 0.26541389475587734, + "learning_rate": 4.515084071303843e-05, + "loss": 0.1638, + "step": 1300 + }, + { + "epoch": 0.6772514315460697, + "grad_norm": 0.2695669437026599, + "learning_rate": 4.514252172784178e-05, + "loss": 0.1654, + "step": 1301 + }, + { + "epoch": 0.6777719937532535, + "grad_norm": 0.252500452369742, + "learning_rate": 4.513419638069942e-05, + "loss": 0.1612, + "step": 1302 + }, + { + "epoch": 0.6782925559604372, + "grad_norm": 0.2593569283907477, + "learning_rate": 4.512586467424087e-05, + "loss": 0.1573, + "step": 1303 + }, + { + "epoch": 0.678813118167621, + "grad_norm": 0.2783028111560752, + "learning_rate": 4.511752661109768e-05, + "loss": 0.1734, + "step": 1304 + }, + { + "epoch": 0.6793336803748048, + "grad_norm": 0.26185446584580446, + "learning_rate": 4.51091821939034e-05, + "loss": 0.1634, + "step": 1305 + }, + { + "epoch": 0.6798542425819886, + "grad_norm": 0.2713826383452847, + "learning_rate": 4.510083142529359e-05, + "loss": 0.1627, + "step": 1306 + }, + { + "epoch": 0.6803748047891723, + "grad_norm": 0.31074372566311925, + "learning_rate": 4.5092474307905785e-05, + "loss": 0.1587, + "step": 1307 + }, + { + "epoch": 0.6808953669963561, + "grad_norm": 0.2487669262126004, + "learning_rate": 4.5084110844379584e-05, + "loss": 0.1637, + "step": 1308 + }, + { + "epoch": 0.6814159292035398, + "grad_norm": 0.2869857982196813, + "learning_rate": 4.507574103735654e-05, + "loss": 0.1698, + "step": 1309 + }, + { + "epoch": 0.6819364914107235, + "grad_norm": 0.2790644846811937, + "learning_rate": 4.506736488948024e-05, + "loss": 0.1653, + "step": 1310 + }, + { + "epoch": 0.6824570536179073, + "grad_norm": 0.27483161848797866, + "learning_rate": 4.5058982403396244e-05, + "loss": 0.1572, + "step": 1311 + }, + { + "epoch": 0.6829776158250911, + "grad_norm": 0.26249570703461655, + "learning_rate": 4.505059358175214e-05, + "loss": 0.1545, + "step": 1312 + }, + { + "epoch": 0.6834981780322749, + "grad_norm": 0.27003457349130416, + "learning_rate": 4.504219842719751e-05, + "loss": 0.1548, + "step": 1313 + }, + { + "epoch": 0.6840187402394586, + "grad_norm": 0.27502243535142196, + "learning_rate": 4.503379694238394e-05, + "loss": 0.1626, + "step": 1314 + }, + { + "epoch": 0.6845393024466424, + "grad_norm": 0.3145544670289913, + "learning_rate": 4.502538912996499e-05, + "loss": 0.1674, + "step": 1315 + }, + { + "epoch": 0.6850598646538262, + "grad_norm": 0.2662239312987116, + "learning_rate": 4.501697499259626e-05, + "loss": 0.1597, + "step": 1316 + }, + { + "epoch": 0.6855804268610098, + "grad_norm": 0.2745260554615778, + "learning_rate": 4.500855453293532e-05, + "loss": 0.163, + "step": 1317 + }, + { + "epoch": 0.6861009890681936, + "grad_norm": 0.2657154339820778, + "learning_rate": 4.500012775364173e-05, + "loss": 0.1635, + "step": 1318 + }, + { + "epoch": 0.6866215512753774, + "grad_norm": 0.2832483197266812, + "learning_rate": 4.499169465737708e-05, + "loss": 0.169, + "step": 1319 + }, + { + "epoch": 0.6871421134825612, + "grad_norm": 0.2893358983294465, + "learning_rate": 4.498325524680492e-05, + "loss": 0.1657, + "step": 1320 + }, + { + "epoch": 0.6876626756897449, + "grad_norm": 0.2716434135511077, + "learning_rate": 4.4974809524590814e-05, + "loss": 0.1682, + "step": 1321 + }, + { + "epoch": 0.6881832378969287, + "grad_norm": 0.31689018683469905, + "learning_rate": 4.496635749340231e-05, + "loss": 0.1634, + "step": 1322 + }, + { + "epoch": 0.6887038001041125, + "grad_norm": 0.27519106860257103, + "learning_rate": 4.495789915590895e-05, + "loss": 0.1571, + "step": 1323 + }, + { + "epoch": 0.6892243623112962, + "grad_norm": 0.3070166511158656, + "learning_rate": 4.494943451478229e-05, + "loss": 0.1625, + "step": 1324 + }, + { + "epoch": 0.6897449245184799, + "grad_norm": 0.26270345276600776, + "learning_rate": 4.4940963572695836e-05, + "loss": 0.1594, + "step": 1325 + }, + { + "epoch": 0.6902654867256637, + "grad_norm": 0.2885003932017069, + "learning_rate": 4.4932486332325115e-05, + "loss": 0.1595, + "step": 1326 + }, + { + "epoch": 0.6907860489328475, + "grad_norm": 0.2571626418463362, + "learning_rate": 4.492400279634763e-05, + "loss": 0.1616, + "step": 1327 + }, + { + "epoch": 0.6913066111400312, + "grad_norm": 0.27955455405062174, + "learning_rate": 4.491551296744288e-05, + "loss": 0.172, + "step": 1328 + }, + { + "epoch": 0.691827173347215, + "grad_norm": 0.2691060387491177, + "learning_rate": 4.490701684829235e-05, + "loss": 0.1613, + "step": 1329 + }, + { + "epoch": 0.6923477355543988, + "grad_norm": 0.2739293243057916, + "learning_rate": 4.48985144415795e-05, + "loss": 0.1665, + "step": 1330 + }, + { + "epoch": 0.6928682977615825, + "grad_norm": 0.29877047824582253, + "learning_rate": 4.489000574998979e-05, + "loss": 0.1644, + "step": 1331 + }, + { + "epoch": 0.6933888599687663, + "grad_norm": 0.2707072638145847, + "learning_rate": 4.488149077621067e-05, + "loss": 0.1614, + "step": 1332 + }, + { + "epoch": 0.69390942217595, + "grad_norm": 0.2782882849164749, + "learning_rate": 4.4872969522931556e-05, + "loss": 0.1646, + "step": 1333 + }, + { + "epoch": 0.6944299843831337, + "grad_norm": 0.27327557400755875, + "learning_rate": 4.486444199284386e-05, + "loss": 0.1659, + "step": 1334 + }, + { + "epoch": 0.6949505465903175, + "grad_norm": 0.2806747248809842, + "learning_rate": 4.4855908188640973e-05, + "loss": 0.1622, + "step": 1335 + }, + { + "epoch": 0.6954711087975013, + "grad_norm": 0.2690587883243696, + "learning_rate": 4.484736811301826e-05, + "loss": 0.1573, + "step": 1336 + }, + { + "epoch": 0.6959916710046851, + "grad_norm": 0.27243238739702624, + "learning_rate": 4.483882176867308e-05, + "loss": 0.1686, + "step": 1337 + }, + { + "epoch": 0.6965122332118688, + "grad_norm": 0.28204045583043863, + "learning_rate": 4.483026915830477e-05, + "loss": 0.163, + "step": 1338 + }, + { + "epoch": 0.6970327954190526, + "grad_norm": 0.2772739553002315, + "learning_rate": 4.4821710284614636e-05, + "loss": 0.1624, + "step": 1339 + }, + { + "epoch": 0.6975533576262364, + "grad_norm": 0.29231455388507804, + "learning_rate": 4.4813145150305965e-05, + "loss": 0.1616, + "step": 1340 + }, + { + "epoch": 0.69807391983342, + "grad_norm": 0.2640575107760888, + "learning_rate": 4.4804573758084046e-05, + "loss": 0.1585, + "step": 1341 + }, + { + "epoch": 0.6985944820406038, + "grad_norm": 0.3071872152649576, + "learning_rate": 4.4795996110656105e-05, + "loss": 0.1673, + "step": 1342 + }, + { + "epoch": 0.6991150442477876, + "grad_norm": 0.2749143884079691, + "learning_rate": 4.478741221073136e-05, + "loss": 0.1609, + "step": 1343 + }, + { + "epoch": 0.6996356064549714, + "grad_norm": 0.2754640339891129, + "learning_rate": 4.477882206102101e-05, + "loss": 0.1606, + "step": 1344 + }, + { + "epoch": 0.7001561686621551, + "grad_norm": 0.2639768649579088, + "learning_rate": 4.477022566423823e-05, + "loss": 0.162, + "step": 1345 + }, + { + "epoch": 0.7006767308693389, + "grad_norm": 0.30604022298071126, + "learning_rate": 4.476162302309815e-05, + "loss": 0.1628, + "step": 1346 + }, + { + "epoch": 0.7011972930765227, + "grad_norm": 0.26151484803979685, + "learning_rate": 4.475301414031791e-05, + "loss": 0.1561, + "step": 1347 + }, + { + "epoch": 0.7017178552837064, + "grad_norm": 0.3069542520801269, + "learning_rate": 4.4744399018616566e-05, + "loss": 0.1717, + "step": 1348 + }, + { + "epoch": 0.7022384174908901, + "grad_norm": 0.26118236966791997, + "learning_rate": 4.4735777660715186e-05, + "loss": 0.1594, + "step": 1349 + }, + { + "epoch": 0.7027589796980739, + "grad_norm": 0.2821690722820644, + "learning_rate": 4.472715006933681e-05, + "loss": 0.17, + "step": 1350 + }, + { + "epoch": 0.7032795419052577, + "grad_norm": 0.28425332393518327, + "learning_rate": 4.47185162472064e-05, + "loss": 0.1656, + "step": 1351 + }, + { + "epoch": 0.7038001041124414, + "grad_norm": 0.2895241985407839, + "learning_rate": 4.470987619705095e-05, + "loss": 0.172, + "step": 1352 + }, + { + "epoch": 0.7043206663196252, + "grad_norm": 0.2907907968206427, + "learning_rate": 4.470122992159938e-05, + "loss": 0.1577, + "step": 1353 + }, + { + "epoch": 0.704841228526809, + "grad_norm": 0.26360172964644457, + "learning_rate": 4.469257742358258e-05, + "loss": 0.1641, + "step": 1354 + }, + { + "epoch": 0.7053617907339927, + "grad_norm": 0.30825700427174524, + "learning_rate": 4.468391870573342e-05, + "loss": 0.1613, + "step": 1355 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.2832274278276974, + "learning_rate": 4.467525377078672e-05, + "loss": 0.1616, + "step": 1356 + }, + { + "epoch": 0.7064029151483602, + "grad_norm": 0.2918275203692509, + "learning_rate": 4.466658262147927e-05, + "loss": 0.1576, + "step": 1357 + }, + { + "epoch": 0.706923477355544, + "grad_norm": 0.2623509716876954, + "learning_rate": 4.465790526054983e-05, + "loss": 0.1548, + "step": 1358 + }, + { + "epoch": 0.7074440395627277, + "grad_norm": 0.2830506524726103, + "learning_rate": 4.4649221690739095e-05, + "loss": 0.1577, + "step": 1359 + }, + { + "epoch": 0.7079646017699115, + "grad_norm": 0.2668706458778001, + "learning_rate": 4.464053191478976e-05, + "loss": 0.1693, + "step": 1360 + }, + { + "epoch": 0.7084851639770953, + "grad_norm": 0.2529153576757616, + "learning_rate": 4.463183593544647e-05, + "loss": 0.1637, + "step": 1361 + }, + { + "epoch": 0.709005726184279, + "grad_norm": 0.2624787559358483, + "learning_rate": 4.462313375545579e-05, + "loss": 0.1612, + "step": 1362 + }, + { + "epoch": 0.7095262883914628, + "grad_norm": 0.2751465060223429, + "learning_rate": 4.461442537756629e-05, + "loss": 0.1549, + "step": 1363 + }, + { + "epoch": 0.7100468505986466, + "grad_norm": 0.24739529719217984, + "learning_rate": 4.4605710804528474e-05, + "loss": 0.162, + "step": 1364 + }, + { + "epoch": 0.7105674128058302, + "grad_norm": 0.26495855736115675, + "learning_rate": 4.459699003909482e-05, + "loss": 0.1659, + "step": 1365 + }, + { + "epoch": 0.711087975013014, + "grad_norm": 0.27568214342295516, + "learning_rate": 4.4588263084019746e-05, + "loss": 0.1635, + "step": 1366 + }, + { + "epoch": 0.7116085372201978, + "grad_norm": 0.25595986364541223, + "learning_rate": 4.457952994205963e-05, + "loss": 0.1569, + "step": 1367 + }, + { + "epoch": 0.7121290994273816, + "grad_norm": 0.2731708239567679, + "learning_rate": 4.457079061597281e-05, + "loss": 0.1603, + "step": 1368 + }, + { + "epoch": 0.7126496616345653, + "grad_norm": 0.2587032809571712, + "learning_rate": 4.4562045108519565e-05, + "loss": 0.1609, + "step": 1369 + }, + { + "epoch": 0.7131702238417491, + "grad_norm": 0.26523949719681184, + "learning_rate": 4.4553293422462134e-05, + "loss": 0.1578, + "step": 1370 + }, + { + "epoch": 0.7136907860489329, + "grad_norm": 0.2594662248421297, + "learning_rate": 4.454453556056471e-05, + "loss": 0.1585, + "step": 1371 + }, + { + "epoch": 0.7142113482561167, + "grad_norm": 0.2557915985298669, + "learning_rate": 4.4535771525593426e-05, + "loss": 0.1536, + "step": 1372 + }, + { + "epoch": 0.7147319104633003, + "grad_norm": 0.2742705642572711, + "learning_rate": 4.452700132031638e-05, + "loss": 0.1673, + "step": 1373 + }, + { + "epoch": 0.7152524726704841, + "grad_norm": 0.2806193185281088, + "learning_rate": 4.451822494750362e-05, + "loss": 0.1648, + "step": 1374 + }, + { + "epoch": 0.7157730348776679, + "grad_norm": 0.2809327056175134, + "learning_rate": 4.450944240992711e-05, + "loss": 0.1656, + "step": 1375 + }, + { + "epoch": 0.7162935970848516, + "grad_norm": 0.2759622382555981, + "learning_rate": 4.45006537103608e-05, + "loss": 0.1612, + "step": 1376 + }, + { + "epoch": 0.7168141592920354, + "grad_norm": 0.2624638331201247, + "learning_rate": 4.449185885158056e-05, + "loss": 0.1668, + "step": 1377 + }, + { + "epoch": 0.7173347214992192, + "grad_norm": 0.2689887733722017, + "learning_rate": 4.4483057836364225e-05, + "loss": 0.1608, + "step": 1378 + }, + { + "epoch": 0.717855283706403, + "grad_norm": 0.26933355871425685, + "learning_rate": 4.4474250667491567e-05, + "loss": 0.1629, + "step": 1379 + }, + { + "epoch": 0.7183758459135867, + "grad_norm": 0.2735573234930184, + "learning_rate": 4.4465437347744285e-05, + "loss": 0.1647, + "step": 1380 + }, + { + "epoch": 0.7188964081207704, + "grad_norm": 0.2680034708829736, + "learning_rate": 4.4456617879906056e-05, + "loss": 0.1678, + "step": 1381 + }, + { + "epoch": 0.7194169703279542, + "grad_norm": 0.2519063095093893, + "learning_rate": 4.444779226676246e-05, + "loss": 0.1592, + "step": 1382 + }, + { + "epoch": 0.7199375325351379, + "grad_norm": 0.2602912105474266, + "learning_rate": 4.4438960511101046e-05, + "loss": 0.1575, + "step": 1383 + }, + { + "epoch": 0.7204580947423217, + "grad_norm": 0.2726477230959709, + "learning_rate": 4.443012261571129e-05, + "loss": 0.1604, + "step": 1384 + }, + { + "epoch": 0.7209786569495055, + "grad_norm": 0.27905940371356286, + "learning_rate": 4.442127858338462e-05, + "loss": 0.1654, + "step": 1385 + }, + { + "epoch": 0.7214992191566892, + "grad_norm": 0.2583560743210376, + "learning_rate": 4.441242841691438e-05, + "loss": 0.1528, + "step": 1386 + }, + { + "epoch": 0.722019781363873, + "grad_norm": 0.27099777286648374, + "learning_rate": 4.440357211909586e-05, + "loss": 0.1667, + "step": 1387 + }, + { + "epoch": 0.7225403435710568, + "grad_norm": 0.2894163802959934, + "learning_rate": 4.439470969272631e-05, + "loss": 0.1645, + "step": 1388 + }, + { + "epoch": 0.7230609057782404, + "grad_norm": 0.2544924261078653, + "learning_rate": 4.4385841140604884e-05, + "loss": 0.1585, + "step": 1389 + }, + { + "epoch": 0.7235814679854242, + "grad_norm": 0.27636086548700217, + "learning_rate": 4.437696646553269e-05, + "loss": 0.1523, + "step": 1390 + }, + { + "epoch": 0.724102030192608, + "grad_norm": 0.2713258603157374, + "learning_rate": 4.4368085670312755e-05, + "loss": 0.1645, + "step": 1391 + }, + { + "epoch": 0.7246225923997918, + "grad_norm": 0.2764758232297857, + "learning_rate": 4.435919875775005e-05, + "loss": 0.1526, + "step": 1392 + }, + { + "epoch": 0.7251431546069755, + "grad_norm": 0.26060022386382026, + "learning_rate": 4.435030573065149e-05, + "loss": 0.1524, + "step": 1393 + }, + { + "epoch": 0.7256637168141593, + "grad_norm": 0.29792250327932235, + "learning_rate": 4.434140659182588e-05, + "loss": 0.1666, + "step": 1394 + }, + { + "epoch": 0.7261842790213431, + "grad_norm": 0.26037287596745995, + "learning_rate": 4.433250134408401e-05, + "loss": 0.1586, + "step": 1395 + }, + { + "epoch": 0.7267048412285269, + "grad_norm": 0.26727896558958614, + "learning_rate": 4.4323589990238545e-05, + "loss": 0.164, + "step": 1396 + }, + { + "epoch": 0.7272254034357105, + "grad_norm": 0.2754571354466698, + "learning_rate": 4.431467253310413e-05, + "loss": 0.1659, + "step": 1397 + }, + { + "epoch": 0.7277459656428943, + "grad_norm": 0.26954157638947546, + "learning_rate": 4.4305748975497294e-05, + "loss": 0.1591, + "step": 1398 + }, + { + "epoch": 0.7282665278500781, + "grad_norm": 0.262164138500046, + "learning_rate": 4.4296819320236524e-05, + "loss": 0.1605, + "step": 1399 + }, + { + "epoch": 0.7287870900572618, + "grad_norm": 0.27456638161498215, + "learning_rate": 4.428788357014222e-05, + "loss": 0.1572, + "step": 1400 + }, + { + "epoch": 0.7293076522644456, + "grad_norm": 0.2720040860336858, + "learning_rate": 4.4278941728036696e-05, + "loss": 0.1667, + "step": 1401 + }, + { + "epoch": 0.7298282144716294, + "grad_norm": 0.2636935543695637, + "learning_rate": 4.426999379674421e-05, + "loss": 0.1678, + "step": 1402 + }, + { + "epoch": 0.7303487766788131, + "grad_norm": 0.27172824164676596, + "learning_rate": 4.426103977909094e-05, + "loss": 0.1654, + "step": 1403 + }, + { + "epoch": 0.7308693388859969, + "grad_norm": 0.2557001119750677, + "learning_rate": 4.425207967790497e-05, + "loss": 0.1598, + "step": 1404 + }, + { + "epoch": 0.7313899010931806, + "grad_norm": 0.26111748434009874, + "learning_rate": 4.4243113496016326e-05, + "loss": 0.1587, + "step": 1405 + }, + { + "epoch": 0.7319104633003644, + "grad_norm": 0.2833129354173884, + "learning_rate": 4.423414123625694e-05, + "loss": 0.162, + "step": 1406 + }, + { + "epoch": 0.7324310255075481, + "grad_norm": 0.2575834825018457, + "learning_rate": 4.4225162901460676e-05, + "loss": 0.1597, + "step": 1407 + }, + { + "epoch": 0.7329515877147319, + "grad_norm": 0.2753343459846711, + "learning_rate": 4.42161784944633e-05, + "loss": 0.17, + "step": 1408 + }, + { + "epoch": 0.7334721499219157, + "grad_norm": 0.2971924494704701, + "learning_rate": 4.420718801810252e-05, + "loss": 0.1563, + "step": 1409 + }, + { + "epoch": 0.7339927121290994, + "grad_norm": 0.2579981593383931, + "learning_rate": 4.419819147521793e-05, + "loss": 0.1586, + "step": 1410 + }, + { + "epoch": 0.7345132743362832, + "grad_norm": 0.2763781873570336, + "learning_rate": 4.418918886865108e-05, + "loss": 0.1679, + "step": 1411 + }, + { + "epoch": 0.735033836543467, + "grad_norm": 0.27888896994369716, + "learning_rate": 4.418018020124538e-05, + "loss": 0.158, + "step": 1412 + }, + { + "epoch": 0.7355543987506507, + "grad_norm": 0.27992425465308046, + "learning_rate": 4.417116547584621e-05, + "loss": 0.1582, + "step": 1413 + }, + { + "epoch": 0.7360749609578344, + "grad_norm": 0.28643991156800214, + "learning_rate": 4.4162144695300834e-05, + "loss": 0.1674, + "step": 1414 + }, + { + "epoch": 0.7365955231650182, + "grad_norm": 0.2642558460300512, + "learning_rate": 4.415311786245843e-05, + "loss": 0.1645, + "step": 1415 + }, + { + "epoch": 0.737116085372202, + "grad_norm": 0.27570896762545, + "learning_rate": 4.41440849801701e-05, + "loss": 0.1634, + "step": 1416 + }, + { + "epoch": 0.7376366475793857, + "grad_norm": 0.26823860392672444, + "learning_rate": 4.413504605128885e-05, + "loss": 0.1615, + "step": 1417 + }, + { + "epoch": 0.7381572097865695, + "grad_norm": 0.27413381020561156, + "learning_rate": 4.4126001078669574e-05, + "loss": 0.1638, + "step": 1418 + }, + { + "epoch": 0.7386777719937533, + "grad_norm": 0.2682256952523028, + "learning_rate": 4.4116950065169124e-05, + "loss": 0.1604, + "step": 1419 + }, + { + "epoch": 0.7391983342009371, + "grad_norm": 0.2934261761780093, + "learning_rate": 4.410789301364621e-05, + "loss": 0.1665, + "step": 1420 + }, + { + "epoch": 0.7397188964081207, + "grad_norm": 0.26702280332753814, + "learning_rate": 4.409882992696148e-05, + "loss": 0.1584, + "step": 1421 + }, + { + "epoch": 0.7402394586153045, + "grad_norm": 0.27945453060552594, + "learning_rate": 4.4089760807977474e-05, + "loss": 0.153, + "step": 1422 + }, + { + "epoch": 0.7407600208224883, + "grad_norm": 0.26111156671410596, + "learning_rate": 4.408068565955865e-05, + "loss": 0.1633, + "step": 1423 + }, + { + "epoch": 0.741280583029672, + "grad_norm": 0.2618834889479115, + "learning_rate": 4.407160448457135e-05, + "loss": 0.1633, + "step": 1424 + }, + { + "epoch": 0.7418011452368558, + "grad_norm": 0.2548316686323859, + "learning_rate": 4.406251728588384e-05, + "loss": 0.1611, + "step": 1425 + }, + { + "epoch": 0.7423217074440396, + "grad_norm": 0.27198720127436277, + "learning_rate": 4.405342406636627e-05, + "loss": 0.1518, + "step": 1426 + }, + { + "epoch": 0.7428422696512234, + "grad_norm": 0.25780399068453724, + "learning_rate": 4.4044324828890715e-05, + "loss": 0.1539, + "step": 1427 + }, + { + "epoch": 0.7433628318584071, + "grad_norm": 0.28708604768264273, + "learning_rate": 4.403521957633113e-05, + "loss": 0.1677, + "step": 1428 + }, + { + "epoch": 0.7438833940655908, + "grad_norm": 0.2854990852578142, + "learning_rate": 4.4026108311563394e-05, + "loss": 0.1591, + "step": 1429 + }, + { + "epoch": 0.7444039562727746, + "grad_norm": 0.2601422292209927, + "learning_rate": 4.401699103746524e-05, + "loss": 0.1628, + "step": 1430 + }, + { + "epoch": 0.7449245184799583, + "grad_norm": 0.26792776342524677, + "learning_rate": 4.4007867756916345e-05, + "loss": 0.1576, + "step": 1431 + }, + { + "epoch": 0.7454450806871421, + "grad_norm": 0.2553899632837398, + "learning_rate": 4.399873847279827e-05, + "loss": 0.1689, + "step": 1432 + }, + { + "epoch": 0.7459656428943259, + "grad_norm": 0.2631912627429382, + "learning_rate": 4.3989603187994454e-05, + "loss": 0.1544, + "step": 1433 + }, + { + "epoch": 0.7464862051015096, + "grad_norm": 0.2719680455212801, + "learning_rate": 4.398046190539025e-05, + "loss": 0.1648, + "step": 1434 + }, + { + "epoch": 0.7470067673086934, + "grad_norm": 0.2623299175088546, + "learning_rate": 4.39713146278729e-05, + "loss": 0.1685, + "step": 1435 + }, + { + "epoch": 0.7475273295158772, + "grad_norm": 0.27789094278392973, + "learning_rate": 4.3962161358331546e-05, + "loss": 0.1588, + "step": 1436 + }, + { + "epoch": 0.7480478917230609, + "grad_norm": 0.2625407847886398, + "learning_rate": 4.395300209965721e-05, + "loss": 0.1616, + "step": 1437 + }, + { + "epoch": 0.7485684539302446, + "grad_norm": 0.2506574404285458, + "learning_rate": 4.394383685474281e-05, + "loss": 0.1522, + "step": 1438 + }, + { + "epoch": 0.7490890161374284, + "grad_norm": 0.26122428518589536, + "learning_rate": 4.3934665626483175e-05, + "loss": 0.1654, + "step": 1439 + }, + { + "epoch": 0.7496095783446122, + "grad_norm": 0.25983335544974423, + "learning_rate": 4.392548841777497e-05, + "loss": 0.1592, + "step": 1440 + }, + { + "epoch": 0.7501301405517959, + "grad_norm": 0.26244778657616386, + "learning_rate": 4.391630523151683e-05, + "loss": 0.163, + "step": 1441 + }, + { + "epoch": 0.7506507027589797, + "grad_norm": 0.24661772307545587, + "learning_rate": 4.390711607060919e-05, + "loss": 0.156, + "step": 1442 + }, + { + "epoch": 0.7511712649661635, + "grad_norm": 0.2681126416443138, + "learning_rate": 4.389792093795444e-05, + "loss": 0.1658, + "step": 1443 + }, + { + "epoch": 0.7516918271733473, + "grad_norm": 0.28319213581654684, + "learning_rate": 4.3888719836456823e-05, + "loss": 0.1603, + "step": 1444 + }, + { + "epoch": 0.7522123893805309, + "grad_norm": 0.2717403925825211, + "learning_rate": 4.3879512769022485e-05, + "loss": 0.1618, + "step": 1445 + }, + { + "epoch": 0.7527329515877147, + "grad_norm": 0.2672807713995564, + "learning_rate": 4.387029973855943e-05, + "loss": 0.164, + "step": 1446 + }, + { + "epoch": 0.7532535137948985, + "grad_norm": 0.29074179763902774, + "learning_rate": 4.3861080747977565e-05, + "loss": 0.159, + "step": 1447 + }, + { + "epoch": 0.7537740760020822, + "grad_norm": 0.2644544725678007, + "learning_rate": 4.385185580018869e-05, + "loss": 0.1608, + "step": 1448 + }, + { + "epoch": 0.754294638209266, + "grad_norm": 0.2786228789121159, + "learning_rate": 4.3842624898106464e-05, + "loss": 0.1663, + "step": 1449 + }, + { + "epoch": 0.7548152004164498, + "grad_norm": 0.2568411909485992, + "learning_rate": 4.383338804464643e-05, + "loss": 0.1632, + "step": 1450 + }, + { + "epoch": 0.7553357626236336, + "grad_norm": 0.2704063306890479, + "learning_rate": 4.382414524272602e-05, + "loss": 0.1559, + "step": 1451 + }, + { + "epoch": 0.7558563248308173, + "grad_norm": 0.3052629008083044, + "learning_rate": 4.3814896495264544e-05, + "loss": 0.1667, + "step": 1452 + }, + { + "epoch": 0.756376887038001, + "grad_norm": 0.25388945533730933, + "learning_rate": 4.380564180518318e-05, + "loss": 0.1596, + "step": 1453 + }, + { + "epoch": 0.7568974492451848, + "grad_norm": 0.2876303341079924, + "learning_rate": 4.3796381175405014e-05, + "loss": 0.1571, + "step": 1454 + }, + { + "epoch": 0.7574180114523685, + "grad_norm": 0.26037804849757196, + "learning_rate": 4.378711460885494e-05, + "loss": 0.157, + "step": 1455 + }, + { + "epoch": 0.7579385736595523, + "grad_norm": 0.281072192425641, + "learning_rate": 4.377784210845981e-05, + "loss": 0.1605, + "step": 1456 + }, + { + "epoch": 0.7584591358667361, + "grad_norm": 0.28910101581498615, + "learning_rate": 4.376856367714829e-05, + "loss": 0.1627, + "step": 1457 + }, + { + "epoch": 0.7589796980739199, + "grad_norm": 0.27247204660508845, + "learning_rate": 4.375927931785095e-05, + "loss": 0.16, + "step": 1458 + }, + { + "epoch": 0.7595002602811036, + "grad_norm": 0.24843057073483057, + "learning_rate": 4.3749989033500224e-05, + "loss": 0.1552, + "step": 1459 + }, + { + "epoch": 0.7600208224882874, + "grad_norm": 0.27297992132744714, + "learning_rate": 4.3740692827030404e-05, + "loss": 0.1545, + "step": 1460 + }, + { + "epoch": 0.7605413846954711, + "grad_norm": 0.25731476919380347, + "learning_rate": 4.3731390701377675e-05, + "loss": 0.1547, + "step": 1461 + }, + { + "epoch": 0.7610619469026548, + "grad_norm": 0.27094205358391144, + "learning_rate": 4.3722082659480076e-05, + "loss": 0.1509, + "step": 1462 + }, + { + "epoch": 0.7615825091098386, + "grad_norm": 0.27051817962373237, + "learning_rate": 4.371276870427753e-05, + "loss": 0.1514, + "step": 1463 + }, + { + "epoch": 0.7621030713170224, + "grad_norm": 0.2776605729402276, + "learning_rate": 4.37034488387118e-05, + "loss": 0.1648, + "step": 1464 + }, + { + "epoch": 0.7626236335242061, + "grad_norm": 0.2632398573808106, + "learning_rate": 4.3694123065726553e-05, + "loss": 0.1625, + "step": 1465 + }, + { + "epoch": 0.7631441957313899, + "grad_norm": 0.2631443834365562, + "learning_rate": 4.3684791388267287e-05, + "loss": 0.1588, + "step": 1466 + }, + { + "epoch": 0.7636647579385737, + "grad_norm": 0.26943148019545626, + "learning_rate": 4.367545380928139e-05, + "loss": 0.1552, + "step": 1467 + }, + { + "epoch": 0.7641853201457575, + "grad_norm": 0.2649233188506122, + "learning_rate": 4.36661103317181e-05, + "loss": 0.1615, + "step": 1468 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 0.26835305552005984, + "learning_rate": 4.3656760958528506e-05, + "loss": 0.1637, + "step": 1469 + }, + { + "epoch": 0.7652264445601249, + "grad_norm": 0.2578891624619545, + "learning_rate": 4.364740569266561e-05, + "loss": 0.152, + "step": 1470 + }, + { + "epoch": 0.7657470067673087, + "grad_norm": 0.26524236019881625, + "learning_rate": 4.363804453708421e-05, + "loss": 0.1676, + "step": 1471 + }, + { + "epoch": 0.7662675689744924, + "grad_norm": 0.2583254156227514, + "learning_rate": 4.362867749474101e-05, + "loss": 0.1543, + "step": 1472 + }, + { + "epoch": 0.7667881311816762, + "grad_norm": 0.2577114988061728, + "learning_rate": 4.361930456859455e-05, + "loss": 0.1546, + "step": 1473 + }, + { + "epoch": 0.76730869338886, + "grad_norm": 0.2655743069628856, + "learning_rate": 4.360992576160524e-05, + "loss": 0.1563, + "step": 1474 + }, + { + "epoch": 0.7678292555960438, + "grad_norm": 0.25925048966807795, + "learning_rate": 4.3600541076735346e-05, + "loss": 0.1562, + "step": 1475 + }, + { + "epoch": 0.7683498178032275, + "grad_norm": 0.26321769667524203, + "learning_rate": 4.359115051694898e-05, + "loss": 0.1587, + "step": 1476 + }, + { + "epoch": 0.7688703800104112, + "grad_norm": 0.2792326144746915, + "learning_rate": 4.358175408521212e-05, + "loss": 0.1555, + "step": 1477 + }, + { + "epoch": 0.769390942217595, + "grad_norm": 0.26756537678776365, + "learning_rate": 4.357235178449261e-05, + "loss": 0.1666, + "step": 1478 + }, + { + "epoch": 0.7699115044247787, + "grad_norm": 0.27412529165548094, + "learning_rate": 4.356294361776012e-05, + "loss": 0.164, + "step": 1479 + }, + { + "epoch": 0.7704320666319625, + "grad_norm": 0.2632617066780282, + "learning_rate": 4.3553529587986184e-05, + "loss": 0.1554, + "step": 1480 + }, + { + "epoch": 0.7709526288391463, + "grad_norm": 0.2665209273532276, + "learning_rate": 4.3544109698144206e-05, + "loss": 0.157, + "step": 1481 + }, + { + "epoch": 0.77147319104633, + "grad_norm": 0.26609079034706307, + "learning_rate": 4.3534683951209416e-05, + "loss": 0.1561, + "step": 1482 + }, + { + "epoch": 0.7719937532535138, + "grad_norm": 0.2782722665482425, + "learning_rate": 4.3525252350158904e-05, + "loss": 0.1633, + "step": 1483 + }, + { + "epoch": 0.7725143154606976, + "grad_norm": 0.24586603055391748, + "learning_rate": 4.351581489797161e-05, + "loss": 0.1625, + "step": 1484 + }, + { + "epoch": 0.7730348776678813, + "grad_norm": 0.27660175665693065, + "learning_rate": 4.350637159762831e-05, + "loss": 0.166, + "step": 1485 + }, + { + "epoch": 0.773555439875065, + "grad_norm": 0.2718655540992913, + "learning_rate": 4.3496922452111656e-05, + "loss": 0.162, + "step": 1486 + }, + { + "epoch": 0.7740760020822488, + "grad_norm": 0.2583348630376606, + "learning_rate": 4.348746746440612e-05, + "loss": 0.1632, + "step": 1487 + }, + { + "epoch": 0.7745965642894326, + "grad_norm": 0.2692273301860101, + "learning_rate": 4.347800663749801e-05, + "loss": 0.1513, + "step": 1488 + }, + { + "epoch": 0.7751171264966163, + "grad_norm": 0.26291032421691457, + "learning_rate": 4.3468539974375534e-05, + "loss": 0.1545, + "step": 1489 + }, + { + "epoch": 0.7756376887038001, + "grad_norm": 0.27809527086476604, + "learning_rate": 4.345906747802867e-05, + "loss": 0.1593, + "step": 1490 + }, + { + "epoch": 0.7761582509109839, + "grad_norm": 0.25920104592248266, + "learning_rate": 4.344958915144929e-05, + "loss": 0.1609, + "step": 1491 + }, + { + "epoch": 0.7766788131181677, + "grad_norm": 0.29330224785096815, + "learning_rate": 4.3440104997631084e-05, + "loss": 0.1686, + "step": 1492 + }, + { + "epoch": 0.7771993753253513, + "grad_norm": 0.2727948495509996, + "learning_rate": 4.343061501956959e-05, + "loss": 0.1592, + "step": 1493 + }, + { + "epoch": 0.7777199375325351, + "grad_norm": 0.25156183079449285, + "learning_rate": 4.3421119220262185e-05, + "loss": 0.161, + "step": 1494 + }, + { + "epoch": 0.7782404997397189, + "grad_norm": 0.26655483033933713, + "learning_rate": 4.3411617602708085e-05, + "loss": 0.1556, + "step": 1495 + }, + { + "epoch": 0.7787610619469026, + "grad_norm": 0.25869405345561675, + "learning_rate": 4.340211016990834e-05, + "loss": 0.1518, + "step": 1496 + }, + { + "epoch": 0.7792816241540864, + "grad_norm": 0.25895490167143587, + "learning_rate": 4.3392596924865854e-05, + "loss": 0.1583, + "step": 1497 + }, + { + "epoch": 0.7798021863612702, + "grad_norm": 0.26930038307801346, + "learning_rate": 4.3383077870585334e-05, + "loss": 0.1578, + "step": 1498 + }, + { + "epoch": 0.780322748568454, + "grad_norm": 0.2655149958667704, + "learning_rate": 4.3373553010073355e-05, + "loss": 0.1666, + "step": 1499 + }, + { + "epoch": 0.7808433107756377, + "grad_norm": 0.2700875671699295, + "learning_rate": 4.3364022346338295e-05, + "loss": 0.1499, + "step": 1500 + }, + { + "epoch": 0.7813638729828214, + "grad_norm": 0.26841277303835387, + "learning_rate": 4.335448588239039e-05, + "loss": 0.16, + "step": 1501 + }, + { + "epoch": 0.7818844351900052, + "grad_norm": 0.2664241191896805, + "learning_rate": 4.33449436212417e-05, + "loss": 0.1566, + "step": 1502 + }, + { + "epoch": 0.7824049973971889, + "grad_norm": 0.2748558495343656, + "learning_rate": 4.333539556590612e-05, + "loss": 0.1561, + "step": 1503 + }, + { + "epoch": 0.7829255596043727, + "grad_norm": 0.260891571763646, + "learning_rate": 4.332584171939936e-05, + "loss": 0.1639, + "step": 1504 + }, + { + "epoch": 0.7834461218115565, + "grad_norm": 0.26117170009910184, + "learning_rate": 4.331628208473897e-05, + "loss": 0.1535, + "step": 1505 + }, + { + "epoch": 0.7839666840187403, + "grad_norm": 0.2501439526366261, + "learning_rate": 4.3306716664944344e-05, + "loss": 0.1558, + "step": 1506 + }, + { + "epoch": 0.784487246225924, + "grad_norm": 0.24457951812109338, + "learning_rate": 4.329714546303666e-05, + "loss": 0.1539, + "step": 1507 + }, + { + "epoch": 0.7850078084331078, + "grad_norm": 0.25274194830382657, + "learning_rate": 4.328756848203897e-05, + "loss": 0.1564, + "step": 1508 + }, + { + "epoch": 0.7855283706402915, + "grad_norm": 0.24212505466279638, + "learning_rate": 4.327798572497612e-05, + "loss": 0.1585, + "step": 1509 + }, + { + "epoch": 0.7860489328474752, + "grad_norm": 0.26756424368954107, + "learning_rate": 4.3268397194874796e-05, + "loss": 0.1617, + "step": 1510 + }, + { + "epoch": 0.786569495054659, + "grad_norm": 0.24184787477349412, + "learning_rate": 4.32588028947635e-05, + "loss": 0.1619, + "step": 1511 + }, + { + "epoch": 0.7870900572618428, + "grad_norm": 0.25168399053293283, + "learning_rate": 4.3249202827672564e-05, + "loss": 0.1602, + "step": 1512 + }, + { + "epoch": 0.7876106194690266, + "grad_norm": 0.26161299334783633, + "learning_rate": 4.3239596996634125e-05, + "loss": 0.1576, + "step": 1513 + }, + { + "epoch": 0.7881311816762103, + "grad_norm": 0.23097070104546066, + "learning_rate": 4.322998540468216e-05, + "loss": 0.149, + "step": 1514 + }, + { + "epoch": 0.7886517438833941, + "grad_norm": 0.26176672102662274, + "learning_rate": 4.322036805485245e-05, + "loss": 0.1604, + "step": 1515 + }, + { + "epoch": 0.7891723060905779, + "grad_norm": 0.2509842627379737, + "learning_rate": 4.3210744950182603e-05, + "loss": 0.1609, + "step": 1516 + }, + { + "epoch": 0.7896928682977615, + "grad_norm": 0.25286215696526987, + "learning_rate": 4.3201116093712045e-05, + "loss": 0.1519, + "step": 1517 + }, + { + "epoch": 0.7902134305049453, + "grad_norm": 0.2554977617373658, + "learning_rate": 4.319148148848202e-05, + "loss": 0.1507, + "step": 1518 + }, + { + "epoch": 0.7907339927121291, + "grad_norm": 0.2610030848646668, + "learning_rate": 4.3181841137535585e-05, + "loss": 0.1565, + "step": 1519 + }, + { + "epoch": 0.7912545549193128, + "grad_norm": 0.25847858497390264, + "learning_rate": 4.317219504391761e-05, + "loss": 0.1513, + "step": 1520 + }, + { + "epoch": 0.7917751171264966, + "grad_norm": 0.25023349404312234, + "learning_rate": 4.316254321067477e-05, + "loss": 0.1632, + "step": 1521 + }, + { + "epoch": 0.7922956793336804, + "grad_norm": 0.2641861638565703, + "learning_rate": 4.315288564085558e-05, + "loss": 0.1618, + "step": 1522 + }, + { + "epoch": 0.7928162415408642, + "grad_norm": 0.2421590693178086, + "learning_rate": 4.314322233751034e-05, + "loss": 0.1558, + "step": 1523 + }, + { + "epoch": 0.7933368037480479, + "grad_norm": 0.26085418193866505, + "learning_rate": 4.313355330369117e-05, + "loss": 0.1595, + "step": 1524 + }, + { + "epoch": 0.7938573659552316, + "grad_norm": 0.2439161457038632, + "learning_rate": 4.312387854245201e-05, + "loss": 0.1466, + "step": 1525 + }, + { + "epoch": 0.7943779281624154, + "grad_norm": 0.2713532358303694, + "learning_rate": 4.3114198056848585e-05, + "loss": 0.1608, + "step": 1526 + }, + { + "epoch": 0.7948984903695991, + "grad_norm": 0.25922505436788407, + "learning_rate": 4.3104511849938464e-05, + "loss": 0.1557, + "step": 1527 + }, + { + "epoch": 0.7954190525767829, + "grad_norm": 0.2886790825671015, + "learning_rate": 4.309481992478098e-05, + "loss": 0.1644, + "step": 1528 + }, + { + "epoch": 0.7959396147839667, + "grad_norm": 0.24108201278459665, + "learning_rate": 4.308512228443731e-05, + "loss": 0.1596, + "step": 1529 + }, + { + "epoch": 0.7964601769911505, + "grad_norm": 0.27215617885447624, + "learning_rate": 4.30754189319704e-05, + "loss": 0.1573, + "step": 1530 + }, + { + "epoch": 0.7969807391983342, + "grad_norm": 0.28694200101027506, + "learning_rate": 4.306570987044505e-05, + "loss": 0.1686, + "step": 1531 + }, + { + "epoch": 0.797501301405518, + "grad_norm": 0.24652959196034746, + "learning_rate": 4.305599510292781e-05, + "loss": 0.159, + "step": 1532 + }, + { + "epoch": 0.7980218636127017, + "grad_norm": 0.2819197614862714, + "learning_rate": 4.304627463248706e-05, + "loss": 0.1588, + "step": 1533 + }, + { + "epoch": 0.7985424258198854, + "grad_norm": 0.2909511370680243, + "learning_rate": 4.3036548462192986e-05, + "loss": 0.1606, + "step": 1534 + }, + { + "epoch": 0.7990629880270692, + "grad_norm": 0.2530278943329065, + "learning_rate": 4.302681659511755e-05, + "loss": 0.1544, + "step": 1535 + }, + { + "epoch": 0.799583550234253, + "grad_norm": 0.282363644399392, + "learning_rate": 4.301707903433454e-05, + "loss": 0.1556, + "step": 1536 + }, + { + "epoch": 0.8001041124414368, + "grad_norm": 0.2676419532705495, + "learning_rate": 4.300733578291953e-05, + "loss": 0.1637, + "step": 1537 + }, + { + "epoch": 0.8006246746486205, + "grad_norm": 0.25796205124564936, + "learning_rate": 4.29975868439499e-05, + "loss": 0.1609, + "step": 1538 + }, + { + "epoch": 0.8011452368558043, + "grad_norm": 0.2819765513723248, + "learning_rate": 4.29878322205048e-05, + "loss": 0.1624, + "step": 1539 + }, + { + "epoch": 0.8016657990629881, + "grad_norm": 0.27897029452667205, + "learning_rate": 4.297807191566521e-05, + "loss": 0.1572, + "step": 1540 + }, + { + "epoch": 0.8021863612701717, + "grad_norm": 0.27824823196428977, + "learning_rate": 4.2968305932513866e-05, + "loss": 0.1622, + "step": 1541 + }, + { + "epoch": 0.8027069234773555, + "grad_norm": 0.2738961827219555, + "learning_rate": 4.295853427413535e-05, + "loss": 0.1601, + "step": 1542 + }, + { + "epoch": 0.8032274856845393, + "grad_norm": 0.2665241074273365, + "learning_rate": 4.2948756943615985e-05, + "loss": 0.1534, + "step": 1543 + }, + { + "epoch": 0.803748047891723, + "grad_norm": 0.27182571850263426, + "learning_rate": 4.293897394404392e-05, + "loss": 0.1664, + "step": 1544 + }, + { + "epoch": 0.8042686100989068, + "grad_norm": 0.2553846747993963, + "learning_rate": 4.292918527850907e-05, + "loss": 0.1543, + "step": 1545 + }, + { + "epoch": 0.8047891723060906, + "grad_norm": 0.25252091088164785, + "learning_rate": 4.291939095010316e-05, + "loss": 0.1569, + "step": 1546 + }, + { + "epoch": 0.8053097345132744, + "grad_norm": 0.3033445790518775, + "learning_rate": 4.290959096191969e-05, + "loss": 0.156, + "step": 1547 + }, + { + "epoch": 0.8058302967204581, + "grad_norm": 0.26780210520674946, + "learning_rate": 4.289978531705395e-05, + "loss": 0.1629, + "step": 1548 + }, + { + "epoch": 0.8063508589276418, + "grad_norm": 0.25340358566449683, + "learning_rate": 4.288997401860303e-05, + "loss": 0.1676, + "step": 1549 + }, + { + "epoch": 0.8068714211348256, + "grad_norm": 0.26032260808346724, + "learning_rate": 4.288015706966578e-05, + "loss": 0.1599, + "step": 1550 + }, + { + "epoch": 0.8073919833420093, + "grad_norm": 0.253837369944548, + "learning_rate": 4.287033447334286e-05, + "loss": 0.1584, + "step": 1551 + }, + { + "epoch": 0.8079125455491931, + "grad_norm": 0.26639544259069964, + "learning_rate": 4.2860506232736706e-05, + "loss": 0.165, + "step": 1552 + }, + { + "epoch": 0.8084331077563769, + "grad_norm": 0.2374985977664141, + "learning_rate": 4.2850672350951516e-05, + "loss": 0.1545, + "step": 1553 + }, + { + "epoch": 0.8089536699635607, + "grad_norm": 0.2528600110588083, + "learning_rate": 4.284083283109331e-05, + "loss": 0.1579, + "step": 1554 + }, + { + "epoch": 0.8094742321707444, + "grad_norm": 0.25899495284687546, + "learning_rate": 4.283098767626984e-05, + "loss": 0.1533, + "step": 1555 + }, + { + "epoch": 0.8099947943779282, + "grad_norm": 0.2596540439723637, + "learning_rate": 4.2821136889590696e-05, + "loss": 0.1623, + "step": 1556 + }, + { + "epoch": 0.8105153565851119, + "grad_norm": 0.2593899738580702, + "learning_rate": 4.281128047416719e-05, + "loss": 0.1622, + "step": 1557 + }, + { + "epoch": 0.8110359187922956, + "grad_norm": 0.27318043091362676, + "learning_rate": 4.280141843311244e-05, + "loss": 0.1614, + "step": 1558 + }, + { + "epoch": 0.8115564809994794, + "grad_norm": 0.2650371042158417, + "learning_rate": 4.279155076954135e-05, + "loss": 0.1551, + "step": 1559 + }, + { + "epoch": 0.8120770432066632, + "grad_norm": 0.2554530943228691, + "learning_rate": 4.2781677486570576e-05, + "loss": 0.1622, + "step": 1560 + }, + { + "epoch": 0.812597605413847, + "grad_norm": 0.2742519335456577, + "learning_rate": 4.277179858731857e-05, + "loss": 0.1619, + "step": 1561 + }, + { + "epoch": 0.8131181676210307, + "grad_norm": 0.2613073326722418, + "learning_rate": 4.276191407490553e-05, + "loss": 0.1603, + "step": 1562 + }, + { + "epoch": 0.8136387298282145, + "grad_norm": 0.26817158410006026, + "learning_rate": 4.2752023952453465e-05, + "loss": 0.1571, + "step": 1563 + }, + { + "epoch": 0.8141592920353983, + "grad_norm": 0.26632267357937267, + "learning_rate": 4.274212822308612e-05, + "loss": 0.1599, + "step": 1564 + }, + { + "epoch": 0.8146798542425819, + "grad_norm": 0.2596167990888178, + "learning_rate": 4.273222688992904e-05, + "loss": 0.1588, + "step": 1565 + }, + { + "epoch": 0.8152004164497657, + "grad_norm": 0.27568862315736015, + "learning_rate": 4.272231995610952e-05, + "loss": 0.1542, + "step": 1566 + }, + { + "epoch": 0.8157209786569495, + "grad_norm": 0.27114645918535957, + "learning_rate": 4.271240742475664e-05, + "loss": 0.1576, + "step": 1567 + }, + { + "epoch": 0.8162415408641333, + "grad_norm": 0.24591250210741347, + "learning_rate": 4.2702489299001224e-05, + "loss": 0.1542, + "step": 1568 + }, + { + "epoch": 0.816762103071317, + "grad_norm": 0.25998302637843507, + "learning_rate": 4.269256558197588e-05, + "loss": 0.1597, + "step": 1569 + }, + { + "epoch": 0.8172826652785008, + "grad_norm": 0.277349113622191, + "learning_rate": 4.2682636276815e-05, + "loss": 0.1656, + "step": 1570 + }, + { + "epoch": 0.8178032274856846, + "grad_norm": 0.2591716268174999, + "learning_rate": 4.267270138665469e-05, + "loss": 0.1636, + "step": 1571 + }, + { + "epoch": 0.8183237896928683, + "grad_norm": 0.2626931448102875, + "learning_rate": 4.266276091463286e-05, + "loss": 0.1582, + "step": 1572 + }, + { + "epoch": 0.818844351900052, + "grad_norm": 0.2533409992404636, + "learning_rate": 4.26528148638892e-05, + "loss": 0.1559, + "step": 1573 + }, + { + "epoch": 0.8193649141072358, + "grad_norm": 0.2635890206268693, + "learning_rate": 4.26428632375651e-05, + "loss": 0.1587, + "step": 1574 + }, + { + "epoch": 0.8198854763144195, + "grad_norm": 0.2781221733340093, + "learning_rate": 4.2632906038803765e-05, + "loss": 0.1661, + "step": 1575 + }, + { + "epoch": 0.8204060385216033, + "grad_norm": 0.2466422831727149, + "learning_rate": 4.262294327075014e-05, + "loss": 0.1609, + "step": 1576 + }, + { + "epoch": 0.8209266007287871, + "grad_norm": 0.26214772989894536, + "learning_rate": 4.261297493655092e-05, + "loss": 0.156, + "step": 1577 + }, + { + "epoch": 0.8214471629359709, + "grad_norm": 0.25723469064882787, + "learning_rate": 4.260300103935459e-05, + "loss": 0.1562, + "step": 1578 + }, + { + "epoch": 0.8219677251431546, + "grad_norm": 0.2597832572362618, + "learning_rate": 4.2593021582311354e-05, + "loss": 0.163, + "step": 1579 + }, + { + "epoch": 0.8224882873503384, + "grad_norm": 0.25814069220724356, + "learning_rate": 4.2583036568573184e-05, + "loss": 0.1583, + "step": 1580 + }, + { + "epoch": 0.8230088495575221, + "grad_norm": 0.2680515018520899, + "learning_rate": 4.257304600129384e-05, + "loss": 0.1544, + "step": 1581 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.26943464813073753, + "learning_rate": 4.256304988362878e-05, + "loss": 0.1579, + "step": 1582 + }, + { + "epoch": 0.8240499739718896, + "grad_norm": 0.2536098084394136, + "learning_rate": 4.2553048218735256e-05, + "loss": 0.155, + "step": 1583 + }, + { + "epoch": 0.8245705361790734, + "grad_norm": 0.2757591782017878, + "learning_rate": 4.254304100977225e-05, + "loss": 0.1606, + "step": 1584 + }, + { + "epoch": 0.8250910983862572, + "grad_norm": 0.24717329055472512, + "learning_rate": 4.253302825990051e-05, + "loss": 0.1456, + "step": 1585 + }, + { + "epoch": 0.8256116605934409, + "grad_norm": 0.27846929929536657, + "learning_rate": 4.2523009972282534e-05, + "loss": 0.1619, + "step": 1586 + }, + { + "epoch": 0.8261322228006247, + "grad_norm": 0.2456486639173495, + "learning_rate": 4.2512986150082555e-05, + "loss": 0.163, + "step": 1587 + }, + { + "epoch": 0.8266527850078085, + "grad_norm": 0.28099219066923725, + "learning_rate": 4.250295679646657e-05, + "loss": 0.1581, + "step": 1588 + }, + { + "epoch": 0.8271733472149921, + "grad_norm": 0.28012381846944834, + "learning_rate": 4.24929219146023e-05, + "loss": 0.1618, + "step": 1589 + }, + { + "epoch": 0.8276939094221759, + "grad_norm": 0.2514914295217897, + "learning_rate": 4.248288150765925e-05, + "loss": 0.1565, + "step": 1590 + }, + { + "epoch": 0.8282144716293597, + "grad_norm": 0.2716012154641059, + "learning_rate": 4.2472835578808635e-05, + "loss": 0.1589, + "step": 1591 + }, + { + "epoch": 0.8287350338365435, + "grad_norm": 0.2815268850024281, + "learning_rate": 4.2462784131223434e-05, + "loss": 0.15, + "step": 1592 + }, + { + "epoch": 0.8292555960437272, + "grad_norm": 0.26832812464875594, + "learning_rate": 4.245272716807834e-05, + "loss": 0.1516, + "step": 1593 + }, + { + "epoch": 0.829776158250911, + "grad_norm": 0.255986260897498, + "learning_rate": 4.244266469254984e-05, + "loss": 0.1557, + "step": 1594 + }, + { + "epoch": 0.8302967204580948, + "grad_norm": 0.27756007774738645, + "learning_rate": 4.243259670781611e-05, + "loss": 0.1582, + "step": 1595 + }, + { + "epoch": 0.8308172826652785, + "grad_norm": 0.2788845280857182, + "learning_rate": 4.2422523217057104e-05, + "loss": 0.1611, + "step": 1596 + }, + { + "epoch": 0.8313378448724622, + "grad_norm": 0.265204765669698, + "learning_rate": 4.241244422345448e-05, + "loss": 0.1654, + "step": 1597 + }, + { + "epoch": 0.831858407079646, + "grad_norm": 0.29862671442559635, + "learning_rate": 4.240235973019168e-05, + "loss": 0.1628, + "step": 1598 + }, + { + "epoch": 0.8323789692868298, + "grad_norm": 0.25068737790327783, + "learning_rate": 4.239226974045383e-05, + "loss": 0.1612, + "step": 1599 + }, + { + "epoch": 0.8328995314940135, + "grad_norm": 0.26109969950747847, + "learning_rate": 4.2382174257427845e-05, + "loss": 0.1526, + "step": 1600 + }, + { + "epoch": 0.8334200937011973, + "grad_norm": 0.2708108326367472, + "learning_rate": 4.237207328430232e-05, + "loss": 0.1574, + "step": 1601 + }, + { + "epoch": 0.8339406559083811, + "grad_norm": 0.2603788294937712, + "learning_rate": 4.236196682426762e-05, + "loss": 0.1603, + "step": 1602 + }, + { + "epoch": 0.8344612181155648, + "grad_norm": 0.28645615014842474, + "learning_rate": 4.235185488051585e-05, + "loss": 0.164, + "step": 1603 + }, + { + "epoch": 0.8349817803227486, + "grad_norm": 0.25263310940554584, + "learning_rate": 4.2341737456240815e-05, + "loss": 0.152, + "step": 1604 + }, + { + "epoch": 0.8355023425299323, + "grad_norm": 0.29024837078909443, + "learning_rate": 4.233161455463809e-05, + "loss": 0.1584, + "step": 1605 + }, + { + "epoch": 0.836022904737116, + "grad_norm": 0.2698645950290543, + "learning_rate": 4.232148617890493e-05, + "loss": 0.1603, + "step": 1606 + }, + { + "epoch": 0.8365434669442998, + "grad_norm": 0.28134226221542846, + "learning_rate": 4.231135233224037e-05, + "loss": 0.1492, + "step": 1607 + }, + { + "epoch": 0.8370640291514836, + "grad_norm": 0.2436518446483368, + "learning_rate": 4.2301213017845144e-05, + "loss": 0.1543, + "step": 1608 + }, + { + "epoch": 0.8375845913586674, + "grad_norm": 0.28214111662149993, + "learning_rate": 4.2291068238921714e-05, + "loss": 0.1689, + "step": 1609 + }, + { + "epoch": 0.8381051535658511, + "grad_norm": 0.2863837316615571, + "learning_rate": 4.228091799867427e-05, + "loss": 0.1578, + "step": 1610 + }, + { + "epoch": 0.8386257157730349, + "grad_norm": 0.28091970790308574, + "learning_rate": 4.227076230030875e-05, + "loss": 0.1577, + "step": 1611 + }, + { + "epoch": 0.8391462779802187, + "grad_norm": 0.2593788595939422, + "learning_rate": 4.226060114703278e-05, + "loss": 0.1591, + "step": 1612 + }, + { + "epoch": 0.8396668401874023, + "grad_norm": 0.270786976366135, + "learning_rate": 4.225043454205573e-05, + "loss": 0.1646, + "step": 1613 + }, + { + "epoch": 0.8401874023945861, + "grad_norm": 0.2689847000833549, + "learning_rate": 4.224026248858868e-05, + "loss": 0.1574, + "step": 1614 + }, + { + "epoch": 0.8407079646017699, + "grad_norm": 0.2577066953840738, + "learning_rate": 4.2230084989844454e-05, + "loss": 0.155, + "step": 1615 + }, + { + "epoch": 0.8412285268089537, + "grad_norm": 0.2550154489290601, + "learning_rate": 4.221990204903756e-05, + "loss": 0.1564, + "step": 1616 + }, + { + "epoch": 0.8417490890161374, + "grad_norm": 0.273560448413933, + "learning_rate": 4.220971366938425e-05, + "loss": 0.1578, + "step": 1617 + }, + { + "epoch": 0.8422696512233212, + "grad_norm": 0.24371532149607608, + "learning_rate": 4.21995198541025e-05, + "loss": 0.1531, + "step": 1618 + }, + { + "epoch": 0.842790213430505, + "grad_norm": 0.28921374195808397, + "learning_rate": 4.218932060641198e-05, + "loss": 0.1561, + "step": 1619 + }, + { + "epoch": 0.8433107756376887, + "grad_norm": 0.2829460232916761, + "learning_rate": 4.217911592953409e-05, + "loss": 0.1621, + "step": 1620 + }, + { + "epoch": 0.8438313378448724, + "grad_norm": 0.26061758786155376, + "learning_rate": 4.216890582669194e-05, + "loss": 0.1484, + "step": 1621 + }, + { + "epoch": 0.8443519000520562, + "grad_norm": 0.2619069964371061, + "learning_rate": 4.2158690301110366e-05, + "loss": 0.1622, + "step": 1622 + }, + { + "epoch": 0.84487246225924, + "grad_norm": 0.27728638024745567, + "learning_rate": 4.2148469356015896e-05, + "loss": 0.1573, + "step": 1623 + }, + { + "epoch": 0.8453930244664237, + "grad_norm": 0.2660951427495475, + "learning_rate": 4.213824299463678e-05, + "loss": 0.1653, + "step": 1624 + }, + { + "epoch": 0.8459135866736075, + "grad_norm": 0.2617492948395115, + "learning_rate": 4.2128011220202976e-05, + "loss": 0.1633, + "step": 1625 + }, + { + "epoch": 0.8464341488807913, + "grad_norm": 0.29138627520898736, + "learning_rate": 4.211777403594617e-05, + "loss": 0.1596, + "step": 1626 + }, + { + "epoch": 0.846954711087975, + "grad_norm": 0.24888350370609963, + "learning_rate": 4.210753144509972e-05, + "loss": 0.1606, + "step": 1627 + }, + { + "epoch": 0.8474752732951588, + "grad_norm": 0.2505367676114268, + "learning_rate": 4.209728345089873e-05, + "loss": 0.1629, + "step": 1628 + }, + { + "epoch": 0.8479958355023425, + "grad_norm": 0.25895259510640567, + "learning_rate": 4.208703005657999e-05, + "loss": 0.1591, + "step": 1629 + }, + { + "epoch": 0.8485163977095262, + "grad_norm": 0.25678447658914816, + "learning_rate": 4.207677126538199e-05, + "loss": 0.1606, + "step": 1630 + }, + { + "epoch": 0.84903695991671, + "grad_norm": 0.26827714044896583, + "learning_rate": 4.206650708054494e-05, + "loss": 0.1457, + "step": 1631 + }, + { + "epoch": 0.8495575221238938, + "grad_norm": 0.2722481858513276, + "learning_rate": 4.205623750531076e-05, + "loss": 0.1575, + "step": 1632 + }, + { + "epoch": 0.8500780843310776, + "grad_norm": 0.27881803489843254, + "learning_rate": 4.204596254292303e-05, + "loss": 0.1581, + "step": 1633 + }, + { + "epoch": 0.8505986465382613, + "grad_norm": 0.2519593987498123, + "learning_rate": 4.203568219662709e-05, + "loss": 0.1565, + "step": 1634 + }, + { + "epoch": 0.8511192087454451, + "grad_norm": 0.26920318449984887, + "learning_rate": 4.202539646966993e-05, + "loss": 0.1525, + "step": 1635 + }, + { + "epoch": 0.8516397709526289, + "grad_norm": 0.26653026491152526, + "learning_rate": 4.2015105365300276e-05, + "loss": 0.1609, + "step": 1636 + }, + { + "epoch": 0.8521603331598125, + "grad_norm": 0.2827093407759518, + "learning_rate": 4.200480888676853e-05, + "loss": 0.1611, + "step": 1637 + }, + { + "epoch": 0.8526808953669963, + "grad_norm": 0.2652398385704472, + "learning_rate": 4.199450703732681e-05, + "loss": 0.1513, + "step": 1638 + }, + { + "epoch": 0.8532014575741801, + "grad_norm": 0.2696612536923782, + "learning_rate": 4.19841998202289e-05, + "loss": 0.1566, + "step": 1639 + }, + { + "epoch": 0.8537220197813639, + "grad_norm": 0.28633933577187565, + "learning_rate": 4.197388723873032e-05, + "loss": 0.1578, + "step": 1640 + }, + { + "epoch": 0.8542425819885476, + "grad_norm": 0.2690705270209936, + "learning_rate": 4.196356929608825e-05, + "loss": 0.1558, + "step": 1641 + }, + { + "epoch": 0.8547631441957314, + "grad_norm": 0.24483032302626148, + "learning_rate": 4.195324599556158e-05, + "loss": 0.1494, + "step": 1642 + }, + { + "epoch": 0.8552837064029152, + "grad_norm": 0.25311938346196466, + "learning_rate": 4.194291734041089e-05, + "loss": 0.1585, + "step": 1643 + }, + { + "epoch": 0.855804268610099, + "grad_norm": 0.2687437228508878, + "learning_rate": 4.193258333389844e-05, + "loss": 0.1585, + "step": 1644 + }, + { + "epoch": 0.8563248308172826, + "grad_norm": 0.23790698337821162, + "learning_rate": 4.1922243979288205e-05, + "loss": 0.1591, + "step": 1645 + }, + { + "epoch": 0.8568453930244664, + "grad_norm": 0.2720940365664535, + "learning_rate": 4.191189927984583e-05, + "loss": 0.1591, + "step": 1646 + }, + { + "epoch": 0.8573659552316502, + "grad_norm": 0.23733908382168187, + "learning_rate": 4.190154923883865e-05, + "loss": 0.1497, + "step": 1647 + }, + { + "epoch": 0.8578865174388339, + "grad_norm": 0.2767062941528966, + "learning_rate": 4.1891193859535686e-05, + "loss": 0.1606, + "step": 1648 + }, + { + "epoch": 0.8584070796460177, + "grad_norm": 0.23093654286620274, + "learning_rate": 4.1880833145207655e-05, + "loss": 0.15, + "step": 1649 + }, + { + "epoch": 0.8589276418532015, + "grad_norm": 0.24409975896473038, + "learning_rate": 4.187046709912695e-05, + "loss": 0.1608, + "step": 1650 + }, + { + "epoch": 0.8594482040603852, + "grad_norm": 0.2641900349527886, + "learning_rate": 4.186009572456765e-05, + "loss": 0.1583, + "step": 1651 + }, + { + "epoch": 0.859968766267569, + "grad_norm": 0.25565972911270796, + "learning_rate": 4.184971902480552e-05, + "loss": 0.1543, + "step": 1652 + }, + { + "epoch": 0.8604893284747527, + "grad_norm": 0.26824705481294847, + "learning_rate": 4.183933700311801e-05, + "loss": 0.1673, + "step": 1653 + }, + { + "epoch": 0.8610098906819365, + "grad_norm": 0.2430609538785938, + "learning_rate": 4.1828949662784236e-05, + "loss": 0.1462, + "step": 1654 + }, + { + "epoch": 0.8615304528891202, + "grad_norm": 0.2846978931791095, + "learning_rate": 4.1818557007085e-05, + "loss": 0.163, + "step": 1655 + }, + { + "epoch": 0.862051015096304, + "grad_norm": 0.25943378302982295, + "learning_rate": 4.1808159039302795e-05, + "loss": 0.1561, + "step": 1656 + }, + { + "epoch": 0.8625715773034878, + "grad_norm": 0.26198173198500524, + "learning_rate": 4.1797755762721787e-05, + "loss": 0.1552, + "step": 1657 + }, + { + "epoch": 0.8630921395106715, + "grad_norm": 0.24923720021291665, + "learning_rate": 4.17873471806278e-05, + "loss": 0.1586, + "step": 1658 + }, + { + "epoch": 0.8636127017178553, + "grad_norm": 0.2974201345925281, + "learning_rate": 4.177693329630837e-05, + "loss": 0.1578, + "step": 1659 + }, + { + "epoch": 0.8641332639250391, + "grad_norm": 0.24813334349056498, + "learning_rate": 4.176651411305266e-05, + "loss": 0.1585, + "step": 1660 + }, + { + "epoch": 0.8646538261322227, + "grad_norm": 0.2688434182973993, + "learning_rate": 4.175608963415155e-05, + "loss": 0.1627, + "step": 1661 + }, + { + "epoch": 0.8651743883394065, + "grad_norm": 0.2525026939637232, + "learning_rate": 4.174565986289758e-05, + "loss": 0.1588, + "step": 1662 + }, + { + "epoch": 0.8656949505465903, + "grad_norm": 0.2734677290901909, + "learning_rate": 4.1735224802584946e-05, + "loss": 0.1566, + "step": 1663 + }, + { + "epoch": 0.8662155127537741, + "grad_norm": 0.24890893238886064, + "learning_rate": 4.172478445650953e-05, + "loss": 0.1605, + "step": 1664 + }, + { + "epoch": 0.8667360749609578, + "grad_norm": 0.2705875269181692, + "learning_rate": 4.171433882796888e-05, + "loss": 0.1629, + "step": 1665 + }, + { + "epoch": 0.8672566371681416, + "grad_norm": 0.2576695174793914, + "learning_rate": 4.1703887920262195e-05, + "loss": 0.153, + "step": 1666 + }, + { + "epoch": 0.8677771993753254, + "grad_norm": 0.2566955998695762, + "learning_rate": 4.1693431736690386e-05, + "loss": 0.1538, + "step": 1667 + }, + { + "epoch": 0.8682977615825092, + "grad_norm": 0.2795710749963426, + "learning_rate": 4.1682970280555986e-05, + "loss": 0.1593, + "step": 1668 + }, + { + "epoch": 0.8688183237896928, + "grad_norm": 0.2565679446639193, + "learning_rate": 4.1672503555163215e-05, + "loss": 0.167, + "step": 1669 + }, + { + "epoch": 0.8693388859968766, + "grad_norm": 0.26408964383887445, + "learning_rate": 4.166203156381795e-05, + "loss": 0.1518, + "step": 1670 + }, + { + "epoch": 0.8698594482040604, + "grad_norm": 0.24825446479368055, + "learning_rate": 4.1651554309827725e-05, + "loss": 0.1589, + "step": 1671 + }, + { + "epoch": 0.8703800104112441, + "grad_norm": 0.2674815525271392, + "learning_rate": 4.1641071796501764e-05, + "loss": 0.1579, + "step": 1672 + }, + { + "epoch": 0.8709005726184279, + "grad_norm": 0.26322901088276796, + "learning_rate": 4.163058402715091e-05, + "loss": 0.1587, + "step": 1673 + }, + { + "epoch": 0.8714211348256117, + "grad_norm": 0.27350744005632355, + "learning_rate": 4.1620091005087714e-05, + "loss": 0.165, + "step": 1674 + }, + { + "epoch": 0.8719416970327954, + "grad_norm": 0.263352010398878, + "learning_rate": 4.1609592733626335e-05, + "loss": 0.155, + "step": 1675 + }, + { + "epoch": 0.8724622592399792, + "grad_norm": 0.2693710231228204, + "learning_rate": 4.159908921608263e-05, + "loss": 0.1589, + "step": 1676 + }, + { + "epoch": 0.8729828214471629, + "grad_norm": 0.2846789455817984, + "learning_rate": 4.158858045577409e-05, + "loss": 0.1557, + "step": 1677 + }, + { + "epoch": 0.8735033836543467, + "grad_norm": 0.25566156183607636, + "learning_rate": 4.157806645601988e-05, + "loss": 0.1645, + "step": 1678 + }, + { + "epoch": 0.8740239458615304, + "grad_norm": 0.2817627765773898, + "learning_rate": 4.1567547220140814e-05, + "loss": 0.1601, + "step": 1679 + }, + { + "epoch": 0.8745445080687142, + "grad_norm": 0.2505616741147018, + "learning_rate": 4.155702275145934e-05, + "loss": 0.1547, + "step": 1680 + }, + { + "epoch": 0.875065070275898, + "grad_norm": 0.2553052002789369, + "learning_rate": 4.154649305329958e-05, + "loss": 0.1591, + "step": 1681 + }, + { + "epoch": 0.8755856324830817, + "grad_norm": 0.25048785354273917, + "learning_rate": 4.153595812898732e-05, + "loss": 0.1597, + "step": 1682 + }, + { + "epoch": 0.8761061946902655, + "grad_norm": 0.25427482435094684, + "learning_rate": 4.152541798184995e-05, + "loss": 0.161, + "step": 1683 + }, + { + "epoch": 0.8766267568974493, + "grad_norm": 0.24530895241431033, + "learning_rate": 4.151487261521656e-05, + "loss": 0.1581, + "step": 1684 + }, + { + "epoch": 0.877147319104633, + "grad_norm": 0.25440752591956056, + "learning_rate": 4.1504322032417864e-05, + "loss": 0.1638, + "step": 1685 + }, + { + "epoch": 0.8776678813118167, + "grad_norm": 0.2587793514600556, + "learning_rate": 4.149376623678623e-05, + "loss": 0.1587, + "step": 1686 + }, + { + "epoch": 0.8781884435190005, + "grad_norm": 0.24239699887208244, + "learning_rate": 4.148320523165566e-05, + "loss": 0.1514, + "step": 1687 + }, + { + "epoch": 0.8787090057261843, + "grad_norm": 0.24553152349960053, + "learning_rate": 4.147263902036181e-05, + "loss": 0.1538, + "step": 1688 + }, + { + "epoch": 0.879229567933368, + "grad_norm": 0.2499331408281633, + "learning_rate": 4.146206760624199e-05, + "loss": 0.1603, + "step": 1689 + }, + { + "epoch": 0.8797501301405518, + "grad_norm": 0.24703228194514043, + "learning_rate": 4.145149099263515e-05, + "loss": 0.1577, + "step": 1690 + }, + { + "epoch": 0.8802706923477356, + "grad_norm": 0.2723118994778378, + "learning_rate": 4.1440909182881857e-05, + "loss": 0.1587, + "step": 1691 + }, + { + "epoch": 0.8807912545549194, + "grad_norm": 0.2754696773958591, + "learning_rate": 4.143032218032435e-05, + "loss": 0.1562, + "step": 1692 + }, + { + "epoch": 0.881311816762103, + "grad_norm": 0.2617623912854439, + "learning_rate": 4.141972998830651e-05, + "loss": 0.1585, + "step": 1693 + }, + { + "epoch": 0.8818323789692868, + "grad_norm": 0.30890725644539296, + "learning_rate": 4.140913261017382e-05, + "loss": 0.1624, + "step": 1694 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.24884174691256375, + "learning_rate": 4.139853004927344e-05, + "loss": 0.1559, + "step": 1695 + }, + { + "epoch": 0.8828735033836543, + "grad_norm": 0.27439772981131394, + "learning_rate": 4.1387922308954154e-05, + "loss": 0.1562, + "step": 1696 + }, + { + "epoch": 0.8833940655908381, + "grad_norm": 0.2542935955914035, + "learning_rate": 4.137730939256636e-05, + "loss": 0.1619, + "step": 1697 + }, + { + "epoch": 0.8839146277980219, + "grad_norm": 0.28136889554044375, + "learning_rate": 4.1366691303462144e-05, + "loss": 0.1625, + "step": 1698 + }, + { + "epoch": 0.8844351900052057, + "grad_norm": 0.247375024118648, + "learning_rate": 4.135606804499516e-05, + "loss": 0.1599, + "step": 1699 + }, + { + "epoch": 0.8849557522123894, + "grad_norm": 0.2608875547949653, + "learning_rate": 4.1345439620520744e-05, + "loss": 0.1639, + "step": 1700 + }, + { + "epoch": 0.8854763144195731, + "grad_norm": 0.2556809971776182, + "learning_rate": 4.1334806033395845e-05, + "loss": 0.1596, + "step": 1701 + }, + { + "epoch": 0.8859968766267569, + "grad_norm": 0.25036073808421017, + "learning_rate": 4.132416728697905e-05, + "loss": 0.1567, + "step": 1702 + }, + { + "epoch": 0.8865174388339406, + "grad_norm": 0.24789643869487482, + "learning_rate": 4.131352338463056e-05, + "loss": 0.1551, + "step": 1703 + }, + { + "epoch": 0.8870380010411244, + "grad_norm": 0.2597436931442242, + "learning_rate": 4.130287432971222e-05, + "loss": 0.1635, + "step": 1704 + }, + { + "epoch": 0.8875585632483082, + "grad_norm": 0.25176527895305045, + "learning_rate": 4.1292220125587494e-05, + "loss": 0.1529, + "step": 1705 + }, + { + "epoch": 0.888079125455492, + "grad_norm": 0.22936720495484114, + "learning_rate": 4.1281560775621475e-05, + "loss": 0.1536, + "step": 1706 + }, + { + "epoch": 0.8885996876626757, + "grad_norm": 0.29208711846524826, + "learning_rate": 4.1270896283180896e-05, + "loss": 0.1669, + "step": 1707 + }, + { + "epoch": 0.8891202498698595, + "grad_norm": 0.24185973845450207, + "learning_rate": 4.1260226651634074e-05, + "loss": 0.153, + "step": 1708 + }, + { + "epoch": 0.8896408120770432, + "grad_norm": 0.2405834957600981, + "learning_rate": 4.1249551884351e-05, + "loss": 0.1548, + "step": 1709 + }, + { + "epoch": 0.8901613742842269, + "grad_norm": 0.2640002798637536, + "learning_rate": 4.1238871984703255e-05, + "loss": 0.152, + "step": 1710 + }, + { + "epoch": 0.8906819364914107, + "grad_norm": 0.3051529300584874, + "learning_rate": 4.122818695606403e-05, + "loss": 0.161, + "step": 1711 + }, + { + "epoch": 0.8912024986985945, + "grad_norm": 0.2700106707452003, + "learning_rate": 4.121749680180818e-05, + "loss": 0.1564, + "step": 1712 + }, + { + "epoch": 0.8917230609057782, + "grad_norm": 0.26880881799117995, + "learning_rate": 4.1206801525312144e-05, + "loss": 0.1584, + "step": 1713 + }, + { + "epoch": 0.892243623112962, + "grad_norm": 0.2658890951024931, + "learning_rate": 4.119610112995398e-05, + "loss": 0.1567, + "step": 1714 + }, + { + "epoch": 0.8927641853201458, + "grad_norm": 0.2546814903680581, + "learning_rate": 4.118539561911339e-05, + "loss": 0.1554, + "step": 1715 + }, + { + "epoch": 0.8932847475273296, + "grad_norm": 0.24712755281104856, + "learning_rate": 4.1174684996171644e-05, + "loss": 0.1535, + "step": 1716 + }, + { + "epoch": 0.8938053097345132, + "grad_norm": 0.2806241667002411, + "learning_rate": 4.116396926451168e-05, + "loss": 0.1563, + "step": 1717 + }, + { + "epoch": 0.894325871941697, + "grad_norm": 0.27342193998178105, + "learning_rate": 4.115324842751802e-05, + "loss": 0.1592, + "step": 1718 + }, + { + "epoch": 0.8948464341488808, + "grad_norm": 0.23962746337276375, + "learning_rate": 4.114252248857679e-05, + "loss": 0.1472, + "step": 1719 + }, + { + "epoch": 0.8953669963560645, + "grad_norm": 0.26324287304353916, + "learning_rate": 4.1131791451075755e-05, + "loss": 0.159, + "step": 1720 + }, + { + "epoch": 0.8958875585632483, + "grad_norm": 0.26150443069954205, + "learning_rate": 4.1121055318404264e-05, + "loss": 0.1539, + "step": 1721 + }, + { + "epoch": 0.8964081207704321, + "grad_norm": 0.24755890951530543, + "learning_rate": 4.1110314093953305e-05, + "loss": 0.1582, + "step": 1722 + }, + { + "epoch": 0.8969286829776159, + "grad_norm": 0.27523785393031364, + "learning_rate": 4.109956778111544e-05, + "loss": 0.1629, + "step": 1723 + }, + { + "epoch": 0.8974492451847996, + "grad_norm": 0.2591994387571103, + "learning_rate": 4.108881638328486e-05, + "loss": 0.1478, + "step": 1724 + }, + { + "epoch": 0.8979698073919833, + "grad_norm": 0.29966605704015586, + "learning_rate": 4.1078059903857355e-05, + "loss": 0.1601, + "step": 1725 + }, + { + "epoch": 0.8984903695991671, + "grad_norm": 0.25390075851127014, + "learning_rate": 4.1067298346230335e-05, + "loss": 0.1501, + "step": 1726 + }, + { + "epoch": 0.8990109318063508, + "grad_norm": 0.2687980166473991, + "learning_rate": 4.105653171380278e-05, + "loss": 0.1498, + "step": 1727 + }, + { + "epoch": 0.8995314940135346, + "grad_norm": 0.28950396723952554, + "learning_rate": 4.10457600099753e-05, + "loss": 0.1549, + "step": 1728 + }, + { + "epoch": 0.9000520562207184, + "grad_norm": 0.25514115726981085, + "learning_rate": 4.103498323815011e-05, + "loss": 0.1647, + "step": 1729 + }, + { + "epoch": 0.9005726184279021, + "grad_norm": 0.2587811609001319, + "learning_rate": 4.1024201401731005e-05, + "loss": 0.1583, + "step": 1730 + }, + { + "epoch": 0.9010931806350859, + "grad_norm": 0.2604547864469705, + "learning_rate": 4.1013414504123396e-05, + "loss": 0.1561, + "step": 1731 + }, + { + "epoch": 0.9016137428422697, + "grad_norm": 0.24850657249226776, + "learning_rate": 4.1002622548734296e-05, + "loss": 0.1522, + "step": 1732 + }, + { + "epoch": 0.9021343050494534, + "grad_norm": 0.2625089988338353, + "learning_rate": 4.099182553897229e-05, + "loss": 0.1618, + "step": 1733 + }, + { + "epoch": 0.9026548672566371, + "grad_norm": 0.2632632791671295, + "learning_rate": 4.098102347824758e-05, + "loss": 0.1555, + "step": 1734 + }, + { + "epoch": 0.9031754294638209, + "grad_norm": 0.23666054780572104, + "learning_rate": 4.097021636997196e-05, + "loss": 0.1595, + "step": 1735 + }, + { + "epoch": 0.9036959916710047, + "grad_norm": 0.2591388444201557, + "learning_rate": 4.095940421755883e-05, + "loss": 0.1656, + "step": 1736 + }, + { + "epoch": 0.9042165538781884, + "grad_norm": 0.2591712441056336, + "learning_rate": 4.094858702442316e-05, + "loss": 0.1526, + "step": 1737 + }, + { + "epoch": 0.9047371160853722, + "grad_norm": 0.24844682559914494, + "learning_rate": 4.093776479398151e-05, + "loss": 0.157, + "step": 1738 + }, + { + "epoch": 0.905257678292556, + "grad_norm": 0.2559696276702533, + "learning_rate": 4.092693752965208e-05, + "loss": 0.155, + "step": 1739 + }, + { + "epoch": 0.9057782404997398, + "grad_norm": 0.2568180726661143, + "learning_rate": 4.091610523485458e-05, + "loss": 0.1567, + "step": 1740 + }, + { + "epoch": 0.9062988027069234, + "grad_norm": 0.27176199980897775, + "learning_rate": 4.09052679130104e-05, + "loss": 0.1473, + "step": 1741 + }, + { + "epoch": 0.9068193649141072, + "grad_norm": 0.26678573538868655, + "learning_rate": 4.089442556754243e-05, + "loss": 0.1591, + "step": 1742 + }, + { + "epoch": 0.907339927121291, + "grad_norm": 0.2568264736060921, + "learning_rate": 4.088357820187521e-05, + "loss": 0.1624, + "step": 1743 + }, + { + "epoch": 0.9078604893284747, + "grad_norm": 0.2551009665433581, + "learning_rate": 4.087272581943483e-05, + "loss": 0.164, + "step": 1744 + }, + { + "epoch": 0.9083810515356585, + "grad_norm": 0.2596099419866776, + "learning_rate": 4.0861868423648985e-05, + "loss": 0.1593, + "step": 1745 + }, + { + "epoch": 0.9089016137428423, + "grad_norm": 0.25573551965558855, + "learning_rate": 4.085100601794695e-05, + "loss": 0.1598, + "step": 1746 + }, + { + "epoch": 0.9094221759500261, + "grad_norm": 0.25024824627264436, + "learning_rate": 4.084013860575956e-05, + "loss": 0.1499, + "step": 1747 + }, + { + "epoch": 0.9099427381572098, + "grad_norm": 0.24563669418483275, + "learning_rate": 4.0829266190519264e-05, + "loss": 0.1533, + "step": 1748 + }, + { + "epoch": 0.9104633003643935, + "grad_norm": 0.25857714800578796, + "learning_rate": 4.0818388775660083e-05, + "loss": 0.1616, + "step": 1749 + }, + { + "epoch": 0.9109838625715773, + "grad_norm": 0.25995854352713227, + "learning_rate": 4.08075063646176e-05, + "loss": 0.1607, + "step": 1750 + }, + { + "epoch": 0.911504424778761, + "grad_norm": 0.24199870728320377, + "learning_rate": 4.079661896082899e-05, + "loss": 0.155, + "step": 1751 + }, + { + "epoch": 0.9120249869859448, + "grad_norm": 0.2585528070362897, + "learning_rate": 4.0785726567733e-05, + "loss": 0.1695, + "step": 1752 + }, + { + "epoch": 0.9125455491931286, + "grad_norm": 0.2535260703066128, + "learning_rate": 4.0774829188769946e-05, + "loss": 0.1553, + "step": 1753 + }, + { + "epoch": 0.9130661114003124, + "grad_norm": 0.24975570980491604, + "learning_rate": 4.076392682738175e-05, + "loss": 0.155, + "step": 1754 + }, + { + "epoch": 0.9135866736074961, + "grad_norm": 0.23729277290872783, + "learning_rate": 4.075301948701186e-05, + "loss": 0.1559, + "step": 1755 + }, + { + "epoch": 0.9141072358146799, + "grad_norm": 0.2576836288112198, + "learning_rate": 4.074210717110534e-05, + "loss": 0.1508, + "step": 1756 + }, + { + "epoch": 0.9146277980218636, + "grad_norm": 0.23905943214710218, + "learning_rate": 4.07311898831088e-05, + "loss": 0.1595, + "step": 1757 + }, + { + "epoch": 0.9151483602290473, + "grad_norm": 0.2623063356166442, + "learning_rate": 4.072026762647043e-05, + "loss": 0.155, + "step": 1758 + }, + { + "epoch": 0.9156689224362311, + "grad_norm": 0.25004028247757853, + "learning_rate": 4.070934040463998e-05, + "loss": 0.151, + "step": 1759 + }, + { + "epoch": 0.9161894846434149, + "grad_norm": 0.24742792524698254, + "learning_rate": 4.069840822106879e-05, + "loss": 0.158, + "step": 1760 + }, + { + "epoch": 0.9167100468505986, + "grad_norm": 0.2536956962460334, + "learning_rate": 4.068747107920974e-05, + "loss": 0.1525, + "step": 1761 + }, + { + "epoch": 0.9172306090577824, + "grad_norm": 0.2483073074101714, + "learning_rate": 4.067652898251729e-05, + "loss": 0.1516, + "step": 1762 + }, + { + "epoch": 0.9177511712649662, + "grad_norm": 0.24230992836435572, + "learning_rate": 4.066558193444746e-05, + "loss": 0.1521, + "step": 1763 + }, + { + "epoch": 0.91827173347215, + "grad_norm": 0.26776759518460846, + "learning_rate": 4.065462993845784e-05, + "loss": 0.1615, + "step": 1764 + }, + { + "epoch": 0.9187922956793336, + "grad_norm": 0.247701252468107, + "learning_rate": 4.0643672998007593e-05, + "loss": 0.156, + "step": 1765 + }, + { + "epoch": 0.9193128578865174, + "grad_norm": 0.2525494661335194, + "learning_rate": 4.063271111655741e-05, + "loss": 0.1544, + "step": 1766 + }, + { + "epoch": 0.9198334200937012, + "grad_norm": 0.24152047171275015, + "learning_rate": 4.062174429756958e-05, + "loss": 0.1582, + "step": 1767 + }, + { + "epoch": 0.9203539823008849, + "grad_norm": 0.24113773563979826, + "learning_rate": 4.0610772544507925e-05, + "loss": 0.157, + "step": 1768 + }, + { + "epoch": 0.9208745445080687, + "grad_norm": 0.24499303443994122, + "learning_rate": 4.059979586083783e-05, + "loss": 0.1589, + "step": 1769 + }, + { + "epoch": 0.9213951067152525, + "grad_norm": 0.2441852531096595, + "learning_rate": 4.0588814250026255e-05, + "loss": 0.1514, + "step": 1770 + }, + { + "epoch": 0.9219156689224363, + "grad_norm": 0.2618027069982354, + "learning_rate": 4.05778277155417e-05, + "loss": 0.1581, + "step": 1771 + }, + { + "epoch": 0.92243623112962, + "grad_norm": 0.2604906720770046, + "learning_rate": 4.056683626085422e-05, + "loss": 0.161, + "step": 1772 + }, + { + "epoch": 0.9229567933368037, + "grad_norm": 0.23267751863286024, + "learning_rate": 4.0555839889435446e-05, + "loss": 0.1482, + "step": 1773 + }, + { + "epoch": 0.9234773555439875, + "grad_norm": 0.25641954777935727, + "learning_rate": 4.054483860475851e-05, + "loss": 0.1541, + "step": 1774 + }, + { + "epoch": 0.9239979177511712, + "grad_norm": 0.2596515890918495, + "learning_rate": 4.053383241029815e-05, + "loss": 0.1583, + "step": 1775 + }, + { + "epoch": 0.924518479958355, + "grad_norm": 0.23481778190953714, + "learning_rate": 4.0522821309530635e-05, + "loss": 0.1548, + "step": 1776 + }, + { + "epoch": 0.9250390421655388, + "grad_norm": 0.24362441004243998, + "learning_rate": 4.051180530593379e-05, + "loss": 0.1584, + "step": 1777 + }, + { + "epoch": 0.9255596043727226, + "grad_norm": 0.2520010539214911, + "learning_rate": 4.0500784402986956e-05, + "loss": 0.1551, + "step": 1778 + }, + { + "epoch": 0.9260801665799063, + "grad_norm": 0.26110890320468977, + "learning_rate": 4.0489758604171076e-05, + "loss": 0.1531, + "step": 1779 + }, + { + "epoch": 0.9266007287870901, + "grad_norm": 0.27497633290869694, + "learning_rate": 4.047872791296859e-05, + "loss": 0.1618, + "step": 1780 + }, + { + "epoch": 0.9271212909942738, + "grad_norm": 0.26975908040013474, + "learning_rate": 4.0467692332863515e-05, + "loss": 0.1487, + "step": 1781 + }, + { + "epoch": 0.9276418532014575, + "grad_norm": 0.26862955487856527, + "learning_rate": 4.04566518673414e-05, + "loss": 0.1561, + "step": 1782 + }, + { + "epoch": 0.9281624154086413, + "grad_norm": 0.25511699277211003, + "learning_rate": 4.044560651988933e-05, + "loss": 0.1615, + "step": 1783 + }, + { + "epoch": 0.9286829776158251, + "grad_norm": 0.2667422054425098, + "learning_rate": 4.043455629399594e-05, + "loss": 0.1582, + "step": 1784 + }, + { + "epoch": 0.9292035398230089, + "grad_norm": 0.2706278456881864, + "learning_rate": 4.0423501193151416e-05, + "loss": 0.1537, + "step": 1785 + }, + { + "epoch": 0.9297241020301926, + "grad_norm": 0.2540911793302258, + "learning_rate": 4.041244122084747e-05, + "loss": 0.1513, + "step": 1786 + }, + { + "epoch": 0.9302446642373764, + "grad_norm": 0.25585354118375886, + "learning_rate": 4.040137638057735e-05, + "loss": 0.1558, + "step": 1787 + }, + { + "epoch": 0.9307652264445602, + "grad_norm": 0.23363718441591247, + "learning_rate": 4.039030667583585e-05, + "loss": 0.1468, + "step": 1788 + }, + { + "epoch": 0.9312857886517438, + "grad_norm": 0.2779496142875094, + "learning_rate": 4.037923211011929e-05, + "loss": 0.1513, + "step": 1789 + }, + { + "epoch": 0.9318063508589276, + "grad_norm": 0.235742116241504, + "learning_rate": 4.036815268692556e-05, + "loss": 0.1501, + "step": 1790 + }, + { + "epoch": 0.9323269130661114, + "grad_norm": 0.2561251442991573, + "learning_rate": 4.035706840975403e-05, + "loss": 0.1611, + "step": 1791 + }, + { + "epoch": 0.9328474752732951, + "grad_norm": 0.2564452558632798, + "learning_rate": 4.0345979282105637e-05, + "loss": 0.1573, + "step": 1792 + }, + { + "epoch": 0.9333680374804789, + "grad_norm": 0.2545473340743804, + "learning_rate": 4.033488530748285e-05, + "loss": 0.1543, + "step": 1793 + }, + { + "epoch": 0.9338885996876627, + "grad_norm": 0.25319840729825216, + "learning_rate": 4.032378648938966e-05, + "loss": 0.1591, + "step": 1794 + }, + { + "epoch": 0.9344091618948465, + "grad_norm": 0.24462206700432024, + "learning_rate": 4.031268283133158e-05, + "loss": 0.1492, + "step": 1795 + }, + { + "epoch": 0.9349297241020302, + "grad_norm": 0.2442034089324051, + "learning_rate": 4.030157433681568e-05, + "loss": 0.1564, + "step": 1796 + }, + { + "epoch": 0.9354502863092139, + "grad_norm": 0.23610337924487748, + "learning_rate": 4.0290461009350535e-05, + "loss": 0.1446, + "step": 1797 + }, + { + "epoch": 0.9359708485163977, + "grad_norm": 0.7555257854589889, + "learning_rate": 4.0279342852446234e-05, + "loss": 0.154, + "step": 1798 + }, + { + "epoch": 0.9364914107235814, + "grad_norm": 0.2730083918906193, + "learning_rate": 4.026821986961443e-05, + "loss": 0.1557, + "step": 1799 + }, + { + "epoch": 0.9370119729307652, + "grad_norm": 0.24844427506768513, + "learning_rate": 4.0257092064368266e-05, + "loss": 0.1556, + "step": 1800 + }, + { + "epoch": 0.937532535137949, + "grad_norm": 0.2550146011348462, + "learning_rate": 4.0245959440222425e-05, + "loss": 0.1526, + "step": 1801 + }, + { + "epoch": 0.9380530973451328, + "grad_norm": 0.2643186732751096, + "learning_rate": 4.023482200069311e-05, + "loss": 0.1648, + "step": 1802 + }, + { + "epoch": 0.9385736595523165, + "grad_norm": 0.2375368349668477, + "learning_rate": 4.0223679749298025e-05, + "loss": 0.1558, + "step": 1803 + }, + { + "epoch": 0.9390942217595003, + "grad_norm": 0.24780404748279455, + "learning_rate": 4.021253268955644e-05, + "loss": 0.1558, + "step": 1804 + }, + { + "epoch": 0.939614783966684, + "grad_norm": 0.24937088179496925, + "learning_rate": 4.02013808249891e-05, + "loss": 0.1525, + "step": 1805 + }, + { + "epoch": 0.9401353461738677, + "grad_norm": 0.247516280623822, + "learning_rate": 4.019022415911828e-05, + "loss": 0.1497, + "step": 1806 + }, + { + "epoch": 0.9406559083810515, + "grad_norm": 0.2565387438992933, + "learning_rate": 4.0179062695467784e-05, + "loss": 0.154, + "step": 1807 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.25957570816875336, + "learning_rate": 4.016789643756291e-05, + "loss": 0.1575, + "step": 1808 + }, + { + "epoch": 0.941697032795419, + "grad_norm": 0.25711864420404396, + "learning_rate": 4.0156725388930495e-05, + "loss": 0.1583, + "step": 1809 + }, + { + "epoch": 0.9422175950026028, + "grad_norm": 0.2439002536135534, + "learning_rate": 4.014554955309886e-05, + "loss": 0.1538, + "step": 1810 + }, + { + "epoch": 0.9427381572097866, + "grad_norm": 0.2734062021761085, + "learning_rate": 4.0134368933597863e-05, + "loss": 0.1587, + "step": 1811 + }, + { + "epoch": 0.9432587194169704, + "grad_norm": 0.24078383939318468, + "learning_rate": 4.012318353395887e-05, + "loss": 0.1541, + "step": 1812 + }, + { + "epoch": 0.943779281624154, + "grad_norm": 0.25527670699498406, + "learning_rate": 4.011199335771475e-05, + "loss": 0.1641, + "step": 1813 + }, + { + "epoch": 0.9442998438313378, + "grad_norm": 0.26131896589926484, + "learning_rate": 4.010079840839987e-05, + "loss": 0.1602, + "step": 1814 + }, + { + "epoch": 0.9448204060385216, + "grad_norm": 0.2457034759853004, + "learning_rate": 4.0089598689550126e-05, + "loss": 0.149, + "step": 1815 + }, + { + "epoch": 0.9453409682457053, + "grad_norm": 0.2466235816689623, + "learning_rate": 4.0078394204702895e-05, + "loss": 0.151, + "step": 1816 + }, + { + "epoch": 0.9458615304528891, + "grad_norm": 0.261112482926145, + "learning_rate": 4.0067184957397096e-05, + "loss": 0.1536, + "step": 1817 + }, + { + "epoch": 0.9463820926600729, + "grad_norm": 0.2758752170090889, + "learning_rate": 4.0055970951173116e-05, + "loss": 0.1633, + "step": 1818 + }, + { + "epoch": 0.9469026548672567, + "grad_norm": 0.266272488103662, + "learning_rate": 4.004475218957287e-05, + "loss": 0.1568, + "step": 1819 + }, + { + "epoch": 0.9474232170744404, + "grad_norm": 0.2733192385804763, + "learning_rate": 4.003352867613975e-05, + "loss": 0.1526, + "step": 1820 + }, + { + "epoch": 0.9479437792816241, + "grad_norm": 0.2502565692501297, + "learning_rate": 4.002230041441868e-05, + "loss": 0.1498, + "step": 1821 + }, + { + "epoch": 0.9484643414888079, + "grad_norm": 0.2981229168833489, + "learning_rate": 4.001106740795607e-05, + "loss": 0.1535, + "step": 1822 + }, + { + "epoch": 0.9489849036959916, + "grad_norm": 0.25792533892269764, + "learning_rate": 3.9999829660299806e-05, + "loss": 0.1546, + "step": 1823 + }, + { + "epoch": 0.9495054659031754, + "grad_norm": 0.2754778565483347, + "learning_rate": 3.998858717499931e-05, + "loss": 0.154, + "step": 1824 + }, + { + "epoch": 0.9500260281103592, + "grad_norm": 0.25581628643187587, + "learning_rate": 3.997733995560547e-05, + "loss": 0.1588, + "step": 1825 + }, + { + "epoch": 0.950546590317543, + "grad_norm": 0.29974295677568763, + "learning_rate": 3.9966088005670686e-05, + "loss": 0.1581, + "step": 1826 + }, + { + "epoch": 0.9510671525247267, + "grad_norm": 0.2566154047129761, + "learning_rate": 3.995483132874885e-05, + "loss": 0.1521, + "step": 1827 + }, + { + "epoch": 0.9515877147319105, + "grad_norm": 0.26876875526607946, + "learning_rate": 3.994356992839535e-05, + "loss": 0.1608, + "step": 1828 + }, + { + "epoch": 0.9521082769390942, + "grad_norm": 0.2774171688241344, + "learning_rate": 3.993230380816705e-05, + "loss": 0.1515, + "step": 1829 + }, + { + "epoch": 0.9526288391462779, + "grad_norm": 0.24564551896719408, + "learning_rate": 3.9921032971622306e-05, + "loss": 0.1561, + "step": 1830 + }, + { + "epoch": 0.9531494013534617, + "grad_norm": 0.2463503110268932, + "learning_rate": 3.9909757422321e-05, + "loss": 0.1537, + "step": 1831 + }, + { + "epoch": 0.9536699635606455, + "grad_norm": 0.267020735900316, + "learning_rate": 3.9898477163824454e-05, + "loss": 0.1522, + "step": 1832 + }, + { + "epoch": 0.9541905257678293, + "grad_norm": 0.2602448863644486, + "learning_rate": 3.98871921996955e-05, + "loss": 0.1571, + "step": 1833 + }, + { + "epoch": 0.954711087975013, + "grad_norm": 0.24776161325403143, + "learning_rate": 3.9875902533498465e-05, + "loss": 0.1578, + "step": 1834 + }, + { + "epoch": 0.9552316501821968, + "grad_norm": 0.25779665449960754, + "learning_rate": 3.986460816879913e-05, + "loss": 0.1496, + "step": 1835 + }, + { + "epoch": 0.9557522123893806, + "grad_norm": 0.2701760173076487, + "learning_rate": 3.985330910916482e-05, + "loss": 0.1569, + "step": 1836 + }, + { + "epoch": 0.9562727745965642, + "grad_norm": 0.25325580552113314, + "learning_rate": 3.984200535816427e-05, + "loss": 0.1566, + "step": 1837 + }, + { + "epoch": 0.956793336803748, + "grad_norm": 0.29664763959854834, + "learning_rate": 3.983069691936773e-05, + "loss": 0.1534, + "step": 1838 + }, + { + "epoch": 0.9573138990109318, + "grad_norm": 0.27443913965950784, + "learning_rate": 3.981938379634696e-05, + "loss": 0.1587, + "step": 1839 + }, + { + "epoch": 0.9578344612181156, + "grad_norm": 0.3036453223080745, + "learning_rate": 3.980806599267514e-05, + "loss": 0.1622, + "step": 1840 + }, + { + "epoch": 0.9583550234252993, + "grad_norm": 0.2703683477848925, + "learning_rate": 3.979674351192697e-05, + "loss": 0.1512, + "step": 1841 + }, + { + "epoch": 0.9588755856324831, + "grad_norm": 0.25118682959451855, + "learning_rate": 3.978541635767862e-05, + "loss": 0.1479, + "step": 1842 + }, + { + "epoch": 0.9593961478396669, + "grad_norm": 0.27007643798115183, + "learning_rate": 3.977408453350773e-05, + "loss": 0.1478, + "step": 1843 + }, + { + "epoch": 0.9599167100468506, + "grad_norm": 0.24905501625360876, + "learning_rate": 3.976274804299342e-05, + "loss": 0.1492, + "step": 1844 + }, + { + "epoch": 0.9604372722540343, + "grad_norm": 0.2721594566362824, + "learning_rate": 3.975140688971628e-05, + "loss": 0.1577, + "step": 1845 + }, + { + "epoch": 0.9609578344612181, + "grad_norm": 0.26822647420622714, + "learning_rate": 3.974006107725837e-05, + "loss": 0.1542, + "step": 1846 + }, + { + "epoch": 0.9614783966684018, + "grad_norm": 0.2610073474569343, + "learning_rate": 3.972871060920323e-05, + "loss": 0.1573, + "step": 1847 + }, + { + "epoch": 0.9619989588755856, + "grad_norm": 0.24741924184000724, + "learning_rate": 3.971735548913586e-05, + "loss": 0.1572, + "step": 1848 + }, + { + "epoch": 0.9625195210827694, + "grad_norm": 0.2497560572681831, + "learning_rate": 3.970599572064275e-05, + "loss": 0.1627, + "step": 1849 + }, + { + "epoch": 0.9630400832899532, + "grad_norm": 0.261255891586638, + "learning_rate": 3.969463130731183e-05, + "loss": 0.146, + "step": 1850 + }, + { + "epoch": 0.9635606454971369, + "grad_norm": 0.2661862951079788, + "learning_rate": 3.968326225273251e-05, + "loss": 0.1549, + "step": 1851 + }, + { + "epoch": 0.9640812077043207, + "grad_norm": 0.26696523143923034, + "learning_rate": 3.9671888560495676e-05, + "loss": 0.1584, + "step": 1852 + }, + { + "epoch": 0.9646017699115044, + "grad_norm": 0.28425969486144664, + "learning_rate": 3.966051023419366e-05, + "loss": 0.1585, + "step": 1853 + }, + { + "epoch": 0.9651223321186881, + "grad_norm": 0.2252880385971139, + "learning_rate": 3.964912727742027e-05, + "loss": 0.1468, + "step": 1854 + }, + { + "epoch": 0.9656428943258719, + "grad_norm": 0.2746297685611525, + "learning_rate": 3.963773969377077e-05, + "loss": 0.1526, + "step": 1855 + }, + { + "epoch": 0.9661634565330557, + "grad_norm": 0.2562090900129971, + "learning_rate": 3.9626347486841896e-05, + "loss": 0.1574, + "step": 1856 + }, + { + "epoch": 0.9666840187402395, + "grad_norm": 0.2641947002589958, + "learning_rate": 3.961495066023184e-05, + "loss": 0.1579, + "step": 1857 + }, + { + "epoch": 0.9672045809474232, + "grad_norm": 0.2576610491884646, + "learning_rate": 3.9603549217540235e-05, + "loss": 0.1566, + "step": 1858 + }, + { + "epoch": 0.967725143154607, + "grad_norm": 0.22546307618447003, + "learning_rate": 3.959214316236821e-05, + "loss": 0.1451, + "step": 1859 + }, + { + "epoch": 0.9682457053617908, + "grad_norm": 0.27228400782836965, + "learning_rate": 3.95807324983183e-05, + "loss": 0.1544, + "step": 1860 + }, + { + "epoch": 0.9687662675689744, + "grad_norm": 0.2404970285210459, + "learning_rate": 3.956931722899454e-05, + "loss": 0.1586, + "step": 1861 + }, + { + "epoch": 0.9692868297761582, + "grad_norm": 0.2483984351066557, + "learning_rate": 3.955789735800241e-05, + "loss": 0.1519, + "step": 1862 + }, + { + "epoch": 0.969807391983342, + "grad_norm": 0.26080827724904837, + "learning_rate": 3.954647288894883e-05, + "loss": 0.1577, + "step": 1863 + }, + { + "epoch": 0.9703279541905258, + "grad_norm": 0.24279634879644127, + "learning_rate": 3.953504382544216e-05, + "loss": 0.1529, + "step": 1864 + }, + { + "epoch": 0.9708485163977095, + "grad_norm": 0.24834103365291627, + "learning_rate": 3.952361017109226e-05, + "loss": 0.1494, + "step": 1865 + }, + { + "epoch": 0.9713690786048933, + "grad_norm": 0.24460951509898635, + "learning_rate": 3.95121719295104e-05, + "loss": 0.1571, + "step": 1866 + }, + { + "epoch": 0.9718896408120771, + "grad_norm": 0.26683915335760894, + "learning_rate": 3.95007291043093e-05, + "loss": 0.1531, + "step": 1867 + }, + { + "epoch": 0.9724102030192608, + "grad_norm": 0.24054568963656242, + "learning_rate": 3.9489281699103145e-05, + "loss": 0.1516, + "step": 1868 + }, + { + "epoch": 0.9729307652264445, + "grad_norm": 0.24375115372025324, + "learning_rate": 3.947782971750755e-05, + "loss": 0.1488, + "step": 1869 + }, + { + "epoch": 0.9734513274336283, + "grad_norm": 0.25412067934433286, + "learning_rate": 3.94663731631396e-05, + "loss": 0.1607, + "step": 1870 + }, + { + "epoch": 0.973971889640812, + "grad_norm": 0.2397702543825332, + "learning_rate": 3.945491203961779e-05, + "loss": 0.1511, + "step": 1871 + }, + { + "epoch": 0.9744924518479958, + "grad_norm": 0.2806540915038973, + "learning_rate": 3.94434463505621e-05, + "loss": 0.154, + "step": 1872 + }, + { + "epoch": 0.9750130140551796, + "grad_norm": 0.2546854416214566, + "learning_rate": 3.9431976099593896e-05, + "loss": 0.1605, + "step": 1873 + }, + { + "epoch": 0.9755335762623634, + "grad_norm": 0.2575288721304389, + "learning_rate": 3.942050129033603e-05, + "loss": 0.1485, + "step": 1874 + }, + { + "epoch": 0.9760541384695471, + "grad_norm": 0.26908803144635207, + "learning_rate": 3.9409021926412795e-05, + "loss": 0.1549, + "step": 1875 + }, + { + "epoch": 0.9765747006767309, + "grad_norm": 0.29289715788111687, + "learning_rate": 3.9397538011449894e-05, + "loss": 0.1563, + "step": 1876 + }, + { + "epoch": 0.9770952628839146, + "grad_norm": 0.23831369536611632, + "learning_rate": 3.938604954907449e-05, + "loss": 0.1541, + "step": 1877 + }, + { + "epoch": 0.9776158250910983, + "grad_norm": 0.2589658450603122, + "learning_rate": 3.9374556542915167e-05, + "loss": 0.1519, + "step": 1878 + }, + { + "epoch": 0.9781363872982821, + "grad_norm": 0.3008301402515451, + "learning_rate": 3.936305899660195e-05, + "loss": 0.1554, + "step": 1879 + }, + { + "epoch": 0.9786569495054659, + "grad_norm": 0.2579134695099253, + "learning_rate": 3.935155691376631e-05, + "loss": 0.16, + "step": 1880 + }, + { + "epoch": 0.9791775117126497, + "grad_norm": 0.2755461345867494, + "learning_rate": 3.934005029804112e-05, + "loss": 0.1495, + "step": 1881 + }, + { + "epoch": 0.9796980739198334, + "grad_norm": 0.25850824899183367, + "learning_rate": 3.9328539153060725e-05, + "loss": 0.1589, + "step": 1882 + }, + { + "epoch": 0.9802186361270172, + "grad_norm": 0.2823202784003672, + "learning_rate": 3.931702348246087e-05, + "loss": 0.1588, + "step": 1883 + }, + { + "epoch": 0.980739198334201, + "grad_norm": 0.25907134737796256, + "learning_rate": 3.930550328987875e-05, + "loss": 0.1516, + "step": 1884 + }, + { + "epoch": 0.9812597605413846, + "grad_norm": 0.26429422471247077, + "learning_rate": 3.929397857895297e-05, + "loss": 0.1571, + "step": 1885 + }, + { + "epoch": 0.9817803227485684, + "grad_norm": 0.25776033042758834, + "learning_rate": 3.928244935332356e-05, + "loss": 0.1565, + "step": 1886 + }, + { + "epoch": 0.9823008849557522, + "grad_norm": 0.23117386280694466, + "learning_rate": 3.9270915616632e-05, + "loss": 0.1532, + "step": 1887 + }, + { + "epoch": 0.982821447162936, + "grad_norm": 0.2589601378005726, + "learning_rate": 3.9259377372521176e-05, + "loss": 0.1603, + "step": 1888 + }, + { + "epoch": 0.9833420093701197, + "grad_norm": 0.24736053021683718, + "learning_rate": 3.924783462463541e-05, + "loss": 0.1482, + "step": 1889 + }, + { + "epoch": 0.9838625715773035, + "grad_norm": 0.24050172557296737, + "learning_rate": 3.923628737662043e-05, + "loss": 0.1558, + "step": 1890 + }, + { + "epoch": 0.9843831337844873, + "grad_norm": 0.25204285474497545, + "learning_rate": 3.9224735632123395e-05, + "loss": 0.1588, + "step": 1891 + }, + { + "epoch": 0.984903695991671, + "grad_norm": 0.24569292644864882, + "learning_rate": 3.921317939479289e-05, + "loss": 0.149, + "step": 1892 + }, + { + "epoch": 0.9854242581988547, + "grad_norm": 0.2591242502253903, + "learning_rate": 3.920161866827889e-05, + "loss": 0.1515, + "step": 1893 + }, + { + "epoch": 0.9859448204060385, + "grad_norm": 0.25314114488184825, + "learning_rate": 3.919005345623285e-05, + "loss": 0.1583, + "step": 1894 + }, + { + "epoch": 0.9864653826132223, + "grad_norm": 0.2707662697095083, + "learning_rate": 3.917848376230757e-05, + "loss": 0.1474, + "step": 1895 + }, + { + "epoch": 0.986985944820406, + "grad_norm": 0.2489741320109898, + "learning_rate": 3.916690959015731e-05, + "loss": 0.1604, + "step": 1896 + }, + { + "epoch": 0.9875065070275898, + "grad_norm": 0.26650693500099853, + "learning_rate": 3.915533094343773e-05, + "loss": 0.1484, + "step": 1897 + }, + { + "epoch": 0.9880270692347736, + "grad_norm": 0.24576821619143538, + "learning_rate": 3.914374782580591e-05, + "loss": 0.1533, + "step": 1898 + }, + { + "epoch": 0.9885476314419573, + "grad_norm": 0.2242445667075516, + "learning_rate": 3.913216024092032e-05, + "loss": 0.1453, + "step": 1899 + }, + { + "epoch": 0.9890681936491411, + "grad_norm": 0.24170044956082523, + "learning_rate": 3.912056819244089e-05, + "loss": 0.15, + "step": 1900 + }, + { + "epoch": 0.9895887558563248, + "grad_norm": 0.23999183798149767, + "learning_rate": 3.910897168402889e-05, + "loss": 0.1445, + "step": 1901 + }, + { + "epoch": 0.9901093180635085, + "grad_norm": 0.24978308599133406, + "learning_rate": 3.909737071934707e-05, + "loss": 0.1516, + "step": 1902 + }, + { + "epoch": 0.9906298802706923, + "grad_norm": 0.2477655018309477, + "learning_rate": 3.9085765302059554e-05, + "loss": 0.1617, + "step": 1903 + }, + { + "epoch": 0.9911504424778761, + "grad_norm": 0.24166671338937565, + "learning_rate": 3.907415543583184e-05, + "loss": 0.1537, + "step": 1904 + }, + { + "epoch": 0.9916710046850599, + "grad_norm": 0.24364319933406497, + "learning_rate": 3.9062541124330884e-05, + "loss": 0.1529, + "step": 1905 + }, + { + "epoch": 0.9921915668922436, + "grad_norm": 0.2646257000445511, + "learning_rate": 3.905092237122504e-05, + "loss": 0.1554, + "step": 1906 + }, + { + "epoch": 0.9927121290994274, + "grad_norm": 0.2496525283069466, + "learning_rate": 3.903929918018403e-05, + "loss": 0.1511, + "step": 1907 + }, + { + "epoch": 0.9932326913066112, + "grad_norm": 0.2309200833828225, + "learning_rate": 3.902767155487901e-05, + "loss": 0.154, + "step": 1908 + }, + { + "epoch": 0.9937532535137948, + "grad_norm": 0.2451270375417471, + "learning_rate": 3.9016039498982515e-05, + "loss": 0.1453, + "step": 1909 + }, + { + "epoch": 0.9942738157209786, + "grad_norm": 0.2699658100912516, + "learning_rate": 3.90044030161685e-05, + "loss": 0.1545, + "step": 1910 + }, + { + "epoch": 0.9947943779281624, + "grad_norm": 0.26692292482327223, + "learning_rate": 3.8992762110112304e-05, + "loss": 0.1561, + "step": 1911 + }, + { + "epoch": 0.9953149401353462, + "grad_norm": 0.24585278390934867, + "learning_rate": 3.8981116784490666e-05, + "loss": 0.147, + "step": 1912 + }, + { + "epoch": 0.9958355023425299, + "grad_norm": 0.25147680505901215, + "learning_rate": 3.896946704298172e-05, + "loss": 0.1521, + "step": 1913 + }, + { + "epoch": 0.9963560645497137, + "grad_norm": 0.25230893137578037, + "learning_rate": 3.8957812889265e-05, + "loss": 0.1541, + "step": 1914 + }, + { + "epoch": 0.9968766267568975, + "grad_norm": 0.24038428447771637, + "learning_rate": 3.8946154327021434e-05, + "loss": 0.1498, + "step": 1915 + }, + { + "epoch": 0.9973971889640812, + "grad_norm": 0.2466022950313137, + "learning_rate": 3.893449135993333e-05, + "loss": 0.1632, + "step": 1916 + }, + { + "epoch": 0.9979177511712649, + "grad_norm": 0.25366577440341587, + "learning_rate": 3.89228239916844e-05, + "loss": 0.146, + "step": 1917 + }, + { + "epoch": 0.9984383133784487, + "grad_norm": 0.24171289918790415, + "learning_rate": 3.8911152225959743e-05, + "loss": 0.1561, + "step": 1918 + }, + { + "epoch": 0.9989588755856325, + "grad_norm": 0.2581505141982399, + "learning_rate": 3.889947606644584e-05, + "loss": 0.1595, + "step": 1919 + }, + { + "epoch": 0.9994794377928162, + "grad_norm": 0.24747053780836764, + "learning_rate": 3.888779551683057e-05, + "loss": 0.1519, + "step": 1920 + }, + { + "epoch": 1.0, + "grad_norm": 0.237124505232837, + "learning_rate": 3.8876110580803186e-05, + "loss": 0.1481, + "step": 1921 + }, + { + "epoch": 1.0005205622071838, + "grad_norm": 0.2661627018308619, + "learning_rate": 3.886442126205435e-05, + "loss": 0.1194, + "step": 1922 + }, + { + "epoch": 1.0010411244143675, + "grad_norm": 0.2520650370090721, + "learning_rate": 3.8852727564276086e-05, + "loss": 0.1134, + "step": 1923 + }, + { + "epoch": 1.0015616866215513, + "grad_norm": 0.22725768987546488, + "learning_rate": 3.884102949116181e-05, + "loss": 0.1149, + "step": 1924 + }, + { + "epoch": 1.002082248828735, + "grad_norm": 0.25305995229234013, + "learning_rate": 3.8829327046406304e-05, + "loss": 0.1111, + "step": 1925 + }, + { + "epoch": 1.0026028110359189, + "grad_norm": 0.27939931613711055, + "learning_rate": 3.881762023370576e-05, + "loss": 0.1104, + "step": 1926 + }, + { + "epoch": 1.0031233732431026, + "grad_norm": 0.3043520089700027, + "learning_rate": 3.880590905675773e-05, + "loss": 0.1232, + "step": 1927 + }, + { + "epoch": 1.0036439354502864, + "grad_norm": 0.261721428329474, + "learning_rate": 3.879419351926115e-05, + "loss": 0.1076, + "step": 1928 + }, + { + "epoch": 1.0041644976574702, + "grad_norm": 0.2700146216167829, + "learning_rate": 3.878247362491633e-05, + "loss": 0.1143, + "step": 1929 + }, + { + "epoch": 1.0046850598646537, + "grad_norm": 0.2512971409772084, + "learning_rate": 3.877074937742495e-05, + "loss": 0.1063, + "step": 1930 + }, + { + "epoch": 1.0052056220718375, + "grad_norm": 0.24157054726586163, + "learning_rate": 3.8759020780490094e-05, + "loss": 0.1139, + "step": 1931 + }, + { + "epoch": 1.0057261842790213, + "grad_norm": 0.25231694869276755, + "learning_rate": 3.8747287837816184e-05, + "loss": 0.1167, + "step": 1932 + }, + { + "epoch": 1.006246746486205, + "grad_norm": 0.28056071702883195, + "learning_rate": 3.8735550553109024e-05, + "loss": 0.1148, + "step": 1933 + }, + { + "epoch": 1.0067673086933888, + "grad_norm": 0.23802118673016223, + "learning_rate": 3.87238089300758e-05, + "loss": 0.1061, + "step": 1934 + }, + { + "epoch": 1.0072878709005726, + "grad_norm": 0.2518313655139843, + "learning_rate": 3.8712062972425077e-05, + "loss": 0.1144, + "step": 1935 + }, + { + "epoch": 1.0078084331077564, + "grad_norm": 0.2687038083950964, + "learning_rate": 3.870031268386676e-05, + "loss": 0.1153, + "step": 1936 + }, + { + "epoch": 1.0083289953149401, + "grad_norm": 0.2618181075208367, + "learning_rate": 3.868855806811212e-05, + "loss": 0.1177, + "step": 1937 + }, + { + "epoch": 1.008849557522124, + "grad_norm": 0.2685854416587716, + "learning_rate": 3.867679912887385e-05, + "loss": 0.1186, + "step": 1938 + }, + { + "epoch": 1.0093701197293077, + "grad_norm": 0.26559471123613715, + "learning_rate": 3.866503586986595e-05, + "loss": 0.1161, + "step": 1939 + }, + { + "epoch": 1.0098906819364915, + "grad_norm": 0.24409238460263524, + "learning_rate": 3.865326829480381e-05, + "loss": 0.1102, + "step": 1940 + }, + { + "epoch": 1.0104112441436752, + "grad_norm": 0.2937743298818115, + "learning_rate": 3.864149640740417e-05, + "loss": 0.1145, + "step": 1941 + }, + { + "epoch": 1.010931806350859, + "grad_norm": 0.25001092468218145, + "learning_rate": 3.862972021138514e-05, + "loss": 0.1117, + "step": 1942 + }, + { + "epoch": 1.0114523685580428, + "grad_norm": 0.24032698365614474, + "learning_rate": 3.86179397104662e-05, + "loss": 0.1083, + "step": 1943 + }, + { + "epoch": 1.0119729307652265, + "grad_norm": 0.26454925482485414, + "learning_rate": 3.860615490836817e-05, + "loss": 0.1122, + "step": 1944 + }, + { + "epoch": 1.01249349297241, + "grad_norm": 0.25931975068403923, + "learning_rate": 3.859436580881325e-05, + "loss": 0.113, + "step": 1945 + }, + { + "epoch": 1.0130140551795939, + "grad_norm": 0.28362163389354017, + "learning_rate": 3.858257241552498e-05, + "loss": 0.1158, + "step": 1946 + }, + { + "epoch": 1.0135346173867776, + "grad_norm": 0.24978572696444806, + "learning_rate": 3.857077473222825e-05, + "loss": 0.1084, + "step": 1947 + }, + { + "epoch": 1.0140551795939614, + "grad_norm": 0.24629450541328152, + "learning_rate": 3.855897276264934e-05, + "loss": 0.1108, + "step": 1948 + }, + { + "epoch": 1.0145757418011452, + "grad_norm": 0.24448076133842503, + "learning_rate": 3.8547166510515854e-05, + "loss": 0.1142, + "step": 1949 + }, + { + "epoch": 1.015096304008329, + "grad_norm": 0.25912362617154444, + "learning_rate": 3.8535355979556755e-05, + "loss": 0.1185, + "step": 1950 + }, + { + "epoch": 1.0156168662155127, + "grad_norm": 0.26406004163474434, + "learning_rate": 3.852354117350235e-05, + "loss": 0.1137, + "step": 1951 + }, + { + "epoch": 1.0161374284226965, + "grad_norm": 0.23169769727224848, + "learning_rate": 3.8511722096084313e-05, + "loss": 0.1058, + "step": 1952 + }, + { + "epoch": 1.0166579906298803, + "grad_norm": 0.2420173219027427, + "learning_rate": 3.8499898751035656e-05, + "loss": 0.1091, + "step": 1953 + }, + { + "epoch": 1.017178552837064, + "grad_norm": 0.24237055917616832, + "learning_rate": 3.848807114209074e-05, + "loss": 0.1121, + "step": 1954 + }, + { + "epoch": 1.0176991150442478, + "grad_norm": 0.2417178678114401, + "learning_rate": 3.8476239272985284e-05, + "loss": 0.1138, + "step": 1955 + }, + { + "epoch": 1.0182196772514316, + "grad_norm": 0.24659252456869502, + "learning_rate": 3.846440314745633e-05, + "loss": 0.1173, + "step": 1956 + }, + { + "epoch": 1.0187402394586154, + "grad_norm": 0.234545477823226, + "learning_rate": 3.8452562769242276e-05, + "loss": 0.1136, + "step": 1957 + }, + { + "epoch": 1.0192608016657991, + "grad_norm": 0.23798416850355458, + "learning_rate": 3.844071814208288e-05, + "loss": 0.1121, + "step": 1958 + }, + { + "epoch": 1.019781363872983, + "grad_norm": 0.26896997384709637, + "learning_rate": 3.842886926971922e-05, + "loss": 0.1119, + "step": 1959 + }, + { + "epoch": 1.0203019260801667, + "grad_norm": 0.2461574213164607, + "learning_rate": 3.8417016155893716e-05, + "loss": 0.1127, + "step": 1960 + }, + { + "epoch": 1.0208224882873504, + "grad_norm": 0.23767808403209464, + "learning_rate": 3.840515880435013e-05, + "loss": 0.1085, + "step": 1961 + }, + { + "epoch": 1.021343050494534, + "grad_norm": 0.2713143335946204, + "learning_rate": 3.839329721883358e-05, + "loss": 0.1111, + "step": 1962 + }, + { + "epoch": 1.0218636127017178, + "grad_norm": 0.2599294251623557, + "learning_rate": 3.8381431403090494e-05, + "loss": 0.1126, + "step": 1963 + }, + { + "epoch": 1.0223841749089015, + "grad_norm": 0.24545293936346022, + "learning_rate": 3.8369561360868656e-05, + "loss": 0.1105, + "step": 1964 + }, + { + "epoch": 1.0229047371160853, + "grad_norm": 0.24512412624134056, + "learning_rate": 3.835768709591717e-05, + "loss": 0.1097, + "step": 1965 + }, + { + "epoch": 1.023425299323269, + "grad_norm": 0.2458438658532382, + "learning_rate": 3.8345808611986485e-05, + "loss": 0.1068, + "step": 1966 + }, + { + "epoch": 1.0239458615304529, + "grad_norm": 0.2426897803892748, + "learning_rate": 3.8333925912828384e-05, + "loss": 0.1108, + "step": 1967 + }, + { + "epoch": 1.0244664237376366, + "grad_norm": 0.24168253147617336, + "learning_rate": 3.832203900219597e-05, + "loss": 0.1103, + "step": 1968 + }, + { + "epoch": 1.0249869859448204, + "grad_norm": 0.25236496660379437, + "learning_rate": 3.8310147883843684e-05, + "loss": 0.109, + "step": 1969 + }, + { + "epoch": 1.0255075481520042, + "grad_norm": 0.25308298024794296, + "learning_rate": 3.829825256152729e-05, + "loss": 0.1119, + "step": 1970 + }, + { + "epoch": 1.026028110359188, + "grad_norm": 0.256204183314334, + "learning_rate": 3.82863530390039e-05, + "loss": 0.1108, + "step": 1971 + }, + { + "epoch": 1.0265486725663717, + "grad_norm": 0.24156358649582654, + "learning_rate": 3.8274449320031926e-05, + "loss": 0.1084, + "step": 1972 + }, + { + "epoch": 1.0270692347735555, + "grad_norm": 0.24950618643926795, + "learning_rate": 3.826254140837111e-05, + "loss": 0.1087, + "step": 1973 + }, + { + "epoch": 1.0275897969807393, + "grad_norm": 0.2704998223408513, + "learning_rate": 3.8250629307782535e-05, + "loss": 0.1133, + "step": 1974 + }, + { + "epoch": 1.028110359187923, + "grad_norm": 0.25017914226933313, + "learning_rate": 3.8238713022028595e-05, + "loss": 0.1105, + "step": 1975 + }, + { + "epoch": 1.0286309213951068, + "grad_norm": 0.2327093355495663, + "learning_rate": 3.8226792554873004e-05, + "loss": 0.113, + "step": 1976 + }, + { + "epoch": 1.0291514836022904, + "grad_norm": 0.2719152461496494, + "learning_rate": 3.821486791008081e-05, + "loss": 0.1239, + "step": 1977 + }, + { + "epoch": 1.0296720458094741, + "grad_norm": 0.2397787036646458, + "learning_rate": 3.820293909141835e-05, + "loss": 0.1153, + "step": 1978 + }, + { + "epoch": 1.030192608016658, + "grad_norm": 0.24038853770587257, + "learning_rate": 3.819100610265332e-05, + "loss": 0.1184, + "step": 1979 + }, + { + "epoch": 1.0307131702238417, + "grad_norm": 0.24961313789239628, + "learning_rate": 3.8179068947554705e-05, + "loss": 0.1124, + "step": 1980 + }, + { + "epoch": 1.0312337324310255, + "grad_norm": 0.2382377547825293, + "learning_rate": 3.8167127629892815e-05, + "loss": 0.11, + "step": 1981 + }, + { + "epoch": 1.0317542946382092, + "grad_norm": 0.2532706658437514, + "learning_rate": 3.815518215343928e-05, + "loss": 0.1179, + "step": 1982 + }, + { + "epoch": 1.032274856845393, + "grad_norm": 0.2656733745200885, + "learning_rate": 3.8143232521967023e-05, + "loss": 0.1149, + "step": 1983 + }, + { + "epoch": 1.0327954190525768, + "grad_norm": 0.23840852166865217, + "learning_rate": 3.813127873925031e-05, + "loss": 0.1068, + "step": 1984 + }, + { + "epoch": 1.0333159812597605, + "grad_norm": 0.25521240664720873, + "learning_rate": 3.811932080906468e-05, + "loss": 0.1085, + "step": 1985 + }, + { + "epoch": 1.0338365434669443, + "grad_norm": 0.24469566160163492, + "learning_rate": 3.8107358735187036e-05, + "loss": 0.1128, + "step": 1986 + }, + { + "epoch": 1.034357105674128, + "grad_norm": 0.25281133665260414, + "learning_rate": 3.809539252139553e-05, + "loss": 0.1128, + "step": 1987 + }, + { + "epoch": 1.0348776678813119, + "grad_norm": 0.236637048209141, + "learning_rate": 3.8083422171469666e-05, + "loss": 0.104, + "step": 1988 + }, + { + "epoch": 1.0353982300884956, + "grad_norm": 0.2729698170542485, + "learning_rate": 3.807144768919022e-05, + "loss": 0.1153, + "step": 1989 + }, + { + "epoch": 1.0359187922956794, + "grad_norm": 0.26111532376784635, + "learning_rate": 3.8059469078339305e-05, + "loss": 0.1096, + "step": 1990 + }, + { + "epoch": 1.0364393545028632, + "grad_norm": 0.2528115929930001, + "learning_rate": 3.8047486342700314e-05, + "loss": 0.1117, + "step": 1991 + }, + { + "epoch": 1.036959916710047, + "grad_norm": 0.2614038022166877, + "learning_rate": 3.803549948605797e-05, + "loss": 0.1145, + "step": 1992 + }, + { + "epoch": 1.0374804789172307, + "grad_norm": 0.2584809095985116, + "learning_rate": 3.8023508512198256e-05, + "loss": 0.1125, + "step": 1993 + }, + { + "epoch": 1.0380010411244143, + "grad_norm": 0.2510595198139831, + "learning_rate": 3.8011513424908504e-05, + "loss": 0.1182, + "step": 1994 + }, + { + "epoch": 1.038521603331598, + "grad_norm": 0.2490798615545912, + "learning_rate": 3.7999514227977304e-05, + "loss": 0.1069, + "step": 1995 + }, + { + "epoch": 1.0390421655387818, + "grad_norm": 0.23506443265432583, + "learning_rate": 3.798751092519456e-05, + "loss": 0.1091, + "step": 1996 + }, + { + "epoch": 1.0395627277459656, + "grad_norm": 0.2542184882450879, + "learning_rate": 3.7975503520351487e-05, + "loss": 0.1168, + "step": 1997 + }, + { + "epoch": 1.0400832899531494, + "grad_norm": 0.2622709219446434, + "learning_rate": 3.796349201724058e-05, + "loss": 0.1121, + "step": 1998 + }, + { + "epoch": 1.0406038521603331, + "grad_norm": 0.24618731175514902, + "learning_rate": 3.795147641965561e-05, + "loss": 0.1143, + "step": 1999 + }, + { + "epoch": 1.041124414367517, + "grad_norm": 0.26095944678719163, + "learning_rate": 3.7939456731391684e-05, + "loss": 0.1157, + "step": 2000 + }, + { + "epoch": 1.0416449765747007, + "grad_norm": 0.24783987249021047, + "learning_rate": 3.792743295624517e-05, + "loss": 0.1168, + "step": 2001 + }, + { + "epoch": 1.0421655387818844, + "grad_norm": 0.2528026533974678, + "learning_rate": 3.791540509801373e-05, + "loss": 0.1114, + "step": 2002 + }, + { + "epoch": 1.0426861009890682, + "grad_norm": 0.23900993330244627, + "learning_rate": 3.7903373160496345e-05, + "loss": 0.1089, + "step": 2003 + }, + { + "epoch": 1.043206663196252, + "grad_norm": 0.2464392822573349, + "learning_rate": 3.789133714749323e-05, + "loss": 0.1115, + "step": 2004 + }, + { + "epoch": 1.0437272254034358, + "grad_norm": 0.26170131605706815, + "learning_rate": 3.787929706280594e-05, + "loss": 0.1181, + "step": 2005 + }, + { + "epoch": 1.0442477876106195, + "grad_norm": 0.2366142963015739, + "learning_rate": 3.786725291023728e-05, + "loss": 0.1084, + "step": 2006 + }, + { + "epoch": 1.0447683498178033, + "grad_norm": 0.266920747864476, + "learning_rate": 3.785520469359138e-05, + "loss": 0.1193, + "step": 2007 + }, + { + "epoch": 1.045288912024987, + "grad_norm": 0.27917270091917423, + "learning_rate": 3.784315241667359e-05, + "loss": 0.1216, + "step": 2008 + }, + { + "epoch": 1.0458094742321706, + "grad_norm": 0.2524069132942275, + "learning_rate": 3.7831096083290606e-05, + "loss": 0.1145, + "step": 2009 + }, + { + "epoch": 1.0463300364393544, + "grad_norm": 0.27357050117506915, + "learning_rate": 3.781903569725036e-05, + "loss": 0.1171, + "step": 2010 + }, + { + "epoch": 1.0468505986465382, + "grad_norm": 0.23696843787160707, + "learning_rate": 3.780697126236211e-05, + "loss": 0.1143, + "step": 2011 + }, + { + "epoch": 1.047371160853722, + "grad_norm": 0.23491005269928766, + "learning_rate": 3.779490278243634e-05, + "loss": 0.1082, + "step": 2012 + }, + { + "epoch": 1.0478917230609057, + "grad_norm": 0.24529859449463776, + "learning_rate": 3.778283026128485e-05, + "loss": 0.1138, + "step": 2013 + }, + { + "epoch": 1.0484122852680895, + "grad_norm": 0.24704977515279603, + "learning_rate": 3.7770753702720704e-05, + "loss": 0.1124, + "step": 2014 + }, + { + "epoch": 1.0489328474752733, + "grad_norm": 0.26691179513701707, + "learning_rate": 3.775867311055823e-05, + "loss": 0.1162, + "step": 2015 + }, + { + "epoch": 1.049453409682457, + "grad_norm": 0.2427635803168682, + "learning_rate": 3.7746588488613066e-05, + "loss": 0.1085, + "step": 2016 + }, + { + "epoch": 1.0499739718896408, + "grad_norm": 0.2551238836654876, + "learning_rate": 3.773449984070207e-05, + "loss": 0.11, + "step": 2017 + }, + { + "epoch": 1.0504945340968246, + "grad_norm": 0.24382085261536607, + "learning_rate": 3.7722407170643416e-05, + "loss": 0.1103, + "step": 2018 + }, + { + "epoch": 1.0510150963040084, + "grad_norm": 0.2349225844442972, + "learning_rate": 3.7710310482256526e-05, + "loss": 0.115, + "step": 2019 + }, + { + "epoch": 1.0515356585111921, + "grad_norm": 0.24160862206632094, + "learning_rate": 3.7698209779362105e-05, + "loss": 0.1111, + "step": 2020 + }, + { + "epoch": 1.052056220718376, + "grad_norm": 0.24412984723875725, + "learning_rate": 3.768610506578211e-05, + "loss": 0.1079, + "step": 2021 + }, + { + "epoch": 1.0525767829255597, + "grad_norm": 0.24330022038065835, + "learning_rate": 3.7673996345339765e-05, + "loss": 0.1101, + "step": 2022 + }, + { + "epoch": 1.0530973451327434, + "grad_norm": 0.26374011548874077, + "learning_rate": 3.7661883621859585e-05, + "loss": 0.1063, + "step": 2023 + }, + { + "epoch": 1.0536179073399272, + "grad_norm": 0.24698833556767213, + "learning_rate": 3.764976689916732e-05, + "loss": 0.1128, + "step": 2024 + }, + { + "epoch": 1.054138469547111, + "grad_norm": 0.2534386235100205, + "learning_rate": 3.7637646181090006e-05, + "loss": 0.1116, + "step": 2025 + }, + { + "epoch": 1.0546590317542945, + "grad_norm": 0.2698669420224393, + "learning_rate": 3.7625521471455914e-05, + "loss": 0.1146, + "step": 2026 + }, + { + "epoch": 1.0551795939614783, + "grad_norm": 0.2399199660805788, + "learning_rate": 3.76133927740946e-05, + "loss": 0.1147, + "step": 2027 + }, + { + "epoch": 1.055700156168662, + "grad_norm": 0.24762045246035963, + "learning_rate": 3.760126009283688e-05, + "loss": 0.1114, + "step": 2028 + }, + { + "epoch": 1.0562207183758459, + "grad_norm": 0.24458494730365296, + "learning_rate": 3.758912343151481e-05, + "loss": 0.1138, + "step": 2029 + }, + { + "epoch": 1.0567412805830296, + "grad_norm": 0.23831510708786413, + "learning_rate": 3.75769827939617e-05, + "loss": 0.1093, + "step": 2030 + }, + { + "epoch": 1.0572618427902134, + "grad_norm": 0.24160472062392452, + "learning_rate": 3.7564838184012155e-05, + "loss": 0.1113, + "step": 2031 + }, + { + "epoch": 1.0577824049973972, + "grad_norm": 0.25777500815863746, + "learning_rate": 3.755268960550199e-05, + "loss": 0.1126, + "step": 2032 + }, + { + "epoch": 1.058302967204581, + "grad_norm": 0.2469273261838509, + "learning_rate": 3.754053706226829e-05, + "loss": 0.1093, + "step": 2033 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 0.2403499464753065, + "learning_rate": 3.75283805581494e-05, + "loss": 0.1093, + "step": 2034 + }, + { + "epoch": 1.0593440916189485, + "grad_norm": 0.28349200510154626, + "learning_rate": 3.751622009698492e-05, + "loss": 0.1128, + "step": 2035 + }, + { + "epoch": 1.0598646538261323, + "grad_norm": 0.25717282657195967, + "learning_rate": 3.7504055682615676e-05, + "loss": 0.1128, + "step": 2036 + }, + { + "epoch": 1.060385216033316, + "grad_norm": 0.2604631820265362, + "learning_rate": 3.749188731888375e-05, + "loss": 0.1094, + "step": 2037 + }, + { + "epoch": 1.0609057782404998, + "grad_norm": 0.26157189061505526, + "learning_rate": 3.7479715009632486e-05, + "loss": 0.1152, + "step": 2038 + }, + { + "epoch": 1.0614263404476836, + "grad_norm": 0.2558378121103914, + "learning_rate": 3.7467538758706476e-05, + "loss": 0.1084, + "step": 2039 + }, + { + "epoch": 1.0619469026548674, + "grad_norm": 0.24072034047832436, + "learning_rate": 3.7455358569951535e-05, + "loss": 0.109, + "step": 2040 + }, + { + "epoch": 1.062467464862051, + "grad_norm": 0.2560891463466215, + "learning_rate": 3.744317444721473e-05, + "loss": 0.1097, + "step": 2041 + }, + { + "epoch": 1.0629880270692347, + "grad_norm": 0.2441562331368522, + "learning_rate": 3.743098639434438e-05, + "loss": 0.1126, + "step": 2042 + }, + { + "epoch": 1.0635085892764184, + "grad_norm": 0.25909422394913734, + "learning_rate": 3.7418794415190037e-05, + "loss": 0.1126, + "step": 2043 + }, + { + "epoch": 1.0640291514836022, + "grad_norm": 0.25192159362102, + "learning_rate": 3.74065985136025e-05, + "loss": 0.113, + "step": 2044 + }, + { + "epoch": 1.064549713690786, + "grad_norm": 0.24579868026393167, + "learning_rate": 3.73943986934338e-05, + "loss": 0.1136, + "step": 2045 + }, + { + "epoch": 1.0650702758979698, + "grad_norm": 0.25827104066791506, + "learning_rate": 3.738219495853721e-05, + "loss": 0.1161, + "step": 2046 + }, + { + "epoch": 1.0655908381051535, + "grad_norm": 0.23400282041641796, + "learning_rate": 3.736998731276722e-05, + "loss": 0.1115, + "step": 2047 + }, + { + "epoch": 1.0661114003123373, + "grad_norm": 0.26815101610132586, + "learning_rate": 3.7357775759979605e-05, + "loss": 0.1113, + "step": 2048 + }, + { + "epoch": 1.066631962519521, + "grad_norm": 0.24275941822819658, + "learning_rate": 3.734556030403131e-05, + "loss": 0.1115, + "step": 2049 + }, + { + "epoch": 1.0671525247267049, + "grad_norm": 0.2718950911696723, + "learning_rate": 3.733334094878057e-05, + "loss": 0.1155, + "step": 2050 + }, + { + "epoch": 1.0676730869338886, + "grad_norm": 0.2598394861210938, + "learning_rate": 3.73211176980868e-05, + "loss": 0.112, + "step": 2051 + }, + { + "epoch": 1.0681936491410724, + "grad_norm": 0.2694385440911815, + "learning_rate": 3.73088905558107e-05, + "loss": 0.1155, + "step": 2052 + }, + { + "epoch": 1.0687142113482562, + "grad_norm": 0.24106350307174138, + "learning_rate": 3.7296659525814146e-05, + "loss": 0.1106, + "step": 2053 + }, + { + "epoch": 1.06923477355544, + "grad_norm": 0.262198101864761, + "learning_rate": 3.7284424611960275e-05, + "loss": 0.1111, + "step": 2054 + }, + { + "epoch": 1.0697553357626237, + "grad_norm": 0.2546146585988391, + "learning_rate": 3.727218581811346e-05, + "loss": 0.1142, + "step": 2055 + }, + { + "epoch": 1.0702758979698075, + "grad_norm": 0.24247156568617306, + "learning_rate": 3.725994314813925e-05, + "loss": 0.1093, + "step": 2056 + }, + { + "epoch": 1.0707964601769913, + "grad_norm": 0.23958179241477287, + "learning_rate": 3.724769660590447e-05, + "loss": 0.1062, + "step": 2057 + }, + { + "epoch": 1.0713170223841748, + "grad_norm": 0.2396103519223926, + "learning_rate": 3.723544619527714e-05, + "loss": 0.1158, + "step": 2058 + }, + { + "epoch": 1.0718375845913586, + "grad_norm": 0.24141752361795885, + "learning_rate": 3.722319192012652e-05, + "loss": 0.1163, + "step": 2059 + }, + { + "epoch": 1.0723581467985424, + "grad_norm": 0.2530546244951208, + "learning_rate": 3.721093378432306e-05, + "loss": 0.1103, + "step": 2060 + }, + { + "epoch": 1.0728787090057261, + "grad_norm": 0.2304892834331245, + "learning_rate": 3.7198671791738475e-05, + "loss": 0.1138, + "step": 2061 + }, + { + "epoch": 1.07339927121291, + "grad_norm": 0.24116172581054715, + "learning_rate": 3.718640594624566e-05, + "loss": 0.1131, + "step": 2062 + }, + { + "epoch": 1.0739198334200937, + "grad_norm": 0.2514432482339406, + "learning_rate": 3.7174136251718736e-05, + "loss": 0.1147, + "step": 2063 + }, + { + "epoch": 1.0744403956272774, + "grad_norm": 0.24575627264010017, + "learning_rate": 3.716186271203305e-05, + "loss": 0.1112, + "step": 2064 + }, + { + "epoch": 1.0749609578344612, + "grad_norm": 0.2408037168496382, + "learning_rate": 3.714958533106515e-05, + "loss": 0.1094, + "step": 2065 + }, + { + "epoch": 1.075481520041645, + "grad_norm": 0.248061764942323, + "learning_rate": 3.713730411269282e-05, + "loss": 0.1165, + "step": 2066 + }, + { + "epoch": 1.0760020822488288, + "grad_norm": 0.2372533367139487, + "learning_rate": 3.7125019060795024e-05, + "loss": 0.1062, + "step": 2067 + }, + { + "epoch": 1.0765226444560125, + "grad_norm": 0.2383605045988699, + "learning_rate": 3.711273017925196e-05, + "loss": 0.1029, + "step": 2068 + }, + { + "epoch": 1.0770432066631963, + "grad_norm": 0.2609070795749889, + "learning_rate": 3.710043747194503e-05, + "loss": 0.1099, + "step": 2069 + }, + { + "epoch": 1.07756376887038, + "grad_norm": 0.25094133107875816, + "learning_rate": 3.708814094275683e-05, + "loss": 0.1104, + "step": 2070 + }, + { + "epoch": 1.0780843310775639, + "grad_norm": 0.2555320702860627, + "learning_rate": 3.70758405955712e-05, + "loss": 0.1118, + "step": 2071 + }, + { + "epoch": 1.0786048932847476, + "grad_norm": 0.2557360841701921, + "learning_rate": 3.7063536434273135e-05, + "loss": 0.1078, + "step": 2072 + }, + { + "epoch": 1.0791254554919312, + "grad_norm": 0.30210930503756817, + "learning_rate": 3.705122846274889e-05, + "loss": 0.1116, + "step": 2073 + }, + { + "epoch": 1.079646017699115, + "grad_norm": 0.283413324681584, + "learning_rate": 3.703891668488587e-05, + "loss": 0.1168, + "step": 2074 + }, + { + "epoch": 1.0801665799062987, + "grad_norm": 0.22741963249503896, + "learning_rate": 3.702660110457272e-05, + "loss": 0.1086, + "step": 2075 + }, + { + "epoch": 1.0806871421134825, + "grad_norm": 0.24697100297311528, + "learning_rate": 3.7014281725699276e-05, + "loss": 0.1138, + "step": 2076 + }, + { + "epoch": 1.0812077043206663, + "grad_norm": 0.2555266097287812, + "learning_rate": 3.700195855215656e-05, + "loss": 0.1094, + "step": 2077 + }, + { + "epoch": 1.08172826652785, + "grad_norm": 0.25611897463687516, + "learning_rate": 3.6989631587836814e-05, + "loss": 0.1131, + "step": 2078 + }, + { + "epoch": 1.0822488287350338, + "grad_norm": 0.24097390848814207, + "learning_rate": 3.6977300836633466e-05, + "loss": 0.1082, + "step": 2079 + }, + { + "epoch": 1.0827693909422176, + "grad_norm": 0.27090232208959586, + "learning_rate": 3.696496630244114e-05, + "loss": 0.113, + "step": 2080 + }, + { + "epoch": 1.0832899531494014, + "grad_norm": 0.2435755711000546, + "learning_rate": 3.695262798915564e-05, + "loss": 0.1082, + "step": 2081 + }, + { + "epoch": 1.0838105153565851, + "grad_norm": 0.2440321008013806, + "learning_rate": 3.694028590067401e-05, + "loss": 0.1088, + "step": 2082 + }, + { + "epoch": 1.084331077563769, + "grad_norm": 0.2523999632061997, + "learning_rate": 3.6927940040894424e-05, + "loss": 0.1123, + "step": 2083 + }, + { + "epoch": 1.0848516397709527, + "grad_norm": 0.25809490414472963, + "learning_rate": 3.691559041371631e-05, + "loss": 0.1156, + "step": 2084 + }, + { + "epoch": 1.0853722019781364, + "grad_norm": 0.2764366832810147, + "learning_rate": 3.6903237023040235e-05, + "loss": 0.1098, + "step": 2085 + }, + { + "epoch": 1.0858927641853202, + "grad_norm": 0.24467790732933922, + "learning_rate": 3.689087987276797e-05, + "loss": 0.1083, + "step": 2086 + }, + { + "epoch": 1.086413326392504, + "grad_norm": 0.2590589962094532, + "learning_rate": 3.68785189668025e-05, + "loss": 0.1142, + "step": 2087 + }, + { + "epoch": 1.0869338885996878, + "grad_norm": 0.23869572486032326, + "learning_rate": 3.686615430904795e-05, + "loss": 0.1141, + "step": 2088 + }, + { + "epoch": 1.0874544508068715, + "grad_norm": 0.24558579990983817, + "learning_rate": 3.685378590340968e-05, + "loss": 0.1086, + "step": 2089 + }, + { + "epoch": 1.087975013014055, + "grad_norm": 0.2451590372072089, + "learning_rate": 3.684141375379418e-05, + "loss": 0.1132, + "step": 2090 + }, + { + "epoch": 1.0884955752212389, + "grad_norm": 0.24564234860463044, + "learning_rate": 3.6829037864109176e-05, + "loss": 0.1109, + "step": 2091 + }, + { + "epoch": 1.0890161374284226, + "grad_norm": 0.26529150512656974, + "learning_rate": 3.6816658238263525e-05, + "loss": 0.1199, + "step": 2092 + }, + { + "epoch": 1.0895366996356064, + "grad_norm": 0.2585931700922823, + "learning_rate": 3.680427488016731e-05, + "loss": 0.1194, + "step": 2093 + }, + { + "epoch": 1.0900572618427902, + "grad_norm": 0.2464657000667077, + "learning_rate": 3.679188779373177e-05, + "loss": 0.1147, + "step": 2094 + }, + { + "epoch": 1.090577824049974, + "grad_norm": 0.2452855437235244, + "learning_rate": 3.677949698286931e-05, + "loss": 0.1149, + "step": 2095 + }, + { + "epoch": 1.0910983862571577, + "grad_norm": 0.2534553426066007, + "learning_rate": 3.676710245149353e-05, + "loss": 0.1171, + "step": 2096 + }, + { + "epoch": 1.0916189484643415, + "grad_norm": 0.23973694981857727, + "learning_rate": 3.675470420351921e-05, + "loss": 0.1158, + "step": 2097 + }, + { + "epoch": 1.0921395106715253, + "grad_norm": 0.24051783041794145, + "learning_rate": 3.6742302242862284e-05, + "loss": 0.1105, + "step": 2098 + }, + { + "epoch": 1.092660072878709, + "grad_norm": 0.24443405018149944, + "learning_rate": 3.6729896573439867e-05, + "loss": 0.1116, + "step": 2099 + }, + { + "epoch": 1.0931806350858928, + "grad_norm": 0.2597312632422425, + "learning_rate": 3.671748719917025e-05, + "loss": 0.1124, + "step": 2100 + }, + { + "epoch": 1.0937011972930766, + "grad_norm": 0.24822503893302403, + "learning_rate": 3.6705074123972885e-05, + "loss": 0.1096, + "step": 2101 + }, + { + "epoch": 1.0942217595002603, + "grad_norm": 0.24740697095379377, + "learning_rate": 3.669265735176842e-05, + "loss": 0.1083, + "step": 2102 + }, + { + "epoch": 1.0947423217074441, + "grad_norm": 0.2645450510720223, + "learning_rate": 3.668023688647863e-05, + "loss": 0.1126, + "step": 2103 + }, + { + "epoch": 1.095262883914628, + "grad_norm": 0.2535454890824587, + "learning_rate": 3.666781273202646e-05, + "loss": 0.1208, + "step": 2104 + }, + { + "epoch": 1.0957834461218114, + "grad_norm": 0.2575428275855921, + "learning_rate": 3.6655384892336075e-05, + "loss": 0.1168, + "step": 2105 + }, + { + "epoch": 1.0963040083289952, + "grad_norm": 0.2309286179235019, + "learning_rate": 3.664295337133274e-05, + "loss": 0.1124, + "step": 2106 + }, + { + "epoch": 1.096824570536179, + "grad_norm": 0.2512339973023405, + "learning_rate": 3.6630518172942915e-05, + "loss": 0.1151, + "step": 2107 + }, + { + "epoch": 1.0973451327433628, + "grad_norm": 0.2386785747665995, + "learning_rate": 3.6618079301094216e-05, + "loss": 0.1172, + "step": 2108 + }, + { + "epoch": 1.0978656949505465, + "grad_norm": 0.23724836929193713, + "learning_rate": 3.660563675971541e-05, + "loss": 0.1115, + "step": 2109 + }, + { + "epoch": 1.0983862571577303, + "grad_norm": 0.23600725505996434, + "learning_rate": 3.659319055273644e-05, + "loss": 0.1144, + "step": 2110 + }, + { + "epoch": 1.098906819364914, + "grad_norm": 0.23772868705388486, + "learning_rate": 3.6580740684088396e-05, + "loss": 0.113, + "step": 2111 + }, + { + "epoch": 1.0994273815720979, + "grad_norm": 0.237076578253062, + "learning_rate": 3.656828715770352e-05, + "loss": 0.1093, + "step": 2112 + }, + { + "epoch": 1.0999479437792816, + "grad_norm": 0.24382321669254056, + "learning_rate": 3.655582997751521e-05, + "loss": 0.1106, + "step": 2113 + }, + { + "epoch": 1.1004685059864654, + "grad_norm": 0.2377191561622143, + "learning_rate": 3.654336914745804e-05, + "loss": 0.1149, + "step": 2114 + }, + { + "epoch": 1.1009890681936492, + "grad_norm": 0.2490830706313632, + "learning_rate": 3.65309046714677e-05, + "loss": 0.1216, + "step": 2115 + }, + { + "epoch": 1.101509630400833, + "grad_norm": 0.23768207413624293, + "learning_rate": 3.651843655348107e-05, + "loss": 0.1116, + "step": 2116 + }, + { + "epoch": 1.1020301926080167, + "grad_norm": 0.23880298996606503, + "learning_rate": 3.650596479743616e-05, + "loss": 0.1111, + "step": 2117 + }, + { + "epoch": 1.1025507548152005, + "grad_norm": 0.241400343263037, + "learning_rate": 3.649348940727212e-05, + "loss": 0.1106, + "step": 2118 + }, + { + "epoch": 1.1030713170223843, + "grad_norm": 0.22973487630981618, + "learning_rate": 3.6481010386929264e-05, + "loss": 0.1107, + "step": 2119 + }, + { + "epoch": 1.103591879229568, + "grad_norm": 0.24451820553198225, + "learning_rate": 3.6468527740349045e-05, + "loss": 0.115, + "step": 2120 + }, + { + "epoch": 1.1041124414367518, + "grad_norm": 0.27311985417564316, + "learning_rate": 3.645604147147408e-05, + "loss": 0.1123, + "step": 2121 + }, + { + "epoch": 1.1046330036439354, + "grad_norm": 0.2438128553800211, + "learning_rate": 3.644355158424808e-05, + "loss": 0.1138, + "step": 2122 + }, + { + "epoch": 1.1051535658511191, + "grad_norm": 0.25561227871892656, + "learning_rate": 3.6431058082615964e-05, + "loss": 0.1236, + "step": 2123 + }, + { + "epoch": 1.105674128058303, + "grad_norm": 0.24292979052263167, + "learning_rate": 3.6418560970523745e-05, + "loss": 0.1126, + "step": 2124 + }, + { + "epoch": 1.1061946902654867, + "grad_norm": 0.23356756437347412, + "learning_rate": 3.6406060251918604e-05, + "loss": 0.1092, + "step": 2125 + }, + { + "epoch": 1.1067152524726704, + "grad_norm": 0.2510189996756236, + "learning_rate": 3.6393555930748846e-05, + "loss": 0.1134, + "step": 2126 + }, + { + "epoch": 1.1072358146798542, + "grad_norm": 0.2389541755449117, + "learning_rate": 3.63810480109639e-05, + "loss": 0.1101, + "step": 2127 + }, + { + "epoch": 1.107756376887038, + "grad_norm": 0.24947876309951245, + "learning_rate": 3.636853649651438e-05, + "loss": 0.1134, + "step": 2128 + }, + { + "epoch": 1.1082769390942218, + "grad_norm": 0.24327455652806065, + "learning_rate": 3.6356021391351976e-05, + "loss": 0.1128, + "step": 2129 + }, + { + "epoch": 1.1087975013014055, + "grad_norm": 0.25621361610147303, + "learning_rate": 3.634350269942956e-05, + "loss": 0.115, + "step": 2130 + }, + { + "epoch": 1.1093180635085893, + "grad_norm": 0.24171392174728737, + "learning_rate": 3.633098042470111e-05, + "loss": 0.1145, + "step": 2131 + }, + { + "epoch": 1.109838625715773, + "grad_norm": 0.24255117880153898, + "learning_rate": 3.631845457112174e-05, + "loss": 0.1118, + "step": 2132 + }, + { + "epoch": 1.1103591879229568, + "grad_norm": 0.2589821697017109, + "learning_rate": 3.63059251426477e-05, + "loss": 0.1118, + "step": 2133 + }, + { + "epoch": 1.1108797501301406, + "grad_norm": 0.2546318202945326, + "learning_rate": 3.6293392143236374e-05, + "loss": 0.1149, + "step": 2134 + }, + { + "epoch": 1.1114003123373244, + "grad_norm": 0.24797440602614088, + "learning_rate": 3.628085557684625e-05, + "loss": 0.1151, + "step": 2135 + }, + { + "epoch": 1.1119208745445082, + "grad_norm": 0.25596352809519574, + "learning_rate": 3.6268315447436976e-05, + "loss": 0.1104, + "step": 2136 + }, + { + "epoch": 1.1124414367516917, + "grad_norm": 0.24903661132125504, + "learning_rate": 3.6255771758969303e-05, + "loss": 0.1149, + "step": 2137 + }, + { + "epoch": 1.1129619989588755, + "grad_norm": 0.24389645955042044, + "learning_rate": 3.62432245154051e-05, + "loss": 0.1155, + "step": 2138 + }, + { + "epoch": 1.1134825611660593, + "grad_norm": 0.235781861888092, + "learning_rate": 3.6230673720707393e-05, + "loss": 0.112, + "step": 2139 + }, + { + "epoch": 1.114003123373243, + "grad_norm": 0.2308900894585995, + "learning_rate": 3.621811937884029e-05, + "loss": 0.1077, + "step": 2140 + }, + { + "epoch": 1.1145236855804268, + "grad_norm": 0.2753295219850416, + "learning_rate": 3.620556149376904e-05, + "loss": 0.1156, + "step": 2141 + }, + { + "epoch": 1.1150442477876106, + "grad_norm": 0.2470693560397974, + "learning_rate": 3.6193000069460006e-05, + "loss": 0.1096, + "step": 2142 + }, + { + "epoch": 1.1155648099947943, + "grad_norm": 0.24067635208687654, + "learning_rate": 3.618043510988068e-05, + "loss": 0.1102, + "step": 2143 + }, + { + "epoch": 1.1160853722019781, + "grad_norm": 0.2553351905414518, + "learning_rate": 3.616786661899965e-05, + "loss": 0.1135, + "step": 2144 + }, + { + "epoch": 1.116605934409162, + "grad_norm": 0.26177624105631986, + "learning_rate": 3.6155294600786625e-05, + "loss": 0.1082, + "step": 2145 + }, + { + "epoch": 1.1171264966163457, + "grad_norm": 0.2657931741696369, + "learning_rate": 3.6142719059212456e-05, + "loss": 0.1168, + "step": 2146 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 0.25488377144439966, + "learning_rate": 3.613013999824906e-05, + "loss": 0.113, + "step": 2147 + }, + { + "epoch": 1.1181676210307132, + "grad_norm": 0.2755475530717958, + "learning_rate": 3.6117557421869506e-05, + "loss": 0.1098, + "step": 2148 + }, + { + "epoch": 1.118688183237897, + "grad_norm": 0.25201657034289093, + "learning_rate": 3.6104971334047956e-05, + "loss": 0.1086, + "step": 2149 + }, + { + "epoch": 1.1192087454450808, + "grad_norm": 0.2357938145901859, + "learning_rate": 3.609238173875966e-05, + "loss": 0.1087, + "step": 2150 + }, + { + "epoch": 1.1197293076522645, + "grad_norm": 0.2540699466524578, + "learning_rate": 3.6079788639981036e-05, + "loss": 0.1143, + "step": 2151 + }, + { + "epoch": 1.1202498698594483, + "grad_norm": 0.24546722256380496, + "learning_rate": 3.606719204168954e-05, + "loss": 0.1117, + "step": 2152 + }, + { + "epoch": 1.120770432066632, + "grad_norm": 0.23557984732078704, + "learning_rate": 3.6054591947863784e-05, + "loss": 0.1108, + "step": 2153 + }, + { + "epoch": 1.1212909942738156, + "grad_norm": 0.24342719518637673, + "learning_rate": 3.604198836248344e-05, + "loss": 0.1106, + "step": 2154 + }, + { + "epoch": 1.1218115564809994, + "grad_norm": 0.24766981664804752, + "learning_rate": 3.602938128952933e-05, + "loss": 0.1145, + "step": 2155 + }, + { + "epoch": 1.1223321186881832, + "grad_norm": 0.25337709609875236, + "learning_rate": 3.6016770732983344e-05, + "loss": 0.1107, + "step": 2156 + }, + { + "epoch": 1.122852680895367, + "grad_norm": 0.2777055344080702, + "learning_rate": 3.600415669682849e-05, + "loss": 0.1171, + "step": 2157 + }, + { + "epoch": 1.1233732431025507, + "grad_norm": 0.24802200183807002, + "learning_rate": 3.599153918504886e-05, + "loss": 0.1148, + "step": 2158 + }, + { + "epoch": 1.1238938053097345, + "grad_norm": 0.249326912022627, + "learning_rate": 3.597891820162964e-05, + "loss": 0.114, + "step": 2159 + }, + { + "epoch": 1.1244143675169183, + "grad_norm": 0.24976539124029015, + "learning_rate": 3.596629375055716e-05, + "loss": 0.1108, + "step": 2160 + }, + { + "epoch": 1.124934929724102, + "grad_norm": 0.2549908652545141, + "learning_rate": 3.5953665835818765e-05, + "loss": 0.1161, + "step": 2161 + }, + { + "epoch": 1.1254554919312858, + "grad_norm": 0.2589331285801467, + "learning_rate": 3.594103446140297e-05, + "loss": 0.1144, + "step": 2162 + }, + { + "epoch": 1.1259760541384696, + "grad_norm": 0.24712526867777315, + "learning_rate": 3.592839963129934e-05, + "loss": 0.1202, + "step": 2163 + }, + { + "epoch": 1.1264966163456533, + "grad_norm": 0.252358949933663, + "learning_rate": 3.591576134949854e-05, + "loss": 0.1127, + "step": 2164 + }, + { + "epoch": 1.1270171785528371, + "grad_norm": 0.2357834325734446, + "learning_rate": 3.590311961999233e-05, + "loss": 0.1059, + "step": 2165 + }, + { + "epoch": 1.127537740760021, + "grad_norm": 0.25635650100122037, + "learning_rate": 3.589047444677355e-05, + "loss": 0.1154, + "step": 2166 + }, + { + "epoch": 1.1280583029672047, + "grad_norm": 0.28412878079518017, + "learning_rate": 3.587782583383615e-05, + "loss": 0.1173, + "step": 2167 + }, + { + "epoch": 1.1285788651743884, + "grad_norm": 0.26029129448235633, + "learning_rate": 3.586517378517514e-05, + "loss": 0.1131, + "step": 2168 + }, + { + "epoch": 1.129099427381572, + "grad_norm": 0.2592810117661404, + "learning_rate": 3.585251830478663e-05, + "loss": 0.1118, + "step": 2169 + }, + { + "epoch": 1.1296199895887558, + "grad_norm": 0.25743920274899534, + "learning_rate": 3.58398593966678e-05, + "loss": 0.1113, + "step": 2170 + }, + { + "epoch": 1.1301405517959395, + "grad_norm": 0.26024239886218314, + "learning_rate": 3.582719706481693e-05, + "loss": 0.1187, + "step": 2171 + }, + { + "epoch": 1.1306611140031233, + "grad_norm": 0.2534072347951432, + "learning_rate": 3.581453131323337e-05, + "loss": 0.1157, + "step": 2172 + }, + { + "epoch": 1.131181676210307, + "grad_norm": 0.2537355619987752, + "learning_rate": 3.580186214591756e-05, + "loss": 0.1095, + "step": 2173 + }, + { + "epoch": 1.1317022384174908, + "grad_norm": 0.23632427238631776, + "learning_rate": 3.578918956687101e-05, + "loss": 0.112, + "step": 2174 + }, + { + "epoch": 1.1322228006246746, + "grad_norm": 0.2412909737617186, + "learning_rate": 3.5776513580096315e-05, + "loss": 0.111, + "step": 2175 + }, + { + "epoch": 1.1327433628318584, + "grad_norm": 0.2483748166841977, + "learning_rate": 3.576383418959713e-05, + "loss": 0.1088, + "step": 2176 + }, + { + "epoch": 1.1332639250390422, + "grad_norm": 0.25025876140950926, + "learning_rate": 3.57511513993782e-05, + "loss": 0.1154, + "step": 2177 + }, + { + "epoch": 1.133784487246226, + "grad_norm": 0.24527483686430052, + "learning_rate": 3.5738465213445345e-05, + "loss": 0.1133, + "step": 2178 + }, + { + "epoch": 1.1343050494534097, + "grad_norm": 0.2400418074455827, + "learning_rate": 3.572577563580545e-05, + "loss": 0.1087, + "step": 2179 + }, + { + "epoch": 1.1348256116605935, + "grad_norm": 0.2473622001205245, + "learning_rate": 3.571308267046647e-05, + "loss": 0.1094, + "step": 2180 + }, + { + "epoch": 1.1353461738677773, + "grad_norm": 0.22926206403165528, + "learning_rate": 3.5700386321437446e-05, + "loss": 0.103, + "step": 2181 + }, + { + "epoch": 1.135866736074961, + "grad_norm": 0.2530508376636812, + "learning_rate": 3.5687686592728465e-05, + "loss": 0.1164, + "step": 2182 + }, + { + "epoch": 1.1363872982821448, + "grad_norm": 0.2547621831001377, + "learning_rate": 3.567498348835069e-05, + "loss": 0.1165, + "step": 2183 + }, + { + "epoch": 1.1369078604893286, + "grad_norm": 0.24555495677708386, + "learning_rate": 3.566227701231637e-05, + "loss": 0.1146, + "step": 2184 + }, + { + "epoch": 1.1374284226965123, + "grad_norm": 0.24522163832061175, + "learning_rate": 3.5649567168638786e-05, + "loss": 0.1094, + "step": 2185 + }, + { + "epoch": 1.1379489849036961, + "grad_norm": 0.24316629123363587, + "learning_rate": 3.56368539613323e-05, + "loss": 0.111, + "step": 2186 + }, + { + "epoch": 1.1384695471108797, + "grad_norm": 0.24441005322777853, + "learning_rate": 3.562413739441234e-05, + "loss": 0.1114, + "step": 2187 + }, + { + "epoch": 1.1389901093180634, + "grad_norm": 0.25037434739181624, + "learning_rate": 3.561141747189538e-05, + "loss": 0.1154, + "step": 2188 + }, + { + "epoch": 1.1395106715252472, + "grad_norm": 0.2561173459299334, + "learning_rate": 3.559869419779897e-05, + "loss": 0.1142, + "step": 2189 + }, + { + "epoch": 1.140031233732431, + "grad_norm": 0.23707593564275278, + "learning_rate": 3.558596757614172e-05, + "loss": 0.1099, + "step": 2190 + }, + { + "epoch": 1.1405517959396148, + "grad_norm": 0.2500778362026866, + "learning_rate": 3.5573237610943264e-05, + "loss": 0.1161, + "step": 2191 + }, + { + "epoch": 1.1410723581467985, + "grad_norm": 0.27492670789651996, + "learning_rate": 3.556050430622435e-05, + "loss": 0.1192, + "step": 2192 + }, + { + "epoch": 1.1415929203539823, + "grad_norm": 0.2605598668612606, + "learning_rate": 3.5547767666006735e-05, + "loss": 0.1132, + "step": 2193 + }, + { + "epoch": 1.142113482561166, + "grad_norm": 0.2605622258673461, + "learning_rate": 3.553502769431323e-05, + "loss": 0.114, + "step": 2194 + }, + { + "epoch": 1.1426340447683498, + "grad_norm": 0.2577447414552421, + "learning_rate": 3.5522284395167724e-05, + "loss": 0.1149, + "step": 2195 + }, + { + "epoch": 1.1431546069755336, + "grad_norm": 0.2728095113979058, + "learning_rate": 3.550953777259515e-05, + "loss": 0.1194, + "step": 2196 + }, + { + "epoch": 1.1436751691827174, + "grad_norm": 0.22575779206167243, + "learning_rate": 3.549678783062147e-05, + "loss": 0.1104, + "step": 2197 + }, + { + "epoch": 1.1441957313899012, + "grad_norm": 0.2551987577448281, + "learning_rate": 3.54840345732737e-05, + "loss": 0.1158, + "step": 2198 + }, + { + "epoch": 1.144716293597085, + "grad_norm": 0.24948023939005337, + "learning_rate": 3.547127800457994e-05, + "loss": 0.1139, + "step": 2199 + }, + { + "epoch": 1.1452368558042687, + "grad_norm": 0.260804602778483, + "learning_rate": 3.54585181285693e-05, + "loss": 0.1142, + "step": 2200 + }, + { + "epoch": 1.1457574180114523, + "grad_norm": 0.24626439137019285, + "learning_rate": 3.5445754949271924e-05, + "loss": 0.114, + "step": 2201 + }, + { + "epoch": 1.146277980218636, + "grad_norm": 0.23624464258694697, + "learning_rate": 3.543298847071904e-05, + "loss": 0.1101, + "step": 2202 + }, + { + "epoch": 1.1467985424258198, + "grad_norm": 0.2557734293730534, + "learning_rate": 3.542021869694289e-05, + "loss": 0.1176, + "step": 2203 + }, + { + "epoch": 1.1473191046330036, + "grad_norm": 0.25015139608483516, + "learning_rate": 3.5407445631976756e-05, + "loss": 0.1131, + "step": 2204 + }, + { + "epoch": 1.1478396668401873, + "grad_norm": 0.23610501538977566, + "learning_rate": 3.5394669279854966e-05, + "loss": 0.1153, + "step": 2205 + }, + { + "epoch": 1.1483602290473711, + "grad_norm": 0.22976718602322604, + "learning_rate": 3.53818896446129e-05, + "loss": 0.1184, + "step": 2206 + }, + { + "epoch": 1.148880791254555, + "grad_norm": 0.240542256545283, + "learning_rate": 3.536910673028695e-05, + "loss": 0.1121, + "step": 2207 + }, + { + "epoch": 1.1494013534617387, + "grad_norm": 0.23366916566434953, + "learning_rate": 3.5356320540914556e-05, + "loss": 0.1139, + "step": 2208 + }, + { + "epoch": 1.1499219156689224, + "grad_norm": 0.2277321141291514, + "learning_rate": 3.534353108053419e-05, + "loss": 0.1108, + "step": 2209 + }, + { + "epoch": 1.1504424778761062, + "grad_norm": 0.2245190916941551, + "learning_rate": 3.5330738353185364e-05, + "loss": 0.1106, + "step": 2210 + }, + { + "epoch": 1.15096304008329, + "grad_norm": 0.2441026791856793, + "learning_rate": 3.531794236290862e-05, + "loss": 0.1111, + "step": 2211 + }, + { + "epoch": 1.1514836022904738, + "grad_norm": 0.2693550306160406, + "learning_rate": 3.530514311374552e-05, + "loss": 0.1185, + "step": 2212 + }, + { + "epoch": 1.1520041644976575, + "grad_norm": 0.2583486674649021, + "learning_rate": 3.529234060973867e-05, + "loss": 0.1053, + "step": 2213 + }, + { + "epoch": 1.1525247267048413, + "grad_norm": 0.24969080269353847, + "learning_rate": 3.527953485493168e-05, + "loss": 0.1149, + "step": 2214 + }, + { + "epoch": 1.153045288912025, + "grad_norm": 0.29109726318240187, + "learning_rate": 3.526672585336923e-05, + "loss": 0.113, + "step": 2215 + }, + { + "epoch": 1.1535658511192088, + "grad_norm": 0.2342749727367158, + "learning_rate": 3.525391360909697e-05, + "loss": 0.109, + "step": 2216 + }, + { + "epoch": 1.1540864133263926, + "grad_norm": 0.2500797999992455, + "learning_rate": 3.524109812616161e-05, + "loss": 0.1144, + "step": 2217 + }, + { + "epoch": 1.1546069755335764, + "grad_norm": 0.2500104491421713, + "learning_rate": 3.5228279408610895e-05, + "loss": 0.1139, + "step": 2218 + }, + { + "epoch": 1.15512753774076, + "grad_norm": 0.23246173483256596, + "learning_rate": 3.521545746049356e-05, + "loss": 0.1107, + "step": 2219 + }, + { + "epoch": 1.1556480999479437, + "grad_norm": 0.25759769841196517, + "learning_rate": 3.5202632285859364e-05, + "loss": 0.115, + "step": 2220 + }, + { + "epoch": 1.1561686621551275, + "grad_norm": 0.2518042175741102, + "learning_rate": 3.518980388875911e-05, + "loss": 0.1126, + "step": 2221 + }, + { + "epoch": 1.1566892243623113, + "grad_norm": 0.24279674908930854, + "learning_rate": 3.517697227324459e-05, + "loss": 0.1125, + "step": 2222 + }, + { + "epoch": 1.157209786569495, + "grad_norm": 0.24501321952749994, + "learning_rate": 3.516413744336863e-05, + "loss": 0.1096, + "step": 2223 + }, + { + "epoch": 1.1577303487766788, + "grad_norm": 0.24613219825749424, + "learning_rate": 3.5151299403185075e-05, + "loss": 0.1119, + "step": 2224 + }, + { + "epoch": 1.1582509109838626, + "grad_norm": 0.2614423597655891, + "learning_rate": 3.513845815674877e-05, + "loss": 0.1139, + "step": 2225 + }, + { + "epoch": 1.1587714731910463, + "grad_norm": 0.24042769917249954, + "learning_rate": 3.512561370811556e-05, + "loss": 0.1088, + "step": 2226 + }, + { + "epoch": 1.1592920353982301, + "grad_norm": 0.25698514658916083, + "learning_rate": 3.5112766061342344e-05, + "loss": 0.112, + "step": 2227 + }, + { + "epoch": 1.1598125976054139, + "grad_norm": 0.27566732529397137, + "learning_rate": 3.5099915220487e-05, + "loss": 0.1123, + "step": 2228 + }, + { + "epoch": 1.1603331598125977, + "grad_norm": 0.2612613930290791, + "learning_rate": 3.5087061189608425e-05, + "loss": 0.1138, + "step": 2229 + }, + { + "epoch": 1.1608537220197814, + "grad_norm": 0.2609146607471247, + "learning_rate": 3.50742039727665e-05, + "loss": 0.1131, + "step": 2230 + }, + { + "epoch": 1.1613742842269652, + "grad_norm": 0.23452789382351913, + "learning_rate": 3.506134357402216e-05, + "loss": 0.1155, + "step": 2231 + }, + { + "epoch": 1.161894846434149, + "grad_norm": 0.25381787284216584, + "learning_rate": 3.504847999743729e-05, + "loss": 0.1169, + "step": 2232 + }, + { + "epoch": 1.1624154086413325, + "grad_norm": 0.2467153144538376, + "learning_rate": 3.503561324707484e-05, + "loss": 0.1113, + "step": 2233 + }, + { + "epoch": 1.1629359708485163, + "grad_norm": 0.2365562667950988, + "learning_rate": 3.50227433269987e-05, + "loss": 0.1152, + "step": 2234 + }, + { + "epoch": 1.1634565330557, + "grad_norm": 0.23867811869220612, + "learning_rate": 3.50098702412738e-05, + "loss": 0.1157, + "step": 2235 + }, + { + "epoch": 1.1639770952628838, + "grad_norm": 0.26099442663987993, + "learning_rate": 3.4996993993966057e-05, + "loss": 0.117, + "step": 2236 + }, + { + "epoch": 1.1644976574700676, + "grad_norm": 0.23935359957958538, + "learning_rate": 3.498411458914238e-05, + "loss": 0.1111, + "step": 2237 + }, + { + "epoch": 1.1650182196772514, + "grad_norm": 0.25105828547638254, + "learning_rate": 3.497123203087072e-05, + "loss": 0.1158, + "step": 2238 + }, + { + "epoch": 1.1655387818844352, + "grad_norm": 0.23331248980187663, + "learning_rate": 3.4958346323219946e-05, + "loss": 0.1037, + "step": 2239 + }, + { + "epoch": 1.166059344091619, + "grad_norm": 0.23447637588391998, + "learning_rate": 3.494545747025999e-05, + "loss": 0.1111, + "step": 2240 + }, + { + "epoch": 1.1665799062988027, + "grad_norm": 0.2526432724715221, + "learning_rate": 3.493256547606174e-05, + "loss": 0.1169, + "step": 2241 + }, + { + "epoch": 1.1671004685059865, + "grad_norm": 0.23593617679344556, + "learning_rate": 3.4919670344697085e-05, + "loss": 0.1119, + "step": 2242 + }, + { + "epoch": 1.1676210307131702, + "grad_norm": 0.24062003884271277, + "learning_rate": 3.490677208023892e-05, + "loss": 0.11, + "step": 2243 + }, + { + "epoch": 1.168141592920354, + "grad_norm": 0.2606487027308462, + "learning_rate": 3.489387068676111e-05, + "loss": 0.1188, + "step": 2244 + }, + { + "epoch": 1.1686621551275378, + "grad_norm": 0.23693423265149932, + "learning_rate": 3.4880966168338515e-05, + "loss": 0.1099, + "step": 2245 + }, + { + "epoch": 1.1691827173347216, + "grad_norm": 0.24304853176359928, + "learning_rate": 3.4868058529046985e-05, + "loss": 0.1194, + "step": 2246 + }, + { + "epoch": 1.1697032795419053, + "grad_norm": 0.25302863952962923, + "learning_rate": 3.485514777296335e-05, + "loss": 0.1136, + "step": 2247 + }, + { + "epoch": 1.170223841749089, + "grad_norm": 0.26167932489486967, + "learning_rate": 3.484223390416543e-05, + "loss": 0.113, + "step": 2248 + }, + { + "epoch": 1.1707444039562729, + "grad_norm": 0.2571613695259598, + "learning_rate": 3.4829316926732025e-05, + "loss": 0.1168, + "step": 2249 + }, + { + "epoch": 1.1712649661634567, + "grad_norm": 0.24293615449947745, + "learning_rate": 3.481639684474291e-05, + "loss": 0.1082, + "step": 2250 + }, + { + "epoch": 1.1717855283706402, + "grad_norm": 0.2473405511928646, + "learning_rate": 3.4803473662278865e-05, + "loss": 0.1122, + "step": 2251 + }, + { + "epoch": 1.172306090577824, + "grad_norm": 0.25163795289435603, + "learning_rate": 3.479054738342162e-05, + "loss": 0.1146, + "step": 2252 + }, + { + "epoch": 1.1728266527850078, + "grad_norm": 0.25041465835608206, + "learning_rate": 3.47776180122539e-05, + "loss": 0.1166, + "step": 2253 + }, + { + "epoch": 1.1733472149921915, + "grad_norm": 0.24547313540848606, + "learning_rate": 3.47646855528594e-05, + "loss": 0.1148, + "step": 2254 + }, + { + "epoch": 1.1738677771993753, + "grad_norm": 0.23742977846106733, + "learning_rate": 3.4751750009322795e-05, + "loss": 0.1063, + "step": 2255 + }, + { + "epoch": 1.174388339406559, + "grad_norm": 0.23708783235578984, + "learning_rate": 3.473881138572973e-05, + "loss": 0.1155, + "step": 2256 + }, + { + "epoch": 1.1749089016137428, + "grad_norm": 0.23348367213721333, + "learning_rate": 3.472586968616682e-05, + "loss": 0.1067, + "step": 2257 + }, + { + "epoch": 1.1754294638209266, + "grad_norm": 0.2502161259943564, + "learning_rate": 3.4712924914721664e-05, + "loss": 0.111, + "step": 2258 + }, + { + "epoch": 1.1759500260281104, + "grad_norm": 0.26009313181484484, + "learning_rate": 3.469997707548281e-05, + "loss": 0.117, + "step": 2259 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.23101072131923964, + "learning_rate": 3.468702617253981e-05, + "loss": 0.1092, + "step": 2260 + }, + { + "epoch": 1.176991150442478, + "grad_norm": 0.2546409766040149, + "learning_rate": 3.4674072209983144e-05, + "loss": 0.1142, + "step": 2261 + }, + { + "epoch": 1.1775117126496617, + "grad_norm": 0.23922221142024416, + "learning_rate": 3.466111519190428e-05, + "loss": 0.1107, + "step": 2262 + }, + { + "epoch": 1.1780322748568455, + "grad_norm": 0.244686444864269, + "learning_rate": 3.464815512239565e-05, + "loss": 0.1113, + "step": 2263 + }, + { + "epoch": 1.1785528370640292, + "grad_norm": 0.23856819223838555, + "learning_rate": 3.463519200555064e-05, + "loss": 0.1113, + "step": 2264 + }, + { + "epoch": 1.1790733992712128, + "grad_norm": 0.26366348156801095, + "learning_rate": 3.462222584546363e-05, + "loss": 0.1079, + "step": 2265 + }, + { + "epoch": 1.1795939614783966, + "grad_norm": 0.24664767010017477, + "learning_rate": 3.46092566462299e-05, + "loss": 0.1131, + "step": 2266 + }, + { + "epoch": 1.1801145236855803, + "grad_norm": 0.24616840057630468, + "learning_rate": 3.4596284411945755e-05, + "loss": 0.1173, + "step": 2267 + }, + { + "epoch": 1.1806350858927641, + "grad_norm": 0.25367146760865916, + "learning_rate": 3.4583309146708406e-05, + "loss": 0.1154, + "step": 2268 + }, + { + "epoch": 1.1811556480999479, + "grad_norm": 0.23507264194144847, + "learning_rate": 3.457033085461607e-05, + "loss": 0.1037, + "step": 2269 + }, + { + "epoch": 1.1816762103071317, + "grad_norm": 0.23989782007015442, + "learning_rate": 3.455734953976789e-05, + "loss": 0.1128, + "step": 2270 + }, + { + "epoch": 1.1821967725143154, + "grad_norm": 0.2405068672443556, + "learning_rate": 3.454436520626396e-05, + "loss": 0.1128, + "step": 2271 + }, + { + "epoch": 1.1827173347214992, + "grad_norm": 0.2301126062735549, + "learning_rate": 3.453137785820534e-05, + "loss": 0.1091, + "step": 2272 + }, + { + "epoch": 1.183237896928683, + "grad_norm": 0.25627883239774635, + "learning_rate": 3.451838749969404e-05, + "loss": 0.1088, + "step": 2273 + }, + { + "epoch": 1.1837584591358667, + "grad_norm": 0.23620278190068159, + "learning_rate": 3.450539413483302e-05, + "loss": 0.1144, + "step": 2274 + }, + { + "epoch": 1.1842790213430505, + "grad_norm": 0.25644395644621587, + "learning_rate": 3.4492397767726195e-05, + "loss": 0.1152, + "step": 2275 + }, + { + "epoch": 1.1847995835502343, + "grad_norm": 0.22348749190590206, + "learning_rate": 3.4479398402478406e-05, + "loss": 0.108, + "step": 2276 + }, + { + "epoch": 1.185320145757418, + "grad_norm": 0.2526453426350537, + "learning_rate": 3.4466396043195484e-05, + "loss": 0.1153, + "step": 2277 + }, + { + "epoch": 1.1858407079646018, + "grad_norm": 0.2562747166960448, + "learning_rate": 3.445339069398415e-05, + "loss": 0.1194, + "step": 2278 + }, + { + "epoch": 1.1863612701717856, + "grad_norm": 0.23680974190421952, + "learning_rate": 3.444038235895212e-05, + "loss": 0.11, + "step": 2279 + }, + { + "epoch": 1.1868818323789694, + "grad_norm": 0.2545225578398651, + "learning_rate": 3.442737104220801e-05, + "loss": 0.1118, + "step": 2280 + }, + { + "epoch": 1.1874023945861532, + "grad_norm": 0.2461192137870807, + "learning_rate": 3.441435674786143e-05, + "loss": 0.1104, + "step": 2281 + }, + { + "epoch": 1.187922956793337, + "grad_norm": 0.2538161714997669, + "learning_rate": 3.4401339480022874e-05, + "loss": 0.1118, + "step": 2282 + }, + { + "epoch": 1.1884435190005205, + "grad_norm": 0.25314168955138006, + "learning_rate": 3.4388319242803806e-05, + "loss": 0.1091, + "step": 2283 + }, + { + "epoch": 1.1889640812077042, + "grad_norm": 0.2587245929932049, + "learning_rate": 3.437529604031663e-05, + "loss": 0.1125, + "step": 2284 + }, + { + "epoch": 1.189484643414888, + "grad_norm": 0.24732781819326619, + "learning_rate": 3.436226987667467e-05, + "loss": 0.1095, + "step": 2285 + }, + { + "epoch": 1.1900052056220718, + "grad_norm": 0.26399681078473874, + "learning_rate": 3.4349240755992216e-05, + "loss": 0.1144, + "step": 2286 + }, + { + "epoch": 1.1905257678292556, + "grad_norm": 0.254509691032438, + "learning_rate": 3.433620868238444e-05, + "loss": 0.1195, + "step": 2287 + }, + { + "epoch": 1.1910463300364393, + "grad_norm": 0.27960613887789765, + "learning_rate": 3.4323173659967506e-05, + "loss": 0.1147, + "step": 2288 + }, + { + "epoch": 1.191566892243623, + "grad_norm": 0.23122617317358943, + "learning_rate": 3.431013569285846e-05, + "loss": 0.1082, + "step": 2289 + }, + { + "epoch": 1.1920874544508069, + "grad_norm": 0.2452337995607666, + "learning_rate": 3.429709478517531e-05, + "loss": 0.1121, + "step": 2290 + }, + { + "epoch": 1.1926080166579907, + "grad_norm": 0.24517649301196232, + "learning_rate": 3.428405094103696e-05, + "loss": 0.1145, + "step": 2291 + }, + { + "epoch": 1.1931285788651744, + "grad_norm": 0.24289572986099975, + "learning_rate": 3.42710041645633e-05, + "loss": 0.1138, + "step": 2292 + }, + { + "epoch": 1.1936491410723582, + "grad_norm": 0.26963497608876424, + "learning_rate": 3.425795445987508e-05, + "loss": 0.1186, + "step": 2293 + }, + { + "epoch": 1.194169703279542, + "grad_norm": 0.2525547241342057, + "learning_rate": 3.4244901831094014e-05, + "loss": 0.1145, + "step": 2294 + }, + { + "epoch": 1.1946902654867257, + "grad_norm": 0.25154080236686166, + "learning_rate": 3.4231846282342725e-05, + "loss": 0.1158, + "step": 2295 + }, + { + "epoch": 1.1952108276939095, + "grad_norm": 0.2607865126803791, + "learning_rate": 3.4218787817744773e-05, + "loss": 0.1145, + "step": 2296 + }, + { + "epoch": 1.195731389901093, + "grad_norm": 0.28235116430206414, + "learning_rate": 3.420572644142463e-05, + "loss": 0.1157, + "step": 2297 + }, + { + "epoch": 1.1962519521082768, + "grad_norm": 0.23089667791953067, + "learning_rate": 3.419266215750767e-05, + "loss": 0.1131, + "step": 2298 + }, + { + "epoch": 1.1967725143154606, + "grad_norm": 0.2293790099648737, + "learning_rate": 3.4179594970120215e-05, + "loss": 0.1151, + "step": 2299 + }, + { + "epoch": 1.1972930765226444, + "grad_norm": 0.24575677024213127, + "learning_rate": 3.4166524883389476e-05, + "loss": 0.1128, + "step": 2300 + }, + { + "epoch": 1.1978136387298282, + "grad_norm": 0.25216046527261354, + "learning_rate": 3.415345190144362e-05, + "loss": 0.1137, + "step": 2301 + }, + { + "epoch": 1.198334200937012, + "grad_norm": 0.24305569802868002, + "learning_rate": 3.414037602841168e-05, + "loss": 0.1147, + "step": 2302 + }, + { + "epoch": 1.1988547631441957, + "grad_norm": 0.27127800645249683, + "learning_rate": 3.4127297268423636e-05, + "loss": 0.1181, + "step": 2303 + }, + { + "epoch": 1.1993753253513795, + "grad_norm": 0.23242248281346625, + "learning_rate": 3.411421562561037e-05, + "loss": 0.1128, + "step": 2304 + }, + { + "epoch": 1.1998958875585632, + "grad_norm": 0.2745166695623737, + "learning_rate": 3.410113110410366e-05, + "loss": 0.1117, + "step": 2305 + }, + { + "epoch": 1.200416449765747, + "grad_norm": 0.23978776170340857, + "learning_rate": 3.408804370803623e-05, + "loss": 0.1108, + "step": 2306 + }, + { + "epoch": 1.2009370119729308, + "grad_norm": 0.24928731609898175, + "learning_rate": 3.407495344154167e-05, + "loss": 0.1161, + "step": 2307 + }, + { + "epoch": 1.2014575741801146, + "grad_norm": 0.23242055883453808, + "learning_rate": 3.40618603087545e-05, + "loss": 0.1114, + "step": 2308 + }, + { + "epoch": 1.2019781363872983, + "grad_norm": 0.2418508216693535, + "learning_rate": 3.404876431381014e-05, + "loss": 0.1097, + "step": 2309 + }, + { + "epoch": 1.202498698594482, + "grad_norm": 0.2644872421572009, + "learning_rate": 3.403566546084493e-05, + "loss": 0.1198, + "step": 2310 + }, + { + "epoch": 1.2030192608016659, + "grad_norm": 0.2432882408100227, + "learning_rate": 3.4022563753996075e-05, + "loss": 0.1161, + "step": 2311 + }, + { + "epoch": 1.2035398230088497, + "grad_norm": 0.25959575629843024, + "learning_rate": 3.400945919740171e-05, + "loss": 0.1136, + "step": 2312 + }, + { + "epoch": 1.2040603852160334, + "grad_norm": 0.2331391415291296, + "learning_rate": 3.3996351795200865e-05, + "loss": 0.1096, + "step": 2313 + }, + { + "epoch": 1.2045809474232172, + "grad_norm": 0.24943855765809428, + "learning_rate": 3.3983241551533465e-05, + "loss": 0.1091, + "step": 2314 + }, + { + "epoch": 1.2051015096304007, + "grad_norm": 0.24791662409738544, + "learning_rate": 3.397012847054035e-05, + "loss": 0.1148, + "step": 2315 + }, + { + "epoch": 1.2056220718375845, + "grad_norm": 0.2618458677286621, + "learning_rate": 3.3957012556363224e-05, + "loss": 0.1119, + "step": 2316 + }, + { + "epoch": 1.2061426340447683, + "grad_norm": 0.2635221612106524, + "learning_rate": 3.394389381314471e-05, + "loss": 0.112, + "step": 2317 + }, + { + "epoch": 1.206663196251952, + "grad_norm": 0.24840413289756066, + "learning_rate": 3.393077224502832e-05, + "loss": 0.1104, + "step": 2318 + }, + { + "epoch": 1.2071837584591358, + "grad_norm": 0.2504601503439982, + "learning_rate": 3.391764785615845e-05, + "loss": 0.1137, + "step": 2319 + }, + { + "epoch": 1.2077043206663196, + "grad_norm": 0.22826339447866847, + "learning_rate": 3.3904520650680405e-05, + "loss": 0.1084, + "step": 2320 + }, + { + "epoch": 1.2082248828735034, + "grad_norm": 0.23110036065955847, + "learning_rate": 3.3891390632740345e-05, + "loss": 0.1063, + "step": 2321 + }, + { + "epoch": 1.2087454450806872, + "grad_norm": 0.25396241886265036, + "learning_rate": 3.387825780648536e-05, + "loss": 0.1209, + "step": 2322 + }, + { + "epoch": 1.209266007287871, + "grad_norm": 0.2437164195108459, + "learning_rate": 3.386512217606339e-05, + "loss": 0.1202, + "step": 2323 + }, + { + "epoch": 1.2097865694950547, + "grad_norm": 0.23771865004331263, + "learning_rate": 3.38519837456233e-05, + "loss": 0.1127, + "step": 2324 + }, + { + "epoch": 1.2103071317022385, + "grad_norm": 0.22893673951370747, + "learning_rate": 3.383884251931481e-05, + "loss": 0.1098, + "step": 2325 + }, + { + "epoch": 1.2108276939094222, + "grad_norm": 0.24464255415023003, + "learning_rate": 3.3825698501288516e-05, + "loss": 0.1114, + "step": 2326 + }, + { + "epoch": 1.211348256116606, + "grad_norm": 0.24641719397311446, + "learning_rate": 3.381255169569594e-05, + "loss": 0.1163, + "step": 2327 + }, + { + "epoch": 1.2118688183237898, + "grad_norm": 0.24202099555940623, + "learning_rate": 3.3799402106689416e-05, + "loss": 0.1122, + "step": 2328 + }, + { + "epoch": 1.2123893805309733, + "grad_norm": 0.23976840972172184, + "learning_rate": 3.3786249738422235e-05, + "loss": 0.1138, + "step": 2329 + }, + { + "epoch": 1.212909942738157, + "grad_norm": 0.26118454788338863, + "learning_rate": 3.3773094595048506e-05, + "loss": 0.1105, + "step": 2330 + }, + { + "epoch": 1.2134305049453409, + "grad_norm": 0.2504712191955974, + "learning_rate": 3.375993668072324e-05, + "loss": 0.1124, + "step": 2331 + }, + { + "epoch": 1.2139510671525247, + "grad_norm": 0.2325507565735545, + "learning_rate": 3.374677599960231e-05, + "loss": 0.1098, + "step": 2332 + }, + { + "epoch": 1.2144716293597084, + "grad_norm": 0.2590513905450211, + "learning_rate": 3.3733612555842486e-05, + "loss": 0.1124, + "step": 2333 + }, + { + "epoch": 1.2149921915668922, + "grad_norm": 0.24603883688487388, + "learning_rate": 3.3720446353601394e-05, + "loss": 0.1122, + "step": 2334 + }, + { + "epoch": 1.215512753774076, + "grad_norm": 0.2377595048373133, + "learning_rate": 3.370727739703752e-05, + "loss": 0.1124, + "step": 2335 + }, + { + "epoch": 1.2160333159812597, + "grad_norm": 0.23641590886231106, + "learning_rate": 3.369410569031024e-05, + "loss": 0.1114, + "step": 2336 + }, + { + "epoch": 1.2165538781884435, + "grad_norm": 0.2498625675970159, + "learning_rate": 3.36809312375798e-05, + "loss": 0.1145, + "step": 2337 + }, + { + "epoch": 1.2170744403956273, + "grad_norm": 0.22842565347219712, + "learning_rate": 3.36677540430073e-05, + "loss": 0.1129, + "step": 2338 + }, + { + "epoch": 1.217595002602811, + "grad_norm": 0.22898607545193125, + "learning_rate": 3.365457411075471e-05, + "loss": 0.1127, + "step": 2339 + }, + { + "epoch": 1.2181155648099948, + "grad_norm": 0.23953214969485537, + "learning_rate": 3.3641391444984864e-05, + "loss": 0.1112, + "step": 2340 + }, + { + "epoch": 1.2186361270171786, + "grad_norm": 0.24104626520668057, + "learning_rate": 3.362820604986147e-05, + "loss": 0.1125, + "step": 2341 + }, + { + "epoch": 1.2191566892243624, + "grad_norm": 0.23748691484838422, + "learning_rate": 3.361501792954908e-05, + "loss": 0.1095, + "step": 2342 + }, + { + "epoch": 1.2196772514315461, + "grad_norm": 0.2483740020552502, + "learning_rate": 3.360182708821312e-05, + "loss": 0.1158, + "step": 2343 + }, + { + "epoch": 1.22019781363873, + "grad_norm": 0.25425874287469785, + "learning_rate": 3.358863353001987e-05, + "loss": 0.1225, + "step": 2344 + }, + { + "epoch": 1.2207183758459137, + "grad_norm": 0.2586255317538337, + "learning_rate": 3.3575437259136474e-05, + "loss": 0.1164, + "step": 2345 + }, + { + "epoch": 1.2212389380530975, + "grad_norm": 0.23402320390581927, + "learning_rate": 3.3562238279730916e-05, + "loss": 0.1147, + "step": 2346 + }, + { + "epoch": 1.221759500260281, + "grad_norm": 0.2477059723049737, + "learning_rate": 3.354903659597207e-05, + "loss": 0.1115, + "step": 2347 + }, + { + "epoch": 1.2222800624674648, + "grad_norm": 0.23650878937783718, + "learning_rate": 3.353583221202962e-05, + "loss": 0.1067, + "step": 2348 + }, + { + "epoch": 1.2228006246746486, + "grad_norm": 0.24360478631058402, + "learning_rate": 3.352262513207413e-05, + "loss": 0.1113, + "step": 2349 + }, + { + "epoch": 1.2233211868818323, + "grad_norm": 0.239918587840492, + "learning_rate": 3.350941536027702e-05, + "loss": 0.1103, + "step": 2350 + }, + { + "epoch": 1.223841749089016, + "grad_norm": 0.23335823537471334, + "learning_rate": 3.349620290081055e-05, + "loss": 0.107, + "step": 2351 + }, + { + "epoch": 1.2243623112961999, + "grad_norm": 0.23023317202230778, + "learning_rate": 3.348298775784782e-05, + "loss": 0.1139, + "step": 2352 + }, + { + "epoch": 1.2248828735033837, + "grad_norm": 0.24137499470962637, + "learning_rate": 3.3469769935562796e-05, + "loss": 0.1111, + "step": 2353 + }, + { + "epoch": 1.2254034357105674, + "grad_norm": 0.23984175856275775, + "learning_rate": 3.345654943813027e-05, + "loss": 0.1126, + "step": 2354 + }, + { + "epoch": 1.2259239979177512, + "grad_norm": 0.2511343023919521, + "learning_rate": 3.34433262697259e-05, + "loss": 0.1235, + "step": 2355 + }, + { + "epoch": 1.226444560124935, + "grad_norm": 0.2248493142834007, + "learning_rate": 3.343010043452618e-05, + "loss": 0.11, + "step": 2356 + }, + { + "epoch": 1.2269651223321187, + "grad_norm": 0.23619956004612885, + "learning_rate": 3.3416871936708436e-05, + "loss": 0.1122, + "step": 2357 + }, + { + "epoch": 1.2274856845393025, + "grad_norm": 0.24154490816188087, + "learning_rate": 3.340364078045085e-05, + "loss": 0.118, + "step": 2358 + }, + { + "epoch": 1.2280062467464863, + "grad_norm": 0.23906919393439444, + "learning_rate": 3.339040696993243e-05, + "loss": 0.112, + "step": 2359 + }, + { + "epoch": 1.22852680895367, + "grad_norm": 0.23515780174889195, + "learning_rate": 3.3377170509333035e-05, + "loss": 0.1065, + "step": 2360 + }, + { + "epoch": 1.2290473711608536, + "grad_norm": 0.2448541840615606, + "learning_rate": 3.336393140283335e-05, + "loss": 0.1104, + "step": 2361 + }, + { + "epoch": 1.2295679333680374, + "grad_norm": 0.23502697559009228, + "learning_rate": 3.33506896546149e-05, + "loss": 0.1168, + "step": 2362 + }, + { + "epoch": 1.2300884955752212, + "grad_norm": 0.24547086115985342, + "learning_rate": 3.333744526886006e-05, + "loss": 0.1178, + "step": 2363 + }, + { + "epoch": 1.230609057782405, + "grad_norm": 0.2238019166845714, + "learning_rate": 3.3324198249752004e-05, + "loss": 0.1089, + "step": 2364 + }, + { + "epoch": 1.2311296199895887, + "grad_norm": 0.25163269197743565, + "learning_rate": 3.331094860147477e-05, + "loss": 0.1149, + "step": 2365 + }, + { + "epoch": 1.2316501821967725, + "grad_norm": 0.24302067777188527, + "learning_rate": 3.329769632821321e-05, + "loss": 0.1126, + "step": 2366 + }, + { + "epoch": 1.2321707444039562, + "grad_norm": 0.24052489202021649, + "learning_rate": 3.328444143415301e-05, + "loss": 0.1126, + "step": 2367 + }, + { + "epoch": 1.23269130661114, + "grad_norm": 0.23963819746590062, + "learning_rate": 3.327118392348068e-05, + "loss": 0.1111, + "step": 2368 + }, + { + "epoch": 1.2332118688183238, + "grad_norm": 0.2554674901122427, + "learning_rate": 3.325792380038356e-05, + "loss": 0.122, + "step": 2369 + }, + { + "epoch": 1.2337324310255076, + "grad_norm": 0.24101776937698122, + "learning_rate": 3.324466106904981e-05, + "loss": 0.1108, + "step": 2370 + }, + { + "epoch": 1.2342529932326913, + "grad_norm": 0.23475211548593766, + "learning_rate": 3.323139573366842e-05, + "loss": 0.1143, + "step": 2371 + }, + { + "epoch": 1.234773555439875, + "grad_norm": 0.23437107317802344, + "learning_rate": 3.32181277984292e-05, + "loss": 0.109, + "step": 2372 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 0.23380076784273857, + "learning_rate": 3.320485726752278e-05, + "loss": 0.1119, + "step": 2373 + }, + { + "epoch": 1.2358146798542426, + "grad_norm": 0.2540138119726505, + "learning_rate": 3.3191584145140626e-05, + "loss": 0.1171, + "step": 2374 + }, + { + "epoch": 1.2363352420614264, + "grad_norm": 0.2411702497970775, + "learning_rate": 3.317830843547499e-05, + "loss": 0.1161, + "step": 2375 + }, + { + "epoch": 1.2368558042686102, + "grad_norm": 0.233210046923344, + "learning_rate": 3.316503014271896e-05, + "loss": 0.1103, + "step": 2376 + }, + { + "epoch": 1.237376366475794, + "grad_norm": 0.24776647975273905, + "learning_rate": 3.3151749271066444e-05, + "loss": 0.1155, + "step": 2377 + }, + { + "epoch": 1.2378969286829777, + "grad_norm": 0.2273242590925185, + "learning_rate": 3.3138465824712164e-05, + "loss": 0.1124, + "step": 2378 + }, + { + "epoch": 1.2384174908901613, + "grad_norm": 0.2394314835262025, + "learning_rate": 3.312517980785164e-05, + "loss": 0.1128, + "step": 2379 + }, + { + "epoch": 1.238938053097345, + "grad_norm": 0.2448659548311951, + "learning_rate": 3.311189122468122e-05, + "loss": 0.1196, + "step": 2380 + }, + { + "epoch": 1.2394586153045288, + "grad_norm": 0.23448498331985973, + "learning_rate": 3.309860007939806e-05, + "loss": 0.1116, + "step": 2381 + }, + { + "epoch": 1.2399791775117126, + "grad_norm": 0.22911095022022188, + "learning_rate": 3.308530637620011e-05, + "loss": 0.1121, + "step": 2382 + }, + { + "epoch": 1.2404997397188964, + "grad_norm": 0.24025893459427192, + "learning_rate": 3.307201011928616e-05, + "loss": 0.1144, + "step": 2383 + }, + { + "epoch": 1.2410203019260801, + "grad_norm": 0.23259241617711043, + "learning_rate": 3.305871131285577e-05, + "loss": 0.1094, + "step": 2384 + }, + { + "epoch": 1.241540864133264, + "grad_norm": 0.2361852244544814, + "learning_rate": 3.3045409961109324e-05, + "loss": 0.1071, + "step": 2385 + }, + { + "epoch": 1.2420614263404477, + "grad_norm": 0.2561842268409719, + "learning_rate": 3.3032106068248014e-05, + "loss": 0.1198, + "step": 2386 + }, + { + "epoch": 1.2425819885476315, + "grad_norm": 0.2511394825375184, + "learning_rate": 3.301879963847383e-05, + "loss": 0.1106, + "step": 2387 + }, + { + "epoch": 1.2431025507548152, + "grad_norm": 0.22935577553554087, + "learning_rate": 3.3005490675989545e-05, + "loss": 0.1103, + "step": 2388 + }, + { + "epoch": 1.243623112961999, + "grad_norm": 0.24634598430303592, + "learning_rate": 3.2992179184998774e-05, + "loss": 0.1118, + "step": 2389 + }, + { + "epoch": 1.2441436751691828, + "grad_norm": 0.24037850632563773, + "learning_rate": 3.2978865169705885e-05, + "loss": 0.115, + "step": 2390 + }, + { + "epoch": 1.2446642373763666, + "grad_norm": 0.24026168911843124, + "learning_rate": 3.296554863431607e-05, + "loss": 0.1093, + "step": 2391 + }, + { + "epoch": 1.2451847995835503, + "grad_norm": 0.24022065182744629, + "learning_rate": 3.295222958303532e-05, + "loss": 0.1115, + "step": 2392 + }, + { + "epoch": 1.2457053617907339, + "grad_norm": 0.23185501118099175, + "learning_rate": 3.2938908020070404e-05, + "loss": 0.1101, + "step": 2393 + }, + { + "epoch": 1.2462259239979177, + "grad_norm": 0.23099404002234822, + "learning_rate": 3.292558394962888e-05, + "loss": 0.1132, + "step": 2394 + }, + { + "epoch": 1.2467464862051014, + "grad_norm": 0.24176376810983632, + "learning_rate": 3.2912257375919126e-05, + "loss": 0.1148, + "step": 2395 + }, + { + "epoch": 1.2472670484122852, + "grad_norm": 0.24222391889430964, + "learning_rate": 3.289892830315028e-05, + "loss": 0.1098, + "step": 2396 + }, + { + "epoch": 1.247787610619469, + "grad_norm": 0.23277468724181424, + "learning_rate": 3.28855967355323e-05, + "loss": 0.1081, + "step": 2397 + }, + { + "epoch": 1.2483081728266527, + "grad_norm": 0.2319208224863446, + "learning_rate": 3.2872262677275906e-05, + "loss": 0.1127, + "step": 2398 + }, + { + "epoch": 1.2488287350338365, + "grad_norm": 0.22600332372977186, + "learning_rate": 3.285892613259261e-05, + "loss": 0.1082, + "step": 2399 + }, + { + "epoch": 1.2493492972410203, + "grad_norm": 0.2380744039289219, + "learning_rate": 3.2845587105694716e-05, + "loss": 0.1157, + "step": 2400 + }, + { + "epoch": 1.249869859448204, + "grad_norm": 0.24199537993154474, + "learning_rate": 3.283224560079532e-05, + "loss": 0.1085, + "step": 2401 + }, + { + "epoch": 1.2503904216553878, + "grad_norm": 0.25841375982911335, + "learning_rate": 3.281890162210829e-05, + "loss": 0.1129, + "step": 2402 + }, + { + "epoch": 1.2509109838625716, + "grad_norm": 0.23379214223234784, + "learning_rate": 3.2805555173848254e-05, + "loss": 0.11, + "step": 2403 + }, + { + "epoch": 1.2514315460697554, + "grad_norm": 0.24808893501537926, + "learning_rate": 3.279220626023065e-05, + "loss": 0.1103, + "step": 2404 + }, + { + "epoch": 1.2519521082769391, + "grad_norm": 0.23963781789500202, + "learning_rate": 3.27788548854717e-05, + "loss": 0.1126, + "step": 2405 + }, + { + "epoch": 1.252472670484123, + "grad_norm": 0.2536793830039519, + "learning_rate": 3.276550105378838e-05, + "loss": 0.1188, + "step": 2406 + }, + { + "epoch": 1.2529932326913067, + "grad_norm": 0.22592288227783439, + "learning_rate": 3.275214476939845e-05, + "loss": 0.1111, + "step": 2407 + }, + { + "epoch": 1.2535137948984905, + "grad_norm": 0.2184518861484896, + "learning_rate": 3.273878603652045e-05, + "loss": 0.1101, + "step": 2408 + }, + { + "epoch": 1.2540343571056742, + "grad_norm": 0.22611064467072162, + "learning_rate": 3.272542485937369e-05, + "loss": 0.1078, + "step": 2409 + }, + { + "epoch": 1.254554919312858, + "grad_norm": 0.24252402472546458, + "learning_rate": 3.271206124217825e-05, + "loss": 0.1089, + "step": 2410 + }, + { + "epoch": 1.2550754815200418, + "grad_norm": 0.24827439034038407, + "learning_rate": 3.269869518915497e-05, + "loss": 0.1153, + "step": 2411 + }, + { + "epoch": 1.2555960437272253, + "grad_norm": 0.24447695733165611, + "learning_rate": 3.268532670452549e-05, + "loss": 0.111, + "step": 2412 + }, + { + "epoch": 1.256116605934409, + "grad_norm": 0.24933714928678632, + "learning_rate": 3.2671955792512186e-05, + "loss": 0.1113, + "step": 2413 + }, + { + "epoch": 1.2566371681415929, + "grad_norm": 0.2378454669519632, + "learning_rate": 3.265858245733824e-05, + "loss": 0.1152, + "step": 2414 + }, + { + "epoch": 1.2571577303487766, + "grad_norm": 0.2512478037323881, + "learning_rate": 3.2645206703227536e-05, + "loss": 0.1206, + "step": 2415 + }, + { + "epoch": 1.2576782925559604, + "grad_norm": 0.250096683928609, + "learning_rate": 3.263182853440479e-05, + "loss": 0.1135, + "step": 2416 + }, + { + "epoch": 1.2581988547631442, + "grad_norm": 0.23599764911190643, + "learning_rate": 3.261844795509542e-05, + "loss": 0.116, + "step": 2417 + }, + { + "epoch": 1.258719416970328, + "grad_norm": 0.2311757103757818, + "learning_rate": 3.260506496952567e-05, + "loss": 0.1154, + "step": 2418 + }, + { + "epoch": 1.2592399791775117, + "grad_norm": 0.23201479378425932, + "learning_rate": 3.259167958192249e-05, + "loss": 0.114, + "step": 2419 + }, + { + "epoch": 1.2597605413846955, + "grad_norm": 0.2365740831566155, + "learning_rate": 3.257829179651361e-05, + "loss": 0.1095, + "step": 2420 + }, + { + "epoch": 1.2602811035918793, + "grad_norm": 0.22738663769119088, + "learning_rate": 3.2564901617527513e-05, + "loss": 0.1069, + "step": 2421 + }, + { + "epoch": 1.260801665799063, + "grad_norm": 0.24513932299659597, + "learning_rate": 3.2551509049193444e-05, + "loss": 0.1126, + "step": 2422 + }, + { + "epoch": 1.2613222280062468, + "grad_norm": 0.2295326150055126, + "learning_rate": 3.253811409574141e-05, + "loss": 0.1135, + "step": 2423 + }, + { + "epoch": 1.2618427902134304, + "grad_norm": 0.2392260915445472, + "learning_rate": 3.252471676140215e-05, + "loss": 0.1133, + "step": 2424 + }, + { + "epoch": 1.2623633524206141, + "grad_norm": 0.2281032715398985, + "learning_rate": 3.251131705040716e-05, + "loss": 0.1086, + "step": 2425 + }, + { + "epoch": 1.262883914627798, + "grad_norm": 0.240395636614695, + "learning_rate": 3.24979149669887e-05, + "loss": 0.1145, + "step": 2426 + }, + { + "epoch": 1.2634044768349817, + "grad_norm": 0.2406505726843499, + "learning_rate": 3.2484510515379776e-05, + "loss": 0.1122, + "step": 2427 + }, + { + "epoch": 1.2639250390421655, + "grad_norm": 0.2645085343890866, + "learning_rate": 3.247110369981413e-05, + "loss": 0.1151, + "step": 2428 + }, + { + "epoch": 1.2644456012493492, + "grad_norm": 0.22557528612610186, + "learning_rate": 3.245769452452626e-05, + "loss": 0.1092, + "step": 2429 + }, + { + "epoch": 1.264966163456533, + "grad_norm": 0.23590364280733392, + "learning_rate": 3.244428299375141e-05, + "loss": 0.1157, + "step": 2430 + }, + { + "epoch": 1.2654867256637168, + "grad_norm": 0.23355810219083442, + "learning_rate": 3.243086911172555e-05, + "loss": 0.1112, + "step": 2431 + }, + { + "epoch": 1.2660072878709006, + "grad_norm": 0.22768610677426723, + "learning_rate": 3.241745288268544e-05, + "loss": 0.1112, + "step": 2432 + }, + { + "epoch": 1.2665278500780843, + "grad_norm": 0.2213720698521258, + "learning_rate": 3.240403431086853e-05, + "loss": 0.1115, + "step": 2433 + }, + { + "epoch": 1.267048412285268, + "grad_norm": 0.2511275509634126, + "learning_rate": 3.239061340051302e-05, + "loss": 0.1144, + "step": 2434 + }, + { + "epoch": 1.2675689744924519, + "grad_norm": 0.22474601806469852, + "learning_rate": 3.237719015585787e-05, + "loss": 0.1084, + "step": 2435 + }, + { + "epoch": 1.2680895366996356, + "grad_norm": 0.22967485481677846, + "learning_rate": 3.236376458114276e-05, + "loss": 0.1053, + "step": 2436 + }, + { + "epoch": 1.2686100989068194, + "grad_norm": 0.24508106364086055, + "learning_rate": 3.235033668060813e-05, + "loss": 0.1141, + "step": 2437 + }, + { + "epoch": 1.2691306611140032, + "grad_norm": 0.23936751039871776, + "learning_rate": 3.233690645849511e-05, + "loss": 0.1126, + "step": 2438 + }, + { + "epoch": 1.269651223321187, + "grad_norm": 0.2423970044613412, + "learning_rate": 3.23234739190456e-05, + "loss": 0.11, + "step": 2439 + }, + { + "epoch": 1.2701717855283707, + "grad_norm": 0.2408077607865761, + "learning_rate": 3.2310039066502224e-05, + "loss": 0.1186, + "step": 2440 + }, + { + "epoch": 1.2706923477355545, + "grad_norm": 0.24264263902809302, + "learning_rate": 3.229660190510833e-05, + "loss": 0.1168, + "step": 2441 + }, + { + "epoch": 1.2712129099427383, + "grad_norm": 0.23451047905283273, + "learning_rate": 3.2283162439108004e-05, + "loss": 0.1114, + "step": 2442 + }, + { + "epoch": 1.271733472149922, + "grad_norm": 0.24187390316258348, + "learning_rate": 3.226972067274605e-05, + "loss": 0.1111, + "step": 2443 + }, + { + "epoch": 1.2722540343571056, + "grad_norm": 0.23365628213616502, + "learning_rate": 3.2256276610268e-05, + "loss": 0.1177, + "step": 2444 + }, + { + "epoch": 1.2727745965642894, + "grad_norm": 0.2427390698922539, + "learning_rate": 3.224283025592011e-05, + "loss": 0.1111, + "step": 2445 + }, + { + "epoch": 1.2732951587714731, + "grad_norm": 0.2512975614866838, + "learning_rate": 3.22293816139494e-05, + "loss": 0.1182, + "step": 2446 + }, + { + "epoch": 1.273815720978657, + "grad_norm": 0.24295218403998542, + "learning_rate": 3.2215930688603525e-05, + "loss": 0.1165, + "step": 2447 + }, + { + "epoch": 1.2743362831858407, + "grad_norm": 0.23644527244752322, + "learning_rate": 3.220247748413094e-05, + "loss": 0.113, + "step": 2448 + }, + { + "epoch": 1.2748568453930245, + "grad_norm": 0.23785821712712837, + "learning_rate": 3.21890220047808e-05, + "loss": 0.114, + "step": 2449 + }, + { + "epoch": 1.2753774076002082, + "grad_norm": 0.23632989350892925, + "learning_rate": 3.217556425480296e-05, + "loss": 0.1111, + "step": 2450 + }, + { + "epoch": 1.275897969807392, + "grad_norm": 0.23263875426854946, + "learning_rate": 3.216210423844801e-05, + "loss": 0.1102, + "step": 2451 + }, + { + "epoch": 1.2764185320145758, + "grad_norm": 0.23188957689089493, + "learning_rate": 3.214864195996723e-05, + "loss": 0.1075, + "step": 2452 + }, + { + "epoch": 1.2769390942217596, + "grad_norm": 0.23375003589359838, + "learning_rate": 3.213517742361267e-05, + "loss": 0.1082, + "step": 2453 + }, + { + "epoch": 1.2774596564289433, + "grad_norm": 0.24021397633189484, + "learning_rate": 3.212171063363702e-05, + "loss": 0.1166, + "step": 2454 + }, + { + "epoch": 1.277980218636127, + "grad_norm": 0.2322622124803816, + "learning_rate": 3.2108241594293756e-05, + "loss": 0.1139, + "step": 2455 + }, + { + "epoch": 1.2785007808433106, + "grad_norm": 0.23120316428655097, + "learning_rate": 3.209477030983698e-05, + "loss": 0.112, + "step": 2456 + }, + { + "epoch": 1.2790213430504944, + "grad_norm": 0.23819406522964934, + "learning_rate": 3.20812967845216e-05, + "loss": 0.1065, + "step": 2457 + }, + { + "epoch": 1.2795419052576782, + "grad_norm": 0.23305129874267824, + "learning_rate": 3.206782102260316e-05, + "loss": 0.1111, + "step": 2458 + }, + { + "epoch": 1.280062467464862, + "grad_norm": 0.26056719528908606, + "learning_rate": 3.205434302833792e-05, + "loss": 0.1104, + "step": 2459 + }, + { + "epoch": 1.2805830296720457, + "grad_norm": 0.23577408055653468, + "learning_rate": 3.204086280598289e-05, + "loss": 0.1192, + "step": 2460 + }, + { + "epoch": 1.2811035918792295, + "grad_norm": 0.2659768658535487, + "learning_rate": 3.202738035979571e-05, + "loss": 0.1197, + "step": 2461 + }, + { + "epoch": 1.2816241540864133, + "grad_norm": 0.2439223726100321, + "learning_rate": 3.2013895694034804e-05, + "loss": 0.107, + "step": 2462 + }, + { + "epoch": 1.282144716293597, + "grad_norm": 0.23562944836320895, + "learning_rate": 3.200040881295922e-05, + "loss": 0.1104, + "step": 2463 + }, + { + "epoch": 1.2826652785007808, + "grad_norm": 0.25461295273652873, + "learning_rate": 3.198691972082878e-05, + "loss": 0.1142, + "step": 2464 + }, + { + "epoch": 1.2831858407079646, + "grad_norm": 0.23055086268252875, + "learning_rate": 3.197342842190394e-05, + "loss": 0.1072, + "step": 2465 + }, + { + "epoch": 1.2837064029151484, + "grad_norm": 0.2770919732305317, + "learning_rate": 3.1959934920445894e-05, + "loss": 0.1161, + "step": 2466 + }, + { + "epoch": 1.2842269651223321, + "grad_norm": 0.24915210522689957, + "learning_rate": 3.194643922071651e-05, + "loss": 0.1139, + "step": 2467 + }, + { + "epoch": 1.284747527329516, + "grad_norm": 0.23485697563408167, + "learning_rate": 3.193294132697835e-05, + "loss": 0.1111, + "step": 2468 + }, + { + "epoch": 1.2852680895366997, + "grad_norm": 0.2376200199679929, + "learning_rate": 3.191944124349471e-05, + "loss": 0.1075, + "step": 2469 + }, + { + "epoch": 1.2857886517438835, + "grad_norm": 0.23788594569376167, + "learning_rate": 3.190593897452951e-05, + "loss": 0.1086, + "step": 2470 + }, + { + "epoch": 1.2863092139510672, + "grad_norm": 0.2338576028196406, + "learning_rate": 3.189243452434741e-05, + "loss": 0.1147, + "step": 2471 + }, + { + "epoch": 1.286829776158251, + "grad_norm": 0.23732556832773136, + "learning_rate": 3.187892789721373e-05, + "loss": 0.1112, + "step": 2472 + }, + { + "epoch": 1.2873503383654348, + "grad_norm": 0.24378666037810956, + "learning_rate": 3.186541909739452e-05, + "loss": 0.1124, + "step": 2473 + }, + { + "epoch": 1.2878709005726185, + "grad_norm": 0.23279147802128494, + "learning_rate": 3.185190812915646e-05, + "loss": 0.1099, + "step": 2474 + }, + { + "epoch": 1.2883914627798023, + "grad_norm": 0.2237628651766288, + "learning_rate": 3.1838394996766946e-05, + "loss": 0.1116, + "step": 2475 + }, + { + "epoch": 1.2889120249869859, + "grad_norm": 0.22815623654435227, + "learning_rate": 3.182487970449407e-05, + "loss": 0.1094, + "step": 2476 + }, + { + "epoch": 1.2894325871941696, + "grad_norm": 0.2324375751464136, + "learning_rate": 3.181136225660657e-05, + "loss": 0.1078, + "step": 2477 + }, + { + "epoch": 1.2899531494013534, + "grad_norm": 0.24381943224649627, + "learning_rate": 3.179784265737392e-05, + "loss": 0.1186, + "step": 2478 + }, + { + "epoch": 1.2904737116085372, + "grad_norm": 0.22358232434006062, + "learning_rate": 3.178432091106619e-05, + "loss": 0.1086, + "step": 2479 + }, + { + "epoch": 1.290994273815721, + "grad_norm": 0.22505649527448798, + "learning_rate": 3.1770797021954216e-05, + "loss": 0.1149, + "step": 2480 + }, + { + "epoch": 1.2915148360229047, + "grad_norm": 0.2271327003271489, + "learning_rate": 3.1757270994309445e-05, + "loss": 0.1151, + "step": 2481 + }, + { + "epoch": 1.2920353982300885, + "grad_norm": 0.22354543223054793, + "learning_rate": 3.174374283240405e-05, + "loss": 0.1157, + "step": 2482 + }, + { + "epoch": 1.2925559604372723, + "grad_norm": 0.23601648934888142, + "learning_rate": 3.1730212540510835e-05, + "loss": 0.1134, + "step": 2483 + }, + { + "epoch": 1.293076522644456, + "grad_norm": 0.23493593163696477, + "learning_rate": 3.1716680122903294e-05, + "loss": 0.1146, + "step": 2484 + }, + { + "epoch": 1.2935970848516398, + "grad_norm": 0.24903210748416696, + "learning_rate": 3.170314558385562e-05, + "loss": 0.112, + "step": 2485 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 0.24092859338470038, + "learning_rate": 3.1689608927642624e-05, + "loss": 0.114, + "step": 2486 + }, + { + "epoch": 1.2946382092660074, + "grad_norm": 0.23580201734250328, + "learning_rate": 3.1676070158539825e-05, + "loss": 0.1132, + "step": 2487 + }, + { + "epoch": 1.295158771473191, + "grad_norm": 0.2356380954462469, + "learning_rate": 3.166252928082339e-05, + "loss": 0.1087, + "step": 2488 + }, + { + "epoch": 1.2956793336803747, + "grad_norm": 0.23755644465406894, + "learning_rate": 3.164898629877016e-05, + "loss": 0.1109, + "step": 2489 + }, + { + "epoch": 1.2961998958875585, + "grad_norm": 0.2319470903145725, + "learning_rate": 3.1635441216657636e-05, + "loss": 0.1085, + "step": 2490 + }, + { + "epoch": 1.2967204580947422, + "grad_norm": 0.23994882725651862, + "learning_rate": 3.1621894038763995e-05, + "loss": 0.1142, + "step": 2491 + }, + { + "epoch": 1.297241020301926, + "grad_norm": 0.23110760341343395, + "learning_rate": 3.1608344769368056e-05, + "loss": 0.105, + "step": 2492 + }, + { + "epoch": 1.2977615825091098, + "grad_norm": 0.2493451968765941, + "learning_rate": 3.1594793412749315e-05, + "loss": 0.1144, + "step": 2493 + }, + { + "epoch": 1.2982821447162936, + "grad_norm": 0.23459448167768657, + "learning_rate": 3.158123997318792e-05, + "loss": 0.1117, + "step": 2494 + }, + { + "epoch": 1.2988027069234773, + "grad_norm": 0.23267743984705816, + "learning_rate": 3.1567684454964675e-05, + "loss": 0.1092, + "step": 2495 + }, + { + "epoch": 1.299323269130661, + "grad_norm": 0.23495505350422138, + "learning_rate": 3.155412686236105e-05, + "loss": 0.1099, + "step": 2496 + }, + { + "epoch": 1.2998438313378449, + "grad_norm": 0.2271281943706107, + "learning_rate": 3.1540567199659154e-05, + "loss": 0.1104, + "step": 2497 + }, + { + "epoch": 1.3003643935450286, + "grad_norm": 0.25317690129572223, + "learning_rate": 3.152700547114177e-05, + "loss": 0.1118, + "step": 2498 + }, + { + "epoch": 1.3008849557522124, + "grad_norm": 0.24086272384333318, + "learning_rate": 3.15134416810923e-05, + "loss": 0.1132, + "step": 2499 + }, + { + "epoch": 1.3014055179593962, + "grad_norm": 0.25095110856441216, + "learning_rate": 3.149987583379486e-05, + "loss": 0.1184, + "step": 2500 + }, + { + "epoch": 1.30192608016658, + "grad_norm": 0.23992994556687544, + "learning_rate": 3.1486307933534143e-05, + "loss": 0.1095, + "step": 2501 + }, + { + "epoch": 1.3024466423737637, + "grad_norm": 0.2363209984551393, + "learning_rate": 3.147273798459553e-05, + "loss": 0.1114, + "step": 2502 + }, + { + "epoch": 1.3029672045809475, + "grad_norm": 0.2524480432523037, + "learning_rate": 3.145916599126506e-05, + "loss": 0.1124, + "step": 2503 + }, + { + "epoch": 1.3034877667881313, + "grad_norm": 0.22828236428464427, + "learning_rate": 3.1445591957829374e-05, + "loss": 0.105, + "step": 2504 + }, + { + "epoch": 1.304008328995315, + "grad_norm": 0.25760549471351335, + "learning_rate": 3.14320158885758e-05, + "loss": 0.1099, + "step": 2505 + }, + { + "epoch": 1.3045288912024988, + "grad_norm": 0.24230787911345597, + "learning_rate": 3.141843778779229e-05, + "loss": 0.1096, + "step": 2506 + }, + { + "epoch": 1.3050494534096826, + "grad_norm": 0.24593151994037304, + "learning_rate": 3.140485765976743e-05, + "loss": 0.1068, + "step": 2507 + }, + { + "epoch": 1.3055700156168661, + "grad_norm": 0.2367624744628246, + "learning_rate": 3.1391275508790476e-05, + "loss": 0.1089, + "step": 2508 + }, + { + "epoch": 1.30609057782405, + "grad_norm": 0.25001842938229535, + "learning_rate": 3.1377691339151285e-05, + "loss": 0.1154, + "step": 2509 + }, + { + "epoch": 1.3066111400312337, + "grad_norm": 0.23444841996962343, + "learning_rate": 3.136410515514038e-05, + "loss": 0.1111, + "step": 2510 + }, + { + "epoch": 1.3071317022384175, + "grad_norm": 0.25857566739815296, + "learning_rate": 3.13505169610489e-05, + "loss": 0.1184, + "step": 2511 + }, + { + "epoch": 1.3076522644456012, + "grad_norm": 0.23910560119186738, + "learning_rate": 3.133692676116865e-05, + "loss": 0.1072, + "step": 2512 + }, + { + "epoch": 1.308172826652785, + "grad_norm": 0.23446735180490522, + "learning_rate": 3.132333455979202e-05, + "loss": 0.1109, + "step": 2513 + }, + { + "epoch": 1.3086933888599688, + "grad_norm": 0.24121685895466385, + "learning_rate": 3.130974036121208e-05, + "loss": 0.1137, + "step": 2514 + }, + { + "epoch": 1.3092139510671525, + "grad_norm": 0.2471492387424443, + "learning_rate": 3.12961441697225e-05, + "loss": 0.1115, + "step": 2515 + }, + { + "epoch": 1.3097345132743363, + "grad_norm": 0.23961316944400285, + "learning_rate": 3.1282545989617595e-05, + "loss": 0.1174, + "step": 2516 + }, + { + "epoch": 1.31025507548152, + "grad_norm": 0.22528913743073048, + "learning_rate": 3.126894582519231e-05, + "loss": 0.1122, + "step": 2517 + }, + { + "epoch": 1.3107756376887039, + "grad_norm": 0.2222728768218916, + "learning_rate": 3.1255343680742195e-05, + "loss": 0.1088, + "step": 2518 + }, + { + "epoch": 1.3112961998958876, + "grad_norm": 0.2364525553318129, + "learning_rate": 3.1241739560563446e-05, + "loss": 0.1092, + "step": 2519 + }, + { + "epoch": 1.3118167621030712, + "grad_norm": 0.23549419626533213, + "learning_rate": 3.122813346895288e-05, + "loss": 0.1126, + "step": 2520 + }, + { + "epoch": 1.312337324310255, + "grad_norm": 0.22438892761571402, + "learning_rate": 3.121452541020793e-05, + "loss": 0.1057, + "step": 2521 + }, + { + "epoch": 1.3128578865174387, + "grad_norm": 0.2331411062533715, + "learning_rate": 3.1200915388626654e-05, + "loss": 0.1132, + "step": 2522 + }, + { + "epoch": 1.3133784487246225, + "grad_norm": 0.23459665267739663, + "learning_rate": 3.118730340850774e-05, + "loss": 0.1075, + "step": 2523 + }, + { + "epoch": 1.3138990109318063, + "grad_norm": 0.2347453526216641, + "learning_rate": 3.1173689474150476e-05, + "loss": 0.1098, + "step": 2524 + }, + { + "epoch": 1.31441957313899, + "grad_norm": 0.23323798829157819, + "learning_rate": 3.116007358985477e-05, + "loss": 0.1091, + "step": 2525 + }, + { + "epoch": 1.3149401353461738, + "grad_norm": 0.23608030886582382, + "learning_rate": 3.1146455759921166e-05, + "loss": 0.1116, + "step": 2526 + }, + { + "epoch": 1.3154606975533576, + "grad_norm": 0.24356320449137384, + "learning_rate": 3.11328359886508e-05, + "loss": 0.1072, + "step": 2527 + }, + { + "epoch": 1.3159812597605414, + "grad_norm": 0.23069187776813016, + "learning_rate": 3.111921428034544e-05, + "loss": 0.1072, + "step": 2528 + }, + { + "epoch": 1.3165018219677251, + "grad_norm": 0.24927589485206192, + "learning_rate": 3.110559063930743e-05, + "loss": 0.1165, + "step": 2529 + }, + { + "epoch": 1.317022384174909, + "grad_norm": 0.22698192027918843, + "learning_rate": 3.109196506983978e-05, + "loss": 0.1098, + "step": 2530 + }, + { + "epoch": 1.3175429463820927, + "grad_norm": 0.2503079852778464, + "learning_rate": 3.107833757624605e-05, + "loss": 0.1106, + "step": 2531 + }, + { + "epoch": 1.3180635085892765, + "grad_norm": 0.2459913765268515, + "learning_rate": 3.1064708162830466e-05, + "loss": 0.1176, + "step": 2532 + }, + { + "epoch": 1.3185840707964602, + "grad_norm": 0.24673270505003705, + "learning_rate": 3.105107683389781e-05, + "loss": 0.1162, + "step": 2533 + }, + { + "epoch": 1.319104633003644, + "grad_norm": 0.22092262416559946, + "learning_rate": 3.10374435937535e-05, + "loss": 0.1067, + "step": 2534 + }, + { + "epoch": 1.3196251952108278, + "grad_norm": 0.23778340367233575, + "learning_rate": 3.102380844670355e-05, + "loss": 0.1156, + "step": 2535 + }, + { + "epoch": 1.3201457574180115, + "grad_norm": 0.24483607513262326, + "learning_rate": 3.101017139705455e-05, + "loss": 0.1145, + "step": 2536 + }, + { + "epoch": 1.3206663196251953, + "grad_norm": 0.22769610941345692, + "learning_rate": 3.099653244911375e-05, + "loss": 0.1144, + "step": 2537 + }, + { + "epoch": 1.321186881832379, + "grad_norm": 0.24187095569563516, + "learning_rate": 3.098289160718895e-05, + "loss": 0.1146, + "step": 2538 + }, + { + "epoch": 1.3217074440395629, + "grad_norm": 0.22264068349148974, + "learning_rate": 3.096924887558855e-05, + "loss": 0.1111, + "step": 2539 + }, + { + "epoch": 1.3222280062467464, + "grad_norm": 0.24298069839995018, + "learning_rate": 3.095560425862157e-05, + "loss": 0.1138, + "step": 2540 + }, + { + "epoch": 1.3227485684539302, + "grad_norm": 0.23672363156144616, + "learning_rate": 3.094195776059763e-05, + "loss": 0.1086, + "step": 2541 + }, + { + "epoch": 1.323269130661114, + "grad_norm": 0.23121118320526707, + "learning_rate": 3.09283093858269e-05, + "loss": 0.1157, + "step": 2542 + }, + { + "epoch": 1.3237896928682977, + "grad_norm": 0.22330970680800596, + "learning_rate": 3.0914659138620186e-05, + "loss": 0.1062, + "step": 2543 + }, + { + "epoch": 1.3243102550754815, + "grad_norm": 0.24667833728809804, + "learning_rate": 3.090100702328888e-05, + "loss": 0.1203, + "step": 2544 + }, + { + "epoch": 1.3248308172826653, + "grad_norm": 0.24428638534778402, + "learning_rate": 3.088735304414494e-05, + "loss": 0.1183, + "step": 2545 + }, + { + "epoch": 1.325351379489849, + "grad_norm": 0.2357143337600258, + "learning_rate": 3.087369720550094e-05, + "loss": 0.1158, + "step": 2546 + }, + { + "epoch": 1.3258719416970328, + "grad_norm": 0.21965988630691494, + "learning_rate": 3.0860039511670024e-05, + "loss": 0.1103, + "step": 2547 + }, + { + "epoch": 1.3263925039042166, + "grad_norm": 0.22044710681349058, + "learning_rate": 3.084637996696592e-05, + "loss": 0.1148, + "step": 2548 + }, + { + "epoch": 1.3269130661114004, + "grad_norm": 0.2305177255025451, + "learning_rate": 3.083271857570297e-05, + "loss": 0.1093, + "step": 2549 + }, + { + "epoch": 1.3274336283185841, + "grad_norm": 0.23136166042256415, + "learning_rate": 3.0819055342196054e-05, + "loss": 0.1078, + "step": 2550 + }, + { + "epoch": 1.327954190525768, + "grad_norm": 0.2421297560595075, + "learning_rate": 3.080539027076066e-05, + "loss": 0.1125, + "step": 2551 + }, + { + "epoch": 1.3284747527329515, + "grad_norm": 0.22632609868379913, + "learning_rate": 3.0791723365712867e-05, + "loss": 0.1098, + "step": 2552 + }, + { + "epoch": 1.3289953149401352, + "grad_norm": 0.23994272711042058, + "learning_rate": 3.077805463136931e-05, + "loss": 0.111, + "step": 2553 + }, + { + "epoch": 1.329515877147319, + "grad_norm": 0.24457717811072027, + "learning_rate": 3.07643840720472e-05, + "loss": 0.1168, + "step": 2554 + }, + { + "epoch": 1.3300364393545028, + "grad_norm": 0.2375550486078291, + "learning_rate": 3.075071169206437e-05, + "loss": 0.1164, + "step": 2555 + }, + { + "epoch": 1.3305570015616865, + "grad_norm": 0.23885122754311797, + "learning_rate": 3.073703749573916e-05, + "loss": 0.1135, + "step": 2556 + }, + { + "epoch": 1.3310775637688703, + "grad_norm": 0.24172905569894784, + "learning_rate": 3.072336148739053e-05, + "loss": 0.1105, + "step": 2557 + }, + { + "epoch": 1.331598125976054, + "grad_norm": 0.24554361390660961, + "learning_rate": 3.0709683671338e-05, + "loss": 0.1169, + "step": 2558 + }, + { + "epoch": 1.3321186881832379, + "grad_norm": 0.22416722525242608, + "learning_rate": 3.069600405190167e-05, + "loss": 0.1099, + "step": 2559 + }, + { + "epoch": 1.3326392503904216, + "grad_norm": 0.23312900420360225, + "learning_rate": 3.068232263340218e-05, + "loss": 0.1126, + "step": 2560 + }, + { + "epoch": 1.3331598125976054, + "grad_norm": 0.23606607210162725, + "learning_rate": 3.066863942016077e-05, + "loss": 0.1089, + "step": 2561 + }, + { + "epoch": 1.3336803748047892, + "grad_norm": 0.2373211884453036, + "learning_rate": 3.0654954416499244e-05, + "loss": 0.109, + "step": 2562 + }, + { + "epoch": 1.334200937011973, + "grad_norm": 0.2343989018463782, + "learning_rate": 3.064126762673994e-05, + "loss": 0.1125, + "step": 2563 + }, + { + "epoch": 1.3347214992191567, + "grad_norm": 0.26796891784177396, + "learning_rate": 3.062757905520582e-05, + "loss": 0.122, + "step": 2564 + }, + { + "epoch": 1.3352420614263405, + "grad_norm": 0.24223466770689397, + "learning_rate": 3.0613888706220336e-05, + "loss": 0.1134, + "step": 2565 + }, + { + "epoch": 1.3357626236335243, + "grad_norm": 0.2549438338386172, + "learning_rate": 3.060019658410755e-05, + "loss": 0.117, + "step": 2566 + }, + { + "epoch": 1.336283185840708, + "grad_norm": 0.25893651098604287, + "learning_rate": 3.0586502693192074e-05, + "loss": 0.116, + "step": 2567 + }, + { + "epoch": 1.3368037480478918, + "grad_norm": 0.22061547809765672, + "learning_rate": 3.0572807037799075e-05, + "loss": 0.1069, + "step": 2568 + }, + { + "epoch": 1.3373243102550756, + "grad_norm": 0.2707795969101939, + "learning_rate": 3.055910962225428e-05, + "loss": 0.1084, + "step": 2569 + }, + { + "epoch": 1.3378448724622594, + "grad_norm": 0.23308154556319333, + "learning_rate": 3.054541045088396e-05, + "loss": 0.115, + "step": 2570 + }, + { + "epoch": 1.3383654346694431, + "grad_norm": 0.2567417526177613, + "learning_rate": 3.053170952801496e-05, + "loss": 0.115, + "step": 2571 + }, + { + "epoch": 1.3388859968766267, + "grad_norm": 0.2362561720344537, + "learning_rate": 3.0518006857974666e-05, + "loss": 0.1142, + "step": 2572 + }, + { + "epoch": 1.3394065590838105, + "grad_norm": 0.24896491322044217, + "learning_rate": 3.0504302445091027e-05, + "loss": 0.1142, + "step": 2573 + }, + { + "epoch": 1.3399271212909942, + "grad_norm": 0.26160908249539727, + "learning_rate": 3.0490596293692525e-05, + "loss": 0.1179, + "step": 2574 + }, + { + "epoch": 1.340447683498178, + "grad_norm": 0.23521571929263785, + "learning_rate": 3.0476888408108202e-05, + "loss": 0.1162, + "step": 2575 + }, + { + "epoch": 1.3409682457053618, + "grad_norm": 0.24915979845519137, + "learning_rate": 3.0463178792667645e-05, + "loss": 0.1127, + "step": 2576 + }, + { + "epoch": 1.3414888079125455, + "grad_norm": 0.23236370763142386, + "learning_rate": 3.0449467451700997e-05, + "loss": 0.1052, + "step": 2577 + }, + { + "epoch": 1.3420093701197293, + "grad_norm": 0.2204117728067703, + "learning_rate": 3.0435754389538928e-05, + "loss": 0.1048, + "step": 2578 + }, + { + "epoch": 1.342529932326913, + "grad_norm": 0.23813101057229322, + "learning_rate": 3.0422039610512666e-05, + "loss": 0.109, + "step": 2579 + }, + { + "epoch": 1.3430504945340969, + "grad_norm": 0.24713378801317953, + "learning_rate": 3.0408323118953968e-05, + "loss": 0.1121, + "step": 2580 + }, + { + "epoch": 1.3435710567412806, + "grad_norm": 0.25584268149164247, + "learning_rate": 3.0394604919195156e-05, + "loss": 0.1118, + "step": 2581 + }, + { + "epoch": 1.3440916189484644, + "grad_norm": 0.24392930456555278, + "learning_rate": 3.0380885015569067e-05, + "loss": 0.1083, + "step": 2582 + }, + { + "epoch": 1.3446121811556482, + "grad_norm": 0.2352412179715951, + "learning_rate": 3.036716341240908e-05, + "loss": 0.1099, + "step": 2583 + }, + { + "epoch": 1.3451327433628317, + "grad_norm": 0.25301035157344787, + "learning_rate": 3.0353440114049126e-05, + "loss": 0.1077, + "step": 2584 + }, + { + "epoch": 1.3456533055700155, + "grad_norm": 0.27102376417563373, + "learning_rate": 3.0339715124823652e-05, + "loss": 0.1133, + "step": 2585 + }, + { + "epoch": 1.3461738677771993, + "grad_norm": 0.245471066517425, + "learning_rate": 3.0325988449067654e-05, + "loss": 0.1063, + "step": 2586 + }, + { + "epoch": 1.346694429984383, + "grad_norm": 0.25444136673567697, + "learning_rate": 3.031226009111665e-05, + "loss": 0.1114, + "step": 2587 + }, + { + "epoch": 1.3472149921915668, + "grad_norm": 0.24276271471445565, + "learning_rate": 3.0298530055306708e-05, + "loss": 0.1131, + "step": 2588 + }, + { + "epoch": 1.3477355543987506, + "grad_norm": 0.26370328884810446, + "learning_rate": 3.028479834597439e-05, + "loss": 0.1118, + "step": 2589 + }, + { + "epoch": 1.3482561166059344, + "grad_norm": 0.252507222324151, + "learning_rate": 3.027106496745683e-05, + "loss": 0.1088, + "step": 2590 + }, + { + "epoch": 1.3487766788131181, + "grad_norm": 0.22861875840700396, + "learning_rate": 3.025732992409166e-05, + "loss": 0.1117, + "step": 2591 + }, + { + "epoch": 1.349297241020302, + "grad_norm": 0.24956628943142115, + "learning_rate": 3.0243593220217044e-05, + "loss": 0.1116, + "step": 2592 + }, + { + "epoch": 1.3498178032274857, + "grad_norm": 0.2390835790110749, + "learning_rate": 3.0229854860171662e-05, + "loss": 0.1131, + "step": 2593 + }, + { + "epoch": 1.3503383654346695, + "grad_norm": 0.24074530354960633, + "learning_rate": 3.021611484829475e-05, + "loss": 0.1111, + "step": 2594 + }, + { + "epoch": 1.3508589276418532, + "grad_norm": 0.2444101290555521, + "learning_rate": 3.0202373188926037e-05, + "loss": 0.1084, + "step": 2595 + }, + { + "epoch": 1.351379489849037, + "grad_norm": 0.24713525990174354, + "learning_rate": 3.0188629886405763e-05, + "loss": 0.1054, + "step": 2596 + }, + { + "epoch": 1.3519000520562208, + "grad_norm": 0.25164204870542045, + "learning_rate": 3.017488494507471e-05, + "loss": 0.1121, + "step": 2597 + }, + { + "epoch": 1.3524206142634045, + "grad_norm": 0.24569308639230078, + "learning_rate": 3.0161138369274177e-05, + "loss": 0.1121, + "step": 2598 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 0.24172753415281717, + "learning_rate": 3.0147390163345972e-05, + "loss": 0.1104, + "step": 2599 + }, + { + "epoch": 1.353461738677772, + "grad_norm": 0.2713692191610928, + "learning_rate": 3.013364033163241e-05, + "loss": 0.1175, + "step": 2600 + }, + { + "epoch": 1.3539823008849559, + "grad_norm": 0.2610319613907833, + "learning_rate": 3.0119888878476338e-05, + "loss": 0.1154, + "step": 2601 + }, + { + "epoch": 1.3545028630921396, + "grad_norm": 0.22606906047311914, + "learning_rate": 3.0106135808221093e-05, + "loss": 0.1057, + "step": 2602 + }, + { + "epoch": 1.3550234252993234, + "grad_norm": 0.2416710905492764, + "learning_rate": 3.009238112521054e-05, + "loss": 0.1111, + "step": 2603 + }, + { + "epoch": 1.355543987506507, + "grad_norm": 0.23725466138349213, + "learning_rate": 3.007862483378906e-05, + "loss": 0.114, + "step": 2604 + }, + { + "epoch": 1.3560645497136907, + "grad_norm": 0.22201764810825417, + "learning_rate": 3.0064866938301507e-05, + "loss": 0.1091, + "step": 2605 + }, + { + "epoch": 1.3565851119208745, + "grad_norm": 0.26553203846707873, + "learning_rate": 3.005110744309328e-05, + "loss": 0.1095, + "step": 2606 + }, + { + "epoch": 1.3571056741280583, + "grad_norm": 0.22897934347302173, + "learning_rate": 3.003734635251026e-05, + "loss": 0.1042, + "step": 2607 + }, + { + "epoch": 1.357626236335242, + "grad_norm": 0.22965225339232517, + "learning_rate": 3.0023583670898848e-05, + "loss": 0.1123, + "step": 2608 + }, + { + "epoch": 1.3581467985424258, + "grad_norm": 0.2374884978816423, + "learning_rate": 3.0009819402605938e-05, + "loss": 0.1065, + "step": 2609 + }, + { + "epoch": 1.3586673607496096, + "grad_norm": 0.23724855307575515, + "learning_rate": 2.999605355197892e-05, + "loss": 0.1138, + "step": 2610 + }, + { + "epoch": 1.3591879229567934, + "grad_norm": 0.2522681898496753, + "learning_rate": 2.9982286123365694e-05, + "loss": 0.1127, + "step": 2611 + }, + { + "epoch": 1.3597084851639771, + "grad_norm": 0.2390455975944688, + "learning_rate": 2.9968517121114652e-05, + "loss": 0.1103, + "step": 2612 + }, + { + "epoch": 1.360229047371161, + "grad_norm": 0.22922231459543907, + "learning_rate": 2.9954746549574697e-05, + "loss": 0.1017, + "step": 2613 + }, + { + "epoch": 1.3607496095783447, + "grad_norm": 0.23580922604859134, + "learning_rate": 2.9940974413095203e-05, + "loss": 0.1099, + "step": 2614 + }, + { + "epoch": 1.3612701717855284, + "grad_norm": 0.22858569901780046, + "learning_rate": 2.9927200716026055e-05, + "loss": 0.111, + "step": 2615 + }, + { + "epoch": 1.361790733992712, + "grad_norm": 0.2532805937856553, + "learning_rate": 2.9913425462717625e-05, + "loss": 0.113, + "step": 2616 + }, + { + "epoch": 1.3623112961998958, + "grad_norm": 0.23253715435079703, + "learning_rate": 2.989964865752079e-05, + "loss": 0.11, + "step": 2617 + }, + { + "epoch": 1.3628318584070795, + "grad_norm": 0.23333001277583243, + "learning_rate": 2.9885870304786896e-05, + "loss": 0.1116, + "step": 2618 + }, + { + "epoch": 1.3633524206142633, + "grad_norm": 0.21615247744152524, + "learning_rate": 2.9872090408867785e-05, + "loss": 0.1058, + "step": 2619 + }, + { + "epoch": 1.363872982821447, + "grad_norm": 0.2367438794292196, + "learning_rate": 2.9858308974115808e-05, + "loss": 0.112, + "step": 2620 + }, + { + "epoch": 1.3643935450286309, + "grad_norm": 0.23367571489326353, + "learning_rate": 2.9844526004883755e-05, + "loss": 0.1045, + "step": 2621 + }, + { + "epoch": 1.3649141072358146, + "grad_norm": 0.24556639627942822, + "learning_rate": 2.9830741505524958e-05, + "loss": 0.1217, + "step": 2622 + }, + { + "epoch": 1.3654346694429984, + "grad_norm": 0.23637425075744797, + "learning_rate": 2.9816955480393187e-05, + "loss": 0.111, + "step": 2623 + }, + { + "epoch": 1.3659552316501822, + "grad_norm": 0.25593497993008707, + "learning_rate": 2.9803167933842714e-05, + "loss": 0.11, + "step": 2624 + }, + { + "epoch": 1.366475793857366, + "grad_norm": 0.23985020724360726, + "learning_rate": 2.9789378870228283e-05, + "loss": 0.1109, + "step": 2625 + }, + { + "epoch": 1.3669963560645497, + "grad_norm": 0.23470299239423129, + "learning_rate": 2.9775588293905132e-05, + "loss": 0.1123, + "step": 2626 + }, + { + "epoch": 1.3675169182717335, + "grad_norm": 0.2331930805433966, + "learning_rate": 2.976179620922896e-05, + "loss": 0.1086, + "step": 2627 + }, + { + "epoch": 1.3680374804789173, + "grad_norm": 0.23535519041672734, + "learning_rate": 2.9748002620555944e-05, + "loss": 0.1084, + "step": 2628 + }, + { + "epoch": 1.368558042686101, + "grad_norm": 0.2506834392739139, + "learning_rate": 2.9734207532242754e-05, + "loss": 0.1134, + "step": 2629 + }, + { + "epoch": 1.3690786048932848, + "grad_norm": 0.22986664916695576, + "learning_rate": 2.9720410948646504e-05, + "loss": 0.1081, + "step": 2630 + }, + { + "epoch": 1.3695991671004686, + "grad_norm": 0.24834546542727068, + "learning_rate": 2.970661287412482e-05, + "loss": 0.1162, + "step": 2631 + }, + { + "epoch": 1.3701197293076524, + "grad_norm": 0.2308044732796431, + "learning_rate": 2.969281331303576e-05, + "loss": 0.1112, + "step": 2632 + }, + { + "epoch": 1.3706402915148361, + "grad_norm": 0.24372786534936366, + "learning_rate": 2.967901226973787e-05, + "loss": 0.1123, + "step": 2633 + }, + { + "epoch": 1.37116085372202, + "grad_norm": 0.23353876211260266, + "learning_rate": 2.966520974859016e-05, + "loss": 0.1063, + "step": 2634 + }, + { + "epoch": 1.3716814159292037, + "grad_norm": 0.23840470512997244, + "learning_rate": 2.965140575395211e-05, + "loss": 0.1123, + "step": 2635 + }, + { + "epoch": 1.3722019781363872, + "grad_norm": 0.2308432390077942, + "learning_rate": 2.9637600290183675e-05, + "loss": 0.1126, + "step": 2636 + }, + { + "epoch": 1.372722540343571, + "grad_norm": 0.23561930060975914, + "learning_rate": 2.9623793361645247e-05, + "loss": 0.1106, + "step": 2637 + }, + { + "epoch": 1.3732431025507548, + "grad_norm": 0.21718824756279217, + "learning_rate": 2.96099849726977e-05, + "loss": 0.1078, + "step": 2638 + }, + { + "epoch": 1.3737636647579385, + "grad_norm": 0.23549809364298932, + "learning_rate": 2.9596175127702368e-05, + "loss": 0.1155, + "step": 2639 + }, + { + "epoch": 1.3742842269651223, + "grad_norm": 0.2351146169717648, + "learning_rate": 2.958236383102105e-05, + "loss": 0.1089, + "step": 2640 + }, + { + "epoch": 1.374804789172306, + "grad_norm": 0.230720734914438, + "learning_rate": 2.956855108701599e-05, + "loss": 0.1125, + "step": 2641 + }, + { + "epoch": 1.3753253513794899, + "grad_norm": 0.23350953234570837, + "learning_rate": 2.9554736900049883e-05, + "loss": 0.1123, + "step": 2642 + }, + { + "epoch": 1.3758459135866736, + "grad_norm": 0.2272348761594922, + "learning_rate": 2.954092127448591e-05, + "loss": 0.1126, + "step": 2643 + }, + { + "epoch": 1.3763664757938574, + "grad_norm": 0.26121567593064826, + "learning_rate": 2.9527104214687685e-05, + "loss": 0.1171, + "step": 2644 + }, + { + "epoch": 1.3768870380010412, + "grad_norm": 0.24347665782131095, + "learning_rate": 2.951328572501928e-05, + "loss": 0.114, + "step": 2645 + }, + { + "epoch": 1.377407600208225, + "grad_norm": 0.22666019429301196, + "learning_rate": 2.94994658098452e-05, + "loss": 0.109, + "step": 2646 + }, + { + "epoch": 1.3779281624154087, + "grad_norm": 0.24888395219943815, + "learning_rate": 2.9485644473530437e-05, + "loss": 0.114, + "step": 2647 + }, + { + "epoch": 1.3784487246225923, + "grad_norm": 0.2339315941555209, + "learning_rate": 2.9471821720440406e-05, + "loss": 0.1054, + "step": 2648 + }, + { + "epoch": 1.378969286829776, + "grad_norm": 0.22715324176204416, + "learning_rate": 2.9457997554940974e-05, + "loss": 0.1071, + "step": 2649 + }, + { + "epoch": 1.3794898490369598, + "grad_norm": 0.23984980511126433, + "learning_rate": 2.944417198139846e-05, + "loss": 0.1104, + "step": 2650 + }, + { + "epoch": 1.3800104112441436, + "grad_norm": 0.2388805658491324, + "learning_rate": 2.9430345004179614e-05, + "loss": 0.1112, + "step": 2651 + }, + { + "epoch": 1.3805309734513274, + "grad_norm": 0.2609240796303999, + "learning_rate": 2.9416516627651647e-05, + "loss": 0.1201, + "step": 2652 + }, + { + "epoch": 1.3810515356585111, + "grad_norm": 0.22009015612627597, + "learning_rate": 2.9402686856182205e-05, + "loss": 0.1059, + "step": 2653 + }, + { + "epoch": 1.381572097865695, + "grad_norm": 0.25819973582961897, + "learning_rate": 2.9388855694139373e-05, + "loss": 0.1136, + "step": 2654 + }, + { + "epoch": 1.3820926600728787, + "grad_norm": 0.2626233123096237, + "learning_rate": 2.9375023145891666e-05, + "loss": 0.1182, + "step": 2655 + }, + { + "epoch": 1.3826132222800624, + "grad_norm": 0.24547878318931401, + "learning_rate": 2.936118921580806e-05, + "loss": 0.112, + "step": 2656 + }, + { + "epoch": 1.3831337844872462, + "grad_norm": 0.22938479950092555, + "learning_rate": 2.9347353908257936e-05, + "loss": 0.1112, + "step": 2657 + }, + { + "epoch": 1.38365434669443, + "grad_norm": 0.23335117709653277, + "learning_rate": 2.9333517227611152e-05, + "loss": 0.1116, + "step": 2658 + }, + { + "epoch": 1.3841749089016138, + "grad_norm": 0.2514180524501941, + "learning_rate": 2.9319679178237957e-05, + "loss": 0.1115, + "step": 2659 + }, + { + "epoch": 1.3846954711087975, + "grad_norm": 0.22641541429241346, + "learning_rate": 2.9305839764509058e-05, + "loss": 0.1071, + "step": 2660 + }, + { + "epoch": 1.3852160333159813, + "grad_norm": 0.23200240748290513, + "learning_rate": 2.929199899079558e-05, + "loss": 0.1067, + "step": 2661 + }, + { + "epoch": 1.385736595523165, + "grad_norm": 0.2390937747494344, + "learning_rate": 2.9278156861469096e-05, + "loss": 0.1081, + "step": 2662 + }, + { + "epoch": 1.3862571577303489, + "grad_norm": 0.23468619522538356, + "learning_rate": 2.9264313380901588e-05, + "loss": 0.1097, + "step": 2663 + }, + { + "epoch": 1.3867777199375326, + "grad_norm": 0.24345058195506755, + "learning_rate": 2.9250468553465466e-05, + "loss": 0.1089, + "step": 2664 + }, + { + "epoch": 1.3872982821447164, + "grad_norm": 0.25475277322641765, + "learning_rate": 2.9236622383533575e-05, + "loss": 0.1113, + "step": 2665 + }, + { + "epoch": 1.3878188443519002, + "grad_norm": 0.2532468565384966, + "learning_rate": 2.9222774875479176e-05, + "loss": 0.1079, + "step": 2666 + }, + { + "epoch": 1.388339406559084, + "grad_norm": 0.2512503551401563, + "learning_rate": 2.920892603367596e-05, + "loss": 0.1107, + "step": 2667 + }, + { + "epoch": 1.3888599687662675, + "grad_norm": 0.24339764746526638, + "learning_rate": 2.919507586249805e-05, + "loss": 0.1161, + "step": 2668 + }, + { + "epoch": 1.3893805309734513, + "grad_norm": 0.2523651063330094, + "learning_rate": 2.9181224366319947e-05, + "loss": 0.1139, + "step": 2669 + }, + { + "epoch": 1.389901093180635, + "grad_norm": 0.22988428164397576, + "learning_rate": 2.916737154951662e-05, + "loss": 0.1095, + "step": 2670 + }, + { + "epoch": 1.3904216553878188, + "grad_norm": 0.24090438564750494, + "learning_rate": 2.9153517416463418e-05, + "loss": 0.1097, + "step": 2671 + }, + { + "epoch": 1.3909422175950026, + "grad_norm": 0.24074539123393496, + "learning_rate": 2.913966197153613e-05, + "loss": 0.1068, + "step": 2672 + }, + { + "epoch": 1.3914627798021864, + "grad_norm": 0.2374411651100617, + "learning_rate": 2.9125805219110952e-05, + "loss": 0.1135, + "step": 2673 + }, + { + "epoch": 1.3919833420093701, + "grad_norm": 0.24364817979891987, + "learning_rate": 2.9111947163564478e-05, + "loss": 0.1077, + "step": 2674 + }, + { + "epoch": 1.392503904216554, + "grad_norm": 0.239001833403062, + "learning_rate": 2.9098087809273743e-05, + "loss": 0.1089, + "step": 2675 + }, + { + "epoch": 1.3930244664237377, + "grad_norm": 0.2415390586855587, + "learning_rate": 2.908422716061617e-05, + "loss": 0.108, + "step": 2676 + }, + { + "epoch": 1.3935450286309214, + "grad_norm": 0.26779259818902096, + "learning_rate": 2.9070365221969598e-05, + "loss": 0.1099, + "step": 2677 + }, + { + "epoch": 1.3940655908381052, + "grad_norm": 0.23464593639934386, + "learning_rate": 2.9056501997712267e-05, + "loss": 0.1111, + "step": 2678 + }, + { + "epoch": 1.394586153045289, + "grad_norm": 0.2343125934015767, + "learning_rate": 2.904263749222283e-05, + "loss": 0.1055, + "step": 2679 + }, + { + "epoch": 1.3951067152524725, + "grad_norm": 0.2366101209417697, + "learning_rate": 2.9028771709880342e-05, + "loss": 0.1104, + "step": 2680 + }, + { + "epoch": 1.3956272774596563, + "grad_norm": 0.24371168991559053, + "learning_rate": 2.9014904655064273e-05, + "loss": 0.1132, + "step": 2681 + }, + { + "epoch": 1.39614783966684, + "grad_norm": 0.23630952126438518, + "learning_rate": 2.9001036332154474e-05, + "loss": 0.1088, + "step": 2682 + }, + { + "epoch": 1.3966684018740239, + "grad_norm": 0.24538418864834158, + "learning_rate": 2.8987166745531207e-05, + "loss": 0.1126, + "step": 2683 + }, + { + "epoch": 1.3971889640812076, + "grad_norm": 0.23134351061219888, + "learning_rate": 2.897329589957514e-05, + "loss": 0.1144, + "step": 2684 + }, + { + "epoch": 1.3977095262883914, + "grad_norm": 0.23678786376287386, + "learning_rate": 2.8959423798667317e-05, + "loss": 0.1088, + "step": 2685 + }, + { + "epoch": 1.3982300884955752, + "grad_norm": 0.23975398835505177, + "learning_rate": 2.894555044718921e-05, + "loss": 0.1089, + "step": 2686 + }, + { + "epoch": 1.398750650702759, + "grad_norm": 0.23608185526452935, + "learning_rate": 2.893167584952266e-05, + "loss": 0.11, + "step": 2687 + }, + { + "epoch": 1.3992712129099427, + "grad_norm": 0.24040631010421307, + "learning_rate": 2.8917800010049917e-05, + "loss": 0.1169, + "step": 2688 + }, + { + "epoch": 1.3997917751171265, + "grad_norm": 0.2414692039364617, + "learning_rate": 2.8903922933153606e-05, + "loss": 0.1097, + "step": 2689 + }, + { + "epoch": 1.4003123373243103, + "grad_norm": 0.24897062492291208, + "learning_rate": 2.8890044623216763e-05, + "loss": 0.1149, + "step": 2690 + }, + { + "epoch": 1.400832899531494, + "grad_norm": 0.23371010023798855, + "learning_rate": 2.8876165084622797e-05, + "loss": 0.1115, + "step": 2691 + }, + { + "epoch": 1.4013534617386778, + "grad_norm": 0.21963840974379567, + "learning_rate": 2.8862284321755517e-05, + "loss": 0.1007, + "step": 2692 + }, + { + "epoch": 1.4018740239458616, + "grad_norm": 0.23159324050922378, + "learning_rate": 2.8848402338999115e-05, + "loss": 0.1115, + "step": 2693 + }, + { + "epoch": 1.4023945861530454, + "grad_norm": 0.23496545202276656, + "learning_rate": 2.8834519140738158e-05, + "loss": 0.1143, + "step": 2694 + }, + { + "epoch": 1.4029151483602291, + "grad_norm": 0.23229098782752947, + "learning_rate": 2.882063473135763e-05, + "loss": 0.1135, + "step": 2695 + }, + { + "epoch": 1.403435710567413, + "grad_norm": 0.23844892538632909, + "learning_rate": 2.880674911524284e-05, + "loss": 0.1125, + "step": 2696 + }, + { + "epoch": 1.4039562727745967, + "grad_norm": 0.22375205668172357, + "learning_rate": 2.8792862296779538e-05, + "loss": 0.1086, + "step": 2697 + }, + { + "epoch": 1.4044768349817804, + "grad_norm": 0.243507183609586, + "learning_rate": 2.8778974280353817e-05, + "loss": 0.1111, + "step": 2698 + }, + { + "epoch": 1.4049973971889642, + "grad_norm": 0.23876948702376113, + "learning_rate": 2.8765085070352153e-05, + "loss": 0.1116, + "step": 2699 + }, + { + "epoch": 1.4055179593961478, + "grad_norm": 0.23325769806586572, + "learning_rate": 2.8751194671161423e-05, + "loss": 0.1098, + "step": 2700 + }, + { + "epoch": 1.4060385216033315, + "grad_norm": 0.2508356243169636, + "learning_rate": 2.8737303087168837e-05, + "loss": 0.1071, + "step": 2701 + }, + { + "epoch": 1.4065590838105153, + "grad_norm": 0.24922685226760763, + "learning_rate": 2.8723410322762027e-05, + "loss": 0.1152, + "step": 2702 + }, + { + "epoch": 1.407079646017699, + "grad_norm": 0.2487978849373613, + "learning_rate": 2.8709516382328962e-05, + "loss": 0.113, + "step": 2703 + }, + { + "epoch": 1.4076002082248829, + "grad_norm": 0.24928009890621933, + "learning_rate": 2.8695621270258e-05, + "loss": 0.1174, + "step": 2704 + }, + { + "epoch": 1.4081207704320666, + "grad_norm": 0.23579163389801594, + "learning_rate": 2.8681724990937857e-05, + "loss": 0.1105, + "step": 2705 + }, + { + "epoch": 1.4086413326392504, + "grad_norm": 0.23415300436456926, + "learning_rate": 2.8667827548757624e-05, + "loss": 0.1102, + "step": 2706 + }, + { + "epoch": 1.4091618948464342, + "grad_norm": 0.22972294165859275, + "learning_rate": 2.865392894810678e-05, + "loss": 0.1061, + "step": 2707 + }, + { + "epoch": 1.409682457053618, + "grad_norm": 0.2345573651214216, + "learning_rate": 2.8640029193375128e-05, + "loss": 0.1046, + "step": 2708 + }, + { + "epoch": 1.4102030192608017, + "grad_norm": 0.23651806913382753, + "learning_rate": 2.8626128288952862e-05, + "loss": 0.1069, + "step": 2709 + }, + { + "epoch": 1.4107235814679855, + "grad_norm": 0.24904333761243203, + "learning_rate": 2.8612226239230532e-05, + "loss": 0.1129, + "step": 2710 + }, + { + "epoch": 1.4112441436751693, + "grad_norm": 0.23917081649980002, + "learning_rate": 2.8598323048599067e-05, + "loss": 0.1081, + "step": 2711 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 0.2456854467567518, + "learning_rate": 2.8584418721449724e-05, + "loss": 0.1103, + "step": 2712 + }, + { + "epoch": 1.4122852680895366, + "grad_norm": 0.2452287893186469, + "learning_rate": 2.8570513262174152e-05, + "loss": 0.1174, + "step": 2713 + }, + { + "epoch": 1.4128058302967204, + "grad_norm": 0.23238334523577805, + "learning_rate": 2.855660667516433e-05, + "loss": 0.1099, + "step": 2714 + }, + { + "epoch": 1.4133263925039041, + "grad_norm": 0.23389506698918613, + "learning_rate": 2.854269896481261e-05, + "loss": 0.1084, + "step": 2715 + }, + { + "epoch": 1.413846954711088, + "grad_norm": 0.23215202724524295, + "learning_rate": 2.8528790135511695e-05, + "loss": 0.1136, + "step": 2716 + }, + { + "epoch": 1.4143675169182717, + "grad_norm": 0.2394218131173443, + "learning_rate": 2.851488019165464e-05, + "loss": 0.1073, + "step": 2717 + }, + { + "epoch": 1.4148880791254554, + "grad_norm": 0.2260015168454527, + "learning_rate": 2.8500969137634853e-05, + "loss": 0.1111, + "step": 2718 + }, + { + "epoch": 1.4154086413326392, + "grad_norm": 0.2196531969922503, + "learning_rate": 2.8487056977846083e-05, + "loss": 0.1122, + "step": 2719 + }, + { + "epoch": 1.415929203539823, + "grad_norm": 0.23260699506685206, + "learning_rate": 2.8473143716682455e-05, + "loss": 0.1157, + "step": 2720 + }, + { + "epoch": 1.4164497657470068, + "grad_norm": 0.23668138862358012, + "learning_rate": 2.8459229358538407e-05, + "loss": 0.1115, + "step": 2721 + }, + { + "epoch": 1.4169703279541905, + "grad_norm": 0.2246223176402912, + "learning_rate": 2.8445313907808756e-05, + "loss": 0.1084, + "step": 2722 + }, + { + "epoch": 1.4174908901613743, + "grad_norm": 0.2301496115534156, + "learning_rate": 2.843139736888864e-05, + "loss": 0.1076, + "step": 2723 + }, + { + "epoch": 1.418011452368558, + "grad_norm": 0.22756253474564722, + "learning_rate": 2.841747974617355e-05, + "loss": 0.1086, + "step": 2724 + }, + { + "epoch": 1.4185320145757419, + "grad_norm": 0.22250208266300767, + "learning_rate": 2.8403561044059324e-05, + "loss": 0.1086, + "step": 2725 + }, + { + "epoch": 1.4190525767829256, + "grad_norm": 0.2254005836949918, + "learning_rate": 2.8389641266942124e-05, + "loss": 0.104, + "step": 2726 + }, + { + "epoch": 1.4195731389901094, + "grad_norm": 0.23920028284992914, + "learning_rate": 2.8375720419218488e-05, + "loss": 0.1161, + "step": 2727 + }, + { + "epoch": 1.4200937011972932, + "grad_norm": 0.23743660758278437, + "learning_rate": 2.836179850528523e-05, + "loss": 0.1085, + "step": 2728 + }, + { + "epoch": 1.420614263404477, + "grad_norm": 0.22280403669098997, + "learning_rate": 2.8347875529539576e-05, + "loss": 0.1074, + "step": 2729 + }, + { + "epoch": 1.4211348256116607, + "grad_norm": 0.2304981362316628, + "learning_rate": 2.8333951496379023e-05, + "loss": 0.1071, + "step": 2730 + }, + { + "epoch": 1.4216553878188445, + "grad_norm": 0.23280393574418745, + "learning_rate": 2.8320026410201445e-05, + "loss": 0.1102, + "step": 2731 + }, + { + "epoch": 1.422175950026028, + "grad_norm": 0.23611313379761717, + "learning_rate": 2.8306100275405024e-05, + "loss": 0.1101, + "step": 2732 + }, + { + "epoch": 1.4226965122332118, + "grad_norm": 0.2241908878458328, + "learning_rate": 2.829217309638828e-05, + "loss": 0.1102, + "step": 2733 + }, + { + "epoch": 1.4232170744403956, + "grad_norm": 0.2447870254970879, + "learning_rate": 2.827824487755007e-05, + "loss": 0.1119, + "step": 2734 + }, + { + "epoch": 1.4237376366475794, + "grad_norm": 0.23587811793675378, + "learning_rate": 2.8264315623289568e-05, + "loss": 0.115, + "step": 2735 + }, + { + "epoch": 1.4242581988547631, + "grad_norm": 0.23445583906080528, + "learning_rate": 2.8250385338006297e-05, + "loss": 0.1127, + "step": 2736 + }, + { + "epoch": 1.424778761061947, + "grad_norm": 0.22467550766807312, + "learning_rate": 2.823645402610006e-05, + "loss": 0.1073, + "step": 2737 + }, + { + "epoch": 1.4252993232691307, + "grad_norm": 0.22942028138580609, + "learning_rate": 2.8222521691971037e-05, + "loss": 0.1081, + "step": 2738 + }, + { + "epoch": 1.4258198854763144, + "grad_norm": 0.2258282159083254, + "learning_rate": 2.8208588340019703e-05, + "loss": 0.1137, + "step": 2739 + }, + { + "epoch": 1.4263404476834982, + "grad_norm": 0.2385778073200138, + "learning_rate": 2.8194653974646858e-05, + "loss": 0.1173, + "step": 2740 + }, + { + "epoch": 1.426861009890682, + "grad_norm": 0.2279308225257302, + "learning_rate": 2.8180718600253613e-05, + "loss": 0.1118, + "step": 2741 + }, + { + "epoch": 1.4273815720978658, + "grad_norm": 0.23064677854327556, + "learning_rate": 2.8166782221241418e-05, + "loss": 0.112, + "step": 2742 + }, + { + "epoch": 1.4279021343050495, + "grad_norm": 0.23251063489281165, + "learning_rate": 2.8152844842012034e-05, + "loss": 0.1141, + "step": 2743 + }, + { + "epoch": 1.428422696512233, + "grad_norm": 0.23020617762610876, + "learning_rate": 2.8138906466967518e-05, + "loss": 0.1081, + "step": 2744 + }, + { + "epoch": 1.4289432587194169, + "grad_norm": 0.2222890423486755, + "learning_rate": 2.812496710051028e-05, + "loss": 0.1065, + "step": 2745 + }, + { + "epoch": 1.4294638209266006, + "grad_norm": 0.2274018551434016, + "learning_rate": 2.8111026747043002e-05, + "loss": 0.1057, + "step": 2746 + }, + { + "epoch": 1.4299843831337844, + "grad_norm": 0.236085240555113, + "learning_rate": 2.80970854109687e-05, + "loss": 0.1056, + "step": 2747 + }, + { + "epoch": 1.4305049453409682, + "grad_norm": 0.2224073810854335, + "learning_rate": 2.80831430966907e-05, + "loss": 0.1094, + "step": 2748 + }, + { + "epoch": 1.431025507548152, + "grad_norm": 0.22383924274617603, + "learning_rate": 2.806919980861264e-05, + "loss": 0.1106, + "step": 2749 + }, + { + "epoch": 1.4315460697553357, + "grad_norm": 0.23203392604509374, + "learning_rate": 2.805525555113845e-05, + "loss": 0.1113, + "step": 2750 + }, + { + "epoch": 1.4320666319625195, + "grad_norm": 0.22742895594575752, + "learning_rate": 2.804131032867237e-05, + "loss": 0.1094, + "step": 2751 + }, + { + "epoch": 1.4325871941697033, + "grad_norm": 0.22277598495191936, + "learning_rate": 2.8027364145618967e-05, + "loss": 0.1107, + "step": 2752 + }, + { + "epoch": 1.433107756376887, + "grad_norm": 0.23109249376380572, + "learning_rate": 2.8013417006383076e-05, + "loss": 0.1089, + "step": 2753 + }, + { + "epoch": 1.4336283185840708, + "grad_norm": 0.23662969693774688, + "learning_rate": 2.799946891536987e-05, + "loss": 0.1129, + "step": 2754 + }, + { + "epoch": 1.4341488807912546, + "grad_norm": 0.2260021154026626, + "learning_rate": 2.7985519876984795e-05, + "loss": 0.1082, + "step": 2755 + }, + { + "epoch": 1.4346694429984383, + "grad_norm": 0.23726551030073853, + "learning_rate": 2.7971569895633604e-05, + "loss": 0.114, + "step": 2756 + }, + { + "epoch": 1.4351900052056221, + "grad_norm": 0.23082581298133342, + "learning_rate": 2.7957618975722362e-05, + "loss": 0.1095, + "step": 2757 + }, + { + "epoch": 1.435710567412806, + "grad_norm": 0.24003960853374687, + "learning_rate": 2.7943667121657412e-05, + "loss": 0.1118, + "step": 2758 + }, + { + "epoch": 1.4362311296199897, + "grad_norm": 0.234089806557142, + "learning_rate": 2.7929714337845396e-05, + "loss": 0.1101, + "step": 2759 + }, + { + "epoch": 1.4367516918271734, + "grad_norm": 0.23113107032487773, + "learning_rate": 2.7915760628693256e-05, + "loss": 0.1101, + "step": 2760 + }, + { + "epoch": 1.4372722540343572, + "grad_norm": 0.2219497904207552, + "learning_rate": 2.7901805998608217e-05, + "loss": 0.1059, + "step": 2761 + }, + { + "epoch": 1.437792816241541, + "grad_norm": 0.22605082077104915, + "learning_rate": 2.7887850451997805e-05, + "loss": 0.1068, + "step": 2762 + }, + { + "epoch": 1.4383133784487248, + "grad_norm": 0.22649858856913815, + "learning_rate": 2.787389399326984e-05, + "loss": 0.1095, + "step": 2763 + }, + { + "epoch": 1.4388339406559083, + "grad_norm": 0.2368963632813257, + "learning_rate": 2.7859936626832407e-05, + "loss": 0.1098, + "step": 2764 + }, + { + "epoch": 1.439354502863092, + "grad_norm": 0.22188698169925974, + "learning_rate": 2.7845978357093895e-05, + "loss": 0.1114, + "step": 2765 + }, + { + "epoch": 1.4398750650702759, + "grad_norm": 0.22991662693273585, + "learning_rate": 2.7832019188462977e-05, + "loss": 0.1128, + "step": 2766 + }, + { + "epoch": 1.4403956272774596, + "grad_norm": 0.25211405079579136, + "learning_rate": 2.7818059125348616e-05, + "loss": 0.1215, + "step": 2767 + }, + { + "epoch": 1.4409161894846434, + "grad_norm": 0.24479114751525108, + "learning_rate": 2.7804098172160038e-05, + "loss": 0.1172, + "step": 2768 + }, + { + "epoch": 1.4414367516918272, + "grad_norm": 0.22826017987069053, + "learning_rate": 2.779013633330676e-05, + "loss": 0.1108, + "step": 2769 + }, + { + "epoch": 1.441957313899011, + "grad_norm": 0.2346902016087554, + "learning_rate": 2.7776173613198592e-05, + "loss": 0.107, + "step": 2770 + }, + { + "epoch": 1.4424778761061947, + "grad_norm": 0.2353334199015863, + "learning_rate": 2.7762210016245605e-05, + "loss": 0.1132, + "step": 2771 + }, + { + "epoch": 1.4429984383133785, + "grad_norm": 0.2153683891298237, + "learning_rate": 2.7748245546858155e-05, + "loss": 0.1088, + "step": 2772 + }, + { + "epoch": 1.4435190005205623, + "grad_norm": 0.23715391047211573, + "learning_rate": 2.7734280209446865e-05, + "loss": 0.1097, + "step": 2773 + }, + { + "epoch": 1.444039562727746, + "grad_norm": 0.23698576922062276, + "learning_rate": 2.7720314008422636e-05, + "loss": 0.1105, + "step": 2774 + }, + { + "epoch": 1.4445601249349298, + "grad_norm": 0.24630914098283596, + "learning_rate": 2.770634694819666e-05, + "loss": 0.1119, + "step": 2775 + }, + { + "epoch": 1.4450806871421134, + "grad_norm": 0.24175525799827727, + "learning_rate": 2.7692379033180376e-05, + "loss": 0.111, + "step": 2776 + }, + { + "epoch": 1.4456012493492971, + "grad_norm": 0.22721483418002544, + "learning_rate": 2.7678410267785492e-05, + "loss": 0.1094, + "step": 2777 + }, + { + "epoch": 1.446121811556481, + "grad_norm": 0.24330824085525268, + "learning_rate": 2.7664440656424014e-05, + "loss": 0.112, + "step": 2778 + }, + { + "epoch": 1.4466423737636647, + "grad_norm": 0.22575100884114555, + "learning_rate": 2.7650470203508177e-05, + "loss": 0.1048, + "step": 2779 + }, + { + "epoch": 1.4471629359708484, + "grad_norm": 0.2302645170045773, + "learning_rate": 2.7636498913450508e-05, + "loss": 0.1088, + "step": 2780 + }, + { + "epoch": 1.4476834981780322, + "grad_norm": 0.24802303184573393, + "learning_rate": 2.7622526790663795e-05, + "loss": 0.1163, + "step": 2781 + }, + { + "epoch": 1.448204060385216, + "grad_norm": 0.23957252091431663, + "learning_rate": 2.760855383956108e-05, + "loss": 0.1104, + "step": 2782 + }, + { + "epoch": 1.4487246225923998, + "grad_norm": 0.2226274711661623, + "learning_rate": 2.7594580064555664e-05, + "loss": 0.1073, + "step": 2783 + }, + { + "epoch": 1.4492451847995835, + "grad_norm": 0.22154143748354907, + "learning_rate": 2.7580605470061126e-05, + "loss": 0.1063, + "step": 2784 + }, + { + "epoch": 1.4497657470067673, + "grad_norm": 0.2213471609280991, + "learning_rate": 2.7566630060491288e-05, + "loss": 0.1068, + "step": 2785 + }, + { + "epoch": 1.450286309213951, + "grad_norm": 0.22697992240357193, + "learning_rate": 2.7552653840260234e-05, + "loss": 0.1122, + "step": 2786 + }, + { + "epoch": 1.4508068714211348, + "grad_norm": 0.23909687959750933, + "learning_rate": 2.7538676813782315e-05, + "loss": 0.1109, + "step": 2787 + }, + { + "epoch": 1.4513274336283186, + "grad_norm": 0.22865206395405765, + "learning_rate": 2.752469898547211e-05, + "loss": 0.111, + "step": 2788 + }, + { + "epoch": 1.4518479958355024, + "grad_norm": 0.230864102308563, + "learning_rate": 2.751072035974448e-05, + "loss": 0.1101, + "step": 2789 + }, + { + "epoch": 1.4523685580426862, + "grad_norm": 0.22847986253277383, + "learning_rate": 2.749674094101452e-05, + "loss": 0.1059, + "step": 2790 + }, + { + "epoch": 1.45288912024987, + "grad_norm": 0.22746929861816742, + "learning_rate": 2.748276073369759e-05, + "loss": 0.1077, + "step": 2791 + }, + { + "epoch": 1.4534096824570537, + "grad_norm": 0.2490634493102354, + "learning_rate": 2.7468779742209272e-05, + "loss": 0.1149, + "step": 2792 + }, + { + "epoch": 1.4539302446642375, + "grad_norm": 0.23550416923749126, + "learning_rate": 2.745479797096543e-05, + "loss": 0.1078, + "step": 2793 + }, + { + "epoch": 1.4544508068714213, + "grad_norm": 0.23727643035605134, + "learning_rate": 2.744081542438215e-05, + "loss": 0.1102, + "step": 2794 + }, + { + "epoch": 1.454971369078605, + "grad_norm": 0.23639067924333218, + "learning_rate": 2.7426832106875772e-05, + "loss": 0.1123, + "step": 2795 + }, + { + "epoch": 1.4554919312857886, + "grad_norm": 0.22586950416876225, + "learning_rate": 2.741284802286288e-05, + "loss": 0.1079, + "step": 2796 + }, + { + "epoch": 1.4560124934929723, + "grad_norm": 0.23428485678259794, + "learning_rate": 2.7398863176760297e-05, + "loss": 0.1103, + "step": 2797 + }, + { + "epoch": 1.4565330557001561, + "grad_norm": 0.23671222076156578, + "learning_rate": 2.7384877572985096e-05, + "loss": 0.1131, + "step": 2798 + }, + { + "epoch": 1.45705361790734, + "grad_norm": 0.24175271615548818, + "learning_rate": 2.7370891215954568e-05, + "loss": 0.1079, + "step": 2799 + }, + { + "epoch": 1.4575741801145237, + "grad_norm": 0.25201562246791964, + "learning_rate": 2.7356904110086267e-05, + "loss": 0.1139, + "step": 2800 + }, + { + "epoch": 1.4580947423217074, + "grad_norm": 0.23688810004618258, + "learning_rate": 2.7342916259797964e-05, + "loss": 0.114, + "step": 2801 + }, + { + "epoch": 1.4586153045288912, + "grad_norm": 0.25496678426814795, + "learning_rate": 2.7328927669507675e-05, + "loss": 0.1123, + "step": 2802 + }, + { + "epoch": 1.459135866736075, + "grad_norm": 0.24119391804523413, + "learning_rate": 2.7314938343633656e-05, + "loss": 0.1117, + "step": 2803 + }, + { + "epoch": 1.4596564289432588, + "grad_norm": 0.23981545441457577, + "learning_rate": 2.7300948286594373e-05, + "loss": 0.1138, + "step": 2804 + }, + { + "epoch": 1.4601769911504425, + "grad_norm": 0.238290521313305, + "learning_rate": 2.7286957502808546e-05, + "loss": 0.1088, + "step": 2805 + }, + { + "epoch": 1.4606975533576263, + "grad_norm": 0.23496530630575055, + "learning_rate": 2.7272965996695116e-05, + "loss": 0.1091, + "step": 2806 + }, + { + "epoch": 1.46121811556481, + "grad_norm": 0.2219092228421417, + "learning_rate": 2.7258973772673247e-05, + "loss": 0.1069, + "step": 2807 + }, + { + "epoch": 1.4617386777719936, + "grad_norm": 0.23788495609882543, + "learning_rate": 2.7244980835162342e-05, + "loss": 0.1059, + "step": 2808 + }, + { + "epoch": 1.4622592399791774, + "grad_norm": 0.22578800709661323, + "learning_rate": 2.7230987188582008e-05, + "loss": 0.1111, + "step": 2809 + }, + { + "epoch": 1.4627798021863612, + "grad_norm": 0.2642738917819999, + "learning_rate": 2.7216992837352108e-05, + "loss": 0.111, + "step": 2810 + }, + { + "epoch": 1.463300364393545, + "grad_norm": 0.23319413750842335, + "learning_rate": 2.7202997785892688e-05, + "loss": 0.1102, + "step": 2811 + }, + { + "epoch": 1.4638209266007287, + "grad_norm": 0.25360100989894624, + "learning_rate": 2.718900203862406e-05, + "loss": 0.1128, + "step": 2812 + }, + { + "epoch": 1.4643414888079125, + "grad_norm": 0.2639966129821883, + "learning_rate": 2.7175005599966718e-05, + "loss": 0.1165, + "step": 2813 + }, + { + "epoch": 1.4648620510150963, + "grad_norm": 0.22862001367725718, + "learning_rate": 2.7161008474341393e-05, + "loss": 0.1059, + "step": 2814 + }, + { + "epoch": 1.46538261322228, + "grad_norm": 0.24334822660492816, + "learning_rate": 2.714701066616902e-05, + "loss": 0.1124, + "step": 2815 + }, + { + "epoch": 1.4659031754294638, + "grad_norm": 0.23329003664914877, + "learning_rate": 2.713301217987077e-05, + "loss": 0.1096, + "step": 2816 + }, + { + "epoch": 1.4664237376366476, + "grad_norm": 0.24018365846353792, + "learning_rate": 2.7119013019868013e-05, + "loss": 0.1137, + "step": 2817 + }, + { + "epoch": 1.4669442998438313, + "grad_norm": 0.22712008229642378, + "learning_rate": 2.710501319058233e-05, + "loss": 0.1098, + "step": 2818 + }, + { + "epoch": 1.4674648620510151, + "grad_norm": 0.22145732141152524, + "learning_rate": 2.7091012696435525e-05, + "loss": 0.1097, + "step": 2819 + }, + { + "epoch": 1.467985424258199, + "grad_norm": 0.22548306513700106, + "learning_rate": 2.70770115418496e-05, + "loss": 0.1084, + "step": 2820 + }, + { + "epoch": 1.4685059864653827, + "grad_norm": 0.2316717588456251, + "learning_rate": 2.706300973124678e-05, + "loss": 0.1119, + "step": 2821 + }, + { + "epoch": 1.4690265486725664, + "grad_norm": 0.24444595497147328, + "learning_rate": 2.7049007269049483e-05, + "loss": 0.112, + "step": 2822 + }, + { + "epoch": 1.4695471108797502, + "grad_norm": 0.223605647603151, + "learning_rate": 2.7035004159680332e-05, + "loss": 0.1069, + "step": 2823 + }, + { + "epoch": 1.470067673086934, + "grad_norm": 0.22488128372014973, + "learning_rate": 2.702100040756217e-05, + "loss": 0.1057, + "step": 2824 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 0.24269771382490615, + "learning_rate": 2.700699601711803e-05, + "loss": 0.111, + "step": 2825 + }, + { + "epoch": 1.4711087975013015, + "grad_norm": 0.23271593883892053, + "learning_rate": 2.699299099277115e-05, + "loss": 0.1034, + "step": 2826 + }, + { + "epoch": 1.4716293597084853, + "grad_norm": 0.2238190756200398, + "learning_rate": 2.6978985338944966e-05, + "loss": 0.1096, + "step": 2827 + }, + { + "epoch": 1.4721499219156688, + "grad_norm": 0.2466339353589823, + "learning_rate": 2.6964979060063123e-05, + "loss": 0.1116, + "step": 2828 + }, + { + "epoch": 1.4726704841228526, + "grad_norm": 0.2540666543177738, + "learning_rate": 2.6950972160549444e-05, + "loss": 0.1133, + "step": 2829 + }, + { + "epoch": 1.4731910463300364, + "grad_norm": 0.2250856415814824, + "learning_rate": 2.6936964644827973e-05, + "loss": 0.1078, + "step": 2830 + }, + { + "epoch": 1.4737116085372202, + "grad_norm": 0.23250731739398878, + "learning_rate": 2.692295651732293e-05, + "loss": 0.1102, + "step": 2831 + }, + { + "epoch": 1.474232170744404, + "grad_norm": 0.24620466821834674, + "learning_rate": 2.6908947782458728e-05, + "loss": 0.1111, + "step": 2832 + }, + { + "epoch": 1.4747527329515877, + "grad_norm": 0.2245002310950658, + "learning_rate": 2.6894938444659974e-05, + "loss": 0.1032, + "step": 2833 + }, + { + "epoch": 1.4752732951587715, + "grad_norm": 0.2327464778529271, + "learning_rate": 2.6880928508351484e-05, + "loss": 0.1098, + "step": 2834 + }, + { + "epoch": 1.4757938573659553, + "grad_norm": 0.2376686699449246, + "learning_rate": 2.6866917977958246e-05, + "loss": 0.1101, + "step": 2835 + }, + { + "epoch": 1.476314419573139, + "grad_norm": 0.2413687823738175, + "learning_rate": 2.685290685790542e-05, + "loss": 0.1125, + "step": 2836 + }, + { + "epoch": 1.4768349817803228, + "grad_norm": 0.23618732799015563, + "learning_rate": 2.6838895152618387e-05, + "loss": 0.112, + "step": 2837 + }, + { + "epoch": 1.4773555439875066, + "grad_norm": 0.22955043165979686, + "learning_rate": 2.682488286652269e-05, + "loss": 0.1059, + "step": 2838 + }, + { + "epoch": 1.4778761061946903, + "grad_norm": 0.27218809301190455, + "learning_rate": 2.6810870004044063e-05, + "loss": 0.1124, + "step": 2839 + }, + { + "epoch": 1.478396668401874, + "grad_norm": 0.2370646380825962, + "learning_rate": 2.6796856569608414e-05, + "loss": 0.1144, + "step": 2840 + }, + { + "epoch": 1.4789172306090577, + "grad_norm": 0.2379047779082474, + "learning_rate": 2.6782842567641842e-05, + "loss": 0.107, + "step": 2841 + }, + { + "epoch": 1.4794377928162414, + "grad_norm": 0.24483160304496035, + "learning_rate": 2.6768828002570623e-05, + "loss": 0.1099, + "step": 2842 + }, + { + "epoch": 1.4799583550234252, + "grad_norm": 0.2323379686773366, + "learning_rate": 2.6754812878821206e-05, + "loss": 0.1085, + "step": 2843 + }, + { + "epoch": 1.480478917230609, + "grad_norm": 0.23946063633545, + "learning_rate": 2.6740797200820228e-05, + "loss": 0.1096, + "step": 2844 + }, + { + "epoch": 1.4809994794377928, + "grad_norm": 0.24336186169601903, + "learning_rate": 2.672678097299447e-05, + "loss": 0.1111, + "step": 2845 + }, + { + "epoch": 1.4815200416449765, + "grad_norm": 0.23709636885936694, + "learning_rate": 2.6712764199770936e-05, + "loss": 0.1129, + "step": 2846 + }, + { + "epoch": 1.4820406038521603, + "grad_norm": 0.23972559290010753, + "learning_rate": 2.6698746885576746e-05, + "loss": 0.1137, + "step": 2847 + }, + { + "epoch": 1.482561166059344, + "grad_norm": 0.23976136195183834, + "learning_rate": 2.668472903483925e-05, + "loss": 0.1121, + "step": 2848 + }, + { + "epoch": 1.4830817282665278, + "grad_norm": 0.24400625472890314, + "learning_rate": 2.6670710651985924e-05, + "loss": 0.1135, + "step": 2849 + }, + { + "epoch": 1.4836022904737116, + "grad_norm": 0.2367697438563428, + "learning_rate": 2.6656691741444423e-05, + "loss": 0.107, + "step": 2850 + }, + { + "epoch": 1.4841228526808954, + "grad_norm": 0.23848746621745118, + "learning_rate": 2.6642672307642575e-05, + "loss": 0.1079, + "step": 2851 + }, + { + "epoch": 1.4846434148880792, + "grad_norm": 0.23917458954785298, + "learning_rate": 2.6628652355008364e-05, + "loss": 0.1135, + "step": 2852 + }, + { + "epoch": 1.485163977095263, + "grad_norm": 0.23079110138661027, + "learning_rate": 2.661463188796996e-05, + "loss": 0.113, + "step": 2853 + }, + { + "epoch": 1.4856845393024467, + "grad_norm": 0.22660723854160528, + "learning_rate": 2.6600610910955652e-05, + "loss": 0.1084, + "step": 2854 + }, + { + "epoch": 1.4862051015096305, + "grad_norm": 0.24763834613940575, + "learning_rate": 2.6586589428393944e-05, + "loss": 0.1139, + "step": 2855 + }, + { + "epoch": 1.4867256637168142, + "grad_norm": 0.22651685710665284, + "learning_rate": 2.6572567444713453e-05, + "loss": 0.1088, + "step": 2856 + }, + { + "epoch": 1.487246225923998, + "grad_norm": 0.23203385277212307, + "learning_rate": 2.655854496434299e-05, + "loss": 0.1082, + "step": 2857 + }, + { + "epoch": 1.4877667881311818, + "grad_norm": 0.23006346404589428, + "learning_rate": 2.6544521991711498e-05, + "loss": 0.109, + "step": 2858 + }, + { + "epoch": 1.4882873503383656, + "grad_norm": 0.24467746804171445, + "learning_rate": 2.6530498531248078e-05, + "loss": 0.1078, + "step": 2859 + }, + { + "epoch": 1.4888079125455491, + "grad_norm": 0.2376855115614867, + "learning_rate": 2.6516474587382002e-05, + "loss": 0.1038, + "step": 2860 + }, + { + "epoch": 1.489328474752733, + "grad_norm": 0.2278413610792057, + "learning_rate": 2.650245016454268e-05, + "loss": 0.1084, + "step": 2861 + }, + { + "epoch": 1.4898490369599167, + "grad_norm": 0.24707716899895538, + "learning_rate": 2.6488425267159688e-05, + "loss": 0.1044, + "step": 2862 + }, + { + "epoch": 1.4903695991671004, + "grad_norm": 0.2587262411800931, + "learning_rate": 2.6474399899662715e-05, + "loss": 0.1116, + "step": 2863 + }, + { + "epoch": 1.4908901613742842, + "grad_norm": 0.2435423020073306, + "learning_rate": 2.646037406648165e-05, + "loss": 0.1115, + "step": 2864 + }, + { + "epoch": 1.491410723581468, + "grad_norm": 0.23050479938532595, + "learning_rate": 2.6446347772046492e-05, + "loss": 0.1084, + "step": 2865 + }, + { + "epoch": 1.4919312857886518, + "grad_norm": 0.2374569801662734, + "learning_rate": 2.6432321020787403e-05, + "loss": 0.1088, + "step": 2866 + }, + { + "epoch": 1.4924518479958355, + "grad_norm": 0.2312604690786547, + "learning_rate": 2.641829381713468e-05, + "loss": 0.1109, + "step": 2867 + }, + { + "epoch": 1.4929724102030193, + "grad_norm": 0.23090596721393158, + "learning_rate": 2.6404266165518767e-05, + "loss": 0.114, + "step": 2868 + }, + { + "epoch": 1.493492972410203, + "grad_norm": 0.23229013774491258, + "learning_rate": 2.6390238070370255e-05, + "loss": 0.108, + "step": 2869 + }, + { + "epoch": 1.4940135346173868, + "grad_norm": 0.22185984755752802, + "learning_rate": 2.6376209536119856e-05, + "loss": 0.1074, + "step": 2870 + }, + { + "epoch": 1.4945340968245706, + "grad_norm": 0.22282503223676492, + "learning_rate": 2.6362180567198447e-05, + "loss": 0.1084, + "step": 2871 + }, + { + "epoch": 1.4950546590317542, + "grad_norm": 0.22439529261749314, + "learning_rate": 2.6348151168037028e-05, + "loss": 0.106, + "step": 2872 + }, + { + "epoch": 1.495575221238938, + "grad_norm": 0.23223806149855644, + "learning_rate": 2.633412134306672e-05, + "loss": 0.1107, + "step": 2873 + }, + { + "epoch": 1.4960957834461217, + "grad_norm": 0.23202694735599594, + "learning_rate": 2.6320091096718817e-05, + "loss": 0.1108, + "step": 2874 + }, + { + "epoch": 1.4966163456533055, + "grad_norm": 0.22150351815304223, + "learning_rate": 2.6306060433424712e-05, + "loss": 0.1089, + "step": 2875 + }, + { + "epoch": 1.4971369078604893, + "grad_norm": 0.2356343347877159, + "learning_rate": 2.629202935761595e-05, + "loss": 0.1066, + "step": 2876 + }, + { + "epoch": 1.497657470067673, + "grad_norm": 0.22946145700792592, + "learning_rate": 2.6277997873724182e-05, + "loss": 0.1092, + "step": 2877 + }, + { + "epoch": 1.4981780322748568, + "grad_norm": 0.24032297573317024, + "learning_rate": 2.6263965986181215e-05, + "loss": 0.1172, + "step": 2878 + }, + { + "epoch": 1.4986985944820406, + "grad_norm": 0.23337639493576984, + "learning_rate": 2.6249933699418965e-05, + "loss": 0.1092, + "step": 2879 + }, + { + "epoch": 1.4992191566892243, + "grad_norm": 0.23600113599122988, + "learning_rate": 2.6235901017869495e-05, + "loss": 0.1109, + "step": 2880 + }, + { + "epoch": 1.4997397188964081, + "grad_norm": 0.2239919599001527, + "learning_rate": 2.6221867945964966e-05, + "loss": 0.107, + "step": 2881 + }, + { + "epoch": 1.5002602811035919, + "grad_norm": 0.2438916295121852, + "learning_rate": 2.6207834488137677e-05, + "loss": 0.1071, + "step": 2882 + }, + { + "epoch": 1.5007808433107757, + "grad_norm": 0.24034447143613874, + "learning_rate": 2.6193800648820055e-05, + "loss": 0.1119, + "step": 2883 + }, + { + "epoch": 1.5013014055179594, + "grad_norm": 0.21632247281172787, + "learning_rate": 2.6179766432444624e-05, + "loss": 0.105, + "step": 2884 + }, + { + "epoch": 1.5018219677251432, + "grad_norm": 0.23446966101143912, + "learning_rate": 2.6165731843444063e-05, + "loss": 0.1053, + "step": 2885 + }, + { + "epoch": 1.502342529932327, + "grad_norm": 0.23530370722928862, + "learning_rate": 2.6151696886251126e-05, + "loss": 0.1079, + "step": 2886 + }, + { + "epoch": 1.5028630921395107, + "grad_norm": 0.2489383930332058, + "learning_rate": 2.6137661565298726e-05, + "loss": 0.1121, + "step": 2887 + }, + { + "epoch": 1.5033836543466945, + "grad_norm": 0.24643956447343202, + "learning_rate": 2.6123625885019854e-05, + "loss": 0.1091, + "step": 2888 + }, + { + "epoch": 1.5039042165538783, + "grad_norm": 0.23991702848744126, + "learning_rate": 2.6109589849847643e-05, + "loss": 0.1164, + "step": 2889 + }, + { + "epoch": 1.504424778761062, + "grad_norm": 0.2381158559586777, + "learning_rate": 2.6095553464215322e-05, + "loss": 0.1087, + "step": 2890 + }, + { + "epoch": 1.5049453409682458, + "grad_norm": 0.2264207570228489, + "learning_rate": 2.6081516732556226e-05, + "loss": 0.1077, + "step": 2891 + }, + { + "epoch": 1.5054659031754296, + "grad_norm": 0.23184141890912552, + "learning_rate": 2.606747965930383e-05, + "loss": 0.1093, + "step": 2892 + }, + { + "epoch": 1.5059864653826134, + "grad_norm": 0.2300329290920324, + "learning_rate": 2.605344224889167e-05, + "loss": 0.1114, + "step": 2893 + }, + { + "epoch": 1.506507027589797, + "grad_norm": 0.23186346307565206, + "learning_rate": 2.6039404505753433e-05, + "loss": 0.1052, + "step": 2894 + }, + { + "epoch": 1.5070275897969807, + "grad_norm": 0.22392258697887885, + "learning_rate": 2.602536643432288e-05, + "loss": 0.1041, + "step": 2895 + }, + { + "epoch": 1.5075481520041645, + "grad_norm": 0.24950950930317453, + "learning_rate": 2.601132803903389e-05, + "loss": 0.1097, + "step": 2896 + }, + { + "epoch": 1.5080687142113482, + "grad_norm": 0.22767870898021633, + "learning_rate": 2.599728932432044e-05, + "loss": 0.1049, + "step": 2897 + }, + { + "epoch": 1.508589276418532, + "grad_norm": 0.23533974509856762, + "learning_rate": 2.5983250294616618e-05, + "loss": 0.108, + "step": 2898 + }, + { + "epoch": 1.5091098386257158, + "grad_norm": 0.24447947730787556, + "learning_rate": 2.596921095435659e-05, + "loss": 0.1109, + "step": 2899 + }, + { + "epoch": 1.5096304008328996, + "grad_norm": 0.24152156647479783, + "learning_rate": 2.595517130797464e-05, + "loss": 0.1164, + "step": 2900 + }, + { + "epoch": 1.5101509630400833, + "grad_norm": 0.23201234882111582, + "learning_rate": 2.5941131359905146e-05, + "loss": 0.1057, + "step": 2901 + }, + { + "epoch": 1.5106715252472669, + "grad_norm": 0.22589534043240497, + "learning_rate": 2.5927091114582565e-05, + "loss": 0.1099, + "step": 2902 + }, + { + "epoch": 1.5111920874544507, + "grad_norm": 0.22101671654491578, + "learning_rate": 2.5913050576441477e-05, + "loss": 0.109, + "step": 2903 + }, + { + "epoch": 1.5117126496616344, + "grad_norm": 0.2186466588558665, + "learning_rate": 2.589900974991652e-05, + "loss": 0.1088, + "step": 2904 + }, + { + "epoch": 1.5122332118688182, + "grad_norm": 0.22498932105359454, + "learning_rate": 2.588496863944244e-05, + "loss": 0.1034, + "step": 2905 + }, + { + "epoch": 1.512753774076002, + "grad_norm": 0.21166235287356577, + "learning_rate": 2.5870927249454097e-05, + "loss": 0.1037, + "step": 2906 + }, + { + "epoch": 1.5132743362831858, + "grad_norm": 0.22559742909440408, + "learning_rate": 2.5856885584386393e-05, + "loss": 0.1045, + "step": 2907 + }, + { + "epoch": 1.5137948984903695, + "grad_norm": 0.22659709971519004, + "learning_rate": 2.584284364867435e-05, + "loss": 0.106, + "step": 2908 + }, + { + "epoch": 1.5143154606975533, + "grad_norm": 0.24273341457463493, + "learning_rate": 2.582880144675305e-05, + "loss": 0.1155, + "step": 2909 + }, + { + "epoch": 1.514836022904737, + "grad_norm": 0.23888844864219327, + "learning_rate": 2.5814758983057684e-05, + "loss": 0.1058, + "step": 2910 + }, + { + "epoch": 1.5153565851119208, + "grad_norm": 0.24063817042473679, + "learning_rate": 2.5800716262023515e-05, + "loss": 0.1078, + "step": 2911 + }, + { + "epoch": 1.5158771473191046, + "grad_norm": 0.2403037315106119, + "learning_rate": 2.5786673288085898e-05, + "loss": 0.1127, + "step": 2912 + }, + { + "epoch": 1.5163977095262884, + "grad_norm": 0.2273826223314166, + "learning_rate": 2.577263006568025e-05, + "loss": 0.1064, + "step": 2913 + }, + { + "epoch": 1.5169182717334722, + "grad_norm": 0.21909822724724917, + "learning_rate": 2.5758586599242057e-05, + "loss": 0.1036, + "step": 2914 + }, + { + "epoch": 1.517438833940656, + "grad_norm": 0.23969847647923984, + "learning_rate": 2.5744542893206924e-05, + "loss": 0.1106, + "step": 2915 + }, + { + "epoch": 1.5179593961478397, + "grad_norm": 0.24016614079483722, + "learning_rate": 2.5730498952010502e-05, + "loss": 0.1074, + "step": 2916 + }, + { + "epoch": 1.5184799583550235, + "grad_norm": 0.2308633624750452, + "learning_rate": 2.5716454780088512e-05, + "loss": 0.1115, + "step": 2917 + }, + { + "epoch": 1.5190005205622072, + "grad_norm": 0.23198908006842925, + "learning_rate": 2.570241038187675e-05, + "loss": 0.1128, + "step": 2918 + }, + { + "epoch": 1.519521082769391, + "grad_norm": 0.23442644263464943, + "learning_rate": 2.5688365761811116e-05, + "loss": 0.1049, + "step": 2919 + }, + { + "epoch": 1.5200416449765748, + "grad_norm": 0.22883414451848424, + "learning_rate": 2.5674320924327533e-05, + "loss": 0.1097, + "step": 2920 + }, + { + "epoch": 1.5205622071837586, + "grad_norm": 0.2361125411690794, + "learning_rate": 2.566027587386203e-05, + "loss": 0.1072, + "step": 2921 + }, + { + "epoch": 1.5210827693909423, + "grad_norm": 0.23120541766705457, + "learning_rate": 2.5646230614850673e-05, + "loss": 0.1082, + "step": 2922 + }, + { + "epoch": 1.521603331598126, + "grad_norm": 0.22388494741212342, + "learning_rate": 2.5632185151729616e-05, + "loss": 0.1091, + "step": 2923 + }, + { + "epoch": 1.5221238938053099, + "grad_norm": 0.2260163271545998, + "learning_rate": 2.5618139488935072e-05, + "loss": 0.1066, + "step": 2924 + }, + { + "epoch": 1.5226444560124937, + "grad_norm": 0.2365959253566713, + "learning_rate": 2.5604093630903307e-05, + "loss": 0.1147, + "step": 2925 + }, + { + "epoch": 1.5231650182196772, + "grad_norm": 0.2195917844320001, + "learning_rate": 2.559004758207067e-05, + "loss": 0.1016, + "step": 2926 + }, + { + "epoch": 1.523685580426861, + "grad_norm": 0.22196040442631998, + "learning_rate": 2.557600134687354e-05, + "loss": 0.1106, + "step": 2927 + }, + { + "epoch": 1.5242061426340447, + "grad_norm": 0.2249212499365945, + "learning_rate": 2.5561954929748382e-05, + "loss": 0.1091, + "step": 2928 + }, + { + "epoch": 1.5247267048412285, + "grad_norm": 0.21665296060051842, + "learning_rate": 2.5547908335131704e-05, + "loss": 0.1066, + "step": 2929 + }, + { + "epoch": 1.5252472670484123, + "grad_norm": 0.23396144154716453, + "learning_rate": 2.5533861567460077e-05, + "loss": 0.1122, + "step": 2930 + }, + { + "epoch": 1.525767829255596, + "grad_norm": 0.23254614834471804, + "learning_rate": 2.5519814631170125e-05, + "loss": 0.1099, + "step": 2931 + }, + { + "epoch": 1.5262883914627798, + "grad_norm": 0.2313903436634836, + "learning_rate": 2.550576753069852e-05, + "loss": 0.111, + "step": 2932 + }, + { + "epoch": 1.5268089536699636, + "grad_norm": 0.22589258005003615, + "learning_rate": 2.5491720270481994e-05, + "loss": 0.1086, + "step": 2933 + }, + { + "epoch": 1.5273295158771472, + "grad_norm": 0.24350507728473914, + "learning_rate": 2.5477672854957325e-05, + "loss": 0.1149, + "step": 2934 + }, + { + "epoch": 1.527850078084331, + "grad_norm": 0.2296799862656636, + "learning_rate": 2.5463625288561343e-05, + "loss": 0.1095, + "step": 2935 + }, + { + "epoch": 1.5283706402915147, + "grad_norm": 0.21068213738843436, + "learning_rate": 2.5449577575730908e-05, + "loss": 0.1037, + "step": 2936 + }, + { + "epoch": 1.5288912024986985, + "grad_norm": 0.2282417696222234, + "learning_rate": 2.5435529720902955e-05, + "loss": 0.1078, + "step": 2937 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 0.2287731855036949, + "learning_rate": 2.5421481728514456e-05, + "loss": 0.1077, + "step": 2938 + }, + { + "epoch": 1.529932326913066, + "grad_norm": 0.24416895094839683, + "learning_rate": 2.5407433603002417e-05, + "loss": 0.116, + "step": 2939 + }, + { + "epoch": 1.5304528891202498, + "grad_norm": 0.2254694810188856, + "learning_rate": 2.5393385348803877e-05, + "loss": 0.1101, + "step": 2940 + }, + { + "epoch": 1.5309734513274336, + "grad_norm": 0.22885825764719317, + "learning_rate": 2.5379336970355938e-05, + "loss": 0.1105, + "step": 2941 + }, + { + "epoch": 1.5314940135346173, + "grad_norm": 0.2240956766046063, + "learning_rate": 2.5365288472095734e-05, + "loss": 0.1105, + "step": 2942 + }, + { + "epoch": 1.532014575741801, + "grad_norm": 0.22399611393029464, + "learning_rate": 2.5351239858460423e-05, + "loss": 0.1116, + "step": 2943 + }, + { + "epoch": 1.5325351379489849, + "grad_norm": 0.2291513822745778, + "learning_rate": 2.5337191133887232e-05, + "loss": 0.1106, + "step": 2944 + }, + { + "epoch": 1.5330557001561687, + "grad_norm": 0.2278099394652461, + "learning_rate": 2.5323142302813384e-05, + "loss": 0.1108, + "step": 2945 + }, + { + "epoch": 1.5335762623633524, + "grad_norm": 0.23495759529423726, + "learning_rate": 2.5309093369676158e-05, + "loss": 0.1122, + "step": 2946 + }, + { + "epoch": 1.5340968245705362, + "grad_norm": 0.22433879417057873, + "learning_rate": 2.529504433891286e-05, + "loss": 0.1094, + "step": 2947 + }, + { + "epoch": 1.53461738677772, + "grad_norm": 0.2312010873910697, + "learning_rate": 2.5280995214960835e-05, + "loss": 0.1108, + "step": 2948 + }, + { + "epoch": 1.5351379489849037, + "grad_norm": 0.22683977448776502, + "learning_rate": 2.5266946002257447e-05, + "loss": 0.1078, + "step": 2949 + }, + { + "epoch": 1.5356585111920875, + "grad_norm": 0.24008434305692997, + "learning_rate": 2.525289670524008e-05, + "loss": 0.1058, + "step": 2950 + }, + { + "epoch": 1.5361790733992713, + "grad_norm": 0.2276628830953262, + "learning_rate": 2.523884732834617e-05, + "loss": 0.1156, + "step": 2951 + }, + { + "epoch": 1.536699635606455, + "grad_norm": 0.23018808698410154, + "learning_rate": 2.522479787601315e-05, + "loss": 0.1107, + "step": 2952 + }, + { + "epoch": 1.5372201978136388, + "grad_norm": 0.22488086495321882, + "learning_rate": 2.521074835267851e-05, + "loss": 0.1084, + "step": 2953 + }, + { + "epoch": 1.5377407600208226, + "grad_norm": 0.23467533242982103, + "learning_rate": 2.519669876277973e-05, + "loss": 0.11, + "step": 2954 + }, + { + "epoch": 1.5382613222280064, + "grad_norm": 0.24014862239225046, + "learning_rate": 2.5182649110754324e-05, + "loss": 0.117, + "step": 2955 + }, + { + "epoch": 1.5387818844351902, + "grad_norm": 0.2313151546820494, + "learning_rate": 2.5168599401039833e-05, + "loss": 0.1118, + "step": 2956 + }, + { + "epoch": 1.539302446642374, + "grad_norm": 0.24807712232283863, + "learning_rate": 2.515454963807381e-05, + "loss": 0.1088, + "step": 2957 + }, + { + "epoch": 1.5398230088495575, + "grad_norm": 0.22300185095946995, + "learning_rate": 2.514049982629381e-05, + "loss": 0.107, + "step": 2958 + }, + { + "epoch": 1.5403435710567412, + "grad_norm": 0.24240071957864304, + "learning_rate": 2.5126449970137427e-05, + "loss": 0.1118, + "step": 2959 + }, + { + "epoch": 1.540864133263925, + "grad_norm": 0.22785169569105423, + "learning_rate": 2.5112400074042264e-05, + "loss": 0.1097, + "step": 2960 + }, + { + "epoch": 1.5413846954711088, + "grad_norm": 0.23607456456416684, + "learning_rate": 2.509835014244592e-05, + "loss": 0.1047, + "step": 2961 + }, + { + "epoch": 1.5419052576782926, + "grad_norm": 0.24205109765791188, + "learning_rate": 2.5084300179786036e-05, + "loss": 0.1086, + "step": 2962 + }, + { + "epoch": 1.5424258198854763, + "grad_norm": 0.22178597028098188, + "learning_rate": 2.5070250190500223e-05, + "loss": 0.1079, + "step": 2963 + }, + { + "epoch": 1.54294638209266, + "grad_norm": 0.24576503225959828, + "learning_rate": 2.5056200179026128e-05, + "loss": 0.1124, + "step": 2964 + }, + { + "epoch": 1.5434669442998439, + "grad_norm": 0.219334903872518, + "learning_rate": 2.5042150149801408e-05, + "loss": 0.1045, + "step": 2965 + }, + { + "epoch": 1.5439875065070274, + "grad_norm": 0.2253492148046559, + "learning_rate": 2.5028100107263714e-05, + "loss": 0.1078, + "step": 2966 + }, + { + "epoch": 1.5445080687142112, + "grad_norm": 0.21980719949962024, + "learning_rate": 2.501405005585069e-05, + "loss": 0.1101, + "step": 2967 + }, + { + "epoch": 1.545028630921395, + "grad_norm": 0.22443739038898264, + "learning_rate": 2.5e-05, + "loss": 0.1075, + "step": 2968 + }, + { + "epoch": 1.5455491931285787, + "grad_norm": 0.22056246246510713, + "learning_rate": 2.4985949944149315e-05, + "loss": 0.1076, + "step": 2969 + }, + { + "epoch": 1.5460697553357625, + "grad_norm": 0.22545188726465185, + "learning_rate": 2.4971899892736295e-05, + "loss": 0.1095, + "step": 2970 + }, + { + "epoch": 1.5465903175429463, + "grad_norm": 0.2127466187917586, + "learning_rate": 2.4957849850198588e-05, + "loss": 0.1023, + "step": 2971 + }, + { + "epoch": 1.54711087975013, + "grad_norm": 0.24537811434768622, + "learning_rate": 2.494379982097387e-05, + "loss": 0.112, + "step": 2972 + }, + { + "epoch": 1.5476314419573138, + "grad_norm": 0.23557533614284934, + "learning_rate": 2.4929749809499786e-05, + "loss": 0.1109, + "step": 2973 + }, + { + "epoch": 1.5481520041644976, + "grad_norm": 0.23075351228444654, + "learning_rate": 2.4915699820213973e-05, + "loss": 0.1092, + "step": 2974 + }, + { + "epoch": 1.5486725663716814, + "grad_norm": 0.23468413572684516, + "learning_rate": 2.4901649857554082e-05, + "loss": 0.1106, + "step": 2975 + }, + { + "epoch": 1.5491931285788652, + "grad_norm": 0.2282770092156467, + "learning_rate": 2.488759992595774e-05, + "loss": 0.1055, + "step": 2976 + }, + { + "epoch": 1.549713690786049, + "grad_norm": 0.2344915741816126, + "learning_rate": 2.487355002986258e-05, + "loss": 0.1105, + "step": 2977 + }, + { + "epoch": 1.5502342529932327, + "grad_norm": 0.23092806515292114, + "learning_rate": 2.4859500173706195e-05, + "loss": 0.1101, + "step": 2978 + }, + { + "epoch": 1.5507548152004165, + "grad_norm": 0.25062475394919426, + "learning_rate": 2.48454503619262e-05, + "loss": 0.1087, + "step": 2979 + }, + { + "epoch": 1.5512753774076002, + "grad_norm": 0.22139929458785265, + "learning_rate": 2.4831400598960162e-05, + "loss": 0.1072, + "step": 2980 + }, + { + "epoch": 1.551795939614784, + "grad_norm": 0.24617079179057558, + "learning_rate": 2.4817350889245675e-05, + "loss": 0.1092, + "step": 2981 + }, + { + "epoch": 1.5523165018219678, + "grad_norm": 0.23491552403906255, + "learning_rate": 2.4803301237220277e-05, + "loss": 0.1052, + "step": 2982 + }, + { + "epoch": 1.5528370640291516, + "grad_norm": 0.22975191075816628, + "learning_rate": 2.4789251647321497e-05, + "loss": 0.1085, + "step": 2983 + }, + { + "epoch": 1.5533576262363353, + "grad_norm": 0.23209505246845444, + "learning_rate": 2.4775202123986855e-05, + "loss": 0.1108, + "step": 2984 + }, + { + "epoch": 1.553878188443519, + "grad_norm": 0.22191390499400956, + "learning_rate": 2.4761152671653835e-05, + "loss": 0.1046, + "step": 2985 + }, + { + "epoch": 1.5543987506507029, + "grad_norm": 0.22032710409929804, + "learning_rate": 2.4747103294759928e-05, + "loss": 0.1037, + "step": 2986 + }, + { + "epoch": 1.5549193128578866, + "grad_norm": 0.22728163589619113, + "learning_rate": 2.4733053997742562e-05, + "loss": 0.1102, + "step": 2987 + }, + { + "epoch": 1.5554398750650704, + "grad_norm": 0.22954424667321993, + "learning_rate": 2.4719004785039168e-05, + "loss": 0.1086, + "step": 2988 + }, + { + "epoch": 1.5559604372722542, + "grad_norm": 0.22332893329361106, + "learning_rate": 2.4704955661087137e-05, + "loss": 0.1095, + "step": 2989 + }, + { + "epoch": 1.5564809994794377, + "grad_norm": 0.2228519433822219, + "learning_rate": 2.4690906630323844e-05, + "loss": 0.1113, + "step": 2990 + }, + { + "epoch": 1.5570015616866215, + "grad_norm": 0.22383437614333498, + "learning_rate": 2.4676857697186625e-05, + "loss": 0.1107, + "step": 2991 + }, + { + "epoch": 1.5575221238938053, + "grad_norm": 0.2230728740150953, + "learning_rate": 2.4662808866112773e-05, + "loss": 0.1112, + "step": 2992 + }, + { + "epoch": 1.558042686100989, + "grad_norm": 0.21685734122134906, + "learning_rate": 2.464876014153958e-05, + "loss": 0.1088, + "step": 2993 + }, + { + "epoch": 1.5585632483081728, + "grad_norm": 0.2203672898784778, + "learning_rate": 2.4634711527904272e-05, + "loss": 0.1081, + "step": 2994 + }, + { + "epoch": 1.5590838105153566, + "grad_norm": 0.2197549868768869, + "learning_rate": 2.4620663029644068e-05, + "loss": 0.1042, + "step": 2995 + }, + { + "epoch": 1.5596043727225404, + "grad_norm": 0.22043617116963546, + "learning_rate": 2.460661465119613e-05, + "loss": 0.1069, + "step": 2996 + }, + { + "epoch": 1.5601249349297241, + "grad_norm": 0.23842553602217398, + "learning_rate": 2.459256639699759e-05, + "loss": 0.1137, + "step": 2997 + }, + { + "epoch": 1.5606454971369077, + "grad_norm": 0.21377025661406016, + "learning_rate": 2.457851827148554e-05, + "loss": 0.104, + "step": 2998 + }, + { + "epoch": 1.5611660593440915, + "grad_norm": 0.2274138298267601, + "learning_rate": 2.456447027909704e-05, + "loss": 0.1097, + "step": 2999 + }, + { + "epoch": 1.5616866215512752, + "grad_norm": 0.22165119897394892, + "learning_rate": 2.45504224242691e-05, + "loss": 0.1094, + "step": 3000 + }, + { + "epoch": 1.562207183758459, + "grad_norm": 0.21490549783538904, + "learning_rate": 2.453637471143867e-05, + "loss": 0.1125, + "step": 3001 + }, + { + "epoch": 1.5627277459656428, + "grad_norm": 0.21227852422993695, + "learning_rate": 2.4522327145042684e-05, + "loss": 0.1069, + "step": 3002 + }, + { + "epoch": 1.5632483081728266, + "grad_norm": 0.21598368023419817, + "learning_rate": 2.450827972951801e-05, + "loss": 0.1067, + "step": 3003 + }, + { + "epoch": 1.5637688703800103, + "grad_norm": 0.21825295793980276, + "learning_rate": 2.4494232469301485e-05, + "loss": 0.1071, + "step": 3004 + }, + { + "epoch": 1.564289432587194, + "grad_norm": 0.22548740973175538, + "learning_rate": 2.4480185368829877e-05, + "loss": 0.1129, + "step": 3005 + }, + { + "epoch": 1.5648099947943779, + "grad_norm": 0.22232346176457024, + "learning_rate": 2.446613843253993e-05, + "loss": 0.1067, + "step": 3006 + }, + { + "epoch": 1.5653305570015617, + "grad_norm": 0.24009122736060118, + "learning_rate": 2.4452091664868298e-05, + "loss": 0.111, + "step": 3007 + }, + { + "epoch": 1.5658511192087454, + "grad_norm": 0.2243097388008218, + "learning_rate": 2.4438045070251624e-05, + "loss": 0.1036, + "step": 3008 + }, + { + "epoch": 1.5663716814159292, + "grad_norm": 0.23190645268050183, + "learning_rate": 2.4423998653126472e-05, + "loss": 0.1124, + "step": 3009 + }, + { + "epoch": 1.566892243623113, + "grad_norm": 0.3486307228352415, + "learning_rate": 2.440995241792934e-05, + "loss": 0.1043, + "step": 3010 + }, + { + "epoch": 1.5674128058302967, + "grad_norm": 0.23470724899809928, + "learning_rate": 2.43959063690967e-05, + "loss": 0.1075, + "step": 3011 + }, + { + "epoch": 1.5679333680374805, + "grad_norm": 0.23329367665913123, + "learning_rate": 2.4381860511064933e-05, + "loss": 0.1139, + "step": 3012 + }, + { + "epoch": 1.5684539302446643, + "grad_norm": 0.23185338653990387, + "learning_rate": 2.4367814848270387e-05, + "loss": 0.1082, + "step": 3013 + }, + { + "epoch": 1.568974492451848, + "grad_norm": 0.2259910671539203, + "learning_rate": 2.435376938514933e-05, + "loss": 0.1114, + "step": 3014 + }, + { + "epoch": 1.5694950546590318, + "grad_norm": 0.2427562483119206, + "learning_rate": 2.4339724126137974e-05, + "loss": 0.1119, + "step": 3015 + }, + { + "epoch": 1.5700156168662156, + "grad_norm": 0.25899173179638624, + "learning_rate": 2.4325679075672462e-05, + "loss": 0.116, + "step": 3016 + }, + { + "epoch": 1.5705361790733994, + "grad_norm": 0.22878997419672623, + "learning_rate": 2.431163423818889e-05, + "loss": 0.1085, + "step": 3017 + }, + { + "epoch": 1.5710567412805831, + "grad_norm": 0.2384972240792906, + "learning_rate": 2.4297589618123258e-05, + "loss": 0.1147, + "step": 3018 + }, + { + "epoch": 1.571577303487767, + "grad_norm": 0.22510819520011185, + "learning_rate": 2.4283545219911503e-05, + "loss": 0.1069, + "step": 3019 + }, + { + "epoch": 1.5720978656949507, + "grad_norm": 0.2230992006833652, + "learning_rate": 2.4269501047989514e-05, + "loss": 0.1089, + "step": 3020 + }, + { + "epoch": 1.5726184279021345, + "grad_norm": 0.22853011702632323, + "learning_rate": 2.425545710679308e-05, + "loss": 0.1091, + "step": 3021 + }, + { + "epoch": 1.573138990109318, + "grad_norm": 0.2242485692166422, + "learning_rate": 2.424141340075795e-05, + "loss": 0.1049, + "step": 3022 + }, + { + "epoch": 1.5736595523165018, + "grad_norm": 0.2322211744480236, + "learning_rate": 2.422736993431976e-05, + "loss": 0.1125, + "step": 3023 + }, + { + "epoch": 1.5741801145236856, + "grad_norm": 0.2076629175930266, + "learning_rate": 2.4213326711914108e-05, + "loss": 0.102, + "step": 3024 + }, + { + "epoch": 1.5747006767308693, + "grad_norm": 0.2217478061639669, + "learning_rate": 2.419928373797648e-05, + "loss": 0.1059, + "step": 3025 + }, + { + "epoch": 1.575221238938053, + "grad_norm": 0.22413761407508365, + "learning_rate": 2.418524101694232e-05, + "loss": 0.1088, + "step": 3026 + }, + { + "epoch": 1.5757418011452369, + "grad_norm": 0.2261030701823896, + "learning_rate": 2.4171198553246967e-05, + "loss": 0.113, + "step": 3027 + }, + { + "epoch": 1.5762623633524206, + "grad_norm": 0.24473343847561574, + "learning_rate": 2.4157156351325668e-05, + "loss": 0.1062, + "step": 3028 + }, + { + "epoch": 1.5767829255596044, + "grad_norm": 0.22190551778893397, + "learning_rate": 2.414311441561362e-05, + "loss": 0.1086, + "step": 3029 + }, + { + "epoch": 1.577303487766788, + "grad_norm": 0.23472397117142715, + "learning_rate": 2.4129072750545912e-05, + "loss": 0.1137, + "step": 3030 + }, + { + "epoch": 1.5778240499739717, + "grad_norm": 0.22344372474806895, + "learning_rate": 2.4115031360557562e-05, + "loss": 0.1087, + "step": 3031 + }, + { + "epoch": 1.5783446121811555, + "grad_norm": 0.25965872299574905, + "learning_rate": 2.4100990250083487e-05, + "loss": 0.1139, + "step": 3032 + }, + { + "epoch": 1.5788651743883393, + "grad_norm": 0.22568577090311404, + "learning_rate": 2.4086949423558526e-05, + "loss": 0.1098, + "step": 3033 + }, + { + "epoch": 1.579385736595523, + "grad_norm": 0.21814595333115117, + "learning_rate": 2.4072908885417438e-05, + "loss": 0.1084, + "step": 3034 + }, + { + "epoch": 1.5799062988027068, + "grad_norm": 0.22292260467366531, + "learning_rate": 2.4058868640094857e-05, + "loss": 0.1087, + "step": 3035 + }, + { + "epoch": 1.5804268610098906, + "grad_norm": 0.22464387093431798, + "learning_rate": 2.404482869202537e-05, + "loss": 0.1043, + "step": 3036 + }, + { + "epoch": 1.5809474232170744, + "grad_norm": 0.22574493698081854, + "learning_rate": 2.4030789045643418e-05, + "loss": 0.1046, + "step": 3037 + }, + { + "epoch": 1.5814679854242581, + "grad_norm": 0.23794985547646166, + "learning_rate": 2.401674970538339e-05, + "loss": 0.1092, + "step": 3038 + }, + { + "epoch": 1.581988547631442, + "grad_norm": 0.23061947403120528, + "learning_rate": 2.4002710675679565e-05, + "loss": 0.1083, + "step": 3039 + }, + { + "epoch": 1.5825091098386257, + "grad_norm": 0.3114813728434248, + "learning_rate": 2.3988671960966113e-05, + "loss": 0.1135, + "step": 3040 + }, + { + "epoch": 1.5830296720458095, + "grad_norm": 0.24350087448310825, + "learning_rate": 2.3974633565677126e-05, + "loss": 0.1126, + "step": 3041 + }, + { + "epoch": 1.5835502342529932, + "grad_norm": 0.23325201805390236, + "learning_rate": 2.3960595494246573e-05, + "loss": 0.1088, + "step": 3042 + }, + { + "epoch": 1.584070796460177, + "grad_norm": 0.22534337357637105, + "learning_rate": 2.394655775110833e-05, + "loss": 0.1109, + "step": 3043 + }, + { + "epoch": 1.5845913586673608, + "grad_norm": 0.2186403250472862, + "learning_rate": 2.393252034069617e-05, + "loss": 0.1078, + "step": 3044 + }, + { + "epoch": 1.5851119208745446, + "grad_norm": 0.23893857707748875, + "learning_rate": 2.3918483267443777e-05, + "loss": 0.1096, + "step": 3045 + }, + { + "epoch": 1.5856324830817283, + "grad_norm": 0.2296564204097977, + "learning_rate": 2.3904446535784687e-05, + "loss": 0.1117, + "step": 3046 + }, + { + "epoch": 1.586153045288912, + "grad_norm": 0.2256094183111909, + "learning_rate": 2.3890410150152363e-05, + "loss": 0.11, + "step": 3047 + }, + { + "epoch": 1.5866736074960959, + "grad_norm": 0.2197512896748002, + "learning_rate": 2.387637411498015e-05, + "loss": 0.1085, + "step": 3048 + }, + { + "epoch": 1.5871941697032796, + "grad_norm": 0.22684246333884714, + "learning_rate": 2.386233843470128e-05, + "loss": 0.107, + "step": 3049 + }, + { + "epoch": 1.5877147319104634, + "grad_norm": 0.22474155235201673, + "learning_rate": 2.384830311374888e-05, + "loss": 0.1067, + "step": 3050 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 0.22213948016741225, + "learning_rate": 2.3834268156555943e-05, + "loss": 0.1066, + "step": 3051 + }, + { + "epoch": 1.588755856324831, + "grad_norm": 0.234760326851895, + "learning_rate": 2.3820233567555378e-05, + "loss": 0.1059, + "step": 3052 + }, + { + "epoch": 1.5892764185320147, + "grad_norm": 0.22463049283556413, + "learning_rate": 2.3806199351179948e-05, + "loss": 0.1119, + "step": 3053 + }, + { + "epoch": 1.5897969807391983, + "grad_norm": 0.2233971474985051, + "learning_rate": 2.379216551186233e-05, + "loss": 0.1039, + "step": 3054 + }, + { + "epoch": 1.590317542946382, + "grad_norm": 0.23716623599975512, + "learning_rate": 2.3778132054035043e-05, + "loss": 0.1077, + "step": 3055 + }, + { + "epoch": 1.5908381051535658, + "grad_norm": 0.22256838771775073, + "learning_rate": 2.3764098982130507e-05, + "loss": 0.1058, + "step": 3056 + }, + { + "epoch": 1.5913586673607496, + "grad_norm": 0.22537466848570817, + "learning_rate": 2.3750066300581037e-05, + "loss": 0.1039, + "step": 3057 + }, + { + "epoch": 1.5918792295679334, + "grad_norm": 0.2796215036036369, + "learning_rate": 2.3736034013818788e-05, + "loss": 0.1144, + "step": 3058 + }, + { + "epoch": 1.5923997917751171, + "grad_norm": 0.23128267449784587, + "learning_rate": 2.3722002126275824e-05, + "loss": 0.1105, + "step": 3059 + }, + { + "epoch": 1.592920353982301, + "grad_norm": 0.22108759830456698, + "learning_rate": 2.3707970642384056e-05, + "loss": 0.1059, + "step": 3060 + }, + { + "epoch": 1.5934409161894847, + "grad_norm": 0.23163491105689502, + "learning_rate": 2.3693939566575287e-05, + "loss": 0.1085, + "step": 3061 + }, + { + "epoch": 1.5939614783966682, + "grad_norm": 0.226474109078299, + "learning_rate": 2.3679908903281182e-05, + "loss": 0.1108, + "step": 3062 + }, + { + "epoch": 1.594482040603852, + "grad_norm": 0.2360898550334602, + "learning_rate": 2.366587865693328e-05, + "loss": 0.1046, + "step": 3063 + }, + { + "epoch": 1.5950026028110358, + "grad_norm": 0.21596317159749592, + "learning_rate": 2.3651848831962985e-05, + "loss": 0.1065, + "step": 3064 + }, + { + "epoch": 1.5955231650182196, + "grad_norm": 0.2340159211967676, + "learning_rate": 2.363781943280156e-05, + "loss": 0.1064, + "step": 3065 + }, + { + "epoch": 1.5960437272254033, + "grad_norm": 0.23811590741012165, + "learning_rate": 2.3623790463880153e-05, + "loss": 0.1101, + "step": 3066 + }, + { + "epoch": 1.596564289432587, + "grad_norm": 0.24057517611837248, + "learning_rate": 2.3609761929629755e-05, + "loss": 0.1038, + "step": 3067 + }, + { + "epoch": 1.5970848516397709, + "grad_norm": 0.22888857683722608, + "learning_rate": 2.3595733834481236e-05, + "loss": 0.1093, + "step": 3068 + }, + { + "epoch": 1.5976054138469546, + "grad_norm": 0.23000911098983517, + "learning_rate": 2.3581706182865324e-05, + "loss": 0.1133, + "step": 3069 + }, + { + "epoch": 1.5981259760541384, + "grad_norm": 0.23001119374170786, + "learning_rate": 2.3567678979212602e-05, + "loss": 0.1064, + "step": 3070 + }, + { + "epoch": 1.5986465382613222, + "grad_norm": 0.22412995503191058, + "learning_rate": 2.3553652227953507e-05, + "loss": 0.1138, + "step": 3071 + }, + { + "epoch": 1.599167100468506, + "grad_norm": 0.23039758715555397, + "learning_rate": 2.3539625933518354e-05, + "loss": 0.1095, + "step": 3072 + }, + { + "epoch": 1.5996876626756897, + "grad_norm": 0.2356318998221753, + "learning_rate": 2.3525600100337294e-05, + "loss": 0.1139, + "step": 3073 + }, + { + "epoch": 1.6002082248828735, + "grad_norm": 0.21699411021910314, + "learning_rate": 2.3511574732840325e-05, + "loss": 0.1024, + "step": 3074 + }, + { + "epoch": 1.6007287870900573, + "grad_norm": 0.2185662528539939, + "learning_rate": 2.3497549835457328e-05, + "loss": 0.1085, + "step": 3075 + }, + { + "epoch": 1.601249349297241, + "grad_norm": 0.22922474819533187, + "learning_rate": 2.3483525412618e-05, + "loss": 0.1097, + "step": 3076 + }, + { + "epoch": 1.6017699115044248, + "grad_norm": 0.21577899387549374, + "learning_rate": 2.3469501468751928e-05, + "loss": 0.1074, + "step": 3077 + }, + { + "epoch": 1.6022904737116086, + "grad_norm": 0.22051359137252985, + "learning_rate": 2.3455478008288508e-05, + "loss": 0.1039, + "step": 3078 + }, + { + "epoch": 1.6028110359187924, + "grad_norm": 0.23827856861336888, + "learning_rate": 2.3441455035657013e-05, + "loss": 0.1068, + "step": 3079 + }, + { + "epoch": 1.6033315981259761, + "grad_norm": 0.2338622329393293, + "learning_rate": 2.3427432555286543e-05, + "loss": 0.1072, + "step": 3080 + }, + { + "epoch": 1.60385216033316, + "grad_norm": 0.23740858644971805, + "learning_rate": 2.341341057160606e-05, + "loss": 0.1114, + "step": 3081 + }, + { + "epoch": 1.6043727225403437, + "grad_norm": 0.23132959270597508, + "learning_rate": 2.3399389089044354e-05, + "loss": 0.1058, + "step": 3082 + }, + { + "epoch": 1.6048932847475275, + "grad_norm": 0.253632972193079, + "learning_rate": 2.3385368112030052e-05, + "loss": 0.1089, + "step": 3083 + }, + { + "epoch": 1.6054138469547112, + "grad_norm": 0.2249408980909488, + "learning_rate": 2.337134764499164e-05, + "loss": 0.1059, + "step": 3084 + }, + { + "epoch": 1.605934409161895, + "grad_norm": 0.2220495865666503, + "learning_rate": 2.335732769235743e-05, + "loss": 0.105, + "step": 3085 + }, + { + "epoch": 1.6064549713690786, + "grad_norm": 0.2442265679104099, + "learning_rate": 2.3343308258555587e-05, + "loss": 0.1086, + "step": 3086 + }, + { + "epoch": 1.6069755335762623, + "grad_norm": 0.2336403991069815, + "learning_rate": 2.332928934801408e-05, + "loss": 0.1075, + "step": 3087 + }, + { + "epoch": 1.607496095783446, + "grad_norm": 0.22276349197581477, + "learning_rate": 2.3315270965160753e-05, + "loss": 0.1055, + "step": 3088 + }, + { + "epoch": 1.6080166579906299, + "grad_norm": 0.2280395730745695, + "learning_rate": 2.330125311442325e-05, + "loss": 0.1044, + "step": 3089 + }, + { + "epoch": 1.6085372201978136, + "grad_norm": 0.23389298695008165, + "learning_rate": 2.3287235800229073e-05, + "loss": 0.1047, + "step": 3090 + }, + { + "epoch": 1.6090577824049974, + "grad_norm": 0.22779890717062137, + "learning_rate": 2.327321902700554e-05, + "loss": 0.1092, + "step": 3091 + }, + { + "epoch": 1.6095783446121812, + "grad_norm": 0.21949265647763663, + "learning_rate": 2.3259202799179785e-05, + "loss": 0.1036, + "step": 3092 + }, + { + "epoch": 1.610098906819365, + "grad_norm": 0.22277867696874984, + "learning_rate": 2.3245187121178804e-05, + "loss": 0.1106, + "step": 3093 + }, + { + "epoch": 1.6106194690265485, + "grad_norm": 0.23819505127712554, + "learning_rate": 2.323117199742938e-05, + "loss": 0.1102, + "step": 3094 + }, + { + "epoch": 1.6111400312337323, + "grad_norm": 0.22647009120490577, + "learning_rate": 2.3217157432358164e-05, + "loss": 0.1048, + "step": 3095 + }, + { + "epoch": 1.611660593440916, + "grad_norm": 0.23823079455303328, + "learning_rate": 2.320314343039159e-05, + "loss": 0.1067, + "step": 3096 + }, + { + "epoch": 1.6121811556480998, + "grad_norm": 0.22520992501315207, + "learning_rate": 2.3189129995955943e-05, + "loss": 0.1053, + "step": 3097 + }, + { + "epoch": 1.6127017178552836, + "grad_norm": 0.2258137397120401, + "learning_rate": 2.3175117133477313e-05, + "loss": 0.1053, + "step": 3098 + }, + { + "epoch": 1.6132222800624674, + "grad_norm": 0.2271783410129132, + "learning_rate": 2.3161104847381608e-05, + "loss": 0.1102, + "step": 3099 + }, + { + "epoch": 1.6137428422696511, + "grad_norm": 0.2271396794206383, + "learning_rate": 2.314709314209459e-05, + "loss": 0.1025, + "step": 3100 + }, + { + "epoch": 1.614263404476835, + "grad_norm": 0.2512629230037695, + "learning_rate": 2.3133082022041767e-05, + "loss": 0.1088, + "step": 3101 + }, + { + "epoch": 1.6147839666840187, + "grad_norm": 0.23119313550176498, + "learning_rate": 2.3119071491648525e-05, + "loss": 0.108, + "step": 3102 + }, + { + "epoch": 1.6153045288912025, + "grad_norm": 0.21317276454406853, + "learning_rate": 2.310506155534003e-05, + "loss": 0.1034, + "step": 3103 + }, + { + "epoch": 1.6158250910983862, + "grad_norm": 0.24282928727082653, + "learning_rate": 2.3091052217541278e-05, + "loss": 0.11, + "step": 3104 + }, + { + "epoch": 1.61634565330557, + "grad_norm": 0.2408607507520557, + "learning_rate": 2.307704348267708e-05, + "loss": 0.1087, + "step": 3105 + }, + { + "epoch": 1.6168662155127538, + "grad_norm": 0.23585883743968147, + "learning_rate": 2.3063035355172026e-05, + "loss": 0.1117, + "step": 3106 + }, + { + "epoch": 1.6173867777199376, + "grad_norm": 0.23123042557678272, + "learning_rate": 2.304902783945056e-05, + "loss": 0.1056, + "step": 3107 + }, + { + "epoch": 1.6179073399271213, + "grad_norm": 0.23648141825819088, + "learning_rate": 2.3035020939936876e-05, + "loss": 0.1037, + "step": 3108 + }, + { + "epoch": 1.618427902134305, + "grad_norm": 0.22436906961143638, + "learning_rate": 2.302101466105504e-05, + "loss": 0.1017, + "step": 3109 + }, + { + "epoch": 1.6189484643414889, + "grad_norm": 0.21301990592667625, + "learning_rate": 2.3007009007228857e-05, + "loss": 0.1059, + "step": 3110 + }, + { + "epoch": 1.6194690265486726, + "grad_norm": 0.23396887540615707, + "learning_rate": 2.2993003982881975e-05, + "loss": 0.1062, + "step": 3111 + }, + { + "epoch": 1.6199895887558564, + "grad_norm": 0.2433106836264054, + "learning_rate": 2.2978999592437837e-05, + "loss": 0.1058, + "step": 3112 + }, + { + "epoch": 1.6205101509630402, + "grad_norm": 0.2216670932634246, + "learning_rate": 2.296499584031967e-05, + "loss": 0.1049, + "step": 3113 + }, + { + "epoch": 1.621030713170224, + "grad_norm": 0.23403580449109418, + "learning_rate": 2.2950992730950523e-05, + "loss": 0.1047, + "step": 3114 + }, + { + "epoch": 1.6215512753774077, + "grad_norm": 0.23966172688921225, + "learning_rate": 2.293699026875322e-05, + "loss": 0.1048, + "step": 3115 + }, + { + "epoch": 1.6220718375845915, + "grad_norm": 0.22806661382306423, + "learning_rate": 2.29229884581504e-05, + "loss": 0.1082, + "step": 3116 + }, + { + "epoch": 1.6225923997917753, + "grad_norm": 0.23681871295661272, + "learning_rate": 2.2908987303564474e-05, + "loss": 0.1044, + "step": 3117 + }, + { + "epoch": 1.6231129619989588, + "grad_norm": 0.22656571389726227, + "learning_rate": 2.2894986809417676e-05, + "loss": 0.1077, + "step": 3118 + }, + { + "epoch": 1.6236335242061426, + "grad_norm": 0.23062976184034878, + "learning_rate": 2.2880986980131996e-05, + "loss": 0.1031, + "step": 3119 + }, + { + "epoch": 1.6241540864133264, + "grad_norm": 0.22553061329604537, + "learning_rate": 2.2866987820129234e-05, + "loss": 0.1053, + "step": 3120 + }, + { + "epoch": 1.6246746486205101, + "grad_norm": 0.22883909177019793, + "learning_rate": 2.2852989333830988e-05, + "loss": 0.1059, + "step": 3121 + }, + { + "epoch": 1.625195210827694, + "grad_norm": 0.2267238669608568, + "learning_rate": 2.2838991525658616e-05, + "loss": 0.1092, + "step": 3122 + }, + { + "epoch": 1.6257157730348777, + "grad_norm": 0.2221538641240812, + "learning_rate": 2.282499440003329e-05, + "loss": 0.1019, + "step": 3123 + }, + { + "epoch": 1.6262363352420615, + "grad_norm": 0.23065218500300072, + "learning_rate": 2.281099796137594e-05, + "loss": 0.1073, + "step": 3124 + }, + { + "epoch": 1.6267568974492452, + "grad_norm": 0.21528998054447926, + "learning_rate": 2.279700221410731e-05, + "loss": 0.1038, + "step": 3125 + }, + { + "epoch": 1.6272774596564288, + "grad_norm": 0.22406127559380892, + "learning_rate": 2.2783007162647894e-05, + "loss": 0.1092, + "step": 3126 + }, + { + "epoch": 1.6277980218636126, + "grad_norm": 0.22732856950130653, + "learning_rate": 2.2769012811417998e-05, + "loss": 0.1043, + "step": 3127 + }, + { + "epoch": 1.6283185840707963, + "grad_norm": 0.24391940916747334, + "learning_rate": 2.275501916483767e-05, + "loss": 0.1084, + "step": 3128 + }, + { + "epoch": 1.62883914627798, + "grad_norm": 0.212950903066082, + "learning_rate": 2.274102622732676e-05, + "loss": 0.1021, + "step": 3129 + }, + { + "epoch": 1.6293597084851639, + "grad_norm": 0.22582328484751868, + "learning_rate": 2.2727034003304893e-05, + "loss": 0.1078, + "step": 3130 + }, + { + "epoch": 1.6298802706923476, + "grad_norm": 0.23178125241301198, + "learning_rate": 2.2713042497191456e-05, + "loss": 0.1052, + "step": 3131 + }, + { + "epoch": 1.6304008328995314, + "grad_norm": 0.22817672478922343, + "learning_rate": 2.2699051713405633e-05, + "loss": 0.113, + "step": 3132 + }, + { + "epoch": 1.6309213951067152, + "grad_norm": 0.21735110824642498, + "learning_rate": 2.2685061656366347e-05, + "loss": 0.1048, + "step": 3133 + }, + { + "epoch": 1.631441957313899, + "grad_norm": 0.219691390085283, + "learning_rate": 2.2671072330492328e-05, + "loss": 0.1063, + "step": 3134 + }, + { + "epoch": 1.6319625195210827, + "grad_norm": 0.23114194424793824, + "learning_rate": 2.2657083740202035e-05, + "loss": 0.1096, + "step": 3135 + }, + { + "epoch": 1.6324830817282665, + "grad_norm": 0.2194312392722462, + "learning_rate": 2.264309588991374e-05, + "loss": 0.1066, + "step": 3136 + }, + { + "epoch": 1.6330036439354503, + "grad_norm": 0.21507036864686763, + "learning_rate": 2.2629108784045438e-05, + "loss": 0.1077, + "step": 3137 + }, + { + "epoch": 1.633524206142634, + "grad_norm": 0.22777523976635303, + "learning_rate": 2.2615122427014913e-05, + "loss": 0.1086, + "step": 3138 + }, + { + "epoch": 1.6340447683498178, + "grad_norm": 0.22055091654855918, + "learning_rate": 2.260113682323971e-05, + "loss": 0.1035, + "step": 3139 + }, + { + "epoch": 1.6345653305570016, + "grad_norm": 0.2238767964565613, + "learning_rate": 2.2587151977137122e-05, + "loss": 0.1077, + "step": 3140 + }, + { + "epoch": 1.6350858927641854, + "grad_norm": 0.22760165259373993, + "learning_rate": 2.2573167893124237e-05, + "loss": 0.1128, + "step": 3141 + }, + { + "epoch": 1.6356064549713691, + "grad_norm": 0.22042911403521775, + "learning_rate": 2.2559184575617857e-05, + "loss": 0.1073, + "step": 3142 + }, + { + "epoch": 1.636127017178553, + "grad_norm": 0.2278962819089735, + "learning_rate": 2.254520202903458e-05, + "loss": 0.107, + "step": 3143 + }, + { + "epoch": 1.6366475793857367, + "grad_norm": 0.22694087159169066, + "learning_rate": 2.253122025779073e-05, + "loss": 0.1092, + "step": 3144 + }, + { + "epoch": 1.6371681415929205, + "grad_norm": 0.22199970972024982, + "learning_rate": 2.2517239266302424e-05, + "loss": 0.1088, + "step": 3145 + }, + { + "epoch": 1.6376887038001042, + "grad_norm": 0.22587462774553319, + "learning_rate": 2.2503259058985487e-05, + "loss": 0.1039, + "step": 3146 + }, + { + "epoch": 1.638209266007288, + "grad_norm": 0.22588919883230146, + "learning_rate": 2.2489279640255526e-05, + "loss": 0.1018, + "step": 3147 + }, + { + "epoch": 1.6387298282144718, + "grad_norm": 0.2361781102822983, + "learning_rate": 2.2475301014527897e-05, + "loss": 0.1078, + "step": 3148 + }, + { + "epoch": 1.6392503904216555, + "grad_norm": 0.22178776083606402, + "learning_rate": 2.246132318621769e-05, + "loss": 0.1065, + "step": 3149 + }, + { + "epoch": 1.639770952628839, + "grad_norm": 0.2301808192860147, + "learning_rate": 2.2447346159739772e-05, + "loss": 0.1069, + "step": 3150 + }, + { + "epoch": 1.6402915148360229, + "grad_norm": 0.23564349495869355, + "learning_rate": 2.2433369939508718e-05, + "loss": 0.1132, + "step": 3151 + }, + { + "epoch": 1.6408120770432066, + "grad_norm": 0.2142534971466308, + "learning_rate": 2.241939452993888e-05, + "loss": 0.1035, + "step": 3152 + }, + { + "epoch": 1.6413326392503904, + "grad_norm": 0.2269015837271862, + "learning_rate": 2.2405419935444338e-05, + "loss": 0.1146, + "step": 3153 + }, + { + "epoch": 1.6418532014575742, + "grad_norm": 0.23097131531972812, + "learning_rate": 2.2391446160438933e-05, + "loss": 0.1092, + "step": 3154 + }, + { + "epoch": 1.642373763664758, + "grad_norm": 0.22445851872935704, + "learning_rate": 2.2377473209336214e-05, + "loss": 0.1059, + "step": 3155 + }, + { + "epoch": 1.6428943258719417, + "grad_norm": 0.21849931770987602, + "learning_rate": 2.2363501086549498e-05, + "loss": 0.1067, + "step": 3156 + }, + { + "epoch": 1.6434148880791255, + "grad_norm": 0.22288897541665642, + "learning_rate": 2.234952979649183e-05, + "loss": 0.1072, + "step": 3157 + }, + { + "epoch": 1.643935450286309, + "grad_norm": 0.23784579556458776, + "learning_rate": 2.233555934357599e-05, + "loss": 0.1146, + "step": 3158 + }, + { + "epoch": 1.6444560124934928, + "grad_norm": 0.22729963389585045, + "learning_rate": 2.232158973221451e-05, + "loss": 0.1048, + "step": 3159 + }, + { + "epoch": 1.6449765747006766, + "grad_norm": 0.22444048797078522, + "learning_rate": 2.230762096681963e-05, + "loss": 0.1032, + "step": 3160 + }, + { + "epoch": 1.6454971369078604, + "grad_norm": 0.2274706569746683, + "learning_rate": 2.2293653051803344e-05, + "loss": 0.1125, + "step": 3161 + }, + { + "epoch": 1.6460176991150441, + "grad_norm": 0.2215985773170227, + "learning_rate": 2.2279685991577363e-05, + "loss": 0.1106, + "step": 3162 + }, + { + "epoch": 1.646538261322228, + "grad_norm": 0.2248236336469974, + "learning_rate": 2.2265719790553147e-05, + "loss": 0.1056, + "step": 3163 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 0.21576271667103167, + "learning_rate": 2.225175445314186e-05, + "loss": 0.1046, + "step": 3164 + }, + { + "epoch": 1.6475793857365955, + "grad_norm": 0.23144058603703913, + "learning_rate": 2.2237789983754405e-05, + "loss": 0.1113, + "step": 3165 + }, + { + "epoch": 1.6480999479437792, + "grad_norm": 0.2143838630761341, + "learning_rate": 2.2223826386801417e-05, + "loss": 0.1041, + "step": 3166 + }, + { + "epoch": 1.648620510150963, + "grad_norm": 0.2199889181336657, + "learning_rate": 2.2209863666693244e-05, + "loss": 0.1041, + "step": 3167 + }, + { + "epoch": 1.6491410723581468, + "grad_norm": 0.23070227108778804, + "learning_rate": 2.2195901827839965e-05, + "loss": 0.1092, + "step": 3168 + }, + { + "epoch": 1.6496616345653305, + "grad_norm": 0.21963888599506054, + "learning_rate": 2.2181940874651393e-05, + "loss": 0.1067, + "step": 3169 + }, + { + "epoch": 1.6501821967725143, + "grad_norm": 0.22331117871742942, + "learning_rate": 2.216798081153702e-05, + "loss": 0.107, + "step": 3170 + }, + { + "epoch": 1.650702758979698, + "grad_norm": 0.23747544142009083, + "learning_rate": 2.2154021642906107e-05, + "loss": 0.1125, + "step": 3171 + }, + { + "epoch": 1.6512233211868819, + "grad_norm": 0.2304050364200822, + "learning_rate": 2.2140063373167606e-05, + "loss": 0.1068, + "step": 3172 + }, + { + "epoch": 1.6517438833940656, + "grad_norm": 0.23358412449420016, + "learning_rate": 2.212610600673017e-05, + "loss": 0.1059, + "step": 3173 + }, + { + "epoch": 1.6522644456012494, + "grad_norm": 0.21636641417173472, + "learning_rate": 2.21121495480022e-05, + "loss": 0.1055, + "step": 3174 + }, + { + "epoch": 1.6527850078084332, + "grad_norm": 0.22691961656100734, + "learning_rate": 2.2098194001391785e-05, + "loss": 0.1039, + "step": 3175 + }, + { + "epoch": 1.653305570015617, + "grad_norm": 0.23081013769275266, + "learning_rate": 2.2084239371306753e-05, + "loss": 0.1108, + "step": 3176 + }, + { + "epoch": 1.6538261322228007, + "grad_norm": 0.24275473044090903, + "learning_rate": 2.2070285662154607e-05, + "loss": 0.1108, + "step": 3177 + }, + { + "epoch": 1.6543466944299845, + "grad_norm": 0.2161343954274846, + "learning_rate": 2.2056332878342594e-05, + "loss": 0.106, + "step": 3178 + }, + { + "epoch": 1.6548672566371683, + "grad_norm": 0.22720407589672256, + "learning_rate": 2.2042381024277637e-05, + "loss": 0.1044, + "step": 3179 + }, + { + "epoch": 1.655387818844352, + "grad_norm": 0.23757875519411803, + "learning_rate": 2.202843010436639e-05, + "loss": 0.1122, + "step": 3180 + }, + { + "epoch": 1.6559083810515358, + "grad_norm": 0.209322137149078, + "learning_rate": 2.2014480123015214e-05, + "loss": 0.1052, + "step": 3181 + }, + { + "epoch": 1.6564289432587194, + "grad_norm": 0.2238782072859011, + "learning_rate": 2.2000531084630137e-05, + "loss": 0.1092, + "step": 3182 + }, + { + "epoch": 1.6569495054659031, + "grad_norm": 0.21411441676663537, + "learning_rate": 2.1986582993616926e-05, + "loss": 0.0983, + "step": 3183 + }, + { + "epoch": 1.657470067673087, + "grad_norm": 0.23607167726054, + "learning_rate": 2.1972635854381042e-05, + "loss": 0.1062, + "step": 3184 + }, + { + "epoch": 1.6579906298802707, + "grad_norm": 0.23634245749199465, + "learning_rate": 2.1958689671327635e-05, + "loss": 0.1077, + "step": 3185 + }, + { + "epoch": 1.6585111920874545, + "grad_norm": 0.26476311947101455, + "learning_rate": 2.1944744448861557e-05, + "loss": 0.1029, + "step": 3186 + }, + { + "epoch": 1.6590317542946382, + "grad_norm": 0.2225889940109205, + "learning_rate": 2.1930800191387366e-05, + "loss": 0.1019, + "step": 3187 + }, + { + "epoch": 1.659552316501822, + "grad_norm": 0.2123029031111446, + "learning_rate": 2.1916856903309298e-05, + "loss": 0.1013, + "step": 3188 + }, + { + "epoch": 1.6600728787090058, + "grad_norm": 0.23091095057929104, + "learning_rate": 2.19029145890313e-05, + "loss": 0.1065, + "step": 3189 + }, + { + "epoch": 1.6605934409161893, + "grad_norm": 0.24015311654835608, + "learning_rate": 2.188897325295701e-05, + "loss": 0.1039, + "step": 3190 + }, + { + "epoch": 1.661114003123373, + "grad_norm": 0.23370275976758934, + "learning_rate": 2.187503289948973e-05, + "loss": 0.1104, + "step": 3191 + }, + { + "epoch": 1.6616345653305569, + "grad_norm": 0.22827631936172066, + "learning_rate": 2.1861093533032488e-05, + "loss": 0.1033, + "step": 3192 + }, + { + "epoch": 1.6621551275377406, + "grad_norm": 0.2208651631543092, + "learning_rate": 2.1847155157987972e-05, + "loss": 0.1046, + "step": 3193 + }, + { + "epoch": 1.6626756897449244, + "grad_norm": 0.22601221991866668, + "learning_rate": 2.1833217778758584e-05, + "loss": 0.1077, + "step": 3194 + }, + { + "epoch": 1.6631962519521082, + "grad_norm": 0.22968672480262192, + "learning_rate": 2.1819281399746392e-05, + "loss": 0.104, + "step": 3195 + }, + { + "epoch": 1.663716814159292, + "grad_norm": 0.23140457155651345, + "learning_rate": 2.180534602535315e-05, + "loss": 0.1068, + "step": 3196 + }, + { + "epoch": 1.6642373763664757, + "grad_norm": 0.2153723166687647, + "learning_rate": 2.17914116599803e-05, + "loss": 0.1063, + "step": 3197 + }, + { + "epoch": 1.6647579385736595, + "grad_norm": 0.23114435518218374, + "learning_rate": 2.1777478308028965e-05, + "loss": 0.1089, + "step": 3198 + }, + { + "epoch": 1.6652785007808433, + "grad_norm": 0.22821342024755512, + "learning_rate": 2.176354597389995e-05, + "loss": 0.1054, + "step": 3199 + }, + { + "epoch": 1.665799062988027, + "grad_norm": 0.2339992932085678, + "learning_rate": 2.1749614661993715e-05, + "loss": 0.1113, + "step": 3200 + }, + { + "epoch": 1.6663196251952108, + "grad_norm": 0.22259182696219096, + "learning_rate": 2.1735684376710435e-05, + "loss": 0.1044, + "step": 3201 + }, + { + "epoch": 1.6668401874023946, + "grad_norm": 0.2219044472166739, + "learning_rate": 2.1721755122449932e-05, + "loss": 0.1077, + "step": 3202 + }, + { + "epoch": 1.6673607496095784, + "grad_norm": 0.21727491507702684, + "learning_rate": 2.1707826903611726e-05, + "loss": 0.1028, + "step": 3203 + }, + { + "epoch": 1.6678813118167621, + "grad_norm": 0.2299535668577538, + "learning_rate": 2.169389972459498e-05, + "loss": 0.1098, + "step": 3204 + }, + { + "epoch": 1.668401874023946, + "grad_norm": 0.21563012365258863, + "learning_rate": 2.1679973589798564e-05, + "loss": 0.103, + "step": 3205 + }, + { + "epoch": 1.6689224362311297, + "grad_norm": 0.22726073168216962, + "learning_rate": 2.166604850362098e-05, + "loss": 0.1082, + "step": 3206 + }, + { + "epoch": 1.6694429984383135, + "grad_norm": 0.23344147957321737, + "learning_rate": 2.165212447046043e-05, + "loss": 0.11, + "step": 3207 + }, + { + "epoch": 1.6699635606454972, + "grad_norm": 0.21101470102115977, + "learning_rate": 2.163820149471478e-05, + "loss": 0.1067, + "step": 3208 + }, + { + "epoch": 1.670484122852681, + "grad_norm": 0.21852389821603915, + "learning_rate": 2.1624279580781525e-05, + "loss": 0.1094, + "step": 3209 + }, + { + "epoch": 1.6710046850598648, + "grad_norm": 0.21706627954133206, + "learning_rate": 2.1610358733057882e-05, + "loss": 0.1092, + "step": 3210 + }, + { + "epoch": 1.6715252472670485, + "grad_norm": 0.2260538276253177, + "learning_rate": 2.1596438955940682e-05, + "loss": 0.1079, + "step": 3211 + }, + { + "epoch": 1.6720458094742323, + "grad_norm": 0.23132753540493692, + "learning_rate": 2.1582520253826454e-05, + "loss": 0.1061, + "step": 3212 + }, + { + "epoch": 1.672566371681416, + "grad_norm": 0.22764404827151757, + "learning_rate": 2.156860263111136e-05, + "loss": 0.1087, + "step": 3213 + }, + { + "epoch": 1.6730869338885996, + "grad_norm": 0.23998945954385206, + "learning_rate": 2.155468609219125e-05, + "loss": 0.1068, + "step": 3214 + }, + { + "epoch": 1.6736074960957834, + "grad_norm": 0.22270579464295384, + "learning_rate": 2.154077064146159e-05, + "loss": 0.1031, + "step": 3215 + }, + { + "epoch": 1.6741280583029672, + "grad_norm": 0.23422116557049794, + "learning_rate": 2.152685628331755e-05, + "loss": 0.1083, + "step": 3216 + }, + { + "epoch": 1.674648620510151, + "grad_norm": 0.23808475492769957, + "learning_rate": 2.1512943022153926e-05, + "loss": 0.1108, + "step": 3217 + }, + { + "epoch": 1.6751691827173347, + "grad_norm": 0.2336184489419489, + "learning_rate": 2.149903086236516e-05, + "loss": 0.1072, + "step": 3218 + }, + { + "epoch": 1.6756897449245185, + "grad_norm": 0.21905361177253538, + "learning_rate": 2.1485119808345372e-05, + "loss": 0.1037, + "step": 3219 + }, + { + "epoch": 1.6762103071317023, + "grad_norm": 0.20981047900721037, + "learning_rate": 2.147120986448831e-05, + "loss": 0.0976, + "step": 3220 + }, + { + "epoch": 1.676730869338886, + "grad_norm": 0.2545642241580596, + "learning_rate": 2.1457301035187397e-05, + "loss": 0.1093, + "step": 3221 + }, + { + "epoch": 1.6772514315460696, + "grad_norm": 0.2218754138460833, + "learning_rate": 2.1443393324835675e-05, + "loss": 0.1047, + "step": 3222 + }, + { + "epoch": 1.6777719937532534, + "grad_norm": 0.23088102794993018, + "learning_rate": 2.1429486737825854e-05, + "loss": 0.1104, + "step": 3223 + }, + { + "epoch": 1.6782925559604371, + "grad_norm": 0.22005937755272137, + "learning_rate": 2.1415581278550275e-05, + "loss": 0.1073, + "step": 3224 + }, + { + "epoch": 1.678813118167621, + "grad_norm": 0.23400677719772744, + "learning_rate": 2.140167695140094e-05, + "loss": 0.1103, + "step": 3225 + }, + { + "epoch": 1.6793336803748047, + "grad_norm": 0.2365964192307508, + "learning_rate": 2.1387773760769474e-05, + "loss": 0.1085, + "step": 3226 + }, + { + "epoch": 1.6798542425819885, + "grad_norm": 0.21637550071988382, + "learning_rate": 2.137387171104715e-05, + "loss": 0.107, + "step": 3227 + }, + { + "epoch": 1.6803748047891722, + "grad_norm": 0.22333862046046274, + "learning_rate": 2.1359970806624885e-05, + "loss": 0.1084, + "step": 3228 + }, + { + "epoch": 1.680895366996356, + "grad_norm": 0.23203824461964603, + "learning_rate": 2.134607105189323e-05, + "loss": 0.1074, + "step": 3229 + }, + { + "epoch": 1.6814159292035398, + "grad_norm": 0.22765546349056484, + "learning_rate": 2.1332172451242378e-05, + "loss": 0.1069, + "step": 3230 + }, + { + "epoch": 1.6819364914107235, + "grad_norm": 0.23666560244713625, + "learning_rate": 2.131827500906215e-05, + "loss": 0.1034, + "step": 3231 + }, + { + "epoch": 1.6824570536179073, + "grad_norm": 0.2374292849318095, + "learning_rate": 2.1304378729742007e-05, + "loss": 0.1068, + "step": 3232 + }, + { + "epoch": 1.682977615825091, + "grad_norm": 0.2324037821047654, + "learning_rate": 2.129048361767104e-05, + "loss": 0.1038, + "step": 3233 + }, + { + "epoch": 1.6834981780322749, + "grad_norm": 0.23887705711017124, + "learning_rate": 2.127658967723797e-05, + "loss": 0.1116, + "step": 3234 + }, + { + "epoch": 1.6840187402394586, + "grad_norm": 0.24478758959999747, + "learning_rate": 2.126269691283117e-05, + "loss": 0.1078, + "step": 3235 + }, + { + "epoch": 1.6845393024466424, + "grad_norm": 0.24871328706272613, + "learning_rate": 2.124880532883859e-05, + "loss": 0.1081, + "step": 3236 + }, + { + "epoch": 1.6850598646538262, + "grad_norm": 0.2516650860777226, + "learning_rate": 2.123491492964785e-05, + "loss": 0.1085, + "step": 3237 + }, + { + "epoch": 1.68558042686101, + "grad_norm": 0.2253323571868253, + "learning_rate": 2.1221025719646193e-05, + "loss": 0.1019, + "step": 3238 + }, + { + "epoch": 1.6861009890681937, + "grad_norm": 0.22356011592203995, + "learning_rate": 2.1207137703220465e-05, + "loss": 0.1069, + "step": 3239 + }, + { + "epoch": 1.6866215512753775, + "grad_norm": 0.22129126827686282, + "learning_rate": 2.119325088475716e-05, + "loss": 0.1021, + "step": 3240 + }, + { + "epoch": 1.6871421134825613, + "grad_norm": 0.24363724324274239, + "learning_rate": 2.1179365268642374e-05, + "loss": 0.1073, + "step": 3241 + }, + { + "epoch": 1.687662675689745, + "grad_norm": 0.22571502175977035, + "learning_rate": 2.1165480859261838e-05, + "loss": 0.1086, + "step": 3242 + }, + { + "epoch": 1.6881832378969288, + "grad_norm": 0.2333938281028589, + "learning_rate": 2.1151597661000884e-05, + "loss": 0.1116, + "step": 3243 + }, + { + "epoch": 1.6887038001041126, + "grad_norm": 0.2418535802191957, + "learning_rate": 2.1137715678244492e-05, + "loss": 0.1122, + "step": 3244 + }, + { + "epoch": 1.6892243623112964, + "grad_norm": 0.2226092711205871, + "learning_rate": 2.1123834915377212e-05, + "loss": 0.1077, + "step": 3245 + }, + { + "epoch": 1.68974492451848, + "grad_norm": 0.2218422154239549, + "learning_rate": 2.1109955376783247e-05, + "loss": 0.1063, + "step": 3246 + }, + { + "epoch": 1.6902654867256637, + "grad_norm": 0.22432155018614183, + "learning_rate": 2.1096077066846404e-05, + "loss": 0.1042, + "step": 3247 + }, + { + "epoch": 1.6907860489328475, + "grad_norm": 0.23580900748913308, + "learning_rate": 2.1082199989950093e-05, + "loss": 0.1044, + "step": 3248 + }, + { + "epoch": 1.6913066111400312, + "grad_norm": 0.22358704155742054, + "learning_rate": 2.1068324150477346e-05, + "loss": 0.1069, + "step": 3249 + }, + { + "epoch": 1.691827173347215, + "grad_norm": 0.22829186908041899, + "learning_rate": 2.105444955281079e-05, + "loss": 0.1048, + "step": 3250 + }, + { + "epoch": 1.6923477355543988, + "grad_norm": 0.22170478945882835, + "learning_rate": 2.1040576201332685e-05, + "loss": 0.1044, + "step": 3251 + }, + { + "epoch": 1.6928682977615825, + "grad_norm": 0.23340110469012113, + "learning_rate": 2.1026704100424864e-05, + "loss": 0.1078, + "step": 3252 + }, + { + "epoch": 1.6933888599687663, + "grad_norm": 0.22247959374329498, + "learning_rate": 2.10128332544688e-05, + "loss": 0.1072, + "step": 3253 + }, + { + "epoch": 1.6939094221759499, + "grad_norm": 0.24563634319942448, + "learning_rate": 2.0998963667845535e-05, + "loss": 0.1069, + "step": 3254 + }, + { + "epoch": 1.6944299843831336, + "grad_norm": 0.2222993782818447, + "learning_rate": 2.0985095344935733e-05, + "loss": 0.1089, + "step": 3255 + }, + { + "epoch": 1.6949505465903174, + "grad_norm": 0.23032803072350344, + "learning_rate": 2.0971228290119664e-05, + "loss": 0.1065, + "step": 3256 + }, + { + "epoch": 1.6954711087975012, + "grad_norm": 0.22538817632991662, + "learning_rate": 2.0957362507777176e-05, + "loss": 0.1109, + "step": 3257 + }, + { + "epoch": 1.695991671004685, + "grad_norm": 0.22220876229186903, + "learning_rate": 2.0943498002287743e-05, + "loss": 0.1089, + "step": 3258 + }, + { + "epoch": 1.6965122332118687, + "grad_norm": 0.22756130766303528, + "learning_rate": 2.0929634778030408e-05, + "loss": 0.1082, + "step": 3259 + }, + { + "epoch": 1.6970327954190525, + "grad_norm": 0.2324022767031974, + "learning_rate": 2.0915772839383834e-05, + "loss": 0.1048, + "step": 3260 + }, + { + "epoch": 1.6975533576262363, + "grad_norm": 0.2224932908070099, + "learning_rate": 2.0901912190726256e-05, + "loss": 0.1052, + "step": 3261 + }, + { + "epoch": 1.69807391983342, + "grad_norm": 0.22064157879488797, + "learning_rate": 2.0888052836435524e-05, + "loss": 0.1034, + "step": 3262 + }, + { + "epoch": 1.6985944820406038, + "grad_norm": 0.22010208727090902, + "learning_rate": 2.087419478088906e-05, + "loss": 0.1048, + "step": 3263 + }, + { + "epoch": 1.6991150442477876, + "grad_norm": 0.2454058370043552, + "learning_rate": 2.0860338028463876e-05, + "loss": 0.1032, + "step": 3264 + }, + { + "epoch": 1.6996356064549714, + "grad_norm": 0.22591759232291647, + "learning_rate": 2.084648258353659e-05, + "loss": 0.1041, + "step": 3265 + }, + { + "epoch": 1.7001561686621551, + "grad_norm": 0.2269876270874768, + "learning_rate": 2.0832628450483388e-05, + "loss": 0.1054, + "step": 3266 + }, + { + "epoch": 1.700676730869339, + "grad_norm": 0.2323776942869489, + "learning_rate": 2.081877563368006e-05, + "loss": 0.107, + "step": 3267 + }, + { + "epoch": 1.7011972930765227, + "grad_norm": 0.23268241409871376, + "learning_rate": 2.0804924137501955e-05, + "loss": 0.1056, + "step": 3268 + }, + { + "epoch": 1.7017178552837064, + "grad_norm": 0.2356301736735351, + "learning_rate": 2.0791073966324037e-05, + "loss": 0.1035, + "step": 3269 + }, + { + "epoch": 1.7022384174908902, + "grad_norm": 0.22499277174575544, + "learning_rate": 2.0777225124520823e-05, + "loss": 0.1054, + "step": 3270 + }, + { + "epoch": 1.702758979698074, + "grad_norm": 0.2304220231846641, + "learning_rate": 2.0763377616466427e-05, + "loss": 0.108, + "step": 3271 + }, + { + "epoch": 1.7032795419052578, + "grad_norm": 0.21825438924376672, + "learning_rate": 2.0749531446534546e-05, + "loss": 0.1046, + "step": 3272 + }, + { + "epoch": 1.7038001041124415, + "grad_norm": 0.2237918265772953, + "learning_rate": 2.073568661909842e-05, + "loss": 0.1069, + "step": 3273 + }, + { + "epoch": 1.7043206663196253, + "grad_norm": 0.22614928472918566, + "learning_rate": 2.072184313853091e-05, + "loss": 0.1063, + "step": 3274 + }, + { + "epoch": 1.704841228526809, + "grad_norm": 0.21936797781205583, + "learning_rate": 2.070800100920442e-05, + "loss": 0.105, + "step": 3275 + }, + { + "epoch": 1.7053617907339929, + "grad_norm": 0.21812208623696525, + "learning_rate": 2.069416023549095e-05, + "loss": 0.1045, + "step": 3276 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 0.21349218293678732, + "learning_rate": 2.068032082176205e-05, + "loss": 0.1017, + "step": 3277 + }, + { + "epoch": 1.7064029151483602, + "grad_norm": 0.23388102739671973, + "learning_rate": 2.0666482772388853e-05, + "loss": 0.1065, + "step": 3278 + }, + { + "epoch": 1.706923477355544, + "grad_norm": 0.2114916830371581, + "learning_rate": 2.0652646091742063e-05, + "loss": 0.1001, + "step": 3279 + }, + { + "epoch": 1.7074440395627277, + "grad_norm": 0.22727043325186014, + "learning_rate": 2.0638810784191946e-05, + "loss": 0.1113, + "step": 3280 + }, + { + "epoch": 1.7079646017699115, + "grad_norm": 0.2509541792776318, + "learning_rate": 2.0624976854108347e-05, + "loss": 0.1133, + "step": 3281 + }, + { + "epoch": 1.7084851639770953, + "grad_norm": 0.22983253253459388, + "learning_rate": 2.061114430586064e-05, + "loss": 0.1056, + "step": 3282 + }, + { + "epoch": 1.709005726184279, + "grad_norm": 0.22546592705998422, + "learning_rate": 2.0597313143817804e-05, + "loss": 0.1074, + "step": 3283 + }, + { + "epoch": 1.7095262883914628, + "grad_norm": 0.22447733642224385, + "learning_rate": 2.0583483372348356e-05, + "loss": 0.1051, + "step": 3284 + }, + { + "epoch": 1.7100468505986466, + "grad_norm": 0.22044225269872403, + "learning_rate": 2.056965499582039e-05, + "loss": 0.107, + "step": 3285 + }, + { + "epoch": 1.7105674128058301, + "grad_norm": 0.23172108119736853, + "learning_rate": 2.055582801860155e-05, + "loss": 0.1101, + "step": 3286 + }, + { + "epoch": 1.711087975013014, + "grad_norm": 0.22418510588075263, + "learning_rate": 2.0542002445059032e-05, + "loss": 0.1031, + "step": 3287 + }, + { + "epoch": 1.7116085372201977, + "grad_norm": 0.21742221769182057, + "learning_rate": 2.0528178279559596e-05, + "loss": 0.1112, + "step": 3288 + }, + { + "epoch": 1.7121290994273815, + "grad_norm": 0.2285310956948703, + "learning_rate": 2.0514355526469566e-05, + "loss": 0.1082, + "step": 3289 + }, + { + "epoch": 1.7126496616345652, + "grad_norm": 0.23733520699349175, + "learning_rate": 2.0500534190154808e-05, + "loss": 0.1108, + "step": 3290 + }, + { + "epoch": 1.713170223841749, + "grad_norm": 0.224702532100698, + "learning_rate": 2.0486714274980732e-05, + "loss": 0.1054, + "step": 3291 + }, + { + "epoch": 1.7136907860489328, + "grad_norm": 0.23380286724902663, + "learning_rate": 2.0472895785312324e-05, + "loss": 0.1042, + "step": 3292 + }, + { + "epoch": 1.7142113482561165, + "grad_norm": 0.22363815675761445, + "learning_rate": 2.0459078725514092e-05, + "loss": 0.0998, + "step": 3293 + }, + { + "epoch": 1.7147319104633003, + "grad_norm": 0.23504429753004497, + "learning_rate": 2.0445263099950123e-05, + "loss": 0.1084, + "step": 3294 + }, + { + "epoch": 1.715252472670484, + "grad_norm": 0.22724060712973546, + "learning_rate": 2.043144891298402e-05, + "loss": 0.1048, + "step": 3295 + }, + { + "epoch": 1.7157730348776679, + "grad_norm": 0.23183122000489004, + "learning_rate": 2.0417636168978954e-05, + "loss": 0.0994, + "step": 3296 + }, + { + "epoch": 1.7162935970848516, + "grad_norm": 0.2417008008534072, + "learning_rate": 2.040382487229763e-05, + "loss": 0.108, + "step": 3297 + }, + { + "epoch": 1.7168141592920354, + "grad_norm": 0.21706572332557506, + "learning_rate": 2.03900150273023e-05, + "loss": 0.1044, + "step": 3298 + }, + { + "epoch": 1.7173347214992192, + "grad_norm": 0.2250832016912324, + "learning_rate": 2.0376206638354766e-05, + "loss": 0.1072, + "step": 3299 + }, + { + "epoch": 1.717855283706403, + "grad_norm": 0.21945238165897196, + "learning_rate": 2.036239970981633e-05, + "loss": 0.1056, + "step": 3300 + }, + { + "epoch": 1.7183758459135867, + "grad_norm": 0.2217903336181355, + "learning_rate": 2.0348594246047893e-05, + "loss": 0.1022, + "step": 3301 + }, + { + "epoch": 1.7188964081207705, + "grad_norm": 0.21983648892682253, + "learning_rate": 2.0334790251409845e-05, + "loss": 0.1006, + "step": 3302 + }, + { + "epoch": 1.7194169703279543, + "grad_norm": 0.2271718283385726, + "learning_rate": 2.0320987730262132e-05, + "loss": 0.1071, + "step": 3303 + }, + { + "epoch": 1.719937532535138, + "grad_norm": 0.2199145587427751, + "learning_rate": 2.0307186686964245e-05, + "loss": 0.1073, + "step": 3304 + }, + { + "epoch": 1.7204580947423218, + "grad_norm": 0.2231189412324603, + "learning_rate": 2.029338712587518e-05, + "loss": 0.1042, + "step": 3305 + }, + { + "epoch": 1.7209786569495056, + "grad_norm": 0.22383118284350628, + "learning_rate": 2.027958905135349e-05, + "loss": 0.1073, + "step": 3306 + }, + { + "epoch": 1.7214992191566894, + "grad_norm": 0.2179631138427114, + "learning_rate": 2.0265792467757248e-05, + "loss": 0.107, + "step": 3307 + }, + { + "epoch": 1.7220197813638731, + "grad_norm": 0.2367574980395077, + "learning_rate": 2.0251997379444062e-05, + "loss": 0.1114, + "step": 3308 + }, + { + "epoch": 1.722540343571057, + "grad_norm": 0.2103000613187075, + "learning_rate": 2.0238203790771054e-05, + "loss": 0.1028, + "step": 3309 + }, + { + "epoch": 1.7230609057782404, + "grad_norm": 0.21798620187439052, + "learning_rate": 2.0224411706094877e-05, + "loss": 0.1044, + "step": 3310 + }, + { + "epoch": 1.7235814679854242, + "grad_norm": 0.20804477935141746, + "learning_rate": 2.0210621129771722e-05, + "loss": 0.101, + "step": 3311 + }, + { + "epoch": 1.724102030192608, + "grad_norm": 0.22253756149900092, + "learning_rate": 2.019683206615729e-05, + "loss": 0.1045, + "step": 3312 + }, + { + "epoch": 1.7246225923997918, + "grad_norm": 0.21956300028974318, + "learning_rate": 2.018304451960682e-05, + "loss": 0.1073, + "step": 3313 + }, + { + "epoch": 1.7251431546069755, + "grad_norm": 0.21104867004615668, + "learning_rate": 2.016925849447504e-05, + "loss": 0.1049, + "step": 3314 + }, + { + "epoch": 1.7256637168141593, + "grad_norm": 0.21007160419878684, + "learning_rate": 2.015547399511624e-05, + "loss": 0.1002, + "step": 3315 + }, + { + "epoch": 1.726184279021343, + "grad_norm": 0.21210442829270762, + "learning_rate": 2.0141691025884195e-05, + "loss": 0.1039, + "step": 3316 + }, + { + "epoch": 1.7267048412285269, + "grad_norm": 0.20410554097153225, + "learning_rate": 2.0127909591132217e-05, + "loss": 0.1004, + "step": 3317 + }, + { + "epoch": 1.7272254034357104, + "grad_norm": 0.23935102740693862, + "learning_rate": 2.0114129695213114e-05, + "loss": 0.1056, + "step": 3318 + }, + { + "epoch": 1.7277459656428942, + "grad_norm": 0.23313707905631195, + "learning_rate": 2.0100351342479216e-05, + "loss": 0.1102, + "step": 3319 + }, + { + "epoch": 1.728266527850078, + "grad_norm": 0.23139683180145568, + "learning_rate": 2.008657453728238e-05, + "loss": 0.1099, + "step": 3320 + }, + { + "epoch": 1.7287870900572617, + "grad_norm": 0.24872510544166382, + "learning_rate": 2.007279928397395e-05, + "loss": 0.1066, + "step": 3321 + }, + { + "epoch": 1.7293076522644455, + "grad_norm": 0.23200706308434033, + "learning_rate": 2.0059025586904807e-05, + "loss": 0.1054, + "step": 3322 + }, + { + "epoch": 1.7298282144716293, + "grad_norm": 0.21942541308934396, + "learning_rate": 2.004525345042531e-05, + "loss": 0.1028, + "step": 3323 + }, + { + "epoch": 1.730348776678813, + "grad_norm": 0.2609746790506161, + "learning_rate": 2.003148287888535e-05, + "loss": 0.1088, + "step": 3324 + }, + { + "epoch": 1.7308693388859968, + "grad_norm": 0.2366478927625931, + "learning_rate": 2.0017713876634305e-05, + "loss": 0.1059, + "step": 3325 + }, + { + "epoch": 1.7313899010931806, + "grad_norm": 0.22258749938323874, + "learning_rate": 2.000394644802109e-05, + "loss": 0.105, + "step": 3326 + }, + { + "epoch": 1.7319104633003644, + "grad_norm": 0.22200613011043596, + "learning_rate": 1.9990180597394075e-05, + "loss": 0.1024, + "step": 3327 + }, + { + "epoch": 1.7324310255075481, + "grad_norm": 0.22635255297590737, + "learning_rate": 1.9976416329101154e-05, + "loss": 0.1112, + "step": 3328 + }, + { + "epoch": 1.732951587714732, + "grad_norm": 0.21816833986696646, + "learning_rate": 1.9962653647489745e-05, + "loss": 0.1083, + "step": 3329 + }, + { + "epoch": 1.7334721499219157, + "grad_norm": 0.2320214874149184, + "learning_rate": 1.9948892556906727e-05, + "loss": 0.1063, + "step": 3330 + }, + { + "epoch": 1.7339927121290994, + "grad_norm": 0.21304345539614472, + "learning_rate": 1.99351330616985e-05, + "loss": 0.1058, + "step": 3331 + }, + { + "epoch": 1.7345132743362832, + "grad_norm": 0.21513285495170326, + "learning_rate": 1.9921375166210948e-05, + "loss": 0.1045, + "step": 3332 + }, + { + "epoch": 1.735033836543467, + "grad_norm": 0.23454273842647397, + "learning_rate": 1.990761887478946e-05, + "loss": 0.1115, + "step": 3333 + }, + { + "epoch": 1.7355543987506508, + "grad_norm": 0.22338156044764015, + "learning_rate": 1.989386419177891e-05, + "loss": 0.1082, + "step": 3334 + }, + { + "epoch": 1.7360749609578345, + "grad_norm": 0.20836963730690877, + "learning_rate": 1.988011112152367e-05, + "loss": 0.1041, + "step": 3335 + }, + { + "epoch": 1.7365955231650183, + "grad_norm": 0.2236097112876602, + "learning_rate": 1.9866359668367594e-05, + "loss": 0.1086, + "step": 3336 + }, + { + "epoch": 1.737116085372202, + "grad_norm": 0.21123759334297273, + "learning_rate": 1.9852609836654034e-05, + "loss": 0.1062, + "step": 3337 + }, + { + "epoch": 1.7376366475793859, + "grad_norm": 0.22237040640657335, + "learning_rate": 1.9838861630725826e-05, + "loss": 0.108, + "step": 3338 + }, + { + "epoch": 1.7381572097865696, + "grad_norm": 0.2495840089981045, + "learning_rate": 1.982511505492529e-05, + "loss": 0.1076, + "step": 3339 + }, + { + "epoch": 1.7386777719937534, + "grad_norm": 0.22958506111561988, + "learning_rate": 1.9811370113594246e-05, + "loss": 0.1047, + "step": 3340 + }, + { + "epoch": 1.7391983342009372, + "grad_norm": 0.216172102598489, + "learning_rate": 1.9797626811073972e-05, + "loss": 0.1027, + "step": 3341 + }, + { + "epoch": 1.7397188964081207, + "grad_norm": 0.2221979457825088, + "learning_rate": 1.9783885151705252e-05, + "loss": 0.1056, + "step": 3342 + }, + { + "epoch": 1.7402394586153045, + "grad_norm": 0.22693748401563166, + "learning_rate": 1.9770145139828333e-05, + "loss": 0.1086, + "step": 3343 + }, + { + "epoch": 1.7407600208224883, + "grad_norm": 0.2273052352822081, + "learning_rate": 1.975640677978297e-05, + "loss": 0.1023, + "step": 3344 + }, + { + "epoch": 1.741280583029672, + "grad_norm": 0.23018043315752323, + "learning_rate": 1.974267007590835e-05, + "loss": 0.1073, + "step": 3345 + }, + { + "epoch": 1.7418011452368558, + "grad_norm": 0.21977354781533529, + "learning_rate": 1.9728935032543174e-05, + "loss": 0.1025, + "step": 3346 + }, + { + "epoch": 1.7423217074440396, + "grad_norm": 0.22170578071619307, + "learning_rate": 1.9715201654025614e-05, + "loss": 0.1053, + "step": 3347 + }, + { + "epoch": 1.7428422696512234, + "grad_norm": 0.22697352459866138, + "learning_rate": 1.9701469944693298e-05, + "loss": 0.1045, + "step": 3348 + }, + { + "epoch": 1.7433628318584071, + "grad_norm": 0.21983474061455138, + "learning_rate": 1.9687739908883352e-05, + "loss": 0.1015, + "step": 3349 + }, + { + "epoch": 1.7438833940655907, + "grad_norm": 0.22230299934979136, + "learning_rate": 1.967401155093235e-05, + "loss": 0.1075, + "step": 3350 + }, + { + "epoch": 1.7444039562727744, + "grad_norm": 0.2278927921689901, + "learning_rate": 1.9660284875176354e-05, + "loss": 0.104, + "step": 3351 + }, + { + "epoch": 1.7449245184799582, + "grad_norm": 0.2258066335818804, + "learning_rate": 1.9646559885950876e-05, + "loss": 0.1072, + "step": 3352 + }, + { + "epoch": 1.745445080687142, + "grad_norm": 0.21809762204524716, + "learning_rate": 1.9632836587590928e-05, + "loss": 0.1057, + "step": 3353 + }, + { + "epoch": 1.7459656428943258, + "grad_norm": 0.2332642653387811, + "learning_rate": 1.9619114984430946e-05, + "loss": 0.1101, + "step": 3354 + }, + { + "epoch": 1.7464862051015095, + "grad_norm": 0.22377591593309243, + "learning_rate": 1.960539508080485e-05, + "loss": 0.1034, + "step": 3355 + }, + { + "epoch": 1.7470067673086933, + "grad_norm": 0.2214624251282975, + "learning_rate": 1.9591676881046038e-05, + "loss": 0.1051, + "step": 3356 + }, + { + "epoch": 1.747527329515877, + "grad_norm": 0.22969375504708825, + "learning_rate": 1.957796038948734e-05, + "loss": 0.1042, + "step": 3357 + }, + { + "epoch": 1.7480478917230609, + "grad_norm": 0.2212002516504855, + "learning_rate": 1.9564245610461078e-05, + "loss": 0.1056, + "step": 3358 + }, + { + "epoch": 1.7485684539302446, + "grad_norm": 0.226595022233284, + "learning_rate": 1.955053254829901e-05, + "loss": 0.1022, + "step": 3359 + }, + { + "epoch": 1.7490890161374284, + "grad_norm": 0.21300220127544178, + "learning_rate": 1.9536821207332357e-05, + "loss": 0.1032, + "step": 3360 + }, + { + "epoch": 1.7496095783446122, + "grad_norm": 0.2237621475683394, + "learning_rate": 1.95231115918918e-05, + "loss": 0.1029, + "step": 3361 + }, + { + "epoch": 1.750130140551796, + "grad_norm": 0.22048593748404666, + "learning_rate": 1.9509403706307484e-05, + "loss": 0.1053, + "step": 3362 + }, + { + "epoch": 1.7506507027589797, + "grad_norm": 0.22433810240135404, + "learning_rate": 1.9495697554908986e-05, + "loss": 0.102, + "step": 3363 + }, + { + "epoch": 1.7511712649661635, + "grad_norm": 0.22380034181909983, + "learning_rate": 1.948199314202534e-05, + "loss": 0.0986, + "step": 3364 + }, + { + "epoch": 1.7516918271733473, + "grad_norm": 0.22338005341310205, + "learning_rate": 1.946829047198505e-05, + "loss": 0.1035, + "step": 3365 + }, + { + "epoch": 1.752212389380531, + "grad_norm": 0.230442140004699, + "learning_rate": 1.945458954911605e-05, + "loss": 0.108, + "step": 3366 + }, + { + "epoch": 1.7527329515877148, + "grad_norm": 0.2304987431815853, + "learning_rate": 1.944089037774573e-05, + "loss": 0.1082, + "step": 3367 + }, + { + "epoch": 1.7532535137948986, + "grad_norm": 0.2214657123371355, + "learning_rate": 1.942719296220093e-05, + "loss": 0.1027, + "step": 3368 + }, + { + "epoch": 1.7537740760020823, + "grad_norm": 0.20755785482599343, + "learning_rate": 1.9413497306807925e-05, + "loss": 0.1005, + "step": 3369 + }, + { + "epoch": 1.7542946382092661, + "grad_norm": 0.2109976471942146, + "learning_rate": 1.9399803415892454e-05, + "loss": 0.1001, + "step": 3370 + }, + { + "epoch": 1.75481520041645, + "grad_norm": 0.22412574394179605, + "learning_rate": 1.9386111293779673e-05, + "loss": 0.1011, + "step": 3371 + }, + { + "epoch": 1.7553357626236337, + "grad_norm": 0.2276297126469934, + "learning_rate": 1.937242094479419e-05, + "loss": 0.1016, + "step": 3372 + }, + { + "epoch": 1.7558563248308174, + "grad_norm": 0.224838207969345, + "learning_rate": 1.9358732373260056e-05, + "loss": 0.1049, + "step": 3373 + }, + { + "epoch": 1.756376887038001, + "grad_norm": 0.2211883964455484, + "learning_rate": 1.934504558350076e-05, + "loss": 0.1054, + "step": 3374 + }, + { + "epoch": 1.7568974492451848, + "grad_norm": 0.22067590673603887, + "learning_rate": 1.933136057983923e-05, + "loss": 0.1013, + "step": 3375 + }, + { + "epoch": 1.7574180114523685, + "grad_norm": 0.22690702349673778, + "learning_rate": 1.931767736659782e-05, + "loss": 0.1032, + "step": 3376 + }, + { + "epoch": 1.7579385736595523, + "grad_norm": 0.22520906999437268, + "learning_rate": 1.930399594809834e-05, + "loss": 0.1069, + "step": 3377 + }, + { + "epoch": 1.758459135866736, + "grad_norm": 0.22649242565754607, + "learning_rate": 1.9290316328662e-05, + "loss": 0.1049, + "step": 3378 + }, + { + "epoch": 1.7589796980739199, + "grad_norm": 0.22332071105437934, + "learning_rate": 1.9276638512609474e-05, + "loss": 0.1059, + "step": 3379 + }, + { + "epoch": 1.7595002602811036, + "grad_norm": 0.21894476859270845, + "learning_rate": 1.926296250426085e-05, + "loss": 0.1038, + "step": 3380 + }, + { + "epoch": 1.7600208224882874, + "grad_norm": 0.2303137164013817, + "learning_rate": 1.9249288307935642e-05, + "loss": 0.1055, + "step": 3381 + }, + { + "epoch": 1.760541384695471, + "grad_norm": 0.22126996251729855, + "learning_rate": 1.9235615927952804e-05, + "loss": 0.107, + "step": 3382 + }, + { + "epoch": 1.7610619469026547, + "grad_norm": 0.21780067256343733, + "learning_rate": 1.92219453686307e-05, + "loss": 0.1032, + "step": 3383 + }, + { + "epoch": 1.7615825091098385, + "grad_norm": 0.21443493771388783, + "learning_rate": 1.9208276634287143e-05, + "loss": 0.1047, + "step": 3384 + }, + { + "epoch": 1.7621030713170223, + "grad_norm": 0.22561009131363186, + "learning_rate": 1.9194609729239344e-05, + "loss": 0.1056, + "step": 3385 + }, + { + "epoch": 1.762623633524206, + "grad_norm": 0.22455128134416372, + "learning_rate": 1.9180944657803956e-05, + "loss": 0.1071, + "step": 3386 + }, + { + "epoch": 1.7631441957313898, + "grad_norm": 0.2264693768229407, + "learning_rate": 1.9167281424297035e-05, + "loss": 0.1109, + "step": 3387 + }, + { + "epoch": 1.7636647579385736, + "grad_norm": 0.2107756363167944, + "learning_rate": 1.9153620033034075e-05, + "loss": 0.1059, + "step": 3388 + }, + { + "epoch": 1.7641853201457574, + "grad_norm": 0.23268370370945063, + "learning_rate": 1.9139960488329985e-05, + "loss": 0.1125, + "step": 3389 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 0.2373633791421892, + "learning_rate": 1.912630279449906e-05, + "loss": 0.1025, + "step": 3390 + }, + { + "epoch": 1.765226444560125, + "grad_norm": 0.2310037923502031, + "learning_rate": 1.911264695585506e-05, + "loss": 0.1089, + "step": 3391 + }, + { + "epoch": 1.7657470067673087, + "grad_norm": 0.2123561360419581, + "learning_rate": 1.9098992976711123e-05, + "loss": 0.1023, + "step": 3392 + }, + { + "epoch": 1.7662675689744924, + "grad_norm": 0.2306983798259102, + "learning_rate": 1.9085340861379813e-05, + "loss": 0.1126, + "step": 3393 + }, + { + "epoch": 1.7667881311816762, + "grad_norm": 0.23277860448314053, + "learning_rate": 1.9071690614173102e-05, + "loss": 0.1029, + "step": 3394 + }, + { + "epoch": 1.76730869338886, + "grad_norm": 0.2087088586057, + "learning_rate": 1.9058042239402378e-05, + "loss": 0.0985, + "step": 3395 + }, + { + "epoch": 1.7678292555960438, + "grad_norm": 0.21479438193487035, + "learning_rate": 1.9044395741378425e-05, + "loss": 0.1002, + "step": 3396 + }, + { + "epoch": 1.7683498178032275, + "grad_norm": 0.22921709100541876, + "learning_rate": 1.903075112441145e-05, + "loss": 0.1058, + "step": 3397 + }, + { + "epoch": 1.7688703800104113, + "grad_norm": 0.22434060260788052, + "learning_rate": 1.9017108392811065e-05, + "loss": 0.1044, + "step": 3398 + }, + { + "epoch": 1.769390942217595, + "grad_norm": 0.23525431934160432, + "learning_rate": 1.9003467550886253e-05, + "loss": 0.1125, + "step": 3399 + }, + { + "epoch": 1.7699115044247788, + "grad_norm": 0.22009803271545916, + "learning_rate": 1.8989828602945454e-05, + "loss": 0.1047, + "step": 3400 + }, + { + "epoch": 1.7704320666319626, + "grad_norm": 0.22537635925080748, + "learning_rate": 1.897619155329646e-05, + "loss": 0.1043, + "step": 3401 + }, + { + "epoch": 1.7709526288391464, + "grad_norm": 0.2367752202655961, + "learning_rate": 1.8962556406246505e-05, + "loss": 0.1035, + "step": 3402 + }, + { + "epoch": 1.7714731910463302, + "grad_norm": 0.20474450022021, + "learning_rate": 1.8948923166102192e-05, + "loss": 0.0994, + "step": 3403 + }, + { + "epoch": 1.771993753253514, + "grad_norm": 0.21437274278815563, + "learning_rate": 1.893529183716954e-05, + "loss": 0.0997, + "step": 3404 + }, + { + "epoch": 1.7725143154606977, + "grad_norm": 0.22322450199964058, + "learning_rate": 1.892166242375395e-05, + "loss": 0.0996, + "step": 3405 + }, + { + "epoch": 1.7730348776678813, + "grad_norm": 0.22023539278452614, + "learning_rate": 1.8908034930160228e-05, + "loss": 0.1039, + "step": 3406 + }, + { + "epoch": 1.773555439875065, + "grad_norm": 0.2114044276184025, + "learning_rate": 1.889440936069258e-05, + "loss": 0.1026, + "step": 3407 + }, + { + "epoch": 1.7740760020822488, + "grad_norm": 0.21695286591737722, + "learning_rate": 1.8880785719654577e-05, + "loss": 0.1037, + "step": 3408 + }, + { + "epoch": 1.7745965642894326, + "grad_norm": 0.2363632816801158, + "learning_rate": 1.8867164011349208e-05, + "loss": 0.1071, + "step": 3409 + }, + { + "epoch": 1.7751171264966163, + "grad_norm": 0.22771210911879075, + "learning_rate": 1.885354424007884e-05, + "loss": 0.1083, + "step": 3410 + }, + { + "epoch": 1.7756376887038001, + "grad_norm": 0.2274868597566939, + "learning_rate": 1.8839926410145235e-05, + "loss": 0.1077, + "step": 3411 + }, + { + "epoch": 1.776158250910984, + "grad_norm": 0.21891521186548685, + "learning_rate": 1.882631052584953e-05, + "loss": 0.1022, + "step": 3412 + }, + { + "epoch": 1.7766788131181677, + "grad_norm": 0.22044753666985667, + "learning_rate": 1.8812696591492265e-05, + "loss": 0.1027, + "step": 3413 + }, + { + "epoch": 1.7771993753253512, + "grad_norm": 0.23004226671888997, + "learning_rate": 1.8799084611373345e-05, + "loss": 0.1085, + "step": 3414 + }, + { + "epoch": 1.777719937532535, + "grad_norm": 0.21243628528006495, + "learning_rate": 1.8785474589792074e-05, + "loss": 0.1044, + "step": 3415 + }, + { + "epoch": 1.7782404997397188, + "grad_norm": 0.220110579204587, + "learning_rate": 1.8771866531047133e-05, + "loss": 0.1023, + "step": 3416 + }, + { + "epoch": 1.7787610619469025, + "grad_norm": 0.22305394623742394, + "learning_rate": 1.8758260439436563e-05, + "loss": 0.1081, + "step": 3417 + }, + { + "epoch": 1.7792816241540863, + "grad_norm": 0.2276585098729061, + "learning_rate": 1.8744656319257817e-05, + "loss": 0.1073, + "step": 3418 + }, + { + "epoch": 1.77980218636127, + "grad_norm": 0.22252174513686673, + "learning_rate": 1.87310541748077e-05, + "loss": 0.1079, + "step": 3419 + }, + { + "epoch": 1.7803227485684539, + "grad_norm": 0.21570267838003582, + "learning_rate": 1.8717454010382407e-05, + "loss": 0.1049, + "step": 3420 + }, + { + "epoch": 1.7808433107756376, + "grad_norm": 0.21778548124027283, + "learning_rate": 1.87038558302775e-05, + "loss": 0.1014, + "step": 3421 + }, + { + "epoch": 1.7813638729828214, + "grad_norm": 0.21552687109805088, + "learning_rate": 1.8690259638787926e-05, + "loss": 0.0991, + "step": 3422 + }, + { + "epoch": 1.7818844351900052, + "grad_norm": 0.2345671900413482, + "learning_rate": 1.867666544020798e-05, + "loss": 0.107, + "step": 3423 + }, + { + "epoch": 1.782404997397189, + "grad_norm": 0.23258757335297145, + "learning_rate": 1.866307323883136e-05, + "loss": 0.1105, + "step": 3424 + }, + { + "epoch": 1.7829255596043727, + "grad_norm": 0.21889647232741902, + "learning_rate": 1.8649483038951107e-05, + "loss": 0.107, + "step": 3425 + }, + { + "epoch": 1.7834461218115565, + "grad_norm": 0.22215635618720045, + "learning_rate": 1.863589484485963e-05, + "loss": 0.1066, + "step": 3426 + }, + { + "epoch": 1.7839666840187403, + "grad_norm": 0.21415158989479102, + "learning_rate": 1.8622308660848724e-05, + "loss": 0.1029, + "step": 3427 + }, + { + "epoch": 1.784487246225924, + "grad_norm": 0.22781030552769713, + "learning_rate": 1.860872449120953e-05, + "loss": 0.1087, + "step": 3428 + }, + { + "epoch": 1.7850078084331078, + "grad_norm": 0.22290517689214315, + "learning_rate": 1.8595142340232575e-05, + "loss": 0.1019, + "step": 3429 + }, + { + "epoch": 1.7855283706402916, + "grad_norm": 0.21633222759985674, + "learning_rate": 1.8581562212207714e-05, + "loss": 0.1021, + "step": 3430 + }, + { + "epoch": 1.7860489328474753, + "grad_norm": 0.230030532368222, + "learning_rate": 1.8567984111424205e-05, + "loss": 0.1109, + "step": 3431 + }, + { + "epoch": 1.7865694950546591, + "grad_norm": 0.22735979206413187, + "learning_rate": 1.8554408042170628e-05, + "loss": 0.1089, + "step": 3432 + }, + { + "epoch": 1.787090057261843, + "grad_norm": 0.2253631691551036, + "learning_rate": 1.8540834008734943e-05, + "loss": 0.1023, + "step": 3433 + }, + { + "epoch": 1.7876106194690267, + "grad_norm": 0.21969328560623388, + "learning_rate": 1.8527262015404477e-05, + "loss": 0.1046, + "step": 3434 + }, + { + "epoch": 1.7881311816762104, + "grad_norm": 0.22505681618644893, + "learning_rate": 1.8513692066465866e-05, + "loss": 0.1061, + "step": 3435 + }, + { + "epoch": 1.7886517438833942, + "grad_norm": 0.22392160124207264, + "learning_rate": 1.8500124166205152e-05, + "loss": 0.1055, + "step": 3436 + }, + { + "epoch": 1.789172306090578, + "grad_norm": 0.22703973700198465, + "learning_rate": 1.84865583189077e-05, + "loss": 0.1053, + "step": 3437 + }, + { + "epoch": 1.7896928682977615, + "grad_norm": 0.2235806887467908, + "learning_rate": 1.847299452885824e-05, + "loss": 0.1059, + "step": 3438 + }, + { + "epoch": 1.7902134305049453, + "grad_norm": 0.22095971299700543, + "learning_rate": 1.8459432800340855e-05, + "loss": 0.1015, + "step": 3439 + }, + { + "epoch": 1.790733992712129, + "grad_norm": 0.23339506867569496, + "learning_rate": 1.8445873137638953e-05, + "loss": 0.1044, + "step": 3440 + }, + { + "epoch": 1.7912545549193128, + "grad_norm": 0.21709782713883458, + "learning_rate": 1.8432315545035328e-05, + "loss": 0.1034, + "step": 3441 + }, + { + "epoch": 1.7917751171264966, + "grad_norm": 0.2165477854014385, + "learning_rate": 1.841876002681208e-05, + "loss": 0.1041, + "step": 3442 + }, + { + "epoch": 1.7922956793336804, + "grad_norm": 0.23599068704496834, + "learning_rate": 1.840520658725069e-05, + "loss": 0.1082, + "step": 3443 + }, + { + "epoch": 1.7928162415408642, + "grad_norm": 0.22660187330424075, + "learning_rate": 1.8391655230631953e-05, + "loss": 0.1083, + "step": 3444 + }, + { + "epoch": 1.793336803748048, + "grad_norm": 0.21818235771020988, + "learning_rate": 1.837810596123601e-05, + "loss": 0.1041, + "step": 3445 + }, + { + "epoch": 1.7938573659552315, + "grad_norm": 0.21184422424865806, + "learning_rate": 1.836455878334237e-05, + "loss": 0.1013, + "step": 3446 + }, + { + "epoch": 1.7943779281624153, + "grad_norm": 0.21802505383850326, + "learning_rate": 1.8351013701229846e-05, + "loss": 0.1054, + "step": 3447 + }, + { + "epoch": 1.794898490369599, + "grad_norm": 0.2155910468353952, + "learning_rate": 1.833747071917662e-05, + "loss": 0.1011, + "step": 3448 + }, + { + "epoch": 1.7954190525767828, + "grad_norm": 0.21140794526123036, + "learning_rate": 1.832392984146018e-05, + "loss": 0.1022, + "step": 3449 + }, + { + "epoch": 1.7959396147839666, + "grad_norm": 0.20930278134313027, + "learning_rate": 1.8310391072357382e-05, + "loss": 0.0955, + "step": 3450 + }, + { + "epoch": 1.7964601769911503, + "grad_norm": 0.2316473180383841, + "learning_rate": 1.829685441614438e-05, + "loss": 0.1055, + "step": 3451 + }, + { + "epoch": 1.7969807391983341, + "grad_norm": 0.23166286020742097, + "learning_rate": 1.8283319877096705e-05, + "loss": 0.1025, + "step": 3452 + }, + { + "epoch": 1.797501301405518, + "grad_norm": 0.22381973015644815, + "learning_rate": 1.8269787459489174e-05, + "loss": 0.1048, + "step": 3453 + }, + { + "epoch": 1.7980218636127017, + "grad_norm": 0.22774693899244658, + "learning_rate": 1.8256257167595957e-05, + "loss": 0.1028, + "step": 3454 + }, + { + "epoch": 1.7985424258198854, + "grad_norm": 0.22509542841432917, + "learning_rate": 1.8242729005690557e-05, + "loss": 0.1012, + "step": 3455 + }, + { + "epoch": 1.7990629880270692, + "grad_norm": 0.22202303234657486, + "learning_rate": 1.822920297804579e-05, + "loss": 0.1025, + "step": 3456 + }, + { + "epoch": 1.799583550234253, + "grad_norm": 0.2268089623653908, + "learning_rate": 1.8215679088933813e-05, + "loss": 0.1046, + "step": 3457 + }, + { + "epoch": 1.8001041124414368, + "grad_norm": 0.22470714993267865, + "learning_rate": 1.8202157342626087e-05, + "loss": 0.1064, + "step": 3458 + }, + { + "epoch": 1.8006246746486205, + "grad_norm": 0.2153205950636323, + "learning_rate": 1.8188637743393423e-05, + "loss": 0.1003, + "step": 3459 + }, + { + "epoch": 1.8011452368558043, + "grad_norm": 0.20954593624683357, + "learning_rate": 1.8175120295505925e-05, + "loss": 0.1031, + "step": 3460 + }, + { + "epoch": 1.801665799062988, + "grad_norm": 0.21853659329976693, + "learning_rate": 1.8161605003233056e-05, + "loss": 0.1043, + "step": 3461 + }, + { + "epoch": 1.8021863612701718, + "grad_norm": 0.21719611628693516, + "learning_rate": 1.8148091870843554e-05, + "loss": 0.1029, + "step": 3462 + }, + { + "epoch": 1.8027069234773556, + "grad_norm": 0.21032988240795134, + "learning_rate": 1.813458090260549e-05, + "loss": 0.1012, + "step": 3463 + }, + { + "epoch": 1.8032274856845394, + "grad_norm": 0.22087081939591202, + "learning_rate": 1.8121072102786274e-05, + "loss": 0.1082, + "step": 3464 + }, + { + "epoch": 1.8037480478917232, + "grad_norm": 0.2190982660554764, + "learning_rate": 1.81075654756526e-05, + "loss": 0.1058, + "step": 3465 + }, + { + "epoch": 1.804268610098907, + "grad_norm": 0.22040569387096418, + "learning_rate": 1.8094061025470498e-05, + "loss": 0.1001, + "step": 3466 + }, + { + "epoch": 1.8047891723060907, + "grad_norm": 0.21603368220817645, + "learning_rate": 1.8080558756505294e-05, + "loss": 0.0998, + "step": 3467 + }, + { + "epoch": 1.8053097345132745, + "grad_norm": 0.20574278416396982, + "learning_rate": 1.8067058673021646e-05, + "loss": 0.099, + "step": 3468 + }, + { + "epoch": 1.8058302967204583, + "grad_norm": 0.2277361813536661, + "learning_rate": 1.8053560779283495e-05, + "loss": 0.1053, + "step": 3469 + }, + { + "epoch": 1.8063508589276418, + "grad_norm": 0.21219090161169762, + "learning_rate": 1.804006507955411e-05, + "loss": 0.1013, + "step": 3470 + }, + { + "epoch": 1.8068714211348256, + "grad_norm": 0.22700160243005704, + "learning_rate": 1.8026571578096068e-05, + "loss": 0.1038, + "step": 3471 + }, + { + "epoch": 1.8073919833420093, + "grad_norm": 0.2369122624279523, + "learning_rate": 1.8013080279171228e-05, + "loss": 0.1123, + "step": 3472 + }, + { + "epoch": 1.8079125455491931, + "grad_norm": 0.22360440845976917, + "learning_rate": 1.799959118704078e-05, + "loss": 0.1048, + "step": 3473 + }, + { + "epoch": 1.808433107756377, + "grad_norm": 0.2104592319944269, + "learning_rate": 1.7986104305965205e-05, + "loss": 0.0973, + "step": 3474 + }, + { + "epoch": 1.8089536699635607, + "grad_norm": 0.218593446956845, + "learning_rate": 1.7972619640204296e-05, + "loss": 0.1043, + "step": 3475 + }, + { + "epoch": 1.8094742321707444, + "grad_norm": 0.22148524142264853, + "learning_rate": 1.795913719401712e-05, + "loss": 0.1004, + "step": 3476 + }, + { + "epoch": 1.8099947943779282, + "grad_norm": 0.2155675656645299, + "learning_rate": 1.7945656971662085e-05, + "loss": 0.1029, + "step": 3477 + }, + { + "epoch": 1.8105153565851118, + "grad_norm": 0.23004321978824924, + "learning_rate": 1.7932178977396848e-05, + "loss": 0.1071, + "step": 3478 + }, + { + "epoch": 1.8110359187922955, + "grad_norm": 0.2323235989966905, + "learning_rate": 1.79187032154784e-05, + "loss": 0.1088, + "step": 3479 + }, + { + "epoch": 1.8115564809994793, + "grad_norm": 0.2174771770164572, + "learning_rate": 1.7905229690163023e-05, + "loss": 0.1032, + "step": 3480 + }, + { + "epoch": 1.812077043206663, + "grad_norm": 0.21461562545103127, + "learning_rate": 1.789175840570626e-05, + "loss": 0.1015, + "step": 3481 + }, + { + "epoch": 1.8125976054138468, + "grad_norm": 0.21080577930269348, + "learning_rate": 1.7878289366362984e-05, + "loss": 0.1057, + "step": 3482 + }, + { + "epoch": 1.8131181676210306, + "grad_norm": 0.2166558884121671, + "learning_rate": 1.7864822576387342e-05, + "loss": 0.1021, + "step": 3483 + }, + { + "epoch": 1.8136387298282144, + "grad_norm": 0.22936772903160424, + "learning_rate": 1.7851358040032774e-05, + "loss": 0.1058, + "step": 3484 + }, + { + "epoch": 1.8141592920353982, + "grad_norm": 0.21693822231883433, + "learning_rate": 1.7837895761552002e-05, + "loss": 0.0993, + "step": 3485 + }, + { + "epoch": 1.814679854242582, + "grad_norm": 0.21424260587284594, + "learning_rate": 1.782443574519705e-05, + "loss": 0.1045, + "step": 3486 + }, + { + "epoch": 1.8152004164497657, + "grad_norm": 0.21784231944571822, + "learning_rate": 1.7810977995219203e-05, + "loss": 0.101, + "step": 3487 + }, + { + "epoch": 1.8157209786569495, + "grad_norm": 0.2337885632751051, + "learning_rate": 1.7797522515869062e-05, + "loss": 0.1076, + "step": 3488 + }, + { + "epoch": 1.8162415408641333, + "grad_norm": 0.2281419186886461, + "learning_rate": 1.778406931139649e-05, + "loss": 0.1054, + "step": 3489 + }, + { + "epoch": 1.816762103071317, + "grad_norm": 0.21997153529814273, + "learning_rate": 1.777061838605062e-05, + "loss": 0.1057, + "step": 3490 + }, + { + "epoch": 1.8172826652785008, + "grad_norm": 0.21614117988760198, + "learning_rate": 1.7757169744079893e-05, + "loss": 0.1016, + "step": 3491 + }, + { + "epoch": 1.8178032274856846, + "grad_norm": 0.214012852621221, + "learning_rate": 1.774372338973201e-05, + "loss": 0.1044, + "step": 3492 + }, + { + "epoch": 1.8183237896928683, + "grad_norm": 0.21662067485067615, + "learning_rate": 1.7730279327253962e-05, + "loss": 0.1017, + "step": 3493 + }, + { + "epoch": 1.8188443519000521, + "grad_norm": 0.22564115079401081, + "learning_rate": 1.7716837560892e-05, + "loss": 0.1052, + "step": 3494 + }, + { + "epoch": 1.8193649141072359, + "grad_norm": 0.21323173454794608, + "learning_rate": 1.7703398094891673e-05, + "loss": 0.0973, + "step": 3495 + }, + { + "epoch": 1.8198854763144197, + "grad_norm": 0.21838902892220904, + "learning_rate": 1.7689960933497778e-05, + "loss": 0.1028, + "step": 3496 + }, + { + "epoch": 1.8204060385216034, + "grad_norm": 0.2271905848355706, + "learning_rate": 1.76765260809544e-05, + "loss": 0.1036, + "step": 3497 + }, + { + "epoch": 1.8209266007287872, + "grad_norm": 0.22011061951043087, + "learning_rate": 1.7663093541504905e-05, + "loss": 0.1019, + "step": 3498 + }, + { + "epoch": 1.821447162935971, + "grad_norm": 0.22881874170767702, + "learning_rate": 1.7649663319391883e-05, + "loss": 0.1073, + "step": 3499 + }, + { + "epoch": 1.8219677251431547, + "grad_norm": 0.22620885690592923, + "learning_rate": 1.7636235418857245e-05, + "loss": 0.1025, + "step": 3500 + }, + { + "epoch": 1.8224882873503385, + "grad_norm": 0.23410418806456498, + "learning_rate": 1.7622809844142137e-05, + "loss": 0.102, + "step": 3501 + }, + { + "epoch": 1.823008849557522, + "grad_norm": 0.22101701713784802, + "learning_rate": 1.760938659948699e-05, + "loss": 0.102, + "step": 3502 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 0.22714340766149158, + "learning_rate": 1.7595965689131484e-05, + "loss": 0.1001, + "step": 3503 + }, + { + "epoch": 1.8240499739718896, + "grad_norm": 0.22057919964249992, + "learning_rate": 1.7582547117314563e-05, + "loss": 0.102, + "step": 3504 + }, + { + "epoch": 1.8245705361790734, + "grad_norm": 0.21511543280496126, + "learning_rate": 1.7569130888274446e-05, + "loss": 0.1043, + "step": 3505 + }, + { + "epoch": 1.8250910983862572, + "grad_norm": 0.21440452554779588, + "learning_rate": 1.7555717006248594e-05, + "loss": 0.0959, + "step": 3506 + }, + { + "epoch": 1.825611660593441, + "grad_norm": 0.22709181091908154, + "learning_rate": 1.7542305475473746e-05, + "loss": 0.1043, + "step": 3507 + }, + { + "epoch": 1.8261322228006247, + "grad_norm": 0.2268531896037747, + "learning_rate": 1.7528896300185878e-05, + "loss": 0.103, + "step": 3508 + }, + { + "epoch": 1.8266527850078085, + "grad_norm": 0.22430552658597233, + "learning_rate": 1.751548948462023e-05, + "loss": 0.1041, + "step": 3509 + }, + { + "epoch": 1.827173347214992, + "grad_norm": 0.22793206081910328, + "learning_rate": 1.7502085033011302e-05, + "loss": 0.1019, + "step": 3510 + }, + { + "epoch": 1.8276939094221758, + "grad_norm": 0.23464781588284941, + "learning_rate": 1.748868294959284e-05, + "loss": 0.1054, + "step": 3511 + }, + { + "epoch": 1.8282144716293596, + "grad_norm": 0.22172768017872468, + "learning_rate": 1.7475283238597857e-05, + "loss": 0.1017, + "step": 3512 + }, + { + "epoch": 1.8287350338365433, + "grad_norm": 0.2518798662916246, + "learning_rate": 1.746188590425859e-05, + "loss": 0.1055, + "step": 3513 + }, + { + "epoch": 1.8292555960437271, + "grad_norm": 0.21708297533262325, + "learning_rate": 1.7448490950806552e-05, + "loss": 0.1041, + "step": 3514 + }, + { + "epoch": 1.829776158250911, + "grad_norm": 0.23188297562996182, + "learning_rate": 1.7435098382472486e-05, + "loss": 0.1094, + "step": 3515 + }, + { + "epoch": 1.8302967204580947, + "grad_norm": 0.21477736177121873, + "learning_rate": 1.74217082034864e-05, + "loss": 0.1037, + "step": 3516 + }, + { + "epoch": 1.8308172826652784, + "grad_norm": 0.2174218899457457, + "learning_rate": 1.740832041807752e-05, + "loss": 0.1041, + "step": 3517 + }, + { + "epoch": 1.8313378448724622, + "grad_norm": 0.22509449830210138, + "learning_rate": 1.7394935030474335e-05, + "loss": 0.0994, + "step": 3518 + }, + { + "epoch": 1.831858407079646, + "grad_norm": 0.2141160924638327, + "learning_rate": 1.738155204490458e-05, + "loss": 0.0995, + "step": 3519 + }, + { + "epoch": 1.8323789692868298, + "grad_norm": 0.23187384577530246, + "learning_rate": 1.7368171465595222e-05, + "loss": 0.1086, + "step": 3520 + }, + { + "epoch": 1.8328995314940135, + "grad_norm": 0.2130230900152308, + "learning_rate": 1.735479329677247e-05, + "loss": 0.1006, + "step": 3521 + }, + { + "epoch": 1.8334200937011973, + "grad_norm": 0.2235989907682875, + "learning_rate": 1.7341417542661767e-05, + "loss": 0.1055, + "step": 3522 + }, + { + "epoch": 1.833940655908381, + "grad_norm": 0.2304050686195293, + "learning_rate": 1.732804420748781e-05, + "loss": 0.1042, + "step": 3523 + }, + { + "epoch": 1.8344612181155648, + "grad_norm": 0.21354769016325825, + "learning_rate": 1.731467329547451e-05, + "loss": 0.0985, + "step": 3524 + }, + { + "epoch": 1.8349817803227486, + "grad_norm": 0.22881459978388433, + "learning_rate": 1.7301304810845037e-05, + "loss": 0.1009, + "step": 3525 + }, + { + "epoch": 1.8355023425299324, + "grad_norm": 0.21108826324764815, + "learning_rate": 1.7287938757821765e-05, + "loss": 0.1042, + "step": 3526 + }, + { + "epoch": 1.8360229047371162, + "grad_norm": 0.23139125405119665, + "learning_rate": 1.7274575140626318e-05, + "loss": 0.1075, + "step": 3527 + }, + { + "epoch": 1.8365434669443, + "grad_norm": 0.2174665102211108, + "learning_rate": 1.7261213963479556e-05, + "loss": 0.1041, + "step": 3528 + }, + { + "epoch": 1.8370640291514837, + "grad_norm": 0.22610247555368754, + "learning_rate": 1.724785523060155e-05, + "loss": 0.1009, + "step": 3529 + }, + { + "epoch": 1.8375845913586675, + "grad_norm": 0.22191495291139376, + "learning_rate": 1.7234498946211625e-05, + "loss": 0.1066, + "step": 3530 + }, + { + "epoch": 1.8381051535658512, + "grad_norm": 0.22121309383263021, + "learning_rate": 1.7221145114528297e-05, + "loss": 0.1068, + "step": 3531 + }, + { + "epoch": 1.838625715773035, + "grad_norm": 0.2133266726862183, + "learning_rate": 1.7207793739769352e-05, + "loss": 0.1021, + "step": 3532 + }, + { + "epoch": 1.8391462779802188, + "grad_norm": 0.218435643614945, + "learning_rate": 1.7194444826151752e-05, + "loss": 0.1027, + "step": 3533 + }, + { + "epoch": 1.8396668401874023, + "grad_norm": 0.21306246655698843, + "learning_rate": 1.7181098377891723e-05, + "loss": 0.1028, + "step": 3534 + }, + { + "epoch": 1.8401874023945861, + "grad_norm": 0.2316381773531802, + "learning_rate": 1.7167754399204683e-05, + "loss": 0.1087, + "step": 3535 + }, + { + "epoch": 1.8407079646017699, + "grad_norm": 0.2324977841120054, + "learning_rate": 1.7154412894305283e-05, + "loss": 0.1083, + "step": 3536 + }, + { + "epoch": 1.8412285268089537, + "grad_norm": 0.21543732870890425, + "learning_rate": 1.7141073867407397e-05, + "loss": 0.1004, + "step": 3537 + }, + { + "epoch": 1.8417490890161374, + "grad_norm": 0.21111111952706546, + "learning_rate": 1.71277373227241e-05, + "loss": 0.1035, + "step": 3538 + }, + { + "epoch": 1.8422696512233212, + "grad_norm": 0.20655633250382696, + "learning_rate": 1.7114403264467703e-05, + "loss": 0.098, + "step": 3539 + }, + { + "epoch": 1.842790213430505, + "grad_norm": 0.22438903873520755, + "learning_rate": 1.710107169684972e-05, + "loss": 0.1067, + "step": 3540 + }, + { + "epoch": 1.8433107756376887, + "grad_norm": 0.2199314935079129, + "learning_rate": 1.7087742624080883e-05, + "loss": 0.1021, + "step": 3541 + }, + { + "epoch": 1.8438313378448723, + "grad_norm": 0.2188004654542698, + "learning_rate": 1.7074416050371122e-05, + "loss": 0.1063, + "step": 3542 + }, + { + "epoch": 1.844351900052056, + "grad_norm": 0.22136277736501705, + "learning_rate": 1.7061091979929612e-05, + "loss": 0.1041, + "step": 3543 + }, + { + "epoch": 1.8448724622592398, + "grad_norm": 0.22383412015373572, + "learning_rate": 1.7047770416964688e-05, + "loss": 0.1037, + "step": 3544 + }, + { + "epoch": 1.8453930244664236, + "grad_norm": 0.22366283880625565, + "learning_rate": 1.7034451365683927e-05, + "loss": 0.1057, + "step": 3545 + }, + { + "epoch": 1.8459135866736074, + "grad_norm": 0.22075942408527013, + "learning_rate": 1.702113483029412e-05, + "loss": 0.1017, + "step": 3546 + }, + { + "epoch": 1.8464341488807912, + "grad_norm": 0.2313958085020113, + "learning_rate": 1.700782081500123e-05, + "loss": 0.1046, + "step": 3547 + }, + { + "epoch": 1.846954711087975, + "grad_norm": 0.22215124468148365, + "learning_rate": 1.6994509324010457e-05, + "loss": 0.1038, + "step": 3548 + }, + { + "epoch": 1.8474752732951587, + "grad_norm": 0.2140104139101645, + "learning_rate": 1.6981200361526177e-05, + "loss": 0.1016, + "step": 3549 + }, + { + "epoch": 1.8479958355023425, + "grad_norm": 0.2199758204376425, + "learning_rate": 1.6967893931751988e-05, + "loss": 0.0985, + "step": 3550 + }, + { + "epoch": 1.8485163977095262, + "grad_norm": 0.21980628404627928, + "learning_rate": 1.695459003889068e-05, + "loss": 0.1027, + "step": 3551 + }, + { + "epoch": 1.84903695991671, + "grad_norm": 0.22964814499456035, + "learning_rate": 1.694128868714424e-05, + "loss": 0.1008, + "step": 3552 + }, + { + "epoch": 1.8495575221238938, + "grad_norm": 0.2194752903061473, + "learning_rate": 1.692798988071385e-05, + "loss": 0.1043, + "step": 3553 + }, + { + "epoch": 1.8500780843310776, + "grad_norm": 0.22695773488044693, + "learning_rate": 1.6914693623799894e-05, + "loss": 0.1063, + "step": 3554 + }, + { + "epoch": 1.8505986465382613, + "grad_norm": 0.22960187036725627, + "learning_rate": 1.690139992060195e-05, + "loss": 0.1032, + "step": 3555 + }, + { + "epoch": 1.851119208745445, + "grad_norm": 0.2174160418036627, + "learning_rate": 1.6888108775318785e-05, + "loss": 0.1009, + "step": 3556 + }, + { + "epoch": 1.8516397709526289, + "grad_norm": 0.2234493817495722, + "learning_rate": 1.6874820192148365e-05, + "loss": 0.104, + "step": 3557 + }, + { + "epoch": 1.8521603331598127, + "grad_norm": 0.220450115790806, + "learning_rate": 1.686153417528784e-05, + "loss": 0.1016, + "step": 3558 + }, + { + "epoch": 1.8526808953669964, + "grad_norm": 0.2274125870596465, + "learning_rate": 1.684825072893356e-05, + "loss": 0.1098, + "step": 3559 + }, + { + "epoch": 1.8532014575741802, + "grad_norm": 0.22368136276174644, + "learning_rate": 1.6834969857281042e-05, + "loss": 0.1048, + "step": 3560 + }, + { + "epoch": 1.853722019781364, + "grad_norm": 0.22278441077025057, + "learning_rate": 1.682169156452502e-05, + "loss": 0.1021, + "step": 3561 + }, + { + "epoch": 1.8542425819885477, + "grad_norm": 0.2144032315295473, + "learning_rate": 1.6808415854859384e-05, + "loss": 0.1009, + "step": 3562 + }, + { + "epoch": 1.8547631441957315, + "grad_norm": 0.21883351924688985, + "learning_rate": 1.679514273247722e-05, + "loss": 0.1046, + "step": 3563 + }, + { + "epoch": 1.8552837064029153, + "grad_norm": 0.21804994979018316, + "learning_rate": 1.678187220157081e-05, + "loss": 0.105, + "step": 3564 + }, + { + "epoch": 1.855804268610099, + "grad_norm": 0.23179533965357424, + "learning_rate": 1.6768604266331585e-05, + "loss": 0.1049, + "step": 3565 + }, + { + "epoch": 1.8563248308172826, + "grad_norm": 0.21771973013714693, + "learning_rate": 1.675533893095019e-05, + "loss": 0.1042, + "step": 3566 + }, + { + "epoch": 1.8568453930244664, + "grad_norm": 0.21510845344937907, + "learning_rate": 1.6742076199616448e-05, + "loss": 0.1035, + "step": 3567 + }, + { + "epoch": 1.8573659552316502, + "grad_norm": 0.22503185695899153, + "learning_rate": 1.6728816076519322e-05, + "loss": 0.1045, + "step": 3568 + }, + { + "epoch": 1.857886517438834, + "grad_norm": 0.21278464572186798, + "learning_rate": 1.671555856584699e-05, + "loss": 0.1043, + "step": 3569 + }, + { + "epoch": 1.8584070796460177, + "grad_norm": 0.21442678992556535, + "learning_rate": 1.6702303671786797e-05, + "loss": 0.1019, + "step": 3570 + }, + { + "epoch": 1.8589276418532015, + "grad_norm": 0.21995013323000376, + "learning_rate": 1.6689051398525233e-05, + "loss": 0.0996, + "step": 3571 + }, + { + "epoch": 1.8594482040603852, + "grad_norm": 0.2166463003463736, + "learning_rate": 1.6675801750247998e-05, + "loss": 0.1001, + "step": 3572 + }, + { + "epoch": 1.859968766267569, + "grad_norm": 0.21168851210455933, + "learning_rate": 1.6662554731139944e-05, + "loss": 0.0986, + "step": 3573 + }, + { + "epoch": 1.8604893284747526, + "grad_norm": 0.2187117583742439, + "learning_rate": 1.66493103453851e-05, + "loss": 0.099, + "step": 3574 + }, + { + "epoch": 1.8610098906819363, + "grad_norm": 0.2323426842133383, + "learning_rate": 1.6636068597166655e-05, + "loss": 0.1083, + "step": 3575 + }, + { + "epoch": 1.8615304528891201, + "grad_norm": 0.226528190668166, + "learning_rate": 1.6622829490666974e-05, + "loss": 0.1015, + "step": 3576 + }, + { + "epoch": 1.8620510150963039, + "grad_norm": 0.2099990904532976, + "learning_rate": 1.6609593030067574e-05, + "loss": 0.102, + "step": 3577 + }, + { + "epoch": 1.8625715773034877, + "grad_norm": 0.21653151598199424, + "learning_rate": 1.6596359219549158e-05, + "loss": 0.102, + "step": 3578 + }, + { + "epoch": 1.8630921395106714, + "grad_norm": 0.23703398831513672, + "learning_rate": 1.6583128063291576e-05, + "loss": 0.0998, + "step": 3579 + }, + { + "epoch": 1.8636127017178552, + "grad_norm": 0.2204375202877476, + "learning_rate": 1.6569899565473828e-05, + "loss": 0.1021, + "step": 3580 + }, + { + "epoch": 1.864133263925039, + "grad_norm": 0.21805115310094073, + "learning_rate": 1.6556673730274107e-05, + "loss": 0.1049, + "step": 3581 + }, + { + "epoch": 1.8646538261322227, + "grad_norm": 0.22105359053989074, + "learning_rate": 1.6543450561869732e-05, + "loss": 0.1069, + "step": 3582 + }, + { + "epoch": 1.8651743883394065, + "grad_norm": 0.226961101870007, + "learning_rate": 1.6530230064437213e-05, + "loss": 0.1039, + "step": 3583 + }, + { + "epoch": 1.8656949505465903, + "grad_norm": 0.21715026468515009, + "learning_rate": 1.651701224215218e-05, + "loss": 0.1058, + "step": 3584 + }, + { + "epoch": 1.866215512753774, + "grad_norm": 0.2185791885590469, + "learning_rate": 1.6503797099189453e-05, + "loss": 0.1033, + "step": 3585 + }, + { + "epoch": 1.8667360749609578, + "grad_norm": 0.21828087490161857, + "learning_rate": 1.6490584639722976e-05, + "loss": 0.1044, + "step": 3586 + }, + { + "epoch": 1.8672566371681416, + "grad_norm": 0.2258399116146915, + "learning_rate": 1.6477374867925867e-05, + "loss": 0.1046, + "step": 3587 + }, + { + "epoch": 1.8677771993753254, + "grad_norm": 0.21880046178089685, + "learning_rate": 1.646416778797039e-05, + "loss": 0.1029, + "step": 3588 + }, + { + "epoch": 1.8682977615825092, + "grad_norm": 0.22303778060525425, + "learning_rate": 1.645096340402794e-05, + "loss": 0.1014, + "step": 3589 + }, + { + "epoch": 1.868818323789693, + "grad_norm": 0.22130295830879798, + "learning_rate": 1.6437761720269087e-05, + "loss": 0.1021, + "step": 3590 + }, + { + "epoch": 1.8693388859968767, + "grad_norm": 0.2164153839615916, + "learning_rate": 1.642456274086353e-05, + "loss": 0.1033, + "step": 3591 + }, + { + "epoch": 1.8698594482040605, + "grad_norm": 0.22284825275166165, + "learning_rate": 1.6411366469980134e-05, + "loss": 0.1024, + "step": 3592 + }, + { + "epoch": 1.8703800104112442, + "grad_norm": 0.2110508578950555, + "learning_rate": 1.6398172911786883e-05, + "loss": 0.1024, + "step": 3593 + }, + { + "epoch": 1.870900572618428, + "grad_norm": 0.20736136520524462, + "learning_rate": 1.6384982070450922e-05, + "loss": 0.0949, + "step": 3594 + }, + { + "epoch": 1.8714211348256118, + "grad_norm": 0.21550440293843662, + "learning_rate": 1.637179395013853e-05, + "loss": 0.1, + "step": 3595 + }, + { + "epoch": 1.8719416970327956, + "grad_norm": 0.22901002637779894, + "learning_rate": 1.6358608555015135e-05, + "loss": 0.1085, + "step": 3596 + }, + { + "epoch": 1.8724622592399793, + "grad_norm": 0.21708597501510324, + "learning_rate": 1.6345425889245298e-05, + "loss": 0.0956, + "step": 3597 + }, + { + "epoch": 1.8729828214471629, + "grad_norm": 0.21299348538165125, + "learning_rate": 1.6332245956992703e-05, + "loss": 0.0975, + "step": 3598 + }, + { + "epoch": 1.8735033836543467, + "grad_norm": 0.21235644848347995, + "learning_rate": 1.6319068762420204e-05, + "loss": 0.1009, + "step": 3599 + }, + { + "epoch": 1.8740239458615304, + "grad_norm": 0.21460664666278179, + "learning_rate": 1.6305894309689763e-05, + "loss": 0.1024, + "step": 3600 + }, + { + "epoch": 1.8745445080687142, + "grad_norm": 0.22828274571427395, + "learning_rate": 1.629272260296249e-05, + "loss": 0.1071, + "step": 3601 + }, + { + "epoch": 1.875065070275898, + "grad_norm": 0.22326950946792548, + "learning_rate": 1.6279553646398615e-05, + "loss": 0.1061, + "step": 3602 + }, + { + "epoch": 1.8755856324830817, + "grad_norm": 0.21887665058394934, + "learning_rate": 1.626638744415752e-05, + "loss": 0.1052, + "step": 3603 + }, + { + "epoch": 1.8761061946902655, + "grad_norm": 0.22556670911635793, + "learning_rate": 1.625322400039769e-05, + "loss": 0.1034, + "step": 3604 + }, + { + "epoch": 1.8766267568974493, + "grad_norm": 0.22477039664046272, + "learning_rate": 1.6240063319276767e-05, + "loss": 0.1066, + "step": 3605 + }, + { + "epoch": 1.8771473191046328, + "grad_norm": 0.208448037377274, + "learning_rate": 1.6226905404951503e-05, + "loss": 0.0964, + "step": 3606 + }, + { + "epoch": 1.8776678813118166, + "grad_norm": 0.20964154673856936, + "learning_rate": 1.621375026157777e-05, + "loss": 0.0974, + "step": 3607 + }, + { + "epoch": 1.8781884435190004, + "grad_norm": 0.21982252986036593, + "learning_rate": 1.6200597893310586e-05, + "loss": 0.1041, + "step": 3608 + }, + { + "epoch": 1.8787090057261842, + "grad_norm": 0.21630070655518385, + "learning_rate": 1.618744830430407e-05, + "loss": 0.1028, + "step": 3609 + }, + { + "epoch": 1.879229567933368, + "grad_norm": 0.22686308982838688, + "learning_rate": 1.6174301498711486e-05, + "loss": 0.1037, + "step": 3610 + }, + { + "epoch": 1.8797501301405517, + "grad_norm": 0.22043831684254314, + "learning_rate": 1.6161157480685197e-05, + "loss": 0.103, + "step": 3611 + }, + { + "epoch": 1.8802706923477355, + "grad_norm": 0.2178337556402931, + "learning_rate": 1.6148016254376702e-05, + "loss": 0.0978, + "step": 3612 + }, + { + "epoch": 1.8807912545549192, + "grad_norm": 0.22627246524746905, + "learning_rate": 1.613487782393661e-05, + "loss": 0.1046, + "step": 3613 + }, + { + "epoch": 1.881311816762103, + "grad_norm": 0.22358601640561931, + "learning_rate": 1.6121742193514648e-05, + "loss": 0.0986, + "step": 3614 + }, + { + "epoch": 1.8818323789692868, + "grad_norm": 0.22423924786442362, + "learning_rate": 1.610860936725967e-05, + "loss": 0.1001, + "step": 3615 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 0.2187539346723351, + "learning_rate": 1.6095479349319607e-05, + "loss": 0.106, + "step": 3616 + }, + { + "epoch": 1.8828735033836543, + "grad_norm": 0.22405774990004992, + "learning_rate": 1.6082352143841555e-05, + "loss": 0.0999, + "step": 3617 + }, + { + "epoch": 1.883394065590838, + "grad_norm": 0.2268274693952615, + "learning_rate": 1.6069227754971683e-05, + "loss": 0.1007, + "step": 3618 + }, + { + "epoch": 1.8839146277980219, + "grad_norm": 0.2387625325422801, + "learning_rate": 1.6056106186855292e-05, + "loss": 0.1072, + "step": 3619 + }, + { + "epoch": 1.8844351900052057, + "grad_norm": 0.21912658060501727, + "learning_rate": 1.6042987443636775e-05, + "loss": 0.1043, + "step": 3620 + }, + { + "epoch": 1.8849557522123894, + "grad_norm": 0.22608005928160363, + "learning_rate": 1.6029871529459656e-05, + "loss": 0.1034, + "step": 3621 + }, + { + "epoch": 1.8854763144195732, + "grad_norm": 0.21470577776047628, + "learning_rate": 1.601675844846653e-05, + "loss": 0.0991, + "step": 3622 + }, + { + "epoch": 1.885996876626757, + "grad_norm": 0.21496261618823018, + "learning_rate": 1.600364820479914e-05, + "loss": 0.0995, + "step": 3623 + }, + { + "epoch": 1.8865174388339407, + "grad_norm": 0.20783054254928857, + "learning_rate": 1.5990540802598302e-05, + "loss": 0.0999, + "step": 3624 + }, + { + "epoch": 1.8870380010411245, + "grad_norm": 0.20378345368619807, + "learning_rate": 1.5977436246003937e-05, + "loss": 0.0936, + "step": 3625 + }, + { + "epoch": 1.8875585632483083, + "grad_norm": 0.2186409812177155, + "learning_rate": 1.5964334539155084e-05, + "loss": 0.1048, + "step": 3626 + }, + { + "epoch": 1.888079125455492, + "grad_norm": 0.22599765981965825, + "learning_rate": 1.5951235686189857e-05, + "loss": 0.1043, + "step": 3627 + }, + { + "epoch": 1.8885996876626758, + "grad_norm": 0.22083477562213608, + "learning_rate": 1.5938139691245505e-05, + "loss": 0.1018, + "step": 3628 + }, + { + "epoch": 1.8891202498698596, + "grad_norm": 0.2170688955855864, + "learning_rate": 1.5925046558458333e-05, + "loss": 0.1021, + "step": 3629 + }, + { + "epoch": 1.8896408120770432, + "grad_norm": 0.2148159717859146, + "learning_rate": 1.5911956291963775e-05, + "loss": 0.0983, + "step": 3630 + }, + { + "epoch": 1.890161374284227, + "grad_norm": 0.20873764640196554, + "learning_rate": 1.5898868895896334e-05, + "loss": 0.0974, + "step": 3631 + }, + { + "epoch": 1.8906819364914107, + "grad_norm": 0.3148651473791158, + "learning_rate": 1.5885784374389632e-05, + "loss": 0.1065, + "step": 3632 + }, + { + "epoch": 1.8912024986985945, + "grad_norm": 0.22482430722857094, + "learning_rate": 1.5872702731576373e-05, + "loss": 0.1042, + "step": 3633 + }, + { + "epoch": 1.8917230609057782, + "grad_norm": 0.22637467340246845, + "learning_rate": 1.585962397158833e-05, + "loss": 0.1087, + "step": 3634 + }, + { + "epoch": 1.892243623112962, + "grad_norm": 0.22354140941775033, + "learning_rate": 1.584654809855639e-05, + "loss": 0.0974, + "step": 3635 + }, + { + "epoch": 1.8927641853201458, + "grad_norm": 0.21626315203073032, + "learning_rate": 1.5833475116610523e-05, + "loss": 0.1005, + "step": 3636 + }, + { + "epoch": 1.8932847475273296, + "grad_norm": 0.2113336278345908, + "learning_rate": 1.582040502987979e-05, + "loss": 0.1041, + "step": 3637 + }, + { + "epoch": 1.893805309734513, + "grad_norm": 0.22087048570487774, + "learning_rate": 1.5807337842492337e-05, + "loss": 0.1036, + "step": 3638 + }, + { + "epoch": 1.8943258719416969, + "grad_norm": 0.21572578777284565, + "learning_rate": 1.5794273558575374e-05, + "loss": 0.0984, + "step": 3639 + }, + { + "epoch": 1.8948464341488807, + "grad_norm": 0.22769892060924338, + "learning_rate": 1.5781212182255226e-05, + "loss": 0.1122, + "step": 3640 + }, + { + "epoch": 1.8953669963560644, + "grad_norm": 0.213430358027192, + "learning_rate": 1.5768153717657268e-05, + "loss": 0.0988, + "step": 3641 + }, + { + "epoch": 1.8958875585632482, + "grad_norm": 0.2622995076619564, + "learning_rate": 1.5755098168905992e-05, + "loss": 0.098, + "step": 3642 + }, + { + "epoch": 1.896408120770432, + "grad_norm": 0.2307094720665387, + "learning_rate": 1.574204554012493e-05, + "loss": 0.0983, + "step": 3643 + }, + { + "epoch": 1.8969286829776157, + "grad_norm": 0.22053805471841584, + "learning_rate": 1.572899583543671e-05, + "loss": 0.1017, + "step": 3644 + }, + { + "epoch": 1.8974492451847995, + "grad_norm": 0.21911020706637074, + "learning_rate": 1.571594905896304e-05, + "loss": 0.1048, + "step": 3645 + }, + { + "epoch": 1.8979698073919833, + "grad_norm": 0.21973155556064952, + "learning_rate": 1.5702905214824705e-05, + "loss": 0.1047, + "step": 3646 + }, + { + "epoch": 1.898490369599167, + "grad_norm": 0.2446184787139059, + "learning_rate": 1.5689864307141548e-05, + "loss": 0.1003, + "step": 3647 + }, + { + "epoch": 1.8990109318063508, + "grad_norm": 0.21721103382874216, + "learning_rate": 1.56768263400325e-05, + "loss": 0.0934, + "step": 3648 + }, + { + "epoch": 1.8995314940135346, + "grad_norm": 0.22688257718189353, + "learning_rate": 1.566379131761556e-05, + "loss": 0.104, + "step": 3649 + }, + { + "epoch": 1.9000520562207184, + "grad_norm": 0.21482237576043078, + "learning_rate": 1.5650759244007783e-05, + "loss": 0.0964, + "step": 3650 + }, + { + "epoch": 1.9005726184279021, + "grad_norm": 0.22722159279382412, + "learning_rate": 1.5637730123325327e-05, + "loss": 0.104, + "step": 3651 + }, + { + "epoch": 1.901093180635086, + "grad_norm": 0.22648407542435803, + "learning_rate": 1.562470395968338e-05, + "loss": 0.1043, + "step": 3652 + }, + { + "epoch": 1.9016137428422697, + "grad_norm": 0.21123512550603266, + "learning_rate": 1.56116807571962e-05, + "loss": 0.0991, + "step": 3653 + }, + { + "epoch": 1.9021343050494535, + "grad_norm": 0.22587891194139853, + "learning_rate": 1.559866051997714e-05, + "loss": 0.1019, + "step": 3654 + }, + { + "epoch": 1.9026548672566372, + "grad_norm": 0.22485896129735808, + "learning_rate": 1.5585643252138577e-05, + "loss": 0.0991, + "step": 3655 + }, + { + "epoch": 1.903175429463821, + "grad_norm": 0.21654611342402638, + "learning_rate": 1.557262895779199e-05, + "loss": 0.1018, + "step": 3656 + }, + { + "epoch": 1.9036959916710048, + "grad_norm": 0.22992554106238536, + "learning_rate": 1.5559617641047886e-05, + "loss": 0.1052, + "step": 3657 + }, + { + "epoch": 1.9042165538781886, + "grad_norm": 0.2261900928255639, + "learning_rate": 1.5546609306015856e-05, + "loss": 0.1038, + "step": 3658 + }, + { + "epoch": 1.9047371160853723, + "grad_norm": 0.22144384444707318, + "learning_rate": 1.5533603956804522e-05, + "loss": 0.1015, + "step": 3659 + }, + { + "epoch": 1.905257678292556, + "grad_norm": 0.21449867047073035, + "learning_rate": 1.5520601597521596e-05, + "loss": 0.0986, + "step": 3660 + }, + { + "epoch": 1.9057782404997399, + "grad_norm": 0.22083165166943855, + "learning_rate": 1.5507602232273814e-05, + "loss": 0.0978, + "step": 3661 + }, + { + "epoch": 1.9062988027069234, + "grad_norm": 0.21908132571042455, + "learning_rate": 1.5494605865166983e-05, + "loss": 0.1009, + "step": 3662 + }, + { + "epoch": 1.9068193649141072, + "grad_norm": 0.2256794889232402, + "learning_rate": 1.5481612500305964e-05, + "loss": 0.1064, + "step": 3663 + }, + { + "epoch": 1.907339927121291, + "grad_norm": 0.20932873751015288, + "learning_rate": 1.5468622141794664e-05, + "loss": 0.1019, + "step": 3664 + }, + { + "epoch": 1.9078604893284747, + "grad_norm": 0.22557308680633772, + "learning_rate": 1.5455634793736046e-05, + "loss": 0.1051, + "step": 3665 + }, + { + "epoch": 1.9083810515356585, + "grad_norm": 0.21717795714503, + "learning_rate": 1.544265046023211e-05, + "loss": 0.1043, + "step": 3666 + }, + { + "epoch": 1.9089016137428423, + "grad_norm": 0.21382166007000752, + "learning_rate": 1.542966914538393e-05, + "loss": 0.1003, + "step": 3667 + }, + { + "epoch": 1.909422175950026, + "grad_norm": 0.21047577575173457, + "learning_rate": 1.541669085329159e-05, + "loss": 0.1021, + "step": 3668 + }, + { + "epoch": 1.9099427381572098, + "grad_norm": 0.21885604669207556, + "learning_rate": 1.540371558805425e-05, + "loss": 0.1045, + "step": 3669 + }, + { + "epoch": 1.9104633003643934, + "grad_norm": 0.2263376773671937, + "learning_rate": 1.539074335377011e-05, + "loss": 0.1079, + "step": 3670 + }, + { + "epoch": 1.9109838625715772, + "grad_norm": 0.21167752773460963, + "learning_rate": 1.537777415453638e-05, + "loss": 0.0996, + "step": 3671 + }, + { + "epoch": 1.911504424778761, + "grad_norm": 0.21780236850803966, + "learning_rate": 1.536480799444936e-05, + "loss": 0.104, + "step": 3672 + }, + { + "epoch": 1.9120249869859447, + "grad_norm": 0.2202957087255931, + "learning_rate": 1.5351844877604353e-05, + "loss": 0.101, + "step": 3673 + }, + { + "epoch": 1.9125455491931285, + "grad_norm": 0.2215233885356788, + "learning_rate": 1.5338884808095726e-05, + "loss": 0.1031, + "step": 3674 + }, + { + "epoch": 1.9130661114003122, + "grad_norm": 0.221036149093325, + "learning_rate": 1.5325927790016858e-05, + "loss": 0.1009, + "step": 3675 + }, + { + "epoch": 1.913586673607496, + "grad_norm": 0.21072634910142585, + "learning_rate": 1.5312973827460194e-05, + "loss": 0.1022, + "step": 3676 + }, + { + "epoch": 1.9141072358146798, + "grad_norm": 0.21579345962748916, + "learning_rate": 1.5300022924517186e-05, + "loss": 0.1014, + "step": 3677 + }, + { + "epoch": 1.9146277980218636, + "grad_norm": 0.22972340517503928, + "learning_rate": 1.528707508527834e-05, + "loss": 0.1066, + "step": 3678 + }, + { + "epoch": 1.9151483602290473, + "grad_norm": 0.22399399495289485, + "learning_rate": 1.527413031383319e-05, + "loss": 0.1062, + "step": 3679 + }, + { + "epoch": 1.915668922436231, + "grad_norm": 0.22928343502766305, + "learning_rate": 1.5261188614270278e-05, + "loss": 0.1044, + "step": 3680 + }, + { + "epoch": 1.9161894846434149, + "grad_norm": 0.20874236255449022, + "learning_rate": 1.5248249990677212e-05, + "loss": 0.1011, + "step": 3681 + }, + { + "epoch": 1.9167100468505986, + "grad_norm": 0.2238709567201973, + "learning_rate": 1.5235314447140603e-05, + "loss": 0.1049, + "step": 3682 + }, + { + "epoch": 1.9172306090577824, + "grad_norm": 0.22429081990026067, + "learning_rate": 1.5222381987746104e-05, + "loss": 0.1058, + "step": 3683 + }, + { + "epoch": 1.9177511712649662, + "grad_norm": 0.21422872549018027, + "learning_rate": 1.5209452616578379e-05, + "loss": 0.0997, + "step": 3684 + }, + { + "epoch": 1.91827173347215, + "grad_norm": 0.2133382379880559, + "learning_rate": 1.5196526337721137e-05, + "loss": 0.1031, + "step": 3685 + }, + { + "epoch": 1.9187922956793337, + "grad_norm": 0.2191123247317634, + "learning_rate": 1.5183603155257087e-05, + "loss": 0.1003, + "step": 3686 + }, + { + "epoch": 1.9193128578865175, + "grad_norm": 0.2203372017928507, + "learning_rate": 1.517068307326798e-05, + "loss": 0.1008, + "step": 3687 + }, + { + "epoch": 1.9198334200937013, + "grad_norm": 0.22009662025394322, + "learning_rate": 1.5157766095834581e-05, + "loss": 0.1015, + "step": 3688 + }, + { + "epoch": 1.920353982300885, + "grad_norm": 0.21359112700683444, + "learning_rate": 1.5144852227036658e-05, + "loss": 0.1021, + "step": 3689 + }, + { + "epoch": 1.9208745445080688, + "grad_norm": 0.2333651788145784, + "learning_rate": 1.5131941470953026e-05, + "loss": 0.1082, + "step": 3690 + }, + { + "epoch": 1.9213951067152526, + "grad_norm": 0.2199424676535258, + "learning_rate": 1.5119033831661489e-05, + "loss": 0.0969, + "step": 3691 + }, + { + "epoch": 1.9219156689224364, + "grad_norm": 0.21869088466324416, + "learning_rate": 1.5106129313238898e-05, + "loss": 0.1047, + "step": 3692 + }, + { + "epoch": 1.9224362311296201, + "grad_norm": 0.2257137266532471, + "learning_rate": 1.5093227919761082e-05, + "loss": 0.1059, + "step": 3693 + }, + { + "epoch": 1.9229567933368037, + "grad_norm": 0.2042823698266939, + "learning_rate": 1.5080329655302916e-05, + "loss": 0.0964, + "step": 3694 + }, + { + "epoch": 1.9234773555439875, + "grad_norm": 0.21289243041563377, + "learning_rate": 1.5067434523938263e-05, + "loss": 0.0969, + "step": 3695 + }, + { + "epoch": 1.9239979177511712, + "grad_norm": 0.2157354773694145, + "learning_rate": 1.5054542529740009e-05, + "loss": 0.1054, + "step": 3696 + }, + { + "epoch": 1.924518479958355, + "grad_norm": 0.20594700902414997, + "learning_rate": 1.504165367678006e-05, + "loss": 0.0991, + "step": 3697 + }, + { + "epoch": 1.9250390421655388, + "grad_norm": 0.2251525996090509, + "learning_rate": 1.5028767969129288e-05, + "loss": 0.0977, + "step": 3698 + }, + { + "epoch": 1.9255596043727226, + "grad_norm": 0.21960241687587326, + "learning_rate": 1.5015885410857616e-05, + "loss": 0.0994, + "step": 3699 + }, + { + "epoch": 1.9260801665799063, + "grad_norm": 0.20915888582744993, + "learning_rate": 1.5003006006033948e-05, + "loss": 0.0996, + "step": 3700 + }, + { + "epoch": 1.92660072878709, + "grad_norm": 0.2225064521826276, + "learning_rate": 1.4990129758726203e-05, + "loss": 0.1028, + "step": 3701 + }, + { + "epoch": 1.9271212909942737, + "grad_norm": 0.22458058269108447, + "learning_rate": 1.4977256673001305e-05, + "loss": 0.1027, + "step": 3702 + }, + { + "epoch": 1.9276418532014574, + "grad_norm": 0.21114252037998082, + "learning_rate": 1.4964386752925163e-05, + "loss": 0.0958, + "step": 3703 + }, + { + "epoch": 1.9281624154086412, + "grad_norm": 0.21678013606532165, + "learning_rate": 1.4951520002562705e-05, + "loss": 0.0992, + "step": 3704 + }, + { + "epoch": 1.928682977615825, + "grad_norm": 0.20924637264966345, + "learning_rate": 1.4938656425977842e-05, + "loss": 0.0971, + "step": 3705 + }, + { + "epoch": 1.9292035398230087, + "grad_norm": 0.21121389074080546, + "learning_rate": 1.4925796027233505e-05, + "loss": 0.1028, + "step": 3706 + }, + { + "epoch": 1.9297241020301925, + "grad_norm": 0.2188455100447437, + "learning_rate": 1.4912938810391591e-05, + "loss": 0.1019, + "step": 3707 + }, + { + "epoch": 1.9302446642373763, + "grad_norm": 0.22810063283033222, + "learning_rate": 1.4900084779513004e-05, + "loss": 0.1034, + "step": 3708 + }, + { + "epoch": 1.93076522644456, + "grad_norm": 0.2144538872962715, + "learning_rate": 1.488723393865766e-05, + "loss": 0.1016, + "step": 3709 + }, + { + "epoch": 1.9312857886517438, + "grad_norm": 0.20918446229699433, + "learning_rate": 1.487438629188444e-05, + "loss": 0.0993, + "step": 3710 + }, + { + "epoch": 1.9318063508589276, + "grad_norm": 0.23023524817864033, + "learning_rate": 1.4861541843251242e-05, + "loss": 0.1019, + "step": 3711 + }, + { + "epoch": 1.9323269130661114, + "grad_norm": 0.22076188050168008, + "learning_rate": 1.4848700596814926e-05, + "loss": 0.101, + "step": 3712 + }, + { + "epoch": 1.9328474752732951, + "grad_norm": 0.21201820939269886, + "learning_rate": 1.483586255663137e-05, + "loss": 0.1034, + "step": 3713 + }, + { + "epoch": 1.933368037480479, + "grad_norm": 0.2154132891161572, + "learning_rate": 1.482302772675541e-05, + "loss": 0.0982, + "step": 3714 + }, + { + "epoch": 1.9338885996876627, + "grad_norm": 0.22979358381751835, + "learning_rate": 1.4810196111240898e-05, + "loss": 0.105, + "step": 3715 + }, + { + "epoch": 1.9344091618948465, + "grad_norm": 0.2170365978314031, + "learning_rate": 1.4797367714140642e-05, + "loss": 0.1036, + "step": 3716 + }, + { + "epoch": 1.9349297241020302, + "grad_norm": 0.21104199550912994, + "learning_rate": 1.4784542539506447e-05, + "loss": 0.0989, + "step": 3717 + }, + { + "epoch": 1.935450286309214, + "grad_norm": 0.21704690446239305, + "learning_rate": 1.477172059138911e-05, + "loss": 0.0988, + "step": 3718 + }, + { + "epoch": 1.9359708485163978, + "grad_norm": 0.2216423717123099, + "learning_rate": 1.4758901873838387e-05, + "loss": 0.1008, + "step": 3719 + }, + { + "epoch": 1.9364914107235816, + "grad_norm": 0.2224915756445421, + "learning_rate": 1.4746086390903041e-05, + "loss": 0.1028, + "step": 3720 + }, + { + "epoch": 1.9370119729307653, + "grad_norm": 0.22165523853861174, + "learning_rate": 1.4733274146630782e-05, + "loss": 0.1027, + "step": 3721 + }, + { + "epoch": 1.937532535137949, + "grad_norm": 0.2104242110992599, + "learning_rate": 1.472046514506832e-05, + "loss": 0.0999, + "step": 3722 + }, + { + "epoch": 1.9380530973451329, + "grad_norm": 0.2183583161427088, + "learning_rate": 1.4707659390261336e-05, + "loss": 0.1008, + "step": 3723 + }, + { + "epoch": 1.9385736595523166, + "grad_norm": 0.2205753671797551, + "learning_rate": 1.4694856886254484e-05, + "loss": 0.0993, + "step": 3724 + }, + { + "epoch": 1.9390942217595004, + "grad_norm": 0.21265574296368048, + "learning_rate": 1.4682057637091386e-05, + "loss": 0.099, + "step": 3725 + }, + { + "epoch": 1.939614783966684, + "grad_norm": 0.21653176063173926, + "learning_rate": 1.4669261646814637e-05, + "loss": 0.0994, + "step": 3726 + }, + { + "epoch": 1.9401353461738677, + "grad_norm": 0.23412308864317125, + "learning_rate": 1.4656468919465816e-05, + "loss": 0.1081, + "step": 3727 + }, + { + "epoch": 1.9406559083810515, + "grad_norm": 0.22672128076546094, + "learning_rate": 1.4643679459085451e-05, + "loss": 0.0958, + "step": 3728 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 0.21722673840560922, + "learning_rate": 1.4630893269713058e-05, + "loss": 0.101, + "step": 3729 + }, + { + "epoch": 1.941697032795419, + "grad_norm": 0.21147974417849058, + "learning_rate": 1.4618110355387105e-05, + "loss": 0.1029, + "step": 3730 + }, + { + "epoch": 1.9422175950026028, + "grad_norm": 0.21231022676958888, + "learning_rate": 1.4605330720145036e-05, + "loss": 0.1004, + "step": 3731 + }, + { + "epoch": 1.9427381572097866, + "grad_norm": 0.2202858395550634, + "learning_rate": 1.4592554368023248e-05, + "loss": 0.1061, + "step": 3732 + }, + { + "epoch": 1.9432587194169704, + "grad_norm": 0.2079520954395182, + "learning_rate": 1.4579781303057122e-05, + "loss": 0.0993, + "step": 3733 + }, + { + "epoch": 1.943779281624154, + "grad_norm": 0.2232459002547435, + "learning_rate": 1.4567011529280963e-05, + "loss": 0.1016, + "step": 3734 + }, + { + "epoch": 1.9442998438313377, + "grad_norm": 0.2131387702891851, + "learning_rate": 1.4554245050728085e-05, + "loss": 0.0976, + "step": 3735 + }, + { + "epoch": 1.9448204060385215, + "grad_norm": 0.21742449638032005, + "learning_rate": 1.4541481871430712e-05, + "loss": 0.0997, + "step": 3736 + }, + { + "epoch": 1.9453409682457052, + "grad_norm": 0.21341755263048834, + "learning_rate": 1.4528721995420065e-05, + "loss": 0.1032, + "step": 3737 + }, + { + "epoch": 1.945861530452889, + "grad_norm": 0.22009074356689978, + "learning_rate": 1.4515965426726297e-05, + "loss": 0.1059, + "step": 3738 + }, + { + "epoch": 1.9463820926600728, + "grad_norm": 0.21912610193463983, + "learning_rate": 1.4503212169378549e-05, + "loss": 0.1001, + "step": 3739 + }, + { + "epoch": 1.9469026548672566, + "grad_norm": 0.2170439457377092, + "learning_rate": 1.449046222740486e-05, + "loss": 0.1057, + "step": 3740 + }, + { + "epoch": 1.9474232170744403, + "grad_norm": 0.21785895921005488, + "learning_rate": 1.4477715604832277e-05, + "loss": 0.1031, + "step": 3741 + }, + { + "epoch": 1.947943779281624, + "grad_norm": 0.21495265954061052, + "learning_rate": 1.4464972305686777e-05, + "loss": 0.0986, + "step": 3742 + }, + { + "epoch": 1.9484643414888079, + "grad_norm": 0.20444844893689895, + "learning_rate": 1.4452232333993271e-05, + "loss": 0.0953, + "step": 3743 + }, + { + "epoch": 1.9489849036959916, + "grad_norm": 0.22666704840339114, + "learning_rate": 1.4439495693775657e-05, + "loss": 0.1055, + "step": 3744 + }, + { + "epoch": 1.9495054659031754, + "grad_norm": 0.21327639711763433, + "learning_rate": 1.4426762389056735e-05, + "loss": 0.1034, + "step": 3745 + }, + { + "epoch": 1.9500260281103592, + "grad_norm": 0.2184419717310296, + "learning_rate": 1.4414032423858287e-05, + "loss": 0.1047, + "step": 3746 + }, + { + "epoch": 1.950546590317543, + "grad_norm": 0.21193552764734155, + "learning_rate": 1.4401305802201029e-05, + "loss": 0.1026, + "step": 3747 + }, + { + "epoch": 1.9510671525247267, + "grad_norm": 0.2209102393312676, + "learning_rate": 1.4388582528104628e-05, + "loss": 0.1011, + "step": 3748 + }, + { + "epoch": 1.9515877147319105, + "grad_norm": 0.21657115471489805, + "learning_rate": 1.4375862605587668e-05, + "loss": 0.0986, + "step": 3749 + }, + { + "epoch": 1.9521082769390943, + "grad_norm": 0.21434326778156354, + "learning_rate": 1.4363146038667702e-05, + "loss": 0.1012, + "step": 3750 + }, + { + "epoch": 1.952628839146278, + "grad_norm": 0.21320540069825253, + "learning_rate": 1.4350432831361221e-05, + "loss": 0.0988, + "step": 3751 + }, + { + "epoch": 1.9531494013534618, + "grad_norm": 0.22627494817868693, + "learning_rate": 1.4337722987683632e-05, + "loss": 0.101, + "step": 3752 + }, + { + "epoch": 1.9536699635606456, + "grad_norm": 0.2194395485328437, + "learning_rate": 1.4325016511649315e-05, + "loss": 0.0982, + "step": 3753 + }, + { + "epoch": 1.9541905257678294, + "grad_norm": 0.21191340497198383, + "learning_rate": 1.4312313407271543e-05, + "loss": 0.099, + "step": 3754 + }, + { + "epoch": 1.9547110879750131, + "grad_norm": 0.21476576576191445, + "learning_rate": 1.429961367856256e-05, + "loss": 0.0995, + "step": 3755 + }, + { + "epoch": 1.955231650182197, + "grad_norm": 0.21770044547304337, + "learning_rate": 1.4286917329533528e-05, + "loss": 0.1015, + "step": 3756 + }, + { + "epoch": 1.9557522123893807, + "grad_norm": 0.21158808266880086, + "learning_rate": 1.427422436419456e-05, + "loss": 0.098, + "step": 3757 + }, + { + "epoch": 1.9562727745965642, + "grad_norm": 0.21361489248889914, + "learning_rate": 1.4261534786554661e-05, + "loss": 0.1001, + "step": 3758 + }, + { + "epoch": 1.956793336803748, + "grad_norm": 0.21171491539393147, + "learning_rate": 1.4248848600621801e-05, + "loss": 0.1007, + "step": 3759 + }, + { + "epoch": 1.9573138990109318, + "grad_norm": 0.21237977936588603, + "learning_rate": 1.4236165810402879e-05, + "loss": 0.0974, + "step": 3760 + }, + { + "epoch": 1.9578344612181156, + "grad_norm": 0.22772433653264357, + "learning_rate": 1.4223486419903692e-05, + "loss": 0.105, + "step": 3761 + }, + { + "epoch": 1.9583550234252993, + "grad_norm": 0.21822253874910252, + "learning_rate": 1.4210810433128997e-05, + "loss": 0.1027, + "step": 3762 + }, + { + "epoch": 1.958875585632483, + "grad_norm": 0.21968663113465622, + "learning_rate": 1.4198137854082443e-05, + "loss": 0.0979, + "step": 3763 + }, + { + "epoch": 1.9593961478396669, + "grad_norm": 0.21509071607759678, + "learning_rate": 1.4185468686766628e-05, + "loss": 0.1002, + "step": 3764 + }, + { + "epoch": 1.9599167100468506, + "grad_norm": 0.21833030367787082, + "learning_rate": 1.4172802935183071e-05, + "loss": 0.104, + "step": 3765 + }, + { + "epoch": 1.9604372722540342, + "grad_norm": 0.21150315924836155, + "learning_rate": 1.41601406033322e-05, + "loss": 0.0976, + "step": 3766 + }, + { + "epoch": 1.960957834461218, + "grad_norm": 0.22119462315092003, + "learning_rate": 1.4147481695213377e-05, + "loss": 0.101, + "step": 3767 + }, + { + "epoch": 1.9614783966684017, + "grad_norm": 0.22377655608418107, + "learning_rate": 1.4134826214824859e-05, + "loss": 0.1087, + "step": 3768 + }, + { + "epoch": 1.9619989588755855, + "grad_norm": 0.214726683314058, + "learning_rate": 1.4122174166163853e-05, + "loss": 0.1011, + "step": 3769 + }, + { + "epoch": 1.9625195210827693, + "grad_norm": 0.22428658200555723, + "learning_rate": 1.410952555322645e-05, + "loss": 0.1002, + "step": 3770 + }, + { + "epoch": 1.963040083289953, + "grad_norm": 0.22440945050923508, + "learning_rate": 1.4096880380007673e-05, + "loss": 0.1079, + "step": 3771 + }, + { + "epoch": 1.9635606454971368, + "grad_norm": 0.22006424334394925, + "learning_rate": 1.4084238650501471e-05, + "loss": 0.103, + "step": 3772 + }, + { + "epoch": 1.9640812077043206, + "grad_norm": 0.2078275175323549, + "learning_rate": 1.4071600368700668e-05, + "loss": 0.0965, + "step": 3773 + }, + { + "epoch": 1.9646017699115044, + "grad_norm": 0.20908961325681313, + "learning_rate": 1.4058965538597033e-05, + "loss": 0.0958, + "step": 3774 + }, + { + "epoch": 1.9651223321186881, + "grad_norm": 0.23113004080134392, + "learning_rate": 1.4046334164181232e-05, + "loss": 0.1046, + "step": 3775 + }, + { + "epoch": 1.965642894325872, + "grad_norm": 0.21341710865458288, + "learning_rate": 1.4033706249442852e-05, + "loss": 0.0992, + "step": 3776 + }, + { + "epoch": 1.9661634565330557, + "grad_norm": 0.2255057605756392, + "learning_rate": 1.4021081798370356e-05, + "loss": 0.1076, + "step": 3777 + }, + { + "epoch": 1.9666840187402395, + "grad_norm": 0.22209298875300015, + "learning_rate": 1.4008460814951151e-05, + "loss": 0.1057, + "step": 3778 + }, + { + "epoch": 1.9672045809474232, + "grad_norm": 0.22187334155214908, + "learning_rate": 1.3995843303171517e-05, + "loss": 0.0982, + "step": 3779 + }, + { + "epoch": 1.967725143154607, + "grad_norm": 0.2103303538717618, + "learning_rate": 1.3983229267016653e-05, + "loss": 0.1011, + "step": 3780 + }, + { + "epoch": 1.9682457053617908, + "grad_norm": 0.2040935740244857, + "learning_rate": 1.3970618710470676e-05, + "loss": 0.0998, + "step": 3781 + }, + { + "epoch": 1.9687662675689745, + "grad_norm": 0.22402864584457863, + "learning_rate": 1.395801163751656e-05, + "loss": 0.1026, + "step": 3782 + }, + { + "epoch": 1.9692868297761583, + "grad_norm": 0.20692376854563577, + "learning_rate": 1.3945408052136222e-05, + "loss": 0.0981, + "step": 3783 + }, + { + "epoch": 1.969807391983342, + "grad_norm": 0.22980259328124997, + "learning_rate": 1.3932807958310456e-05, + "loss": 0.1076, + "step": 3784 + }, + { + "epoch": 1.9703279541905259, + "grad_norm": 0.20367490497979784, + "learning_rate": 1.392021136001897e-05, + "loss": 0.0968, + "step": 3785 + }, + { + "epoch": 1.9708485163977096, + "grad_norm": 0.20898246158228093, + "learning_rate": 1.3907618261240334e-05, + "loss": 0.1033, + "step": 3786 + }, + { + "epoch": 1.9713690786048934, + "grad_norm": 0.2113639999243892, + "learning_rate": 1.3895028665952058e-05, + "loss": 0.1014, + "step": 3787 + }, + { + "epoch": 1.9718896408120772, + "grad_norm": 0.2175367277530161, + "learning_rate": 1.38824425781305e-05, + "loss": 0.1008, + "step": 3788 + }, + { + "epoch": 1.972410203019261, + "grad_norm": 0.21182935807043565, + "learning_rate": 1.3869860001750942e-05, + "loss": 0.1013, + "step": 3789 + }, + { + "epoch": 1.9729307652264445, + "grad_norm": 0.22349166273303656, + "learning_rate": 1.3857280940787559e-05, + "loss": 0.0988, + "step": 3790 + }, + { + "epoch": 1.9734513274336283, + "grad_norm": 0.21190197883778825, + "learning_rate": 1.3844705399213379e-05, + "loss": 0.0995, + "step": 3791 + }, + { + "epoch": 1.973971889640812, + "grad_norm": 0.2193754184132712, + "learning_rate": 1.3832133381000359e-05, + "loss": 0.105, + "step": 3792 + }, + { + "epoch": 1.9744924518479958, + "grad_norm": 0.22565894195227415, + "learning_rate": 1.3819564890119325e-05, + "loss": 0.0992, + "step": 3793 + }, + { + "epoch": 1.9750130140551796, + "grad_norm": 0.2223063511163848, + "learning_rate": 1.3806999930539998e-05, + "loss": 0.1082, + "step": 3794 + }, + { + "epoch": 1.9755335762623634, + "grad_norm": 0.2182868394326183, + "learning_rate": 1.3794438506230967e-05, + "loss": 0.0969, + "step": 3795 + }, + { + "epoch": 1.9760541384695471, + "grad_norm": 0.21985719559638187, + "learning_rate": 1.378188062115972e-05, + "loss": 0.1031, + "step": 3796 + }, + { + "epoch": 1.976574700676731, + "grad_norm": 0.21709916965896411, + "learning_rate": 1.3769326279292616e-05, + "loss": 0.1, + "step": 3797 + }, + { + "epoch": 1.9770952628839145, + "grad_norm": 0.21263835779355764, + "learning_rate": 1.3756775484594896e-05, + "loss": 0.1004, + "step": 3798 + }, + { + "epoch": 1.9776158250910982, + "grad_norm": 0.21782936591583624, + "learning_rate": 1.374422824103071e-05, + "loss": 0.102, + "step": 3799 + }, + { + "epoch": 1.978136387298282, + "grad_norm": 0.21792766988581672, + "learning_rate": 1.373168455256303e-05, + "loss": 0.1007, + "step": 3800 + }, + { + "epoch": 1.9786569495054658, + "grad_norm": 0.20963925747095527, + "learning_rate": 1.3719144423153751e-05, + "loss": 0.0944, + "step": 3801 + }, + { + "epoch": 1.9791775117126496, + "grad_norm": 0.2122594378833588, + "learning_rate": 1.370660785676363e-05, + "loss": 0.0998, + "step": 3802 + }, + { + "epoch": 1.9796980739198333, + "grad_norm": 0.2213288489164458, + "learning_rate": 1.3694074857352308e-05, + "loss": 0.1002, + "step": 3803 + }, + { + "epoch": 1.980218636127017, + "grad_norm": 0.21750190142433626, + "learning_rate": 1.3681545428878268e-05, + "loss": 0.1015, + "step": 3804 + }, + { + "epoch": 1.9807391983342009, + "grad_norm": 0.21026897260525107, + "learning_rate": 1.3669019575298902e-05, + "loss": 0.097, + "step": 3805 + }, + { + "epoch": 1.9812597605413846, + "grad_norm": 0.22344596462757446, + "learning_rate": 1.3656497300570448e-05, + "loss": 0.1017, + "step": 3806 + }, + { + "epoch": 1.9817803227485684, + "grad_norm": 0.23421057868440046, + "learning_rate": 1.3643978608648028e-05, + "loss": 0.1042, + "step": 3807 + }, + { + "epoch": 1.9823008849557522, + "grad_norm": 0.2200839534373584, + "learning_rate": 1.3631463503485634e-05, + "loss": 0.0985, + "step": 3808 + }, + { + "epoch": 1.982821447162936, + "grad_norm": 0.21843886850429678, + "learning_rate": 1.3618951989036102e-05, + "loss": 0.1027, + "step": 3809 + }, + { + "epoch": 1.9833420093701197, + "grad_norm": 0.21096923350471475, + "learning_rate": 1.3606444069251162e-05, + "loss": 0.0993, + "step": 3810 + }, + { + "epoch": 1.9838625715773035, + "grad_norm": 0.21343331941361038, + "learning_rate": 1.3593939748081393e-05, + "loss": 0.0993, + "step": 3811 + }, + { + "epoch": 1.9843831337844873, + "grad_norm": 0.22152103395034903, + "learning_rate": 1.3581439029476255e-05, + "loss": 0.1011, + "step": 3812 + }, + { + "epoch": 1.984903695991671, + "grad_norm": 0.20750708820454272, + "learning_rate": 1.3568941917384036e-05, + "loss": 0.1011, + "step": 3813 + }, + { + "epoch": 1.9854242581988548, + "grad_norm": 0.22282972962697625, + "learning_rate": 1.3556448415751927e-05, + "loss": 0.1, + "step": 3814 + }, + { + "epoch": 1.9859448204060386, + "grad_norm": 0.21973378480721914, + "learning_rate": 1.3543958528525934e-05, + "loss": 0.0998, + "step": 3815 + }, + { + "epoch": 1.9864653826132224, + "grad_norm": 0.21975469487000557, + "learning_rate": 1.3531472259650956e-05, + "loss": 0.1027, + "step": 3816 + }, + { + "epoch": 1.9869859448204061, + "grad_norm": 0.20704264881854378, + "learning_rate": 1.3518989613070745e-05, + "loss": 0.0992, + "step": 3817 + }, + { + "epoch": 1.98750650702759, + "grad_norm": 0.2276001326716296, + "learning_rate": 1.3506510592727889e-05, + "loss": 0.103, + "step": 3818 + }, + { + "epoch": 1.9880270692347737, + "grad_norm": 0.21862242414923339, + "learning_rate": 1.3494035202563842e-05, + "loss": 0.102, + "step": 3819 + }, + { + "epoch": 1.9885476314419575, + "grad_norm": 0.20514072992531857, + "learning_rate": 1.3481563446518924e-05, + "loss": 0.0979, + "step": 3820 + }, + { + "epoch": 1.9890681936491412, + "grad_norm": 0.219390795219004, + "learning_rate": 1.3469095328532305e-05, + "loss": 0.1015, + "step": 3821 + }, + { + "epoch": 1.9895887558563248, + "grad_norm": 0.2234331591905043, + "learning_rate": 1.3456630852541968e-05, + "loss": 0.0986, + "step": 3822 + }, + { + "epoch": 1.9901093180635085, + "grad_norm": 0.22043672801905884, + "learning_rate": 1.3444170022484803e-05, + "loss": 0.1, + "step": 3823 + }, + { + "epoch": 1.9906298802706923, + "grad_norm": 0.2121077118944676, + "learning_rate": 1.3431712842296495e-05, + "loss": 0.0992, + "step": 3824 + }, + { + "epoch": 1.991150442477876, + "grad_norm": 0.2098280218821776, + "learning_rate": 1.3419259315911612e-05, + "loss": 0.0974, + "step": 3825 + }, + { + "epoch": 1.9916710046850599, + "grad_norm": 0.22151509852945767, + "learning_rate": 1.3406809447263569e-05, + "loss": 0.1058, + "step": 3826 + }, + { + "epoch": 1.9921915668922436, + "grad_norm": 0.2188216176247912, + "learning_rate": 1.3394363240284596e-05, + "loss": 0.1001, + "step": 3827 + }, + { + "epoch": 1.9927121290994274, + "grad_norm": 0.2175007496547397, + "learning_rate": 1.3381920698905787e-05, + "loss": 0.0997, + "step": 3828 + }, + { + "epoch": 1.9932326913066112, + "grad_norm": 0.22386791931729272, + "learning_rate": 1.3369481827057084e-05, + "loss": 0.1046, + "step": 3829 + }, + { + "epoch": 1.9937532535137947, + "grad_norm": 0.22189589045670546, + "learning_rate": 1.3357046628667266e-05, + "loss": 0.102, + "step": 3830 + }, + { + "epoch": 1.9942738157209785, + "grad_norm": 0.22222680495180996, + "learning_rate": 1.3344615107663929e-05, + "loss": 0.1015, + "step": 3831 + }, + { + "epoch": 1.9947943779281623, + "grad_norm": 0.21165370943689257, + "learning_rate": 1.3332187267973545e-05, + "loss": 0.1, + "step": 3832 + }, + { + "epoch": 1.995314940135346, + "grad_norm": 0.21493189063483512, + "learning_rate": 1.3319763113521388e-05, + "loss": 0.0988, + "step": 3833 + }, + { + "epoch": 1.9958355023425298, + "grad_norm": 0.2152720610885134, + "learning_rate": 1.3307342648231587e-05, + "loss": 0.1046, + "step": 3834 + }, + { + "epoch": 1.9963560645497136, + "grad_norm": 0.22504773486380045, + "learning_rate": 1.3294925876027112e-05, + "loss": 0.1018, + "step": 3835 + }, + { + "epoch": 1.9968766267568974, + "grad_norm": 0.21719340129386458, + "learning_rate": 1.3282512800829761e-05, + "loss": 0.0994, + "step": 3836 + }, + { + "epoch": 1.9973971889640811, + "grad_norm": 0.2351408224727131, + "learning_rate": 1.3270103426560138e-05, + "loss": 0.1031, + "step": 3837 + }, + { + "epoch": 1.997917751171265, + "grad_norm": 0.22730448971582903, + "learning_rate": 1.3257697757137722e-05, + "loss": 0.1018, + "step": 3838 + }, + { + "epoch": 1.9984383133784487, + "grad_norm": 0.2240262905997672, + "learning_rate": 1.3245295796480789e-05, + "loss": 0.0991, + "step": 3839 + }, + { + "epoch": 1.9989588755856325, + "grad_norm": 0.20159272727142402, + "learning_rate": 1.3232897548506473e-05, + "loss": 0.0929, + "step": 3840 + }, + { + "epoch": 1.9994794377928162, + "grad_norm": 0.2185282556675355, + "learning_rate": 1.3220503017130702e-05, + "loss": 0.1029, + "step": 3841 + }, + { + "epoch": 2.0, + "grad_norm": 0.21010022623920993, + "learning_rate": 1.3208112206268241e-05, + "loss": 0.0951, + "step": 3842 + }, + { + "epoch": 2.0005205622071838, + "grad_norm": 0.23888745487077462, + "learning_rate": 1.3195725119832692e-05, + "loss": 0.0589, + "step": 3843 + }, + { + "epoch": 2.0010411244143675, + "grad_norm": 0.22113727902582608, + "learning_rate": 1.3183341761736474e-05, + "loss": 0.059, + "step": 3844 + }, + { + "epoch": 2.0015616866215513, + "grad_norm": 0.2100996955115601, + "learning_rate": 1.3170962135890838e-05, + "loss": 0.057, + "step": 3845 + }, + { + "epoch": 2.002082248828735, + "grad_norm": 0.21218819629958469, + "learning_rate": 1.3158586246205823e-05, + "loss": 0.0553, + "step": 3846 + }, + { + "epoch": 2.002602811035919, + "grad_norm": 0.20870010973163983, + "learning_rate": 1.3146214096590326e-05, + "loss": 0.0527, + "step": 3847 + }, + { + "epoch": 2.0031233732431026, + "grad_norm": 0.2324010648172641, + "learning_rate": 1.3133845690952045e-05, + "loss": 0.0533, + "step": 3848 + }, + { + "epoch": 2.0036439354502864, + "grad_norm": 0.27402223811767706, + "learning_rate": 1.3121481033197508e-05, + "loss": 0.0548, + "step": 3849 + }, + { + "epoch": 2.00416449765747, + "grad_norm": 0.3136132661537596, + "learning_rate": 1.3109120127232039e-05, + "loss": 0.0561, + "step": 3850 + }, + { + "epoch": 2.004685059864654, + "grad_norm": 0.28713280826038895, + "learning_rate": 1.3096762976959776e-05, + "loss": 0.0554, + "step": 3851 + }, + { + "epoch": 2.0052056220718377, + "grad_norm": 0.2568407408897936, + "learning_rate": 1.3084409586283696e-05, + "loss": 0.0537, + "step": 3852 + }, + { + "epoch": 2.0057261842790215, + "grad_norm": 0.2422352941267545, + "learning_rate": 1.307205995910557e-05, + "loss": 0.0562, + "step": 3853 + }, + { + "epoch": 2.0062467464862053, + "grad_norm": 0.2393420163757943, + "learning_rate": 1.3059714099326e-05, + "loss": 0.055, + "step": 3854 + }, + { + "epoch": 2.006767308693389, + "grad_norm": 0.22062809727347588, + "learning_rate": 1.3047372010844361e-05, + "loss": 0.0532, + "step": 3855 + }, + { + "epoch": 2.007287870900573, + "grad_norm": 0.22318736134320938, + "learning_rate": 1.3035033697558868e-05, + "loss": 0.052, + "step": 3856 + }, + { + "epoch": 2.0078084331077566, + "grad_norm": 0.2175095710848304, + "learning_rate": 1.302269916336653e-05, + "loss": 0.0532, + "step": 3857 + }, + { + "epoch": 2.0083289953149404, + "grad_norm": 0.2043884688220176, + "learning_rate": 1.3010368412163187e-05, + "loss": 0.0522, + "step": 3858 + }, + { + "epoch": 2.0088495575221237, + "grad_norm": 0.21515354889304936, + "learning_rate": 1.2998041447843448e-05, + "loss": 0.055, + "step": 3859 + }, + { + "epoch": 2.0093701197293075, + "grad_norm": 0.22420587505565287, + "learning_rate": 1.2985718274300731e-05, + "loss": 0.0516, + "step": 3860 + }, + { + "epoch": 2.0098906819364912, + "grad_norm": 0.2049175307303528, + "learning_rate": 1.2973398895427283e-05, + "loss": 0.0519, + "step": 3861 + }, + { + "epoch": 2.010411244143675, + "grad_norm": 0.21591901082933884, + "learning_rate": 1.2961083315114131e-05, + "loss": 0.0527, + "step": 3862 + }, + { + "epoch": 2.0109318063508588, + "grad_norm": 0.22484304767098973, + "learning_rate": 1.2948771537251119e-05, + "loss": 0.053, + "step": 3863 + }, + { + "epoch": 2.0114523685580425, + "grad_norm": 0.22695563724281367, + "learning_rate": 1.2936463565726864e-05, + "loss": 0.0522, + "step": 3864 + }, + { + "epoch": 2.0119729307652263, + "grad_norm": 0.22138427669788502, + "learning_rate": 1.2924159404428803e-05, + "loss": 0.052, + "step": 3865 + }, + { + "epoch": 2.01249349297241, + "grad_norm": 0.2260270817057609, + "learning_rate": 1.2911859057243165e-05, + "loss": 0.052, + "step": 3866 + }, + { + "epoch": 2.013014055179594, + "grad_norm": 0.22818335275933718, + "learning_rate": 1.2899562528054981e-05, + "loss": 0.0514, + "step": 3867 + }, + { + "epoch": 2.0135346173867776, + "grad_norm": 0.2291490389139092, + "learning_rate": 1.2887269820748044e-05, + "loss": 0.0522, + "step": 3868 + }, + { + "epoch": 2.0140551795939614, + "grad_norm": 0.2253847436578246, + "learning_rate": 1.2874980939204984e-05, + "loss": 0.0514, + "step": 3869 + }, + { + "epoch": 2.014575741801145, + "grad_norm": 0.2376467323645594, + "learning_rate": 1.2862695887307186e-05, + "loss": 0.0531, + "step": 3870 + }, + { + "epoch": 2.015096304008329, + "grad_norm": 0.22645666631710104, + "learning_rate": 1.285041466893485e-05, + "loss": 0.052, + "step": 3871 + }, + { + "epoch": 2.0156168662155127, + "grad_norm": 0.2422877418801926, + "learning_rate": 1.2838137287966961e-05, + "loss": 0.0507, + "step": 3872 + }, + { + "epoch": 2.0161374284226965, + "grad_norm": 0.2506319561703479, + "learning_rate": 1.282586374828127e-05, + "loss": 0.0503, + "step": 3873 + }, + { + "epoch": 2.0166579906298803, + "grad_norm": 0.23453280208493882, + "learning_rate": 1.2813594053754346e-05, + "loss": 0.0543, + "step": 3874 + }, + { + "epoch": 2.017178552837064, + "grad_norm": 0.22699601063395236, + "learning_rate": 1.2801328208261526e-05, + "loss": 0.0524, + "step": 3875 + }, + { + "epoch": 2.017699115044248, + "grad_norm": 0.24112589242280094, + "learning_rate": 1.2789066215676943e-05, + "loss": 0.0522, + "step": 3876 + }, + { + "epoch": 2.0182196772514316, + "grad_norm": 0.23871504911812172, + "learning_rate": 1.2776808079873487e-05, + "loss": 0.0535, + "step": 3877 + }, + { + "epoch": 2.0187402394586154, + "grad_norm": 0.23353444655198632, + "learning_rate": 1.2764553804722867e-05, + "loss": 0.052, + "step": 3878 + }, + { + "epoch": 2.019260801665799, + "grad_norm": 0.21500168671332048, + "learning_rate": 1.2752303394095538e-05, + "loss": 0.0499, + "step": 3879 + }, + { + "epoch": 2.019781363872983, + "grad_norm": 0.22650014917043784, + "learning_rate": 1.2740056851860754e-05, + "loss": 0.0556, + "step": 3880 + }, + { + "epoch": 2.0203019260801667, + "grad_norm": 0.23063435161871682, + "learning_rate": 1.2727814181886555e-05, + "loss": 0.0529, + "step": 3881 + }, + { + "epoch": 2.0208224882873504, + "grad_norm": 0.22502317978846956, + "learning_rate": 1.2715575388039724e-05, + "loss": 0.0531, + "step": 3882 + }, + { + "epoch": 2.021343050494534, + "grad_norm": 0.23365639149073505, + "learning_rate": 1.2703340474185854e-05, + "loss": 0.0542, + "step": 3883 + }, + { + "epoch": 2.021863612701718, + "grad_norm": 0.22024899646428664, + "learning_rate": 1.2691109444189303e-05, + "loss": 0.0503, + "step": 3884 + }, + { + "epoch": 2.0223841749089018, + "grad_norm": 0.23270362452207452, + "learning_rate": 1.2678882301913202e-05, + "loss": 0.0551, + "step": 3885 + }, + { + "epoch": 2.0229047371160855, + "grad_norm": 0.23138418992597184, + "learning_rate": 1.2666659051219437e-05, + "loss": 0.053, + "step": 3886 + }, + { + "epoch": 2.0234252993232693, + "grad_norm": 0.2239621991704845, + "learning_rate": 1.2654439695968696e-05, + "loss": 0.0508, + "step": 3887 + }, + { + "epoch": 2.023945861530453, + "grad_norm": 0.21409030138660093, + "learning_rate": 1.2642224240020404e-05, + "loss": 0.0521, + "step": 3888 + }, + { + "epoch": 2.024466423737637, + "grad_norm": 0.2342885910165884, + "learning_rate": 1.263001268723278e-05, + "loss": 0.0511, + "step": 3889 + }, + { + "epoch": 2.02498698594482, + "grad_norm": 0.230010672562304, + "learning_rate": 1.2617805041462805e-05, + "loss": 0.0532, + "step": 3890 + }, + { + "epoch": 2.025507548152004, + "grad_norm": 0.22439174131590303, + "learning_rate": 1.2605601306566205e-05, + "loss": 0.0506, + "step": 3891 + }, + { + "epoch": 2.0260281103591877, + "grad_norm": 0.22388621675313647, + "learning_rate": 1.2593401486397499e-05, + "loss": 0.0517, + "step": 3892 + }, + { + "epoch": 2.0265486725663715, + "grad_norm": 0.22601607131458107, + "learning_rate": 1.258120558480996e-05, + "loss": 0.0526, + "step": 3893 + }, + { + "epoch": 2.0270692347735553, + "grad_norm": 0.22319982470499325, + "learning_rate": 1.2569013605655627e-05, + "loss": 0.0503, + "step": 3894 + }, + { + "epoch": 2.027589796980739, + "grad_norm": 0.23487166926356898, + "learning_rate": 1.2556825552785273e-05, + "loss": 0.0527, + "step": 3895 + }, + { + "epoch": 2.028110359187923, + "grad_norm": 0.23072170266065373, + "learning_rate": 1.2544641430048479e-05, + "loss": 0.052, + "step": 3896 + }, + { + "epoch": 2.0286309213951066, + "grad_norm": 0.22670085314206623, + "learning_rate": 1.2532461241293531e-05, + "loss": 0.0505, + "step": 3897 + }, + { + "epoch": 2.0291514836022904, + "grad_norm": 0.23480717165950493, + "learning_rate": 1.2520284990367514e-05, + "loss": 0.052, + "step": 3898 + }, + { + "epoch": 2.029672045809474, + "grad_norm": 0.22717862102204162, + "learning_rate": 1.2508112681116263e-05, + "loss": 0.0513, + "step": 3899 + }, + { + "epoch": 2.030192608016658, + "grad_norm": 0.2259905619825204, + "learning_rate": 1.2495944317384337e-05, + "loss": 0.0513, + "step": 3900 + }, + { + "epoch": 2.0307131702238417, + "grad_norm": 0.23979551126797663, + "learning_rate": 1.2483779903015086e-05, + "loss": 0.0498, + "step": 3901 + }, + { + "epoch": 2.0312337324310255, + "grad_norm": 0.22847333964606212, + "learning_rate": 1.2471619441850596e-05, + "loss": 0.0509, + "step": 3902 + }, + { + "epoch": 2.0317542946382092, + "grad_norm": 0.23177006094278615, + "learning_rate": 1.2459462937731708e-05, + "loss": 0.0506, + "step": 3903 + }, + { + "epoch": 2.032274856845393, + "grad_norm": 0.24262261994408266, + "learning_rate": 1.2447310394498019e-05, + "loss": 0.053, + "step": 3904 + }, + { + "epoch": 2.0327954190525768, + "grad_norm": 0.23652782752700235, + "learning_rate": 1.2435161815987859e-05, + "loss": 0.0528, + "step": 3905 + }, + { + "epoch": 2.0333159812597605, + "grad_norm": 0.23162625176396265, + "learning_rate": 1.2423017206038307e-05, + "loss": 0.0537, + "step": 3906 + }, + { + "epoch": 2.0338365434669443, + "grad_norm": 0.22628246516711711, + "learning_rate": 1.2410876568485203e-05, + "loss": 0.0523, + "step": 3907 + }, + { + "epoch": 2.034357105674128, + "grad_norm": 0.23814151638305578, + "learning_rate": 1.2398739907163124e-05, + "loss": 0.0518, + "step": 3908 + }, + { + "epoch": 2.034877667881312, + "grad_norm": 0.24212468958455002, + "learning_rate": 1.2386607225905405e-05, + "loss": 0.0522, + "step": 3909 + }, + { + "epoch": 2.0353982300884956, + "grad_norm": 0.22458499825758288, + "learning_rate": 1.2374478528544092e-05, + "loss": 0.0503, + "step": 3910 + }, + { + "epoch": 2.0359187922956794, + "grad_norm": 0.2154706609637643, + "learning_rate": 1.2362353818910002e-05, + "loss": 0.0505, + "step": 3911 + }, + { + "epoch": 2.036439354502863, + "grad_norm": 0.23207911313268095, + "learning_rate": 1.2350233100832678e-05, + "loss": 0.0523, + "step": 3912 + }, + { + "epoch": 2.036959916710047, + "grad_norm": 0.2275097951477535, + "learning_rate": 1.2338116378140424e-05, + "loss": 0.0508, + "step": 3913 + }, + { + "epoch": 2.0374804789172307, + "grad_norm": 0.24046310726808087, + "learning_rate": 1.2326003654660249e-05, + "loss": 0.0486, + "step": 3914 + }, + { + "epoch": 2.0380010411244145, + "grad_norm": 0.25638052879623396, + "learning_rate": 1.2313894934217907e-05, + "loss": 0.0506, + "step": 3915 + }, + { + "epoch": 2.0385216033315983, + "grad_norm": 0.2241861365775457, + "learning_rate": 1.2301790220637904e-05, + "loss": 0.0504, + "step": 3916 + }, + { + "epoch": 2.039042165538782, + "grad_norm": 0.21754058305545665, + "learning_rate": 1.2289689517743475e-05, + "loss": 0.0478, + "step": 3917 + }, + { + "epoch": 2.039562727745966, + "grad_norm": 0.2471236978327747, + "learning_rate": 1.2277592829356593e-05, + "loss": 0.0539, + "step": 3918 + }, + { + "epoch": 2.0400832899531496, + "grad_norm": 0.23244298598394517, + "learning_rate": 1.2265500159297935e-05, + "loss": 0.0514, + "step": 3919 + }, + { + "epoch": 2.0406038521603334, + "grad_norm": 0.2434195925375697, + "learning_rate": 1.2253411511386938e-05, + "loss": 0.0529, + "step": 3920 + }, + { + "epoch": 2.041124414367517, + "grad_norm": 0.23277356231448715, + "learning_rate": 1.2241326889441763e-05, + "loss": 0.0509, + "step": 3921 + }, + { + "epoch": 2.041644976574701, + "grad_norm": 0.2326730216527539, + "learning_rate": 1.2229246297279302e-05, + "loss": 0.0499, + "step": 3922 + }, + { + "epoch": 2.0421655387818842, + "grad_norm": 0.22920084540866484, + "learning_rate": 1.2217169738715162e-05, + "loss": 0.052, + "step": 3923 + }, + { + "epoch": 2.042686100989068, + "grad_norm": 0.2283017342208316, + "learning_rate": 1.2205097217563668e-05, + "loss": 0.0503, + "step": 3924 + }, + { + "epoch": 2.0432066631962518, + "grad_norm": 0.2316172935733046, + "learning_rate": 1.2193028737637897e-05, + "loss": 0.0519, + "step": 3925 + }, + { + "epoch": 2.0437272254034355, + "grad_norm": 0.2165080484448344, + "learning_rate": 1.2180964302749637e-05, + "loss": 0.0483, + "step": 3926 + }, + { + "epoch": 2.0442477876106193, + "grad_norm": 0.21978990050785774, + "learning_rate": 1.2168903916709404e-05, + "loss": 0.0499, + "step": 3927 + }, + { + "epoch": 2.044768349817803, + "grad_norm": 0.24452756991961144, + "learning_rate": 1.2156847583326414e-05, + "loss": 0.0532, + "step": 3928 + }, + { + "epoch": 2.045288912024987, + "grad_norm": 0.21604477471728578, + "learning_rate": 1.2144795306408626e-05, + "loss": 0.0484, + "step": 3929 + }, + { + "epoch": 2.0458094742321706, + "grad_norm": 0.22490194735030902, + "learning_rate": 1.213274708976271e-05, + "loss": 0.0507, + "step": 3930 + }, + { + "epoch": 2.0463300364393544, + "grad_norm": 0.23117597236920664, + "learning_rate": 1.2120702937194061e-05, + "loss": 0.0499, + "step": 3931 + }, + { + "epoch": 2.046850598646538, + "grad_norm": 0.2302153815021224, + "learning_rate": 1.2108662852506778e-05, + "loss": 0.0514, + "step": 3932 + }, + { + "epoch": 2.047371160853722, + "grad_norm": 0.22991731459055237, + "learning_rate": 1.2096626839503666e-05, + "loss": 0.0504, + "step": 3933 + }, + { + "epoch": 2.0478917230609057, + "grad_norm": 0.2336931030300491, + "learning_rate": 1.2084594901986271e-05, + "loss": 0.051, + "step": 3934 + }, + { + "epoch": 2.0484122852680895, + "grad_norm": 0.23611354689827643, + "learning_rate": 1.2072567043754837e-05, + "loss": 0.0505, + "step": 3935 + }, + { + "epoch": 2.0489328474752733, + "grad_norm": 0.24039532523121837, + "learning_rate": 1.2060543268608329e-05, + "loss": 0.0533, + "step": 3936 + }, + { + "epoch": 2.049453409682457, + "grad_norm": 0.23726727147966595, + "learning_rate": 1.2048523580344398e-05, + "loss": 0.0507, + "step": 3937 + }, + { + "epoch": 2.049973971889641, + "grad_norm": 0.2385742302487926, + "learning_rate": 1.2036507982759431e-05, + "loss": 0.0526, + "step": 3938 + }, + { + "epoch": 2.0504945340968246, + "grad_norm": 0.22713140069481194, + "learning_rate": 1.2024496479648514e-05, + "loss": 0.0507, + "step": 3939 + }, + { + "epoch": 2.0510150963040084, + "grad_norm": 0.22422405740318607, + "learning_rate": 1.2012489074805444e-05, + "loss": 0.049, + "step": 3940 + }, + { + "epoch": 2.051535658511192, + "grad_norm": 0.23105843716868302, + "learning_rate": 1.200048577202271e-05, + "loss": 0.0529, + "step": 3941 + }, + { + "epoch": 2.052056220718376, + "grad_norm": 0.23528988007664547, + "learning_rate": 1.1988486575091507e-05, + "loss": 0.0533, + "step": 3942 + }, + { + "epoch": 2.0525767829255597, + "grad_norm": 0.23894532699169194, + "learning_rate": 1.1976491487801748e-05, + "loss": 0.0524, + "step": 3943 + }, + { + "epoch": 2.0530973451327434, + "grad_norm": 0.23433351722426457, + "learning_rate": 1.1964500513942034e-05, + "loss": 0.0503, + "step": 3944 + }, + { + "epoch": 2.053617907339927, + "grad_norm": 0.2429361501735258, + "learning_rate": 1.1952513657299691e-05, + "loss": 0.0525, + "step": 3945 + }, + { + "epoch": 2.054138469547111, + "grad_norm": 0.2357366347477472, + "learning_rate": 1.1940530921660703e-05, + "loss": 0.0508, + "step": 3946 + }, + { + "epoch": 2.0546590317542948, + "grad_norm": 0.22479006903552398, + "learning_rate": 1.1928552310809785e-05, + "loss": 0.0499, + "step": 3947 + }, + { + "epoch": 2.0551795939614785, + "grad_norm": 0.22971193296557763, + "learning_rate": 1.191657782853034e-05, + "loss": 0.0505, + "step": 3948 + }, + { + "epoch": 2.0557001561686623, + "grad_norm": 0.2251516109143194, + "learning_rate": 1.1904607478604476e-05, + "loss": 0.0508, + "step": 3949 + }, + { + "epoch": 2.056220718375846, + "grad_norm": 0.21994611285526833, + "learning_rate": 1.1892641264812978e-05, + "loss": 0.0491, + "step": 3950 + }, + { + "epoch": 2.05674128058303, + "grad_norm": 0.23498529308957097, + "learning_rate": 1.1880679190935323e-05, + "loss": 0.0505, + "step": 3951 + }, + { + "epoch": 2.0572618427902136, + "grad_norm": 0.24181869335547906, + "learning_rate": 1.1868721260749699e-05, + "loss": 0.0506, + "step": 3952 + }, + { + "epoch": 2.0577824049973974, + "grad_norm": 0.251727766194006, + "learning_rate": 1.1856767478032979e-05, + "loss": 0.0524, + "step": 3953 + }, + { + "epoch": 2.0583029672045807, + "grad_norm": 0.22110211479371478, + "learning_rate": 1.1844817846560732e-05, + "loss": 0.0489, + "step": 3954 + }, + { + "epoch": 2.0588235294117645, + "grad_norm": 0.25357982403761625, + "learning_rate": 1.183287237010719e-05, + "loss": 0.0524, + "step": 3955 + }, + { + "epoch": 2.0593440916189483, + "grad_norm": 0.2435984952058527, + "learning_rate": 1.1820931052445297e-05, + "loss": 0.053, + "step": 3956 + }, + { + "epoch": 2.059864653826132, + "grad_norm": 0.23320265572221055, + "learning_rate": 1.180899389734668e-05, + "loss": 0.0494, + "step": 3957 + }, + { + "epoch": 2.060385216033316, + "grad_norm": 0.23298375809694774, + "learning_rate": 1.1797060908581656e-05, + "loss": 0.052, + "step": 3958 + }, + { + "epoch": 2.0609057782404996, + "grad_norm": 0.24071234192220964, + "learning_rate": 1.1785132089919208e-05, + "loss": 0.0513, + "step": 3959 + }, + { + "epoch": 2.0614263404476834, + "grad_norm": 0.21300744847374067, + "learning_rate": 1.1773207445127005e-05, + "loss": 0.0488, + "step": 3960 + }, + { + "epoch": 2.061946902654867, + "grad_norm": 0.22968959003299735, + "learning_rate": 1.1761286977971412e-05, + "loss": 0.0515, + "step": 3961 + }, + { + "epoch": 2.062467464862051, + "grad_norm": 0.2293890367687117, + "learning_rate": 1.1749370692217465e-05, + "loss": 0.0525, + "step": 3962 + }, + { + "epoch": 2.0629880270692347, + "grad_norm": 0.22697976848422804, + "learning_rate": 1.1737458591628897e-05, + "loss": 0.053, + "step": 3963 + }, + { + "epoch": 2.0635085892764184, + "grad_norm": 0.23070259973942472, + "learning_rate": 1.1725550679968084e-05, + "loss": 0.0515, + "step": 3964 + }, + { + "epoch": 2.064029151483602, + "grad_norm": 0.21962051644788558, + "learning_rate": 1.1713646960996102e-05, + "loss": 0.0509, + "step": 3965 + }, + { + "epoch": 2.064549713690786, + "grad_norm": 0.23542235445056928, + "learning_rate": 1.1701747438472704e-05, + "loss": 0.0513, + "step": 3966 + }, + { + "epoch": 2.0650702758979698, + "grad_norm": 0.2249101062878514, + "learning_rate": 1.1689852116156313e-05, + "loss": 0.051, + "step": 3967 + }, + { + "epoch": 2.0655908381051535, + "grad_norm": 0.22365269003898253, + "learning_rate": 1.1677960997804047e-05, + "loss": 0.0519, + "step": 3968 + }, + { + "epoch": 2.0661114003123373, + "grad_norm": 0.23393573206151985, + "learning_rate": 1.1666074087171627e-05, + "loss": 0.0505, + "step": 3969 + }, + { + "epoch": 2.066631962519521, + "grad_norm": 0.22998763268787198, + "learning_rate": 1.1654191388013521e-05, + "loss": 0.0484, + "step": 3970 + }, + { + "epoch": 2.067152524726705, + "grad_norm": 0.2439217995498246, + "learning_rate": 1.1642312904082835e-05, + "loss": 0.0517, + "step": 3971 + }, + { + "epoch": 2.0676730869338886, + "grad_norm": 0.24085294418610648, + "learning_rate": 1.163043863913135e-05, + "loss": 0.0538, + "step": 3972 + }, + { + "epoch": 2.0681936491410724, + "grad_norm": 0.22810652141332463, + "learning_rate": 1.1618568596909512e-05, + "loss": 0.0488, + "step": 3973 + }, + { + "epoch": 2.068714211348256, + "grad_norm": 0.23657856963046003, + "learning_rate": 1.1606702781166423e-05, + "loss": 0.0501, + "step": 3974 + }, + { + "epoch": 2.06923477355544, + "grad_norm": 0.23158656572125352, + "learning_rate": 1.1594841195649866e-05, + "loss": 0.0516, + "step": 3975 + }, + { + "epoch": 2.0697553357626237, + "grad_norm": 0.23564568907963002, + "learning_rate": 1.1582983844106282e-05, + "loss": 0.0523, + "step": 3976 + }, + { + "epoch": 2.0702758979698075, + "grad_norm": 0.21940900341049915, + "learning_rate": 1.1571130730280786e-05, + "loss": 0.0505, + "step": 3977 + }, + { + "epoch": 2.0707964601769913, + "grad_norm": 0.23826470500495703, + "learning_rate": 1.1559281857917125e-05, + "loss": 0.0502, + "step": 3978 + }, + { + "epoch": 2.071317022384175, + "grad_norm": 0.22049292585636585, + "learning_rate": 1.1547437230757726e-05, + "loss": 0.0504, + "step": 3979 + }, + { + "epoch": 2.071837584591359, + "grad_norm": 0.23925268301002867, + "learning_rate": 1.1535596852543675e-05, + "loss": 0.0528, + "step": 3980 + }, + { + "epoch": 2.0723581467985426, + "grad_norm": 0.22626393109224524, + "learning_rate": 1.1523760727014721e-05, + "loss": 0.0521, + "step": 3981 + }, + { + "epoch": 2.0728787090057263, + "grad_norm": 0.23358296619905644, + "learning_rate": 1.1511928857909266e-05, + "loss": 0.0521, + "step": 3982 + }, + { + "epoch": 2.07339927121291, + "grad_norm": 0.22578239853084178, + "learning_rate": 1.1500101248964348e-05, + "loss": 0.049, + "step": 3983 + }, + { + "epoch": 2.073919833420094, + "grad_norm": 0.22302089050267151, + "learning_rate": 1.1488277903915689e-05, + "loss": 0.0496, + "step": 3984 + }, + { + "epoch": 2.0744403956272777, + "grad_norm": 0.22645035221882673, + "learning_rate": 1.147645882649765e-05, + "loss": 0.0511, + "step": 3985 + }, + { + "epoch": 2.0749609578344614, + "grad_norm": 0.23257285368266278, + "learning_rate": 1.1464644020443253e-05, + "loss": 0.0498, + "step": 3986 + }, + { + "epoch": 2.0754815200416448, + "grad_norm": 0.23394643638643065, + "learning_rate": 1.1452833489484155e-05, + "loss": 0.0508, + "step": 3987 + }, + { + "epoch": 2.0760020822488285, + "grad_norm": 0.2302396225330739, + "learning_rate": 1.1441027237350663e-05, + "loss": 0.0501, + "step": 3988 + }, + { + "epoch": 2.0765226444560123, + "grad_norm": 0.2364153136455663, + "learning_rate": 1.142922526777175e-05, + "loss": 0.0532, + "step": 3989 + }, + { + "epoch": 2.077043206663196, + "grad_norm": 0.24046454093029992, + "learning_rate": 1.1417427584475027e-05, + "loss": 0.0526, + "step": 3990 + }, + { + "epoch": 2.07756376887038, + "grad_norm": 0.23257494130614306, + "learning_rate": 1.1405634191186759e-05, + "loss": 0.0515, + "step": 3991 + }, + { + "epoch": 2.0780843310775636, + "grad_norm": 0.2327035472109291, + "learning_rate": 1.1393845091631833e-05, + "loss": 0.0497, + "step": 3992 + }, + { + "epoch": 2.0786048932847474, + "grad_norm": 0.23438697636486627, + "learning_rate": 1.1382060289533804e-05, + "loss": 0.0507, + "step": 3993 + }, + { + "epoch": 2.079125455491931, + "grad_norm": 0.23128247990194226, + "learning_rate": 1.1370279788614856e-05, + "loss": 0.0508, + "step": 3994 + }, + { + "epoch": 2.079646017699115, + "grad_norm": 0.23693622205792916, + "learning_rate": 1.1358503592595837e-05, + "loss": 0.0508, + "step": 3995 + }, + { + "epoch": 2.0801665799062987, + "grad_norm": 0.23110049467220117, + "learning_rate": 1.1346731705196204e-05, + "loss": 0.0501, + "step": 3996 + }, + { + "epoch": 2.0806871421134825, + "grad_norm": 0.24306539756134846, + "learning_rate": 1.1334964130134055e-05, + "loss": 0.0538, + "step": 3997 + }, + { + "epoch": 2.0812077043206663, + "grad_norm": 0.219892553239681, + "learning_rate": 1.132320087112615e-05, + "loss": 0.0478, + "step": 3998 + }, + { + "epoch": 2.08172826652785, + "grad_norm": 0.24126403657509093, + "learning_rate": 1.1311441931887873e-05, + "loss": 0.0519, + "step": 3999 + }, + { + "epoch": 2.082248828735034, + "grad_norm": 0.2248359123191256, + "learning_rate": 1.1299687316133256e-05, + "loss": 0.0497, + "step": 4000 + }, + { + "epoch": 2.0827693909422176, + "grad_norm": 0.24495968302747279, + "learning_rate": 1.1287937027574933e-05, + "loss": 0.0536, + "step": 4001 + }, + { + "epoch": 2.0832899531494014, + "grad_norm": 0.2280996896351997, + "learning_rate": 1.1276191069924197e-05, + "loss": 0.0516, + "step": 4002 + }, + { + "epoch": 2.083810515356585, + "grad_norm": 0.22983408181704557, + "learning_rate": 1.1264449446890975e-05, + "loss": 0.0497, + "step": 4003 + }, + { + "epoch": 2.084331077563769, + "grad_norm": 0.22345330078372413, + "learning_rate": 1.1252712162183825e-05, + "loss": 0.0481, + "step": 4004 + }, + { + "epoch": 2.0848516397709527, + "grad_norm": 0.22929949981841313, + "learning_rate": 1.1240979219509917e-05, + "loss": 0.0508, + "step": 4005 + }, + { + "epoch": 2.0853722019781364, + "grad_norm": 0.22837492034567583, + "learning_rate": 1.1229250622575052e-05, + "loss": 0.0494, + "step": 4006 + }, + { + "epoch": 2.08589276418532, + "grad_norm": 0.2343978470902539, + "learning_rate": 1.1217526375083675e-05, + "loss": 0.0499, + "step": 4007 + }, + { + "epoch": 2.086413326392504, + "grad_norm": 0.2336189431913341, + "learning_rate": 1.120580648073885e-05, + "loss": 0.0516, + "step": 4008 + }, + { + "epoch": 2.0869338885996878, + "grad_norm": 0.23282836656117412, + "learning_rate": 1.1194090943242278e-05, + "loss": 0.0503, + "step": 4009 + }, + { + "epoch": 2.0874544508068715, + "grad_norm": 0.24746886009644017, + "learning_rate": 1.1182379766294244e-05, + "loss": 0.0522, + "step": 4010 + }, + { + "epoch": 2.0879750130140553, + "grad_norm": 0.23120603188037314, + "learning_rate": 1.1170672953593696e-05, + "loss": 0.0501, + "step": 4011 + }, + { + "epoch": 2.088495575221239, + "grad_norm": 0.23047710533447974, + "learning_rate": 1.1158970508838193e-05, + "loss": 0.0482, + "step": 4012 + }, + { + "epoch": 2.089016137428423, + "grad_norm": 0.2524446742959235, + "learning_rate": 1.1147272435723918e-05, + "loss": 0.05, + "step": 4013 + }, + { + "epoch": 2.0895366996356066, + "grad_norm": 0.25459941682903553, + "learning_rate": 1.113557873794566e-05, + "loss": 0.0521, + "step": 4014 + }, + { + "epoch": 2.0900572618427904, + "grad_norm": 0.22652476025578092, + "learning_rate": 1.1123889419196821e-05, + "loss": 0.0514, + "step": 4015 + }, + { + "epoch": 2.090577824049974, + "grad_norm": 0.2288103168103935, + "learning_rate": 1.1112204483169439e-05, + "loss": 0.0506, + "step": 4016 + }, + { + "epoch": 2.091098386257158, + "grad_norm": 0.24150820992396882, + "learning_rate": 1.1100523933554166e-05, + "loss": 0.05, + "step": 4017 + }, + { + "epoch": 2.0916189484643413, + "grad_norm": 0.2322103337709988, + "learning_rate": 1.108884777404027e-05, + "loss": 0.0507, + "step": 4018 + }, + { + "epoch": 2.092139510671525, + "grad_norm": 0.22947198807908026, + "learning_rate": 1.1077176008315606e-05, + "loss": 0.0509, + "step": 4019 + }, + { + "epoch": 2.092660072878709, + "grad_norm": 0.23943146290614956, + "learning_rate": 1.1065508640066672e-05, + "loss": 0.0505, + "step": 4020 + }, + { + "epoch": 2.0931806350858926, + "grad_norm": 0.2487064925589514, + "learning_rate": 1.1053845672978567e-05, + "loss": 0.0535, + "step": 4021 + }, + { + "epoch": 2.0937011972930764, + "grad_norm": 0.23593594385346, + "learning_rate": 1.1042187110735e-05, + "loss": 0.0503, + "step": 4022 + }, + { + "epoch": 2.09422175950026, + "grad_norm": 0.24229015214508823, + "learning_rate": 1.1030532957018288e-05, + "loss": 0.0512, + "step": 4023 + }, + { + "epoch": 2.094742321707444, + "grad_norm": 0.2257929833420632, + "learning_rate": 1.1018883215509343e-05, + "loss": 0.0485, + "step": 4024 + }, + { + "epoch": 2.0952628839146277, + "grad_norm": 0.23317922327013857, + "learning_rate": 1.10072378898877e-05, + "loss": 0.0496, + "step": 4025 + }, + { + "epoch": 2.0957834461218114, + "grad_norm": 0.2375777735308176, + "learning_rate": 1.0995596983831502e-05, + "loss": 0.0512, + "step": 4026 + }, + { + "epoch": 2.096304008328995, + "grad_norm": 0.23856279889569232, + "learning_rate": 1.0983960501017492e-05, + "loss": 0.0495, + "step": 4027 + }, + { + "epoch": 2.096824570536179, + "grad_norm": 0.23551216040588757, + "learning_rate": 1.0972328445121e-05, + "loss": 0.0495, + "step": 4028 + }, + { + "epoch": 2.0973451327433628, + "grad_norm": 0.24501438351869415, + "learning_rate": 1.0960700819815973e-05, + "loss": 0.0491, + "step": 4029 + }, + { + "epoch": 2.0978656949505465, + "grad_norm": 0.23782057609972138, + "learning_rate": 1.0949077628774961e-05, + "loss": 0.0488, + "step": 4030 + }, + { + "epoch": 2.0983862571577303, + "grad_norm": 0.23261361096280053, + "learning_rate": 1.0937458875669119e-05, + "loss": 0.0513, + "step": 4031 + }, + { + "epoch": 2.098906819364914, + "grad_norm": 0.24475133620833012, + "learning_rate": 1.0925844564168175e-05, + "loss": 0.05, + "step": 4032 + }, + { + "epoch": 2.099427381572098, + "grad_norm": 0.28758926057418177, + "learning_rate": 1.0914234697940465e-05, + "loss": 0.0496, + "step": 4033 + }, + { + "epoch": 2.0999479437792816, + "grad_norm": 0.22436454655048274, + "learning_rate": 1.0902629280652931e-05, + "loss": 0.0506, + "step": 4034 + }, + { + "epoch": 2.1004685059864654, + "grad_norm": 0.234024748448541, + "learning_rate": 1.0891028315971105e-05, + "loss": 0.0485, + "step": 4035 + }, + { + "epoch": 2.100989068193649, + "grad_norm": 0.22684434792800956, + "learning_rate": 1.0879431807559116e-05, + "loss": 0.0463, + "step": 4036 + }, + { + "epoch": 2.101509630400833, + "grad_norm": 0.23052518036453548, + "learning_rate": 1.0867839759079682e-05, + "loss": 0.0489, + "step": 4037 + }, + { + "epoch": 2.1020301926080167, + "grad_norm": 0.23554676365556773, + "learning_rate": 1.0856252174194096e-05, + "loss": 0.0501, + "step": 4038 + }, + { + "epoch": 2.1025507548152005, + "grad_norm": 0.2364179349009896, + "learning_rate": 1.084466905656227e-05, + "loss": 0.0486, + "step": 4039 + }, + { + "epoch": 2.1030713170223843, + "grad_norm": 0.2315928309233159, + "learning_rate": 1.0833090409842694e-05, + "loss": 0.0485, + "step": 4040 + }, + { + "epoch": 2.103591879229568, + "grad_norm": 0.24327580350563419, + "learning_rate": 1.0821516237692434e-05, + "loss": 0.0524, + "step": 4041 + }, + { + "epoch": 2.104112441436752, + "grad_norm": 0.22953852222832158, + "learning_rate": 1.080994654376716e-05, + "loss": 0.0475, + "step": 4042 + }, + { + "epoch": 2.1046330036439356, + "grad_norm": 0.23269426088814224, + "learning_rate": 1.0798381331721109e-05, + "loss": 0.0491, + "step": 4043 + }, + { + "epoch": 2.1051535658511193, + "grad_norm": 0.2393902668332969, + "learning_rate": 1.0786820605207117e-05, + "loss": 0.0496, + "step": 4044 + }, + { + "epoch": 2.105674128058303, + "grad_norm": 0.23157080328354862, + "learning_rate": 1.0775264367876605e-05, + "loss": 0.0509, + "step": 4045 + }, + { + "epoch": 2.106194690265487, + "grad_norm": 0.24639805362000278, + "learning_rate": 1.0763712623379577e-05, + "loss": 0.0517, + "step": 4046 + }, + { + "epoch": 2.1067152524726707, + "grad_norm": 0.25130598484708905, + "learning_rate": 1.0752165375364593e-05, + "loss": 0.0528, + "step": 4047 + }, + { + "epoch": 2.1072358146798544, + "grad_norm": 0.22906779523998783, + "learning_rate": 1.0740622627478821e-05, + "loss": 0.0505, + "step": 4048 + }, + { + "epoch": 2.107756376887038, + "grad_norm": 0.22056035942244043, + "learning_rate": 1.0729084383368005e-05, + "loss": 0.0489, + "step": 4049 + }, + { + "epoch": 2.108276939094222, + "grad_norm": 0.24068696987121577, + "learning_rate": 1.0717550646676443e-05, + "loss": 0.0508, + "step": 4050 + }, + { + "epoch": 2.1087975013014053, + "grad_norm": 0.22714859742711338, + "learning_rate": 1.0706021421047047e-05, + "loss": 0.0506, + "step": 4051 + }, + { + "epoch": 2.109318063508589, + "grad_norm": 0.22747067082854852, + "learning_rate": 1.0694496710121257e-05, + "loss": 0.0491, + "step": 4052 + }, + { + "epoch": 2.109838625715773, + "grad_norm": 0.23375027504810572, + "learning_rate": 1.0682976517539128e-05, + "loss": 0.05, + "step": 4053 + }, + { + "epoch": 2.1103591879229566, + "grad_norm": 0.22132944162068763, + "learning_rate": 1.0671460846939274e-05, + "loss": 0.0482, + "step": 4054 + }, + { + "epoch": 2.1108797501301404, + "grad_norm": 0.22882347387684723, + "learning_rate": 1.0659949701958885e-05, + "loss": 0.0499, + "step": 4055 + }, + { + "epoch": 2.111400312337324, + "grad_norm": 0.2312263646586655, + "learning_rate": 1.0648443086233697e-05, + "loss": 0.0511, + "step": 4056 + }, + { + "epoch": 2.111920874544508, + "grad_norm": 0.24220776541349695, + "learning_rate": 1.063694100339805e-05, + "loss": 0.0516, + "step": 4057 + }, + { + "epoch": 2.1124414367516917, + "grad_norm": 0.22625059430843475, + "learning_rate": 1.0625443457084841e-05, + "loss": 0.0494, + "step": 4058 + }, + { + "epoch": 2.1129619989588755, + "grad_norm": 0.23299358120448602, + "learning_rate": 1.0613950450925513e-05, + "loss": 0.0495, + "step": 4059 + }, + { + "epoch": 2.1134825611660593, + "grad_norm": 0.23342314525886373, + "learning_rate": 1.060246198855011e-05, + "loss": 0.0494, + "step": 4060 + }, + { + "epoch": 2.114003123373243, + "grad_norm": 0.2421280557624441, + "learning_rate": 1.059097807358721e-05, + "loss": 0.0503, + "step": 4061 + }, + { + "epoch": 2.114523685580427, + "grad_norm": 0.22689249003133796, + "learning_rate": 1.0579498709663968e-05, + "loss": 0.0479, + "step": 4062 + }, + { + "epoch": 2.1150442477876106, + "grad_norm": 0.2392848913272859, + "learning_rate": 1.0568023900406107e-05, + "loss": 0.052, + "step": 4063 + }, + { + "epoch": 2.1155648099947943, + "grad_norm": 0.25646230307725143, + "learning_rate": 1.0556553649437914e-05, + "loss": 0.0513, + "step": 4064 + }, + { + "epoch": 2.116085372201978, + "grad_norm": 0.2416279360872608, + "learning_rate": 1.0545087960382211e-05, + "loss": 0.0515, + "step": 4065 + }, + { + "epoch": 2.116605934409162, + "grad_norm": 0.2231969043709799, + "learning_rate": 1.05336268368604e-05, + "loss": 0.0491, + "step": 4066 + }, + { + "epoch": 2.1171264966163457, + "grad_norm": 0.2425556380715717, + "learning_rate": 1.0522170282492444e-05, + "loss": 0.0495, + "step": 4067 + }, + { + "epoch": 2.1176470588235294, + "grad_norm": 0.2383579552897591, + "learning_rate": 1.051071830089686e-05, + "loss": 0.0517, + "step": 4068 + }, + { + "epoch": 2.118167621030713, + "grad_norm": 0.24083746604101072, + "learning_rate": 1.049927089569071e-05, + "loss": 0.0514, + "step": 4069 + }, + { + "epoch": 2.118688183237897, + "grad_norm": 0.23603705033415373, + "learning_rate": 1.048782807048961e-05, + "loss": 0.0509, + "step": 4070 + }, + { + "epoch": 2.1192087454450808, + "grad_norm": 0.22699841700025328, + "learning_rate": 1.0476389828907743e-05, + "loss": 0.0507, + "step": 4071 + }, + { + "epoch": 2.1197293076522645, + "grad_norm": 0.2384365332964614, + "learning_rate": 1.046495617455784e-05, + "loss": 0.0506, + "step": 4072 + }, + { + "epoch": 2.1202498698594483, + "grad_norm": 0.23686492769560208, + "learning_rate": 1.0453527111051184e-05, + "loss": 0.0499, + "step": 4073 + }, + { + "epoch": 2.120770432066632, + "grad_norm": 0.23537377480311067, + "learning_rate": 1.0442102641997594e-05, + "loss": 0.0495, + "step": 4074 + }, + { + "epoch": 2.121290994273816, + "grad_norm": 0.23207116343425835, + "learning_rate": 1.0430682771005456e-05, + "loss": 0.0525, + "step": 4075 + }, + { + "epoch": 2.1218115564809996, + "grad_norm": 0.23332099551170052, + "learning_rate": 1.0419267501681699e-05, + "loss": 0.0498, + "step": 4076 + }, + { + "epoch": 2.1223321186881834, + "grad_norm": 0.23684848540512882, + "learning_rate": 1.0407856837631798e-05, + "loss": 0.0521, + "step": 4077 + }, + { + "epoch": 2.122852680895367, + "grad_norm": 0.22933556276740186, + "learning_rate": 1.0396450782459771e-05, + "loss": 0.0497, + "step": 4078 + }, + { + "epoch": 2.123373243102551, + "grad_norm": 0.2288593333175558, + "learning_rate": 1.0385049339768168e-05, + "loss": 0.0506, + "step": 4079 + }, + { + "epoch": 2.1238938053097347, + "grad_norm": 0.23192903613988014, + "learning_rate": 1.0373652513158105e-05, + "loss": 0.0513, + "step": 4080 + }, + { + "epoch": 2.1244143675169185, + "grad_norm": 0.23148424092687478, + "learning_rate": 1.0362260306229229e-05, + "loss": 0.0517, + "step": 4081 + }, + { + "epoch": 2.124934929724102, + "grad_norm": 0.23760449025559408, + "learning_rate": 1.0350872722579741e-05, + "loss": 0.0506, + "step": 4082 + }, + { + "epoch": 2.1254554919312856, + "grad_norm": 0.23072320281257291, + "learning_rate": 1.0339489765806345e-05, + "loss": 0.0519, + "step": 4083 + }, + { + "epoch": 2.1259760541384694, + "grad_norm": 0.2273989358106142, + "learning_rate": 1.032811143950433e-05, + "loss": 0.0492, + "step": 4084 + }, + { + "epoch": 2.126496616345653, + "grad_norm": 0.2286725964642402, + "learning_rate": 1.0316737747267486e-05, + "loss": 0.0508, + "step": 4085 + }, + { + "epoch": 2.127017178552837, + "grad_norm": 0.2403403105915755, + "learning_rate": 1.0305368692688174e-05, + "loss": 0.0498, + "step": 4086 + }, + { + "epoch": 2.1275377407600207, + "grad_norm": 0.22835020945257542, + "learning_rate": 1.0294004279357259e-05, + "loss": 0.0494, + "step": 4087 + }, + { + "epoch": 2.1280583029672044, + "grad_norm": 0.23886208707779596, + "learning_rate": 1.028264451086414e-05, + "loss": 0.05, + "step": 4088 + }, + { + "epoch": 2.128578865174388, + "grad_norm": 0.23621522502410017, + "learning_rate": 1.0271289390796771e-05, + "loss": 0.0495, + "step": 4089 + }, + { + "epoch": 2.129099427381572, + "grad_norm": 0.23986277451442098, + "learning_rate": 1.0259938922741627e-05, + "loss": 0.0505, + "step": 4090 + }, + { + "epoch": 2.1296199895887558, + "grad_norm": 0.2306221065006331, + "learning_rate": 1.0248593110283725e-05, + "loss": 0.0503, + "step": 4091 + }, + { + "epoch": 2.1301405517959395, + "grad_norm": 0.23069214444668543, + "learning_rate": 1.023725195700658e-05, + "loss": 0.0497, + "step": 4092 + }, + { + "epoch": 2.1306611140031233, + "grad_norm": 0.22731226780333583, + "learning_rate": 1.0225915466492267e-05, + "loss": 0.0492, + "step": 4093 + }, + { + "epoch": 2.131181676210307, + "grad_norm": 0.23439529582276133, + "learning_rate": 1.0214583642321376e-05, + "loss": 0.0499, + "step": 4094 + }, + { + "epoch": 2.131702238417491, + "grad_norm": 0.2390100567918747, + "learning_rate": 1.0203256488073034e-05, + "loss": 0.0498, + "step": 4095 + }, + { + "epoch": 2.1322228006246746, + "grad_norm": 0.22979409308662924, + "learning_rate": 1.0191934007324874e-05, + "loss": 0.0503, + "step": 4096 + }, + { + "epoch": 2.1327433628318584, + "grad_norm": 0.23224468806672807, + "learning_rate": 1.0180616203653054e-05, + "loss": 0.0498, + "step": 4097 + }, + { + "epoch": 2.133263925039042, + "grad_norm": 0.2310744812755954, + "learning_rate": 1.0169303080632272e-05, + "loss": 0.05, + "step": 4098 + }, + { + "epoch": 2.133784487246226, + "grad_norm": 0.2266485347614057, + "learning_rate": 1.0157994641835736e-05, + "loss": 0.05, + "step": 4099 + }, + { + "epoch": 2.1343050494534097, + "grad_norm": 0.231117064885941, + "learning_rate": 1.014669089083518e-05, + "loss": 0.0484, + "step": 4100 + }, + { + "epoch": 2.1348256116605935, + "grad_norm": 0.23246610511746577, + "learning_rate": 1.0135391831200866e-05, + "loss": 0.0509, + "step": 4101 + }, + { + "epoch": 2.1353461738677773, + "grad_norm": 0.24212771834516583, + "learning_rate": 1.012409746650154e-05, + "loss": 0.053, + "step": 4102 + }, + { + "epoch": 2.135866736074961, + "grad_norm": 0.23361341416282064, + "learning_rate": 1.01128078003045e-05, + "loss": 0.0482, + "step": 4103 + }, + { + "epoch": 2.136387298282145, + "grad_norm": 0.23836572088407046, + "learning_rate": 1.0101522836175555e-05, + "loss": 0.0504, + "step": 4104 + }, + { + "epoch": 2.1369078604893286, + "grad_norm": 0.2259664441526069, + "learning_rate": 1.0090242577679005e-05, + "loss": 0.0495, + "step": 4105 + }, + { + "epoch": 2.1374284226965123, + "grad_norm": 0.22082689150726414, + "learning_rate": 1.0078967028377697e-05, + "loss": 0.049, + "step": 4106 + }, + { + "epoch": 2.137948984903696, + "grad_norm": 0.2342587477443236, + "learning_rate": 1.006769619183296e-05, + "loss": 0.0502, + "step": 4107 + }, + { + "epoch": 2.13846954711088, + "grad_norm": 0.22943741247932514, + "learning_rate": 1.0056430071604653e-05, + "loss": 0.0487, + "step": 4108 + }, + { + "epoch": 2.1389901093180637, + "grad_norm": 0.22043167610871012, + "learning_rate": 1.0045168671251143e-05, + "loss": 0.049, + "step": 4109 + }, + { + "epoch": 2.1395106715252474, + "grad_norm": 0.2405489283322427, + "learning_rate": 1.0033911994329314e-05, + "loss": 0.051, + "step": 4110 + }, + { + "epoch": 2.140031233732431, + "grad_norm": 0.234193224915324, + "learning_rate": 1.0022660044394534e-05, + "loss": 0.0483, + "step": 4111 + }, + { + "epoch": 2.140551795939615, + "grad_norm": 0.22465165622300198, + "learning_rate": 1.0011412825000694e-05, + "loss": 0.0478, + "step": 4112 + }, + { + "epoch": 2.1410723581467987, + "grad_norm": 0.23278809432046188, + "learning_rate": 1.00001703397002e-05, + "loss": 0.0479, + "step": 4113 + }, + { + "epoch": 2.1415929203539825, + "grad_norm": 0.23156952891318974, + "learning_rate": 9.988932592043937e-06, + "loss": 0.048, + "step": 4114 + }, + { + "epoch": 2.1421134825611663, + "grad_norm": 0.24878058256395674, + "learning_rate": 9.977699585581324e-06, + "loss": 0.0529, + "step": 4115 + }, + { + "epoch": 2.1426340447683496, + "grad_norm": 0.2401759125829604, + "learning_rate": 9.966471323860251e-06, + "loss": 0.0507, + "step": 4116 + }, + { + "epoch": 2.1431546069755334, + "grad_norm": 0.2318433491416454, + "learning_rate": 9.955247810427137e-06, + "loss": 0.0506, + "step": 4117 + }, + { + "epoch": 2.143675169182717, + "grad_norm": 0.23626591123934843, + "learning_rate": 9.944029048826887e-06, + "loss": 0.0498, + "step": 4118 + }, + { + "epoch": 2.144195731389901, + "grad_norm": 0.23359239052440017, + "learning_rate": 9.932815042602913e-06, + "loss": 0.053, + "step": 4119 + }, + { + "epoch": 2.1447162935970847, + "grad_norm": 0.24007159621963173, + "learning_rate": 9.921605795297109e-06, + "loss": 0.0495, + "step": 4120 + }, + { + "epoch": 2.1452368558042685, + "grad_norm": 0.23718809404737304, + "learning_rate": 9.910401310449883e-06, + "loss": 0.0497, + "step": 4121 + }, + { + "epoch": 2.1457574180114523, + "grad_norm": 0.23382317156311533, + "learning_rate": 9.899201591600138e-06, + "loss": 0.0487, + "step": 4122 + }, + { + "epoch": 2.146277980218636, + "grad_norm": 0.23007554978616346, + "learning_rate": 9.888006642285255e-06, + "loss": 0.0514, + "step": 4123 + }, + { + "epoch": 2.14679854242582, + "grad_norm": 0.2360311167478857, + "learning_rate": 9.876816466041133e-06, + "loss": 0.0514, + "step": 4124 + }, + { + "epoch": 2.1473191046330036, + "grad_norm": 0.23498144817531677, + "learning_rate": 9.865631066402137e-06, + "loss": 0.0528, + "step": 4125 + }, + { + "epoch": 2.1478396668401873, + "grad_norm": 0.22215864699197535, + "learning_rate": 9.854450446901143e-06, + "loss": 0.0463, + "step": 4126 + }, + { + "epoch": 2.148360229047371, + "grad_norm": 0.22531836871998293, + "learning_rate": 9.843274611069509e-06, + "loss": 0.0472, + "step": 4127 + }, + { + "epoch": 2.148880791254555, + "grad_norm": 0.24034863053588068, + "learning_rate": 9.832103562437096e-06, + "loss": 0.0514, + "step": 4128 + }, + { + "epoch": 2.1494013534617387, + "grad_norm": 0.2526527336235642, + "learning_rate": 9.820937304532221e-06, + "loss": 0.0511, + "step": 4129 + }, + { + "epoch": 2.1499219156689224, + "grad_norm": 0.23965378370422113, + "learning_rate": 9.809775840881718e-06, + "loss": 0.0493, + "step": 4130 + }, + { + "epoch": 2.150442477876106, + "grad_norm": 0.23284542516117132, + "learning_rate": 9.798619175010907e-06, + "loss": 0.0486, + "step": 4131 + }, + { + "epoch": 2.15096304008329, + "grad_norm": 0.2517441778747228, + "learning_rate": 9.787467310443562e-06, + "loss": 0.0518, + "step": 4132 + }, + { + "epoch": 2.1514836022904738, + "grad_norm": 0.24113620179268663, + "learning_rate": 9.77632025070198e-06, + "loss": 0.0487, + "step": 4133 + }, + { + "epoch": 2.1520041644976575, + "grad_norm": 0.23483751096204794, + "learning_rate": 9.765177999306904e-06, + "loss": 0.05, + "step": 4134 + }, + { + "epoch": 2.1525247267048413, + "grad_norm": 0.2372927103941721, + "learning_rate": 9.754040559777583e-06, + "loss": 0.0498, + "step": 4135 + }, + { + "epoch": 2.153045288912025, + "grad_norm": 0.23825778506276546, + "learning_rate": 9.742907935631737e-06, + "loss": 0.049, + "step": 4136 + }, + { + "epoch": 2.153565851119209, + "grad_norm": 0.24176693832701435, + "learning_rate": 9.731780130385578e-06, + "loss": 0.0497, + "step": 4137 + }, + { + "epoch": 2.1540864133263926, + "grad_norm": 0.2335246650469084, + "learning_rate": 9.720657147553769e-06, + "loss": 0.05, + "step": 4138 + }, + { + "epoch": 2.1546069755335764, + "grad_norm": 0.23861089006529668, + "learning_rate": 9.709538990649472e-06, + "loss": 0.0503, + "step": 4139 + }, + { + "epoch": 2.15512753774076, + "grad_norm": 0.2291209928615605, + "learning_rate": 9.698425663184324e-06, + "loss": 0.0485, + "step": 4140 + }, + { + "epoch": 2.155648099947944, + "grad_norm": 0.2342784317879674, + "learning_rate": 9.68731716866842e-06, + "loss": 0.049, + "step": 4141 + }, + { + "epoch": 2.1561686621551277, + "grad_norm": 0.2320214212518064, + "learning_rate": 9.676213510610352e-06, + "loss": 0.0485, + "step": 4142 + }, + { + "epoch": 2.1566892243623115, + "grad_norm": 0.2377587377352587, + "learning_rate": 9.665114692517158e-06, + "loss": 0.0523, + "step": 4143 + }, + { + "epoch": 2.1572097865694952, + "grad_norm": 0.2348296445693297, + "learning_rate": 9.654020717894366e-06, + "loss": 0.0502, + "step": 4144 + }, + { + "epoch": 2.157730348776679, + "grad_norm": 0.21794445970232668, + "learning_rate": 9.642931590245973e-06, + "loss": 0.0474, + "step": 4145 + }, + { + "epoch": 2.1582509109838623, + "grad_norm": 0.2350475368286949, + "learning_rate": 9.63184731307445e-06, + "loss": 0.0502, + "step": 4146 + }, + { + "epoch": 2.158771473191046, + "grad_norm": 0.23095612217802725, + "learning_rate": 9.620767889880708e-06, + "loss": 0.0483, + "step": 4147 + }, + { + "epoch": 2.15929203539823, + "grad_norm": 0.21882801219665537, + "learning_rate": 9.609693324164154e-06, + "loss": 0.0476, + "step": 4148 + }, + { + "epoch": 2.1598125976054137, + "grad_norm": 0.23959480333572203, + "learning_rate": 9.59862361942266e-06, + "loss": 0.0499, + "step": 4149 + }, + { + "epoch": 2.1603331598125974, + "grad_norm": 0.2382618434451453, + "learning_rate": 9.587558779152536e-06, + "loss": 0.0483, + "step": 4150 + }, + { + "epoch": 2.160853722019781, + "grad_norm": 0.2481562495202432, + "learning_rate": 9.576498806848591e-06, + "loss": 0.0515, + "step": 4151 + }, + { + "epoch": 2.161374284226965, + "grad_norm": 0.23415717741022837, + "learning_rate": 9.565443706004065e-06, + "loss": 0.0506, + "step": 4152 + }, + { + "epoch": 2.1618948464341488, + "grad_norm": 0.2310272602868718, + "learning_rate": 9.554393480110677e-06, + "loss": 0.0501, + "step": 4153 + }, + { + "epoch": 2.1624154086413325, + "grad_norm": 0.21717437554416877, + "learning_rate": 9.543348132658602e-06, + "loss": 0.0471, + "step": 4154 + }, + { + "epoch": 2.1629359708485163, + "grad_norm": 0.24030597109135618, + "learning_rate": 9.532307667136493e-06, + "loss": 0.0503, + "step": 4155 + }, + { + "epoch": 2.1634565330557, + "grad_norm": 0.2271124234150207, + "learning_rate": 9.521272087031414e-06, + "loss": 0.0474, + "step": 4156 + }, + { + "epoch": 2.163977095262884, + "grad_norm": 0.23172566333461286, + "learning_rate": 9.510241395828926e-06, + "loss": 0.0509, + "step": 4157 + }, + { + "epoch": 2.1644976574700676, + "grad_norm": 0.23463565756110716, + "learning_rate": 9.499215597013048e-06, + "loss": 0.0519, + "step": 4158 + }, + { + "epoch": 2.1650182196772514, + "grad_norm": 0.2269853064996004, + "learning_rate": 9.488194694066219e-06, + "loss": 0.0503, + "step": 4159 + }, + { + "epoch": 2.165538781884435, + "grad_norm": 0.22448361065078423, + "learning_rate": 9.47717869046937e-06, + "loss": 0.0488, + "step": 4160 + }, + { + "epoch": 2.166059344091619, + "grad_norm": 0.2298514916947985, + "learning_rate": 9.466167589701855e-06, + "loss": 0.0492, + "step": 4161 + }, + { + "epoch": 2.1665799062988027, + "grad_norm": 0.2411385315001518, + "learning_rate": 9.455161395241496e-06, + "loss": 0.0514, + "step": 4162 + }, + { + "epoch": 2.1671004685059865, + "grad_norm": 0.23344249520016908, + "learning_rate": 9.444160110564562e-06, + "loss": 0.0507, + "step": 4163 + }, + { + "epoch": 2.1676210307131702, + "grad_norm": 0.22970482297994116, + "learning_rate": 9.433163739145773e-06, + "loss": 0.049, + "step": 4164 + }, + { + "epoch": 2.168141592920354, + "grad_norm": 0.24137131622878738, + "learning_rate": 9.422172284458303e-06, + "loss": 0.0501, + "step": 4165 + }, + { + "epoch": 2.168662155127538, + "grad_norm": 0.2391941048873966, + "learning_rate": 9.411185749973744e-06, + "loss": 0.0491, + "step": 4166 + }, + { + "epoch": 2.1691827173347216, + "grad_norm": 0.22597120615731292, + "learning_rate": 9.400204139162178e-06, + "loss": 0.0478, + "step": 4167 + }, + { + "epoch": 2.1697032795419053, + "grad_norm": 0.22407522799206378, + "learning_rate": 9.389227455492083e-06, + "loss": 0.0488, + "step": 4168 + }, + { + "epoch": 2.170223841749089, + "grad_norm": 0.23341996865358303, + "learning_rate": 9.378255702430425e-06, + "loss": 0.0495, + "step": 4169 + }, + { + "epoch": 2.170744403956273, + "grad_norm": 0.22756065098090528, + "learning_rate": 9.367288883442596e-06, + "loss": 0.0465, + "step": 4170 + }, + { + "epoch": 2.1712649661634567, + "grad_norm": 0.24032557476752722, + "learning_rate": 9.356327001992412e-06, + "loss": 0.0514, + "step": 4171 + }, + { + "epoch": 2.1717855283706404, + "grad_norm": 0.23279773656960784, + "learning_rate": 9.345370061542158e-06, + "loss": 0.049, + "step": 4172 + }, + { + "epoch": 2.172306090577824, + "grad_norm": 0.227184511533179, + "learning_rate": 9.334418065552538e-06, + "loss": 0.048, + "step": 4173 + }, + { + "epoch": 2.172826652785008, + "grad_norm": 0.2388318118994056, + "learning_rate": 9.323471017482718e-06, + "loss": 0.0492, + "step": 4174 + }, + { + "epoch": 2.1733472149921917, + "grad_norm": 0.23862458637474987, + "learning_rate": 9.312528920790265e-06, + "loss": 0.0507, + "step": 4175 + }, + { + "epoch": 2.1738677771993755, + "grad_norm": 0.2300694111237474, + "learning_rate": 9.301591778931218e-06, + "loss": 0.0496, + "step": 4176 + }, + { + "epoch": 2.1743883394065593, + "grad_norm": 0.22487725179561208, + "learning_rate": 9.290659595360018e-06, + "loss": 0.0474, + "step": 4177 + }, + { + "epoch": 2.174908901613743, + "grad_norm": 0.24227495997845988, + "learning_rate": 9.27973237352957e-06, + "loss": 0.0484, + "step": 4178 + }, + { + "epoch": 2.175429463820927, + "grad_norm": 0.24971312513542707, + "learning_rate": 9.268810116891205e-06, + "loss": 0.0522, + "step": 4179 + }, + { + "epoch": 2.17595002602811, + "grad_norm": 0.23240074321999316, + "learning_rate": 9.257892828894663e-06, + "loss": 0.0509, + "step": 4180 + }, + { + "epoch": 2.176470588235294, + "grad_norm": 0.23411180744875743, + "learning_rate": 9.24698051298814e-06, + "loss": 0.0507, + "step": 4181 + }, + { + "epoch": 2.1769911504424777, + "grad_norm": 0.2283915870645207, + "learning_rate": 9.236073172618254e-06, + "loss": 0.0496, + "step": 4182 + }, + { + "epoch": 2.1775117126496615, + "grad_norm": 0.21862188186366638, + "learning_rate": 9.225170811230058e-06, + "loss": 0.0488, + "step": 4183 + }, + { + "epoch": 2.1780322748568453, + "grad_norm": 0.25870559139681704, + "learning_rate": 9.214273432267009e-06, + "loss": 0.0491, + "step": 4184 + }, + { + "epoch": 2.178552837064029, + "grad_norm": 0.2338660696928279, + "learning_rate": 9.203381039171022e-06, + "loss": 0.0488, + "step": 4185 + }, + { + "epoch": 2.179073399271213, + "grad_norm": 0.23653181541138485, + "learning_rate": 9.192493635382407e-06, + "loss": 0.0482, + "step": 4186 + }, + { + "epoch": 2.1795939614783966, + "grad_norm": 0.2475444397957008, + "learning_rate": 9.181611224339917e-06, + "loss": 0.051, + "step": 4187 + }, + { + "epoch": 2.1801145236855803, + "grad_norm": 0.23144206027749797, + "learning_rate": 9.170733809480738e-06, + "loss": 0.0478, + "step": 4188 + }, + { + "epoch": 2.180635085892764, + "grad_norm": 0.23938498072235687, + "learning_rate": 9.159861394240444e-06, + "loss": 0.0494, + "step": 4189 + }, + { + "epoch": 2.181155648099948, + "grad_norm": 0.24401500298605283, + "learning_rate": 9.148993982053058e-06, + "loss": 0.05, + "step": 4190 + }, + { + "epoch": 2.1816762103071317, + "grad_norm": 0.23262393333357667, + "learning_rate": 9.138131576351014e-06, + "loss": 0.0487, + "step": 4191 + }, + { + "epoch": 2.1821967725143154, + "grad_norm": 0.23923195938864855, + "learning_rate": 9.127274180565177e-06, + "loss": 0.0497, + "step": 4192 + }, + { + "epoch": 2.182717334721499, + "grad_norm": 0.2385711033015507, + "learning_rate": 9.116421798124794e-06, + "loss": 0.0508, + "step": 4193 + }, + { + "epoch": 2.183237896928683, + "grad_norm": 0.2305410960253734, + "learning_rate": 9.105574432457576e-06, + "loss": 0.0504, + "step": 4194 + }, + { + "epoch": 2.1837584591358667, + "grad_norm": 0.2315059122487158, + "learning_rate": 9.094732086989608e-06, + "loss": 0.0481, + "step": 4195 + }, + { + "epoch": 2.1842790213430505, + "grad_norm": 0.2388138458267806, + "learning_rate": 9.083894765145412e-06, + "loss": 0.0495, + "step": 4196 + }, + { + "epoch": 2.1847995835502343, + "grad_norm": 0.24338846982564916, + "learning_rate": 9.073062470347928e-06, + "loss": 0.0497, + "step": 4197 + }, + { + "epoch": 2.185320145757418, + "grad_norm": 0.24700761283541492, + "learning_rate": 9.062235206018488e-06, + "loss": 0.0512, + "step": 4198 + }, + { + "epoch": 2.185840707964602, + "grad_norm": 0.22202347104845527, + "learning_rate": 9.051412975576849e-06, + "loss": 0.0481, + "step": 4199 + }, + { + "epoch": 2.1863612701717856, + "grad_norm": 0.23160644856981238, + "learning_rate": 9.040595782441172e-06, + "loss": 0.0495, + "step": 4200 + }, + { + "epoch": 2.1868818323789694, + "grad_norm": 0.2192099194389428, + "learning_rate": 9.029783630028044e-06, + "loss": 0.0495, + "step": 4201 + }, + { + "epoch": 2.187402394586153, + "grad_norm": 0.23563639581238982, + "learning_rate": 9.018976521752426e-06, + "loss": 0.0519, + "step": 4202 + }, + { + "epoch": 2.187922956793337, + "grad_norm": 0.2229417962071288, + "learning_rate": 9.008174461027724e-06, + "loss": 0.0482, + "step": 4203 + }, + { + "epoch": 2.1884435190005207, + "grad_norm": 0.23972449679471294, + "learning_rate": 8.997377451265715e-06, + "loss": 0.0513, + "step": 4204 + }, + { + "epoch": 2.1889640812077045, + "grad_norm": 0.226247431389423, + "learning_rate": 8.986585495876605e-06, + "loss": 0.0493, + "step": 4205 + }, + { + "epoch": 2.1894846434148882, + "grad_norm": 0.23325199021404394, + "learning_rate": 8.975798598269002e-06, + "loss": 0.0506, + "step": 4206 + }, + { + "epoch": 2.190005205622072, + "grad_norm": 0.2245553471409382, + "learning_rate": 8.965016761849898e-06, + "loss": 0.0487, + "step": 4207 + }, + { + "epoch": 2.190525767829256, + "grad_norm": 0.23199873840527224, + "learning_rate": 8.954239990024704e-06, + "loss": 0.0485, + "step": 4208 + }, + { + "epoch": 2.1910463300364396, + "grad_norm": 0.22307074327270973, + "learning_rate": 8.943468286197224e-06, + "loss": 0.0477, + "step": 4209 + }, + { + "epoch": 2.191566892243623, + "grad_norm": 0.23538404056571152, + "learning_rate": 8.932701653769676e-06, + "loss": 0.0492, + "step": 4210 + }, + { + "epoch": 2.1920874544508067, + "grad_norm": 0.23621294854326094, + "learning_rate": 8.921940096142645e-06, + "loss": 0.0495, + "step": 4211 + }, + { + "epoch": 2.1926080166579904, + "grad_norm": 0.2286930249072395, + "learning_rate": 8.911183616715148e-06, + "loss": 0.0481, + "step": 4212 + }, + { + "epoch": 2.193128578865174, + "grad_norm": 0.2466558181780694, + "learning_rate": 8.900432218884567e-06, + "loss": 0.0509, + "step": 4213 + }, + { + "epoch": 2.193649141072358, + "grad_norm": 0.2372365759392663, + "learning_rate": 8.8896859060467e-06, + "loss": 0.0497, + "step": 4214 + }, + { + "epoch": 2.1941697032795417, + "grad_norm": 0.2319389953511392, + "learning_rate": 8.878944681595742e-06, + "loss": 0.0496, + "step": 4215 + }, + { + "epoch": 2.1946902654867255, + "grad_norm": 0.23364662530422042, + "learning_rate": 8.868208548924253e-06, + "loss": 0.0491, + "step": 4216 + }, + { + "epoch": 2.1952108276939093, + "grad_norm": 0.24276463476612706, + "learning_rate": 8.857477511423215e-06, + "loss": 0.05, + "step": 4217 + }, + { + "epoch": 2.195731389901093, + "grad_norm": 0.23011307596727132, + "learning_rate": 8.846751572481984e-06, + "loss": 0.0477, + "step": 4218 + }, + { + "epoch": 2.196251952108277, + "grad_norm": 0.2337315315374658, + "learning_rate": 8.836030735488327e-06, + "loss": 0.0489, + "step": 4219 + }, + { + "epoch": 2.1967725143154606, + "grad_norm": 0.2325310579261457, + "learning_rate": 8.825315003828358e-06, + "loss": 0.0465, + "step": 4220 + }, + { + "epoch": 2.1972930765226444, + "grad_norm": 0.22830561027317142, + "learning_rate": 8.814604380886623e-06, + "loss": 0.0476, + "step": 4221 + }, + { + "epoch": 2.197813638729828, + "grad_norm": 0.24012212633480134, + "learning_rate": 8.803898870046023e-06, + "loss": 0.0488, + "step": 4222 + }, + { + "epoch": 2.198334200937012, + "grad_norm": 0.23919713359665457, + "learning_rate": 8.79319847468786e-06, + "loss": 0.0489, + "step": 4223 + }, + { + "epoch": 2.1988547631441957, + "grad_norm": 0.2506742677947288, + "learning_rate": 8.782503198191828e-06, + "loss": 0.0514, + "step": 4224 + }, + { + "epoch": 2.1993753253513795, + "grad_norm": 0.23617636186386712, + "learning_rate": 8.771813043935972e-06, + "loss": 0.0494, + "step": 4225 + }, + { + "epoch": 2.1998958875585632, + "grad_norm": 0.23474992131532096, + "learning_rate": 8.761128015296754e-06, + "loss": 0.0494, + "step": 4226 + }, + { + "epoch": 2.200416449765747, + "grad_norm": 0.24128412011120356, + "learning_rate": 8.750448115649001e-06, + "loss": 0.0506, + "step": 4227 + }, + { + "epoch": 2.200937011972931, + "grad_norm": 0.23586375626059694, + "learning_rate": 8.739773348365928e-06, + "loss": 0.05, + "step": 4228 + }, + { + "epoch": 2.2014575741801146, + "grad_norm": 0.22292550967559244, + "learning_rate": 8.729103716819112e-06, + "loss": 0.0481, + "step": 4229 + }, + { + "epoch": 2.2019781363872983, + "grad_norm": 0.23125396362101913, + "learning_rate": 8.71843922437853e-06, + "loss": 0.0473, + "step": 4230 + }, + { + "epoch": 2.202498698594482, + "grad_norm": 0.22704085751147418, + "learning_rate": 8.707779874412514e-06, + "loss": 0.047, + "step": 4231 + }, + { + "epoch": 2.203019260801666, + "grad_norm": 0.23357200148046692, + "learning_rate": 8.697125670287787e-06, + "loss": 0.0486, + "step": 4232 + }, + { + "epoch": 2.2035398230088497, + "grad_norm": 0.24120365615695358, + "learning_rate": 8.686476615369451e-06, + "loss": 0.0507, + "step": 4233 + }, + { + "epoch": 2.2040603852160334, + "grad_norm": 0.2440416527287281, + "learning_rate": 8.67583271302096e-06, + "loss": 0.0498, + "step": 4234 + }, + { + "epoch": 2.204580947423217, + "grad_norm": 0.22550408972403665, + "learning_rate": 8.665193966604157e-06, + "loss": 0.0461, + "step": 4235 + }, + { + "epoch": 2.205101509630401, + "grad_norm": 0.24657257357860465, + "learning_rate": 8.654560379479257e-06, + "loss": 0.0508, + "step": 4236 + }, + { + "epoch": 2.2056220718375847, + "grad_norm": 0.2378711148974515, + "learning_rate": 8.643931955004839e-06, + "loss": 0.0504, + "step": 4237 + }, + { + "epoch": 2.2061426340447685, + "grad_norm": 0.23414764136210095, + "learning_rate": 8.633308696537865e-06, + "loss": 0.0486, + "step": 4238 + }, + { + "epoch": 2.2066631962519523, + "grad_norm": 0.24236213181260452, + "learning_rate": 8.622690607433644e-06, + "loss": 0.0506, + "step": 4239 + }, + { + "epoch": 2.207183758459136, + "grad_norm": 0.22663158777101486, + "learning_rate": 8.612077691045856e-06, + "loss": 0.0481, + "step": 4240 + }, + { + "epoch": 2.20770432066632, + "grad_norm": 0.23127324675787353, + "learning_rate": 8.601469950726562e-06, + "loss": 0.0494, + "step": 4241 + }, + { + "epoch": 2.2082248828735036, + "grad_norm": 0.23219971597920153, + "learning_rate": 8.59086738982618e-06, + "loss": 0.0482, + "step": 4242 + }, + { + "epoch": 2.2087454450806874, + "grad_norm": 0.22566023224763826, + "learning_rate": 8.580270011693498e-06, + "loss": 0.0501, + "step": 4243 + }, + { + "epoch": 2.2092660072878707, + "grad_norm": 0.22931201717326824, + "learning_rate": 8.569677819675646e-06, + "loss": 0.0469, + "step": 4244 + }, + { + "epoch": 2.2097865694950545, + "grad_norm": 0.22932887506999267, + "learning_rate": 8.55909081711814e-06, + "loss": 0.0474, + "step": 4245 + }, + { + "epoch": 2.2103071317022382, + "grad_norm": 0.23290519077904787, + "learning_rate": 8.548509007364849e-06, + "loss": 0.0484, + "step": 4246 + }, + { + "epoch": 2.210827693909422, + "grad_norm": 0.23050032981873889, + "learning_rate": 8.537932393758008e-06, + "loss": 0.0495, + "step": 4247 + }, + { + "epoch": 2.211348256116606, + "grad_norm": 0.23320169879502017, + "learning_rate": 8.527360979638196e-06, + "loss": 0.0487, + "step": 4248 + }, + { + "epoch": 2.2118688183237896, + "grad_norm": 0.22937417737271626, + "learning_rate": 8.51679476834435e-06, + "loss": 0.0492, + "step": 4249 + }, + { + "epoch": 2.2123893805309733, + "grad_norm": 0.23866247343146085, + "learning_rate": 8.506233763213776e-06, + "loss": 0.0481, + "step": 4250 + }, + { + "epoch": 2.212909942738157, + "grad_norm": 0.2544913382799314, + "learning_rate": 8.495677967582135e-06, + "loss": 0.0493, + "step": 4251 + }, + { + "epoch": 2.213430504945341, + "grad_norm": 0.2335235599829452, + "learning_rate": 8.485127384783446e-06, + "loss": 0.0486, + "step": 4252 + }, + { + "epoch": 2.2139510671525247, + "grad_norm": 0.24283971222288098, + "learning_rate": 8.474582018150054e-06, + "loss": 0.0484, + "step": 4253 + }, + { + "epoch": 2.2144716293597084, + "grad_norm": 0.23164997555170744, + "learning_rate": 8.464041871012687e-06, + "loss": 0.0497, + "step": 4254 + }, + { + "epoch": 2.214992191566892, + "grad_norm": 0.2413814098062649, + "learning_rate": 8.453506946700418e-06, + "loss": 0.0493, + "step": 4255 + }, + { + "epoch": 2.215512753774076, + "grad_norm": 0.2454607147482343, + "learning_rate": 8.442977248540667e-06, + "loss": 0.0479, + "step": 4256 + }, + { + "epoch": 2.2160333159812597, + "grad_norm": 0.23779862580138247, + "learning_rate": 8.4324527798592e-06, + "loss": 0.0497, + "step": 4257 + }, + { + "epoch": 2.2165538781884435, + "grad_norm": 0.23762422063366767, + "learning_rate": 8.421933543980126e-06, + "loss": 0.0505, + "step": 4258 + }, + { + "epoch": 2.2170744403956273, + "grad_norm": 0.23130632658477607, + "learning_rate": 8.411419544225913e-06, + "loss": 0.0487, + "step": 4259 + }, + { + "epoch": 2.217595002602811, + "grad_norm": 0.22356019790315676, + "learning_rate": 8.400910783917377e-06, + "loss": 0.0472, + "step": 4260 + }, + { + "epoch": 2.218115564809995, + "grad_norm": 0.22887925032024237, + "learning_rate": 8.390407266373674e-06, + "loss": 0.0476, + "step": 4261 + }, + { + "epoch": 2.2186361270171786, + "grad_norm": 0.23463897003025824, + "learning_rate": 8.379908994912294e-06, + "loss": 0.049, + "step": 4262 + }, + { + "epoch": 2.2191566892243624, + "grad_norm": 0.23240197161886428, + "learning_rate": 8.369415972849088e-06, + "loss": 0.0495, + "step": 4263 + }, + { + "epoch": 2.219677251431546, + "grad_norm": 0.22919016167185535, + "learning_rate": 8.358928203498236e-06, + "loss": 0.0502, + "step": 4264 + }, + { + "epoch": 2.22019781363873, + "grad_norm": 0.2202440420250215, + "learning_rate": 8.348445690172274e-06, + "loss": 0.0476, + "step": 4265 + }, + { + "epoch": 2.2207183758459137, + "grad_norm": 0.23110580036654815, + "learning_rate": 8.337968436182054e-06, + "loss": 0.0503, + "step": 4266 + }, + { + "epoch": 2.2212389380530975, + "grad_norm": 0.23758638402574997, + "learning_rate": 8.327496444836793e-06, + "loss": 0.0509, + "step": 4267 + }, + { + "epoch": 2.2217595002602812, + "grad_norm": 0.22791558989136954, + "learning_rate": 8.317029719444016e-06, + "loss": 0.0469, + "step": 4268 + }, + { + "epoch": 2.222280062467465, + "grad_norm": 0.23245148595947804, + "learning_rate": 8.306568263309616e-06, + "loss": 0.049, + "step": 4269 + }, + { + "epoch": 2.222800624674649, + "grad_norm": 0.22057289323649362, + "learning_rate": 8.296112079737808e-06, + "loss": 0.047, + "step": 4270 + }, + { + "epoch": 2.2233211868818326, + "grad_norm": 0.2291814857700997, + "learning_rate": 8.28566117203113e-06, + "loss": 0.0493, + "step": 4271 + }, + { + "epoch": 2.2238417490890163, + "grad_norm": 0.23917673247453083, + "learning_rate": 8.275215543490475e-06, + "loss": 0.0492, + "step": 4272 + }, + { + "epoch": 2.2243623112962, + "grad_norm": 0.23646886044374246, + "learning_rate": 8.264775197415053e-06, + "loss": 0.0495, + "step": 4273 + }, + { + "epoch": 2.2248828735033834, + "grad_norm": 0.23541029580737305, + "learning_rate": 8.254340137102426e-06, + "loss": 0.0485, + "step": 4274 + }, + { + "epoch": 2.225403435710567, + "grad_norm": 0.25343510785437356, + "learning_rate": 8.243910365848448e-06, + "loss": 0.051, + "step": 4275 + }, + { + "epoch": 2.225923997917751, + "grad_norm": 0.2298835859276625, + "learning_rate": 8.233485886947346e-06, + "loss": 0.0489, + "step": 4276 + }, + { + "epoch": 2.2264445601249347, + "grad_norm": 0.22912963408681325, + "learning_rate": 8.22306670369164e-06, + "loss": 0.0487, + "step": 4277 + }, + { + "epoch": 2.2269651223321185, + "grad_norm": 0.23556170673164437, + "learning_rate": 8.2126528193722e-06, + "loss": 0.0485, + "step": 4278 + }, + { + "epoch": 2.2274856845393023, + "grad_norm": 0.22736010372358842, + "learning_rate": 8.202244237278223e-06, + "loss": 0.0474, + "step": 4279 + }, + { + "epoch": 2.228006246746486, + "grad_norm": 0.2427914749744837, + "learning_rate": 8.19184096069721e-06, + "loss": 0.0501, + "step": 4280 + }, + { + "epoch": 2.22852680895367, + "grad_norm": 0.2351054578306983, + "learning_rate": 8.181442992915e-06, + "loss": 0.0486, + "step": 4281 + }, + { + "epoch": 2.2290473711608536, + "grad_norm": 0.22778548939677618, + "learning_rate": 8.171050337215767e-06, + "loss": 0.0483, + "step": 4282 + }, + { + "epoch": 2.2295679333680374, + "grad_norm": 0.22980336566734544, + "learning_rate": 8.160662996881996e-06, + "loss": 0.0468, + "step": 4283 + }, + { + "epoch": 2.230088495575221, + "grad_norm": 0.23245660876536978, + "learning_rate": 8.150280975194478e-06, + "loss": 0.0477, + "step": 4284 + }, + { + "epoch": 2.230609057782405, + "grad_norm": 0.22749204175239007, + "learning_rate": 8.139904275432354e-06, + "loss": 0.0489, + "step": 4285 + }, + { + "epoch": 2.2311296199895887, + "grad_norm": 0.23848791969754046, + "learning_rate": 8.129532900873051e-06, + "loss": 0.0495, + "step": 4286 + }, + { + "epoch": 2.2316501821967725, + "grad_norm": 0.24418712222939992, + "learning_rate": 8.119166854792345e-06, + "loss": 0.048, + "step": 4287 + }, + { + "epoch": 2.2321707444039562, + "grad_norm": 0.23832224809407973, + "learning_rate": 8.10880614046432e-06, + "loss": 0.0502, + "step": 4288 + }, + { + "epoch": 2.23269130661114, + "grad_norm": 0.2349337209019424, + "learning_rate": 8.098450761161356e-06, + "loss": 0.0488, + "step": 4289 + }, + { + "epoch": 2.233211868818324, + "grad_norm": 0.236101048702289, + "learning_rate": 8.08810072015417e-06, + "loss": 0.0486, + "step": 4290 + }, + { + "epoch": 2.2337324310255076, + "grad_norm": 0.2436594526550554, + "learning_rate": 8.07775602071179e-06, + "loss": 0.0502, + "step": 4291 + }, + { + "epoch": 2.2342529932326913, + "grad_norm": 0.24215638153778066, + "learning_rate": 8.067416666101562e-06, + "loss": 0.0501, + "step": 4292 + }, + { + "epoch": 2.234773555439875, + "grad_norm": 0.22910339256480491, + "learning_rate": 8.057082659589115e-06, + "loss": 0.0485, + "step": 4293 + }, + { + "epoch": 2.235294117647059, + "grad_norm": 0.23759220672459413, + "learning_rate": 8.046754004438429e-06, + "loss": 0.0501, + "step": 4294 + }, + { + "epoch": 2.2358146798542426, + "grad_norm": 0.2396620539258785, + "learning_rate": 8.036430703911754e-06, + "loss": 0.0505, + "step": 4295 + }, + { + "epoch": 2.2363352420614264, + "grad_norm": 0.21656808633265354, + "learning_rate": 8.026112761269683e-06, + "loss": 0.0467, + "step": 4296 + }, + { + "epoch": 2.23685580426861, + "grad_norm": 0.23694210045408912, + "learning_rate": 8.015800179771105e-06, + "loss": 0.0492, + "step": 4297 + }, + { + "epoch": 2.237376366475794, + "grad_norm": 0.23066475593043279, + "learning_rate": 8.005492962673197e-06, + "loss": 0.0485, + "step": 4298 + }, + { + "epoch": 2.2378969286829777, + "grad_norm": 0.23780414518123882, + "learning_rate": 7.99519111323147e-06, + "loss": 0.0502, + "step": 4299 + }, + { + "epoch": 2.2384174908901615, + "grad_norm": 0.22874416790471078, + "learning_rate": 7.984894634699725e-06, + "loss": 0.0477, + "step": 4300 + }, + { + "epoch": 2.2389380530973453, + "grad_norm": 0.225516589384533, + "learning_rate": 7.974603530330069e-06, + "loss": 0.0478, + "step": 4301 + }, + { + "epoch": 2.239458615304529, + "grad_norm": 0.24222066580737353, + "learning_rate": 7.964317803372918e-06, + "loss": 0.0509, + "step": 4302 + }, + { + "epoch": 2.239979177511713, + "grad_norm": 0.24287142818992802, + "learning_rate": 7.95403745707698e-06, + "loss": 0.0497, + "step": 4303 + }, + { + "epoch": 2.2404997397188966, + "grad_norm": 0.24632751357990357, + "learning_rate": 7.943762494689252e-06, + "loss": 0.0503, + "step": 4304 + }, + { + "epoch": 2.2410203019260804, + "grad_norm": 0.2450475168217388, + "learning_rate": 7.93349291945506e-06, + "loss": 0.0484, + "step": 4305 + }, + { + "epoch": 2.241540864133264, + "grad_norm": 0.24379993992170595, + "learning_rate": 7.92322873461801e-06, + "loss": 0.0478, + "step": 4306 + }, + { + "epoch": 2.242061426340448, + "grad_norm": 0.24148406455105237, + "learning_rate": 7.912969943420018e-06, + "loss": 0.049, + "step": 4307 + }, + { + "epoch": 2.2425819885476312, + "grad_norm": 0.25211178202149287, + "learning_rate": 7.902716549101272e-06, + "loss": 0.0519, + "step": 4308 + }, + { + "epoch": 2.243102550754815, + "grad_norm": 0.22380206855764131, + "learning_rate": 7.892468554900278e-06, + "loss": 0.0457, + "step": 4309 + }, + { + "epoch": 2.243623112961999, + "grad_norm": 0.23150832565863866, + "learning_rate": 7.88222596405383e-06, + "loss": 0.048, + "step": 4310 + }, + { + "epoch": 2.2441436751691826, + "grad_norm": 0.24560507817753072, + "learning_rate": 7.871988779797024e-06, + "loss": 0.0509, + "step": 4311 + }, + { + "epoch": 2.2446642373763663, + "grad_norm": 0.22181091893953384, + "learning_rate": 7.861757005363232e-06, + "loss": 0.0475, + "step": 4312 + }, + { + "epoch": 2.24518479958355, + "grad_norm": 0.23679248354187557, + "learning_rate": 7.851530643984111e-06, + "loss": 0.0484, + "step": 4313 + }, + { + "epoch": 2.245705361790734, + "grad_norm": 0.23595668958031096, + "learning_rate": 7.841309698889638e-06, + "loss": 0.0501, + "step": 4314 + }, + { + "epoch": 2.2462259239979177, + "grad_norm": 0.22600608264316838, + "learning_rate": 7.831094173308056e-06, + "loss": 0.0483, + "step": 4315 + }, + { + "epoch": 2.2467464862051014, + "grad_norm": 0.23927132977588947, + "learning_rate": 7.820884070465914e-06, + "loss": 0.0497, + "step": 4316 + }, + { + "epoch": 2.247267048412285, + "grad_norm": 0.2291797616221194, + "learning_rate": 7.810679393588025e-06, + "loss": 0.049, + "step": 4317 + }, + { + "epoch": 2.247787610619469, + "grad_norm": 0.2252726239114742, + "learning_rate": 7.800480145897501e-06, + "loss": 0.0479, + "step": 4318 + }, + { + "epoch": 2.2483081728266527, + "grad_norm": 0.2209306053422971, + "learning_rate": 7.790286330615749e-06, + "loss": 0.0466, + "step": 4319 + }, + { + "epoch": 2.2488287350338365, + "grad_norm": 0.21256290410992668, + "learning_rate": 7.780097950962447e-06, + "loss": 0.0459, + "step": 4320 + }, + { + "epoch": 2.2493492972410203, + "grad_norm": 0.23304751234583473, + "learning_rate": 7.76991501015556e-06, + "loss": 0.0494, + "step": 4321 + }, + { + "epoch": 2.249869859448204, + "grad_norm": 0.24109946012783512, + "learning_rate": 7.759737511411325e-06, + "loss": 0.05, + "step": 4322 + }, + { + "epoch": 2.250390421655388, + "grad_norm": 0.23100844503880485, + "learning_rate": 7.749565457944274e-06, + "loss": 0.0477, + "step": 4323 + }, + { + "epoch": 2.2509109838625716, + "grad_norm": 0.22756640638272935, + "learning_rate": 7.73939885296722e-06, + "loss": 0.0488, + "step": 4324 + }, + { + "epoch": 2.2514315460697554, + "grad_norm": 0.2411358442421993, + "learning_rate": 7.729237699691254e-06, + "loss": 0.0514, + "step": 4325 + }, + { + "epoch": 2.251952108276939, + "grad_norm": 0.2385049139121903, + "learning_rate": 7.719082001325728e-06, + "loss": 0.0489, + "step": 4326 + }, + { + "epoch": 2.252472670484123, + "grad_norm": 0.24245891150143284, + "learning_rate": 7.70893176107829e-06, + "loss": 0.0492, + "step": 4327 + }, + { + "epoch": 2.2529932326913067, + "grad_norm": 0.23962567363070575, + "learning_rate": 7.698786982154857e-06, + "loss": 0.0486, + "step": 4328 + }, + { + "epoch": 2.2535137948984905, + "grad_norm": 0.23067229481030735, + "learning_rate": 7.688647667759633e-06, + "loss": 0.0482, + "step": 4329 + }, + { + "epoch": 2.2540343571056742, + "grad_norm": 0.23572306878031657, + "learning_rate": 7.678513821095076e-06, + "loss": 0.0488, + "step": 4330 + }, + { + "epoch": 2.254554919312858, + "grad_norm": 0.22767821391268697, + "learning_rate": 7.668385445361923e-06, + "loss": 0.0478, + "step": 4331 + }, + { + "epoch": 2.255075481520042, + "grad_norm": 0.22836659984706298, + "learning_rate": 7.658262543759184e-06, + "loss": 0.0465, + "step": 4332 + }, + { + "epoch": 2.2555960437272256, + "grad_norm": 0.2393729125564174, + "learning_rate": 7.648145119484152e-06, + "loss": 0.0482, + "step": 4333 + }, + { + "epoch": 2.2561166059344093, + "grad_norm": 0.23094639008000092, + "learning_rate": 7.638033175732385e-06, + "loss": 0.0483, + "step": 4334 + }, + { + "epoch": 2.256637168141593, + "grad_norm": 0.22558491929936555, + "learning_rate": 7.627926715697689e-06, + "loss": 0.0473, + "step": 4335 + }, + { + "epoch": 2.257157730348777, + "grad_norm": 0.23284991884566408, + "learning_rate": 7.617825742572163e-06, + "loss": 0.0488, + "step": 4336 + }, + { + "epoch": 2.25767829255596, + "grad_norm": 0.23896169808363904, + "learning_rate": 7.607730259546164e-06, + "loss": 0.0491, + "step": 4337 + }, + { + "epoch": 2.258198854763144, + "grad_norm": 0.229443652276642, + "learning_rate": 7.597640269808323e-06, + "loss": 0.0483, + "step": 4338 + }, + { + "epoch": 2.2587194169703277, + "grad_norm": 0.22477269530377628, + "learning_rate": 7.5875557765455245e-06, + "loss": 0.0485, + "step": 4339 + }, + { + "epoch": 2.2592399791775115, + "grad_norm": 0.2248750508583482, + "learning_rate": 7.577476782942905e-06, + "loss": 0.046, + "step": 4340 + }, + { + "epoch": 2.2597605413846953, + "grad_norm": 0.24282761232837097, + "learning_rate": 7.567403292183892e-06, + "loss": 0.0488, + "step": 4341 + }, + { + "epoch": 2.260281103591879, + "grad_norm": 0.24559688750387063, + "learning_rate": 7.557335307450164e-06, + "loss": 0.0487, + "step": 4342 + }, + { + "epoch": 2.260801665799063, + "grad_norm": 0.2279229009237183, + "learning_rate": 7.547272831921665e-06, + "loss": 0.0473, + "step": 4343 + }, + { + "epoch": 2.2613222280062466, + "grad_norm": 0.24163113811010245, + "learning_rate": 7.5372158687765784e-06, + "loss": 0.0488, + "step": 4344 + }, + { + "epoch": 2.2618427902134304, + "grad_norm": 0.22542343579424412, + "learning_rate": 7.527164421191369e-06, + "loss": 0.0473, + "step": 4345 + }, + { + "epoch": 2.262363352420614, + "grad_norm": 0.24354092974905714, + "learning_rate": 7.51711849234075e-06, + "loss": 0.0493, + "step": 4346 + }, + { + "epoch": 2.262883914627798, + "grad_norm": 0.2315919364564454, + "learning_rate": 7.507078085397701e-06, + "loss": 0.0481, + "step": 4347 + }, + { + "epoch": 2.2634044768349817, + "grad_norm": 0.23710368286301195, + "learning_rate": 7.497043203533444e-06, + "loss": 0.0481, + "step": 4348 + }, + { + "epoch": 2.2639250390421655, + "grad_norm": 0.23553140969761996, + "learning_rate": 7.487013849917454e-06, + "loss": 0.0479, + "step": 4349 + }, + { + "epoch": 2.2644456012493492, + "grad_norm": 0.24202468474373762, + "learning_rate": 7.476990027717473e-06, + "loss": 0.0502, + "step": 4350 + }, + { + "epoch": 2.264966163456533, + "grad_norm": 0.24252991253463546, + "learning_rate": 7.46697174009949e-06, + "loss": 0.0506, + "step": 4351 + }, + { + "epoch": 2.265486725663717, + "grad_norm": 0.24450739997748402, + "learning_rate": 7.456958990227761e-06, + "loss": 0.0493, + "step": 4352 + }, + { + "epoch": 2.2660072878709006, + "grad_norm": 0.23512299197952063, + "learning_rate": 7.446951781264755e-06, + "loss": 0.05, + "step": 4353 + }, + { + "epoch": 2.2665278500780843, + "grad_norm": 0.2349636567214315, + "learning_rate": 7.436950116371225e-06, + "loss": 0.0478, + "step": 4354 + }, + { + "epoch": 2.267048412285268, + "grad_norm": 0.22313490108520748, + "learning_rate": 7.4269539987061625e-06, + "loss": 0.0489, + "step": 4355 + }, + { + "epoch": 2.267568974492452, + "grad_norm": 0.24266098411924822, + "learning_rate": 7.416963431426815e-06, + "loss": 0.0508, + "step": 4356 + }, + { + "epoch": 2.2680895366996356, + "grad_norm": 0.22709180519413907, + "learning_rate": 7.406978417688659e-06, + "loss": 0.0476, + "step": 4357 + }, + { + "epoch": 2.2686100989068194, + "grad_norm": 0.2252843789851944, + "learning_rate": 7.396998960645418e-06, + "loss": 0.0475, + "step": 4358 + }, + { + "epoch": 2.269130661114003, + "grad_norm": 0.2280964931147517, + "learning_rate": 7.387025063449082e-06, + "loss": 0.0479, + "step": 4359 + }, + { + "epoch": 2.269651223321187, + "grad_norm": 0.22442020507845567, + "learning_rate": 7.377056729249865e-06, + "loss": 0.0474, + "step": 4360 + }, + { + "epoch": 2.2701717855283707, + "grad_norm": 0.23536807823855144, + "learning_rate": 7.3670939611962446e-06, + "loss": 0.0471, + "step": 4361 + }, + { + "epoch": 2.2706923477355545, + "grad_norm": 0.22341853555386254, + "learning_rate": 7.357136762434908e-06, + "loss": 0.0467, + "step": 4362 + }, + { + "epoch": 2.2712129099427383, + "grad_norm": 0.2314457941140156, + "learning_rate": 7.347185136110807e-06, + "loss": 0.0474, + "step": 4363 + }, + { + "epoch": 2.271733472149922, + "grad_norm": 0.23444389464917653, + "learning_rate": 7.337239085367134e-06, + "loss": 0.0473, + "step": 4364 + }, + { + "epoch": 2.272254034357106, + "grad_norm": 0.22895085279319216, + "learning_rate": 7.32729861334531e-06, + "loss": 0.0482, + "step": 4365 + }, + { + "epoch": 2.2727745965642896, + "grad_norm": 0.2301198469063954, + "learning_rate": 7.317363723185017e-06, + "loss": 0.0466, + "step": 4366 + }, + { + "epoch": 2.2732951587714734, + "grad_norm": 0.2375738571090302, + "learning_rate": 7.3074344180241225e-06, + "loss": 0.0493, + "step": 4367 + }, + { + "epoch": 2.273815720978657, + "grad_norm": 0.23547330811705733, + "learning_rate": 7.297510700998783e-06, + "loss": 0.0467, + "step": 4368 + }, + { + "epoch": 2.274336283185841, + "grad_norm": 0.23704718711434258, + "learning_rate": 7.2875925752433655e-06, + "loss": 0.0484, + "step": 4369 + }, + { + "epoch": 2.2748568453930247, + "grad_norm": 0.23007390803788796, + "learning_rate": 7.277680043890475e-06, + "loss": 0.0462, + "step": 4370 + }, + { + "epoch": 2.2753774076002085, + "grad_norm": 0.2382599584192981, + "learning_rate": 7.267773110070964e-06, + "loss": 0.0489, + "step": 4371 + }, + { + "epoch": 2.2758979698073922, + "grad_norm": 0.23792105081665424, + "learning_rate": 7.25787177691388e-06, + "loss": 0.0485, + "step": 4372 + }, + { + "epoch": 2.2764185320145756, + "grad_norm": 0.22893605244305398, + "learning_rate": 7.2479760475465395e-06, + "loss": 0.0457, + "step": 4373 + }, + { + "epoch": 2.2769390942217593, + "grad_norm": 0.2372593732772901, + "learning_rate": 7.238085925094468e-06, + "loss": 0.0486, + "step": 4374 + }, + { + "epoch": 2.277459656428943, + "grad_norm": 0.23222430381532452, + "learning_rate": 7.22820141268144e-06, + "loss": 0.0473, + "step": 4375 + }, + { + "epoch": 2.277980218636127, + "grad_norm": 0.23165532275317738, + "learning_rate": 7.2183225134294345e-06, + "loss": 0.0489, + "step": 4376 + }, + { + "epoch": 2.2785007808433106, + "grad_norm": 0.22857397337810045, + "learning_rate": 7.2084492304586586e-06, + "loss": 0.0485, + "step": 4377 + }, + { + "epoch": 2.2790213430504944, + "grad_norm": 0.2308547953684119, + "learning_rate": 7.19858156688756e-06, + "loss": 0.0459, + "step": 4378 + }, + { + "epoch": 2.279541905257678, + "grad_norm": 0.22575144523819185, + "learning_rate": 7.188719525832813e-06, + "loss": 0.0476, + "step": 4379 + }, + { + "epoch": 2.280062467464862, + "grad_norm": 0.24027575061928344, + "learning_rate": 7.1788631104093145e-06, + "loss": 0.0496, + "step": 4380 + }, + { + "epoch": 2.2805830296720457, + "grad_norm": 0.23540572654705236, + "learning_rate": 7.1690123237301596e-06, + "loss": 0.0501, + "step": 4381 + }, + { + "epoch": 2.2811035918792295, + "grad_norm": 0.24772965477869333, + "learning_rate": 7.159167168906694e-06, + "loss": 0.049, + "step": 4382 + }, + { + "epoch": 2.2816241540864133, + "grad_norm": 0.2320276288812207, + "learning_rate": 7.149327649048482e-06, + "loss": 0.047, + "step": 4383 + }, + { + "epoch": 2.282144716293597, + "grad_norm": 0.23303073974073554, + "learning_rate": 7.1394937672633e-06, + "loss": 0.0481, + "step": 4384 + }, + { + "epoch": 2.282665278500781, + "grad_norm": 0.2353449984385499, + "learning_rate": 7.129665526657145e-06, + "loss": 0.0479, + "step": 4385 + }, + { + "epoch": 2.2831858407079646, + "grad_norm": 0.23270277910500065, + "learning_rate": 7.119842930334222e-06, + "loss": 0.0479, + "step": 4386 + }, + { + "epoch": 2.2837064029151484, + "grad_norm": 0.24467249136218208, + "learning_rate": 7.110025981396975e-06, + "loss": 0.0503, + "step": 4387 + }, + { + "epoch": 2.284226965122332, + "grad_norm": 0.23634108219594238, + "learning_rate": 7.100214682946049e-06, + "loss": 0.0483, + "step": 4388 + }, + { + "epoch": 2.284747527329516, + "grad_norm": 0.2243524743535289, + "learning_rate": 7.090409038080317e-06, + "loss": 0.0477, + "step": 4389 + }, + { + "epoch": 2.2852680895366997, + "grad_norm": 0.2272063584018309, + "learning_rate": 7.080609049896844e-06, + "loss": 0.0486, + "step": 4390 + }, + { + "epoch": 2.2857886517438835, + "grad_norm": 0.22104862431300737, + "learning_rate": 7.0708147214909315e-06, + "loss": 0.0479, + "step": 4391 + }, + { + "epoch": 2.2863092139510672, + "grad_norm": 0.2324349902565072, + "learning_rate": 7.06102605595608e-06, + "loss": 0.0489, + "step": 4392 + }, + { + "epoch": 2.286829776158251, + "grad_norm": 0.23945786205814168, + "learning_rate": 7.051243056384016e-06, + "loss": 0.0473, + "step": 4393 + }, + { + "epoch": 2.2873503383654348, + "grad_norm": 0.2290382839925783, + "learning_rate": 7.04146572586466e-06, + "loss": 0.0476, + "step": 4394 + }, + { + "epoch": 2.2878709005726185, + "grad_norm": 0.22924448362356376, + "learning_rate": 7.031694067486136e-06, + "loss": 0.0474, + "step": 4395 + }, + { + "epoch": 2.2883914627798023, + "grad_norm": 0.23714424830350492, + "learning_rate": 7.0219280843348e-06, + "loss": 0.05, + "step": 4396 + }, + { + "epoch": 2.288912024986986, + "grad_norm": 0.2261718669538399, + "learning_rate": 7.012167779495201e-06, + "loss": 0.0471, + "step": 4397 + }, + { + "epoch": 2.28943258719417, + "grad_norm": 0.22758504532465043, + "learning_rate": 7.002413156050108e-06, + "loss": 0.0467, + "step": 4398 + }, + { + "epoch": 2.2899531494013536, + "grad_norm": 0.23789065938410056, + "learning_rate": 6.9926642170804665e-06, + "loss": 0.0485, + "step": 4399 + }, + { + "epoch": 2.2904737116085374, + "grad_norm": 0.24091702145465044, + "learning_rate": 6.982920965665457e-06, + "loss": 0.0455, + "step": 4400 + }, + { + "epoch": 2.2909942738157207, + "grad_norm": 0.2359330189743294, + "learning_rate": 6.9731834048824465e-06, + "loss": 0.0486, + "step": 4401 + }, + { + "epoch": 2.2915148360229045, + "grad_norm": 0.23437207513741382, + "learning_rate": 6.963451537807023e-06, + "loss": 0.0473, + "step": 4402 + }, + { + "epoch": 2.2920353982300883, + "grad_norm": 0.23735518742716438, + "learning_rate": 6.953725367512951e-06, + "loss": 0.0476, + "step": 4403 + }, + { + "epoch": 2.292555960437272, + "grad_norm": 0.2462383376150093, + "learning_rate": 6.944004897072201e-06, + "loss": 0.0487, + "step": 4404 + }, + { + "epoch": 2.293076522644456, + "grad_norm": 0.2411982549992137, + "learning_rate": 6.934290129554957e-06, + "loss": 0.0477, + "step": 4405 + }, + { + "epoch": 2.2935970848516396, + "grad_norm": 0.218847924533852, + "learning_rate": 6.924581068029598e-06, + "loss": 0.0456, + "step": 4406 + }, + { + "epoch": 2.2941176470588234, + "grad_norm": 0.22811627743109275, + "learning_rate": 6.914877715562704e-06, + "loss": 0.0464, + "step": 4407 + }, + { + "epoch": 2.294638209266007, + "grad_norm": 0.2463222783668148, + "learning_rate": 6.905180075219025e-06, + "loss": 0.0476, + "step": 4408 + }, + { + "epoch": 2.295158771473191, + "grad_norm": 0.2494378835645998, + "learning_rate": 6.895488150061541e-06, + "loss": 0.0481, + "step": 4409 + }, + { + "epoch": 2.2956793336803747, + "grad_norm": 0.22371944092940846, + "learning_rate": 6.88580194315141e-06, + "loss": 0.0456, + "step": 4410 + }, + { + "epoch": 2.2961998958875585, + "grad_norm": 0.23726791642543021, + "learning_rate": 6.876121457547996e-06, + "loss": 0.0477, + "step": 4411 + }, + { + "epoch": 2.2967204580947422, + "grad_norm": 0.22025059869257912, + "learning_rate": 6.866446696308837e-06, + "loss": 0.0444, + "step": 4412 + }, + { + "epoch": 2.297241020301926, + "grad_norm": 0.2373689563171938, + "learning_rate": 6.856777662489669e-06, + "loss": 0.0473, + "step": 4413 + }, + { + "epoch": 2.29776158250911, + "grad_norm": 0.22953651254536686, + "learning_rate": 6.847114359144427e-06, + "loss": 0.0476, + "step": 4414 + }, + { + "epoch": 2.2982821447162936, + "grad_norm": 0.2314904782948286, + "learning_rate": 6.83745678932523e-06, + "loss": 0.0475, + "step": 4415 + }, + { + "epoch": 2.2988027069234773, + "grad_norm": 0.23068456271896137, + "learning_rate": 6.8278049560824035e-06, + "loss": 0.0497, + "step": 4416 + }, + { + "epoch": 2.299323269130661, + "grad_norm": 0.22509485065033707, + "learning_rate": 6.818158862464422e-06, + "loss": 0.0473, + "step": 4417 + }, + { + "epoch": 2.299843831337845, + "grad_norm": 0.23754030254959319, + "learning_rate": 6.8085185115179836e-06, + "loss": 0.0484, + "step": 4418 + }, + { + "epoch": 2.3003643935450286, + "grad_norm": 0.23154423326948806, + "learning_rate": 6.798883906287956e-06, + "loss": 0.0476, + "step": 4419 + }, + { + "epoch": 2.3008849557522124, + "grad_norm": 0.2276877249333266, + "learning_rate": 6.789255049817406e-06, + "loss": 0.0474, + "step": 4420 + }, + { + "epoch": 2.301405517959396, + "grad_norm": 0.2455145987692723, + "learning_rate": 6.779631945147566e-06, + "loss": 0.0484, + "step": 4421 + }, + { + "epoch": 2.30192608016658, + "grad_norm": 0.23746214997113513, + "learning_rate": 6.770014595317853e-06, + "loss": 0.0464, + "step": 4422 + }, + { + "epoch": 2.3024466423737637, + "grad_norm": 0.22566035826376565, + "learning_rate": 6.760403003365884e-06, + "loss": 0.0471, + "step": 4423 + }, + { + "epoch": 2.3029672045809475, + "grad_norm": 0.23320331066661007, + "learning_rate": 6.750797172327442e-06, + "loss": 0.0492, + "step": 4424 + }, + { + "epoch": 2.3034877667881313, + "grad_norm": 0.22163307688943074, + "learning_rate": 6.741197105236505e-06, + "loss": 0.0456, + "step": 4425 + }, + { + "epoch": 2.304008328995315, + "grad_norm": 0.24047914367787057, + "learning_rate": 6.731602805125206e-06, + "loss": 0.0492, + "step": 4426 + }, + { + "epoch": 2.304528891202499, + "grad_norm": 0.22844822202025908, + "learning_rate": 6.72201427502388e-06, + "loss": 0.0477, + "step": 4427 + }, + { + "epoch": 2.3050494534096826, + "grad_norm": 0.22813096140516748, + "learning_rate": 6.712431517961029e-06, + "loss": 0.0472, + "step": 4428 + }, + { + "epoch": 2.3055700156168664, + "grad_norm": 0.22811760187365832, + "learning_rate": 6.702854536963343e-06, + "loss": 0.0472, + "step": 4429 + }, + { + "epoch": 2.30609057782405, + "grad_norm": 0.23994023415059773, + "learning_rate": 6.69328333505567e-06, + "loss": 0.0484, + "step": 4430 + }, + { + "epoch": 2.306611140031234, + "grad_norm": 0.23874833178382263, + "learning_rate": 6.683717915261034e-06, + "loss": 0.0498, + "step": 4431 + }, + { + "epoch": 2.3071317022384177, + "grad_norm": 0.2331323610830941, + "learning_rate": 6.674158280600645e-06, + "loss": 0.0479, + "step": 4432 + }, + { + "epoch": 2.3076522644456015, + "grad_norm": 0.23190741443914148, + "learning_rate": 6.6646044340938854e-06, + "loss": 0.0474, + "step": 4433 + }, + { + "epoch": 2.3081728266527852, + "grad_norm": 0.23628413869646864, + "learning_rate": 6.655056378758298e-06, + "loss": 0.0498, + "step": 4434 + }, + { + "epoch": 2.308693388859969, + "grad_norm": 0.2313885805739463, + "learning_rate": 6.645514117609616e-06, + "loss": 0.0475, + "step": 4435 + }, + { + "epoch": 2.3092139510671528, + "grad_norm": 0.2233459799496242, + "learning_rate": 6.6359776536617096e-06, + "loss": 0.0467, + "step": 4436 + }, + { + "epoch": 2.309734513274336, + "grad_norm": 0.2363879717090792, + "learning_rate": 6.626446989926652e-06, + "loss": 0.0484, + "step": 4437 + }, + { + "epoch": 2.31025507548152, + "grad_norm": 0.23027837691324807, + "learning_rate": 6.616922129414671e-06, + "loss": 0.0462, + "step": 4438 + }, + { + "epoch": 2.3107756376887036, + "grad_norm": 0.22061521966437242, + "learning_rate": 6.6074030751341496e-06, + "loss": 0.046, + "step": 4439 + }, + { + "epoch": 2.3112961998958874, + "grad_norm": 0.236489044723267, + "learning_rate": 6.597889830091664e-06, + "loss": 0.0491, + "step": 4440 + }, + { + "epoch": 2.311816762103071, + "grad_norm": 0.2256938496890428, + "learning_rate": 6.5883823972919205e-06, + "loss": 0.0463, + "step": 4441 + }, + { + "epoch": 2.312337324310255, + "grad_norm": 0.23195296992115286, + "learning_rate": 6.5788807797378196e-06, + "loss": 0.0486, + "step": 4442 + }, + { + "epoch": 2.3128578865174387, + "grad_norm": 0.2274872199904384, + "learning_rate": 6.569384980430415e-06, + "loss": 0.0466, + "step": 4443 + }, + { + "epoch": 2.3133784487246225, + "grad_norm": 0.2438659322406287, + "learning_rate": 6.559895002368927e-06, + "loss": 0.0502, + "step": 4444 + }, + { + "epoch": 2.3138990109318063, + "grad_norm": 0.24195755478933637, + "learning_rate": 6.5504108485507175e-06, + "loss": 0.0493, + "step": 4445 + }, + { + "epoch": 2.31441957313899, + "grad_norm": 0.2312187971962166, + "learning_rate": 6.5409325219713325e-06, + "loss": 0.0468, + "step": 4446 + }, + { + "epoch": 2.314940135346174, + "grad_norm": 0.23686834836229043, + "learning_rate": 6.531460025624475e-06, + "loss": 0.0473, + "step": 4447 + }, + { + "epoch": 2.3154606975533576, + "grad_norm": 0.22867574310554584, + "learning_rate": 6.521993362501988e-06, + "loss": 0.0458, + "step": 4448 + }, + { + "epoch": 2.3159812597605414, + "grad_norm": 0.23825031277590675, + "learning_rate": 6.512532535593896e-06, + "loss": 0.0474, + "step": 4449 + }, + { + "epoch": 2.316501821967725, + "grad_norm": 0.23330630762399085, + "learning_rate": 6.503077547888353e-06, + "loss": 0.047, + "step": 4450 + }, + { + "epoch": 2.317022384174909, + "grad_norm": 0.22866429601766594, + "learning_rate": 6.493628402371693e-06, + "loss": 0.047, + "step": 4451 + }, + { + "epoch": 2.3175429463820927, + "grad_norm": 0.23326929579367334, + "learning_rate": 6.484185102028398e-06, + "loss": 0.0467, + "step": 4452 + }, + { + "epoch": 2.3180635085892765, + "grad_norm": 0.21423278359694856, + "learning_rate": 6.474747649841103e-06, + "loss": 0.0458, + "step": 4453 + }, + { + "epoch": 2.3185840707964602, + "grad_norm": 0.22292986350540375, + "learning_rate": 6.465316048790587e-06, + "loss": 0.0473, + "step": 4454 + }, + { + "epoch": 2.319104633003644, + "grad_norm": 0.24046086305383696, + "learning_rate": 6.4558903018557936e-06, + "loss": 0.0506, + "step": 4455 + }, + { + "epoch": 2.3196251952108278, + "grad_norm": 0.23153182365151787, + "learning_rate": 6.446470412013817e-06, + "loss": 0.0471, + "step": 4456 + }, + { + "epoch": 2.3201457574180115, + "grad_norm": 0.2256223599408448, + "learning_rate": 6.437056382239884e-06, + "loss": 0.0475, + "step": 4457 + }, + { + "epoch": 2.3206663196251953, + "grad_norm": 0.22640346868399505, + "learning_rate": 6.427648215507398e-06, + "loss": 0.0497, + "step": 4458 + }, + { + "epoch": 2.321186881832379, + "grad_norm": 0.22663691130270067, + "learning_rate": 6.418245914787882e-06, + "loss": 0.0451, + "step": 4459 + }, + { + "epoch": 2.321707444039563, + "grad_norm": 0.22821336766562575, + "learning_rate": 6.408849483051024e-06, + "loss": 0.0472, + "step": 4460 + }, + { + "epoch": 2.3222280062467466, + "grad_norm": 0.23385635491282356, + "learning_rate": 6.399458923264659e-06, + "loss": 0.0482, + "step": 4461 + }, + { + "epoch": 2.3227485684539304, + "grad_norm": 0.2320806105076779, + "learning_rate": 6.3900742383947664e-06, + "loss": 0.0459, + "step": 4462 + }, + { + "epoch": 2.323269130661114, + "grad_norm": 0.2389305017153277, + "learning_rate": 6.380695431405453e-06, + "loss": 0.0496, + "step": 4463 + }, + { + "epoch": 2.323789692868298, + "grad_norm": 0.22813071098401094, + "learning_rate": 6.371322505258992e-06, + "loss": 0.0457, + "step": 4464 + }, + { + "epoch": 2.3243102550754813, + "grad_norm": 0.24014119920977195, + "learning_rate": 6.361955462915795e-06, + "loss": 0.0474, + "step": 4465 + }, + { + "epoch": 2.324830817282665, + "grad_norm": 0.2416415435940098, + "learning_rate": 6.352594307334395e-06, + "loss": 0.0473, + "step": 4466 + }, + { + "epoch": 2.325351379489849, + "grad_norm": 0.2656651285253611, + "learning_rate": 6.343239041471497e-06, + "loss": 0.0479, + "step": 4467 + }, + { + "epoch": 2.3258719416970326, + "grad_norm": 0.23550173668414345, + "learning_rate": 6.333889668281912e-06, + "loss": 0.0473, + "step": 4468 + }, + { + "epoch": 2.3263925039042164, + "grad_norm": 0.23828823064619004, + "learning_rate": 6.324546190718614e-06, + "loss": 0.0457, + "step": 4469 + }, + { + "epoch": 2.3269130661114, + "grad_norm": 0.24471560830756434, + "learning_rate": 6.3152086117327116e-06, + "loss": 0.0488, + "step": 4470 + }, + { + "epoch": 2.327433628318584, + "grad_norm": 0.22933514872804697, + "learning_rate": 6.305876934273452e-06, + "loss": 0.0468, + "step": 4471 + }, + { + "epoch": 2.3279541905257677, + "grad_norm": 0.23426764882810178, + "learning_rate": 6.296551161288197e-06, + "loss": 0.0485, + "step": 4472 + }, + { + "epoch": 2.3284747527329515, + "grad_norm": 0.22856682242539877, + "learning_rate": 6.28723129572247e-06, + "loss": 0.0462, + "step": 4473 + }, + { + "epoch": 2.3289953149401352, + "grad_norm": 0.23846002941423738, + "learning_rate": 6.277917340519918e-06, + "loss": 0.0482, + "step": 4474 + }, + { + "epoch": 2.329515877147319, + "grad_norm": 0.234480561210467, + "learning_rate": 6.268609298622327e-06, + "loss": 0.0482, + "step": 4475 + }, + { + "epoch": 2.3300364393545028, + "grad_norm": 0.24047174883310446, + "learning_rate": 6.259307172969606e-06, + "loss": 0.0477, + "step": 4476 + }, + { + "epoch": 2.3305570015616865, + "grad_norm": 0.2344548406347454, + "learning_rate": 6.250010966499786e-06, + "loss": 0.0484, + "step": 4477 + }, + { + "epoch": 2.3310775637688703, + "grad_norm": 0.22642383076214023, + "learning_rate": 6.240720682149054e-06, + "loss": 0.0471, + "step": 4478 + }, + { + "epoch": 2.331598125976054, + "grad_norm": 0.22569085877899356, + "learning_rate": 6.231436322851711e-06, + "loss": 0.0465, + "step": 4479 + }, + { + "epoch": 2.332118688183238, + "grad_norm": 0.21706380279472942, + "learning_rate": 6.222157891540198e-06, + "loss": 0.0465, + "step": 4480 + }, + { + "epoch": 2.3326392503904216, + "grad_norm": 0.22733120940204257, + "learning_rate": 6.21288539114506e-06, + "loss": 0.0475, + "step": 4481 + }, + { + "epoch": 2.3331598125976054, + "grad_norm": 0.22939434669257294, + "learning_rate": 6.203618824594995e-06, + "loss": 0.0463, + "step": 4482 + }, + { + "epoch": 2.333680374804789, + "grad_norm": 0.23734452631886388, + "learning_rate": 6.194358194816813e-06, + "loss": 0.0477, + "step": 4483 + }, + { + "epoch": 2.334200937011973, + "grad_norm": 0.23411419581489817, + "learning_rate": 6.1851035047354595e-06, + "loss": 0.0477, + "step": 4484 + }, + { + "epoch": 2.3347214992191567, + "grad_norm": 0.24002134917837561, + "learning_rate": 6.175854757273989e-06, + "loss": 0.0481, + "step": 4485 + }, + { + "epoch": 2.3352420614263405, + "grad_norm": 0.2370166595867462, + "learning_rate": 6.166611955353577e-06, + "loss": 0.0506, + "step": 4486 + }, + { + "epoch": 2.3357626236335243, + "grad_norm": 0.23745847434442083, + "learning_rate": 6.157375101893543e-06, + "loss": 0.047, + "step": 4487 + }, + { + "epoch": 2.336283185840708, + "grad_norm": 0.24957266727201047, + "learning_rate": 6.148144199811309e-06, + "loss": 0.0479, + "step": 4488 + }, + { + "epoch": 2.336803748047892, + "grad_norm": 0.2351547804630439, + "learning_rate": 6.138919252022435e-06, + "loss": 0.0474, + "step": 4489 + }, + { + "epoch": 2.3373243102550756, + "grad_norm": 0.24163133481886173, + "learning_rate": 6.129700261440574e-06, + "loss": 0.0475, + "step": 4490 + }, + { + "epoch": 2.3378448724622594, + "grad_norm": 0.24059572984473415, + "learning_rate": 6.120487230977517e-06, + "loss": 0.0502, + "step": 4491 + }, + { + "epoch": 2.338365434669443, + "grad_norm": 0.2355645081535075, + "learning_rate": 6.1112801635431704e-06, + "loss": 0.0497, + "step": 4492 + }, + { + "epoch": 2.338885996876627, + "grad_norm": 0.23939602704815524, + "learning_rate": 6.102079062045559e-06, + "loss": 0.0479, + "step": 4493 + }, + { + "epoch": 2.3394065590838107, + "grad_norm": 0.22977889226796178, + "learning_rate": 6.092883929390816e-06, + "loss": 0.0456, + "step": 4494 + }, + { + "epoch": 2.3399271212909944, + "grad_norm": 0.23478062109634587, + "learning_rate": 6.083694768483181e-06, + "loss": 0.0481, + "step": 4495 + }, + { + "epoch": 2.340447683498178, + "grad_norm": 0.22765522951948453, + "learning_rate": 6.074511582225029e-06, + "loss": 0.0484, + "step": 4496 + }, + { + "epoch": 2.340968245705362, + "grad_norm": 0.21743859076143826, + "learning_rate": 6.065334373516834e-06, + "loss": 0.0471, + "step": 4497 + }, + { + "epoch": 2.3414888079125458, + "grad_norm": 0.2314187819934621, + "learning_rate": 6.056163145257187e-06, + "loss": 0.0484, + "step": 4498 + }, + { + "epoch": 2.3420093701197295, + "grad_norm": 0.23842056215195048, + "learning_rate": 6.046997900342796e-06, + "loss": 0.048, + "step": 4499 + }, + { + "epoch": 2.3425299323269133, + "grad_norm": 0.22448513602756287, + "learning_rate": 6.037838641668459e-06, + "loss": 0.0459, + "step": 4500 + }, + { + "epoch": 2.3430504945340966, + "grad_norm": 0.2236326297612647, + "learning_rate": 6.028685372127099e-06, + "loss": 0.0469, + "step": 4501 + }, + { + "epoch": 2.3435710567412804, + "grad_norm": 0.22213192913695828, + "learning_rate": 6.019538094609759e-06, + "loss": 0.0466, + "step": 4502 + }, + { + "epoch": 2.344091618948464, + "grad_norm": 0.22847968405344984, + "learning_rate": 6.010396812005553e-06, + "loss": 0.0468, + "step": 4503 + }, + { + "epoch": 2.344612181155648, + "grad_norm": 0.23492086518084526, + "learning_rate": 6.00126152720174e-06, + "loss": 0.047, + "step": 4504 + }, + { + "epoch": 2.3451327433628317, + "grad_norm": 0.2394105213441764, + "learning_rate": 5.992132243083656e-06, + "loss": 0.0496, + "step": 4505 + }, + { + "epoch": 2.3456533055700155, + "grad_norm": 0.24111441487478028, + "learning_rate": 5.983008962534761e-06, + "loss": 0.0461, + "step": 4506 + }, + { + "epoch": 2.3461738677771993, + "grad_norm": 0.23107118946181654, + "learning_rate": 5.973891688436608e-06, + "loss": 0.0476, + "step": 4507 + }, + { + "epoch": 2.346694429984383, + "grad_norm": 0.23065663806986544, + "learning_rate": 5.964780423668867e-06, + "loss": 0.0481, + "step": 4508 + }, + { + "epoch": 2.347214992191567, + "grad_norm": 0.23820356615082777, + "learning_rate": 5.9556751711092844e-06, + "loss": 0.0472, + "step": 4509 + }, + { + "epoch": 2.3477355543987506, + "grad_norm": 0.2365896072704537, + "learning_rate": 5.94657593363373e-06, + "loss": 0.0473, + "step": 4510 + }, + { + "epoch": 2.3482561166059344, + "grad_norm": 0.23479232602004582, + "learning_rate": 5.9374827141161715e-06, + "loss": 0.0489, + "step": 4511 + }, + { + "epoch": 2.348776678813118, + "grad_norm": 0.23324065251358814, + "learning_rate": 5.928395515428656e-06, + "loss": 0.0469, + "step": 4512 + }, + { + "epoch": 2.349297241020302, + "grad_norm": 0.2271112729337206, + "learning_rate": 5.919314340441362e-06, + "loss": 0.0462, + "step": 4513 + }, + { + "epoch": 2.3498178032274857, + "grad_norm": 0.23075689196359045, + "learning_rate": 5.9102391920225315e-06, + "loss": 0.0484, + "step": 4514 + }, + { + "epoch": 2.3503383654346695, + "grad_norm": 0.22572112690840151, + "learning_rate": 5.901170073038523e-06, + "loss": 0.0466, + "step": 4515 + }, + { + "epoch": 2.3508589276418532, + "grad_norm": 0.23256450262742248, + "learning_rate": 5.89210698635379e-06, + "loss": 0.0461, + "step": 4516 + }, + { + "epoch": 2.351379489849037, + "grad_norm": 0.220207578677318, + "learning_rate": 5.883049934830884e-06, + "loss": 0.0465, + "step": 4517 + }, + { + "epoch": 2.3519000520562208, + "grad_norm": 0.22721249545677813, + "learning_rate": 5.873998921330426e-06, + "loss": 0.0455, + "step": 4518 + }, + { + "epoch": 2.3524206142634045, + "grad_norm": 0.2632945933248785, + "learning_rate": 5.864953948711155e-06, + "loss": 0.0481, + "step": 4519 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.23515126483913215, + "learning_rate": 5.855915019829902e-06, + "loss": 0.0469, + "step": 4520 + }, + { + "epoch": 2.353461738677772, + "grad_norm": 0.22931919368137274, + "learning_rate": 5.846882137541571e-06, + "loss": 0.0474, + "step": 4521 + }, + { + "epoch": 2.353982300884956, + "grad_norm": 0.24606325790404943, + "learning_rate": 5.837855304699175e-06, + "loss": 0.0484, + "step": 4522 + }, + { + "epoch": 2.3545028630921396, + "grad_norm": 0.23172001395098654, + "learning_rate": 5.828834524153795e-06, + "loss": 0.0475, + "step": 4523 + }, + { + "epoch": 2.3550234252993234, + "grad_norm": 0.21936760204415123, + "learning_rate": 5.819819798754625e-06, + "loss": 0.0458, + "step": 4524 + }, + { + "epoch": 2.355543987506507, + "grad_norm": 0.2193218125572941, + "learning_rate": 5.810811131348929e-06, + "loss": 0.0459, + "step": 4525 + }, + { + "epoch": 2.356064549713691, + "grad_norm": 0.2345064969588428, + "learning_rate": 5.801808524782071e-06, + "loss": 0.0468, + "step": 4526 + }, + { + "epoch": 2.3565851119208747, + "grad_norm": 0.24851347312329655, + "learning_rate": 5.792811981897484e-06, + "loss": 0.0492, + "step": 4527 + }, + { + "epoch": 2.3571056741280585, + "grad_norm": 0.23050774006520552, + "learning_rate": 5.783821505536696e-06, + "loss": 0.0467, + "step": 4528 + }, + { + "epoch": 2.357626236335242, + "grad_norm": 0.23084948710195863, + "learning_rate": 5.7748370985393295e-06, + "loss": 0.0454, + "step": 4529 + }, + { + "epoch": 2.3581467985424256, + "grad_norm": 0.24791040020859287, + "learning_rate": 5.765858763743062e-06, + "loss": 0.0476, + "step": 4530 + }, + { + "epoch": 2.3586673607496094, + "grad_norm": 0.23318412091442703, + "learning_rate": 5.756886503983683e-06, + "loss": 0.0475, + "step": 4531 + }, + { + "epoch": 2.359187922956793, + "grad_norm": 0.23289524027494776, + "learning_rate": 5.747920322095035e-06, + "loss": 0.0464, + "step": 4532 + }, + { + "epoch": 2.359708485163977, + "grad_norm": 0.23663155315479364, + "learning_rate": 5.738960220909068e-06, + "loss": 0.0465, + "step": 4533 + }, + { + "epoch": 2.3602290473711607, + "grad_norm": 0.2330747611757856, + "learning_rate": 5.730006203255792e-06, + "loss": 0.0466, + "step": 4534 + }, + { + "epoch": 2.3607496095783445, + "grad_norm": 0.23854565465214606, + "learning_rate": 5.721058271963311e-06, + "loss": 0.0466, + "step": 4535 + }, + { + "epoch": 2.3612701717855282, + "grad_norm": 0.23694164646216895, + "learning_rate": 5.712116429857789e-06, + "loss": 0.0473, + "step": 4536 + }, + { + "epoch": 2.361790733992712, + "grad_norm": 0.23150554800278614, + "learning_rate": 5.7031806797634755e-06, + "loss": 0.0466, + "step": 4537 + }, + { + "epoch": 2.3623112961998958, + "grad_norm": 0.23630006270170592, + "learning_rate": 5.694251024502709e-06, + "loss": 0.0484, + "step": 4538 + }, + { + "epoch": 2.3628318584070795, + "grad_norm": 0.2338320873775774, + "learning_rate": 5.685327466895874e-06, + "loss": 0.0471, + "step": 4539 + }, + { + "epoch": 2.3633524206142633, + "grad_norm": 0.2377908975210259, + "learning_rate": 5.6764100097614595e-06, + "loss": 0.0488, + "step": 4540 + }, + { + "epoch": 2.363872982821447, + "grad_norm": 0.23304039641551305, + "learning_rate": 5.667498655916001e-06, + "loss": 0.0462, + "step": 4541 + }, + { + "epoch": 2.364393545028631, + "grad_norm": 0.23372925814357076, + "learning_rate": 5.6585934081741205e-06, + "loss": 0.0471, + "step": 4542 + }, + { + "epoch": 2.3649141072358146, + "grad_norm": 0.23054147188963464, + "learning_rate": 5.649694269348516e-06, + "loss": 0.0487, + "step": 4543 + }, + { + "epoch": 2.3654346694429984, + "grad_norm": 0.2295991682759455, + "learning_rate": 5.640801242249952e-06, + "loss": 0.0466, + "step": 4544 + }, + { + "epoch": 2.365955231650182, + "grad_norm": 0.23056386741672127, + "learning_rate": 5.631914329687249e-06, + "loss": 0.0478, + "step": 4545 + }, + { + "epoch": 2.366475793857366, + "grad_norm": 0.22736615156094703, + "learning_rate": 5.623033534467315e-06, + "loss": 0.0459, + "step": 4546 + }, + { + "epoch": 2.3669963560645497, + "grad_norm": 0.23702181937720163, + "learning_rate": 5.614158859395122e-06, + "loss": 0.0481, + "step": 4547 + }, + { + "epoch": 2.3675169182717335, + "grad_norm": 0.2266535316694567, + "learning_rate": 5.605290307273694e-06, + "loss": 0.046, + "step": 4548 + }, + { + "epoch": 2.3680374804789173, + "grad_norm": 0.23582655875491848, + "learning_rate": 5.596427880904148e-06, + "loss": 0.0472, + "step": 4549 + }, + { + "epoch": 2.368558042686101, + "grad_norm": 0.24143542099238016, + "learning_rate": 5.587571583085632e-06, + "loss": 0.0467, + "step": 4550 + }, + { + "epoch": 2.369078604893285, + "grad_norm": 0.2307564717560343, + "learning_rate": 5.5787214166153875e-06, + "loss": 0.0467, + "step": 4551 + }, + { + "epoch": 2.3695991671004686, + "grad_norm": 0.24506784373841559, + "learning_rate": 5.569877384288708e-06, + "loss": 0.0474, + "step": 4552 + }, + { + "epoch": 2.3701197293076524, + "grad_norm": 0.32435382511813377, + "learning_rate": 5.5610394888989585e-06, + "loss": 0.0476, + "step": 4553 + }, + { + "epoch": 2.370640291514836, + "grad_norm": 0.2342486669021339, + "learning_rate": 5.5522077332375436e-06, + "loss": 0.0474, + "step": 4554 + }, + { + "epoch": 2.37116085372202, + "grad_norm": 0.21410206999675588, + "learning_rate": 5.543382120093946e-06, + "loss": 0.0442, + "step": 4555 + }, + { + "epoch": 2.3716814159292037, + "grad_norm": 0.2338596233871188, + "learning_rate": 5.5345626522557175e-06, + "loss": 0.0455, + "step": 4556 + }, + { + "epoch": 2.3722019781363874, + "grad_norm": 0.23916410275167593, + "learning_rate": 5.525749332508437e-06, + "loss": 0.0469, + "step": 4557 + }, + { + "epoch": 2.372722540343571, + "grad_norm": 0.23457736048555594, + "learning_rate": 5.51694216363578e-06, + "loss": 0.0456, + "step": 4558 + }, + { + "epoch": 2.373243102550755, + "grad_norm": 0.25322127032434727, + "learning_rate": 5.5081411484194435e-06, + "loss": 0.048, + "step": 4559 + }, + { + "epoch": 2.3737636647579388, + "grad_norm": 0.24019282093163075, + "learning_rate": 5.499346289639206e-06, + "loss": 0.0466, + "step": 4560 + }, + { + "epoch": 2.3742842269651225, + "grad_norm": 0.2264918241309705, + "learning_rate": 5.490557590072892e-06, + "loss": 0.0454, + "step": 4561 + }, + { + "epoch": 2.3748047891723063, + "grad_norm": 0.2357276481863, + "learning_rate": 5.4817750524963904e-06, + "loss": 0.047, + "step": 4562 + }, + { + "epoch": 2.37532535137949, + "grad_norm": 0.23230594226469076, + "learning_rate": 5.472998679683619e-06, + "loss": 0.0465, + "step": 4563 + }, + { + "epoch": 2.375845913586674, + "grad_norm": 0.22533858615611319, + "learning_rate": 5.4642284744065715e-06, + "loss": 0.0468, + "step": 4564 + }, + { + "epoch": 2.376366475793857, + "grad_norm": 0.22724858506067902, + "learning_rate": 5.455464439435299e-06, + "loss": 0.0481, + "step": 4565 + }, + { + "epoch": 2.376887038001041, + "grad_norm": 0.23606749438431365, + "learning_rate": 5.446706577537869e-06, + "loss": 0.0466, + "step": 4566 + }, + { + "epoch": 2.3774076002082247, + "grad_norm": 0.2370377994578295, + "learning_rate": 5.437954891480443e-06, + "loss": 0.0494, + "step": 4567 + }, + { + "epoch": 2.3779281624154085, + "grad_norm": 0.2240890350805136, + "learning_rate": 5.4292093840271955e-06, + "loss": 0.0458, + "step": 4568 + }, + { + "epoch": 2.3784487246225923, + "grad_norm": 0.2289474671965338, + "learning_rate": 5.420470057940372e-06, + "loss": 0.0467, + "step": 4569 + }, + { + "epoch": 2.378969286829776, + "grad_norm": 0.2298982841879016, + "learning_rate": 5.411736915980253e-06, + "loss": 0.049, + "step": 4570 + }, + { + "epoch": 2.37948984903696, + "grad_norm": 0.2268863902053528, + "learning_rate": 5.403009960905178e-06, + "loss": 0.0461, + "step": 4571 + }, + { + "epoch": 2.3800104112441436, + "grad_norm": 0.2436459860162038, + "learning_rate": 5.394289195471527e-06, + "loss": 0.0487, + "step": 4572 + }, + { + "epoch": 2.3805309734513274, + "grad_norm": 0.2632454708511737, + "learning_rate": 5.385574622433714e-06, + "loss": 0.0469, + "step": 4573 + }, + { + "epoch": 2.381051535658511, + "grad_norm": 0.2301709888777297, + "learning_rate": 5.3768662445442204e-06, + "loss": 0.0475, + "step": 4574 + }, + { + "epoch": 2.381572097865695, + "grad_norm": 0.22767022475445184, + "learning_rate": 5.368164064553541e-06, + "loss": 0.0473, + "step": 4575 + }, + { + "epoch": 2.3820926600728787, + "grad_norm": 0.22628560424820487, + "learning_rate": 5.359468085210237e-06, + "loss": 0.045, + "step": 4576 + }, + { + "epoch": 2.3826132222800624, + "grad_norm": 0.23342340422854826, + "learning_rate": 5.3507783092609095e-06, + "loss": 0.0486, + "step": 4577 + }, + { + "epoch": 2.383133784487246, + "grad_norm": 0.2245786257913053, + "learning_rate": 5.342094739450179e-06, + "loss": 0.0464, + "step": 4578 + }, + { + "epoch": 2.38365434669443, + "grad_norm": 0.2302258019263189, + "learning_rate": 5.333417378520733e-06, + "loss": 0.0474, + "step": 4579 + }, + { + "epoch": 2.3841749089016138, + "grad_norm": 0.2268167472789496, + "learning_rate": 5.324746229213282e-06, + "loss": 0.0455, + "step": 4580 + }, + { + "epoch": 2.3846954711087975, + "grad_norm": 0.235065648886395, + "learning_rate": 5.316081294266587e-06, + "loss": 0.0476, + "step": 4581 + }, + { + "epoch": 2.3852160333159813, + "grad_norm": 0.23538123207470538, + "learning_rate": 5.30742257641742e-06, + "loss": 0.0481, + "step": 4582 + }, + { + "epoch": 2.385736595523165, + "grad_norm": 0.23818619605234898, + "learning_rate": 5.298770078400628e-06, + "loss": 0.0468, + "step": 4583 + }, + { + "epoch": 2.386257157730349, + "grad_norm": 0.2320901545098414, + "learning_rate": 5.290123802949051e-06, + "loss": 0.0451, + "step": 4584 + }, + { + "epoch": 2.3867777199375326, + "grad_norm": 0.23469735771174544, + "learning_rate": 5.2814837527935975e-06, + "loss": 0.047, + "step": 4585 + }, + { + "epoch": 2.3872982821447164, + "grad_norm": 0.23767900812756343, + "learning_rate": 5.272849930663204e-06, + "loss": 0.0489, + "step": 4586 + }, + { + "epoch": 2.3878188443519, + "grad_norm": 0.23370065886236566, + "learning_rate": 5.264222339284816e-06, + "loss": 0.0461, + "step": 4587 + }, + { + "epoch": 2.388339406559084, + "grad_norm": 0.23170361003181508, + "learning_rate": 5.255600981383438e-06, + "loss": 0.0476, + "step": 4588 + }, + { + "epoch": 2.3888599687662677, + "grad_norm": 0.23695214156506306, + "learning_rate": 5.246985859682094e-06, + "loss": 0.0481, + "step": 4589 + }, + { + "epoch": 2.3893805309734515, + "grad_norm": 0.22780483346006977, + "learning_rate": 5.238376976901849e-06, + "loss": 0.0471, + "step": 4590 + }, + { + "epoch": 2.3899010931806353, + "grad_norm": 0.23083988277274833, + "learning_rate": 5.229774335761775e-06, + "loss": 0.0459, + "step": 4591 + }, + { + "epoch": 2.390421655387819, + "grad_norm": 0.22986426177716607, + "learning_rate": 5.221177938978999e-06, + "loss": 0.0446, + "step": 4592 + }, + { + "epoch": 2.3909422175950024, + "grad_norm": 0.23342974438965086, + "learning_rate": 5.2125877892686496e-06, + "loss": 0.0472, + "step": 4593 + }, + { + "epoch": 2.391462779802186, + "grad_norm": 0.23195959114003814, + "learning_rate": 5.204003889343906e-06, + "loss": 0.0461, + "step": 4594 + }, + { + "epoch": 2.39198334200937, + "grad_norm": 0.2216035633570572, + "learning_rate": 5.195426241915963e-06, + "loss": 0.045, + "step": 4595 + }, + { + "epoch": 2.3925039042165537, + "grad_norm": 0.2132065666076437, + "learning_rate": 5.186854849694034e-06, + "loss": 0.0445, + "step": 4596 + }, + { + "epoch": 2.3930244664237375, + "grad_norm": 0.23129878723213484, + "learning_rate": 5.178289715385368e-06, + "loss": 0.0469, + "step": 4597 + }, + { + "epoch": 2.3935450286309212, + "grad_norm": 0.2388488336500406, + "learning_rate": 5.169730841695233e-06, + "loss": 0.047, + "step": 4598 + }, + { + "epoch": 2.394065590838105, + "grad_norm": 0.22989393580863302, + "learning_rate": 5.161178231326927e-06, + "loss": 0.046, + "step": 4599 + }, + { + "epoch": 2.3945861530452888, + "grad_norm": 0.24387299564354334, + "learning_rate": 5.152631886981746e-06, + "loss": 0.0478, + "step": 4600 + }, + { + "epoch": 2.3951067152524725, + "grad_norm": 0.2184126453644583, + "learning_rate": 5.144091811359039e-06, + "loss": 0.0437, + "step": 4601 + }, + { + "epoch": 2.3956272774596563, + "grad_norm": 0.23254797708689787, + "learning_rate": 5.1355580071561465e-06, + "loss": 0.0455, + "step": 4602 + }, + { + "epoch": 2.39614783966684, + "grad_norm": 0.24362953858535674, + "learning_rate": 5.127030477068445e-06, + "loss": 0.0462, + "step": 4603 + }, + { + "epoch": 2.396668401874024, + "grad_norm": 0.23599914959344417, + "learning_rate": 5.118509223789336e-06, + "loss": 0.0467, + "step": 4604 + }, + { + "epoch": 2.3971889640812076, + "grad_norm": 0.22043950196000547, + "learning_rate": 5.109994250010211e-06, + "loss": 0.0439, + "step": 4605 + }, + { + "epoch": 2.3977095262883914, + "grad_norm": 0.23070921769768574, + "learning_rate": 5.101485558420505e-06, + "loss": 0.0467, + "step": 4606 + }, + { + "epoch": 2.398230088495575, + "grad_norm": 0.24197283057250074, + "learning_rate": 5.092983151707656e-06, + "loss": 0.0481, + "step": 4607 + }, + { + "epoch": 2.398750650702759, + "grad_norm": 0.22154942089827273, + "learning_rate": 5.0844870325571255e-06, + "loss": 0.0448, + "step": 4608 + }, + { + "epoch": 2.3992712129099427, + "grad_norm": 0.24014632166672964, + "learning_rate": 5.0759972036523715e-06, + "loss": 0.0479, + "step": 4609 + }, + { + "epoch": 2.3997917751171265, + "grad_norm": 0.22536943970116005, + "learning_rate": 5.067513667674892e-06, + "loss": 0.0457, + "step": 4610 + }, + { + "epoch": 2.4003123373243103, + "grad_norm": 0.2250611130271378, + "learning_rate": 5.059036427304167e-06, + "loss": 0.0447, + "step": 4611 + }, + { + "epoch": 2.400832899531494, + "grad_norm": 0.2408162294102492, + "learning_rate": 5.050565485217712e-06, + "loss": 0.0477, + "step": 4612 + }, + { + "epoch": 2.401353461738678, + "grad_norm": 0.24625623591280466, + "learning_rate": 5.04210084409105e-06, + "loss": 0.0484, + "step": 4613 + }, + { + "epoch": 2.4018740239458616, + "grad_norm": 0.2481755981581284, + "learning_rate": 5.033642506597694e-06, + "loss": 0.048, + "step": 4614 + }, + { + "epoch": 2.4023945861530454, + "grad_norm": 0.23987593424996412, + "learning_rate": 5.025190475409189e-06, + "loss": 0.0468, + "step": 4615 + }, + { + "epoch": 2.402915148360229, + "grad_norm": 0.23074712648149856, + "learning_rate": 5.01674475319508e-06, + "loss": 0.0473, + "step": 4616 + }, + { + "epoch": 2.403435710567413, + "grad_norm": 0.22920234691129646, + "learning_rate": 5.008305342622923e-06, + "loss": 0.0463, + "step": 4617 + }, + { + "epoch": 2.4039562727745967, + "grad_norm": 0.24431092684106212, + "learning_rate": 4.99987224635827e-06, + "loss": 0.0486, + "step": 4618 + }, + { + "epoch": 2.4044768349817804, + "grad_norm": 0.24071342837263257, + "learning_rate": 4.99144546706469e-06, + "loss": 0.0482, + "step": 4619 + }, + { + "epoch": 2.404997397188964, + "grad_norm": 0.22487442588291262, + "learning_rate": 4.9830250074037435e-06, + "loss": 0.0463, + "step": 4620 + }, + { + "epoch": 2.405517959396148, + "grad_norm": 0.23677496930387812, + "learning_rate": 4.97461087003501e-06, + "loss": 0.0472, + "step": 4621 + }, + { + "epoch": 2.4060385216033318, + "grad_norm": 0.2339609503507166, + "learning_rate": 4.966203057616073e-06, + "loss": 0.0463, + "step": 4622 + }, + { + "epoch": 2.4065590838105155, + "grad_norm": 0.22648734693696182, + "learning_rate": 4.9578015728024955e-06, + "loss": 0.0452, + "step": 4623 + }, + { + "epoch": 2.4070796460176993, + "grad_norm": 0.23140249756070344, + "learning_rate": 4.949406418247865e-06, + "loss": 0.046, + "step": 4624 + }, + { + "epoch": 2.407600208224883, + "grad_norm": 0.22814922822292663, + "learning_rate": 4.941017596603761e-06, + "loss": 0.0468, + "step": 4625 + }, + { + "epoch": 2.408120770432067, + "grad_norm": 0.24189988700195564, + "learning_rate": 4.9326351105197704e-06, + "loss": 0.0486, + "step": 4626 + }, + { + "epoch": 2.4086413326392506, + "grad_norm": 0.2363782391459408, + "learning_rate": 4.92425896264346e-06, + "loss": 0.0477, + "step": 4627 + }, + { + "epoch": 2.4091618948464344, + "grad_norm": 0.23153910984513215, + "learning_rate": 4.915889155620423e-06, + "loss": 0.0483, + "step": 4628 + }, + { + "epoch": 2.4096824570536177, + "grad_norm": 0.23130021099353434, + "learning_rate": 4.907525692094217e-06, + "loss": 0.0474, + "step": 4629 + }, + { + "epoch": 2.4102030192608015, + "grad_norm": 0.25831436035426264, + "learning_rate": 4.89916857470642e-06, + "loss": 0.0506, + "step": 4630 + }, + { + "epoch": 2.4107235814679853, + "grad_norm": 0.2321273528575011, + "learning_rate": 4.890817806096606e-06, + "loss": 0.049, + "step": 4631 + }, + { + "epoch": 2.411244143675169, + "grad_norm": 0.23553678312517642, + "learning_rate": 4.882473388902323e-06, + "loss": 0.0473, + "step": 4632 + }, + { + "epoch": 2.411764705882353, + "grad_norm": 0.23546720700809365, + "learning_rate": 4.874135325759133e-06, + "loss": 0.0479, + "step": 4633 + }, + { + "epoch": 2.4122852680895366, + "grad_norm": 0.24113411652212086, + "learning_rate": 4.8658036193005855e-06, + "loss": 0.047, + "step": 4634 + }, + { + "epoch": 2.4128058302967204, + "grad_norm": 0.2324247954219874, + "learning_rate": 4.857478272158217e-06, + "loss": 0.0481, + "step": 4635 + }, + { + "epoch": 2.413326392503904, + "grad_norm": 0.23429448270749925, + "learning_rate": 4.849159286961571e-06, + "loss": 0.0474, + "step": 4636 + }, + { + "epoch": 2.413846954711088, + "grad_norm": 0.23663034771539931, + "learning_rate": 4.840846666338161e-06, + "loss": 0.047, + "step": 4637 + }, + { + "epoch": 2.4143675169182717, + "grad_norm": 0.2270730528314231, + "learning_rate": 4.8325404129134915e-06, + "loss": 0.0461, + "step": 4638 + }, + { + "epoch": 2.4148880791254554, + "grad_norm": 0.22378060847821732, + "learning_rate": 4.824240529311075e-06, + "loss": 0.0466, + "step": 4639 + }, + { + "epoch": 2.415408641332639, + "grad_norm": 0.225573866033014, + "learning_rate": 4.815947018152397e-06, + "loss": 0.0464, + "step": 4640 + }, + { + "epoch": 2.415929203539823, + "grad_norm": 0.23586252079175646, + "learning_rate": 4.807659882056945e-06, + "loss": 0.0466, + "step": 4641 + }, + { + "epoch": 2.4164497657470068, + "grad_norm": 0.22986325100510877, + "learning_rate": 4.799379123642162e-06, + "loss": 0.0446, + "step": 4642 + }, + { + "epoch": 2.4169703279541905, + "grad_norm": 0.23705062124271192, + "learning_rate": 4.791104745523509e-06, + "loss": 0.0479, + "step": 4643 + }, + { + "epoch": 2.4174908901613743, + "grad_norm": 0.23732585219476587, + "learning_rate": 4.78283675031442e-06, + "loss": 0.047, + "step": 4644 + }, + { + "epoch": 2.418011452368558, + "grad_norm": 0.23331514934441308, + "learning_rate": 4.7745751406263165e-06, + "loss": 0.0459, + "step": 4645 + }, + { + "epoch": 2.418532014575742, + "grad_norm": 0.23781982825270778, + "learning_rate": 4.766319919068593e-06, + "loss": 0.0476, + "step": 4646 + }, + { + "epoch": 2.4190525767829256, + "grad_norm": 0.24131605817905036, + "learning_rate": 4.758071088248628e-06, + "loss": 0.0483, + "step": 4647 + }, + { + "epoch": 2.4195731389901094, + "grad_norm": 0.2351381427413349, + "learning_rate": 4.7498286507717895e-06, + "loss": 0.047, + "step": 4648 + }, + { + "epoch": 2.420093701197293, + "grad_norm": 0.23702310918812222, + "learning_rate": 4.741592609241427e-06, + "loss": 0.0471, + "step": 4649 + }, + { + "epoch": 2.420614263404477, + "grad_norm": 0.2214982247537357, + "learning_rate": 4.733362966258869e-06, + "loss": 0.0439, + "step": 4650 + }, + { + "epoch": 2.4211348256116607, + "grad_norm": 0.23411796590355238, + "learning_rate": 4.725139724423411e-06, + "loss": 0.0461, + "step": 4651 + }, + { + "epoch": 2.4216553878188445, + "grad_norm": 0.23041508871761626, + "learning_rate": 4.716922886332334e-06, + "loss": 0.0449, + "step": 4652 + }, + { + "epoch": 2.4221759500260283, + "grad_norm": 0.2402349549586395, + "learning_rate": 4.7087124545809045e-06, + "loss": 0.046, + "step": 4653 + }, + { + "epoch": 2.422696512233212, + "grad_norm": 0.23216773676851069, + "learning_rate": 4.700508431762365e-06, + "loss": 0.0465, + "step": 4654 + }, + { + "epoch": 2.423217074440396, + "grad_norm": 0.2348648584242627, + "learning_rate": 4.692310820467919e-06, + "loss": 0.0478, + "step": 4655 + }, + { + "epoch": 2.4237376366475796, + "grad_norm": 0.22464964421666625, + "learning_rate": 4.684119623286748e-06, + "loss": 0.0445, + "step": 4656 + }, + { + "epoch": 2.424258198854763, + "grad_norm": 0.2246231882200223, + "learning_rate": 4.675934842806018e-06, + "loss": 0.0469, + "step": 4657 + }, + { + "epoch": 2.4247787610619467, + "grad_norm": 0.2288877796143045, + "learning_rate": 4.667756481610866e-06, + "loss": 0.0465, + "step": 4658 + }, + { + "epoch": 2.4252993232691304, + "grad_norm": 0.22672281731855962, + "learning_rate": 4.6595845422844035e-06, + "loss": 0.0461, + "step": 4659 + }, + { + "epoch": 2.425819885476314, + "grad_norm": 0.23471740148435588, + "learning_rate": 4.6514190274076996e-06, + "loss": 0.0478, + "step": 4660 + }, + { + "epoch": 2.426340447683498, + "grad_norm": 0.22362959400916294, + "learning_rate": 4.643259939559807e-06, + "loss": 0.0458, + "step": 4661 + }, + { + "epoch": 2.4268610098906818, + "grad_norm": 0.2227801341848505, + "learning_rate": 4.6351072813177495e-06, + "loss": 0.046, + "step": 4662 + }, + { + "epoch": 2.4273815720978655, + "grad_norm": 0.22031118417632997, + "learning_rate": 4.626961055256515e-06, + "loss": 0.0439, + "step": 4663 + }, + { + "epoch": 2.4279021343050493, + "grad_norm": 0.22982141836032516, + "learning_rate": 4.618821263949061e-06, + "loss": 0.0462, + "step": 4664 + }, + { + "epoch": 2.428422696512233, + "grad_norm": 0.23150711698760643, + "learning_rate": 4.610687909966304e-06, + "loss": 0.0456, + "step": 4665 + }, + { + "epoch": 2.428943258719417, + "grad_norm": 0.23045867200414713, + "learning_rate": 4.602560995877142e-06, + "loss": 0.0457, + "step": 4666 + }, + { + "epoch": 2.4294638209266006, + "grad_norm": 0.23591693108732284, + "learning_rate": 4.594440524248431e-06, + "loss": 0.0442, + "step": 4667 + }, + { + "epoch": 2.4299843831337844, + "grad_norm": 0.22810723603892324, + "learning_rate": 4.586326497645002e-06, + "loss": 0.046, + "step": 4668 + }, + { + "epoch": 2.430504945340968, + "grad_norm": 0.2399613227524779, + "learning_rate": 4.578218918629632e-06, + "loss": 0.0476, + "step": 4669 + }, + { + "epoch": 2.431025507548152, + "grad_norm": 0.2464507638072641, + "learning_rate": 4.570117789763073e-06, + "loss": 0.0473, + "step": 4670 + }, + { + "epoch": 2.4315460697553357, + "grad_norm": 0.23144494070866378, + "learning_rate": 4.562023113604041e-06, + "loss": 0.0461, + "step": 4671 + }, + { + "epoch": 2.4320666319625195, + "grad_norm": 0.2325135941641269, + "learning_rate": 4.553934892709216e-06, + "loss": 0.0477, + "step": 4672 + }, + { + "epoch": 2.4325871941697033, + "grad_norm": 0.22934582369828932, + "learning_rate": 4.545853129633226e-06, + "loss": 0.0468, + "step": 4673 + }, + { + "epoch": 2.433107756376887, + "grad_norm": 0.23396484474390528, + "learning_rate": 4.5377778269286766e-06, + "loss": 0.0468, + "step": 4674 + }, + { + "epoch": 2.433628318584071, + "grad_norm": 0.23530318910470144, + "learning_rate": 4.529708987146114e-06, + "loss": 0.0462, + "step": 4675 + }, + { + "epoch": 2.4341488807912546, + "grad_norm": 0.23344486790079444, + "learning_rate": 4.521646612834057e-06, + "loss": 0.0464, + "step": 4676 + }, + { + "epoch": 2.4346694429984383, + "grad_norm": 0.23330292282907472, + "learning_rate": 4.513590706538989e-06, + "loss": 0.0463, + "step": 4677 + }, + { + "epoch": 2.435190005205622, + "grad_norm": 0.22437040772272743, + "learning_rate": 4.5055412708053245e-06, + "loss": 0.0477, + "step": 4678 + }, + { + "epoch": 2.435710567412806, + "grad_norm": 0.2265909891553176, + "learning_rate": 4.497498308175454e-06, + "loss": 0.0461, + "step": 4679 + }, + { + "epoch": 2.4362311296199897, + "grad_norm": 0.23121513283352574, + "learning_rate": 4.489461821189725e-06, + "loss": 0.0464, + "step": 4680 + }, + { + "epoch": 2.4367516918271734, + "grad_norm": 0.2528579823174406, + "learning_rate": 4.481431812386436e-06, + "loss": 0.0495, + "step": 4681 + }, + { + "epoch": 2.437272254034357, + "grad_norm": 0.23289575073759872, + "learning_rate": 4.473408284301825e-06, + "loss": 0.0459, + "step": 4682 + }, + { + "epoch": 2.437792816241541, + "grad_norm": 0.23015881999073623, + "learning_rate": 4.465391239470112e-06, + "loss": 0.0453, + "step": 4683 + }, + { + "epoch": 2.4383133784487248, + "grad_norm": 0.22624808663286283, + "learning_rate": 4.457380680423434e-06, + "loss": 0.0453, + "step": 4684 + }, + { + "epoch": 2.4388339406559085, + "grad_norm": 0.2296916685174956, + "learning_rate": 4.4493766096919136e-06, + "loss": 0.0464, + "step": 4685 + }, + { + "epoch": 2.4393545028630923, + "grad_norm": 0.21738846706883652, + "learning_rate": 4.441379029803605e-06, + "loss": 0.0425, + "step": 4686 + }, + { + "epoch": 2.439875065070276, + "grad_norm": 0.2423083039591658, + "learning_rate": 4.433387943284511e-06, + "loss": 0.0468, + "step": 4687 + }, + { + "epoch": 2.44039562727746, + "grad_norm": 0.2341956963383124, + "learning_rate": 4.425403352658591e-06, + "loss": 0.0447, + "step": 4688 + }, + { + "epoch": 2.4409161894846436, + "grad_norm": 0.24101151048749272, + "learning_rate": 4.417425260447753e-06, + "loss": 0.0475, + "step": 4689 + }, + { + "epoch": 2.4414367516918274, + "grad_norm": 0.24063994225163093, + "learning_rate": 4.4094536691718505e-06, + "loss": 0.0465, + "step": 4690 + }, + { + "epoch": 2.441957313899011, + "grad_norm": 0.23071169180768644, + "learning_rate": 4.401488581348679e-06, + "loss": 0.0471, + "step": 4691 + }, + { + "epoch": 2.442477876106195, + "grad_norm": 0.2355448253070943, + "learning_rate": 4.393529999493989e-06, + "loss": 0.0471, + "step": 4692 + }, + { + "epoch": 2.4429984383133783, + "grad_norm": 0.23632196722116713, + "learning_rate": 4.385577926121464e-06, + "loss": 0.0474, + "step": 4693 + }, + { + "epoch": 2.443519000520562, + "grad_norm": 0.23534929674479932, + "learning_rate": 4.3776323637427395e-06, + "loss": 0.0452, + "step": 4694 + }, + { + "epoch": 2.444039562727746, + "grad_norm": 0.24304563929485506, + "learning_rate": 4.369693314867407e-06, + "loss": 0.047, + "step": 4695 + }, + { + "epoch": 2.4445601249349296, + "grad_norm": 0.23188182601997595, + "learning_rate": 4.3617607820029686e-06, + "loss": 0.045, + "step": 4696 + }, + { + "epoch": 2.4450806871421134, + "grad_norm": 0.23588489710907637, + "learning_rate": 4.353834767654896e-06, + "loss": 0.046, + "step": 4697 + }, + { + "epoch": 2.445601249349297, + "grad_norm": 0.2423879338466431, + "learning_rate": 4.345915274326595e-06, + "loss": 0.0486, + "step": 4698 + }, + { + "epoch": 2.446121811556481, + "grad_norm": 0.23190042633660438, + "learning_rate": 4.338002304519406e-06, + "loss": 0.0476, + "step": 4699 + }, + { + "epoch": 2.4466423737636647, + "grad_norm": 0.2371416200568294, + "learning_rate": 4.330095860732625e-06, + "loss": 0.0475, + "step": 4700 + }, + { + "epoch": 2.4471629359708484, + "grad_norm": 0.233119464425362, + "learning_rate": 4.322195945463464e-06, + "loss": 0.0453, + "step": 4701 + }, + { + "epoch": 2.447683498178032, + "grad_norm": 0.23660527240553358, + "learning_rate": 4.314302561207079e-06, + "loss": 0.047, + "step": 4702 + }, + { + "epoch": 2.448204060385216, + "grad_norm": 0.2225359791140698, + "learning_rate": 4.306415710456577e-06, + "loss": 0.0451, + "step": 4703 + }, + { + "epoch": 2.4487246225923998, + "grad_norm": 0.22352359233272487, + "learning_rate": 4.2985353957029876e-06, + "loss": 0.0438, + "step": 4704 + }, + { + "epoch": 2.4492451847995835, + "grad_norm": 0.22659412226171605, + "learning_rate": 4.29066161943529e-06, + "loss": 0.0453, + "step": 4705 + }, + { + "epoch": 2.4497657470067673, + "grad_norm": 0.23202678432385068, + "learning_rate": 4.282794384140379e-06, + "loss": 0.0456, + "step": 4706 + }, + { + "epoch": 2.450286309213951, + "grad_norm": 0.23436332924353653, + "learning_rate": 4.274933692303093e-06, + "loss": 0.0467, + "step": 4707 + }, + { + "epoch": 2.450806871421135, + "grad_norm": 0.23568145553682646, + "learning_rate": 4.267079546406211e-06, + "loss": 0.0457, + "step": 4708 + }, + { + "epoch": 2.4513274336283186, + "grad_norm": 0.23688538546245036, + "learning_rate": 4.259231948930442e-06, + "loss": 0.0465, + "step": 4709 + }, + { + "epoch": 2.4518479958355024, + "grad_norm": 0.22584612440650176, + "learning_rate": 4.251390902354413e-06, + "loss": 0.045, + "step": 4710 + }, + { + "epoch": 2.452368558042686, + "grad_norm": 0.22787648763996438, + "learning_rate": 4.243556409154692e-06, + "loss": 0.0447, + "step": 4711 + }, + { + "epoch": 2.45288912024987, + "grad_norm": 0.2308201824461422, + "learning_rate": 4.235728471805775e-06, + "loss": 0.0452, + "step": 4712 + }, + { + "epoch": 2.4534096824570537, + "grad_norm": 0.23117909367110293, + "learning_rate": 4.227907092780095e-06, + "loss": 0.0455, + "step": 4713 + }, + { + "epoch": 2.4539302446642375, + "grad_norm": 0.22631631172759079, + "learning_rate": 4.22009227454801e-06, + "loss": 0.0463, + "step": 4714 + }, + { + "epoch": 2.4544508068714213, + "grad_norm": 0.2344996227202645, + "learning_rate": 4.212284019577792e-06, + "loss": 0.0469, + "step": 4715 + }, + { + "epoch": 2.454971369078605, + "grad_norm": 0.22826293638212433, + "learning_rate": 4.204482330335657e-06, + "loss": 0.046, + "step": 4716 + }, + { + "epoch": 2.455491931285789, + "grad_norm": 0.24057079364699233, + "learning_rate": 4.196687209285744e-06, + "loss": 0.0475, + "step": 4717 + }, + { + "epoch": 2.4560124934929726, + "grad_norm": 0.23451233031179078, + "learning_rate": 4.188898658890117e-06, + "loss": 0.0462, + "step": 4718 + }, + { + "epoch": 2.4565330557001563, + "grad_norm": 0.23186747086362713, + "learning_rate": 4.1811166816087595e-06, + "loss": 0.0464, + "step": 4719 + }, + { + "epoch": 2.45705361790734, + "grad_norm": 0.22706734857716063, + "learning_rate": 4.173341279899576e-06, + "loss": 0.0442, + "step": 4720 + }, + { + "epoch": 2.4575741801145234, + "grad_norm": 0.23846594551848097, + "learning_rate": 4.165572456218405e-06, + "loss": 0.0492, + "step": 4721 + }, + { + "epoch": 2.458094742321707, + "grad_norm": 0.22761321129821427, + "learning_rate": 4.157810213019003e-06, + "loss": 0.0467, + "step": 4722 + }, + { + "epoch": 2.458615304528891, + "grad_norm": 0.21913183574661646, + "learning_rate": 4.150054552753055e-06, + "loss": 0.0449, + "step": 4723 + }, + { + "epoch": 2.4591358667360748, + "grad_norm": 0.2258958824847465, + "learning_rate": 4.1423054778701455e-06, + "loss": 0.0463, + "step": 4724 + }, + { + "epoch": 2.4596564289432585, + "grad_norm": 0.2535114295876158, + "learning_rate": 4.1345629908178e-06, + "loss": 0.0446, + "step": 4725 + }, + { + "epoch": 2.4601769911504423, + "grad_norm": 0.2188190545994277, + "learning_rate": 4.126827094041455e-06, + "loss": 0.045, + "step": 4726 + }, + { + "epoch": 2.460697553357626, + "grad_norm": 0.22451949242641625, + "learning_rate": 4.119097789984472e-06, + "loss": 0.0474, + "step": 4727 + }, + { + "epoch": 2.46121811556481, + "grad_norm": 0.23179831019249533, + "learning_rate": 4.111375081088123e-06, + "loss": 0.0456, + "step": 4728 + }, + { + "epoch": 2.4617386777719936, + "grad_norm": 0.23401390682247894, + "learning_rate": 4.103658969791588e-06, + "loss": 0.047, + "step": 4729 + }, + { + "epoch": 2.4622592399791774, + "grad_norm": 0.29525830006055254, + "learning_rate": 4.095949458531984e-06, + "loss": 0.045, + "step": 4730 + }, + { + "epoch": 2.462779802186361, + "grad_norm": 0.2259193500356729, + "learning_rate": 4.088246549744331e-06, + "loss": 0.0438, + "step": 4731 + }, + { + "epoch": 2.463300364393545, + "grad_norm": 0.22553481026955677, + "learning_rate": 4.0805502458615725e-06, + "loss": 0.0441, + "step": 4732 + }, + { + "epoch": 2.4638209266007287, + "grad_norm": 0.2533883156131254, + "learning_rate": 4.07286054931455e-06, + "loss": 0.0473, + "step": 4733 + }, + { + "epoch": 2.4643414888079125, + "grad_norm": 0.23645871037063934, + "learning_rate": 4.065177462532027e-06, + "loss": 0.0453, + "step": 4734 + }, + { + "epoch": 2.4648620510150963, + "grad_norm": 0.2390469842852221, + "learning_rate": 4.057500987940688e-06, + "loss": 0.0467, + "step": 4735 + }, + { + "epoch": 2.46538261322228, + "grad_norm": 0.24040218439850058, + "learning_rate": 4.04983112796512e-06, + "loss": 0.0464, + "step": 4736 + }, + { + "epoch": 2.465903175429464, + "grad_norm": 0.2323783034515998, + "learning_rate": 4.04216788502782e-06, + "loss": 0.0466, + "step": 4737 + }, + { + "epoch": 2.4664237376366476, + "grad_norm": 0.2421242614235067, + "learning_rate": 4.03451126154919e-06, + "loss": 0.0457, + "step": 4738 + }, + { + "epoch": 2.4669442998438313, + "grad_norm": 0.2241947463119316, + "learning_rate": 4.0268612599475534e-06, + "loss": 0.0442, + "step": 4739 + }, + { + "epoch": 2.467464862051015, + "grad_norm": 0.23390631221316124, + "learning_rate": 4.019217882639137e-06, + "loss": 0.0459, + "step": 4740 + }, + { + "epoch": 2.467985424258199, + "grad_norm": 0.23604571830215357, + "learning_rate": 4.011581132038078e-06, + "loss": 0.046, + "step": 4741 + }, + { + "epoch": 2.4685059864653827, + "grad_norm": 0.23679414599656148, + "learning_rate": 4.003951010556412e-06, + "loss": 0.0477, + "step": 4742 + }, + { + "epoch": 2.4690265486725664, + "grad_norm": 0.22466780147244902, + "learning_rate": 3.996327520604087e-06, + "loss": 0.0453, + "step": 4743 + }, + { + "epoch": 2.46954711087975, + "grad_norm": 0.23183844848666738, + "learning_rate": 3.9887106645889574e-06, + "loss": 0.0458, + "step": 4744 + }, + { + "epoch": 2.470067673086934, + "grad_norm": 0.2258352174710472, + "learning_rate": 3.981100444916788e-06, + "loss": 0.0446, + "step": 4745 + }, + { + "epoch": 2.4705882352941178, + "grad_norm": 0.23149767246195083, + "learning_rate": 3.973496863991233e-06, + "loss": 0.0447, + "step": 4746 + }, + { + "epoch": 2.4711087975013015, + "grad_norm": 0.23290847900975084, + "learning_rate": 3.965899924213851e-06, + "loss": 0.0446, + "step": 4747 + }, + { + "epoch": 2.4716293597084853, + "grad_norm": 0.2249778446324839, + "learning_rate": 3.958309627984116e-06, + "loss": 0.045, + "step": 4748 + }, + { + "epoch": 2.472149921915669, + "grad_norm": 0.21861880310195234, + "learning_rate": 3.950725977699396e-06, + "loss": 0.0443, + "step": 4749 + }, + { + "epoch": 2.472670484122853, + "grad_norm": 0.23380723096798692, + "learning_rate": 3.943148975754968e-06, + "loss": 0.0456, + "step": 4750 + }, + { + "epoch": 2.4731910463300366, + "grad_norm": 0.25812439912963125, + "learning_rate": 3.9355786245439896e-06, + "loss": 0.0492, + "step": 4751 + }, + { + "epoch": 2.4737116085372204, + "grad_norm": 0.2346925416518511, + "learning_rate": 3.928014926457532e-06, + "loss": 0.0469, + "step": 4752 + }, + { + "epoch": 2.474232170744404, + "grad_norm": 0.2324299060972178, + "learning_rate": 3.920457883884571e-06, + "loss": 0.0469, + "step": 4753 + }, + { + "epoch": 2.474752732951588, + "grad_norm": 0.22340130582451653, + "learning_rate": 3.9129074992119705e-06, + "loss": 0.0452, + "step": 4754 + }, + { + "epoch": 2.4752732951587717, + "grad_norm": 0.23185760003678735, + "learning_rate": 3.905363774824492e-06, + "loss": 0.0461, + "step": 4755 + }, + { + "epoch": 2.4757938573659555, + "grad_norm": 0.22369373177962135, + "learning_rate": 3.897826713104786e-06, + "loss": 0.0429, + "step": 4756 + }, + { + "epoch": 2.476314419573139, + "grad_norm": 0.22152244736340082, + "learning_rate": 3.8902963164334145e-06, + "loss": 0.0454, + "step": 4757 + }, + { + "epoch": 2.4768349817803226, + "grad_norm": 0.2288794662126069, + "learning_rate": 3.882772587188827e-06, + "loss": 0.044, + "step": 4758 + }, + { + "epoch": 2.4773555439875063, + "grad_norm": 0.22786850218271076, + "learning_rate": 3.875255527747376e-06, + "loss": 0.0455, + "step": 4759 + }, + { + "epoch": 2.47787610619469, + "grad_norm": 0.24127063199154952, + "learning_rate": 3.867745140483281e-06, + "loss": 0.0494, + "step": 4760 + }, + { + "epoch": 2.478396668401874, + "grad_norm": 0.22651816469440886, + "learning_rate": 3.860241427768682e-06, + "loss": 0.0474, + "step": 4761 + }, + { + "epoch": 2.4789172306090577, + "grad_norm": 0.23292917754764078, + "learning_rate": 3.852744391973601e-06, + "loss": 0.0453, + "step": 4762 + }, + { + "epoch": 2.4794377928162414, + "grad_norm": 0.23481848280563425, + "learning_rate": 3.845254035465951e-06, + "loss": 0.0462, + "step": 4763 + }, + { + "epoch": 2.479958355023425, + "grad_norm": 0.23409115715668288, + "learning_rate": 3.8377703606115425e-06, + "loss": 0.0456, + "step": 4764 + }, + { + "epoch": 2.480478917230609, + "grad_norm": 0.22688970649832368, + "learning_rate": 3.830293369774049e-06, + "loss": 0.046, + "step": 4765 + }, + { + "epoch": 2.4809994794377928, + "grad_norm": 0.24911131431251246, + "learning_rate": 3.822823065315062e-06, + "loss": 0.0477, + "step": 4766 + }, + { + "epoch": 2.4815200416449765, + "grad_norm": 0.23613842700910898, + "learning_rate": 3.815359449594053e-06, + "loss": 0.0464, + "step": 4767 + }, + { + "epoch": 2.4820406038521603, + "grad_norm": 0.22630425395246193, + "learning_rate": 3.8079025249683766e-06, + "loss": 0.0454, + "step": 4768 + }, + { + "epoch": 2.482561166059344, + "grad_norm": 0.23834407534931806, + "learning_rate": 3.800452293793283e-06, + "loss": 0.0474, + "step": 4769 + }, + { + "epoch": 2.483081728266528, + "grad_norm": 0.22299016955003192, + "learning_rate": 3.7930087584218924e-06, + "loss": 0.0447, + "step": 4770 + }, + { + "epoch": 2.4836022904737116, + "grad_norm": 0.22180738154042506, + "learning_rate": 3.785571921205225e-06, + "loss": 0.0444, + "step": 4771 + }, + { + "epoch": 2.4841228526808954, + "grad_norm": 0.23606413057780049, + "learning_rate": 3.7781417844921785e-06, + "loss": 0.0462, + "step": 4772 + }, + { + "epoch": 2.484643414888079, + "grad_norm": 0.23226564075490944, + "learning_rate": 3.770718350629543e-06, + "loss": 0.0471, + "step": 4773 + }, + { + "epoch": 2.485163977095263, + "grad_norm": 0.222173152299585, + "learning_rate": 3.7633016219619786e-06, + "loss": 0.044, + "step": 4774 + }, + { + "epoch": 2.4856845393024467, + "grad_norm": 0.22670387438422338, + "learning_rate": 3.755891600832026e-06, + "loss": 0.0451, + "step": 4775 + }, + { + "epoch": 2.4862051015096305, + "grad_norm": 0.24457041056832837, + "learning_rate": 3.748488289580124e-06, + "loss": 0.0471, + "step": 4776 + }, + { + "epoch": 2.4867256637168142, + "grad_norm": 0.2292671621830906, + "learning_rate": 3.7410916905445763e-06, + "loss": 0.0461, + "step": 4777 + }, + { + "epoch": 2.487246225923998, + "grad_norm": 0.2319982917968305, + "learning_rate": 3.7337018060615847e-06, + "loss": 0.0455, + "step": 4778 + }, + { + "epoch": 2.487766788131182, + "grad_norm": 0.22435841846525945, + "learning_rate": 3.7263186384652064e-06, + "loss": 0.043, + "step": 4779 + }, + { + "epoch": 2.4882873503383656, + "grad_norm": 0.22949123049534018, + "learning_rate": 3.7189421900873905e-06, + "loss": 0.045, + "step": 4780 + }, + { + "epoch": 2.4888079125455493, + "grad_norm": 0.23210355715087153, + "learning_rate": 3.71157246325797e-06, + "loss": 0.0441, + "step": 4781 + }, + { + "epoch": 2.489328474752733, + "grad_norm": 0.2260444385170093, + "learning_rate": 3.7042094603046473e-06, + "loss": 0.0436, + "step": 4782 + }, + { + "epoch": 2.489849036959917, + "grad_norm": 0.2293597694356391, + "learning_rate": 3.696853183552998e-06, + "loss": 0.0453, + "step": 4783 + }, + { + "epoch": 2.4903695991671007, + "grad_norm": 0.22728917524183118, + "learning_rate": 3.6895036353264716e-06, + "loss": 0.0473, + "step": 4784 + }, + { + "epoch": 2.490890161374284, + "grad_norm": 0.2258597144539425, + "learning_rate": 3.6821608179464006e-06, + "loss": 0.0447, + "step": 4785 + }, + { + "epoch": 2.4914107235814678, + "grad_norm": 0.23529432877833512, + "learning_rate": 3.674824733731991e-06, + "loss": 0.0454, + "step": 4786 + }, + { + "epoch": 2.4919312857886515, + "grad_norm": 0.2276587426156354, + "learning_rate": 3.6674953850003245e-06, + "loss": 0.0457, + "step": 4787 + }, + { + "epoch": 2.4924518479958353, + "grad_norm": 0.23595370854468917, + "learning_rate": 3.6601727740663395e-06, + "loss": 0.0466, + "step": 4788 + }, + { + "epoch": 2.492972410203019, + "grad_norm": 0.23081012893994207, + "learning_rate": 3.652856903242863e-06, + "loss": 0.0437, + "step": 4789 + }, + { + "epoch": 2.493492972410203, + "grad_norm": 0.23017085621336414, + "learning_rate": 3.6455477748405853e-06, + "loss": 0.0445, + "step": 4790 + }, + { + "epoch": 2.4940135346173866, + "grad_norm": 0.23217068455139228, + "learning_rate": 3.638245391168077e-06, + "loss": 0.0476, + "step": 4791 + }, + { + "epoch": 2.4945340968245704, + "grad_norm": 0.2224420779312252, + "learning_rate": 3.630949754531765e-06, + "loss": 0.0455, + "step": 4792 + }, + { + "epoch": 2.495054659031754, + "grad_norm": 0.2332204491629984, + "learning_rate": 3.623660867235945e-06, + "loss": 0.0455, + "step": 4793 + }, + { + "epoch": 2.495575221238938, + "grad_norm": 0.2302258850640075, + "learning_rate": 3.6163787315827894e-06, + "loss": 0.0461, + "step": 4794 + }, + { + "epoch": 2.4960957834461217, + "grad_norm": 0.23170180876721022, + "learning_rate": 3.609103349872342e-06, + "loss": 0.0462, + "step": 4795 + }, + { + "epoch": 2.4966163456533055, + "grad_norm": 0.22548808881382684, + "learning_rate": 3.6018347244025085e-06, + "loss": 0.0456, + "step": 4796 + }, + { + "epoch": 2.4971369078604893, + "grad_norm": 0.22318432696419793, + "learning_rate": 3.5945728574690474e-06, + "loss": 0.0451, + "step": 4797 + }, + { + "epoch": 2.497657470067673, + "grad_norm": 0.2328804121095348, + "learning_rate": 3.5873177513655985e-06, + "loss": 0.0465, + "step": 4798 + }, + { + "epoch": 2.498178032274857, + "grad_norm": 0.2362488676580557, + "learning_rate": 3.5800694083836685e-06, + "loss": 0.0458, + "step": 4799 + }, + { + "epoch": 2.4986985944820406, + "grad_norm": 0.22505683785956507, + "learning_rate": 3.5728278308126195e-06, + "loss": 0.0445, + "step": 4800 + }, + { + "epoch": 2.4992191566892243, + "grad_norm": 0.23832222888676316, + "learning_rate": 3.5655930209396783e-06, + "loss": 0.0451, + "step": 4801 + }, + { + "epoch": 2.499739718896408, + "grad_norm": 0.23018716395069688, + "learning_rate": 3.5583649810499246e-06, + "loss": 0.0442, + "step": 4802 + }, + { + "epoch": 2.500260281103592, + "grad_norm": 0.22556770408037974, + "learning_rate": 3.551143713426319e-06, + "loss": 0.0438, + "step": 4803 + }, + { + "epoch": 2.5007808433107757, + "grad_norm": 0.22278746289847356, + "learning_rate": 3.543929220349673e-06, + "loss": 0.0452, + "step": 4804 + }, + { + "epoch": 2.5013014055179594, + "grad_norm": 0.22592179865196593, + "learning_rate": 3.536721504098664e-06, + "loss": 0.0454, + "step": 4805 + }, + { + "epoch": 2.501821967725143, + "grad_norm": 0.2191833418264602, + "learning_rate": 3.529520566949812e-06, + "loss": 0.0429, + "step": 4806 + }, + { + "epoch": 2.502342529932327, + "grad_norm": 0.23567814923196181, + "learning_rate": 3.522326411177515e-06, + "loss": 0.0455, + "step": 4807 + }, + { + "epoch": 2.5028630921395107, + "grad_norm": 0.2329494968156347, + "learning_rate": 3.5151390390540245e-06, + "loss": 0.0456, + "step": 4808 + }, + { + "epoch": 2.5033836543466945, + "grad_norm": 0.23201928458314705, + "learning_rate": 3.5079584528494497e-06, + "loss": 0.0453, + "step": 4809 + }, + { + "epoch": 2.5039042165538783, + "grad_norm": 0.22257219879809637, + "learning_rate": 3.5007846548317487e-06, + "loss": 0.0432, + "step": 4810 + }, + { + "epoch": 2.504424778761062, + "grad_norm": 0.2357881279026739, + "learning_rate": 3.4936176472667337e-06, + "loss": 0.0455, + "step": 4811 + }, + { + "epoch": 2.504945340968246, + "grad_norm": 0.22156754335142384, + "learning_rate": 3.486457432418089e-06, + "loss": 0.0439, + "step": 4812 + }, + { + "epoch": 2.5054659031754296, + "grad_norm": 0.222223788188508, + "learning_rate": 3.479304012547338e-06, + "loss": 0.0437, + "step": 4813 + }, + { + "epoch": 2.5059864653826134, + "grad_norm": 0.23233184259143652, + "learning_rate": 3.4721573899138743e-06, + "loss": 0.045, + "step": 4814 + }, + { + "epoch": 2.506507027589797, + "grad_norm": 0.23772916739161976, + "learning_rate": 3.4650175667749223e-06, + "loss": 0.0466, + "step": 4815 + }, + { + "epoch": 2.507027589796981, + "grad_norm": 0.2380129235798541, + "learning_rate": 3.457884545385573e-06, + "loss": 0.0463, + "step": 4816 + }, + { + "epoch": 2.5075481520041647, + "grad_norm": 0.2428567210468212, + "learning_rate": 3.450758327998768e-06, + "loss": 0.0455, + "step": 4817 + }, + { + "epoch": 2.5080687142113485, + "grad_norm": 0.24116904429043992, + "learning_rate": 3.4436389168653023e-06, + "loss": 0.0441, + "step": 4818 + }, + { + "epoch": 2.5085892764185322, + "grad_norm": 0.2311244048249909, + "learning_rate": 3.436526314233815e-06, + "loss": 0.0459, + "step": 4819 + }, + { + "epoch": 2.509109838625716, + "grad_norm": 0.22619706111589447, + "learning_rate": 3.42942052235079e-06, + "loss": 0.0452, + "step": 4820 + }, + { + "epoch": 2.5096304008329, + "grad_norm": 0.22462384968951704, + "learning_rate": 3.4223215434605714e-06, + "loss": 0.0437, + "step": 4821 + }, + { + "epoch": 2.5101509630400836, + "grad_norm": 0.23196960288458288, + "learning_rate": 3.4152293798053486e-06, + "loss": 0.046, + "step": 4822 + }, + { + "epoch": 2.510671525247267, + "grad_norm": 0.22816810301577573, + "learning_rate": 3.408144033625163e-06, + "loss": 0.0449, + "step": 4823 + }, + { + "epoch": 2.5111920874544507, + "grad_norm": 0.22363210681026655, + "learning_rate": 3.401065507157883e-06, + "loss": 0.0426, + "step": 4824 + }, + { + "epoch": 2.5117126496616344, + "grad_norm": 0.22254268395086219, + "learning_rate": 3.393993802639245e-06, + "loss": 0.0445, + "step": 4825 + }, + { + "epoch": 2.512233211868818, + "grad_norm": 0.222987753608597, + "learning_rate": 3.3869289223028204e-06, + "loss": 0.0444, + "step": 4826 + }, + { + "epoch": 2.512753774076002, + "grad_norm": 0.24197883530585623, + "learning_rate": 3.3798708683800305e-06, + "loss": 0.0446, + "step": 4827 + }, + { + "epoch": 2.5132743362831858, + "grad_norm": 0.236727669006638, + "learning_rate": 3.372819643100139e-06, + "loss": 0.0467, + "step": 4828 + }, + { + "epoch": 2.5137948984903695, + "grad_norm": 0.24252487797178882, + "learning_rate": 3.3657752486902396e-06, + "loss": 0.046, + "step": 4829 + }, + { + "epoch": 2.5143154606975533, + "grad_norm": 0.23031235971733746, + "learning_rate": 3.3587376873752853e-06, + "loss": 0.0456, + "step": 4830 + }, + { + "epoch": 2.514836022904737, + "grad_norm": 0.22614796596777614, + "learning_rate": 3.351706961378068e-06, + "loss": 0.0439, + "step": 4831 + }, + { + "epoch": 2.515356585111921, + "grad_norm": 0.22456575523764083, + "learning_rate": 3.344683072919216e-06, + "loss": 0.0451, + "step": 4832 + }, + { + "epoch": 2.5158771473191046, + "grad_norm": 0.22971764854819687, + "learning_rate": 3.337666024217209e-06, + "loss": 0.0457, + "step": 4833 + }, + { + "epoch": 2.5163977095262884, + "grad_norm": 0.23225244680033916, + "learning_rate": 3.3306558174883427e-06, + "loss": 0.0469, + "step": 4834 + }, + { + "epoch": 2.516918271733472, + "grad_norm": 0.22408742635883183, + "learning_rate": 3.323652454946774e-06, + "loss": 0.0453, + "step": 4835 + }, + { + "epoch": 2.517438833940656, + "grad_norm": 0.22183055756851636, + "learning_rate": 3.3166559388044945e-06, + "loss": 0.0434, + "step": 4836 + }, + { + "epoch": 2.5179593961478397, + "grad_norm": 0.2260016674787736, + "learning_rate": 3.3096662712713224e-06, + "loss": 0.0448, + "step": 4837 + }, + { + "epoch": 2.5184799583550235, + "grad_norm": 0.2301369168176432, + "learning_rate": 3.3026834545549252e-06, + "loss": 0.0448, + "step": 4838 + }, + { + "epoch": 2.5190005205622072, + "grad_norm": 0.21883186300554672, + "learning_rate": 3.295707490860797e-06, + "loss": 0.0436, + "step": 4839 + }, + { + "epoch": 2.519521082769391, + "grad_norm": 0.22202859390787064, + "learning_rate": 3.288738382392273e-06, + "loss": 0.0449, + "step": 4840 + }, + { + "epoch": 2.520041644976575, + "grad_norm": 0.22761610071343236, + "learning_rate": 3.2817761313505226e-06, + "loss": 0.0468, + "step": 4841 + }, + { + "epoch": 2.5205622071837586, + "grad_norm": 0.22523059843548607, + "learning_rate": 3.2748207399345534e-06, + "loss": 0.0431, + "step": 4842 + }, + { + "epoch": 2.5210827693909423, + "grad_norm": 0.22572775303764772, + "learning_rate": 3.267872210341194e-06, + "loss": 0.0445, + "step": 4843 + }, + { + "epoch": 2.521603331598126, + "grad_norm": 0.23141678410475952, + "learning_rate": 3.2609305447651145e-06, + "loss": 0.0473, + "step": 4844 + }, + { + "epoch": 2.52212389380531, + "grad_norm": 0.4886518611240305, + "learning_rate": 3.2539957453988244e-06, + "loss": 0.0473, + "step": 4845 + }, + { + "epoch": 2.5226444560124937, + "grad_norm": 0.23911782956874156, + "learning_rate": 3.2470678144326442e-06, + "loss": 0.0469, + "step": 4846 + }, + { + "epoch": 2.523165018219677, + "grad_norm": 0.22949597789881793, + "learning_rate": 3.2401467540547485e-06, + "loss": 0.044, + "step": 4847 + }, + { + "epoch": 2.5236855804268608, + "grad_norm": 0.22907687990668468, + "learning_rate": 3.233232566451119e-06, + "loss": 0.0448, + "step": 4848 + }, + { + "epoch": 2.5242061426340445, + "grad_norm": 0.23226555702427493, + "learning_rate": 3.2263252538055816e-06, + "loss": 0.0457, + "step": 4849 + }, + { + "epoch": 2.5247267048412283, + "grad_norm": 0.22719820874022031, + "learning_rate": 3.2194248182997904e-06, + "loss": 0.0446, + "step": 4850 + }, + { + "epoch": 2.525247267048412, + "grad_norm": 0.2207201168091615, + "learning_rate": 3.2125312621132274e-06, + "loss": 0.0455, + "step": 4851 + }, + { + "epoch": 2.525767829255596, + "grad_norm": 0.21985764504983096, + "learning_rate": 3.2056445874231873e-06, + "loss": 0.0449, + "step": 4852 + }, + { + "epoch": 2.5262883914627796, + "grad_norm": 0.22836848944195356, + "learning_rate": 3.198764796404807e-06, + "loss": 0.0449, + "step": 4853 + }, + { + "epoch": 2.5268089536699634, + "grad_norm": 0.24033077551127982, + "learning_rate": 3.191891891231055e-06, + "loss": 0.0457, + "step": 4854 + }, + { + "epoch": 2.527329515877147, + "grad_norm": 0.2412658133769983, + "learning_rate": 3.1850258740726975e-06, + "loss": 0.0458, + "step": 4855 + }, + { + "epoch": 2.527850078084331, + "grad_norm": 0.22859774083041914, + "learning_rate": 3.178166747098357e-06, + "loss": 0.0436, + "step": 4856 + }, + { + "epoch": 2.5283706402915147, + "grad_norm": 0.23096275204448605, + "learning_rate": 3.171314512474452e-06, + "loss": 0.0458, + "step": 4857 + }, + { + "epoch": 2.5288912024986985, + "grad_norm": 0.22706448675029148, + "learning_rate": 3.1644691723652448e-06, + "loss": 0.0437, + "step": 4858 + }, + { + "epoch": 2.5294117647058822, + "grad_norm": 0.2378141750228416, + "learning_rate": 3.1576307289328117e-06, + "loss": 0.0483, + "step": 4859 + }, + { + "epoch": 2.529932326913066, + "grad_norm": 0.2289339944308606, + "learning_rate": 3.1507991843370526e-06, + "loss": 0.0437, + "step": 4860 + }, + { + "epoch": 2.53045288912025, + "grad_norm": 0.23778111816368155, + "learning_rate": 3.1439745407356835e-06, + "loss": 0.0471, + "step": 4861 + }, + { + "epoch": 2.5309734513274336, + "grad_norm": 0.2176967688986415, + "learning_rate": 3.1371568002842437e-06, + "loss": 0.0434, + "step": 4862 + }, + { + "epoch": 2.5314940135346173, + "grad_norm": 0.22833640651862924, + "learning_rate": 3.1303459651361026e-06, + "loss": 0.0459, + "step": 4863 + }, + { + "epoch": 2.532014575741801, + "grad_norm": 0.23296274792690516, + "learning_rate": 3.123542037442426e-06, + "loss": 0.0448, + "step": 4864 + }, + { + "epoch": 2.532535137948985, + "grad_norm": 0.23036570209219906, + "learning_rate": 3.1167450193522214e-06, + "loss": 0.0456, + "step": 4865 + }, + { + "epoch": 2.5330557001561687, + "grad_norm": 0.22673874717998874, + "learning_rate": 3.1099549130122944e-06, + "loss": 0.0444, + "step": 4866 + }, + { + "epoch": 2.5335762623633524, + "grad_norm": 0.23336621523932544, + "learning_rate": 3.10317172056728e-06, + "loss": 0.0459, + "step": 4867 + }, + { + "epoch": 2.534096824570536, + "grad_norm": 0.22381256494910945, + "learning_rate": 3.0963954441596277e-06, + "loss": 0.0426, + "step": 4868 + }, + { + "epoch": 2.53461738677772, + "grad_norm": 0.22193048669411228, + "learning_rate": 3.0896260859296035e-06, + "loss": 0.0428, + "step": 4869 + }, + { + "epoch": 2.5351379489849037, + "grad_norm": 0.233380501372848, + "learning_rate": 3.082863648015277e-06, + "loss": 0.0448, + "step": 4870 + }, + { + "epoch": 2.5356585111920875, + "grad_norm": 0.24772658711823495, + "learning_rate": 3.076108132552549e-06, + "loss": 0.0464, + "step": 4871 + }, + { + "epoch": 2.5361790733992713, + "grad_norm": 0.2333756921366926, + "learning_rate": 3.0693595416751207e-06, + "loss": 0.0449, + "step": 4872 + }, + { + "epoch": 2.536699635606455, + "grad_norm": 0.22180824102144125, + "learning_rate": 3.0626178775145175e-06, + "loss": 0.0445, + "step": 4873 + }, + { + "epoch": 2.537220197813639, + "grad_norm": 0.23966984007720837, + "learning_rate": 3.0558831422000695e-06, + "loss": 0.0465, + "step": 4874 + }, + { + "epoch": 2.5377407600208226, + "grad_norm": 0.23454749585264367, + "learning_rate": 3.0491553378589084e-06, + "loss": 0.0446, + "step": 4875 + }, + { + "epoch": 2.5382613222280064, + "grad_norm": 0.24426571149502913, + "learning_rate": 3.042434466615998e-06, + "loss": 0.0478, + "step": 4876 + }, + { + "epoch": 2.53878188443519, + "grad_norm": 0.2350973499571893, + "learning_rate": 3.0357205305940993e-06, + "loss": 0.0436, + "step": 4877 + }, + { + "epoch": 2.539302446642374, + "grad_norm": 0.2326275497622743, + "learning_rate": 3.0290135319137908e-06, + "loss": 0.0465, + "step": 4878 + }, + { + "epoch": 2.5398230088495577, + "grad_norm": 0.2281455888156863, + "learning_rate": 3.0223134726934472e-06, + "loss": 0.043, + "step": 4879 + }, + { + "epoch": 2.5403435710567415, + "grad_norm": 0.24217000636895653, + "learning_rate": 3.015620355049262e-06, + "loss": 0.0473, + "step": 4880 + }, + { + "epoch": 2.5408641332639252, + "grad_norm": 0.23048820939944398, + "learning_rate": 3.0089341810952327e-06, + "loss": 0.0443, + "step": 4881 + }, + { + "epoch": 2.541384695471109, + "grad_norm": 0.22378119144167463, + "learning_rate": 3.0022549529431704e-06, + "loss": 0.0438, + "step": 4882 + }, + { + "epoch": 2.541905257678293, + "grad_norm": 0.2296752533850872, + "learning_rate": 2.995582672702679e-06, + "loss": 0.0452, + "step": 4883 + }, + { + "epoch": 2.5424258198854766, + "grad_norm": 0.2243414666365788, + "learning_rate": 2.9889173424811735e-06, + "loss": 0.0442, + "step": 4884 + }, + { + "epoch": 2.5429463820926603, + "grad_norm": 0.2304455679727754, + "learning_rate": 2.98225896438388e-06, + "loss": 0.0443, + "step": 4885 + }, + { + "epoch": 2.543466944299844, + "grad_norm": 0.23018490529524177, + "learning_rate": 2.9756075405138222e-06, + "loss": 0.0454, + "step": 4886 + }, + { + "epoch": 2.5439875065070274, + "grad_norm": 0.24131348157714574, + "learning_rate": 2.9689630729718337e-06, + "loss": 0.0454, + "step": 4887 + }, + { + "epoch": 2.544508068714211, + "grad_norm": 0.21657168953862416, + "learning_rate": 2.9623255638565384e-06, + "loss": 0.0437, + "step": 4888 + }, + { + "epoch": 2.545028630921395, + "grad_norm": 0.23781682691026923, + "learning_rate": 2.9556950152643757e-06, + "loss": 0.0456, + "step": 4889 + }, + { + "epoch": 2.5455491931285787, + "grad_norm": 0.23395997912589891, + "learning_rate": 2.9490714292895822e-06, + "loss": 0.0451, + "step": 4890 + }, + { + "epoch": 2.5460697553357625, + "grad_norm": 0.2439448653848847, + "learning_rate": 2.9424548080241978e-06, + "loss": 0.0458, + "step": 4891 + }, + { + "epoch": 2.5465903175429463, + "grad_norm": 0.2184786715073653, + "learning_rate": 2.9358451535580534e-06, + "loss": 0.043, + "step": 4892 + }, + { + "epoch": 2.54711087975013, + "grad_norm": 0.22699072515393065, + "learning_rate": 2.9292424679787824e-06, + "loss": 0.0438, + "step": 4893 + }, + { + "epoch": 2.547631441957314, + "grad_norm": 0.23484480726234366, + "learning_rate": 2.9226467533718244e-06, + "loss": 0.0445, + "step": 4894 + }, + { + "epoch": 2.5481520041644976, + "grad_norm": 0.23093997831111027, + "learning_rate": 2.9160580118204104e-06, + "loss": 0.0452, + "step": 4895 + }, + { + "epoch": 2.5486725663716814, + "grad_norm": 0.23405936670463198, + "learning_rate": 2.9094762454055847e-06, + "loss": 0.0452, + "step": 4896 + }, + { + "epoch": 2.549193128578865, + "grad_norm": 0.23354436325740416, + "learning_rate": 2.902901456206156e-06, + "loss": 0.0452, + "step": 4897 + }, + { + "epoch": 2.549713690786049, + "grad_norm": 0.25244543484114845, + "learning_rate": 2.89633364629876e-06, + "loss": 0.0492, + "step": 4898 + }, + { + "epoch": 2.5502342529932327, + "grad_norm": 0.22617895203553662, + "learning_rate": 2.889772817757813e-06, + "loss": 0.0448, + "step": 4899 + }, + { + "epoch": 2.5507548152004165, + "grad_norm": 0.23586461110043583, + "learning_rate": 2.8832189726555383e-06, + "loss": 0.0461, + "step": 4900 + }, + { + "epoch": 2.5512753774076002, + "grad_norm": 0.22350522590122324, + "learning_rate": 2.8766721130619315e-06, + "loss": 0.0464, + "step": 4901 + }, + { + "epoch": 2.551795939614784, + "grad_norm": 0.23067382345830922, + "learning_rate": 2.8701322410448095e-06, + "loss": 0.0446, + "step": 4902 + }, + { + "epoch": 2.552316501821968, + "grad_norm": 0.23541409906186989, + "learning_rate": 2.8635993586697553e-06, + "loss": 0.0455, + "step": 4903 + }, + { + "epoch": 2.5528370640291516, + "grad_norm": 0.2196421849316751, + "learning_rate": 2.8570734680001627e-06, + "loss": 0.0417, + "step": 4904 + }, + { + "epoch": 2.5533576262363353, + "grad_norm": 0.21999629966823, + "learning_rate": 2.850554571097211e-06, + "loss": 0.0435, + "step": 4905 + }, + { + "epoch": 2.553878188443519, + "grad_norm": 0.22111791678119153, + "learning_rate": 2.844042670019878e-06, + "loss": 0.0446, + "step": 4906 + }, + { + "epoch": 2.554398750650703, + "grad_norm": 0.23774001294637379, + "learning_rate": 2.837537766824913e-06, + "loss": 0.0455, + "step": 4907 + }, + { + "epoch": 2.5549193128578866, + "grad_norm": 0.22855614328754512, + "learning_rate": 2.8310398635668755e-06, + "loss": 0.0449, + "step": 4908 + }, + { + "epoch": 2.5554398750650704, + "grad_norm": 0.22786759432794976, + "learning_rate": 2.824548962298107e-06, + "loss": 0.0436, + "step": 4909 + }, + { + "epoch": 2.555960437272254, + "grad_norm": 0.2217342917078937, + "learning_rate": 2.8180650650687287e-06, + "loss": 0.0432, + "step": 4910 + }, + { + "epoch": 2.5564809994794375, + "grad_norm": 0.2337668784988823, + "learning_rate": 2.811588173926666e-06, + "loss": 0.0463, + "step": 4911 + }, + { + "epoch": 2.5570015616866213, + "grad_norm": 0.23907810436310875, + "learning_rate": 2.8051182909176133e-06, + "loss": 0.0467, + "step": 4912 + }, + { + "epoch": 2.557522123893805, + "grad_norm": 0.23815342311596505, + "learning_rate": 2.7986554180850665e-06, + "loss": 0.0464, + "step": 4913 + }, + { + "epoch": 2.558042686100989, + "grad_norm": 0.2310077217504325, + "learning_rate": 2.7921995574702986e-06, + "loss": 0.0449, + "step": 4914 + }, + { + "epoch": 2.5585632483081726, + "grad_norm": 0.22853140276683548, + "learning_rate": 2.7857507111123755e-06, + "loss": 0.0456, + "step": 4915 + }, + { + "epoch": 2.5590838105153564, + "grad_norm": 0.24059012312722752, + "learning_rate": 2.779308881048137e-06, + "loss": 0.0431, + "step": 4916 + }, + { + "epoch": 2.55960437272254, + "grad_norm": 0.24488720395307423, + "learning_rate": 2.7728740693122147e-06, + "loss": 0.0471, + "step": 4917 + }, + { + "epoch": 2.560124934929724, + "grad_norm": 0.22806652160138638, + "learning_rate": 2.7664462779370293e-06, + "loss": 0.0449, + "step": 4918 + }, + { + "epoch": 2.5606454971369077, + "grad_norm": 0.23288075306841058, + "learning_rate": 2.7600255089527626e-06, + "loss": 0.0447, + "step": 4919 + }, + { + "epoch": 2.5611660593440915, + "grad_norm": 0.23782264448640814, + "learning_rate": 2.7536117643874067e-06, + "loss": 0.0473, + "step": 4920 + }, + { + "epoch": 2.5616866215512752, + "grad_norm": 0.23345197126927117, + "learning_rate": 2.747205046266707e-06, + "loss": 0.0452, + "step": 4921 + }, + { + "epoch": 2.562207183758459, + "grad_norm": 0.22295930818431847, + "learning_rate": 2.7408053566142124e-06, + "loss": 0.0453, + "step": 4922 + }, + { + "epoch": 2.562727745965643, + "grad_norm": 0.2400982174167174, + "learning_rate": 2.734412697451236e-06, + "loss": 0.0448, + "step": 4923 + }, + { + "epoch": 2.5632483081728266, + "grad_norm": 0.2226346510144299, + "learning_rate": 2.7280270707968874e-06, + "loss": 0.0413, + "step": 4924 + }, + { + "epoch": 2.5637688703800103, + "grad_norm": 0.23435826768260656, + "learning_rate": 2.721648478668032e-06, + "loss": 0.044, + "step": 4925 + }, + { + "epoch": 2.564289432587194, + "grad_norm": 0.22124720780327345, + "learning_rate": 2.715276923079335e-06, + "loss": 0.0441, + "step": 4926 + }, + { + "epoch": 2.564809994794378, + "grad_norm": 0.23111451493845334, + "learning_rate": 2.708912406043229e-06, + "loss": 0.0443, + "step": 4927 + }, + { + "epoch": 2.5653305570015617, + "grad_norm": 0.22812981752484784, + "learning_rate": 2.70255492956992e-06, + "loss": 0.0437, + "step": 4928 + }, + { + "epoch": 2.5658511192087454, + "grad_norm": 0.2242696338157624, + "learning_rate": 2.6962044956674035e-06, + "loss": 0.044, + "step": 4929 + }, + { + "epoch": 2.566371681415929, + "grad_norm": 0.2526958000740198, + "learning_rate": 2.689861106341432e-06, + "loss": 0.0468, + "step": 4930 + }, + { + "epoch": 2.566892243623113, + "grad_norm": 0.2275462053374598, + "learning_rate": 2.6835247635955463e-06, + "loss": 0.0447, + "step": 4931 + }, + { + "epoch": 2.5674128058302967, + "grad_norm": 0.2237994819214662, + "learning_rate": 2.6771954694310597e-06, + "loss": 0.0438, + "step": 4932 + }, + { + "epoch": 2.5679333680374805, + "grad_norm": 0.22406574776804847, + "learning_rate": 2.670873225847062e-06, + "loss": 0.0435, + "step": 4933 + }, + { + "epoch": 2.5684539302446643, + "grad_norm": 0.2270917489269596, + "learning_rate": 2.664558034840403e-06, + "loss": 0.0447, + "step": 4934 + }, + { + "epoch": 2.568974492451848, + "grad_norm": 0.23089342107757108, + "learning_rate": 2.658249898405718e-06, + "loss": 0.0456, + "step": 4935 + }, + { + "epoch": 2.569495054659032, + "grad_norm": 0.2313393425399634, + "learning_rate": 2.6519488185354157e-06, + "loss": 0.0436, + "step": 4936 + }, + { + "epoch": 2.5700156168662156, + "grad_norm": 0.2221957268696646, + "learning_rate": 2.6456547972196625e-06, + "loss": 0.0443, + "step": 4937 + }, + { + "epoch": 2.5705361790733994, + "grad_norm": 0.23170888298200445, + "learning_rate": 2.6393678364464074e-06, + "loss": 0.0452, + "step": 4938 + }, + { + "epoch": 2.571056741280583, + "grad_norm": 0.2308739427020387, + "learning_rate": 2.6330879382013617e-06, + "loss": 0.0466, + "step": 4939 + }, + { + "epoch": 2.571577303487767, + "grad_norm": 0.23012613337882634, + "learning_rate": 2.6268151044680113e-06, + "loss": 0.0443, + "step": 4940 + }, + { + "epoch": 2.5720978656949507, + "grad_norm": 0.23042253316392428, + "learning_rate": 2.620549337227607e-06, + "loss": 0.0443, + "step": 4941 + }, + { + "epoch": 2.5726184279021345, + "grad_norm": 0.22649967085410116, + "learning_rate": 2.6142906384591798e-06, + "loss": 0.0443, + "step": 4942 + }, + { + "epoch": 2.5731389901093182, + "grad_norm": 0.22980513686165474, + "learning_rate": 2.6080390101395043e-06, + "loss": 0.0443, + "step": 4943 + }, + { + "epoch": 2.573659552316502, + "grad_norm": 0.22269196931932628, + "learning_rate": 2.6017944542431393e-06, + "loss": 0.043, + "step": 4944 + }, + { + "epoch": 2.574180114523686, + "grad_norm": 0.2220316442992769, + "learning_rate": 2.5955569727424163e-06, + "loss": 0.043, + "step": 4945 + }, + { + "epoch": 2.5747006767308696, + "grad_norm": 0.2382111826632995, + "learning_rate": 2.589326567607406e-06, + "loss": 0.045, + "step": 4946 + }, + { + "epoch": 2.5752212389380533, + "grad_norm": 0.2300585724875798, + "learning_rate": 2.5831032408059758e-06, + "loss": 0.0431, + "step": 4947 + }, + { + "epoch": 2.575741801145237, + "grad_norm": 0.2279139534194501, + "learning_rate": 2.576886994303729e-06, + "loss": 0.0461, + "step": 4948 + }, + { + "epoch": 2.576262363352421, + "grad_norm": 0.23023216030690563, + "learning_rate": 2.5706778300640527e-06, + "loss": 0.0454, + "step": 4949 + }, + { + "epoch": 2.5767829255596046, + "grad_norm": 0.22354558169408728, + "learning_rate": 2.564475750048087e-06, + "loss": 0.0452, + "step": 4950 + }, + { + "epoch": 2.577303487766788, + "grad_norm": 0.23125790378504177, + "learning_rate": 2.5582807562147455e-06, + "loss": 0.0448, + "step": 4951 + }, + { + "epoch": 2.5778240499739717, + "grad_norm": 0.22611942246092737, + "learning_rate": 2.552092850520682e-06, + "loss": 0.0445, + "step": 4952 + }, + { + "epoch": 2.5783446121811555, + "grad_norm": 0.2241937478852757, + "learning_rate": 2.545912034920331e-06, + "loss": 0.0434, + "step": 4953 + }, + { + "epoch": 2.5788651743883393, + "grad_norm": 0.228033785778967, + "learning_rate": 2.5397383113658883e-06, + "loss": 0.0461, + "step": 4954 + }, + { + "epoch": 2.579385736595523, + "grad_norm": 0.2248949405146001, + "learning_rate": 2.533571681807295e-06, + "loss": 0.0429, + "step": 4955 + }, + { + "epoch": 2.579906298802707, + "grad_norm": 0.231228519351519, + "learning_rate": 2.527412148192265e-06, + "loss": 0.0463, + "step": 4956 + }, + { + "epoch": 2.5804268610098906, + "grad_norm": 0.2171525044347602, + "learning_rate": 2.5212597124662564e-06, + "loss": 0.0439, + "step": 4957 + }, + { + "epoch": 2.5809474232170744, + "grad_norm": 0.23091260307400774, + "learning_rate": 2.5151143765725027e-06, + "loss": 0.046, + "step": 4958 + }, + { + "epoch": 2.581467985424258, + "grad_norm": 0.21624018139865478, + "learning_rate": 2.5089761424519853e-06, + "loss": 0.0422, + "step": 4959 + }, + { + "epoch": 2.581988547631442, + "grad_norm": 0.224287761302505, + "learning_rate": 2.5028450120434505e-06, + "loss": 0.0438, + "step": 4960 + }, + { + "epoch": 2.5825091098386257, + "grad_norm": 0.2339375258097004, + "learning_rate": 2.4967209872833822e-06, + "loss": 0.0449, + "step": 4961 + }, + { + "epoch": 2.5830296720458095, + "grad_norm": 0.2317454115663667, + "learning_rate": 2.4906040701060367e-06, + "loss": 0.0453, + "step": 4962 + }, + { + "epoch": 2.5835502342529932, + "grad_norm": 0.224028657841797, + "learning_rate": 2.484494262443429e-06, + "loss": 0.0427, + "step": 4963 + }, + { + "epoch": 2.584070796460177, + "grad_norm": 0.22363155592376474, + "learning_rate": 2.4783915662253103e-06, + "loss": 0.0432, + "step": 4964 + }, + { + "epoch": 2.584591358667361, + "grad_norm": 0.24028921492138963, + "learning_rate": 2.472295983379205e-06, + "loss": 0.0463, + "step": 4965 + }, + { + "epoch": 2.5851119208745446, + "grad_norm": 0.21789704784809844, + "learning_rate": 2.466207515830374e-06, + "loss": 0.043, + "step": 4966 + }, + { + "epoch": 2.5856324830817283, + "grad_norm": 0.23331471651872346, + "learning_rate": 2.4601261655018385e-06, + "loss": 0.045, + "step": 4967 + }, + { + "epoch": 2.586153045288912, + "grad_norm": 0.23970513589566356, + "learning_rate": 2.4540519343143774e-06, + "loss": 0.0456, + "step": 4968 + }, + { + "epoch": 2.586673607496096, + "grad_norm": 0.22417695780216113, + "learning_rate": 2.447984824186514e-06, + "loss": 0.043, + "step": 4969 + }, + { + "epoch": 2.5871941697032796, + "grad_norm": 0.22276549554035466, + "learning_rate": 2.441924837034529e-06, + "loss": 0.0423, + "step": 4970 + }, + { + "epoch": 2.5877147319104634, + "grad_norm": 0.22293938392333162, + "learning_rate": 2.4358719747724356e-06, + "loss": 0.0426, + "step": 4971 + }, + { + "epoch": 2.588235294117647, + "grad_norm": 0.23230080511957873, + "learning_rate": 2.429826239312022e-06, + "loss": 0.0465, + "step": 4972 + }, + { + "epoch": 2.588755856324831, + "grad_norm": 0.24276681968930813, + "learning_rate": 2.4237876325628017e-06, + "loss": 0.0445, + "step": 4973 + }, + { + "epoch": 2.5892764185320147, + "grad_norm": 0.22442049970967431, + "learning_rate": 2.417756156432055e-06, + "loss": 0.0432, + "step": 4974 + }, + { + "epoch": 2.589796980739198, + "grad_norm": 0.23419001522856595, + "learning_rate": 2.411731812824808e-06, + "loss": 0.0456, + "step": 4975 + }, + { + "epoch": 2.590317542946382, + "grad_norm": 0.22466863339121676, + "learning_rate": 2.4057146036438135e-06, + "loss": 0.0439, + "step": 4976 + }, + { + "epoch": 2.5908381051535656, + "grad_norm": 0.22312298115872314, + "learning_rate": 2.3997045307895954e-06, + "loss": 0.0432, + "step": 4977 + }, + { + "epoch": 2.5913586673607494, + "grad_norm": 0.23722999672753922, + "learning_rate": 2.3937015961604165e-06, + "loss": 0.0461, + "step": 4978 + }, + { + "epoch": 2.591879229567933, + "grad_norm": 0.22912082388275234, + "learning_rate": 2.3877058016522816e-06, + "loss": 0.0447, + "step": 4979 + }, + { + "epoch": 2.592399791775117, + "grad_norm": 0.2355711003414976, + "learning_rate": 2.381717149158935e-06, + "loss": 0.0458, + "step": 4980 + }, + { + "epoch": 2.5929203539823007, + "grad_norm": 0.2281375391556885, + "learning_rate": 2.3757356405718854e-06, + "loss": 0.0441, + "step": 4981 + }, + { + "epoch": 2.5934409161894845, + "grad_norm": 0.23163425813558036, + "learning_rate": 2.369761277780355e-06, + "loss": 0.0445, + "step": 4982 + }, + { + "epoch": 2.5939614783966682, + "grad_norm": 0.24549021694669057, + "learning_rate": 2.3637940626713346e-06, + "loss": 0.0471, + "step": 4983 + }, + { + "epoch": 2.594482040603852, + "grad_norm": 0.2406789780817914, + "learning_rate": 2.357833997129552e-06, + "loss": 0.0463, + "step": 4984 + }, + { + "epoch": 2.595002602811036, + "grad_norm": 0.23803338315683714, + "learning_rate": 2.3518810830374634e-06, + "loss": 0.0448, + "step": 4985 + }, + { + "epoch": 2.5955231650182196, + "grad_norm": 0.2315546330265174, + "learning_rate": 2.3459353222752835e-06, + "loss": 0.0471, + "step": 4986 + }, + { + "epoch": 2.5960437272254033, + "grad_norm": 0.21931566525318558, + "learning_rate": 2.3399967167209576e-06, + "loss": 0.0425, + "step": 4987 + }, + { + "epoch": 2.596564289432587, + "grad_norm": 0.22188942682547205, + "learning_rate": 2.3340652682501767e-06, + "loss": 0.0432, + "step": 4988 + }, + { + "epoch": 2.597084851639771, + "grad_norm": 0.2292900755538919, + "learning_rate": 2.328140978736365e-06, + "loss": 0.0432, + "step": 4989 + }, + { + "epoch": 2.5976054138469546, + "grad_norm": 0.23064657071914493, + "learning_rate": 2.3222238500506923e-06, + "loss": 0.0449, + "step": 4990 + }, + { + "epoch": 2.5981259760541384, + "grad_norm": 0.22546541475421025, + "learning_rate": 2.3163138840620546e-06, + "loss": 0.0439, + "step": 4991 + }, + { + "epoch": 2.598646538261322, + "grad_norm": 0.23350588400038483, + "learning_rate": 2.310411082637101e-06, + "loss": 0.0465, + "step": 4992 + }, + { + "epoch": 2.599167100468506, + "grad_norm": 0.2314847500853126, + "learning_rate": 2.3045154476402154e-06, + "loss": 0.0453, + "step": 4993 + }, + { + "epoch": 2.5996876626756897, + "grad_norm": 0.23217992293328432, + "learning_rate": 2.2986269809335022e-06, + "loss": 0.0446, + "step": 4994 + }, + { + "epoch": 2.6002082248828735, + "grad_norm": 0.21965356479125087, + "learning_rate": 2.2927456843768206e-06, + "loss": 0.0426, + "step": 4995 + }, + { + "epoch": 2.6007287870900573, + "grad_norm": 0.2350569114494813, + "learning_rate": 2.286871559827758e-06, + "loss": 0.0451, + "step": 4996 + }, + { + "epoch": 2.601249349297241, + "grad_norm": 0.24155775370205634, + "learning_rate": 2.2810046091416374e-06, + "loss": 0.0479, + "step": 4997 + }, + { + "epoch": 2.601769911504425, + "grad_norm": 0.21928794475027916, + "learning_rate": 2.2751448341715083e-06, + "loss": 0.0453, + "step": 4998 + }, + { + "epoch": 2.6022904737116086, + "grad_norm": 0.2478180725379701, + "learning_rate": 2.269292236768167e-06, + "loss": 0.0443, + "step": 4999 + }, + { + "epoch": 2.6028110359187924, + "grad_norm": 0.22140285555828385, + "learning_rate": 2.2634468187801296e-06, + "loss": 0.0439, + "step": 5000 + }, + { + "epoch": 2.603331598125976, + "grad_norm": 0.22451869562094484, + "learning_rate": 2.257608582053655e-06, + "loss": 0.0446, + "step": 5001 + }, + { + "epoch": 2.60385216033316, + "grad_norm": 0.23546031458995, + "learning_rate": 2.251777528432736e-06, + "loss": 0.0451, + "step": 5002 + }, + { + "epoch": 2.6043727225403437, + "grad_norm": 0.2324359389521522, + "learning_rate": 2.2459536597590786e-06, + "loss": 0.0452, + "step": 5003 + }, + { + "epoch": 2.6048932847475275, + "grad_norm": 0.2206269507203779, + "learning_rate": 2.240136977872137e-06, + "loss": 0.0432, + "step": 5004 + }, + { + "epoch": 2.6054138469547112, + "grad_norm": 0.22420136480404548, + "learning_rate": 2.2343274846090918e-06, + "loss": 0.0438, + "step": 5005 + }, + { + "epoch": 2.605934409161895, + "grad_norm": 0.22114323880099848, + "learning_rate": 2.228525181804855e-06, + "loss": 0.0447, + "step": 5006 + }, + { + "epoch": 2.6064549713690788, + "grad_norm": 0.22862454975827587, + "learning_rate": 2.2227300712920534e-06, + "loss": 0.0433, + "step": 5007 + }, + { + "epoch": 2.6069755335762625, + "grad_norm": 0.22833262874322383, + "learning_rate": 2.2169421549010636e-06, + "loss": 0.0444, + "step": 5008 + }, + { + "epoch": 2.6074960957834463, + "grad_norm": 0.23904644665049268, + "learning_rate": 2.2111614344599683e-06, + "loss": 0.0459, + "step": 5009 + }, + { + "epoch": 2.60801665799063, + "grad_norm": 0.2305455365922837, + "learning_rate": 2.205387911794593e-06, + "loss": 0.0453, + "step": 5010 + }, + { + "epoch": 2.608537220197814, + "grad_norm": 0.2328552560660912, + "learning_rate": 2.19962158872849e-06, + "loss": 0.0452, + "step": 5011 + }, + { + "epoch": 2.6090577824049976, + "grad_norm": 0.22388814260008266, + "learning_rate": 2.1938624670829218e-06, + "loss": 0.045, + "step": 5012 + }, + { + "epoch": 2.6095783446121814, + "grad_norm": 0.22661115206828292, + "learning_rate": 2.1881105486768945e-06, + "loss": 0.0447, + "step": 5013 + }, + { + "epoch": 2.610098906819365, + "grad_norm": 0.21667312091492352, + "learning_rate": 2.18236583532713e-06, + "loss": 0.0415, + "step": 5014 + }, + { + "epoch": 2.6106194690265485, + "grad_norm": 0.23591261253651902, + "learning_rate": 2.1766283288480793e-06, + "loss": 0.0442, + "step": 5015 + }, + { + "epoch": 2.6111400312337323, + "grad_norm": 0.2345912526719469, + "learning_rate": 2.1708980310519045e-06, + "loss": 0.0437, + "step": 5016 + }, + { + "epoch": 2.611660593440916, + "grad_norm": 0.2268571102942271, + "learning_rate": 2.165174943748513e-06, + "loss": 0.044, + "step": 5017 + }, + { + "epoch": 2.6121811556481, + "grad_norm": 0.22718616461448765, + "learning_rate": 2.159459068745512e-06, + "loss": 0.0449, + "step": 5018 + }, + { + "epoch": 2.6127017178552836, + "grad_norm": 0.22275521228810305, + "learning_rate": 2.1537504078482427e-06, + "loss": 0.0413, + "step": 5019 + }, + { + "epoch": 2.6132222800624674, + "grad_norm": 0.23325969759410764, + "learning_rate": 2.1480489628597723e-06, + "loss": 0.0457, + "step": 5020 + }, + { + "epoch": 2.613742842269651, + "grad_norm": 0.23677448755403346, + "learning_rate": 2.142354735580873e-06, + "loss": 0.0458, + "step": 5021 + }, + { + "epoch": 2.614263404476835, + "grad_norm": 0.2231689376626884, + "learning_rate": 2.1366677278100487e-06, + "loss": 0.0445, + "step": 5022 + }, + { + "epoch": 2.6147839666840187, + "grad_norm": 0.22687556800748557, + "learning_rate": 2.1309879413435292e-06, + "loss": 0.0447, + "step": 5023 + }, + { + "epoch": 2.6153045288912025, + "grad_norm": 0.23231196286407194, + "learning_rate": 2.125315377975251e-06, + "loss": 0.0447, + "step": 5024 + }, + { + "epoch": 2.6158250910983862, + "grad_norm": 0.23049253549431217, + "learning_rate": 2.1196500394968678e-06, + "loss": 0.0455, + "step": 5025 + }, + { + "epoch": 2.61634565330557, + "grad_norm": 0.22425972462104576, + "learning_rate": 2.113991927697767e-06, + "loss": 0.0449, + "step": 5026 + }, + { + "epoch": 2.616866215512754, + "grad_norm": 0.22468948485303378, + "learning_rate": 2.1083410443650365e-06, + "loss": 0.0444, + "step": 5027 + }, + { + "epoch": 2.6173867777199376, + "grad_norm": 0.2384716964753025, + "learning_rate": 2.102697391283487e-06, + "loss": 0.0461, + "step": 5028 + }, + { + "epoch": 2.6179073399271213, + "grad_norm": 0.2316329860082873, + "learning_rate": 2.0970609702356563e-06, + "loss": 0.045, + "step": 5029 + }, + { + "epoch": 2.618427902134305, + "grad_norm": 0.22706235586772133, + "learning_rate": 2.0914317830017764e-06, + "loss": 0.0442, + "step": 5030 + }, + { + "epoch": 2.618948464341489, + "grad_norm": 0.2283217603622717, + "learning_rate": 2.085809831359814e-06, + "loss": 0.0434, + "step": 5031 + }, + { + "epoch": 2.6194690265486726, + "grad_norm": 0.23412543514217077, + "learning_rate": 2.08019511708544e-06, + "loss": 0.044, + "step": 5032 + }, + { + "epoch": 2.6199895887558564, + "grad_norm": 0.23322446403477373, + "learning_rate": 2.0745876419520442e-06, + "loss": 0.0454, + "step": 5033 + }, + { + "epoch": 2.62051015096304, + "grad_norm": 0.2225537808805055, + "learning_rate": 2.0689874077307325e-06, + "loss": 0.0441, + "step": 5034 + }, + { + "epoch": 2.621030713170224, + "grad_norm": 0.22326275306638776, + "learning_rate": 2.0633944161903144e-06, + "loss": 0.0427, + "step": 5035 + }, + { + "epoch": 2.6215512753774077, + "grad_norm": 0.2242939602375942, + "learning_rate": 2.0578086690973135e-06, + "loss": 0.045, + "step": 5036 + }, + { + "epoch": 2.6220718375845915, + "grad_norm": 0.22728481687598673, + "learning_rate": 2.052230168215971e-06, + "loss": 0.0456, + "step": 5037 + }, + { + "epoch": 2.6225923997917753, + "grad_norm": 0.22262615082530046, + "learning_rate": 2.0466589153082416e-06, + "loss": 0.0433, + "step": 5038 + }, + { + "epoch": 2.6231129619989586, + "grad_norm": 0.22560826041711385, + "learning_rate": 2.041094912133784e-06, + "loss": 0.0446, + "step": 5039 + }, + { + "epoch": 2.6236335242061424, + "grad_norm": 0.23114517964831685, + "learning_rate": 2.035538160449968e-06, + "loss": 0.0466, + "step": 5040 + }, + { + "epoch": 2.624154086413326, + "grad_norm": 0.2420652950928089, + "learning_rate": 2.0299886620118722e-06, + "loss": 0.0458, + "step": 5041 + }, + { + "epoch": 2.62467464862051, + "grad_norm": 0.22783597835324096, + "learning_rate": 2.02444641857229e-06, + "loss": 0.0446, + "step": 5042 + }, + { + "epoch": 2.6251952108276937, + "grad_norm": 0.22160692042897737, + "learning_rate": 2.018911431881723e-06, + "loss": 0.0446, + "step": 5043 + }, + { + "epoch": 2.6257157730348775, + "grad_norm": 0.2322363544425751, + "learning_rate": 2.0133837036883734e-06, + "loss": 0.0451, + "step": 5044 + }, + { + "epoch": 2.6262363352420612, + "grad_norm": 0.22850250345514178, + "learning_rate": 2.007863235738153e-06, + "loss": 0.0452, + "step": 5045 + }, + { + "epoch": 2.626756897449245, + "grad_norm": 0.22005963804167653, + "learning_rate": 2.0023500297746828e-06, + "loss": 0.0439, + "step": 5046 + }, + { + "epoch": 2.627277459656429, + "grad_norm": 0.2215482537319163, + "learning_rate": 1.9968440875392902e-06, + "loss": 0.0429, + "step": 5047 + }, + { + "epoch": 2.6277980218636126, + "grad_norm": 0.22063378797132666, + "learning_rate": 1.9913454107710173e-06, + "loss": 0.0425, + "step": 5048 + }, + { + "epoch": 2.6283185840707963, + "grad_norm": 0.2300782059424934, + "learning_rate": 1.9858540012065886e-06, + "loss": 0.0443, + "step": 5049 + }, + { + "epoch": 2.62883914627798, + "grad_norm": 0.21421042163884607, + "learning_rate": 1.9803698605804497e-06, + "loss": 0.0419, + "step": 5050 + }, + { + "epoch": 2.629359708485164, + "grad_norm": 0.2273905924198931, + "learning_rate": 1.974892990624752e-06, + "loss": 0.0448, + "step": 5051 + }, + { + "epoch": 2.6298802706923476, + "grad_norm": 0.23000101131311979, + "learning_rate": 1.96942339306935e-06, + "loss": 0.0449, + "step": 5052 + }, + { + "epoch": 2.6304008328995314, + "grad_norm": 0.2306219483746907, + "learning_rate": 1.96396106964179e-06, + "loss": 0.0434, + "step": 5053 + }, + { + "epoch": 2.630921395106715, + "grad_norm": 0.2254745000498489, + "learning_rate": 1.9585060220673247e-06, + "loss": 0.0441, + "step": 5054 + }, + { + "epoch": 2.631441957313899, + "grad_norm": 0.23129142941993588, + "learning_rate": 1.953058252068915e-06, + "loss": 0.0448, + "step": 5055 + }, + { + "epoch": 2.6319625195210827, + "grad_norm": 0.22233175432556077, + "learning_rate": 1.9476177613672237e-06, + "loss": 0.0438, + "step": 5056 + }, + { + "epoch": 2.6324830817282665, + "grad_norm": 0.22444580299782396, + "learning_rate": 1.942184551680612e-06, + "loss": 0.0451, + "step": 5057 + }, + { + "epoch": 2.6330036439354503, + "grad_norm": 0.22412724223445896, + "learning_rate": 1.9367586247251323e-06, + "loss": 0.044, + "step": 5058 + }, + { + "epoch": 2.633524206142634, + "grad_norm": 0.24134427074612705, + "learning_rate": 1.93133998221455e-06, + "loss": 0.0462, + "step": 5059 + }, + { + "epoch": 2.634044768349818, + "grad_norm": 0.23306486452552985, + "learning_rate": 1.9259286258603263e-06, + "loss": 0.0448, + "step": 5060 + }, + { + "epoch": 2.6345653305570016, + "grad_norm": 0.2357629035963088, + "learning_rate": 1.9205245573716197e-06, + "loss": 0.0464, + "step": 5061 + }, + { + "epoch": 2.6350858927641854, + "grad_norm": 0.22558172778330046, + "learning_rate": 1.9151277784552864e-06, + "loss": 0.0442, + "step": 5062 + }, + { + "epoch": 2.635606454971369, + "grad_norm": 0.23769331834375482, + "learning_rate": 1.9097382908158713e-06, + "loss": 0.0448, + "step": 5063 + }, + { + "epoch": 2.636127017178553, + "grad_norm": 0.22352280580432798, + "learning_rate": 1.9043560961556323e-06, + "loss": 0.0448, + "step": 5064 + }, + { + "epoch": 2.6366475793857367, + "grad_norm": 0.24155333261345938, + "learning_rate": 1.898981196174518e-06, + "loss": 0.0456, + "step": 5065 + }, + { + "epoch": 2.6371681415929205, + "grad_norm": 0.2153187541406319, + "learning_rate": 1.8936135925701732e-06, + "loss": 0.0428, + "step": 5066 + }, + { + "epoch": 2.6376887038001042, + "grad_norm": 0.23310151041795749, + "learning_rate": 1.8882532870379331e-06, + "loss": 0.0447, + "step": 5067 + }, + { + "epoch": 2.638209266007288, + "grad_norm": 0.23186210725957584, + "learning_rate": 1.8829002812708302e-06, + "loss": 0.0456, + "step": 5068 + }, + { + "epoch": 2.6387298282144718, + "grad_norm": 0.24386297462518527, + "learning_rate": 1.8775545769595975e-06, + "loss": 0.0454, + "step": 5069 + }, + { + "epoch": 2.6392503904216555, + "grad_norm": 0.21918598958276325, + "learning_rate": 1.8722161757926597e-06, + "loss": 0.0433, + "step": 5070 + }, + { + "epoch": 2.6397709526288393, + "grad_norm": 0.22662770705374358, + "learning_rate": 1.866885079456121e-06, + "loss": 0.0403, + "step": 5071 + }, + { + "epoch": 2.640291514836023, + "grad_norm": 0.22853002659541766, + "learning_rate": 1.8615612896338036e-06, + "loss": 0.0438, + "step": 5072 + }, + { + "epoch": 2.640812077043207, + "grad_norm": 0.22941524162811514, + "learning_rate": 1.856244808007196e-06, + "loss": 0.044, + "step": 5073 + }, + { + "epoch": 2.6413326392503906, + "grad_norm": 0.22276184054208367, + "learning_rate": 1.8509356362554963e-06, + "loss": 0.0451, + "step": 5074 + }, + { + "epoch": 2.6418532014575744, + "grad_norm": 0.22604762542260493, + "learning_rate": 1.8456337760555915e-06, + "loss": 0.0447, + "step": 5075 + }, + { + "epoch": 2.642373763664758, + "grad_norm": 0.23677583990143997, + "learning_rate": 1.840339229082047e-06, + "loss": 0.0455, + "step": 5076 + }, + { + "epoch": 2.642894325871942, + "grad_norm": 0.2286482680320122, + "learning_rate": 1.8350519970071312e-06, + "loss": 0.0442, + "step": 5077 + }, + { + "epoch": 2.6434148880791257, + "grad_norm": 0.22736556778997194, + "learning_rate": 1.8297720815007969e-06, + "loss": 0.0453, + "step": 5078 + }, + { + "epoch": 2.643935450286309, + "grad_norm": 0.22259072643024455, + "learning_rate": 1.824499484230696e-06, + "loss": 0.0435, + "step": 5079 + }, + { + "epoch": 2.644456012493493, + "grad_norm": 0.2199119162247651, + "learning_rate": 1.819234206862147e-06, + "loss": 0.0417, + "step": 5080 + }, + { + "epoch": 2.6449765747006766, + "grad_norm": 0.24083225482663506, + "learning_rate": 1.8139762510581804e-06, + "loss": 0.0451, + "step": 5081 + }, + { + "epoch": 2.6454971369078604, + "grad_norm": 0.23292508889547486, + "learning_rate": 1.8087256184794953e-06, + "loss": 0.0463, + "step": 5082 + }, + { + "epoch": 2.646017699115044, + "grad_norm": 0.2300747118454459, + "learning_rate": 1.8034823107844878e-06, + "loss": 0.0445, + "step": 5083 + }, + { + "epoch": 2.646538261322228, + "grad_norm": 0.2363535678951791, + "learning_rate": 1.798246329629244e-06, + "loss": 0.0431, + "step": 5084 + }, + { + "epoch": 2.6470588235294117, + "grad_norm": 0.22966034657954063, + "learning_rate": 1.793017676667519e-06, + "loss": 0.0437, + "step": 5085 + }, + { + "epoch": 2.6475793857365955, + "grad_norm": 0.22141333009963216, + "learning_rate": 1.7877963535507748e-06, + "loss": 0.044, + "step": 5086 + }, + { + "epoch": 2.6480999479437792, + "grad_norm": 0.22871432021572957, + "learning_rate": 1.7825823619281452e-06, + "loss": 0.0427, + "step": 5087 + }, + { + "epoch": 2.648620510150963, + "grad_norm": 0.22928264134273937, + "learning_rate": 1.7773757034464545e-06, + "loss": 0.0458, + "step": 5088 + }, + { + "epoch": 2.6491410723581468, + "grad_norm": 0.22874179067788555, + "learning_rate": 1.7721763797501984e-06, + "loss": 0.0448, + "step": 5089 + }, + { + "epoch": 2.6496616345653305, + "grad_norm": 0.23127911334565235, + "learning_rate": 1.766984392481577e-06, + "loss": 0.0439, + "step": 5090 + }, + { + "epoch": 2.6501821967725143, + "grad_norm": 0.22687967610681098, + "learning_rate": 1.761799743280454e-06, + "loss": 0.0447, + "step": 5091 + }, + { + "epoch": 2.650702758979698, + "grad_norm": 0.21699331621599646, + "learning_rate": 1.75662243378438e-06, + "loss": 0.0419, + "step": 5092 + }, + { + "epoch": 2.651223321186882, + "grad_norm": 0.23305173793665826, + "learning_rate": 1.751452465628603e-06, + "loss": 0.0435, + "step": 5093 + }, + { + "epoch": 2.6517438833940656, + "grad_norm": 0.230107160982093, + "learning_rate": 1.7462898404460275e-06, + "loss": 0.041, + "step": 5094 + }, + { + "epoch": 2.6522644456012494, + "grad_norm": 0.2298327980073945, + "learning_rate": 1.7411345598672523e-06, + "loss": 0.044, + "step": 5095 + }, + { + "epoch": 2.652785007808433, + "grad_norm": 0.2309553511497464, + "learning_rate": 1.7359866255205609e-06, + "loss": 0.0447, + "step": 5096 + }, + { + "epoch": 2.653305570015617, + "grad_norm": 0.23228524615867757, + "learning_rate": 1.7308460390319025e-06, + "loss": 0.0443, + "step": 5097 + }, + { + "epoch": 2.6538261322228007, + "grad_norm": 0.2323659311301738, + "learning_rate": 1.7257128020249258e-06, + "loss": 0.0464, + "step": 5098 + }, + { + "epoch": 2.6543466944299845, + "grad_norm": 0.23831555902873636, + "learning_rate": 1.7205869161209365e-06, + "loss": 0.0465, + "step": 5099 + }, + { + "epoch": 2.6548672566371683, + "grad_norm": 0.22428319592649987, + "learning_rate": 1.7154683829389284e-06, + "loss": 0.0421, + "step": 5100 + }, + { + "epoch": 2.655387818844352, + "grad_norm": 0.23529402262221027, + "learning_rate": 1.7103572040955696e-06, + "loss": 0.0442, + "step": 5101 + }, + { + "epoch": 2.655908381051536, + "grad_norm": 0.23011207860973742, + "learning_rate": 1.7052533812052157e-06, + "loss": 0.0448, + "step": 5102 + }, + { + "epoch": 2.656428943258719, + "grad_norm": 0.22964408679947323, + "learning_rate": 1.7001569158798914e-06, + "loss": 0.0439, + "step": 5103 + }, + { + "epoch": 2.656949505465903, + "grad_norm": 0.22620381387212382, + "learning_rate": 1.6950678097292893e-06, + "loss": 0.0432, + "step": 5104 + }, + { + "epoch": 2.6574700676730867, + "grad_norm": 0.23347058886316926, + "learning_rate": 1.6899860643607907e-06, + "loss": 0.0458, + "step": 5105 + }, + { + "epoch": 2.6579906298802705, + "grad_norm": 0.2229703409152849, + "learning_rate": 1.6849116813794503e-06, + "loss": 0.0436, + "step": 5106 + }, + { + "epoch": 2.6585111920874542, + "grad_norm": 0.22996346900113476, + "learning_rate": 1.6798446623879915e-06, + "loss": 0.0448, + "step": 5107 + }, + { + "epoch": 2.659031754294638, + "grad_norm": 0.22933075899722521, + "learning_rate": 1.6747850089868178e-06, + "loss": 0.0454, + "step": 5108 + }, + { + "epoch": 2.659552316501822, + "grad_norm": 0.22374222520186948, + "learning_rate": 1.669732722773995e-06, + "loss": 0.045, + "step": 5109 + }, + { + "epoch": 2.6600728787090056, + "grad_norm": 0.23066068965417216, + "learning_rate": 1.6646878053452776e-06, + "loss": 0.0431, + "step": 5110 + }, + { + "epoch": 2.6605934409161893, + "grad_norm": 0.224015399002532, + "learning_rate": 1.659650258294082e-06, + "loss": 0.0422, + "step": 5111 + }, + { + "epoch": 2.661114003123373, + "grad_norm": 0.23582456384176945, + "learning_rate": 1.6546200832115028e-06, + "loss": 0.0447, + "step": 5112 + }, + { + "epoch": 2.661634565330557, + "grad_norm": 0.2261905982992855, + "learning_rate": 1.649597281686302e-06, + "loss": 0.0414, + "step": 5113 + }, + { + "epoch": 2.6621551275377406, + "grad_norm": 0.23148944353074696, + "learning_rate": 1.644581855304911e-06, + "loss": 0.0451, + "step": 5114 + }, + { + "epoch": 2.6626756897449244, + "grad_norm": 0.22331597314340787, + "learning_rate": 1.639573805651437e-06, + "loss": 0.0419, + "step": 5115 + }, + { + "epoch": 2.663196251952108, + "grad_norm": 0.22397006275035072, + "learning_rate": 1.6345731343076626e-06, + "loss": 0.0437, + "step": 5116 + }, + { + "epoch": 2.663716814159292, + "grad_norm": 0.2324005116144098, + "learning_rate": 1.629579842853024e-06, + "loss": 0.0457, + "step": 5117 + }, + { + "epoch": 2.6642373763664757, + "grad_norm": 0.22967664976854812, + "learning_rate": 1.624593932864632e-06, + "loss": 0.0445, + "step": 5118 + }, + { + "epoch": 2.6647579385736595, + "grad_norm": 0.23099217882260017, + "learning_rate": 1.6196154059172742e-06, + "loss": 0.0437, + "step": 5119 + }, + { + "epoch": 2.6652785007808433, + "grad_norm": 0.23529689019600636, + "learning_rate": 1.6146442635834008e-06, + "loss": 0.0452, + "step": 5120 + }, + { + "epoch": 2.665799062988027, + "grad_norm": 0.24041486589271255, + "learning_rate": 1.6096805074331338e-06, + "loss": 0.0459, + "step": 5121 + }, + { + "epoch": 2.666319625195211, + "grad_norm": 0.2268769288085849, + "learning_rate": 1.6047241390342498e-06, + "loss": 0.0449, + "step": 5122 + }, + { + "epoch": 2.6668401874023946, + "grad_norm": 0.22397701381379848, + "learning_rate": 1.599775159952205e-06, + "loss": 0.0448, + "step": 5123 + }, + { + "epoch": 2.6673607496095784, + "grad_norm": 0.2243649170475879, + "learning_rate": 1.5948335717501179e-06, + "loss": 0.0422, + "step": 5124 + }, + { + "epoch": 2.667881311816762, + "grad_norm": 0.23886065101491008, + "learning_rate": 1.5898993759887765e-06, + "loss": 0.0452, + "step": 5125 + }, + { + "epoch": 2.668401874023946, + "grad_norm": 0.22837358130534788, + "learning_rate": 1.5849725742266231e-06, + "loss": 0.043, + "step": 5126 + }, + { + "epoch": 2.6689224362311297, + "grad_norm": 0.22398559556865122, + "learning_rate": 1.5800531680197683e-06, + "loss": 0.0443, + "step": 5127 + }, + { + "epoch": 2.6694429984383135, + "grad_norm": 0.22326877672343834, + "learning_rate": 1.5751411589219945e-06, + "loss": 0.0415, + "step": 5128 + }, + { + "epoch": 2.6699635606454972, + "grad_norm": 0.24686809704433404, + "learning_rate": 1.570236548484741e-06, + "loss": 0.0448, + "step": 5129 + }, + { + "epoch": 2.670484122852681, + "grad_norm": 0.22496284749102272, + "learning_rate": 1.5653393382571158e-06, + "loss": 0.0436, + "step": 5130 + }, + { + "epoch": 2.6710046850598648, + "grad_norm": 0.2216536409582885, + "learning_rate": 1.560449529785879e-06, + "loss": 0.0427, + "step": 5131 + }, + { + "epoch": 2.6715252472670485, + "grad_norm": 0.23039373528584386, + "learning_rate": 1.5555671246154647e-06, + "loss": 0.0456, + "step": 5132 + }, + { + "epoch": 2.6720458094742323, + "grad_norm": 0.2267257512361847, + "learning_rate": 1.5506921242879612e-06, + "loss": 0.0438, + "step": 5133 + }, + { + "epoch": 2.672566371681416, + "grad_norm": 0.22909660719838928, + "learning_rate": 1.5458245303431262e-06, + "loss": 0.0442, + "step": 5134 + }, + { + "epoch": 2.6730869338886, + "grad_norm": 0.23207465894660437, + "learning_rate": 1.5409643443183658e-06, + "loss": 0.0452, + "step": 5135 + }, + { + "epoch": 2.6736074960957836, + "grad_norm": 0.23464400344667644, + "learning_rate": 1.5361115677487548e-06, + "loss": 0.0452, + "step": 5136 + }, + { + "epoch": 2.6741280583029674, + "grad_norm": 0.23955054324523992, + "learning_rate": 1.5312662021670227e-06, + "loss": 0.0451, + "step": 5137 + }, + { + "epoch": 2.674648620510151, + "grad_norm": 0.22781012646604124, + "learning_rate": 1.5264282491035676e-06, + "loss": 0.0427, + "step": 5138 + }, + { + "epoch": 2.675169182717335, + "grad_norm": 0.22466051885015176, + "learning_rate": 1.5215977100864392e-06, + "loss": 0.0442, + "step": 5139 + }, + { + "epoch": 2.6756897449245187, + "grad_norm": 0.223780756891236, + "learning_rate": 1.5167745866413424e-06, + "loss": 0.0442, + "step": 5140 + }, + { + "epoch": 2.6762103071317025, + "grad_norm": 0.21773883268442826, + "learning_rate": 1.5119588802916445e-06, + "loss": 0.0426, + "step": 5141 + }, + { + "epoch": 2.6767308693388863, + "grad_norm": 0.23066656195993457, + "learning_rate": 1.5071505925583735e-06, + "loss": 0.044, + "step": 5142 + }, + { + "epoch": 2.6772514315460696, + "grad_norm": 0.2226523056001122, + "learning_rate": 1.5023497249602086e-06, + "loss": 0.0438, + "step": 5143 + }, + { + "epoch": 2.6777719937532534, + "grad_norm": 0.2188974751683572, + "learning_rate": 1.49755627901349e-06, + "loss": 0.0435, + "step": 5144 + }, + { + "epoch": 2.678292555960437, + "grad_norm": 0.2229966329377455, + "learning_rate": 1.4927702562322037e-06, + "loss": 0.0428, + "step": 5145 + }, + { + "epoch": 2.678813118167621, + "grad_norm": 0.24359513195681573, + "learning_rate": 1.4879916581280045e-06, + "loss": 0.0463, + "step": 5146 + }, + { + "epoch": 2.6793336803748047, + "grad_norm": 0.23261019586712137, + "learning_rate": 1.4832204862101906e-06, + "loss": 0.0429, + "step": 5147 + }, + { + "epoch": 2.6798542425819885, + "grad_norm": 0.2157186684270539, + "learning_rate": 1.4784567419857314e-06, + "loss": 0.0425, + "step": 5148 + }, + { + "epoch": 2.6803748047891722, + "grad_norm": 0.23508661356548638, + "learning_rate": 1.4737004269592236e-06, + "loss": 0.045, + "step": 5149 + }, + { + "epoch": 2.680895366996356, + "grad_norm": 0.22272454558258475, + "learning_rate": 1.468951542632943e-06, + "loss": 0.0437, + "step": 5150 + }, + { + "epoch": 2.6814159292035398, + "grad_norm": 0.22708962558308424, + "learning_rate": 1.4642100905068068e-06, + "loss": 0.0451, + "step": 5151 + }, + { + "epoch": 2.6819364914107235, + "grad_norm": 0.22570865365739282, + "learning_rate": 1.4594760720783863e-06, + "loss": 0.0426, + "step": 5152 + }, + { + "epoch": 2.6824570536179073, + "grad_norm": 0.22957967759748524, + "learning_rate": 1.4547494888429074e-06, + "loss": 0.0432, + "step": 5153 + }, + { + "epoch": 2.682977615825091, + "grad_norm": 0.22887509990760044, + "learning_rate": 1.4500303422932348e-06, + "loss": 0.0434, + "step": 5154 + }, + { + "epoch": 2.683498178032275, + "grad_norm": 0.23098550242907323, + "learning_rate": 1.4453186339199037e-06, + "loss": 0.0452, + "step": 5155 + }, + { + "epoch": 2.6840187402394586, + "grad_norm": 0.22547319635189877, + "learning_rate": 1.4406143652110875e-06, + "loss": 0.0423, + "step": 5156 + }, + { + "epoch": 2.6845393024466424, + "grad_norm": 0.236534240012221, + "learning_rate": 1.4359175376526174e-06, + "loss": 0.0469, + "step": 5157 + }, + { + "epoch": 2.685059864653826, + "grad_norm": 0.22335001771494034, + "learning_rate": 1.431228152727962e-06, + "loss": 0.0441, + "step": 5158 + }, + { + "epoch": 2.68558042686101, + "grad_norm": 0.22432122506376032, + "learning_rate": 1.4265462119182532e-06, + "loss": 0.0421, + "step": 5159 + }, + { + "epoch": 2.6861009890681937, + "grad_norm": 0.22882569581384093, + "learning_rate": 1.4218717167022638e-06, + "loss": 0.0432, + "step": 5160 + }, + { + "epoch": 2.6866215512753775, + "grad_norm": 0.23103308439566217, + "learning_rate": 1.4172046685564212e-06, + "loss": 0.0447, + "step": 5161 + }, + { + "epoch": 2.6871421134825613, + "grad_norm": 0.22503412132865003, + "learning_rate": 1.412545068954796e-06, + "loss": 0.0435, + "step": 5162 + }, + { + "epoch": 2.687662675689745, + "grad_norm": 0.2228402807168184, + "learning_rate": 1.4078929193690998e-06, + "loss": 0.0426, + "step": 5163 + }, + { + "epoch": 2.688183237896929, + "grad_norm": 0.2287986065956918, + "learning_rate": 1.4032482212686993e-06, + "loss": 0.0434, + "step": 5164 + }, + { + "epoch": 2.6887038001041126, + "grad_norm": 0.2320753787979263, + "learning_rate": 1.3986109761206095e-06, + "loss": 0.0441, + "step": 5165 + }, + { + "epoch": 2.6892243623112964, + "grad_norm": 0.23129496956085294, + "learning_rate": 1.3939811853894896e-06, + "loss": 0.0456, + "step": 5166 + }, + { + "epoch": 2.6897449245184797, + "grad_norm": 0.2345935034216518, + "learning_rate": 1.389358850537642e-06, + "loss": 0.0448, + "step": 5167 + }, + { + "epoch": 2.6902654867256635, + "grad_norm": 0.23234893177636318, + "learning_rate": 1.38474397302501e-06, + "loss": 0.0442, + "step": 5168 + }, + { + "epoch": 2.6907860489328472, + "grad_norm": 0.23634107034074991, + "learning_rate": 1.3801365543091916e-06, + "loss": 0.0458, + "step": 5169 + }, + { + "epoch": 2.691306611140031, + "grad_norm": 0.23696395645252785, + "learning_rate": 1.3755365958454254e-06, + "loss": 0.0451, + "step": 5170 + }, + { + "epoch": 2.6918271733472148, + "grad_norm": 0.22469488655336936, + "learning_rate": 1.3709440990865908e-06, + "loss": 0.0447, + "step": 5171 + }, + { + "epoch": 2.6923477355543985, + "grad_norm": 0.23085933725904473, + "learning_rate": 1.366359065483211e-06, + "loss": 0.0446, + "step": 5172 + }, + { + "epoch": 2.6928682977615823, + "grad_norm": 0.23659193340498502, + "learning_rate": 1.3617814964834523e-06, + "loss": 0.0455, + "step": 5173 + }, + { + "epoch": 2.693388859968766, + "grad_norm": 0.2189403460488524, + "learning_rate": 1.3572113935331226e-06, + "loss": 0.044, + "step": 5174 + }, + { + "epoch": 2.69390942217595, + "grad_norm": 0.2247914833766047, + "learning_rate": 1.3526487580756752e-06, + "loss": 0.0441, + "step": 5175 + }, + { + "epoch": 2.6944299843831336, + "grad_norm": 0.21893245384497728, + "learning_rate": 1.3480935915522075e-06, + "loss": 0.043, + "step": 5176 + }, + { + "epoch": 2.6949505465903174, + "grad_norm": 0.23444838887341896, + "learning_rate": 1.3435458954014463e-06, + "loss": 0.0453, + "step": 5177 + }, + { + "epoch": 2.695471108797501, + "grad_norm": 0.23107549495251808, + "learning_rate": 1.3390056710597649e-06, + "loss": 0.0442, + "step": 5178 + }, + { + "epoch": 2.695991671004685, + "grad_norm": 0.22611578165673227, + "learning_rate": 1.3344729199611827e-06, + "loss": 0.0458, + "step": 5179 + }, + { + "epoch": 2.6965122332118687, + "grad_norm": 0.22290496338439902, + "learning_rate": 1.3299476435373548e-06, + "loss": 0.0429, + "step": 5180 + }, + { + "epoch": 2.6970327954190525, + "grad_norm": 0.21442995257650893, + "learning_rate": 1.3254298432175682e-06, + "loss": 0.0424, + "step": 5181 + }, + { + "epoch": 2.6975533576262363, + "grad_norm": 0.21876017392018013, + "learning_rate": 1.320919520428754e-06, + "loss": 0.0432, + "step": 5182 + }, + { + "epoch": 2.69807391983342, + "grad_norm": 0.22822179337752255, + "learning_rate": 1.3164166765954861e-06, + "loss": 0.0454, + "step": 5183 + }, + { + "epoch": 2.698594482040604, + "grad_norm": 0.23403568093314686, + "learning_rate": 1.3119213131399688e-06, + "loss": 0.0468, + "step": 5184 + }, + { + "epoch": 2.6991150442477876, + "grad_norm": 0.22524839908609418, + "learning_rate": 1.3074334314820551e-06, + "loss": 0.0447, + "step": 5185 + }, + { + "epoch": 2.6996356064549714, + "grad_norm": 0.22270469667567147, + "learning_rate": 1.3029530330392143e-06, + "loss": 0.0442, + "step": 5186 + }, + { + "epoch": 2.700156168662155, + "grad_norm": 0.23012017240768629, + "learning_rate": 1.2984801192265749e-06, + "loss": 0.0439, + "step": 5187 + }, + { + "epoch": 2.700676730869339, + "grad_norm": 0.21642941461801396, + "learning_rate": 1.2940146914568852e-06, + "loss": 0.0438, + "step": 5188 + }, + { + "epoch": 2.7011972930765227, + "grad_norm": 0.22328673189570775, + "learning_rate": 1.2895567511405414e-06, + "loss": 0.0439, + "step": 5189 + }, + { + "epoch": 2.7017178552837064, + "grad_norm": 0.23823980764075436, + "learning_rate": 1.285106299685565e-06, + "loss": 0.0457, + "step": 5190 + }, + { + "epoch": 2.70223841749089, + "grad_norm": 0.22391174202738537, + "learning_rate": 1.280663338497609e-06, + "loss": 0.0448, + "step": 5191 + }, + { + "epoch": 2.702758979698074, + "grad_norm": 0.22413624516419742, + "learning_rate": 1.276227868979976e-06, + "loss": 0.0422, + "step": 5192 + }, + { + "epoch": 2.7032795419052578, + "grad_norm": 0.219580469635659, + "learning_rate": 1.2717998925335927e-06, + "loss": 0.0425, + "step": 5193 + }, + { + "epoch": 2.7038001041124415, + "grad_norm": 0.23393120071768486, + "learning_rate": 1.267379410557018e-06, + "loss": 0.0463, + "step": 5194 + }, + { + "epoch": 2.7043206663196253, + "grad_norm": 0.22651080633930457, + "learning_rate": 1.2629664244464463e-06, + "loss": 0.0446, + "step": 5195 + }, + { + "epoch": 2.704841228526809, + "grad_norm": 0.2345516458739699, + "learning_rate": 1.258560935595704e-06, + "loss": 0.0457, + "step": 5196 + }, + { + "epoch": 2.705361790733993, + "grad_norm": 0.21826699278019435, + "learning_rate": 1.2541629453962479e-06, + "loss": 0.0427, + "step": 5197 + }, + { + "epoch": 2.7058823529411766, + "grad_norm": 0.21948973250772175, + "learning_rate": 1.2497724552371747e-06, + "loss": 0.0428, + "step": 5198 + }, + { + "epoch": 2.7064029151483604, + "grad_norm": 0.23291910481811462, + "learning_rate": 1.2453894665052008e-06, + "loss": 0.0445, + "step": 5199 + }, + { + "epoch": 2.706923477355544, + "grad_norm": 0.21755848524659785, + "learning_rate": 1.2410139805846738e-06, + "loss": 0.0423, + "step": 5200 + }, + { + "epoch": 2.707444039562728, + "grad_norm": 0.2319847511448208, + "learning_rate": 1.2366459988575774e-06, + "loss": 0.0438, + "step": 5201 + }, + { + "epoch": 2.7079646017699117, + "grad_norm": 0.2303536354287684, + "learning_rate": 1.2322855227035301e-06, + "loss": 0.0439, + "step": 5202 + }, + { + "epoch": 2.7084851639770955, + "grad_norm": 0.23065828693762852, + "learning_rate": 1.2279325534997693e-06, + "loss": 0.0446, + "step": 5203 + }, + { + "epoch": 2.7090057261842793, + "grad_norm": 0.24141647025860344, + "learning_rate": 1.2235870926211619e-06, + "loss": 0.0457, + "step": 5204 + }, + { + "epoch": 2.709526288391463, + "grad_norm": 0.2350329888138472, + "learning_rate": 1.2192491414402096e-06, + "loss": 0.044, + "step": 5205 + }, + { + "epoch": 2.710046850598647, + "grad_norm": 0.231757509837116, + "learning_rate": 1.2149187013270392e-06, + "loss": 0.045, + "step": 5206 + }, + { + "epoch": 2.71056741280583, + "grad_norm": 0.2317717587228352, + "learning_rate": 1.2105957736494089e-06, + "loss": 0.0435, + "step": 5207 + }, + { + "epoch": 2.711087975013014, + "grad_norm": 0.22055288340775916, + "learning_rate": 1.2062803597726963e-06, + "loss": 0.0411, + "step": 5208 + }, + { + "epoch": 2.7116085372201977, + "grad_norm": 0.2268087132163455, + "learning_rate": 1.2019724610599081e-06, + "loss": 0.0436, + "step": 5209 + }, + { + "epoch": 2.7121290994273815, + "grad_norm": 0.2303811370184185, + "learning_rate": 1.197672078871681e-06, + "loss": 0.044, + "step": 5210 + }, + { + "epoch": 2.7126496616345652, + "grad_norm": 0.23071049199969476, + "learning_rate": 1.193379214566276e-06, + "loss": 0.0446, + "step": 5211 + }, + { + "epoch": 2.713170223841749, + "grad_norm": 0.22811091103571507, + "learning_rate": 1.1890938694995829e-06, + "loss": 0.0432, + "step": 5212 + }, + { + "epoch": 2.7136907860489328, + "grad_norm": 0.2379584189281919, + "learning_rate": 1.1848160450251083e-06, + "loss": 0.0463, + "step": 5213 + }, + { + "epoch": 2.7142113482561165, + "grad_norm": 0.22764918185318186, + "learning_rate": 1.180545742493988e-06, + "loss": 0.0442, + "step": 5214 + }, + { + "epoch": 2.7147319104633003, + "grad_norm": 0.2254826094616572, + "learning_rate": 1.1762829632549849e-06, + "loss": 0.044, + "step": 5215 + }, + { + "epoch": 2.715252472670484, + "grad_norm": 0.22681141939496943, + "learning_rate": 1.1720277086544857e-06, + "loss": 0.0433, + "step": 5216 + }, + { + "epoch": 2.715773034877668, + "grad_norm": 0.21876872488915144, + "learning_rate": 1.1677799800364958e-06, + "loss": 0.0429, + "step": 5217 + }, + { + "epoch": 2.7162935970848516, + "grad_norm": 0.22378659997771677, + "learning_rate": 1.1635397787426366e-06, + "loss": 0.0438, + "step": 5218 + }, + { + "epoch": 2.7168141592920354, + "grad_norm": 0.2265493115245886, + "learning_rate": 1.159307106112173e-06, + "loss": 0.0442, + "step": 5219 + }, + { + "epoch": 2.717334721499219, + "grad_norm": 0.2298762644808635, + "learning_rate": 1.1550819634819743e-06, + "loss": 0.0443, + "step": 5220 + }, + { + "epoch": 2.717855283706403, + "grad_norm": 0.22228468158301753, + "learning_rate": 1.1508643521865397e-06, + "loss": 0.0429, + "step": 5221 + }, + { + "epoch": 2.7183758459135867, + "grad_norm": 0.23151020928005936, + "learning_rate": 1.1466542735579844e-06, + "loss": 0.0427, + "step": 5222 + }, + { + "epoch": 2.7188964081207705, + "grad_norm": 0.2214334447059201, + "learning_rate": 1.14245172892605e-06, + "loss": 0.0433, + "step": 5223 + }, + { + "epoch": 2.7194169703279543, + "grad_norm": 0.2283368759330757, + "learning_rate": 1.1382567196180916e-06, + "loss": 0.0437, + "step": 5224 + }, + { + "epoch": 2.719937532535138, + "grad_norm": 0.21320513712505007, + "learning_rate": 1.1340692469590964e-06, + "loss": 0.0422, + "step": 5225 + }, + { + "epoch": 2.720458094742322, + "grad_norm": 0.21709414283045186, + "learning_rate": 1.1298893122716563e-06, + "loss": 0.0438, + "step": 5226 + }, + { + "epoch": 2.7209786569495056, + "grad_norm": 0.2274072043634905, + "learning_rate": 1.125716916875988e-06, + "loss": 0.0433, + "step": 5227 + }, + { + "epoch": 2.7214992191566894, + "grad_norm": 0.2275711914621425, + "learning_rate": 1.1215520620899311e-06, + "loss": 0.0439, + "step": 5228 + }, + { + "epoch": 2.722019781363873, + "grad_norm": 0.22221898121836894, + "learning_rate": 1.1173947492289395e-06, + "loss": 0.0428, + "step": 5229 + }, + { + "epoch": 2.722540343571057, + "grad_norm": 0.21297199594248953, + "learning_rate": 1.1132449796060872e-06, + "loss": 0.0416, + "step": 5230 + }, + { + "epoch": 2.7230609057782402, + "grad_norm": 0.2255176774021255, + "learning_rate": 1.1091027545320654e-06, + "loss": 0.0426, + "step": 5231 + }, + { + "epoch": 2.723581467985424, + "grad_norm": 0.22910733482663895, + "learning_rate": 1.1049680753151798e-06, + "loss": 0.0449, + "step": 5232 + }, + { + "epoch": 2.7241020301926078, + "grad_norm": 0.22540358774719207, + "learning_rate": 1.1008409432613525e-06, + "loss": 0.0423, + "step": 5233 + }, + { + "epoch": 2.7246225923997915, + "grad_norm": 0.22668282262981723, + "learning_rate": 1.0967213596741327e-06, + "loss": 0.0417, + "step": 5234 + }, + { + "epoch": 2.7251431546069753, + "grad_norm": 0.22941299415233102, + "learning_rate": 1.0926093258546655e-06, + "loss": 0.0445, + "step": 5235 + }, + { + "epoch": 2.725663716814159, + "grad_norm": 0.2460091226274877, + "learning_rate": 1.0885048431017313e-06, + "loss": 0.0474, + "step": 5236 + }, + { + "epoch": 2.726184279021343, + "grad_norm": 0.22696800191590433, + "learning_rate": 1.0844079127117074e-06, + "loss": 0.0441, + "step": 5237 + }, + { + "epoch": 2.7267048412285266, + "grad_norm": 0.2279206422911676, + "learning_rate": 1.0803185359786028e-06, + "loss": 0.0435, + "step": 5238 + }, + { + "epoch": 2.7272254034357104, + "grad_norm": 0.2206185588040236, + "learning_rate": 1.0762367141940287e-06, + "loss": 0.0439, + "step": 5239 + }, + { + "epoch": 2.727745965642894, + "grad_norm": 0.22844650094437752, + "learning_rate": 1.0721624486472209e-06, + "loss": 0.0436, + "step": 5240 + }, + { + "epoch": 2.728266527850078, + "grad_norm": 0.22100974494576425, + "learning_rate": 1.0680957406250135e-06, + "loss": 0.0436, + "step": 5241 + }, + { + "epoch": 2.7287870900572617, + "grad_norm": 0.22075747241403315, + "learning_rate": 1.0640365914118682e-06, + "loss": 0.0424, + "step": 5242 + }, + { + "epoch": 2.7293076522644455, + "grad_norm": 0.230695539655403, + "learning_rate": 1.0599850022898539e-06, + "loss": 0.043, + "step": 5243 + }, + { + "epoch": 2.7298282144716293, + "grad_norm": 0.23168258489285376, + "learning_rate": 1.055940974538641e-06, + "loss": 0.0431, + "step": 5244 + }, + { + "epoch": 2.730348776678813, + "grad_norm": 0.22697802722231222, + "learning_rate": 1.0519045094355363e-06, + "loss": 0.0432, + "step": 5245 + }, + { + "epoch": 2.730869338885997, + "grad_norm": 0.2308978573693367, + "learning_rate": 1.0478756082554304e-06, + "loss": 0.0441, + "step": 5246 + }, + { + "epoch": 2.7313899010931806, + "grad_norm": 0.2304130228518842, + "learning_rate": 1.0438542722708445e-06, + "loss": 0.0427, + "step": 5247 + }, + { + "epoch": 2.7319104633003644, + "grad_norm": 0.2282468794565119, + "learning_rate": 1.0398405027519016e-06, + "loss": 0.0424, + "step": 5248 + }, + { + "epoch": 2.732431025507548, + "grad_norm": 0.21865000074849475, + "learning_rate": 1.0358343009663428e-06, + "loss": 0.0422, + "step": 5249 + }, + { + "epoch": 2.732951587714732, + "grad_norm": 0.21986811718537888, + "learning_rate": 1.0318356681795039e-06, + "loss": 0.0409, + "step": 5250 + }, + { + "epoch": 2.7334721499219157, + "grad_norm": 0.22870095194546886, + "learning_rate": 1.0278446056543406e-06, + "loss": 0.0434, + "step": 5251 + }, + { + "epoch": 2.7339927121290994, + "grad_norm": 0.22636557592510134, + "learning_rate": 1.0238611146514253e-06, + "loss": 0.0429, + "step": 5252 + }, + { + "epoch": 2.734513274336283, + "grad_norm": 0.21850314176206617, + "learning_rate": 1.0198851964289185e-06, + "loss": 0.0431, + "step": 5253 + }, + { + "epoch": 2.735033836543467, + "grad_norm": 0.2186586118194875, + "learning_rate": 1.01591685224261e-06, + "loss": 0.0418, + "step": 5254 + }, + { + "epoch": 2.7355543987506508, + "grad_norm": 0.24147015358727544, + "learning_rate": 1.0119560833458775e-06, + "loss": 0.0444, + "step": 5255 + }, + { + "epoch": 2.7360749609578345, + "grad_norm": 0.2315952637276368, + "learning_rate": 1.0080028909897233e-06, + "loss": 0.0463, + "step": 5256 + }, + { + "epoch": 2.7365955231650183, + "grad_norm": 0.2200895690176689, + "learning_rate": 1.0040572764227458e-06, + "loss": 0.0432, + "step": 5257 + }, + { + "epoch": 2.737116085372202, + "grad_norm": 0.22812841777700557, + "learning_rate": 1.0001192408911592e-06, + "loss": 0.0442, + "step": 5258 + }, + { + "epoch": 2.737636647579386, + "grad_norm": 0.2224052677158258, + "learning_rate": 9.961887856387714e-07, + "loss": 0.0426, + "step": 5259 + }, + { + "epoch": 2.7381572097865696, + "grad_norm": 0.22285780659280513, + "learning_rate": 9.92265911907006e-07, + "loss": 0.0429, + "step": 5260 + }, + { + "epoch": 2.7386777719937534, + "grad_norm": 0.22292964049968192, + "learning_rate": 9.883506209348914e-07, + "loss": 0.0415, + "step": 5261 + }, + { + "epoch": 2.739198334200937, + "grad_norm": 0.2253363805591684, + "learning_rate": 9.84442913959055e-07, + "loss": 0.0445, + "step": 5262 + }, + { + "epoch": 2.739718896408121, + "grad_norm": 0.22914905603731295, + "learning_rate": 9.805427922137373e-07, + "loss": 0.0436, + "step": 5263 + }, + { + "epoch": 2.7402394586153047, + "grad_norm": 0.22423612517145042, + "learning_rate": 9.766502569307722e-07, + "loss": 0.0443, + "step": 5264 + }, + { + "epoch": 2.7407600208224885, + "grad_norm": 0.22723159502834736, + "learning_rate": 9.727653093396044e-07, + "loss": 0.0447, + "step": 5265 + }, + { + "epoch": 2.7412805830296723, + "grad_norm": 0.22822872201192684, + "learning_rate": 9.688879506672854e-07, + "loss": 0.043, + "step": 5266 + }, + { + "epoch": 2.741801145236856, + "grad_norm": 0.21393631920738038, + "learning_rate": 9.650181821384636e-07, + "loss": 0.0412, + "step": 5267 + }, + { + "epoch": 2.74232170744404, + "grad_norm": 0.22094569188225746, + "learning_rate": 9.611560049753915e-07, + "loss": 0.0434, + "step": 5268 + }, + { + "epoch": 2.7428422696512236, + "grad_norm": 0.22305606790036275, + "learning_rate": 9.573014203979242e-07, + "loss": 0.0425, + "step": 5269 + }, + { + "epoch": 2.7433628318584073, + "grad_norm": 0.22902027508759243, + "learning_rate": 9.534544296235181e-07, + "loss": 0.0428, + "step": 5270 + }, + { + "epoch": 2.7438833940655907, + "grad_norm": 0.23242670385926198, + "learning_rate": 9.49615033867235e-07, + "loss": 0.0446, + "step": 5271 + }, + { + "epoch": 2.7444039562727744, + "grad_norm": 0.2347141380798654, + "learning_rate": 9.45783234341735e-07, + "loss": 0.0425, + "step": 5272 + }, + { + "epoch": 2.744924518479958, + "grad_norm": 0.22332579684782305, + "learning_rate": 9.419590322572725e-07, + "loss": 0.0443, + "step": 5273 + }, + { + "epoch": 2.745445080687142, + "grad_norm": 0.22189445785006218, + "learning_rate": 9.381424288217117e-07, + "loss": 0.0423, + "step": 5274 + }, + { + "epoch": 2.7459656428943258, + "grad_norm": 0.2261260353066565, + "learning_rate": 9.343334252405133e-07, + "loss": 0.0434, + "step": 5275 + }, + { + "epoch": 2.7464862051015095, + "grad_norm": 0.22674984875383666, + "learning_rate": 9.30532022716743e-07, + "loss": 0.0434, + "step": 5276 + }, + { + "epoch": 2.7470067673086933, + "grad_norm": 0.22247669966218622, + "learning_rate": 9.26738222451054e-07, + "loss": 0.0431, + "step": 5277 + }, + { + "epoch": 2.747527329515877, + "grad_norm": 0.22557924637748528, + "learning_rate": 9.229520256417073e-07, + "loss": 0.0456, + "step": 5278 + }, + { + "epoch": 2.748047891723061, + "grad_norm": 0.22992144412583332, + "learning_rate": 9.191734334845603e-07, + "loss": 0.0452, + "step": 5279 + }, + { + "epoch": 2.7485684539302446, + "grad_norm": 0.22300684919494487, + "learning_rate": 9.154024471730721e-07, + "loss": 0.0438, + "step": 5280 + }, + { + "epoch": 2.7490890161374284, + "grad_norm": 0.22245433446016094, + "learning_rate": 9.1163906789829e-07, + "loss": 0.042, + "step": 5281 + }, + { + "epoch": 2.749609578344612, + "grad_norm": 0.22206770415357324, + "learning_rate": 9.078832968488632e-07, + "loss": 0.0434, + "step": 5282 + }, + { + "epoch": 2.750130140551796, + "grad_norm": 0.23376405716262028, + "learning_rate": 9.041351352110427e-07, + "loss": 0.0431, + "step": 5283 + }, + { + "epoch": 2.7506507027589797, + "grad_norm": 0.22966704496277124, + "learning_rate": 9.003945841686707e-07, + "loss": 0.0434, + "step": 5284 + }, + { + "epoch": 2.7511712649661635, + "grad_norm": 0.22838593078199046, + "learning_rate": 8.966616449031906e-07, + "loss": 0.0422, + "step": 5285 + }, + { + "epoch": 2.7516918271733473, + "grad_norm": 0.22176204273902209, + "learning_rate": 8.929363185936346e-07, + "loss": 0.0443, + "step": 5286 + }, + { + "epoch": 2.752212389380531, + "grad_norm": 0.22581365238705586, + "learning_rate": 8.892186064166335e-07, + "loss": 0.0441, + "step": 5287 + }, + { + "epoch": 2.752732951587715, + "grad_norm": 0.2244870202646647, + "learning_rate": 8.855085095464149e-07, + "loss": 0.0429, + "step": 5288 + }, + { + "epoch": 2.7532535137948986, + "grad_norm": 0.22803770453270514, + "learning_rate": 8.818060291548053e-07, + "loss": 0.0436, + "step": 5289 + }, + { + "epoch": 2.7537740760020823, + "grad_norm": 0.2250742592276933, + "learning_rate": 8.781111664112162e-07, + "loss": 0.042, + "step": 5290 + }, + { + "epoch": 2.754294638209266, + "grad_norm": 0.2253420328172488, + "learning_rate": 8.744239224826534e-07, + "loss": 0.0419, + "step": 5291 + }, + { + "epoch": 2.75481520041645, + "grad_norm": 0.2264536838481127, + "learning_rate": 8.707442985337239e-07, + "loss": 0.0441, + "step": 5292 + }, + { + "epoch": 2.7553357626236337, + "grad_norm": 0.2202483024468351, + "learning_rate": 8.670722957266231e-07, + "loss": 0.0443, + "step": 5293 + }, + { + "epoch": 2.7558563248308174, + "grad_norm": 0.2303490443562307, + "learning_rate": 8.634079152211427e-07, + "loss": 0.0429, + "step": 5294 + }, + { + "epoch": 2.7563768870380008, + "grad_norm": 0.22221701362424096, + "learning_rate": 8.597511581746626e-07, + "loss": 0.0433, + "step": 5295 + }, + { + "epoch": 2.7568974492451845, + "grad_norm": 0.23951455762675466, + "learning_rate": 8.56102025742156e-07, + "loss": 0.0447, + "step": 5296 + }, + { + "epoch": 2.7574180114523683, + "grad_norm": 0.2374125127215411, + "learning_rate": 8.524605190761897e-07, + "loss": 0.0451, + "step": 5297 + }, + { + "epoch": 2.757938573659552, + "grad_norm": 0.23192693160940323, + "learning_rate": 8.488266393269245e-07, + "loss": 0.0432, + "step": 5298 + }, + { + "epoch": 2.758459135866736, + "grad_norm": 0.2342718094829394, + "learning_rate": 8.452003876421033e-07, + "loss": 0.0437, + "step": 5299 + }, + { + "epoch": 2.7589796980739196, + "grad_norm": 0.2250652941295221, + "learning_rate": 8.415817651670654e-07, + "loss": 0.0437, + "step": 5300 + }, + { + "epoch": 2.7595002602811034, + "grad_norm": 0.22979466231086934, + "learning_rate": 8.379707730447439e-07, + "loss": 0.0437, + "step": 5301 + }, + { + "epoch": 2.760020822488287, + "grad_norm": 0.23361883586709467, + "learning_rate": 8.343674124156542e-07, + "loss": 0.0443, + "step": 5302 + }, + { + "epoch": 2.760541384695471, + "grad_norm": 0.22631184004097923, + "learning_rate": 8.307716844179081e-07, + "loss": 0.0416, + "step": 5303 + }, + { + "epoch": 2.7610619469026547, + "grad_norm": 0.23184232037276983, + "learning_rate": 8.271835901872055e-07, + "loss": 0.0439, + "step": 5304 + }, + { + "epoch": 2.7615825091098385, + "grad_norm": 0.23202421193452336, + "learning_rate": 8.236031308568287e-07, + "loss": 0.0431, + "step": 5305 + }, + { + "epoch": 2.7621030713170223, + "grad_norm": 0.22241507265189012, + "learning_rate": 8.200303075576565e-07, + "loss": 0.0425, + "step": 5306 + }, + { + "epoch": 2.762623633524206, + "grad_norm": 0.22697877013798187, + "learning_rate": 8.164651214181556e-07, + "loss": 0.044, + "step": 5307 + }, + { + "epoch": 2.76314419573139, + "grad_norm": 0.2247465473110427, + "learning_rate": 8.129075735643698e-07, + "loss": 0.0429, + "step": 5308 + }, + { + "epoch": 2.7636647579385736, + "grad_norm": 0.22274117285283673, + "learning_rate": 8.093576651199447e-07, + "loss": 0.0425, + "step": 5309 + }, + { + "epoch": 2.7641853201457574, + "grad_norm": 0.22736433771242023, + "learning_rate": 8.058153972061027e-07, + "loss": 0.0415, + "step": 5310 + }, + { + "epoch": 2.764705882352941, + "grad_norm": 0.22964935642008533, + "learning_rate": 8.022807709416575e-07, + "loss": 0.0433, + "step": 5311 + }, + { + "epoch": 2.765226444560125, + "grad_norm": 0.23653451017732308, + "learning_rate": 7.987537874430101e-07, + "loss": 0.0447, + "step": 5312 + }, + { + "epoch": 2.7657470067673087, + "grad_norm": 0.22232202569716866, + "learning_rate": 7.952344478241503e-07, + "loss": 0.042, + "step": 5313 + }, + { + "epoch": 2.7662675689744924, + "grad_norm": 0.22865924001277593, + "learning_rate": 7.917227531966387e-07, + "loss": 0.0443, + "step": 5314 + }, + { + "epoch": 2.766788131181676, + "grad_norm": 0.2199762097546008, + "learning_rate": 7.88218704669641e-07, + "loss": 0.0416, + "step": 5315 + }, + { + "epoch": 2.76730869338886, + "grad_norm": 0.22907960763658383, + "learning_rate": 7.847223033498968e-07, + "loss": 0.0443, + "step": 5316 + }, + { + "epoch": 2.7678292555960438, + "grad_norm": 0.2224849881577553, + "learning_rate": 7.812335503417284e-07, + "loss": 0.0434, + "step": 5317 + }, + { + "epoch": 2.7683498178032275, + "grad_norm": 0.2229443345455532, + "learning_rate": 7.777524467470515e-07, + "loss": 0.0423, + "step": 5318 + }, + { + "epoch": 2.7688703800104113, + "grad_norm": 0.22816646258582196, + "learning_rate": 7.742789936653561e-07, + "loss": 0.0429, + "step": 5319 + }, + { + "epoch": 2.769390942217595, + "grad_norm": 0.22839763781886158, + "learning_rate": 7.708131921937229e-07, + "loss": 0.0441, + "step": 5320 + }, + { + "epoch": 2.769911504424779, + "grad_norm": 0.22727070637222285, + "learning_rate": 7.673550434268123e-07, + "loss": 0.045, + "step": 5321 + }, + { + "epoch": 2.7704320666319626, + "grad_norm": 0.22633561873190844, + "learning_rate": 7.639045484568702e-07, + "loss": 0.045, + "step": 5322 + }, + { + "epoch": 2.7709526288391464, + "grad_norm": 0.22765311723057183, + "learning_rate": 7.60461708373722e-07, + "loss": 0.044, + "step": 5323 + }, + { + "epoch": 2.77147319104633, + "grad_norm": 0.2272959845769333, + "learning_rate": 7.570265242647784e-07, + "loss": 0.0432, + "step": 5324 + }, + { + "epoch": 2.771993753253514, + "grad_norm": 0.2181349991575582, + "learning_rate": 7.535989972150298e-07, + "loss": 0.0426, + "step": 5325 + }, + { + "epoch": 2.7725143154606977, + "grad_norm": 0.21996140541009873, + "learning_rate": 7.501791283070436e-07, + "loss": 0.0431, + "step": 5326 + }, + { + "epoch": 2.7730348776678815, + "grad_norm": 0.22240120810935823, + "learning_rate": 7.467669186209836e-07, + "loss": 0.0426, + "step": 5327 + }, + { + "epoch": 2.7735554398750653, + "grad_norm": 0.23526222986344317, + "learning_rate": 7.433623692345765e-07, + "loss": 0.0428, + "step": 5328 + }, + { + "epoch": 2.774076002082249, + "grad_norm": 0.23391764152544686, + "learning_rate": 7.399654812231399e-07, + "loss": 0.0441, + "step": 5329 + }, + { + "epoch": 2.774596564289433, + "grad_norm": 0.23704308426871976, + "learning_rate": 7.365762556595685e-07, + "loss": 0.0442, + "step": 5330 + }, + { + "epoch": 2.7751171264966166, + "grad_norm": 0.22376362085484436, + "learning_rate": 7.331946936143392e-07, + "loss": 0.0456, + "step": 5331 + }, + { + "epoch": 2.7756376887038003, + "grad_norm": 0.2265634568261286, + "learning_rate": 7.298207961555031e-07, + "loss": 0.0443, + "step": 5332 + }, + { + "epoch": 2.776158250910984, + "grad_norm": 0.23042926713303066, + "learning_rate": 7.264545643486997e-07, + "loss": 0.0429, + "step": 5333 + }, + { + "epoch": 2.776678813118168, + "grad_norm": 0.2213224298794227, + "learning_rate": 7.230959992571368e-07, + "loss": 0.0421, + "step": 5334 + }, + { + "epoch": 2.777199375325351, + "grad_norm": 0.2210558855748437, + "learning_rate": 7.197451019416073e-07, + "loss": 0.0425, + "step": 5335 + }, + { + "epoch": 2.777719937532535, + "grad_norm": 0.2258634646023793, + "learning_rate": 7.164018734604816e-07, + "loss": 0.0437, + "step": 5336 + }, + { + "epoch": 2.7782404997397188, + "grad_norm": 0.23229199016420934, + "learning_rate": 7.130663148697037e-07, + "loss": 0.0464, + "step": 5337 + }, + { + "epoch": 2.7787610619469025, + "grad_norm": 0.22326405991841328, + "learning_rate": 7.097384272228003e-07, + "loss": 0.043, + "step": 5338 + }, + { + "epoch": 2.7792816241540863, + "grad_norm": 0.23391749933423464, + "learning_rate": 7.064182115708723e-07, + "loss": 0.0445, + "step": 5339 + }, + { + "epoch": 2.77980218636127, + "grad_norm": 0.23115860660431092, + "learning_rate": 7.031056689626031e-07, + "loss": 0.0437, + "step": 5340 + }, + { + "epoch": 2.780322748568454, + "grad_norm": 0.22704336146442725, + "learning_rate": 6.998008004442391e-07, + "loss": 0.0457, + "step": 5341 + }, + { + "epoch": 2.7808433107756376, + "grad_norm": 0.2226141103882042, + "learning_rate": 6.965036070596175e-07, + "loss": 0.0437, + "step": 5342 + }, + { + "epoch": 2.7813638729828214, + "grad_norm": 0.22972774937475848, + "learning_rate": 6.932140898501471e-07, + "loss": 0.0437, + "step": 5343 + }, + { + "epoch": 2.781884435190005, + "grad_norm": 0.22763949291942495, + "learning_rate": 6.899322498548022e-07, + "loss": 0.0444, + "step": 5344 + }, + { + "epoch": 2.782404997397189, + "grad_norm": 0.2269364277590439, + "learning_rate": 6.866580881101508e-07, + "loss": 0.0455, + "step": 5345 + }, + { + "epoch": 2.7829255596043727, + "grad_norm": 0.22415876770946747, + "learning_rate": 6.833916056503187e-07, + "loss": 0.0413, + "step": 5346 + }, + { + "epoch": 2.7834461218115565, + "grad_norm": 0.2169182386149355, + "learning_rate": 6.801328035070137e-07, + "loss": 0.0411, + "step": 5347 + }, + { + "epoch": 2.7839666840187403, + "grad_norm": 0.22859639804739612, + "learning_rate": 6.768816827095182e-07, + "loss": 0.0439, + "step": 5348 + }, + { + "epoch": 2.784487246225924, + "grad_norm": 0.2252301895314262, + "learning_rate": 6.736382442846911e-07, + "loss": 0.0435, + "step": 5349 + }, + { + "epoch": 2.785007808433108, + "grad_norm": 0.23199340249555175, + "learning_rate": 6.70402489256955e-07, + "loss": 0.0433, + "step": 5350 + }, + { + "epoch": 2.7855283706402916, + "grad_norm": 0.2217393408623345, + "learning_rate": 6.671744186483143e-07, + "loss": 0.0427, + "step": 5351 + }, + { + "epoch": 2.7860489328474753, + "grad_norm": 0.22705878796369014, + "learning_rate": 6.639540334783478e-07, + "loss": 0.0436, + "step": 5352 + }, + { + "epoch": 2.786569495054659, + "grad_norm": 0.22046258969292104, + "learning_rate": 6.60741334764195e-07, + "loss": 0.0408, + "step": 5353 + }, + { + "epoch": 2.787090057261843, + "grad_norm": 0.22592912598051434, + "learning_rate": 6.575363235205856e-07, + "loss": 0.0434, + "step": 5354 + }, + { + "epoch": 2.7876106194690267, + "grad_norm": 0.22293933544483777, + "learning_rate": 6.543390007598016e-07, + "loss": 0.0433, + "step": 5355 + }, + { + "epoch": 2.7881311816762104, + "grad_norm": 0.2200500939054284, + "learning_rate": 6.511493674917102e-07, + "loss": 0.0418, + "step": 5356 + }, + { + "epoch": 2.788651743883394, + "grad_norm": 0.22913113334418797, + "learning_rate": 6.479674247237472e-07, + "loss": 0.0443, + "step": 5357 + }, + { + "epoch": 2.789172306090578, + "grad_norm": 0.23960510693606646, + "learning_rate": 6.447931734609197e-07, + "loss": 0.0434, + "step": 5358 + }, + { + "epoch": 2.7896928682977613, + "grad_norm": 0.23198268720084506, + "learning_rate": 6.416266147058009e-07, + "loss": 0.0436, + "step": 5359 + }, + { + "epoch": 2.790213430504945, + "grad_norm": 0.23155582454440707, + "learning_rate": 6.38467749458535e-07, + "loss": 0.0445, + "step": 5360 + }, + { + "epoch": 2.790733992712129, + "grad_norm": 0.2298256419187941, + "learning_rate": 6.353165787168464e-07, + "loss": 0.0437, + "step": 5361 + }, + { + "epoch": 2.7912545549193126, + "grad_norm": 0.2254102181052845, + "learning_rate": 6.321731034760164e-07, + "loss": 0.0433, + "step": 5362 + }, + { + "epoch": 2.7917751171264964, + "grad_norm": 0.23436390964542603, + "learning_rate": 6.290373247289011e-07, + "loss": 0.0458, + "step": 5363 + }, + { + "epoch": 2.79229567933368, + "grad_norm": 0.21766623446970718, + "learning_rate": 6.259092434659247e-07, + "loss": 0.0416, + "step": 5364 + }, + { + "epoch": 2.792816241540864, + "grad_norm": 0.21981768528318218, + "learning_rate": 6.227888606750803e-07, + "loss": 0.042, + "step": 5365 + }, + { + "epoch": 2.7933368037480477, + "grad_norm": 0.23061777624720375, + "learning_rate": 6.196761773419324e-07, + "loss": 0.0445, + "step": 5366 + }, + { + "epoch": 2.7938573659552315, + "grad_norm": 0.21991675403724478, + "learning_rate": 6.165711944496083e-07, + "loss": 0.0409, + "step": 5367 + }, + { + "epoch": 2.7943779281624153, + "grad_norm": 0.23345443481213357, + "learning_rate": 6.134739129788125e-07, + "loss": 0.0458, + "step": 5368 + }, + { + "epoch": 2.794898490369599, + "grad_norm": 0.23028111148067149, + "learning_rate": 6.103843339078014e-07, + "loss": 0.0436, + "step": 5369 + }, + { + "epoch": 2.795419052576783, + "grad_norm": 0.2302827035468124, + "learning_rate": 6.073024582124165e-07, + "loss": 0.0435, + "step": 5370 + }, + { + "epoch": 2.7959396147839666, + "grad_norm": 0.23249897658678184, + "learning_rate": 6.042282868660515e-07, + "loss": 0.0458, + "step": 5371 + }, + { + "epoch": 2.7964601769911503, + "grad_norm": 0.22510823189817172, + "learning_rate": 6.011618208396768e-07, + "loss": 0.0426, + "step": 5372 + }, + { + "epoch": 2.796980739198334, + "grad_norm": 0.22638808421418086, + "learning_rate": 5.981030611018234e-07, + "loss": 0.042, + "step": 5373 + }, + { + "epoch": 2.797501301405518, + "grad_norm": 0.22413378380890733, + "learning_rate": 5.950520086185878e-07, + "loss": 0.0424, + "step": 5374 + }, + { + "epoch": 2.7980218636127017, + "grad_norm": 0.2430762884956928, + "learning_rate": 5.920086643536354e-07, + "loss": 0.047, + "step": 5375 + }, + { + "epoch": 2.7985424258198854, + "grad_norm": 0.232168583757956, + "learning_rate": 5.889730292681972e-07, + "loss": 0.0438, + "step": 5376 + }, + { + "epoch": 2.799062988027069, + "grad_norm": 0.24007634677429804, + "learning_rate": 5.859451043210701e-07, + "loss": 0.0442, + "step": 5377 + }, + { + "epoch": 2.799583550234253, + "grad_norm": 0.22672307210185036, + "learning_rate": 5.829248904686085e-07, + "loss": 0.0433, + "step": 5378 + }, + { + "epoch": 2.8001041124414368, + "grad_norm": 0.2346606956607048, + "learning_rate": 5.799123886647439e-07, + "loss": 0.0451, + "step": 5379 + }, + { + "epoch": 2.8006246746486205, + "grad_norm": 0.22725576849779167, + "learning_rate": 5.769075998609569e-07, + "loss": 0.0429, + "step": 5380 + }, + { + "epoch": 2.8011452368558043, + "grad_norm": 0.23830547232482183, + "learning_rate": 5.73910525006302e-07, + "loss": 0.0442, + "step": 5381 + }, + { + "epoch": 2.801665799062988, + "grad_norm": 0.22960806546029328, + "learning_rate": 5.709211650473972e-07, + "loss": 0.0425, + "step": 5382 + }, + { + "epoch": 2.802186361270172, + "grad_norm": 0.2201304972527062, + "learning_rate": 5.679395209284178e-07, + "loss": 0.0426, + "step": 5383 + }, + { + "epoch": 2.8027069234773556, + "grad_norm": 0.22778314103337913, + "learning_rate": 5.649655935911075e-07, + "loss": 0.0435, + "step": 5384 + }, + { + "epoch": 2.8032274856845394, + "grad_norm": 0.22560465254179524, + "learning_rate": 5.619993839747733e-07, + "loss": 0.0432, + "step": 5385 + }, + { + "epoch": 2.803748047891723, + "grad_norm": 0.23421440146211206, + "learning_rate": 5.590408930162799e-07, + "loss": 0.0435, + "step": 5386 + }, + { + "epoch": 2.804268610098907, + "grad_norm": 0.21559388239552502, + "learning_rate": 5.560901216500575e-07, + "loss": 0.0411, + "step": 5387 + }, + { + "epoch": 2.8047891723060907, + "grad_norm": 0.22569321419302593, + "learning_rate": 5.531470708080965e-07, + "loss": 0.0428, + "step": 5388 + }, + { + "epoch": 2.8053097345132745, + "grad_norm": 0.2256230654004025, + "learning_rate": 5.502117414199481e-07, + "loss": 0.0432, + "step": 5389 + }, + { + "epoch": 2.8058302967204583, + "grad_norm": 0.23309729929515796, + "learning_rate": 5.472841344127261e-07, + "loss": 0.043, + "step": 5390 + }, + { + "epoch": 2.806350858927642, + "grad_norm": 0.22323967104644515, + "learning_rate": 5.443642507111074e-07, + "loss": 0.0429, + "step": 5391 + }, + { + "epoch": 2.806871421134826, + "grad_norm": 0.2200278159127524, + "learning_rate": 5.414520912373239e-07, + "loss": 0.0415, + "step": 5392 + }, + { + "epoch": 2.8073919833420096, + "grad_norm": 0.22357673987141546, + "learning_rate": 5.38547656911173e-07, + "loss": 0.0434, + "step": 5393 + }, + { + "epoch": 2.8079125455491933, + "grad_norm": 0.23721653321770933, + "learning_rate": 5.3565094865001e-07, + "loss": 0.0436, + "step": 5394 + }, + { + "epoch": 2.808433107756377, + "grad_norm": 0.23374671104516043, + "learning_rate": 5.327619673687528e-07, + "loss": 0.0442, + "step": 5395 + }, + { + "epoch": 2.808953669963561, + "grad_norm": 0.2243628561143931, + "learning_rate": 5.298807139798689e-07, + "loss": 0.0434, + "step": 5396 + }, + { + "epoch": 2.8094742321707447, + "grad_norm": 0.21338440786570576, + "learning_rate": 5.270071893934026e-07, + "loss": 0.0422, + "step": 5397 + }, + { + "epoch": 2.8099947943779284, + "grad_norm": 0.21726078260576132, + "learning_rate": 5.24141394516936e-07, + "loss": 0.0425, + "step": 5398 + }, + { + "epoch": 2.8105153565851118, + "grad_norm": 0.2357056904995546, + "learning_rate": 5.212833302556258e-07, + "loss": 0.0441, + "step": 5399 + }, + { + "epoch": 2.8110359187922955, + "grad_norm": 0.2218114034723653, + "learning_rate": 5.184329975121832e-07, + "loss": 0.0414, + "step": 5400 + }, + { + "epoch": 2.8115564809994793, + "grad_norm": 0.21777410706096698, + "learning_rate": 5.155903971868742e-07, + "loss": 0.0425, + "step": 5401 + }, + { + "epoch": 2.812077043206663, + "grad_norm": 0.21863538909227456, + "learning_rate": 5.127555301775223e-07, + "loss": 0.0411, + "step": 5402 + }, + { + "epoch": 2.812597605413847, + "grad_norm": 0.2272082792608195, + "learning_rate": 5.099283973795111e-07, + "loss": 0.0438, + "step": 5403 + }, + { + "epoch": 2.8131181676210306, + "grad_norm": 0.21938578677958842, + "learning_rate": 5.071089996857848e-07, + "loss": 0.0422, + "step": 5404 + }, + { + "epoch": 2.8136387298282144, + "grad_norm": 0.22905873049766085, + "learning_rate": 5.042973379868365e-07, + "loss": 0.0422, + "step": 5405 + }, + { + "epoch": 2.814159292035398, + "grad_norm": 0.22649704219116712, + "learning_rate": 5.014934131707199e-07, + "loss": 0.0435, + "step": 5406 + }, + { + "epoch": 2.814679854242582, + "grad_norm": 0.22467215809657948, + "learning_rate": 4.98697226123046e-07, + "loss": 0.0418, + "step": 5407 + }, + { + "epoch": 2.8152004164497657, + "grad_norm": 0.22783791469459005, + "learning_rate": 4.959087777269805e-07, + "loss": 0.0446, + "step": 5408 + }, + { + "epoch": 2.8157209786569495, + "grad_norm": 0.2214050794833035, + "learning_rate": 4.931280688632467e-07, + "loss": 0.0408, + "step": 5409 + }, + { + "epoch": 2.8162415408641333, + "grad_norm": 0.22088031305505856, + "learning_rate": 4.90355100410117e-07, + "loss": 0.0416, + "step": 5410 + }, + { + "epoch": 2.816762103071317, + "grad_norm": 0.22653460185601557, + "learning_rate": 4.875898732434298e-07, + "loss": 0.0427, + "step": 5411 + }, + { + "epoch": 2.817282665278501, + "grad_norm": 0.21289858185605157, + "learning_rate": 4.848323882365668e-07, + "loss": 0.0413, + "step": 5412 + }, + { + "epoch": 2.8178032274856846, + "grad_norm": 0.2309828690362825, + "learning_rate": 4.820826462604788e-07, + "loss": 0.0434, + "step": 5413 + }, + { + "epoch": 2.8183237896928683, + "grad_norm": 0.2374990881444587, + "learning_rate": 4.793406481836515e-07, + "loss": 0.0427, + "step": 5414 + }, + { + "epoch": 2.818844351900052, + "grad_norm": 0.23637721721606647, + "learning_rate": 4.7660639487214496e-07, + "loss": 0.0464, + "step": 5415 + }, + { + "epoch": 2.819364914107236, + "grad_norm": 0.22388241364899847, + "learning_rate": 4.738798871895572e-07, + "loss": 0.0434, + "step": 5416 + }, + { + "epoch": 2.8198854763144197, + "grad_norm": 0.2210864475178364, + "learning_rate": 4.7116112599704666e-07, + "loss": 0.0435, + "step": 5417 + }, + { + "epoch": 2.8204060385216034, + "grad_norm": 0.2259240497819733, + "learning_rate": 4.6845011215332914e-07, + "loss": 0.0427, + "step": 5418 + }, + { + "epoch": 2.820926600728787, + "grad_norm": 0.219204603117465, + "learning_rate": 4.6574684651466415e-07, + "loss": 0.0411, + "step": 5419 + }, + { + "epoch": 2.821447162935971, + "grad_norm": 0.21818043914111, + "learning_rate": 4.6305132993487155e-07, + "loss": 0.0413, + "step": 5420 + }, + { + "epoch": 2.8219677251431547, + "grad_norm": 0.21659493110106742, + "learning_rate": 4.6036356326532024e-07, + "loss": 0.0402, + "step": 5421 + }, + { + "epoch": 2.8224882873503385, + "grad_norm": 0.22715592904564377, + "learning_rate": 4.57683547354934e-07, + "loss": 0.0437, + "step": 5422 + }, + { + "epoch": 2.823008849557522, + "grad_norm": 0.23159567590291835, + "learning_rate": 4.5501128305018013e-07, + "loss": 0.0432, + "step": 5423 + }, + { + "epoch": 2.8235294117647056, + "grad_norm": 0.21733811243688106, + "learning_rate": 4.523467711950946e-07, + "loss": 0.0413, + "step": 5424 + }, + { + "epoch": 2.8240499739718894, + "grad_norm": 0.23788228121357813, + "learning_rate": 4.496900126312431e-07, + "loss": 0.0433, + "step": 5425 + }, + { + "epoch": 2.824570536179073, + "grad_norm": 0.22471884600489062, + "learning_rate": 4.4704100819776e-07, + "loss": 0.0426, + "step": 5426 + }, + { + "epoch": 2.825091098386257, + "grad_norm": 0.22556479297052626, + "learning_rate": 4.443997587313231e-07, + "loss": 0.0436, + "step": 5427 + }, + { + "epoch": 2.8256116605934407, + "grad_norm": 0.2204685863010288, + "learning_rate": 4.4176626506616245e-07, + "loss": 0.0426, + "step": 5428 + }, + { + "epoch": 2.8261322228006245, + "grad_norm": 0.2222480259682644, + "learning_rate": 4.391405280340544e-07, + "loss": 0.0425, + "step": 5429 + }, + { + "epoch": 2.8266527850078083, + "grad_norm": 0.223467726694392, + "learning_rate": 4.365225484643326e-07, + "loss": 0.0435, + "step": 5430 + }, + { + "epoch": 2.827173347214992, + "grad_norm": 0.22428205507604543, + "learning_rate": 4.339123271838746e-07, + "loss": 0.0419, + "step": 5431 + }, + { + "epoch": 2.827693909422176, + "grad_norm": 0.22403734441364156, + "learning_rate": 4.3130986501711547e-07, + "loss": 0.0418, + "step": 5432 + }, + { + "epoch": 2.8282144716293596, + "grad_norm": 0.2274253646113262, + "learning_rate": 4.2871516278602806e-07, + "loss": 0.0439, + "step": 5433 + }, + { + "epoch": 2.8287350338365433, + "grad_norm": 0.22023423609927847, + "learning_rate": 4.2612822131013754e-07, + "loss": 0.0431, + "step": 5434 + }, + { + "epoch": 2.829255596043727, + "grad_norm": 0.23148682456065023, + "learning_rate": 4.235490414065263e-07, + "loss": 0.0442, + "step": 5435 + }, + { + "epoch": 2.829776158250911, + "grad_norm": 0.21570556980256503, + "learning_rate": 4.2097762388981775e-07, + "loss": 0.0404, + "step": 5436 + }, + { + "epoch": 2.8302967204580947, + "grad_norm": 0.2261675056659339, + "learning_rate": 4.1841396957218446e-07, + "loss": 0.0434, + "step": 5437 + }, + { + "epoch": 2.8308172826652784, + "grad_norm": 0.23651027031881064, + "learning_rate": 4.158580792633482e-07, + "loss": 0.0459, + "step": 5438 + }, + { + "epoch": 2.831337844872462, + "grad_norm": 0.22013855746990973, + "learning_rate": 4.1330995377057703e-07, + "loss": 0.0426, + "step": 5439 + }, + { + "epoch": 2.831858407079646, + "grad_norm": 0.21970369162169295, + "learning_rate": 4.107695938986883e-07, + "loss": 0.0412, + "step": 5440 + }, + { + "epoch": 2.8323789692868298, + "grad_norm": 0.23453520708614153, + "learning_rate": 4.0823700045004854e-07, + "loss": 0.0449, + "step": 5441 + }, + { + "epoch": 2.8328995314940135, + "grad_norm": 0.22928185978871454, + "learning_rate": 4.057121742245651e-07, + "loss": 0.0428, + "step": 5442 + }, + { + "epoch": 2.8334200937011973, + "grad_norm": 0.2378862033367092, + "learning_rate": 4.031951160196945e-07, + "loss": 0.0438, + "step": 5443 + }, + { + "epoch": 2.833940655908381, + "grad_norm": 0.22721678410714027, + "learning_rate": 4.0068582663044527e-07, + "loss": 0.0452, + "step": 5444 + }, + { + "epoch": 2.834461218115565, + "grad_norm": 0.22411659172958168, + "learning_rate": 3.98184306849364e-07, + "loss": 0.0434, + "step": 5445 + }, + { + "epoch": 2.8349817803227486, + "grad_norm": 0.22502599203444407, + "learning_rate": 3.9569055746654927e-07, + "loss": 0.0441, + "step": 5446 + }, + { + "epoch": 2.8355023425299324, + "grad_norm": 0.22506720047885997, + "learning_rate": 3.9320457926964313e-07, + "loss": 0.0434, + "step": 5447 + }, + { + "epoch": 2.836022904737116, + "grad_norm": 0.2329833809778821, + "learning_rate": 3.9072637304383155e-07, + "loss": 0.0429, + "step": 5448 + }, + { + "epoch": 2.8365434669443, + "grad_norm": 0.22427732647548107, + "learning_rate": 3.882559395718466e-07, + "loss": 0.0416, + "step": 5449 + }, + { + "epoch": 2.8370640291514837, + "grad_norm": 0.22799511119215582, + "learning_rate": 3.857932796339697e-07, + "loss": 0.0434, + "step": 5450 + }, + { + "epoch": 2.8375845913586675, + "grad_norm": 0.22666653298135034, + "learning_rate": 3.833383940080232e-07, + "loss": 0.0431, + "step": 5451 + }, + { + "epoch": 2.8381051535658512, + "grad_norm": 0.22778792498675723, + "learning_rate": 3.808912834693701e-07, + "loss": 0.0424, + "step": 5452 + }, + { + "epoch": 2.838625715773035, + "grad_norm": 0.22353151517718023, + "learning_rate": 3.784519487909255e-07, + "loss": 0.0421, + "step": 5453 + }, + { + "epoch": 2.839146277980219, + "grad_norm": 0.22576570564471596, + "learning_rate": 3.7602039074314254e-07, + "loss": 0.0432, + "step": 5454 + }, + { + "epoch": 2.8396668401874026, + "grad_norm": 0.21923761704451078, + "learning_rate": 3.7359661009402356e-07, + "loss": 0.0408, + "step": 5455 + }, + { + "epoch": 2.8401874023945863, + "grad_norm": 0.22437305305796867, + "learning_rate": 3.71180607609109e-07, + "loss": 0.0436, + "step": 5456 + }, + { + "epoch": 2.84070796460177, + "grad_norm": 0.22396548451251694, + "learning_rate": 3.687723840514828e-07, + "loss": 0.043, + "step": 5457 + }, + { + "epoch": 2.841228526808954, + "grad_norm": 0.22949152227259195, + "learning_rate": 3.6637194018177556e-07, + "loss": 0.0431, + "step": 5458 + }, + { + "epoch": 2.8417490890161377, + "grad_norm": 0.23054445515120947, + "learning_rate": 3.63979276758164e-07, + "loss": 0.0438, + "step": 5459 + }, + { + "epoch": 2.8422696512233214, + "grad_norm": 0.23505182113223505, + "learning_rate": 3.6159439453635757e-07, + "loss": 0.0446, + "step": 5460 + }, + { + "epoch": 2.842790213430505, + "grad_norm": 0.2304779734911683, + "learning_rate": 3.5921729426961206e-07, + "loss": 0.044, + "step": 5461 + }, + { + "epoch": 2.843310775637689, + "grad_norm": 0.22385791465509375, + "learning_rate": 3.568479767087296e-07, + "loss": 0.0431, + "step": 5462 + }, + { + "epoch": 2.8438313378448723, + "grad_norm": 0.2264655835605014, + "learning_rate": 3.544864426020478e-07, + "loss": 0.0442, + "step": 5463 + }, + { + "epoch": 2.844351900052056, + "grad_norm": 0.22304778819677265, + "learning_rate": 3.521326926954532e-07, + "loss": 0.044, + "step": 5464 + }, + { + "epoch": 2.84487246225924, + "grad_norm": 0.21800081748182232, + "learning_rate": 3.497867277323652e-07, + "loss": 0.0424, + "step": 5465 + }, + { + "epoch": 2.8453930244664236, + "grad_norm": 0.23473879191053773, + "learning_rate": 3.474485484537521e-07, + "loss": 0.045, + "step": 5466 + }, + { + "epoch": 2.8459135866736074, + "grad_norm": 0.23264235355245735, + "learning_rate": 3.451181555981148e-07, + "loss": 0.043, + "step": 5467 + }, + { + "epoch": 2.846434148880791, + "grad_norm": 0.22835863639242554, + "learning_rate": 3.42795549901509e-07, + "loss": 0.0433, + "step": 5468 + }, + { + "epoch": 2.846954711087975, + "grad_norm": 0.2296814079782182, + "learning_rate": 3.4048073209751175e-07, + "loss": 0.044, + "step": 5469 + }, + { + "epoch": 2.8474752732951587, + "grad_norm": 0.22896223448226471, + "learning_rate": 3.381737029172577e-07, + "loss": 0.043, + "step": 5470 + }, + { + "epoch": 2.8479958355023425, + "grad_norm": 0.2229011394279904, + "learning_rate": 3.358744630894084e-07, + "loss": 0.0433, + "step": 5471 + }, + { + "epoch": 2.8485163977095262, + "grad_norm": 0.22816466058853094, + "learning_rate": 3.335830133401746e-07, + "loss": 0.0434, + "step": 5472 + }, + { + "epoch": 2.84903695991671, + "grad_norm": 0.23157385199630254, + "learning_rate": 3.3129935439329963e-07, + "loss": 0.0439, + "step": 5473 + }, + { + "epoch": 2.849557522123894, + "grad_norm": 0.22032768627149715, + "learning_rate": 3.290234869700731e-07, + "loss": 0.0424, + "step": 5474 + }, + { + "epoch": 2.8500780843310776, + "grad_norm": 0.23190042597170488, + "learning_rate": 3.2675541178931456e-07, + "loss": 0.044, + "step": 5475 + }, + { + "epoch": 2.8505986465382613, + "grad_norm": 0.23176287027892795, + "learning_rate": 3.244951295673926e-07, + "loss": 0.0429, + "step": 5476 + }, + { + "epoch": 2.851119208745445, + "grad_norm": 0.2242543810542042, + "learning_rate": 3.222426410182111e-07, + "loss": 0.0421, + "step": 5477 + }, + { + "epoch": 2.851639770952629, + "grad_norm": 0.2206514598162686, + "learning_rate": 3.199979468532038e-07, + "loss": 0.0426, + "step": 5478 + }, + { + "epoch": 2.8521603331598127, + "grad_norm": 0.23274276038647804, + "learning_rate": 3.1776104778135364e-07, + "loss": 0.0435, + "step": 5479 + }, + { + "epoch": 2.8526808953669964, + "grad_norm": 0.22635998598505638, + "learning_rate": 3.155319445091787e-07, + "loss": 0.043, + "step": 5480 + }, + { + "epoch": 2.85320145757418, + "grad_norm": 0.2324227117826993, + "learning_rate": 3.1331063774072965e-07, + "loss": 0.044, + "step": 5481 + }, + { + "epoch": 2.853722019781364, + "grad_norm": 0.22289276085299753, + "learning_rate": 3.1109712817760374e-07, + "loss": 0.0431, + "step": 5482 + }, + { + "epoch": 2.8542425819885477, + "grad_norm": 0.2306524725426029, + "learning_rate": 3.0889141651892495e-07, + "loss": 0.0439, + "step": 5483 + }, + { + "epoch": 2.8547631441957315, + "grad_norm": 0.23473790460862667, + "learning_rate": 3.0669350346136106e-07, + "loss": 0.0446, + "step": 5484 + }, + { + "epoch": 2.8552837064029153, + "grad_norm": 0.2322283298990947, + "learning_rate": 3.045033896991178e-07, + "loss": 0.044, + "step": 5485 + }, + { + "epoch": 2.855804268610099, + "grad_norm": 0.23255630154350312, + "learning_rate": 3.0232107592393364e-07, + "loss": 0.0436, + "step": 5486 + }, + { + "epoch": 2.8563248308172824, + "grad_norm": 0.22101058262042608, + "learning_rate": 3.001465628250849e-07, + "loss": 0.0414, + "step": 5487 + }, + { + "epoch": 2.856845393024466, + "grad_norm": 0.22455476284506215, + "learning_rate": 2.979798510893833e-07, + "loss": 0.0418, + "step": 5488 + }, + { + "epoch": 2.85736595523165, + "grad_norm": 0.2183840949547814, + "learning_rate": 2.958209414011759e-07, + "loss": 0.042, + "step": 5489 + }, + { + "epoch": 2.8578865174388337, + "grad_norm": 0.2277491006600283, + "learning_rate": 2.936698344423505e-07, + "loss": 0.0439, + "step": 5490 + }, + { + "epoch": 2.8584070796460175, + "grad_norm": 0.22275283037062169, + "learning_rate": 2.9152653089232764e-07, + "loss": 0.0412, + "step": 5491 + }, + { + "epoch": 2.8589276418532013, + "grad_norm": 0.23345487136636467, + "learning_rate": 2.8939103142805457e-07, + "loss": 0.0423, + "step": 5492 + }, + { + "epoch": 2.859448204060385, + "grad_norm": 0.2319513079584263, + "learning_rate": 2.8726333672402796e-07, + "loss": 0.0446, + "step": 5493 + }, + { + "epoch": 2.859968766267569, + "grad_norm": 0.22778088078659994, + "learning_rate": 2.8514344745227126e-07, + "loss": 0.0428, + "step": 5494 + }, + { + "epoch": 2.8604893284747526, + "grad_norm": 0.2332741422674495, + "learning_rate": 2.8303136428234624e-07, + "loss": 0.0434, + "step": 5495 + }, + { + "epoch": 2.8610098906819363, + "grad_norm": 0.22627378712005314, + "learning_rate": 2.809270878813441e-07, + "loss": 0.0442, + "step": 5496 + }, + { + "epoch": 2.86153045288912, + "grad_norm": 0.22239061683349978, + "learning_rate": 2.788306189138945e-07, + "loss": 0.0428, + "step": 5497 + }, + { + "epoch": 2.862051015096304, + "grad_norm": 0.2248198343709385, + "learning_rate": 2.767419580421593e-07, + "loss": 0.0429, + "step": 5498 + }, + { + "epoch": 2.8625715773034877, + "grad_norm": 0.2304909297159804, + "learning_rate": 2.7466110592583585e-07, + "loss": 0.0413, + "step": 5499 + }, + { + "epoch": 2.8630921395106714, + "grad_norm": 0.23465634470891614, + "learning_rate": 2.725880632221511e-07, + "loss": 0.0442, + "step": 5500 + }, + { + "epoch": 2.863612701717855, + "grad_norm": 0.22387193157763116, + "learning_rate": 2.705228305858731e-07, + "loss": 0.0418, + "step": 5501 + }, + { + "epoch": 2.864133263925039, + "grad_norm": 0.23336695279278744, + "learning_rate": 2.684654086692939e-07, + "loss": 0.0445, + "step": 5502 + }, + { + "epoch": 2.8646538261322227, + "grad_norm": 0.21957744263463164, + "learning_rate": 2.664157981222437e-07, + "loss": 0.0435, + "step": 5503 + }, + { + "epoch": 2.8651743883394065, + "grad_norm": 0.2170786887345366, + "learning_rate": 2.6437399959208797e-07, + "loss": 0.0421, + "step": 5504 + }, + { + "epoch": 2.8656949505465903, + "grad_norm": 0.22627011415059628, + "learning_rate": 2.6234001372372194e-07, + "loss": 0.0428, + "step": 5505 + }, + { + "epoch": 2.866215512753774, + "grad_norm": 0.2196527927457996, + "learning_rate": 2.603138411595707e-07, + "loss": 0.0422, + "step": 5506 + }, + { + "epoch": 2.866736074960958, + "grad_norm": 0.2273020912046598, + "learning_rate": 2.5829548253959445e-07, + "loss": 0.043, + "step": 5507 + }, + { + "epoch": 2.8672566371681416, + "grad_norm": 0.22215073225966223, + "learning_rate": 2.5628493850128334e-07, + "loss": 0.0425, + "step": 5508 + }, + { + "epoch": 2.8677771993753254, + "grad_norm": 0.21964426631376047, + "learning_rate": 2.5428220967965986e-07, + "loss": 0.0419, + "step": 5509 + }, + { + "epoch": 2.868297761582509, + "grad_norm": 0.23320229447875457, + "learning_rate": 2.5228729670728477e-07, + "loss": 0.0438, + "step": 5510 + }, + { + "epoch": 2.868818323789693, + "grad_norm": 0.22196795916795412, + "learning_rate": 2.503002002142374e-07, + "loss": 0.0422, + "step": 5511 + }, + { + "epoch": 2.8693388859968767, + "grad_norm": 0.22835206156602514, + "learning_rate": 2.483209208281406e-07, + "loss": 0.0431, + "step": 5512 + }, + { + "epoch": 2.8698594482040605, + "grad_norm": 0.22386662881380173, + "learning_rate": 2.4634945917414164e-07, + "loss": 0.0421, + "step": 5513 + }, + { + "epoch": 2.8703800104112442, + "grad_norm": 0.2352898314338022, + "learning_rate": 2.4438581587491737e-07, + "loss": 0.0445, + "step": 5514 + }, + { + "epoch": 2.870900572618428, + "grad_norm": 0.23787340624984157, + "learning_rate": 2.424299915506828e-07, + "loss": 0.0447, + "step": 5515 + }, + { + "epoch": 2.871421134825612, + "grad_norm": 0.2205758396912121, + "learning_rate": 2.4048198681917154e-07, + "loss": 0.0432, + "step": 5516 + }, + { + "epoch": 2.8719416970327956, + "grad_norm": 0.21752952294602812, + "learning_rate": 2.3854180229565816e-07, + "loss": 0.0416, + "step": 5517 + }, + { + "epoch": 2.8724622592399793, + "grad_norm": 0.22801262969470634, + "learning_rate": 2.3660943859294127e-07, + "loss": 0.0438, + "step": 5518 + }, + { + "epoch": 2.872982821447163, + "grad_norm": 0.23350404962787336, + "learning_rate": 2.3468489632135772e-07, + "loss": 0.044, + "step": 5519 + }, + { + "epoch": 2.873503383654347, + "grad_norm": 0.23598923493996454, + "learning_rate": 2.3276817608875734e-07, + "loss": 0.0442, + "step": 5520 + }, + { + "epoch": 2.8740239458615306, + "grad_norm": 0.2246003814846989, + "learning_rate": 2.3085927850053646e-07, + "loss": 0.0423, + "step": 5521 + }, + { + "epoch": 2.8745445080687144, + "grad_norm": 0.22646231927768828, + "learning_rate": 2.2895820415961e-07, + "loss": 0.0444, + "step": 5522 + }, + { + "epoch": 2.875065070275898, + "grad_norm": 0.22076704915255055, + "learning_rate": 2.2706495366643098e-07, + "loss": 0.0422, + "step": 5523 + }, + { + "epoch": 2.875585632483082, + "grad_norm": 0.22233921973664836, + "learning_rate": 2.2517952761897387e-07, + "loss": 0.0406, + "step": 5524 + }, + { + "epoch": 2.8761061946902657, + "grad_norm": 0.22865295080762318, + "learning_rate": 2.233019266127373e-07, + "loss": 0.0421, + "step": 5525 + }, + { + "epoch": 2.8766267568974495, + "grad_norm": 0.22596505815503942, + "learning_rate": 2.2143215124076354e-07, + "loss": 0.0429, + "step": 5526 + }, + { + "epoch": 2.877147319104633, + "grad_norm": 0.22657611067678246, + "learning_rate": 2.195702020936108e-07, + "loss": 0.0425, + "step": 5527 + }, + { + "epoch": 2.8776678813118166, + "grad_norm": 0.22817410965615426, + "learning_rate": 2.1771607975937246e-07, + "loss": 0.0432, + "step": 5528 + }, + { + "epoch": 2.8781884435190004, + "grad_norm": 0.22318595175463884, + "learning_rate": 2.158697848236607e-07, + "loss": 0.0425, + "step": 5529 + }, + { + "epoch": 2.878709005726184, + "grad_norm": 0.23144548762355127, + "learning_rate": 2.1403131786962848e-07, + "loss": 0.043, + "step": 5530 + }, + { + "epoch": 2.879229567933368, + "grad_norm": 0.22008731677973759, + "learning_rate": 2.1220067947794187e-07, + "loss": 0.0423, + "step": 5531 + }, + { + "epoch": 2.8797501301405517, + "grad_norm": 0.22081632722175687, + "learning_rate": 2.1037787022681055e-07, + "loss": 0.0411, + "step": 5532 + }, + { + "epoch": 2.8802706923477355, + "grad_norm": 0.2261541595421086, + "learning_rate": 2.0856289069195457e-07, + "loss": 0.0435, + "step": 5533 + }, + { + "epoch": 2.8807912545549192, + "grad_norm": 0.2325478049396348, + "learning_rate": 2.0675574144663202e-07, + "loss": 0.0455, + "step": 5534 + }, + { + "epoch": 2.881311816762103, + "grad_norm": 0.22779678676819293, + "learning_rate": 2.0495642306162244e-07, + "loss": 0.0423, + "step": 5535 + }, + { + "epoch": 2.881832378969287, + "grad_norm": 0.2708994243214654, + "learning_rate": 2.031649361052379e-07, + "loss": 0.0435, + "step": 5536 + }, + { + "epoch": 2.8823529411764706, + "grad_norm": 0.225376654268015, + "learning_rate": 2.0138128114331467e-07, + "loss": 0.0425, + "step": 5537 + }, + { + "epoch": 2.8828735033836543, + "grad_norm": 0.21849928271688246, + "learning_rate": 1.9960545873920765e-07, + "loss": 0.0422, + "step": 5538 + }, + { + "epoch": 2.883394065590838, + "grad_norm": 0.2240071596809921, + "learning_rate": 1.978374694538071e-07, + "loss": 0.0446, + "step": 5539 + }, + { + "epoch": 2.883914627798022, + "grad_norm": 0.21762364229612807, + "learning_rate": 1.960773138455302e-07, + "loss": 0.0424, + "step": 5540 + }, + { + "epoch": 2.8844351900052057, + "grad_norm": 0.22194985729963923, + "learning_rate": 1.9432499247031e-07, + "loss": 0.0419, + "step": 5541 + }, + { + "epoch": 2.8849557522123894, + "grad_norm": 0.2192694312980397, + "learning_rate": 1.925805058816177e-07, + "loss": 0.0414, + "step": 5542 + }, + { + "epoch": 2.885476314419573, + "grad_norm": 0.2458743689393578, + "learning_rate": 1.9084385463043475e-07, + "loss": 0.0451, + "step": 5543 + }, + { + "epoch": 2.885996876626757, + "grad_norm": 0.22425721644334612, + "learning_rate": 1.891150392652835e-07, + "loss": 0.0417, + "step": 5544 + }, + { + "epoch": 2.8865174388339407, + "grad_norm": 0.2275200585924636, + "learning_rate": 1.873940603322022e-07, + "loss": 0.0437, + "step": 5545 + }, + { + "epoch": 2.8870380010411245, + "grad_norm": 0.22808079161183273, + "learning_rate": 1.85680918374756e-07, + "loss": 0.0427, + "step": 5546 + }, + { + "epoch": 2.8875585632483083, + "grad_norm": 0.2232387790940575, + "learning_rate": 1.8397561393403427e-07, + "loss": 0.0443, + "step": 5547 + }, + { + "epoch": 2.888079125455492, + "grad_norm": 0.22740682941731324, + "learning_rate": 1.8227814754865068e-07, + "loss": 0.0444, + "step": 5548 + }, + { + "epoch": 2.888599687662676, + "grad_norm": 0.2304311796701836, + "learning_rate": 1.8058851975474577e-07, + "loss": 0.0446, + "step": 5549 + }, + { + "epoch": 2.8891202498698596, + "grad_norm": 0.2347568933658674, + "learning_rate": 1.7890673108598433e-07, + "loss": 0.0424, + "step": 5550 + }, + { + "epoch": 2.889640812077043, + "grad_norm": 0.22430118829279044, + "learning_rate": 1.7723278207354976e-07, + "loss": 0.0433, + "step": 5551 + }, + { + "epoch": 2.8901613742842267, + "grad_norm": 0.22066156804762518, + "learning_rate": 1.7556667324615527e-07, + "loss": 0.0417, + "step": 5552 + }, + { + "epoch": 2.8906819364914105, + "grad_norm": 0.22032601369478175, + "learning_rate": 1.7390840513003548e-07, + "loss": 0.0415, + "step": 5553 + }, + { + "epoch": 2.8912024986985942, + "grad_norm": 0.23669903664743355, + "learning_rate": 1.722579782489464e-07, + "loss": 0.0443, + "step": 5554 + }, + { + "epoch": 2.891723060905778, + "grad_norm": 0.23261439457929947, + "learning_rate": 1.7061539312417108e-07, + "loss": 0.0434, + "step": 5555 + }, + { + "epoch": 2.892243623112962, + "grad_norm": 0.22614962993197088, + "learning_rate": 1.689806502745167e-07, + "loss": 0.042, + "step": 5556 + }, + { + "epoch": 2.8927641853201456, + "grad_norm": 0.22566726168640677, + "learning_rate": 1.6735375021630916e-07, + "loss": 0.0437, + "step": 5557 + }, + { + "epoch": 2.8932847475273293, + "grad_norm": 0.23008423849366555, + "learning_rate": 1.6573469346339576e-07, + "loss": 0.0432, + "step": 5558 + }, + { + "epoch": 2.893805309734513, + "grad_norm": 0.22692870640049476, + "learning_rate": 1.6412348052715632e-07, + "loss": 0.0448, + "step": 5559 + }, + { + "epoch": 2.894325871941697, + "grad_norm": 0.22253713385008417, + "learning_rate": 1.6252011191648653e-07, + "loss": 0.0418, + "step": 5560 + }, + { + "epoch": 2.8948464341488807, + "grad_norm": 0.24561203289796596, + "learning_rate": 1.6092458813779797e-07, + "loss": 0.043, + "step": 5561 + }, + { + "epoch": 2.8953669963560644, + "grad_norm": 0.2272264787299964, + "learning_rate": 1.593369096950348e-07, + "loss": 0.0422, + "step": 5562 + }, + { + "epoch": 2.895887558563248, + "grad_norm": 0.21728188819033073, + "learning_rate": 1.5775707708966248e-07, + "loss": 0.0418, + "step": 5563 + }, + { + "epoch": 2.896408120770432, + "grad_norm": 0.23004082744114976, + "learning_rate": 1.5618509082066246e-07, + "loss": 0.0437, + "step": 5564 + }, + { + "epoch": 2.8969286829776157, + "grad_norm": 0.24070902532286687, + "learning_rate": 1.5462095138454314e-07, + "loss": 0.043, + "step": 5565 + }, + { + "epoch": 2.8974492451847995, + "grad_norm": 0.2340610425537053, + "learning_rate": 1.5306465927533154e-07, + "loss": 0.0439, + "step": 5566 + }, + { + "epoch": 2.8979698073919833, + "grad_norm": 0.2266359975592615, + "learning_rate": 1.515162149845789e-07, + "loss": 0.0427, + "step": 5567 + }, + { + "epoch": 2.898490369599167, + "grad_norm": 0.22807262882625087, + "learning_rate": 1.4997561900135238e-07, + "loss": 0.0428, + "step": 5568 + }, + { + "epoch": 2.899010931806351, + "grad_norm": 0.22716325142678023, + "learning_rate": 1.4844287181224603e-07, + "loss": 0.0439, + "step": 5569 + }, + { + "epoch": 2.8995314940135346, + "grad_norm": 0.22620114345008535, + "learning_rate": 1.469179739013754e-07, + "loss": 0.0443, + "step": 5570 + }, + { + "epoch": 2.9000520562207184, + "grad_norm": 0.22824850058053361, + "learning_rate": 1.4540092575036636e-07, + "loss": 0.043, + "step": 5571 + }, + { + "epoch": 2.900572618427902, + "grad_norm": 0.23480974832174725, + "learning_rate": 1.4389172783838277e-07, + "loss": 0.0467, + "step": 5572 + }, + { + "epoch": 2.901093180635086, + "grad_norm": 0.22686928685216148, + "learning_rate": 1.4239038064209343e-07, + "loss": 0.0426, + "step": 5573 + }, + { + "epoch": 2.9016137428422697, + "grad_norm": 0.2185130750861107, + "learning_rate": 1.4089688463569394e-07, + "loss": 0.0412, + "step": 5574 + }, + { + "epoch": 2.9021343050494535, + "grad_norm": 0.22340460528624712, + "learning_rate": 1.3941124029090425e-07, + "loss": 0.0427, + "step": 5575 + }, + { + "epoch": 2.9026548672566372, + "grad_norm": 0.24387629963245097, + "learning_rate": 1.379334480769545e-07, + "loss": 0.046, + "step": 5576 + }, + { + "epoch": 2.903175429463821, + "grad_norm": 0.22899996540170645, + "learning_rate": 1.3646350846060197e-07, + "loss": 0.0453, + "step": 5577 + }, + { + "epoch": 2.903695991671005, + "grad_norm": 0.22580749048473495, + "learning_rate": 1.3500142190612797e-07, + "loss": 0.043, + "step": 5578 + }, + { + "epoch": 2.9042165538781886, + "grad_norm": 0.22259129283476642, + "learning_rate": 1.335471888753187e-07, + "loss": 0.0431, + "step": 5579 + }, + { + "epoch": 2.9047371160853723, + "grad_norm": 0.2222893681291658, + "learning_rate": 1.3210080982749284e-07, + "loss": 0.0424, + "step": 5580 + }, + { + "epoch": 2.905257678292556, + "grad_norm": 0.21864221805966788, + "learning_rate": 1.306622852194822e-07, + "loss": 0.0413, + "step": 5581 + }, + { + "epoch": 2.90577824049974, + "grad_norm": 0.2293590339534558, + "learning_rate": 1.2923161550564278e-07, + "loss": 0.0436, + "step": 5582 + }, + { + "epoch": 2.9062988027069236, + "grad_norm": 0.228497915949144, + "learning_rate": 1.2780880113784365e-07, + "loss": 0.0449, + "step": 5583 + }, + { + "epoch": 2.9068193649141074, + "grad_norm": 0.21641699537000803, + "learning_rate": 1.2639384256547816e-07, + "loss": 0.0413, + "step": 5584 + }, + { + "epoch": 2.907339927121291, + "grad_norm": 0.21723573180469424, + "learning_rate": 1.2498674023545543e-07, + "loss": 0.042, + "step": 5585 + }, + { + "epoch": 2.907860489328475, + "grad_norm": 0.2299450832808966, + "learning_rate": 1.2358749459220332e-07, + "loss": 0.0431, + "step": 5586 + }, + { + "epoch": 2.9083810515356587, + "grad_norm": 0.23308971409684184, + "learning_rate": 1.221961060776683e-07, + "loss": 0.0439, + "step": 5587 + }, + { + "epoch": 2.9089016137428425, + "grad_norm": 0.23005044711744224, + "learning_rate": 1.2081257513131828e-07, + "loss": 0.0447, + "step": 5588 + }, + { + "epoch": 2.9094221759500263, + "grad_norm": 0.22157757520601382, + "learning_rate": 1.1943690219013148e-07, + "loss": 0.0424, + "step": 5589 + }, + { + "epoch": 2.90994273815721, + "grad_norm": 0.22673099799671645, + "learning_rate": 1.1806908768861314e-07, + "loss": 0.0427, + "step": 5590 + }, + { + "epoch": 2.9104633003643934, + "grad_norm": 0.22789349189024055, + "learning_rate": 1.167091320587843e-07, + "loss": 0.0433, + "step": 5591 + }, + { + "epoch": 2.910983862571577, + "grad_norm": 0.221483966989294, + "learning_rate": 1.1535703573017919e-07, + "loss": 0.0421, + "step": 5592 + }, + { + "epoch": 2.911504424778761, + "grad_norm": 0.22112758106997762, + "learning_rate": 1.1401279912985341e-07, + "loss": 0.0426, + "step": 5593 + }, + { + "epoch": 2.9120249869859447, + "grad_norm": 0.22109349725590008, + "learning_rate": 1.1267642268238121e-07, + "loss": 0.0426, + "step": 5594 + }, + { + "epoch": 2.9125455491931285, + "grad_norm": 0.22080664506069955, + "learning_rate": 1.1134790680984996e-07, + "loss": 0.0418, + "step": 5595 + }, + { + "epoch": 2.9130661114003122, + "grad_norm": 0.22213280394837107, + "learning_rate": 1.1002725193186847e-07, + "loss": 0.0414, + "step": 5596 + }, + { + "epoch": 2.913586673607496, + "grad_norm": 0.230005852903613, + "learning_rate": 1.0871445846555861e-07, + "loss": 0.0451, + "step": 5597 + }, + { + "epoch": 2.91410723581468, + "grad_norm": 0.22684379339260194, + "learning_rate": 1.0740952682556371e-07, + "loss": 0.0439, + "step": 5598 + }, + { + "epoch": 2.9146277980218636, + "grad_norm": 0.2309550050238035, + "learning_rate": 1.0611245742404297e-07, + "loss": 0.0443, + "step": 5599 + }, + { + "epoch": 2.9151483602290473, + "grad_norm": 0.22694425419899925, + "learning_rate": 1.0482325067066868e-07, + "loss": 0.0429, + "step": 5600 + }, + { + "epoch": 2.915668922436231, + "grad_norm": 0.2355942815015955, + "learning_rate": 1.0354190697263454e-07, + "loss": 0.0416, + "step": 5601 + }, + { + "epoch": 2.916189484643415, + "grad_norm": 0.22915067675452885, + "learning_rate": 1.0226842673464742e-07, + "loss": 0.0431, + "step": 5602 + }, + { + "epoch": 2.9167100468505986, + "grad_norm": 0.22859882687372848, + "learning_rate": 1.0100281035893277e-07, + "loss": 0.0421, + "step": 5603 + }, + { + "epoch": 2.9172306090577824, + "grad_norm": 0.22612284310863184, + "learning_rate": 9.97450582452264e-08, + "loss": 0.0432, + "step": 5604 + }, + { + "epoch": 2.917751171264966, + "grad_norm": 0.2371748286947038, + "learning_rate": 9.849517079079384e-08, + "loss": 0.0442, + "step": 5605 + }, + { + "epoch": 2.91827173347215, + "grad_norm": 0.23429013057909479, + "learning_rate": 9.725314839039989e-08, + "loss": 0.0456, + "step": 5606 + }, + { + "epoch": 2.9187922956793337, + "grad_norm": 0.2252893473582409, + "learning_rate": 9.60189914363363e-08, + "loss": 0.0435, + "step": 5607 + }, + { + "epoch": 2.9193128578865175, + "grad_norm": 0.22061594822107208, + "learning_rate": 9.479270031840514e-08, + "loss": 0.0426, + "step": 5608 + }, + { + "epoch": 2.9198334200937013, + "grad_norm": 0.22394052877244155, + "learning_rate": 9.357427542392716e-08, + "loss": 0.0443, + "step": 5609 + }, + { + "epoch": 2.920353982300885, + "grad_norm": 0.22924733216319026, + "learning_rate": 9.236371713774172e-08, + "loss": 0.0432, + "step": 5610 + }, + { + "epoch": 2.920874544508069, + "grad_norm": 0.22008904881055802, + "learning_rate": 9.116102584219299e-08, + "loss": 0.0411, + "step": 5611 + }, + { + "epoch": 2.9213951067152526, + "grad_norm": 0.23123570570029328, + "learning_rate": 8.996620191714933e-08, + "loss": 0.0448, + "step": 5612 + }, + { + "epoch": 2.9219156689224364, + "grad_norm": 0.22502509485305863, + "learning_rate": 8.877924573999496e-08, + "loss": 0.0435, + "step": 5613 + }, + { + "epoch": 2.92243623112962, + "grad_norm": 0.22614275559557945, + "learning_rate": 8.760015768562169e-08, + "loss": 0.0431, + "step": 5614 + }, + { + "epoch": 2.9229567933368035, + "grad_norm": 0.23309402951892166, + "learning_rate": 8.642893812644271e-08, + "loss": 0.043, + "step": 5615 + }, + { + "epoch": 2.9234773555439872, + "grad_norm": 0.23455703296115385, + "learning_rate": 8.526558743238156e-08, + "loss": 0.0459, + "step": 5616 + }, + { + "epoch": 2.923997917751171, + "grad_norm": 0.22213718683676725, + "learning_rate": 8.411010597088043e-08, + "loss": 0.0411, + "step": 5617 + }, + { + "epoch": 2.924518479958355, + "grad_norm": 0.2309034800189144, + "learning_rate": 8.296249410689461e-08, + "loss": 0.0446, + "step": 5618 + }, + { + "epoch": 2.9250390421655386, + "grad_norm": 0.22884832459468085, + "learning_rate": 8.182275220288971e-08, + "loss": 0.0437, + "step": 5619 + }, + { + "epoch": 2.9255596043727223, + "grad_norm": 0.22329303957735033, + "learning_rate": 8.069088061885277e-08, + "loss": 0.0421, + "step": 5620 + }, + { + "epoch": 2.926080166579906, + "grad_norm": 0.22435831062356618, + "learning_rate": 7.956687971228116e-08, + "loss": 0.0445, + "step": 5621 + }, + { + "epoch": 2.92660072878709, + "grad_norm": 0.22004471253385186, + "learning_rate": 7.845074983818534e-08, + "loss": 0.0423, + "step": 5622 + }, + { + "epoch": 2.9271212909942737, + "grad_norm": 0.2281975415710203, + "learning_rate": 7.734249134909166e-08, + "loss": 0.0443, + "step": 5623 + }, + { + "epoch": 2.9276418532014574, + "grad_norm": 0.2347633299429142, + "learning_rate": 7.624210459504233e-08, + "loss": 0.0468, + "step": 5624 + }, + { + "epoch": 2.928162415408641, + "grad_norm": 0.2244782906812353, + "learning_rate": 7.514958992358434e-08, + "loss": 0.0429, + "step": 5625 + }, + { + "epoch": 2.928682977615825, + "grad_norm": 0.22082459228471335, + "learning_rate": 7.406494767979167e-08, + "loss": 0.0404, + "step": 5626 + }, + { + "epoch": 2.9292035398230087, + "grad_norm": 0.23891153241426735, + "learning_rate": 7.298817820623749e-08, + "loss": 0.0452, + "step": 5627 + }, + { + "epoch": 2.9297241020301925, + "grad_norm": 0.2285958330718903, + "learning_rate": 7.191928184302477e-08, + "loss": 0.0424, + "step": 5628 + }, + { + "epoch": 2.9302446642373763, + "grad_norm": 0.21899788706540083, + "learning_rate": 7.085825892775288e-08, + "loss": 0.0439, + "step": 5629 + }, + { + "epoch": 2.93076522644456, + "grad_norm": 0.22669475507459996, + "learning_rate": 6.980510979554545e-08, + "loss": 0.0419, + "step": 5630 + }, + { + "epoch": 2.931285788651744, + "grad_norm": 0.23827004869207266, + "learning_rate": 6.875983477903635e-08, + "loss": 0.0448, + "step": 5631 + }, + { + "epoch": 2.9318063508589276, + "grad_norm": 0.23319352811948646, + "learning_rate": 6.772243420836988e-08, + "loss": 0.0425, + "step": 5632 + }, + { + "epoch": 2.9323269130661114, + "grad_norm": 0.21762653350885441, + "learning_rate": 6.66929084112089e-08, + "loss": 0.0409, + "step": 5633 + }, + { + "epoch": 2.932847475273295, + "grad_norm": 0.2387237574668359, + "learning_rate": 6.567125771272387e-08, + "loss": 0.0452, + "step": 5634 + }, + { + "epoch": 2.933368037480479, + "grad_norm": 0.23525862361486116, + "learning_rate": 6.465748243559556e-08, + "loss": 0.0429, + "step": 5635 + }, + { + "epoch": 2.9338885996876627, + "grad_norm": 0.23174264458375252, + "learning_rate": 6.365158290002893e-08, + "loss": 0.0442, + "step": 5636 + }, + { + "epoch": 2.9344091618948465, + "grad_norm": 0.2253382202261745, + "learning_rate": 6.26535594237282e-08, + "loss": 0.0426, + "step": 5637 + }, + { + "epoch": 2.9349297241020302, + "grad_norm": 0.23139331028310633, + "learning_rate": 6.166341232191896e-08, + "loss": 0.0437, + "step": 5638 + }, + { + "epoch": 2.935450286309214, + "grad_norm": 0.22693372243517235, + "learning_rate": 6.068114190733442e-08, + "loss": 0.0429, + "step": 5639 + }, + { + "epoch": 2.935970848516398, + "grad_norm": 0.2286453647937163, + "learning_rate": 5.970674849022084e-08, + "loss": 0.0427, + "step": 5640 + }, + { + "epoch": 2.9364914107235816, + "grad_norm": 0.21710776971071516, + "learning_rate": 5.874023237833759e-08, + "loss": 0.0422, + "step": 5641 + }, + { + "epoch": 2.9370119729307653, + "grad_norm": 0.22418298046187546, + "learning_rate": 5.7781593876954366e-08, + "loss": 0.0432, + "step": 5642 + }, + { + "epoch": 2.937532535137949, + "grad_norm": 0.2227493313624406, + "learning_rate": 5.683083328885397e-08, + "loss": 0.0413, + "step": 5643 + }, + { + "epoch": 2.938053097345133, + "grad_norm": 0.22433420679943167, + "learning_rate": 5.58879509143323e-08, + "loss": 0.0422, + "step": 5644 + }, + { + "epoch": 2.9385736595523166, + "grad_norm": 0.2201777114475942, + "learning_rate": 5.495294705119558e-08, + "loss": 0.0427, + "step": 5645 + }, + { + "epoch": 2.9390942217595004, + "grad_norm": 0.2272931098330437, + "learning_rate": 5.402582199476036e-08, + "loss": 0.0427, + "step": 5646 + }, + { + "epoch": 2.939614783966684, + "grad_norm": 0.21987207104156062, + "learning_rate": 5.3106576037856295e-08, + "loss": 0.0416, + "step": 5647 + }, + { + "epoch": 2.940135346173868, + "grad_norm": 0.23629433671926608, + "learning_rate": 5.2195209470823345e-08, + "loss": 0.0434, + "step": 5648 + }, + { + "epoch": 2.9406559083810517, + "grad_norm": 0.226357025922961, + "learning_rate": 5.1291722581511826e-08, + "loss": 0.0437, + "step": 5649 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 0.22887056354520308, + "learning_rate": 5.039611565529068e-08, + "loss": 0.0424, + "step": 5650 + }, + { + "epoch": 2.9416970327954193, + "grad_norm": 0.22765259780951763, + "learning_rate": 4.950838897503085e-08, + "loss": 0.0434, + "step": 5651 + }, + { + "epoch": 2.942217595002603, + "grad_norm": 0.22353601786311908, + "learning_rate": 4.86285428211164e-08, + "loss": 0.0419, + "step": 5652 + }, + { + "epoch": 2.942738157209787, + "grad_norm": 0.22628985062359203, + "learning_rate": 4.7756577471444466e-08, + "loss": 0.0427, + "step": 5653 + }, + { + "epoch": 2.9432587194169706, + "grad_norm": 0.2334402923764293, + "learning_rate": 4.6892493201422525e-08, + "loss": 0.0443, + "step": 5654 + }, + { + "epoch": 2.943779281624154, + "grad_norm": 0.2213843214467801, + "learning_rate": 4.603629028397116e-08, + "loss": 0.0411, + "step": 5655 + }, + { + "epoch": 2.9442998438313377, + "grad_norm": 0.22400947744966332, + "learning_rate": 4.518796898951572e-08, + "loss": 0.0424, + "step": 5656 + }, + { + "epoch": 2.9448204060385215, + "grad_norm": 0.2205563956277841, + "learning_rate": 4.434752958599464e-08, + "loss": 0.0419, + "step": 5657 + }, + { + "epoch": 2.9453409682457052, + "grad_norm": 0.2218373906484088, + "learning_rate": 4.351497233886226e-08, + "loss": 0.0423, + "step": 5658 + }, + { + "epoch": 2.945861530452889, + "grad_norm": 0.2257691605384441, + "learning_rate": 4.26902975110749e-08, + "loss": 0.0434, + "step": 5659 + }, + { + "epoch": 2.946382092660073, + "grad_norm": 0.21762331110270125, + "learning_rate": 4.1873505363104746e-08, + "loss": 0.0414, + "step": 5660 + }, + { + "epoch": 2.9469026548672566, + "grad_norm": 0.2270394287344173, + "learning_rate": 4.106459615293434e-08, + "loss": 0.0434, + "step": 5661 + }, + { + "epoch": 2.9474232170744403, + "grad_norm": 0.22235806351067547, + "learning_rate": 4.026357013605098e-08, + "loss": 0.0432, + "step": 5662 + }, + { + "epoch": 2.947943779281624, + "grad_norm": 0.20548692652481407, + "learning_rate": 3.9470427565460596e-08, + "loss": 0.0413, + "step": 5663 + }, + { + "epoch": 2.948464341488808, + "grad_norm": 0.2279626929922583, + "learning_rate": 3.8685168691671156e-08, + "loss": 0.042, + "step": 5664 + }, + { + "epoch": 2.9489849036959916, + "grad_norm": 0.2257806223440966, + "learning_rate": 3.7907793762703705e-08, + "loss": 0.0418, + "step": 5665 + }, + { + "epoch": 2.9495054659031754, + "grad_norm": 0.2270748445679294, + "learning_rate": 3.713830302409238e-08, + "loss": 0.0433, + "step": 5666 + }, + { + "epoch": 2.950026028110359, + "grad_norm": 0.22232084572422514, + "learning_rate": 3.637669671887611e-08, + "loss": 0.0418, + "step": 5667 + }, + { + "epoch": 2.950546590317543, + "grad_norm": 0.22207686160720008, + "learning_rate": 3.562297508760415e-08, + "loss": 0.042, + "step": 5668 + }, + { + "epoch": 2.9510671525247267, + "grad_norm": 0.2335518580519873, + "learning_rate": 3.4877138368341614e-08, + "loss": 0.0447, + "step": 5669 + }, + { + "epoch": 2.9515877147319105, + "grad_norm": 0.23000775212512042, + "learning_rate": 3.413918679665284e-08, + "loss": 0.0434, + "step": 5670 + }, + { + "epoch": 2.9521082769390943, + "grad_norm": 0.2200961120523701, + "learning_rate": 3.3409120605623624e-08, + "loss": 0.0416, + "step": 5671 + }, + { + "epoch": 2.952628839146278, + "grad_norm": 0.2211023507432535, + "learning_rate": 3.268694002583617e-08, + "loss": 0.0421, + "step": 5672 + }, + { + "epoch": 2.953149401353462, + "grad_norm": 0.23087187119179148, + "learning_rate": 3.197264528539135e-08, + "loss": 0.0447, + "step": 5673 + }, + { + "epoch": 2.9536699635606456, + "grad_norm": 0.22974934193368118, + "learning_rate": 3.1266236609900356e-08, + "loss": 0.0436, + "step": 5674 + }, + { + "epoch": 2.9541905257678294, + "grad_norm": 0.23183069263185715, + "learning_rate": 3.056771422247362e-08, + "loss": 0.0428, + "step": 5675 + }, + { + "epoch": 2.954711087975013, + "grad_norm": 0.22537547711234884, + "learning_rate": 2.9877078343740206e-08, + "loss": 0.0435, + "step": 5676 + }, + { + "epoch": 2.955231650182197, + "grad_norm": 0.22654670232931923, + "learning_rate": 2.919432919183396e-08, + "loss": 0.042, + "step": 5677 + }, + { + "epoch": 2.9557522123893807, + "grad_norm": 0.22542875112228397, + "learning_rate": 2.851946698240182e-08, + "loss": 0.0431, + "step": 5678 + }, + { + "epoch": 2.956272774596564, + "grad_norm": 0.22216150114544114, + "learning_rate": 2.7852491928595514e-08, + "loss": 0.0407, + "step": 5679 + }, + { + "epoch": 2.956793336803748, + "grad_norm": 0.22284731710279135, + "learning_rate": 2.7193404241074306e-08, + "loss": 0.0428, + "step": 5680 + }, + { + "epoch": 2.9573138990109316, + "grad_norm": 0.22243490757474335, + "learning_rate": 2.654220412801056e-08, + "loss": 0.0416, + "step": 5681 + }, + { + "epoch": 2.9578344612181153, + "grad_norm": 0.22588973540324708, + "learning_rate": 2.5898891795084202e-08, + "loss": 0.0429, + "step": 5682 + }, + { + "epoch": 2.958355023425299, + "grad_norm": 0.2378710459634585, + "learning_rate": 2.5263467445479915e-08, + "loss": 0.0448, + "step": 5683 + }, + { + "epoch": 2.958875585632483, + "grad_norm": 0.23245001982292812, + "learning_rate": 2.4635931279898273e-08, + "loss": 0.043, + "step": 5684 + }, + { + "epoch": 2.9593961478396666, + "grad_norm": 0.2277675922701488, + "learning_rate": 2.4016283496544613e-08, + "loss": 0.043, + "step": 5685 + }, + { + "epoch": 2.9599167100468504, + "grad_norm": 0.21835077756145146, + "learning_rate": 2.340452429112905e-08, + "loss": 0.0416, + "step": 5686 + }, + { + "epoch": 2.960437272254034, + "grad_norm": 0.20768373110222885, + "learning_rate": 2.2800653856874797e-08, + "loss": 0.0403, + "step": 5687 + }, + { + "epoch": 2.960957834461218, + "grad_norm": 0.22352062317713317, + "learning_rate": 2.2204672384512625e-08, + "loss": 0.0415, + "step": 5688 + }, + { + "epoch": 2.9614783966684017, + "grad_norm": 0.22503022396452138, + "learning_rate": 2.161658006228362e-08, + "loss": 0.0422, + "step": 5689 + }, + { + "epoch": 2.9619989588755855, + "grad_norm": 0.22166037598844507, + "learning_rate": 2.1036377075930867e-08, + "loss": 0.0415, + "step": 5690 + }, + { + "epoch": 2.9625195210827693, + "grad_norm": 0.22650211899906575, + "learning_rate": 2.046406360871056e-08, + "loss": 0.0425, + "step": 5691 + }, + { + "epoch": 2.963040083289953, + "grad_norm": 0.22401814975398185, + "learning_rate": 1.989963984138643e-08, + "loss": 0.0444, + "step": 5692 + }, + { + "epoch": 2.963560645497137, + "grad_norm": 0.22589420895588494, + "learning_rate": 1.9343105952229768e-08, + "loss": 0.0425, + "step": 5693 + }, + { + "epoch": 2.9640812077043206, + "grad_norm": 0.2303703387090916, + "learning_rate": 1.8794462117022182e-08, + "loss": 0.0437, + "step": 5694 + }, + { + "epoch": 2.9646017699115044, + "grad_norm": 0.22853971241469895, + "learning_rate": 1.8253708509047285e-08, + "loss": 0.0427, + "step": 5695 + }, + { + "epoch": 2.965122332118688, + "grad_norm": 0.22039445211667255, + "learning_rate": 1.7720845299101783e-08, + "loss": 0.0427, + "step": 5696 + }, + { + "epoch": 2.965642894325872, + "grad_norm": 0.2267933732428841, + "learning_rate": 1.7195872655487166e-08, + "loss": 0.0439, + "step": 5697 + }, + { + "epoch": 2.9661634565330557, + "grad_norm": 0.22063737426257915, + "learning_rate": 1.6678790744015238e-08, + "loss": 0.0427, + "step": 5698 + }, + { + "epoch": 2.9666840187402395, + "grad_norm": 0.22278000386951152, + "learning_rate": 1.616959972800536e-08, + "loss": 0.0429, + "step": 5699 + }, + { + "epoch": 2.9672045809474232, + "grad_norm": 0.2207802651418027, + "learning_rate": 1.5668299768284434e-08, + "loss": 0.0415, + "step": 5700 + }, + { + "epoch": 2.967725143154607, + "grad_norm": 0.22385079884702969, + "learning_rate": 1.5174891023184146e-08, + "loss": 0.044, + "step": 5701 + }, + { + "epoch": 2.9682457053617908, + "grad_norm": 0.2247189896311398, + "learning_rate": 1.4689373648549277e-08, + "loss": 0.0434, + "step": 5702 + }, + { + "epoch": 2.9687662675689745, + "grad_norm": 0.2352896279762599, + "learning_rate": 1.421174779772383e-08, + "loss": 0.0438, + "step": 5703 + }, + { + "epoch": 2.9692868297761583, + "grad_norm": 0.21791843470524855, + "learning_rate": 1.3742013621564909e-08, + "loss": 0.0411, + "step": 5704 + }, + { + "epoch": 2.969807391983342, + "grad_norm": 0.21979737042742506, + "learning_rate": 1.3280171268442721e-08, + "loss": 0.0421, + "step": 5705 + }, + { + "epoch": 2.970327954190526, + "grad_norm": 0.22711419377584297, + "learning_rate": 1.282622088422114e-08, + "loss": 0.0422, + "step": 5706 + }, + { + "epoch": 2.9708485163977096, + "grad_norm": 0.2214357911141881, + "learning_rate": 1.2380162612282697e-08, + "loss": 0.0423, + "step": 5707 + }, + { + "epoch": 2.9713690786048934, + "grad_norm": 0.23079498217049907, + "learning_rate": 1.1941996593514693e-08, + "loss": 0.044, + "step": 5708 + }, + { + "epoch": 2.971889640812077, + "grad_norm": 0.2328227144923505, + "learning_rate": 1.1511722966306426e-08, + "loss": 0.0442, + "step": 5709 + }, + { + "epoch": 2.972410203019261, + "grad_norm": 0.23439895371290795, + "learning_rate": 1.10893418665603e-08, + "loss": 0.043, + "step": 5710 + }, + { + "epoch": 2.9729307652264447, + "grad_norm": 0.22428047997601785, + "learning_rate": 1.0674853427683484e-08, + "loss": 0.0444, + "step": 5711 + }, + { + "epoch": 2.9734513274336285, + "grad_norm": 0.22842869772809338, + "learning_rate": 1.0268257780590707e-08, + "loss": 0.0417, + "step": 5712 + }, + { + "epoch": 2.9739718896408123, + "grad_norm": 0.21987057018297476, + "learning_rate": 9.869555053704239e-09, + "loss": 0.0413, + "step": 5713 + }, + { + "epoch": 2.974492451847996, + "grad_norm": 0.2298608691054948, + "learning_rate": 9.4787453729539e-09, + "loss": 0.0437, + "step": 5714 + }, + { + "epoch": 2.97501301405518, + "grad_norm": 0.2253370055551647, + "learning_rate": 9.095828861771516e-09, + "loss": 0.0432, + "step": 5715 + }, + { + "epoch": 2.9755335762623636, + "grad_norm": 0.2278149249962827, + "learning_rate": 8.720805641104779e-09, + "loss": 0.0434, + "step": 5716 + }, + { + "epoch": 2.9760541384695474, + "grad_norm": 0.2314267428074255, + "learning_rate": 8.353675829403385e-09, + "loss": 0.0443, + "step": 5717 + }, + { + "epoch": 2.976574700676731, + "grad_norm": 0.22400674704669293, + "learning_rate": 7.994439542619025e-09, + "loss": 0.0425, + "step": 5718 + }, + { + "epoch": 2.9770952628839145, + "grad_norm": 0.22281537791883546, + "learning_rate": 7.643096894222046e-09, + "loss": 0.0427, + "step": 5719 + }, + { + "epoch": 2.9776158250910982, + "grad_norm": 0.22059637224805248, + "learning_rate": 7.299647995176462e-09, + "loss": 0.0427, + "step": 5720 + }, + { + "epoch": 2.978136387298282, + "grad_norm": 0.22754394710380124, + "learning_rate": 6.964092953962165e-09, + "loss": 0.0436, + "step": 5721 + }, + { + "epoch": 2.978656949505466, + "grad_norm": 0.23398045942704954, + "learning_rate": 6.63643187656382e-09, + "loss": 0.0447, + "step": 5722 + }, + { + "epoch": 2.9791775117126496, + "grad_norm": 0.22256790802648144, + "learning_rate": 6.316664866470867e-09, + "loss": 0.0434, + "step": 5723 + }, + { + "epoch": 2.9796980739198333, + "grad_norm": 0.22504707512425381, + "learning_rate": 6.004792024680295e-09, + "loss": 0.0433, + "step": 5724 + }, + { + "epoch": 2.980218636127017, + "grad_norm": 0.2172897922400504, + "learning_rate": 5.700813449699416e-09, + "loss": 0.043, + "step": 5725 + }, + { + "epoch": 2.980739198334201, + "grad_norm": 0.23460775231227962, + "learning_rate": 5.404729237531991e-09, + "loss": 0.045, + "step": 5726 + }, + { + "epoch": 2.9812597605413846, + "grad_norm": 0.2093848412957586, + "learning_rate": 5.116539481703208e-09, + "loss": 0.0402, + "step": 5727 + }, + { + "epoch": 2.9817803227485684, + "grad_norm": 0.22035954463083757, + "learning_rate": 4.836244273231927e-09, + "loss": 0.0423, + "step": 5728 + }, + { + "epoch": 2.982300884955752, + "grad_norm": 0.24018779005516844, + "learning_rate": 4.56384370064733e-09, + "loss": 0.0433, + "step": 5729 + }, + { + "epoch": 2.982821447162936, + "grad_norm": 0.22737460452274955, + "learning_rate": 4.299337849991703e-09, + "loss": 0.0441, + "step": 5730 + }, + { + "epoch": 2.9833420093701197, + "grad_norm": 0.219433427420989, + "learning_rate": 4.042726804801e-09, + "loss": 0.0412, + "step": 5731 + }, + { + "epoch": 2.9838625715773035, + "grad_norm": 0.22548984426888996, + "learning_rate": 3.794010646132606e-09, + "loss": 0.0428, + "step": 5732 + }, + { + "epoch": 2.9843831337844873, + "grad_norm": 0.2231171978367076, + "learning_rate": 3.553189452537575e-09, + "loss": 0.0435, + "step": 5733 + }, + { + "epoch": 2.984903695991671, + "grad_norm": 0.2261689918270949, + "learning_rate": 3.3202633000772865e-09, + "loss": 0.0423, + "step": 5734 + }, + { + "epoch": 2.985424258198855, + "grad_norm": 0.24047982640447405, + "learning_rate": 3.0952322623262197e-09, + "loss": 0.0442, + "step": 5735 + }, + { + "epoch": 2.9859448204060386, + "grad_norm": 0.2198384025823191, + "learning_rate": 2.878096410355302e-09, + "loss": 0.0421, + "step": 5736 + }, + { + "epoch": 2.9864653826132224, + "grad_norm": 0.2205868075205334, + "learning_rate": 2.6688558127485607e-09, + "loss": 0.0414, + "step": 5737 + }, + { + "epoch": 2.986985944820406, + "grad_norm": 0.23578679799030974, + "learning_rate": 2.4675105355920213e-09, + "loss": 0.0445, + "step": 5738 + }, + { + "epoch": 2.98750650702759, + "grad_norm": 0.22520079740469365, + "learning_rate": 2.2740606424792587e-09, + "loss": 0.0416, + "step": 5739 + }, + { + "epoch": 2.9880270692347737, + "grad_norm": 0.23337576964018655, + "learning_rate": 2.088506194514173e-09, + "loss": 0.043, + "step": 5740 + }, + { + "epoch": 2.9885476314419575, + "grad_norm": 0.23186028702019476, + "learning_rate": 1.9108472502998855e-09, + "loss": 0.0432, + "step": 5741 + }, + { + "epoch": 2.9890681936491412, + "grad_norm": 0.2184071173157055, + "learning_rate": 1.7410838659498442e-09, + "loss": 0.0418, + "step": 5742 + }, + { + "epoch": 2.9895887558563246, + "grad_norm": 0.2199140316024418, + "learning_rate": 1.579216095087821e-09, + "loss": 0.0428, + "step": 5743 + }, + { + "epoch": 2.9901093180635083, + "grad_norm": 0.22353723184238114, + "learning_rate": 1.4252439888340353e-09, + "loss": 0.0435, + "step": 5744 + }, + { + "epoch": 2.990629880270692, + "grad_norm": 0.22673756486431215, + "learning_rate": 1.2791675958218064e-09, + "loss": 0.0439, + "step": 5745 + }, + { + "epoch": 2.991150442477876, + "grad_norm": 0.22545208764365354, + "learning_rate": 1.140986962186452e-09, + "loss": 0.0438, + "step": 5746 + }, + { + "epoch": 2.9916710046850596, + "grad_norm": 0.23688504795341422, + "learning_rate": 1.010702131576391e-09, + "loss": 0.0442, + "step": 5747 + }, + { + "epoch": 2.9921915668922434, + "grad_norm": 0.23253053657185024, + "learning_rate": 8.883131451392635e-10, + "loss": 0.044, + "step": 5748 + }, + { + "epoch": 2.992712129099427, + "grad_norm": 0.2379163810816669, + "learning_rate": 7.738200415302599e-10, + "loss": 0.0452, + "step": 5749 + }, + { + "epoch": 2.993232691306611, + "grad_norm": 0.21509722288903138, + "learning_rate": 6.672228569148953e-10, + "loss": 0.0425, + "step": 5750 + }, + { + "epoch": 2.9937532535137947, + "grad_norm": 0.22479935136282975, + "learning_rate": 5.685216249579073e-10, + "loss": 0.0421, + "step": 5751 + }, + { + "epoch": 2.9942738157209785, + "grad_norm": 0.2329527890243757, + "learning_rate": 4.777163768343585e-10, + "loss": 0.0431, + "step": 5752 + }, + { + "epoch": 2.9947943779281623, + "grad_norm": 0.21818439935194653, + "learning_rate": 3.94807141224085e-10, + "loss": 0.0418, + "step": 5753 + }, + { + "epoch": 2.995314940135346, + "grad_norm": 0.23421104271035925, + "learning_rate": 3.197939443172482e-10, + "loss": 0.0437, + "step": 5754 + }, + { + "epoch": 2.99583550234253, + "grad_norm": 0.21714292300858615, + "learning_rate": 2.526768098060073e-10, + "loss": 0.0426, + "step": 5755 + }, + { + "epoch": 2.9963560645497136, + "grad_norm": 0.23045424040900458, + "learning_rate": 1.9345575888451983e-10, + "loss": 0.0436, + "step": 5756 + }, + { + "epoch": 2.9968766267568974, + "grad_norm": 0.22483821530138692, + "learning_rate": 1.421308102628194e-10, + "loss": 0.0433, + "step": 5757 + }, + { + "epoch": 2.997397188964081, + "grad_norm": 0.22259646441449232, + "learning_rate": 9.870198014738652e-11, + "loss": 0.0449, + "step": 5758 + }, + { + "epoch": 2.997917751171265, + "grad_norm": 0.22244406662721328, + "learning_rate": 6.316928225780228e-11, + "loss": 0.0437, + "step": 5759 + }, + { + "epoch": 2.9984383133784487, + "grad_norm": 0.22721923240497144, + "learning_rate": 3.553272781842143e-11, + "loss": 0.0434, + "step": 5760 + }, + { + "epoch": 2.9989588755856325, + "grad_norm": 0.22864277231584557, + "learning_rate": 1.5792325552821398e-11, + "loss": 0.0435, + "step": 5761 + }, + { + "epoch": 2.9994794377928162, + "grad_norm": 0.22668136835192348, + "learning_rate": 3.948081700455575e-12, + "loss": 0.0439, + "step": 5762 + }, + { + "epoch": 3.0, + "grad_norm": 0.21769778535390255, + "learning_rate": 0.0, + "loss": 0.042, + "step": 5763 + } + ], + "logging_steps": 1.0, + "max_steps": 5763, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 240, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2413100531712000.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}