diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14331 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 29.999559594239493, + "eval_steps": 500, + "global_step": 1021770, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "grad_norm": 1.2330610752105713, + "learning_rate": 7.5e-05, + "loss": 47.9921, + "step": 500 + }, + { + "epoch": 0.03, + "grad_norm": 1.4777156114578247, + "learning_rate": 0.00015, + "loss": 6.0359, + "step": 1000 + }, + { + "epoch": 0.04, + "grad_norm": 2.119438886642456, + "learning_rate": 0.000225, + "loss": 5.8858, + "step": 1500 + }, + { + "epoch": 0.06, + "grad_norm": 2.3987252712249756, + "learning_rate": 0.0003, + "loss": 5.7995, + "step": 2000 + }, + { + "epoch": 0.07, + "grad_norm": 1.466189980506897, + "learning_rate": 0.0002998529080086686, + "loss": 5.5973, + "step": 2500 + }, + { + "epoch": 0.09, + "grad_norm": 1.6185613870620728, + "learning_rate": 0.0002997058160173372, + "loss": 4.5547, + "step": 3000 + }, + { + "epoch": 0.1, + "grad_norm": 2.422328472137451, + "learning_rate": 0.0002995587240260058, + "loss": 3.8361, + "step": 3500 + }, + { + "epoch": 0.12, + "grad_norm": 2.422811269760132, + "learning_rate": 0.00029941163203467446, + "loss": 3.5267, + "step": 4000 + }, + { + "epoch": 0.13, + "grad_norm": 1.8635873794555664, + "learning_rate": 0.0002992645400433431, + "loss": 3.3318, + "step": 4500 + }, + { + "epoch": 0.15, + "grad_norm": 2.5039353370666504, + "learning_rate": 0.00029911744805201173, + "loss": 3.2049, + "step": 5000 + }, + { + "epoch": 0.16, + "grad_norm": 3.018263578414917, + "learning_rate": 0.0002989703560606803, + "loss": 3.1146, + "step": 5500 + }, + { + "epoch": 0.18, + "grad_norm": 2.2091281414031982, + "learning_rate": 0.00029882326406934894, + "loss": 3.0412, + "step": 6000 + }, + { + "epoch": 0.19, + "grad_norm": 2.7145626544952393, + "learning_rate": 0.0002986761720780176, + "loss": 2.9849, + "step": 6500 + }, + { + "epoch": 0.21, + "grad_norm": 3.0529212951660156, + "learning_rate": 0.00029852908008668616, + "loss": 2.9731, + "step": 7000 + }, + { + "epoch": 0.22, + "grad_norm": 2.4529542922973633, + "learning_rate": 0.0002983819880953548, + "loss": 2.9193, + "step": 7500 + }, + { + "epoch": 0.23, + "grad_norm": 2.009725332260132, + "learning_rate": 0.00029823489610402343, + "loss": 2.8839, + "step": 8000 + }, + { + "epoch": 0.25, + "grad_norm": 3.02736759185791, + "learning_rate": 0.00029808780411269206, + "loss": 2.8483, + "step": 8500 + }, + { + "epoch": 0.26, + "grad_norm": 2.597303867340088, + "learning_rate": 0.0002979407121213607, + "loss": 2.84, + "step": 9000 + }, + { + "epoch": 0.28, + "grad_norm": 2.826665163040161, + "learning_rate": 0.0002977936201300293, + "loss": 2.8151, + "step": 9500 + }, + { + "epoch": 0.29, + "grad_norm": 2.141164779663086, + "learning_rate": 0.0002976465281386979, + "loss": 2.7756, + "step": 10000 + }, + { + "epoch": 0.31, + "grad_norm": 3.0721323490142822, + "learning_rate": 0.00029749943614736655, + "loss": 2.7468, + "step": 10500 + }, + { + "epoch": 0.32, + "grad_norm": 3.0305256843566895, + "learning_rate": 0.0002973523441560351, + "loss": 2.7548, + "step": 11000 + }, + { + "epoch": 0.34, + "grad_norm": 1.7159794569015503, + "learning_rate": 0.00029720525216470376, + "loss": 2.739, + "step": 11500 + }, + { + "epoch": 0.35, + "grad_norm": 4.015839099884033, + "learning_rate": 
0.0002970581601733724, + "loss": 2.7338, + "step": 12000 + }, + { + "epoch": 0.37, + "grad_norm": 2.1496126651763916, + "learning_rate": 0.00029691106818204103, + "loss": 2.7047, + "step": 12500 + }, + { + "epoch": 0.38, + "grad_norm": 12.543700218200684, + "learning_rate": 0.00029676397619070967, + "loss": 2.6905, + "step": 13000 + }, + { + "epoch": 0.4, + "grad_norm": 2.919098138809204, + "learning_rate": 0.00029661688419937825, + "loss": 2.6511, + "step": 13500 + }, + { + "epoch": 0.41, + "grad_norm": 2.164848566055298, + "learning_rate": 0.0002964697922080469, + "loss": 2.6765, + "step": 14000 + }, + { + "epoch": 0.43, + "grad_norm": 2.595182418823242, + "learning_rate": 0.0002963227002167155, + "loss": 2.6406, + "step": 14500 + }, + { + "epoch": 0.44, + "grad_norm": 3.517666816711426, + "learning_rate": 0.0002961756082253841, + "loss": 2.6752, + "step": 15000 + }, + { + "epoch": 0.46, + "grad_norm": 3.788487434387207, + "learning_rate": 0.00029602851623405273, + "loss": 2.6248, + "step": 15500 + }, + { + "epoch": 0.47, + "grad_norm": 1.8919869661331177, + "learning_rate": 0.00029588142424272137, + "loss": 2.6751, + "step": 16000 + }, + { + "epoch": 0.48, + "grad_norm": 7.1151628494262695, + "learning_rate": 0.00029573433225139, + "loss": 2.6024, + "step": 16500 + }, + { + "epoch": 0.5, + "grad_norm": 1.6645219326019287, + "learning_rate": 0.00029558724026005863, + "loss": 2.6017, + "step": 17000 + }, + { + "epoch": 0.51, + "grad_norm": 4.183793067932129, + "learning_rate": 0.0002954401482687272, + "loss": 2.5482, + "step": 17500 + }, + { + "epoch": 0.53, + "grad_norm": 3.3193671703338623, + "learning_rate": 0.00029529305627739585, + "loss": 2.6058, + "step": 18000 + }, + { + "epoch": 0.54, + "grad_norm": 1.861599326133728, + "learning_rate": 0.0002951459642860645, + "loss": 2.5766, + "step": 18500 + }, + { + "epoch": 0.56, + "grad_norm": 2.443558931350708, + "learning_rate": 0.00029499887229473306, + "loss": 2.5901, + "step": 19000 + }, + { + "epoch": 0.57, + "grad_norm": 2.1485493183135986, + "learning_rate": 0.0002948517803034017, + "loss": 2.549, + "step": 19500 + }, + { + "epoch": 0.59, + "grad_norm": 2.802183151245117, + "learning_rate": 0.00029470468831207033, + "loss": 2.568, + "step": 20000 + }, + { + "epoch": 0.6, + "grad_norm": 2.226308822631836, + "learning_rate": 0.00029455759632073897, + "loss": 2.5599, + "step": 20500 + }, + { + "epoch": 0.62, + "grad_norm": 8.404243469238281, + "learning_rate": 0.0002944105043294076, + "loss": 2.5574, + "step": 21000 + }, + { + "epoch": 0.63, + "grad_norm": 4.14946985244751, + "learning_rate": 0.0002942634123380762, + "loss": 2.5266, + "step": 21500 + }, + { + "epoch": 0.65, + "grad_norm": 1.674615502357483, + "learning_rate": 0.0002941163203467448, + "loss": 2.5094, + "step": 22000 + }, + { + "epoch": 0.66, + "grad_norm": 1.943082332611084, + "learning_rate": 0.00029396922835541345, + "loss": 2.4967, + "step": 22500 + }, + { + "epoch": 0.68, + "grad_norm": 1.9026265144348145, + "learning_rate": 0.0002938221363640821, + "loss": 2.4974, + "step": 23000 + }, + { + "epoch": 0.69, + "grad_norm": 4.214877128601074, + "learning_rate": 0.00029367504437275067, + "loss": 2.5329, + "step": 23500 + }, + { + "epoch": 0.7, + "grad_norm": 2.1662731170654297, + "learning_rate": 0.0002935279523814193, + "loss": 2.508, + "step": 24000 + }, + { + "epoch": 0.72, + "grad_norm": 2.6833200454711914, + "learning_rate": 0.00029338086039008794, + "loss": 2.4891, + "step": 24500 + }, + { + "epoch": 0.73, + "grad_norm": 10.436029434204102, + "learning_rate": 
0.00029323376839875657, + "loss": 2.506, + "step": 25000 + }, + { + "epoch": 0.75, + "grad_norm": 1.6364326477050781, + "learning_rate": 0.0002930866764074252, + "loss": 2.5091, + "step": 25500 + }, + { + "epoch": 0.76, + "grad_norm": 2.408642292022705, + "learning_rate": 0.0002929395844160938, + "loss": 2.4987, + "step": 26000 + }, + { + "epoch": 0.78, + "grad_norm": 2.324516534805298, + "learning_rate": 0.0002927924924247624, + "loss": 2.5619, + "step": 26500 + }, + { + "epoch": 0.79, + "grad_norm": 2.3488011360168457, + "learning_rate": 0.00029264540043343106, + "loss": 2.4594, + "step": 27000 + }, + { + "epoch": 0.81, + "grad_norm": 2.130359411239624, + "learning_rate": 0.00029249830844209964, + "loss": 2.4677, + "step": 27500 + }, + { + "epoch": 0.82, + "grad_norm": 3.6321048736572266, + "learning_rate": 0.0002923512164507683, + "loss": 2.4925, + "step": 28000 + }, + { + "epoch": 0.84, + "grad_norm": 2.2566709518432617, + "learning_rate": 0.0002922041244594369, + "loss": 2.4779, + "step": 28500 + }, + { + "epoch": 0.85, + "grad_norm": 1.4957294464111328, + "learning_rate": 0.00029205703246810554, + "loss": 2.4482, + "step": 29000 + }, + { + "epoch": 0.87, + "grad_norm": 2.031252384185791, + "learning_rate": 0.0002919099404767742, + "loss": 2.4215, + "step": 29500 + }, + { + "epoch": 0.88, + "grad_norm": 3.686828136444092, + "learning_rate": 0.00029176284848544276, + "loss": 2.4463, + "step": 30000 + }, + { + "epoch": 0.9, + "grad_norm": 1.746907114982605, + "learning_rate": 0.0002916157564941114, + "loss": 2.4436, + "step": 30500 + }, + { + "epoch": 0.91, + "grad_norm": 2.379333257675171, + "learning_rate": 0.00029146866450278, + "loss": 2.4244, + "step": 31000 + }, + { + "epoch": 0.92, + "grad_norm": 2.696253776550293, + "learning_rate": 0.0002913215725114486, + "loss": 2.4218, + "step": 31500 + }, + { + "epoch": 0.94, + "grad_norm": 4.1202826499938965, + "learning_rate": 0.0002911744805201173, + "loss": 2.438, + "step": 32000 + }, + { + "epoch": 0.95, + "grad_norm": 3.3910372257232666, + "learning_rate": 0.0002910273885287859, + "loss": 2.4405, + "step": 32500 + }, + { + "epoch": 0.97, + "grad_norm": 2.9638476371765137, + "learning_rate": 0.0002908802965374545, + "loss": 2.4461, + "step": 33000 + }, + { + "epoch": 0.98, + "grad_norm": 3.280348062515259, + "learning_rate": 0.00029073320454612315, + "loss": 2.4177, + "step": 33500 + }, + { + "epoch": 1.0, + "grad_norm": 2.654728651046753, + "learning_rate": 0.0002905861125547917, + "loss": 2.414, + "step": 34000 + }, + { + "epoch": 1.01, + "grad_norm": 2.3180739879608154, + "learning_rate": 0.00029043902056346036, + "loss": 2.367, + "step": 34500 + }, + { + "epoch": 1.03, + "grad_norm": 5.21975040435791, + "learning_rate": 0.000290291928572129, + "loss": 2.4158, + "step": 35000 + }, + { + "epoch": 1.04, + "grad_norm": 1.7568458318710327, + "learning_rate": 0.0002901448365807976, + "loss": 2.3574, + "step": 35500 + }, + { + "epoch": 1.06, + "grad_norm": 2.872269630432129, + "learning_rate": 0.00028999774458946626, + "loss": 2.3842, + "step": 36000 + }, + { + "epoch": 1.07, + "grad_norm": 2.3413188457489014, + "learning_rate": 0.00028985065259813484, + "loss": 2.3313, + "step": 36500 + }, + { + "epoch": 1.09, + "grad_norm": 1.4920777082443237, + "learning_rate": 0.0002897035606068035, + "loss": 2.4049, + "step": 37000 + }, + { + "epoch": 1.1, + "grad_norm": 2.5813608169555664, + "learning_rate": 0.0002895564686154721, + "loss": 2.3641, + "step": 37500 + }, + { + "epoch": 1.12, + "grad_norm": 2.110539674758911, + "learning_rate": 
0.0002894093766241407, + "loss": 2.3704, + "step": 38000 + }, + { + "epoch": 1.13, + "grad_norm": 2.397954225540161, + "learning_rate": 0.00028926228463280933, + "loss": 2.3962, + "step": 38500 + }, + { + "epoch": 1.15, + "grad_norm": 1.290098786354065, + "learning_rate": 0.00028911519264147796, + "loss": 2.3359, + "step": 39000 + }, + { + "epoch": 1.16, + "grad_norm": 1.7063418626785278, + "learning_rate": 0.00028896810065014654, + "loss": 2.34, + "step": 39500 + }, + { + "epoch": 1.17, + "grad_norm": 1.603582501411438, + "learning_rate": 0.00028882100865881523, + "loss": 2.3439, + "step": 40000 + }, + { + "epoch": 1.19, + "grad_norm": 1.6357001066207886, + "learning_rate": 0.0002886739166674838, + "loss": 2.3689, + "step": 40500 + }, + { + "epoch": 1.2, + "grad_norm": 2.4969911575317383, + "learning_rate": 0.00028852682467615245, + "loss": 2.3621, + "step": 41000 + }, + { + "epoch": 1.22, + "grad_norm": 1.7131437063217163, + "learning_rate": 0.0002883797326848211, + "loss": 2.3402, + "step": 41500 + }, + { + "epoch": 1.23, + "grad_norm": 1.681779384613037, + "learning_rate": 0.00028823264069348966, + "loss": 2.3047, + "step": 42000 + }, + { + "epoch": 1.25, + "grad_norm": 3.0767245292663574, + "learning_rate": 0.0002880855487021583, + "loss": 2.3529, + "step": 42500 + }, + { + "epoch": 1.26, + "grad_norm": 2.4065330028533936, + "learning_rate": 0.00028793845671082693, + "loss": 2.3332, + "step": 43000 + }, + { + "epoch": 1.28, + "grad_norm": 3.0359294414520264, + "learning_rate": 0.0002877913647194955, + "loss": 2.3477, + "step": 43500 + }, + { + "epoch": 1.29, + "grad_norm": 2.010524272918701, + "learning_rate": 0.0002876442727281642, + "loss": 2.3123, + "step": 44000 + }, + { + "epoch": 1.31, + "grad_norm": 9.476223945617676, + "learning_rate": 0.0002874971807368328, + "loss": 2.3187, + "step": 44500 + }, + { + "epoch": 1.32, + "grad_norm": 1.5241541862487793, + "learning_rate": 0.0002873500887455014, + "loss": 2.3473, + "step": 45000 + }, + { + "epoch": 1.34, + "grad_norm": 2.111661434173584, + "learning_rate": 0.00028720299675417005, + "loss": 2.3122, + "step": 45500 + }, + { + "epoch": 1.35, + "grad_norm": 1.5378273725509644, + "learning_rate": 0.00028705590476283863, + "loss": 2.3185, + "step": 46000 + }, + { + "epoch": 1.37, + "grad_norm": 2.218921184539795, + "learning_rate": 0.00028690881277150727, + "loss": 2.3426, + "step": 46500 + }, + { + "epoch": 1.38, + "grad_norm": 2.234276294708252, + "learning_rate": 0.0002867617207801759, + "loss": 2.2885, + "step": 47000 + }, + { + "epoch": 1.39, + "grad_norm": 2.929063081741333, + "learning_rate": 0.0002866146287888445, + "loss": 2.3047, + "step": 47500 + }, + { + "epoch": 1.41, + "grad_norm": 1.764546513557434, + "learning_rate": 0.00028646753679751317, + "loss": 2.275, + "step": 48000 + }, + { + "epoch": 1.42, + "grad_norm": 3.143376111984253, + "learning_rate": 0.00028632044480618175, + "loss": 2.2591, + "step": 48500 + }, + { + "epoch": 1.44, + "grad_norm": 2.8672120571136475, + "learning_rate": 0.0002861733528148504, + "loss": 2.2462, + "step": 49000 + }, + { + "epoch": 1.45, + "grad_norm": 3.577282667160034, + "learning_rate": 0.000286026260823519, + "loss": 2.2689, + "step": 49500 + }, + { + "epoch": 1.47, + "grad_norm": 1.5024135112762451, + "learning_rate": 0.0002858791688321876, + "loss": 2.2886, + "step": 50000 + }, + { + "epoch": 1.48, + "grad_norm": 2.36291241645813, + "learning_rate": 0.00028573207684085624, + "loss": 2.2102, + "step": 50500 + }, + { + "epoch": 1.5, + "grad_norm": 2.7048864364624023, + "learning_rate": 
0.00028558498484952487, + "loss": 2.2734, + "step": 51000 + }, + { + "epoch": 1.51, + "grad_norm": 15.023186683654785, + "learning_rate": 0.00028543789285819345, + "loss": 2.2859, + "step": 51500 + }, + { + "epoch": 1.53, + "grad_norm": 1.5235730409622192, + "learning_rate": 0.00028529080086686214, + "loss": 2.2459, + "step": 52000 + }, + { + "epoch": 1.54, + "grad_norm": 3.157870292663574, + "learning_rate": 0.0002851437088755307, + "loss": 2.2679, + "step": 52500 + }, + { + "epoch": 1.56, + "grad_norm": 5.78060245513916, + "learning_rate": 0.00028499661688419936, + "loss": 2.2781, + "step": 53000 + }, + { + "epoch": 1.57, + "grad_norm": 3.3039705753326416, + "learning_rate": 0.000284849524892868, + "loss": 2.2761, + "step": 53500 + }, + { + "epoch": 1.59, + "grad_norm": 2.286205530166626, + "learning_rate": 0.00028470243290153657, + "loss": 2.2549, + "step": 54000 + }, + { + "epoch": 1.6, + "grad_norm": 2.2150983810424805, + "learning_rate": 0.0002845553409102052, + "loss": 2.2443, + "step": 54500 + }, + { + "epoch": 1.61, + "grad_norm": 3.0205180644989014, + "learning_rate": 0.00028440824891887384, + "loss": 2.2676, + "step": 55000 + }, + { + "epoch": 1.63, + "grad_norm": 1.5430960655212402, + "learning_rate": 0.0002842611569275424, + "loss": 2.2403, + "step": 55500 + }, + { + "epoch": 1.64, + "grad_norm": 1.979278802871704, + "learning_rate": 0.0002841140649362111, + "loss": 2.2409, + "step": 56000 + }, + { + "epoch": 1.66, + "grad_norm": 1.6833264827728271, + "learning_rate": 0.0002839669729448797, + "loss": 2.2657, + "step": 56500 + }, + { + "epoch": 1.67, + "grad_norm": 1.9541752338409424, + "learning_rate": 0.0002838198809535483, + "loss": 2.2767, + "step": 57000 + }, + { + "epoch": 1.69, + "grad_norm": 5.107935905456543, + "learning_rate": 0.00028367278896221696, + "loss": 2.2774, + "step": 57500 + }, + { + "epoch": 1.7, + "grad_norm": 1.5604417324066162, + "learning_rate": 0.00028352569697088554, + "loss": 2.2405, + "step": 58000 + }, + { + "epoch": 1.72, + "grad_norm": 1.6418412923812866, + "learning_rate": 0.0002833786049795542, + "loss": 2.2434, + "step": 58500 + }, + { + "epoch": 1.73, + "grad_norm": 1.5278984308242798, + "learning_rate": 0.0002832315129882228, + "loss": 2.2977, + "step": 59000 + }, + { + "epoch": 1.75, + "grad_norm": 2.3131840229034424, + "learning_rate": 0.0002830844209968914, + "loss": 2.2236, + "step": 59500 + }, + { + "epoch": 1.76, + "grad_norm": 1.7341011762619019, + "learning_rate": 0.0002829373290055601, + "loss": 2.2235, + "step": 60000 + }, + { + "epoch": 1.78, + "grad_norm": 2.8039567470550537, + "learning_rate": 0.00028279023701422866, + "loss": 2.2475, + "step": 60500 + }, + { + "epoch": 1.79, + "grad_norm": 4.142283916473389, + "learning_rate": 0.0002826431450228973, + "loss": 2.2128, + "step": 61000 + }, + { + "epoch": 1.81, + "grad_norm": 1.2194138765335083, + "learning_rate": 0.00028249605303156593, + "loss": 2.2568, + "step": 61500 + }, + { + "epoch": 1.82, + "grad_norm": 1.2129665613174438, + "learning_rate": 0.0002823489610402345, + "loss": 2.2282, + "step": 62000 + }, + { + "epoch": 1.84, + "grad_norm": 8.789237022399902, + "learning_rate": 0.00028220186904890314, + "loss": 2.2071, + "step": 62500 + }, + { + "epoch": 1.85, + "grad_norm": 1.4489936828613281, + "learning_rate": 0.0002820547770575718, + "loss": 2.2389, + "step": 63000 + }, + { + "epoch": 1.86, + "grad_norm": 1.591963768005371, + "learning_rate": 0.0002819076850662404, + "loss": 2.2277, + "step": 63500 + }, + { + "epoch": 1.88, + "grad_norm": 2.642183303833008, + 
"learning_rate": 0.00028176059307490905, + "loss": 2.2195, + "step": 64000 + }, + { + "epoch": 1.89, + "grad_norm": 2.7950453758239746, + "learning_rate": 0.00028161350108357763, + "loss": 2.2179, + "step": 64500 + }, + { + "epoch": 1.91, + "grad_norm": 2.3858351707458496, + "learning_rate": 0.00028146640909224626, + "loss": 2.2266, + "step": 65000 + }, + { + "epoch": 1.92, + "grad_norm": 2.2545394897460938, + "learning_rate": 0.0002813193171009149, + "loss": 2.2538, + "step": 65500 + }, + { + "epoch": 1.94, + "grad_norm": 2.3457772731781006, + "learning_rate": 0.0002811722251095835, + "loss": 2.2351, + "step": 66000 + }, + { + "epoch": 1.95, + "grad_norm": 2.0313475131988525, + "learning_rate": 0.0002810251331182521, + "loss": 2.2574, + "step": 66500 + }, + { + "epoch": 1.97, + "grad_norm": 4.175018310546875, + "learning_rate": 0.00028087804112692075, + "loss": 2.2269, + "step": 67000 + }, + { + "epoch": 1.98, + "grad_norm": 1.963953971862793, + "learning_rate": 0.0002807309491355894, + "loss": 2.2408, + "step": 67500 + }, + { + "epoch": 2.0, + "grad_norm": 1.8645119667053223, + "learning_rate": 0.000280583857144258, + "loss": 2.2588, + "step": 68000 + }, + { + "epoch": 2.01, + "grad_norm": 1.8845419883728027, + "learning_rate": 0.0002804367651529266, + "loss": 2.1532, + "step": 68500 + }, + { + "epoch": 2.03, + "grad_norm": 2.3963561058044434, + "learning_rate": 0.00028028967316159523, + "loss": 2.1441, + "step": 69000 + }, + { + "epoch": 2.04, + "grad_norm": 3.194061040878296, + "learning_rate": 0.00028014258117026387, + "loss": 2.1184, + "step": 69500 + }, + { + "epoch": 2.06, + "grad_norm": 1.4685026407241821, + "learning_rate": 0.0002799954891789325, + "loss": 2.1504, + "step": 70000 + }, + { + "epoch": 2.07, + "grad_norm": 6.733902931213379, + "learning_rate": 0.0002798483971876011, + "loss": 2.1449, + "step": 70500 + }, + { + "epoch": 2.08, + "grad_norm": 1.5692353248596191, + "learning_rate": 0.0002797013051962697, + "loss": 2.1132, + "step": 71000 + }, + { + "epoch": 2.1, + "grad_norm": 2.4298503398895264, + "learning_rate": 0.00027955421320493835, + "loss": 2.1557, + "step": 71500 + }, + { + "epoch": 2.11, + "grad_norm": 1.9930598735809326, + "learning_rate": 0.000279407121213607, + "loss": 2.1509, + "step": 72000 + }, + { + "epoch": 2.13, + "grad_norm": 3.2199394702911377, + "learning_rate": 0.0002792600292222756, + "loss": 2.1637, + "step": 72500 + }, + { + "epoch": 2.14, + "grad_norm": 11.737617492675781, + "learning_rate": 0.0002791129372309442, + "loss": 2.1145, + "step": 73000 + }, + { + "epoch": 2.16, + "grad_norm": 3.2281816005706787, + "learning_rate": 0.00027896584523961284, + "loss": 2.205, + "step": 73500 + }, + { + "epoch": 2.17, + "grad_norm": 1.6318072080612183, + "learning_rate": 0.00027881875324828147, + "loss": 2.1355, + "step": 74000 + }, + { + "epoch": 2.19, + "grad_norm": 1.4795607328414917, + "learning_rate": 0.00027867166125695005, + "loss": 2.1594, + "step": 74500 + }, + { + "epoch": 2.2, + "grad_norm": 3.246556043624878, + "learning_rate": 0.0002785245692656187, + "loss": 2.1225, + "step": 75000 + }, + { + "epoch": 2.22, + "grad_norm": 5.301700115203857, + "learning_rate": 0.0002783774772742873, + "loss": 2.1477, + "step": 75500 + }, + { + "epoch": 2.23, + "grad_norm": 1.3937615156173706, + "learning_rate": 0.00027823038528295595, + "loss": 2.1377, + "step": 76000 + }, + { + "epoch": 2.25, + "grad_norm": 5.8993611335754395, + "learning_rate": 0.0002780832932916246, + "loss": 2.156, + "step": 76500 + }, + { + "epoch": 2.26, + "grad_norm": 
3.7031867504119873, + "learning_rate": 0.00027793620130029317, + "loss": 2.1246, + "step": 77000 + }, + { + "epoch": 2.28, + "grad_norm": 3.8393964767456055, + "learning_rate": 0.0002777891093089618, + "loss": 2.1503, + "step": 77500 + }, + { + "epoch": 2.29, + "grad_norm": 1.3489915132522583, + "learning_rate": 0.00027764201731763044, + "loss": 2.1506, + "step": 78000 + }, + { + "epoch": 2.3, + "grad_norm": 6.942753791809082, + "learning_rate": 0.000277494925326299, + "loss": 2.1575, + "step": 78500 + }, + { + "epoch": 2.32, + "grad_norm": 3.2154929637908936, + "learning_rate": 0.00027734783333496765, + "loss": 2.1279, + "step": 79000 + }, + { + "epoch": 2.33, + "grad_norm": 1.8168853521347046, + "learning_rate": 0.0002772007413436363, + "loss": 2.0715, + "step": 79500 + }, + { + "epoch": 2.35, + "grad_norm": 2.112091541290283, + "learning_rate": 0.0002770536493523049, + "loss": 2.1344, + "step": 80000 + }, + { + "epoch": 2.36, + "grad_norm": 2.050875663757324, + "learning_rate": 0.00027690655736097356, + "loss": 2.1416, + "step": 80500 + }, + { + "epoch": 2.38, + "grad_norm": 1.736621618270874, + "learning_rate": 0.00027675946536964214, + "loss": 2.1196, + "step": 81000 + }, + { + "epoch": 2.39, + "grad_norm": 2.051025629043579, + "learning_rate": 0.0002766123733783108, + "loss": 2.1272, + "step": 81500 + }, + { + "epoch": 2.41, + "grad_norm": 4.314809799194336, + "learning_rate": 0.0002764652813869794, + "loss": 2.1618, + "step": 82000 + }, + { + "epoch": 2.42, + "grad_norm": 2.920485496520996, + "learning_rate": 0.000276318189395648, + "loss": 2.1354, + "step": 82500 + }, + { + "epoch": 2.44, + "grad_norm": 6.310970783233643, + "learning_rate": 0.0002761710974043166, + "loss": 2.1495, + "step": 83000 + }, + { + "epoch": 2.45, + "grad_norm": 1.929152250289917, + "learning_rate": 0.00027602400541298526, + "loss": 2.1271, + "step": 83500 + }, + { + "epoch": 2.47, + "grad_norm": 2.9410946369171143, + "learning_rate": 0.0002758769134216539, + "loss": 2.1646, + "step": 84000 + }, + { + "epoch": 2.48, + "grad_norm": 2.474297523498535, + "learning_rate": 0.00027572982143032253, + "loss": 2.127, + "step": 84500 + }, + { + "epoch": 2.5, + "grad_norm": 1.6442033052444458, + "learning_rate": 0.0002755827294389911, + "loss": 2.1433, + "step": 85000 + }, + { + "epoch": 2.51, + "grad_norm": 1.3729546070098877, + "learning_rate": 0.00027543563744765974, + "loss": 2.1121, + "step": 85500 + }, + { + "epoch": 2.52, + "grad_norm": 1.6497186422348022, + "learning_rate": 0.0002752885454563284, + "loss": 2.1566, + "step": 86000 + }, + { + "epoch": 2.54, + "grad_norm": 2.6162164211273193, + "learning_rate": 0.00027514145346499696, + "loss": 2.1513, + "step": 86500 + }, + { + "epoch": 2.55, + "grad_norm": 1.7324166297912598, + "learning_rate": 0.0002749943614736656, + "loss": 2.1482, + "step": 87000 + }, + { + "epoch": 2.57, + "grad_norm": 1.3059158325195312, + "learning_rate": 0.0002748472694823342, + "loss": 2.1346, + "step": 87500 + }, + { + "epoch": 2.58, + "grad_norm": 1.1019114255905151, + "learning_rate": 0.00027470017749100286, + "loss": 2.123, + "step": 88000 + }, + { + "epoch": 2.6, + "grad_norm": 7.39063024520874, + "learning_rate": 0.0002745530854996715, + "loss": 2.1278, + "step": 88500 + }, + { + "epoch": 2.61, + "grad_norm": 1.9237990379333496, + "learning_rate": 0.0002744059935083401, + "loss": 2.115, + "step": 89000 + }, + { + "epoch": 2.63, + "grad_norm": 1.4632532596588135, + "learning_rate": 0.0002742589015170087, + "loss": 2.137, + "step": 89500 + }, + { + "epoch": 2.64, + "grad_norm": 
1.9587647914886475, + "learning_rate": 0.00027411180952567735, + "loss": 2.1056, + "step": 90000 + }, + { + "epoch": 2.66, + "grad_norm": 7.492849349975586, + "learning_rate": 0.0002739647175343459, + "loss": 2.1208, + "step": 90500 + }, + { + "epoch": 2.67, + "grad_norm": 1.7255750894546509, + "learning_rate": 0.00027381762554301456, + "loss": 2.1015, + "step": 91000 + }, + { + "epoch": 2.69, + "grad_norm": 8.091690063476562, + "learning_rate": 0.0002736705335516832, + "loss": 2.1073, + "step": 91500 + }, + { + "epoch": 2.7, + "grad_norm": 6.672662734985352, + "learning_rate": 0.00027352344156035183, + "loss": 2.1059, + "step": 92000 + }, + { + "epoch": 2.72, + "grad_norm": 1.948398470878601, + "learning_rate": 0.00027337634956902047, + "loss": 2.0943, + "step": 92500 + }, + { + "epoch": 2.73, + "grad_norm": 1.4260573387145996, + "learning_rate": 0.00027322925757768905, + "loss": 2.1283, + "step": 93000 + }, + { + "epoch": 2.75, + "grad_norm": 1.5472320318222046, + "learning_rate": 0.0002730821655863577, + "loss": 2.0882, + "step": 93500 + }, + { + "epoch": 2.76, + "grad_norm": 1.3860925436019897, + "learning_rate": 0.0002729350735950263, + "loss": 2.1323, + "step": 94000 + }, + { + "epoch": 2.77, + "grad_norm": 2.232808828353882, + "learning_rate": 0.0002727879816036949, + "loss": 2.1485, + "step": 94500 + }, + { + "epoch": 2.79, + "grad_norm": 1.8404427766799927, + "learning_rate": 0.00027264088961236353, + "loss": 2.1116, + "step": 95000 + }, + { + "epoch": 2.8, + "grad_norm": 2.124656915664673, + "learning_rate": 0.00027249379762103216, + "loss": 2.1076, + "step": 95500 + }, + { + "epoch": 2.82, + "grad_norm": 2.2169177532196045, + "learning_rate": 0.0002723467056297008, + "loss": 2.1035, + "step": 96000 + }, + { + "epoch": 2.83, + "grad_norm": 2.7737512588500977, + "learning_rate": 0.00027219961363836943, + "loss": 2.1369, + "step": 96500 + }, + { + "epoch": 2.85, + "grad_norm": 2.6662325859069824, + "learning_rate": 0.000272052521647038, + "loss": 2.1131, + "step": 97000 + }, + { + "epoch": 2.86, + "grad_norm": 2.6663177013397217, + "learning_rate": 0.00027190542965570665, + "loss": 2.1175, + "step": 97500 + }, + { + "epoch": 2.88, + "grad_norm": 1.446356177330017, + "learning_rate": 0.0002717583376643753, + "loss": 2.1076, + "step": 98000 + }, + { + "epoch": 2.89, + "grad_norm": 2.3340353965759277, + "learning_rate": 0.00027161124567304386, + "loss": 2.0877, + "step": 98500 + }, + { + "epoch": 2.91, + "grad_norm": 3.389127254486084, + "learning_rate": 0.0002714641536817125, + "loss": 2.0904, + "step": 99000 + }, + { + "epoch": 2.92, + "grad_norm": 2.044728994369507, + "learning_rate": 0.00027131706169038113, + "loss": 2.0608, + "step": 99500 + }, + { + "epoch": 2.94, + "grad_norm": 17.932655334472656, + "learning_rate": 0.00027116996969904977, + "loss": 2.1136, + "step": 100000 + }, + { + "epoch": 2.95, + "grad_norm": 1.4239097833633423, + "learning_rate": 0.0002710228777077184, + "loss": 2.0783, + "step": 100500 + }, + { + "epoch": 2.97, + "grad_norm": 2.1303598880767822, + "learning_rate": 0.000270875785716387, + "loss": 2.0981, + "step": 101000 + }, + { + "epoch": 2.98, + "grad_norm": 2.2777888774871826, + "learning_rate": 0.0002707286937250556, + "loss": 2.0862, + "step": 101500 + }, + { + "epoch": 2.99, + "grad_norm": 2.2035715579986572, + "learning_rate": 0.00027058160173372425, + "loss": 2.1089, + "step": 102000 + }, + { + "epoch": 3.01, + "grad_norm": 2.7756547927856445, + "learning_rate": 0.00027043450974239283, + "loss": 2.0614, + "step": 102500 + }, + { + "epoch": 3.02, 
+ "grad_norm": 1.4222705364227295, + "learning_rate": 0.00027028741775106147, + "loss": 2.0216, + "step": 103000 + }, + { + "epoch": 3.04, + "grad_norm": 1.5257524251937866, + "learning_rate": 0.0002701403257597301, + "loss": 2.0388, + "step": 103500 + }, + { + "epoch": 3.05, + "grad_norm": 2.274021625518799, + "learning_rate": 0.00026999323376839874, + "loss": 2.048, + "step": 104000 + }, + { + "epoch": 3.07, + "grad_norm": 5.903868198394775, + "learning_rate": 0.00026984614177706737, + "loss": 2.0008, + "step": 104500 + }, + { + "epoch": 3.08, + "grad_norm": 1.561962604522705, + "learning_rate": 0.00026969904978573595, + "loss": 2.05, + "step": 105000 + }, + { + "epoch": 3.1, + "grad_norm": 2.002523422241211, + "learning_rate": 0.0002695519577944046, + "loss": 2.0217, + "step": 105500 + }, + { + "epoch": 3.11, + "grad_norm": 1.9345905780792236, + "learning_rate": 0.0002694048658030732, + "loss": 2.0343, + "step": 106000 + }, + { + "epoch": 3.13, + "grad_norm": 1.5755488872528076, + "learning_rate": 0.0002692577738117418, + "loss": 2.0133, + "step": 106500 + }, + { + "epoch": 3.14, + "grad_norm": 1.7304731607437134, + "learning_rate": 0.00026911068182041044, + "loss": 2.0177, + "step": 107000 + }, + { + "epoch": 3.16, + "grad_norm": 2.4045250415802, + "learning_rate": 0.00026896358982907907, + "loss": 2.034, + "step": 107500 + }, + { + "epoch": 3.17, + "grad_norm": 1.5433346033096313, + "learning_rate": 0.0002688164978377477, + "loss": 2.0228, + "step": 108000 + }, + { + "epoch": 3.19, + "grad_norm": 2.033250093460083, + "learning_rate": 0.00026866940584641634, + "loss": 2.0076, + "step": 108500 + }, + { + "epoch": 3.2, + "grad_norm": 3.0048422813415527, + "learning_rate": 0.0002685223138550849, + "loss": 1.9876, + "step": 109000 + }, + { + "epoch": 3.21, + "grad_norm": 1.6355750560760498, + "learning_rate": 0.00026837522186375356, + "loss": 2.0259, + "step": 109500 + }, + { + "epoch": 3.23, + "grad_norm": 7.954076290130615, + "learning_rate": 0.0002682281298724222, + "loss": 2.0408, + "step": 110000 + }, + { + "epoch": 3.24, + "grad_norm": 1.3474091291427612, + "learning_rate": 0.0002680810378810908, + "loss": 2.0114, + "step": 110500 + }, + { + "epoch": 3.26, + "grad_norm": 1.8665661811828613, + "learning_rate": 0.0002679339458897594, + "loss": 2.0388, + "step": 111000 + }, + { + "epoch": 3.27, + "grad_norm": 1.452719807624817, + "learning_rate": 0.00026778685389842804, + "loss": 2.0411, + "step": 111500 + }, + { + "epoch": 3.29, + "grad_norm": 1.9409444332122803, + "learning_rate": 0.0002676397619070967, + "loss": 2.0355, + "step": 112000 + }, + { + "epoch": 3.3, + "grad_norm": 1.4139233827590942, + "learning_rate": 0.0002674926699157653, + "loss": 2.0306, + "step": 112500 + }, + { + "epoch": 3.32, + "grad_norm": 2.548440933227539, + "learning_rate": 0.00026734557792443394, + "loss": 2.013, + "step": 113000 + }, + { + "epoch": 3.33, + "grad_norm": 16.565500259399414, + "learning_rate": 0.0002671984859331025, + "loss": 2.0435, + "step": 113500 + }, + { + "epoch": 3.35, + "grad_norm": 2.008643865585327, + "learning_rate": 0.00026705139394177116, + "loss": 2.0094, + "step": 114000 + }, + { + "epoch": 3.36, + "grad_norm": 5.655598163604736, + "learning_rate": 0.0002669043019504398, + "loss": 2.0169, + "step": 114500 + }, + { + "epoch": 3.38, + "grad_norm": 13.43127727508545, + "learning_rate": 0.0002667572099591084, + "loss": 2.0273, + "step": 115000 + }, + { + "epoch": 3.39, + "grad_norm": 4.050173282623291, + "learning_rate": 0.00026661011796777706, + "loss": 2.0406, + "step": 115500 + 
}, + { + "epoch": 3.41, + "grad_norm": 2.2970187664031982, + "learning_rate": 0.00026646302597644564, + "loss": 2.0374, + "step": 116000 + }, + { + "epoch": 3.42, + "grad_norm": 1.8749233484268188, + "learning_rate": 0.0002663159339851143, + "loss": 1.991, + "step": 116500 + }, + { + "epoch": 3.44, + "grad_norm": 2.4106335639953613, + "learning_rate": 0.0002661688419937829, + "loss": 2.0425, + "step": 117000 + }, + { + "epoch": 3.45, + "grad_norm": 1.8036812543869019, + "learning_rate": 0.0002660217500024515, + "loss": 1.9968, + "step": 117500 + }, + { + "epoch": 3.46, + "grad_norm": 1.831998348236084, + "learning_rate": 0.00026587465801112013, + "loss": 2.0291, + "step": 118000 + }, + { + "epoch": 3.48, + "grad_norm": 2.4199304580688477, + "learning_rate": 0.00026572756601978876, + "loss": 2.0186, + "step": 118500 + }, + { + "epoch": 3.49, + "grad_norm": 4.254742622375488, + "learning_rate": 0.00026558047402845734, + "loss": 2.0226, + "step": 119000 + }, + { + "epoch": 3.51, + "grad_norm": 12.558996200561523, + "learning_rate": 0.00026543338203712603, + "loss": 2.0368, + "step": 119500 + }, + { + "epoch": 3.52, + "grad_norm": 1.9092944860458374, + "learning_rate": 0.0002652862900457946, + "loss": 2.02, + "step": 120000 + }, + { + "epoch": 3.54, + "grad_norm": 1.9921791553497314, + "learning_rate": 0.00026513919805446325, + "loss": 2.019, + "step": 120500 + }, + { + "epoch": 3.55, + "grad_norm": 2.1361401081085205, + "learning_rate": 0.0002649921060631319, + "loss": 1.9927, + "step": 121000 + }, + { + "epoch": 3.57, + "grad_norm": 1.6607388257980347, + "learning_rate": 0.00026484501407180046, + "loss": 2.0722, + "step": 121500 + }, + { + "epoch": 3.58, + "grad_norm": 2.246613025665283, + "learning_rate": 0.0002646979220804691, + "loss": 2.0091, + "step": 122000 + }, + { + "epoch": 3.6, + "grad_norm": 5.497014045715332, + "learning_rate": 0.00026455083008913773, + "loss": 2.0182, + "step": 122500 + }, + { + "epoch": 3.61, + "grad_norm": 1.5741316080093384, + "learning_rate": 0.00026440373809780637, + "loss": 2.0113, + "step": 123000 + }, + { + "epoch": 3.63, + "grad_norm": 2.1419429779052734, + "learning_rate": 0.000264256646106475, + "loss": 2.0112, + "step": 123500 + }, + { + "epoch": 3.64, + "grad_norm": 2.309093952178955, + "learning_rate": 0.0002641095541151436, + "loss": 1.9943, + "step": 124000 + }, + { + "epoch": 3.66, + "grad_norm": 1.7635866403579712, + "learning_rate": 0.0002639624621238122, + "loss": 2.0081, + "step": 124500 + }, + { + "epoch": 3.67, + "grad_norm": 2.3035855293273926, + "learning_rate": 0.00026381537013248085, + "loss": 1.9658, + "step": 125000 + }, + { + "epoch": 3.68, + "grad_norm": 1.8141978979110718, + "learning_rate": 0.00026366827814114943, + "loss": 2.0203, + "step": 125500 + }, + { + "epoch": 3.7, + "grad_norm": 1.8287400007247925, + "learning_rate": 0.00026352118614981807, + "loss": 1.9636, + "step": 126000 + }, + { + "epoch": 3.71, + "grad_norm": 1.356780767440796, + "learning_rate": 0.0002633740941584867, + "loss": 2.0116, + "step": 126500 + }, + { + "epoch": 3.73, + "grad_norm": 5.973343849182129, + "learning_rate": 0.00026322700216715534, + "loss": 1.9837, + "step": 127000 + }, + { + "epoch": 3.74, + "grad_norm": 2.090287446975708, + "learning_rate": 0.00026307991017582397, + "loss": 2.0161, + "step": 127500 + }, + { + "epoch": 3.76, + "grad_norm": 1.422405481338501, + "learning_rate": 0.00026293281818449255, + "loss": 2.0096, + "step": 128000 + }, + { + "epoch": 3.77, + "grad_norm": 5.0919365882873535, + "learning_rate": 0.0002627857261931612, + 
"loss": 1.997, + "step": 128500 + }, + { + "epoch": 3.79, + "grad_norm": 2.7366178035736084, + "learning_rate": 0.0002626386342018298, + "loss": 2.001, + "step": 129000 + }, + { + "epoch": 3.8, + "grad_norm": 1.3529456853866577, + "learning_rate": 0.0002624915422104984, + "loss": 2.0208, + "step": 129500 + }, + { + "epoch": 3.82, + "grad_norm": 2.3386831283569336, + "learning_rate": 0.00026234445021916704, + "loss": 2.0049, + "step": 130000 + }, + { + "epoch": 3.83, + "grad_norm": 2.074753999710083, + "learning_rate": 0.00026219735822783567, + "loss": 1.988, + "step": 130500 + }, + { + "epoch": 3.85, + "grad_norm": 4.412100791931152, + "learning_rate": 0.0002620502662365043, + "loss": 1.9851, + "step": 131000 + }, + { + "epoch": 3.86, + "grad_norm": 2.1146128177642822, + "learning_rate": 0.00026190317424517294, + "loss": 2.0049, + "step": 131500 + }, + { + "epoch": 3.88, + "grad_norm": 7.746240615844727, + "learning_rate": 0.0002617560822538415, + "loss": 1.9978, + "step": 132000 + }, + { + "epoch": 3.89, + "grad_norm": 1.8406715393066406, + "learning_rate": 0.00026160899026251016, + "loss": 1.9677, + "step": 132500 + }, + { + "epoch": 3.9, + "grad_norm": 6.467206954956055, + "learning_rate": 0.0002614618982711788, + "loss": 1.9906, + "step": 133000 + }, + { + "epoch": 3.92, + "grad_norm": 2.756176471710205, + "learning_rate": 0.00026131480627984737, + "loss": 2.0179, + "step": 133500 + }, + { + "epoch": 3.93, + "grad_norm": 2.753931760787964, + "learning_rate": 0.000261167714288516, + "loss": 2.0166, + "step": 134000 + }, + { + "epoch": 3.95, + "grad_norm": 32.68937301635742, + "learning_rate": 0.00026102062229718464, + "loss": 2.002, + "step": 134500 + }, + { + "epoch": 3.96, + "grad_norm": 2.8605151176452637, + "learning_rate": 0.0002608735303058533, + "loss": 1.9997, + "step": 135000 + }, + { + "epoch": 3.98, + "grad_norm": 1.3387537002563477, + "learning_rate": 0.0002607264383145219, + "loss": 1.9992, + "step": 135500 + }, + { + "epoch": 3.99, + "grad_norm": 2.1721653938293457, + "learning_rate": 0.0002605793463231905, + "loss": 1.9946, + "step": 136000 + }, + { + "epoch": 4.01, + "grad_norm": 15.696208953857422, + "learning_rate": 0.0002604322543318591, + "loss": 1.9808, + "step": 136500 + }, + { + "epoch": 4.02, + "grad_norm": 2.118614912033081, + "learning_rate": 0.00026028516234052776, + "loss": 1.9334, + "step": 137000 + }, + { + "epoch": 4.04, + "grad_norm": 4.258530616760254, + "learning_rate": 0.00026013807034919634, + "loss": 1.9505, + "step": 137500 + }, + { + "epoch": 4.05, + "grad_norm": 1.8109312057495117, + "learning_rate": 0.000259990978357865, + "loss": 1.909, + "step": 138000 + }, + { + "epoch": 4.07, + "grad_norm": 1.503454327583313, + "learning_rate": 0.0002598438863665336, + "loss": 1.9283, + "step": 138500 + }, + { + "epoch": 4.08, + "grad_norm": 5.558799743652344, + "learning_rate": 0.00025969679437520224, + "loss": 1.959, + "step": 139000 + }, + { + "epoch": 4.1, + "grad_norm": 2.509039878845215, + "learning_rate": 0.0002595497023838709, + "loss": 1.9111, + "step": 139500 + }, + { + "epoch": 4.11, + "grad_norm": 1.6668161153793335, + "learning_rate": 0.00025940261039253946, + "loss": 1.9116, + "step": 140000 + }, + { + "epoch": 4.13, + "grad_norm": 1.4484660625457764, + "learning_rate": 0.0002592555184012081, + "loss": 1.9337, + "step": 140500 + }, + { + "epoch": 4.14, + "grad_norm": 2.352369785308838, + "learning_rate": 0.00025910842640987673, + "loss": 1.922, + "step": 141000 + }, + { + "epoch": 4.15, + "grad_norm": 17.899917602539062, + "learning_rate": 
0.0002589613344185453, + "loss": 1.9144, + "step": 141500 + }, + { + "epoch": 4.17, + "grad_norm": 2.010423421859741, + "learning_rate": 0.00025881424242721394, + "loss": 1.9909, + "step": 142000 + }, + { + "epoch": 4.18, + "grad_norm": 2.928180694580078, + "learning_rate": 0.0002586671504358826, + "loss": 1.9295, + "step": 142500 + }, + { + "epoch": 4.2, + "grad_norm": 2.5269832611083984, + "learning_rate": 0.0002585200584445512, + "loss": 1.9473, + "step": 143000 + }, + { + "epoch": 4.21, + "grad_norm": 4.112135410308838, + "learning_rate": 0.00025837296645321985, + "loss": 1.9538, + "step": 143500 + }, + { + "epoch": 4.23, + "grad_norm": 2.3555727005004883, + "learning_rate": 0.00025822587446188843, + "loss": 1.951, + "step": 144000 + }, + { + "epoch": 4.24, + "grad_norm": 2.098503828048706, + "learning_rate": 0.00025807878247055706, + "loss": 1.9492, + "step": 144500 + }, + { + "epoch": 4.26, + "grad_norm": 2.459561824798584, + "learning_rate": 0.0002579316904792257, + "loss": 1.9427, + "step": 145000 + }, + { + "epoch": 4.27, + "grad_norm": 2.0252935886383057, + "learning_rate": 0.0002577845984878943, + "loss": 1.9161, + "step": 145500 + }, + { + "epoch": 4.29, + "grad_norm": 1.384768009185791, + "learning_rate": 0.0002576375064965629, + "loss": 1.9325, + "step": 146000 + }, + { + "epoch": 4.3, + "grad_norm": 1.6305208206176758, + "learning_rate": 0.00025749041450523155, + "loss": 1.9892, + "step": 146500 + }, + { + "epoch": 4.32, + "grad_norm": 2.4807193279266357, + "learning_rate": 0.0002573433225139002, + "loss": 1.9288, + "step": 147000 + }, + { + "epoch": 4.33, + "grad_norm": 1.798276424407959, + "learning_rate": 0.0002571962305225688, + "loss": 1.9124, + "step": 147500 + }, + { + "epoch": 4.35, + "grad_norm": 3.1262764930725098, + "learning_rate": 0.0002570491385312374, + "loss": 1.9356, + "step": 148000 + }, + { + "epoch": 4.36, + "grad_norm": 2.5284624099731445, + "learning_rate": 0.00025690204653990603, + "loss": 1.9519, + "step": 148500 + }, + { + "epoch": 4.37, + "grad_norm": 2.51084566116333, + "learning_rate": 0.00025675495454857467, + "loss": 1.948, + "step": 149000 + }, + { + "epoch": 4.39, + "grad_norm": 1.7908506393432617, + "learning_rate": 0.00025660786255724325, + "loss": 1.9214, + "step": 149500 + }, + { + "epoch": 4.4, + "grad_norm": 1.8897744417190552, + "learning_rate": 0.0002564607705659119, + "loss": 1.9492, + "step": 150000 + }, + { + "epoch": 4.42, + "grad_norm": 2.1504998207092285, + "learning_rate": 0.0002563136785745805, + "loss": 1.9261, + "step": 150500 + }, + { + "epoch": 4.43, + "grad_norm": 1.7902778387069702, + "learning_rate": 0.00025616658658324915, + "loss": 1.9332, + "step": 151000 + }, + { + "epoch": 4.45, + "grad_norm": 1.6913944482803345, + "learning_rate": 0.0002560194945919178, + "loss": 1.9391, + "step": 151500 + }, + { + "epoch": 4.46, + "grad_norm": 1.7668604850769043, + "learning_rate": 0.00025587240260058637, + "loss": 1.961, + "step": 152000 + }, + { + "epoch": 4.48, + "grad_norm": 2.929547071456909, + "learning_rate": 0.000255725310609255, + "loss": 1.9578, + "step": 152500 + }, + { + "epoch": 4.49, + "grad_norm": 2.5235610008239746, + "learning_rate": 0.00025557821861792363, + "loss": 1.8975, + "step": 153000 + }, + { + "epoch": 4.51, + "grad_norm": 5.4155097007751465, + "learning_rate": 0.0002554311266265922, + "loss": 1.9549, + "step": 153500 + }, + { + "epoch": 4.52, + "grad_norm": 2.9047696590423584, + "learning_rate": 0.00025528403463526085, + "loss": 1.9409, + "step": 154000 + }, + { + "epoch": 4.54, + "grad_norm": 
3.789259910583496, + "learning_rate": 0.0002551369426439295, + "loss": 1.9267, + "step": 154500 + }, + { + "epoch": 4.55, + "grad_norm": 2.7584846019744873, + "learning_rate": 0.0002549898506525981, + "loss": 1.9138, + "step": 155000 + }, + { + "epoch": 4.57, + "grad_norm": 4.33261251449585, + "learning_rate": 0.00025484275866126675, + "loss": 1.9611, + "step": 155500 + }, + { + "epoch": 4.58, + "grad_norm": 3.2369813919067383, + "learning_rate": 0.00025469566666993533, + "loss": 1.9354, + "step": 156000 + }, + { + "epoch": 4.59, + "grad_norm": 2.6031434535980225, + "learning_rate": 0.00025454857467860397, + "loss": 1.9172, + "step": 156500 + }, + { + "epoch": 4.61, + "grad_norm": 2.9214913845062256, + "learning_rate": 0.0002544014826872726, + "loss": 1.9356, + "step": 157000 + }, + { + "epoch": 4.62, + "grad_norm": 2.684860944747925, + "learning_rate": 0.00025425439069594124, + "loss": 1.9494, + "step": 157500 + }, + { + "epoch": 4.64, + "grad_norm": 2.98620867729187, + "learning_rate": 0.0002541072987046098, + "loss": 1.9277, + "step": 158000 + }, + { + "epoch": 4.65, + "grad_norm": 1.6060062646865845, + "learning_rate": 0.00025396020671327845, + "loss": 1.9417, + "step": 158500 + }, + { + "epoch": 4.67, + "grad_norm": 1.4754972457885742, + "learning_rate": 0.0002538131147219471, + "loss": 1.9691, + "step": 159000 + }, + { + "epoch": 4.68, + "grad_norm": 1.5829949378967285, + "learning_rate": 0.0002536660227306157, + "loss": 1.9201, + "step": 159500 + }, + { + "epoch": 4.7, + "grad_norm": 2.13678240776062, + "learning_rate": 0.00025351893073928436, + "loss": 1.9697, + "step": 160000 + }, + { + "epoch": 4.71, + "grad_norm": 2.030682325363159, + "learning_rate": 0.00025337183874795294, + "loss": 1.9364, + "step": 160500 + }, + { + "epoch": 4.73, + "grad_norm": 2.715879201889038, + "learning_rate": 0.00025322474675662157, + "loss": 1.9574, + "step": 161000 + }, + { + "epoch": 4.74, + "grad_norm": 48.37791442871094, + "learning_rate": 0.0002530776547652902, + "loss": 1.9372, + "step": 161500 + }, + { + "epoch": 4.76, + "grad_norm": 3.232931613922119, + "learning_rate": 0.0002529305627739588, + "loss": 1.9093, + "step": 162000 + }, + { + "epoch": 4.77, + "grad_norm": 2.2022180557250977, + "learning_rate": 0.0002527834707826274, + "loss": 1.8969, + "step": 162500 + }, + { + "epoch": 4.79, + "grad_norm": 1.3633408546447754, + "learning_rate": 0.00025263637879129606, + "loss": 1.9476, + "step": 163000 + }, + { + "epoch": 4.8, + "grad_norm": 1.8852393627166748, + "learning_rate": 0.0002524892867999647, + "loss": 1.9467, + "step": 163500 + }, + { + "epoch": 4.82, + "grad_norm": 2.2378666400909424, + "learning_rate": 0.0002523421948086333, + "loss": 1.9216, + "step": 164000 + }, + { + "epoch": 4.83, + "grad_norm": 6.068374156951904, + "learning_rate": 0.0002521951028173019, + "loss": 1.9188, + "step": 164500 + }, + { + "epoch": 4.84, + "grad_norm": 2.9869394302368164, + "learning_rate": 0.00025204801082597054, + "loss": 1.8981, + "step": 165000 + }, + { + "epoch": 4.86, + "grad_norm": 5.120233058929443, + "learning_rate": 0.0002519009188346392, + "loss": 1.9119, + "step": 165500 + }, + { + "epoch": 4.87, + "grad_norm": 24.895925521850586, + "learning_rate": 0.00025175382684330776, + "loss": 1.9263, + "step": 166000 + }, + { + "epoch": 4.89, + "grad_norm": 4.085075378417969, + "learning_rate": 0.0002516067348519764, + "loss": 1.9464, + "step": 166500 + }, + { + "epoch": 4.9, + "grad_norm": 2.498352527618408, + "learning_rate": 0.000251459642860645, + "loss": 1.9349, + "step": 167000 + }, + { + 
"epoch": 4.92, + "grad_norm": 1.9715830087661743, + "learning_rate": 0.00025131255086931366, + "loss": 1.9198, + "step": 167500 + }, + { + "epoch": 4.93, + "grad_norm": 6.1134161949157715, + "learning_rate": 0.0002511654588779823, + "loss": 1.9064, + "step": 168000 + }, + { + "epoch": 4.95, + "grad_norm": 1.4753895998001099, + "learning_rate": 0.0002510183668866509, + "loss": 1.9277, + "step": 168500 + }, + { + "epoch": 4.96, + "grad_norm": 22.2166805267334, + "learning_rate": 0.0002508712748953195, + "loss": 1.9105, + "step": 169000 + }, + { + "epoch": 4.98, + "grad_norm": 9.678267478942871, + "learning_rate": 0.00025072418290398815, + "loss": 1.9361, + "step": 169500 + }, + { + "epoch": 4.99, + "grad_norm": 5.086581230163574, + "learning_rate": 0.0002505770909126567, + "loss": 1.9035, + "step": 170000 + }, + { + "epoch": 5.01, + "grad_norm": 3.46391224861145, + "learning_rate": 0.00025042999892132536, + "loss": 1.9096, + "step": 170500 + }, + { + "epoch": 5.02, + "grad_norm": 1.564864158630371, + "learning_rate": 0.000250282906929994, + "loss": 1.8412, + "step": 171000 + }, + { + "epoch": 5.04, + "grad_norm": 3.917158603668213, + "learning_rate": 0.00025013581493866263, + "loss": 1.8263, + "step": 171500 + }, + { + "epoch": 5.05, + "grad_norm": 2.3305134773254395, + "learning_rate": 0.00024998872294733126, + "loss": 1.8263, + "step": 172000 + }, + { + "epoch": 5.06, + "grad_norm": 2.812856435775757, + "learning_rate": 0.00024984163095599985, + "loss": 1.8238, + "step": 172500 + }, + { + "epoch": 5.08, + "grad_norm": 12.236431121826172, + "learning_rate": 0.0002496945389646685, + "loss": 1.8475, + "step": 173000 + }, + { + "epoch": 5.09, + "grad_norm": 1.9508650302886963, + "learning_rate": 0.0002495474469733371, + "loss": 1.8671, + "step": 173500 + }, + { + "epoch": 5.11, + "grad_norm": 4.95379638671875, + "learning_rate": 0.0002494003549820057, + "loss": 1.8541, + "step": 174000 + }, + { + "epoch": 5.12, + "grad_norm": 2.8690032958984375, + "learning_rate": 0.00024925326299067433, + "loss": 1.8622, + "step": 174500 + }, + { + "epoch": 5.14, + "grad_norm": 2.8076915740966797, + "learning_rate": 0.00024910617099934296, + "loss": 1.8394, + "step": 175000 + }, + { + "epoch": 5.15, + "grad_norm": 1.6845248937606812, + "learning_rate": 0.0002489590790080116, + "loss": 1.8572, + "step": 175500 + }, + { + "epoch": 5.17, + "grad_norm": 1.74410080909729, + "learning_rate": 0.00024881198701668023, + "loss": 1.8387, + "step": 176000 + }, + { + "epoch": 5.18, + "grad_norm": 2.655266761779785, + "learning_rate": 0.0002486648950253488, + "loss": 1.8776, + "step": 176500 + }, + { + "epoch": 5.2, + "grad_norm": 2.5884244441986084, + "learning_rate": 0.00024851780303401745, + "loss": 1.8564, + "step": 177000 + }, + { + "epoch": 5.21, + "grad_norm": 4.314496040344238, + "learning_rate": 0.0002483707110426861, + "loss": 1.8573, + "step": 177500 + }, + { + "epoch": 5.23, + "grad_norm": 2.15973162651062, + "learning_rate": 0.00024822361905135466, + "loss": 1.8671, + "step": 178000 + }, + { + "epoch": 5.24, + "grad_norm": 1.7802814245224, + "learning_rate": 0.00024807652706002335, + "loss": 1.8722, + "step": 178500 + }, + { + "epoch": 5.26, + "grad_norm": 29.04892921447754, + "learning_rate": 0.00024792943506869193, + "loss": 1.8747, + "step": 179000 + }, + { + "epoch": 5.27, + "grad_norm": 2.0370380878448486, + "learning_rate": 0.00024778234307736057, + "loss": 1.8615, + "step": 179500 + }, + { + "epoch": 5.28, + "grad_norm": 2.681140661239624, + "learning_rate": 0.0002476352510860292, + "loss": 1.8603, + 
"step": 180000 + }, + { + "epoch": 5.3, + "grad_norm": 1.609506368637085, + "learning_rate": 0.0002474881590946978, + "loss": 1.857, + "step": 180500 + }, + { + "epoch": 5.31, + "grad_norm": 1.4114452600479126, + "learning_rate": 0.0002473410671033664, + "loss": 1.8849, + "step": 181000 + }, + { + "epoch": 5.33, + "grad_norm": 12.606436729431152, + "learning_rate": 0.00024719397511203505, + "loss": 1.8576, + "step": 181500 + }, + { + "epoch": 5.34, + "grad_norm": 1.5284700393676758, + "learning_rate": 0.00024704688312070363, + "loss": 1.8565, + "step": 182000 + }, + { + "epoch": 5.36, + "grad_norm": 2.134824514389038, + "learning_rate": 0.0002468997911293723, + "loss": 1.8447, + "step": 182500 + }, + { + "epoch": 5.37, + "grad_norm": 2.755667209625244, + "learning_rate": 0.0002467526991380409, + "loss": 1.8894, + "step": 183000 + }, + { + "epoch": 5.39, + "grad_norm": 1.8626573085784912, + "learning_rate": 0.00024660560714670954, + "loss": 1.8788, + "step": 183500 + }, + { + "epoch": 5.4, + "grad_norm": 1.2436336278915405, + "learning_rate": 0.00024645851515537817, + "loss": 1.888, + "step": 184000 + }, + { + "epoch": 5.42, + "grad_norm": 1.6450908184051514, + "learning_rate": 0.00024631142316404675, + "loss": 1.8576, + "step": 184500 + }, + { + "epoch": 5.43, + "grad_norm": 2.5818614959716797, + "learning_rate": 0.0002461643311727154, + "loss": 1.8634, + "step": 185000 + }, + { + "epoch": 5.45, + "grad_norm": 2.4842381477355957, + "learning_rate": 0.000246017239181384, + "loss": 1.8777, + "step": 185500 + }, + { + "epoch": 5.46, + "grad_norm": 5.475472927093506, + "learning_rate": 0.0002458701471900526, + "loss": 1.8537, + "step": 186000 + }, + { + "epoch": 5.48, + "grad_norm": 6.852906227111816, + "learning_rate": 0.0002457230551987213, + "loss": 1.863, + "step": 186500 + }, + { + "epoch": 5.49, + "grad_norm": 2.8549883365631104, + "learning_rate": 0.00024557596320738987, + "loss": 1.8643, + "step": 187000 + }, + { + "epoch": 5.51, + "grad_norm": 4.323022365570068, + "learning_rate": 0.0002454288712160585, + "loss": 1.8489, + "step": 187500 + }, + { + "epoch": 5.52, + "grad_norm": 1.4015731811523438, + "learning_rate": 0.00024528177922472714, + "loss": 1.879, + "step": 188000 + }, + { + "epoch": 5.53, + "grad_norm": 2.5605356693267822, + "learning_rate": 0.0002451346872333957, + "loss": 1.8564, + "step": 188500 + }, + { + "epoch": 5.55, + "grad_norm": 1.567606806755066, + "learning_rate": 0.00024498759524206436, + "loss": 1.8902, + "step": 189000 + }, + { + "epoch": 5.56, + "grad_norm": 1.4582362174987793, + "learning_rate": 0.000244840503250733, + "loss": 1.8659, + "step": 189500 + }, + { + "epoch": 5.58, + "grad_norm": 1.8250012397766113, + "learning_rate": 0.00024469341125940157, + "loss": 1.8558, + "step": 190000 + }, + { + "epoch": 5.59, + "grad_norm": 6.115236759185791, + "learning_rate": 0.00024454631926807026, + "loss": 1.839, + "step": 190500 + }, + { + "epoch": 5.61, + "grad_norm": 4.505608081817627, + "learning_rate": 0.00024439922727673884, + "loss": 1.846, + "step": 191000 + }, + { + "epoch": 5.62, + "grad_norm": 2.1621007919311523, + "learning_rate": 0.0002442521352854075, + "loss": 1.8287, + "step": 191500 + }, + { + "epoch": 5.64, + "grad_norm": 2.337688684463501, + "learning_rate": 0.0002441050432940761, + "loss": 1.88, + "step": 192000 + }, + { + "epoch": 5.65, + "grad_norm": 1.921249508857727, + "learning_rate": 0.00024395795130274472, + "loss": 1.8345, + "step": 192500 + }, + { + "epoch": 5.67, + "grad_norm": 1.396031379699707, + "learning_rate": 
0.00024381085931141332, + "loss": 1.8527, + "step": 193000 + }, + { + "epoch": 5.68, + "grad_norm": 2.6552846431732178, + "learning_rate": 0.00024366376732008196, + "loss": 1.8628, + "step": 193500 + }, + { + "epoch": 5.7, + "grad_norm": 4.992696285247803, + "learning_rate": 0.00024351667532875057, + "loss": 1.8727, + "step": 194000 + }, + { + "epoch": 5.71, + "grad_norm": 4.128008842468262, + "learning_rate": 0.00024336958333741923, + "loss": 1.8677, + "step": 194500 + }, + { + "epoch": 5.73, + "grad_norm": 2.2797539234161377, + "learning_rate": 0.00024322249134608784, + "loss": 1.8495, + "step": 195000 + }, + { + "epoch": 5.74, + "grad_norm": 1.987032175064087, + "learning_rate": 0.00024307539935475644, + "loss": 1.8555, + "step": 195500 + }, + { + "epoch": 5.75, + "grad_norm": 2.1700329780578613, + "learning_rate": 0.00024292830736342508, + "loss": 1.8745, + "step": 196000 + }, + { + "epoch": 5.77, + "grad_norm": 2.0741491317749023, + "learning_rate": 0.00024278121537209369, + "loss": 1.8859, + "step": 196500 + }, + { + "epoch": 5.78, + "grad_norm": 3.1270885467529297, + "learning_rate": 0.0002426341233807623, + "loss": 1.8778, + "step": 197000 + }, + { + "epoch": 5.8, + "grad_norm": 2.7290618419647217, + "learning_rate": 0.00024248703138943093, + "loss": 1.8418, + "step": 197500 + }, + { + "epoch": 5.81, + "grad_norm": 1.5647644996643066, + "learning_rate": 0.00024233993939809954, + "loss": 1.8519, + "step": 198000 + }, + { + "epoch": 5.83, + "grad_norm": 2.4231462478637695, + "learning_rate": 0.0002421928474067682, + "loss": 1.868, + "step": 198500 + }, + { + "epoch": 5.84, + "grad_norm": 3.1139655113220215, + "learning_rate": 0.0002420457554154368, + "loss": 1.8255, + "step": 199000 + }, + { + "epoch": 5.86, + "grad_norm": 4.595509052276611, + "learning_rate": 0.0002418986634241054, + "loss": 1.8626, + "step": 199500 + }, + { + "epoch": 5.87, + "grad_norm": 7.253945827484131, + "learning_rate": 0.00024175157143277405, + "loss": 1.8714, + "step": 200000 + }, + { + "epoch": 5.89, + "grad_norm": 1.5077595710754395, + "learning_rate": 0.00024160447944144266, + "loss": 1.8743, + "step": 200500 + }, + { + "epoch": 5.9, + "grad_norm": 5.791823387145996, + "learning_rate": 0.00024145738745011126, + "loss": 1.8499, + "step": 201000 + }, + { + "epoch": 5.92, + "grad_norm": 2.558816909790039, + "learning_rate": 0.0002413102954587799, + "loss": 1.8757, + "step": 201500 + }, + { + "epoch": 5.93, + "grad_norm": 2.4598610401153564, + "learning_rate": 0.0002411632034674485, + "loss": 1.8396, + "step": 202000 + }, + { + "epoch": 5.95, + "grad_norm": 1.2680063247680664, + "learning_rate": 0.00024101611147611717, + "loss": 1.9265, + "step": 202500 + }, + { + "epoch": 5.96, + "grad_norm": 13.615561485290527, + "learning_rate": 0.00024086901948478577, + "loss": 1.833, + "step": 203000 + }, + { + "epoch": 5.97, + "grad_norm": 1.367218017578125, + "learning_rate": 0.00024072192749345438, + "loss": 1.836, + "step": 203500 + }, + { + "epoch": 5.99, + "grad_norm": 1.8230597972869873, + "learning_rate": 0.00024057483550212302, + "loss": 1.8833, + "step": 204000 + }, + { + "epoch": 6.0, + "grad_norm": 3.095012903213501, + "learning_rate": 0.00024042774351079162, + "loss": 1.8106, + "step": 204500 + }, + { + "epoch": 6.02, + "grad_norm": 1.357537031173706, + "learning_rate": 0.00024028065151946023, + "loss": 1.809, + "step": 205000 + }, + { + "epoch": 6.03, + "grad_norm": 1.8202615976333618, + "learning_rate": 0.00024013355952812887, + "loss": 1.788, + "step": 205500 + }, + { + "epoch": 6.05, + "grad_norm": 
2.0572142601013184, + "learning_rate": 0.00023998646753679747, + "loss": 1.7786, + "step": 206000 + }, + { + "epoch": 6.06, + "grad_norm": 2.056692123413086, + "learning_rate": 0.00023983937554546614, + "loss": 1.801, + "step": 206500 + }, + { + "epoch": 6.08, + "grad_norm": 1.6310349702835083, + "learning_rate": 0.00023969228355413474, + "loss": 1.794, + "step": 207000 + }, + { + "epoch": 6.09, + "grad_norm": 2.9542150497436523, + "learning_rate": 0.00023954519156280335, + "loss": 1.8013, + "step": 207500 + }, + { + "epoch": 6.11, + "grad_norm": 2.0560832023620605, + "learning_rate": 0.00023939809957147199, + "loss": 1.809, + "step": 208000 + }, + { + "epoch": 6.12, + "grad_norm": 2.991140842437744, + "learning_rate": 0.0002392510075801406, + "loss": 1.8122, + "step": 208500 + }, + { + "epoch": 6.14, + "grad_norm": 4.056896686553955, + "learning_rate": 0.0002391039155888092, + "loss": 1.7895, + "step": 209000 + }, + { + "epoch": 6.15, + "grad_norm": 1.4278576374053955, + "learning_rate": 0.00023895682359747784, + "loss": 1.7778, + "step": 209500 + }, + { + "epoch": 6.17, + "grad_norm": 2.918614387512207, + "learning_rate": 0.00023880973160614644, + "loss": 1.7882, + "step": 210000 + }, + { + "epoch": 6.18, + "grad_norm": 6.002877712249756, + "learning_rate": 0.0002386626396148151, + "loss": 1.8129, + "step": 210500 + }, + { + "epoch": 6.2, + "grad_norm": 1.8039432764053345, + "learning_rate": 0.0002385155476234837, + "loss": 1.7879, + "step": 211000 + }, + { + "epoch": 6.21, + "grad_norm": 2.1559925079345703, + "learning_rate": 0.00023836845563215232, + "loss": 1.825, + "step": 211500 + }, + { + "epoch": 6.22, + "grad_norm": 1.6617883443832397, + "learning_rate": 0.00023822136364082095, + "loss": 1.8048, + "step": 212000 + }, + { + "epoch": 6.24, + "grad_norm": 3.7312443256378174, + "learning_rate": 0.00023807427164948956, + "loss": 1.7759, + "step": 212500 + }, + { + "epoch": 6.25, + "grad_norm": 4.757487773895264, + "learning_rate": 0.0002379271796581582, + "loss": 1.8357, + "step": 213000 + }, + { + "epoch": 6.27, + "grad_norm": 1.4285889863967896, + "learning_rate": 0.0002377800876668268, + "loss": 1.8127, + "step": 213500 + }, + { + "epoch": 6.28, + "grad_norm": 1.7722175121307373, + "learning_rate": 0.0002376329956754954, + "loss": 1.79, + "step": 214000 + }, + { + "epoch": 6.3, + "grad_norm": 2.17244291305542, + "learning_rate": 0.00023748590368416407, + "loss": 1.7949, + "step": 214500 + }, + { + "epoch": 6.31, + "grad_norm": 2.136143922805786, + "learning_rate": 0.00023733881169283268, + "loss": 1.8199, + "step": 215000 + }, + { + "epoch": 6.33, + "grad_norm": 1.36685049533844, + "learning_rate": 0.00023719171970150132, + "loss": 1.8101, + "step": 215500 + }, + { + "epoch": 6.34, + "grad_norm": 1.5481481552124023, + "learning_rate": 0.00023704462771016992, + "loss": 1.819, + "step": 216000 + }, + { + "epoch": 6.36, + "grad_norm": 2.1855366230010986, + "learning_rate": 0.00023689753571883853, + "loss": 1.8162, + "step": 216500 + }, + { + "epoch": 6.37, + "grad_norm": 1.5019465684890747, + "learning_rate": 0.00023675044372750717, + "loss": 1.7929, + "step": 217000 + }, + { + "epoch": 6.39, + "grad_norm": 1.4323623180389404, + "learning_rate": 0.00023660335173617577, + "loss": 1.7664, + "step": 217500 + }, + { + "epoch": 6.4, + "grad_norm": 3.942918539047241, + "learning_rate": 0.00023645625974484438, + "loss": 1.8165, + "step": 218000 + }, + { + "epoch": 6.42, + "grad_norm": 1.4521820545196533, + "learning_rate": 0.00023630916775351304, + "loss": 1.7966, + "step": 218500 + }, + { + 
"epoch": 6.43, + "grad_norm": 3.9033968448638916, + "learning_rate": 0.00023616207576218165, + "loss": 1.7952, + "step": 219000 + }, + { + "epoch": 6.44, + "grad_norm": 2.7818336486816406, + "learning_rate": 0.00023601498377085029, + "loss": 1.7963, + "step": 219500 + }, + { + "epoch": 6.46, + "grad_norm": 4.406651496887207, + "learning_rate": 0.0002358678917795189, + "loss": 1.7901, + "step": 220000 + }, + { + "epoch": 6.47, + "grad_norm": 2.1108040809631348, + "learning_rate": 0.0002357207997881875, + "loss": 1.7887, + "step": 220500 + }, + { + "epoch": 6.49, + "grad_norm": 6.802318096160889, + "learning_rate": 0.00023557370779685613, + "loss": 1.7811, + "step": 221000 + }, + { + "epoch": 6.5, + "grad_norm": 2.7774646282196045, + "learning_rate": 0.00023542661580552474, + "loss": 1.782, + "step": 221500 + }, + { + "epoch": 6.52, + "grad_norm": 1.5817331075668335, + "learning_rate": 0.00023527952381419335, + "loss": 1.7883, + "step": 222000 + }, + { + "epoch": 6.53, + "grad_norm": 1.3639189004898071, + "learning_rate": 0.000235132431822862, + "loss": 1.8137, + "step": 222500 + }, + { + "epoch": 6.55, + "grad_norm": 3.018841505050659, + "learning_rate": 0.00023498533983153062, + "loss": 1.8012, + "step": 223000 + }, + { + "epoch": 6.56, + "grad_norm": 1.6388349533081055, + "learning_rate": 0.00023483824784019925, + "loss": 1.7776, + "step": 223500 + }, + { + "epoch": 6.58, + "grad_norm": 2.2828805446624756, + "learning_rate": 0.00023469115584886786, + "loss": 1.793, + "step": 224000 + }, + { + "epoch": 6.59, + "grad_norm": 1.5646781921386719, + "learning_rate": 0.00023454406385753647, + "loss": 1.8049, + "step": 224500 + }, + { + "epoch": 6.61, + "grad_norm": 2.5620057582855225, + "learning_rate": 0.0002343969718662051, + "loss": 1.8082, + "step": 225000 + }, + { + "epoch": 6.62, + "grad_norm": 1.1866906881332397, + "learning_rate": 0.0002342498798748737, + "loss": 1.8197, + "step": 225500 + }, + { + "epoch": 6.64, + "grad_norm": 8.060154914855957, + "learning_rate": 0.00023410278788354232, + "loss": 1.7765, + "step": 226000 + }, + { + "epoch": 6.65, + "grad_norm": 2.2780892848968506, + "learning_rate": 0.00023395569589221098, + "loss": 1.7903, + "step": 226500 + }, + { + "epoch": 6.66, + "grad_norm": 8.265579223632812, + "learning_rate": 0.0002338086039008796, + "loss": 1.8143, + "step": 227000 + }, + { + "epoch": 6.68, + "grad_norm": 1.2793028354644775, + "learning_rate": 0.00023366151190954822, + "loss": 1.7729, + "step": 227500 + }, + { + "epoch": 6.69, + "grad_norm": 1.7103880643844604, + "learning_rate": 0.00023351441991821683, + "loss": 1.8019, + "step": 228000 + }, + { + "epoch": 6.71, + "grad_norm": 6.143857479095459, + "learning_rate": 0.00023336732792688544, + "loss": 1.7781, + "step": 228500 + }, + { + "epoch": 6.72, + "grad_norm": 3.1208572387695312, + "learning_rate": 0.00023322023593555407, + "loss": 1.791, + "step": 229000 + }, + { + "epoch": 6.74, + "grad_norm": 1.774464726448059, + "learning_rate": 0.00023307314394422268, + "loss": 1.7596, + "step": 229500 + }, + { + "epoch": 6.75, + "grad_norm": 1.9884730577468872, + "learning_rate": 0.0002329260519528913, + "loss": 1.7842, + "step": 230000 + }, + { + "epoch": 6.77, + "grad_norm": 0.9491617679595947, + "learning_rate": 0.00023277895996155995, + "loss": 1.8013, + "step": 230500 + }, + { + "epoch": 6.78, + "grad_norm": 1.6505547761917114, + "learning_rate": 0.00023263186797022856, + "loss": 1.7913, + "step": 231000 + }, + { + "epoch": 6.8, + "grad_norm": 1.262868046760559, + "learning_rate": 0.0002324847759788972, + 
"loss": 1.8212, + "step": 231500 + }, + { + "epoch": 6.81, + "grad_norm": 1.1688213348388672, + "learning_rate": 0.0002323376839875658, + "loss": 1.815, + "step": 232000 + }, + { + "epoch": 6.83, + "grad_norm": 1.4348254203796387, + "learning_rate": 0.0002321905919962344, + "loss": 1.8335, + "step": 232500 + }, + { + "epoch": 6.84, + "grad_norm": 1.977734923362732, + "learning_rate": 0.00023204350000490304, + "loss": 1.8231, + "step": 233000 + }, + { + "epoch": 6.86, + "grad_norm": 12.462953567504883, + "learning_rate": 0.00023189640801357165, + "loss": 1.8086, + "step": 233500 + }, + { + "epoch": 6.87, + "grad_norm": 1.9486029148101807, + "learning_rate": 0.00023174931602224028, + "loss": 1.7932, + "step": 234000 + }, + { + "epoch": 6.89, + "grad_norm": 6.036319732666016, + "learning_rate": 0.00023160222403090892, + "loss": 1.7906, + "step": 234500 + }, + { + "epoch": 6.9, + "grad_norm": 1.6775847673416138, + "learning_rate": 0.00023145513203957753, + "loss": 1.8026, + "step": 235000 + }, + { + "epoch": 6.91, + "grad_norm": 2.1960055828094482, + "learning_rate": 0.00023130804004824616, + "loss": 1.8093, + "step": 235500 + }, + { + "epoch": 6.93, + "grad_norm": 4.2877197265625, + "learning_rate": 0.00023116094805691477, + "loss": 1.8198, + "step": 236000 + }, + { + "epoch": 6.94, + "grad_norm": 1.4908256530761719, + "learning_rate": 0.0002310138560655834, + "loss": 1.7882, + "step": 236500 + }, + { + "epoch": 6.96, + "grad_norm": 5.148675918579102, + "learning_rate": 0.000230866764074252, + "loss": 1.786, + "step": 237000 + }, + { + "epoch": 6.97, + "grad_norm": 5.5215744972229, + "learning_rate": 0.00023071967208292062, + "loss": 1.8168, + "step": 237500 + }, + { + "epoch": 6.99, + "grad_norm": 21.99406623840332, + "learning_rate": 0.00023057258009158928, + "loss": 1.7993, + "step": 238000 + }, + { + "epoch": 7.0, + "grad_norm": 3.8185484409332275, + "learning_rate": 0.0002304254881002579, + "loss": 1.8105, + "step": 238500 + }, + { + "epoch": 7.02, + "grad_norm": 7.100237846374512, + "learning_rate": 0.00023027839610892652, + "loss": 1.7102, + "step": 239000 + }, + { + "epoch": 7.03, + "grad_norm": 5.007193088531494, + "learning_rate": 0.00023013130411759513, + "loss": 1.7293, + "step": 239500 + }, + { + "epoch": 7.05, + "grad_norm": 1.6522769927978516, + "learning_rate": 0.00022998421212626374, + "loss": 1.7199, + "step": 240000 + }, + { + "epoch": 7.06, + "grad_norm": 1.7635401487350464, + "learning_rate": 0.00022983712013493237, + "loss": 1.7304, + "step": 240500 + }, + { + "epoch": 7.08, + "grad_norm": 2.441962957382202, + "learning_rate": 0.00022969002814360098, + "loss": 1.7291, + "step": 241000 + }, + { + "epoch": 7.09, + "grad_norm": 1.6155414581298828, + "learning_rate": 0.0002295429361522696, + "loss": 1.7477, + "step": 241500 + }, + { + "epoch": 7.11, + "grad_norm": 1.3782049417495728, + "learning_rate": 0.00022939584416093825, + "loss": 1.7552, + "step": 242000 + }, + { + "epoch": 7.12, + "grad_norm": 1.6964011192321777, + "learning_rate": 0.00022924875216960686, + "loss": 1.7233, + "step": 242500 + }, + { + "epoch": 7.13, + "grad_norm": 1.9475128650665283, + "learning_rate": 0.0002291016601782755, + "loss": 1.7603, + "step": 243000 + }, + { + "epoch": 7.15, + "grad_norm": 2.674274444580078, + "learning_rate": 0.0002289545681869441, + "loss": 1.7472, + "step": 243500 + }, + { + "epoch": 7.16, + "grad_norm": 1.6163575649261475, + "learning_rate": 0.0002288074761956127, + "loss": 1.7754, + "step": 244000 + }, + { + "epoch": 7.18, + "grad_norm": 1.3212227821350098, + 
"learning_rate": 0.00022866038420428134, + "loss": 1.7828, + "step": 244500 + }, + { + "epoch": 7.19, + "grad_norm": 1.9953806400299072, + "learning_rate": 0.00022851329221294995, + "loss": 1.7289, + "step": 245000 + }, + { + "epoch": 7.21, + "grad_norm": 1.5477960109710693, + "learning_rate": 0.00022836620022161856, + "loss": 1.7651, + "step": 245500 + }, + { + "epoch": 7.22, + "grad_norm": 16.499107360839844, + "learning_rate": 0.00022821910823028722, + "loss": 1.7207, + "step": 246000 + }, + { + "epoch": 7.24, + "grad_norm": 2.4671411514282227, + "learning_rate": 0.00022807201623895583, + "loss": 1.7438, + "step": 246500 + }, + { + "epoch": 7.25, + "grad_norm": 4.774916648864746, + "learning_rate": 0.00022792492424762446, + "loss": 1.7721, + "step": 247000 + }, + { + "epoch": 7.27, + "grad_norm": 2.282515287399292, + "learning_rate": 0.00022777783225629307, + "loss": 1.7145, + "step": 247500 + }, + { + "epoch": 7.28, + "grad_norm": 1.9206039905548096, + "learning_rate": 0.00022763074026496168, + "loss": 1.7314, + "step": 248000 + }, + { + "epoch": 7.3, + "grad_norm": 2.7325820922851562, + "learning_rate": 0.0002274836482736303, + "loss": 1.725, + "step": 248500 + }, + { + "epoch": 7.31, + "grad_norm": 1.7279201745986938, + "learning_rate": 0.00022733655628229892, + "loss": 1.7095, + "step": 249000 + }, + { + "epoch": 7.33, + "grad_norm": 1.8310534954071045, + "learning_rate": 0.00022718946429096753, + "loss": 1.7325, + "step": 249500 + }, + { + "epoch": 7.34, + "grad_norm": 5.919315338134766, + "learning_rate": 0.0002270423722996362, + "loss": 1.7301, + "step": 250000 + }, + { + "epoch": 7.35, + "grad_norm": 2.3758251667022705, + "learning_rate": 0.0002268952803083048, + "loss": 1.7529, + "step": 250500 + }, + { + "epoch": 7.37, + "grad_norm": 3.524482011795044, + "learning_rate": 0.00022674818831697343, + "loss": 1.7822, + "step": 251000 + }, + { + "epoch": 7.38, + "grad_norm": 2.1281306743621826, + "learning_rate": 0.00022660109632564204, + "loss": 1.7315, + "step": 251500 + }, + { + "epoch": 7.4, + "grad_norm": 1.6832419633865356, + "learning_rate": 0.00022645400433431064, + "loss": 1.7553, + "step": 252000 + }, + { + "epoch": 7.41, + "grad_norm": 13.24813175201416, + "learning_rate": 0.00022630691234297928, + "loss": 1.7623, + "step": 252500 + }, + { + "epoch": 7.43, + "grad_norm": 1.5083931684494019, + "learning_rate": 0.0002261598203516479, + "loss": 1.7579, + "step": 253000 + }, + { + "epoch": 7.44, + "grad_norm": 3.207942247390747, + "learning_rate": 0.00022601272836031652, + "loss": 1.7248, + "step": 253500 + }, + { + "epoch": 7.46, + "grad_norm": 7.897548675537109, + "learning_rate": 0.00022586563636898516, + "loss": 1.7309, + "step": 254000 + }, + { + "epoch": 7.47, + "grad_norm": 2.2789628505706787, + "learning_rate": 0.00022571854437765376, + "loss": 1.7444, + "step": 254500 + }, + { + "epoch": 7.49, + "grad_norm": 2.129913568496704, + "learning_rate": 0.0002255714523863224, + "loss": 1.7505, + "step": 255000 + }, + { + "epoch": 7.5, + "grad_norm": 2.0121564865112305, + "learning_rate": 0.000225424360394991, + "loss": 1.8085, + "step": 255500 + }, + { + "epoch": 7.52, + "grad_norm": 3.321162462234497, + "learning_rate": 0.00022527726840365964, + "loss": 1.7601, + "step": 256000 + }, + { + "epoch": 7.53, + "grad_norm": 8.002218246459961, + "learning_rate": 0.00022513017641232825, + "loss": 1.7719, + "step": 256500 + }, + { + "epoch": 7.55, + "grad_norm": 1.610587477684021, + "learning_rate": 0.00022498308442099686, + "loss": 1.7373, + "step": 257000 + }, + { + "epoch": 7.56, 
+ "grad_norm": 5.272850513458252, + "learning_rate": 0.0002248359924296655, + "loss": 1.7481, + "step": 257500 + }, + { + "epoch": 7.57, + "grad_norm": 1.519241452217102, + "learning_rate": 0.00022468890043833413, + "loss": 1.7722, + "step": 258000 + }, + { + "epoch": 7.59, + "grad_norm": 3.9072492122650146, + "learning_rate": 0.00022454180844700276, + "loss": 1.7425, + "step": 258500 + }, + { + "epoch": 7.6, + "grad_norm": 16.145322799682617, + "learning_rate": 0.00022439471645567137, + "loss": 1.7645, + "step": 259000 + }, + { + "epoch": 7.62, + "grad_norm": 1.7309913635253906, + "learning_rate": 0.00022424762446433998, + "loss": 1.7388, + "step": 259500 + }, + { + "epoch": 7.63, + "grad_norm": 1.4854837656021118, + "learning_rate": 0.0002241005324730086, + "loss": 1.7728, + "step": 260000 + }, + { + "epoch": 7.65, + "grad_norm": 1.9065077304840088, + "learning_rate": 0.00022395344048167722, + "loss": 1.7306, + "step": 260500 + }, + { + "epoch": 7.66, + "grad_norm": 2.6869423389434814, + "learning_rate": 0.00022380634849034582, + "loss": 1.751, + "step": 261000 + }, + { + "epoch": 7.68, + "grad_norm": 2.3300232887268066, + "learning_rate": 0.00022365925649901446, + "loss": 1.7643, + "step": 261500 + }, + { + "epoch": 7.69, + "grad_norm": 1.6109992265701294, + "learning_rate": 0.0002235121645076831, + "loss": 1.7848, + "step": 262000 + }, + { + "epoch": 7.71, + "grad_norm": 3.2768895626068115, + "learning_rate": 0.00022336507251635173, + "loss": 1.7638, + "step": 262500 + }, + { + "epoch": 7.72, + "grad_norm": 4.605926036834717, + "learning_rate": 0.00022321798052502034, + "loss": 1.7801, + "step": 263000 + }, + { + "epoch": 7.74, + "grad_norm": 1.451865553855896, + "learning_rate": 0.00022307088853368894, + "loss": 1.7229, + "step": 263500 + }, + { + "epoch": 7.75, + "grad_norm": 1.938750982284546, + "learning_rate": 0.00022292379654235758, + "loss": 1.7575, + "step": 264000 + }, + { + "epoch": 7.77, + "grad_norm": 8.690546989440918, + "learning_rate": 0.00022277670455102619, + "loss": 1.7572, + "step": 264500 + }, + { + "epoch": 7.78, + "grad_norm": 6.99992036819458, + "learning_rate": 0.0002226296125596948, + "loss": 1.7912, + "step": 265000 + }, + { + "epoch": 7.8, + "grad_norm": 1.375183343887329, + "learning_rate": 0.00022248252056836343, + "loss": 1.7588, + "step": 265500 + }, + { + "epoch": 7.81, + "grad_norm": 1.4782127141952515, + "learning_rate": 0.00022233542857703206, + "loss": 1.7929, + "step": 266000 + }, + { + "epoch": 7.82, + "grad_norm": 3.282301187515259, + "learning_rate": 0.0002221883365857007, + "loss": 1.7855, + "step": 266500 + }, + { + "epoch": 7.84, + "grad_norm": 2.2392537593841553, + "learning_rate": 0.0002220412445943693, + "loss": 1.7603, + "step": 267000 + }, + { + "epoch": 7.85, + "grad_norm": 3.2060225009918213, + "learning_rate": 0.0002218941526030379, + "loss": 1.7416, + "step": 267500 + }, + { + "epoch": 7.87, + "grad_norm": 2.6562771797180176, + "learning_rate": 0.00022174706061170655, + "loss": 1.766, + "step": 268000 + }, + { + "epoch": 7.88, + "grad_norm": 3.6191391944885254, + "learning_rate": 0.00022159996862037516, + "loss": 1.7746, + "step": 268500 + }, + { + "epoch": 7.9, + "grad_norm": 1.9865895509719849, + "learning_rate": 0.00022145287662904376, + "loss": 1.7841, + "step": 269000 + }, + { + "epoch": 7.91, + "grad_norm": 1.3776191473007202, + "learning_rate": 0.0002213057846377124, + "loss": 1.7709, + "step": 269500 + }, + { + "epoch": 7.93, + "grad_norm": 2.238267183303833, + "learning_rate": 0.00022115869264638103, + "loss": 1.7764, + 
"step": 270000 + }, + { + "epoch": 7.94, + "grad_norm": 3.3425824642181396, + "learning_rate": 0.00022101160065504967, + "loss": 1.7901, + "step": 270500 + }, + { + "epoch": 7.96, + "grad_norm": 2.183436632156372, + "learning_rate": 0.00022086450866371827, + "loss": 1.757, + "step": 271000 + }, + { + "epoch": 7.97, + "grad_norm": 3.393548011779785, + "learning_rate": 0.00022071741667238688, + "loss": 1.7538, + "step": 271500 + }, + { + "epoch": 7.99, + "grad_norm": 4.727015018463135, + "learning_rate": 0.00022057032468105552, + "loss": 1.7585, + "step": 272000 + }, + { + "epoch": 8.0, + "grad_norm": 2.0307464599609375, + "learning_rate": 0.00022042323268972412, + "loss": 1.7659, + "step": 272500 + }, + { + "epoch": 8.02, + "grad_norm": 4.098311424255371, + "learning_rate": 0.00022027614069839273, + "loss": 1.7191, + "step": 273000 + }, + { + "epoch": 8.03, + "grad_norm": 2.290457248687744, + "learning_rate": 0.00022012904870706137, + "loss": 1.7206, + "step": 273500 + }, + { + "epoch": 8.04, + "grad_norm": 2.1348979473114014, + "learning_rate": 0.00021998195671573, + "loss": 1.6935, + "step": 274000 + }, + { + "epoch": 8.06, + "grad_norm": 2.5333049297332764, + "learning_rate": 0.00021983486472439864, + "loss": 1.6709, + "step": 274500 + }, + { + "epoch": 8.07, + "grad_norm": 2.1333959102630615, + "learning_rate": 0.00021968777273306724, + "loss": 1.7318, + "step": 275000 + }, + { + "epoch": 8.09, + "grad_norm": 1.6256059408187866, + "learning_rate": 0.00021954068074173585, + "loss": 1.7007, + "step": 275500 + }, + { + "epoch": 8.1, + "grad_norm": 1.4450695514678955, + "learning_rate": 0.00021939358875040449, + "loss": 1.7038, + "step": 276000 + }, + { + "epoch": 8.12, + "grad_norm": 4.230172634124756, + "learning_rate": 0.0002192464967590731, + "loss": 1.6738, + "step": 276500 + }, + { + "epoch": 8.13, + "grad_norm": 2.046879529953003, + "learning_rate": 0.00021909940476774173, + "loss": 1.6658, + "step": 277000 + }, + { + "epoch": 8.15, + "grad_norm": 2.2633895874023438, + "learning_rate": 0.00021895231277641034, + "loss": 1.6929, + "step": 277500 + }, + { + "epoch": 8.16, + "grad_norm": 34.2008171081543, + "learning_rate": 0.00021880522078507897, + "loss": 1.6956, + "step": 278000 + }, + { + "epoch": 8.18, + "grad_norm": 11.11546516418457, + "learning_rate": 0.0002186581287937476, + "loss": 1.7103, + "step": 278500 + }, + { + "epoch": 8.19, + "grad_norm": 5.735045909881592, + "learning_rate": 0.0002185110368024162, + "loss": 1.6991, + "step": 279000 + }, + { + "epoch": 8.21, + "grad_norm": 2.9397599697113037, + "learning_rate": 0.00021836394481108485, + "loss": 1.6909, + "step": 279500 + }, + { + "epoch": 8.22, + "grad_norm": 1.9226768016815186, + "learning_rate": 0.00021821685281975345, + "loss": 1.6894, + "step": 280000 + }, + { + "epoch": 8.24, + "grad_norm": 2.0962398052215576, + "learning_rate": 0.00021806976082842206, + "loss": 1.7094, + "step": 280500 + }, + { + "epoch": 8.25, + "grad_norm": 13.345786094665527, + "learning_rate": 0.0002179226688370907, + "loss": 1.6927, + "step": 281000 + }, + { + "epoch": 8.26, + "grad_norm": 3.1707746982574463, + "learning_rate": 0.0002177755768457593, + "loss": 1.7004, + "step": 281500 + }, + { + "epoch": 8.28, + "grad_norm": 3.3897292613983154, + "learning_rate": 0.00021762848485442797, + "loss": 1.7094, + "step": 282000 + }, + { + "epoch": 8.29, + "grad_norm": 5.791382312774658, + "learning_rate": 0.00021748139286309657, + "loss": 1.7279, + "step": 282500 + }, + { + "epoch": 8.31, + "grad_norm": 2.3419294357299805, + "learning_rate": 
0.00021733430087176518, + "loss": 1.7053, + "step": 283000 + }, + { + "epoch": 8.32, + "grad_norm": 1.9053447246551514, + "learning_rate": 0.00021718720888043382, + "loss": 1.7225, + "step": 283500 + }, + { + "epoch": 8.34, + "grad_norm": 1.8822693824768066, + "learning_rate": 0.00021704011688910242, + "loss": 1.6968, + "step": 284000 + }, + { + "epoch": 8.35, + "grad_norm": 1.4814122915267944, + "learning_rate": 0.00021689302489777103, + "loss": 1.6697, + "step": 284500 + }, + { + "epoch": 8.37, + "grad_norm": 8.579392433166504, + "learning_rate": 0.00021674593290643967, + "loss": 1.7206, + "step": 285000 + }, + { + "epoch": 8.38, + "grad_norm": 1.9921379089355469, + "learning_rate": 0.00021659884091510827, + "loss": 1.7113, + "step": 285500 + }, + { + "epoch": 8.4, + "grad_norm": 2.2425730228424072, + "learning_rate": 0.00021645174892377694, + "loss": 1.6893, + "step": 286000 + }, + { + "epoch": 8.41, + "grad_norm": 4.465683460235596, + "learning_rate": 0.00021630465693244554, + "loss": 1.6997, + "step": 286500 + }, + { + "epoch": 8.43, + "grad_norm": 11.377062797546387, + "learning_rate": 0.00021615756494111415, + "loss": 1.691, + "step": 287000 + }, + { + "epoch": 8.44, + "grad_norm": 4.9282636642456055, + "learning_rate": 0.00021601047294978279, + "loss": 1.6722, + "step": 287500 + }, + { + "epoch": 8.46, + "grad_norm": 4.008066177368164, + "learning_rate": 0.0002158633809584514, + "loss": 1.6498, + "step": 288000 + }, + { + "epoch": 8.47, + "grad_norm": 1.7437248229980469, + "learning_rate": 0.00021571628896712, + "loss": 1.7336, + "step": 288500 + }, + { + "epoch": 8.49, + "grad_norm": 1.627820372581482, + "learning_rate": 0.00021556919697578864, + "loss": 1.6973, + "step": 289000 + }, + { + "epoch": 8.5, + "grad_norm": 5.1567254066467285, + "learning_rate": 0.00021542210498445724, + "loss": 1.6974, + "step": 289500 + }, + { + "epoch": 8.51, + "grad_norm": 1.3468036651611328, + "learning_rate": 0.0002152750129931259, + "loss": 1.6815, + "step": 290000 + }, + { + "epoch": 8.53, + "grad_norm": 1.6024253368377686, + "learning_rate": 0.0002151279210017945, + "loss": 1.6784, + "step": 290500 + }, + { + "epoch": 8.54, + "grad_norm": 1.915425181388855, + "learning_rate": 0.00021498082901046312, + "loss": 1.7297, + "step": 291000 + }, + { + "epoch": 8.56, + "grad_norm": 2.750544548034668, + "learning_rate": 0.00021483373701913175, + "loss": 1.717, + "step": 291500 + }, + { + "epoch": 8.57, + "grad_norm": 8.173510551452637, + "learning_rate": 0.00021468664502780036, + "loss": 1.6971, + "step": 292000 + }, + { + "epoch": 8.59, + "grad_norm": 2.2209818363189697, + "learning_rate": 0.00021453955303646897, + "loss": 1.6803, + "step": 292500 + }, + { + "epoch": 8.6, + "grad_norm": 4.283031463623047, + "learning_rate": 0.0002143924610451376, + "loss": 1.7101, + "step": 293000 + }, + { + "epoch": 8.62, + "grad_norm": 2.29679799079895, + "learning_rate": 0.0002142453690538062, + "loss": 1.7021, + "step": 293500 + }, + { + "epoch": 8.63, + "grad_norm": 3.301649570465088, + "learning_rate": 0.00021409827706247487, + "loss": 1.727, + "step": 294000 + }, + { + "epoch": 8.65, + "grad_norm": 38.071407318115234, + "learning_rate": 0.00021395118507114348, + "loss": 1.7318, + "step": 294500 + }, + { + "epoch": 8.66, + "grad_norm": 1.9748557806015015, + "learning_rate": 0.0002138040930798121, + "loss": 1.7327, + "step": 295000 + }, + { + "epoch": 8.68, + "grad_norm": 1.8591829538345337, + "learning_rate": 0.00021365700108848072, + "loss": 1.6944, + "step": 295500 + }, + { + "epoch": 8.69, + "grad_norm": 
6.686271667480469, + "learning_rate": 0.00021350990909714933, + "loss": 1.7289, + "step": 296000 + }, + { + "epoch": 8.71, + "grad_norm": 2.0325803756713867, + "learning_rate": 0.00021336281710581794, + "loss": 1.7074, + "step": 296500 + }, + { + "epoch": 8.72, + "grad_norm": 2.5102651119232178, + "learning_rate": 0.00021321572511448657, + "loss": 1.7061, + "step": 297000 + }, + { + "epoch": 8.73, + "grad_norm": 5.305497646331787, + "learning_rate": 0.0002130686331231552, + "loss": 1.7047, + "step": 297500 + }, + { + "epoch": 8.75, + "grad_norm": 2.878256320953369, + "learning_rate": 0.00021292154113182384, + "loss": 1.7109, + "step": 298000 + }, + { + "epoch": 8.76, + "grad_norm": 1.7807819843292236, + "learning_rate": 0.00021277444914049245, + "loss": 1.7136, + "step": 298500 + }, + { + "epoch": 8.78, + "grad_norm": 1.8971943855285645, + "learning_rate": 0.00021262735714916106, + "loss": 1.7319, + "step": 299000 + }, + { + "epoch": 8.79, + "grad_norm": 2.650453805923462, + "learning_rate": 0.0002124802651578297, + "loss": 1.7436, + "step": 299500 + }, + { + "epoch": 8.81, + "grad_norm": 3.341278076171875, + "learning_rate": 0.0002123331731664983, + "loss": 1.6982, + "step": 300000 + }, + { + "epoch": 8.82, + "grad_norm": 4.539509296417236, + "learning_rate": 0.00021218608117516693, + "loss": 1.7149, + "step": 300500 + }, + { + "epoch": 8.84, + "grad_norm": 1.6525373458862305, + "learning_rate": 0.00021203898918383554, + "loss": 1.7154, + "step": 301000 + }, + { + "epoch": 8.85, + "grad_norm": 2.1805810928344727, + "learning_rate": 0.00021189189719250418, + "loss": 1.7244, + "step": 301500 + }, + { + "epoch": 8.87, + "grad_norm": 1.623687744140625, + "learning_rate": 0.0002117448052011728, + "loss": 1.7109, + "step": 302000 + }, + { + "epoch": 8.88, + "grad_norm": 1.8576191663742065, + "learning_rate": 0.00021159771320984142, + "loss": 1.71, + "step": 302500 + }, + { + "epoch": 8.9, + "grad_norm": 2.7003326416015625, + "learning_rate": 0.00021145062121851005, + "loss": 1.717, + "step": 303000 + }, + { + "epoch": 8.91, + "grad_norm": 1.5555850267410278, + "learning_rate": 0.00021130352922717866, + "loss": 1.6771, + "step": 303500 + }, + { + "epoch": 8.93, + "grad_norm": 1.8485440015792847, + "learning_rate": 0.00021115643723584727, + "loss": 1.6907, + "step": 304000 + }, + { + "epoch": 8.94, + "grad_norm": 1.9917840957641602, + "learning_rate": 0.0002110093452445159, + "loss": 1.6929, + "step": 304500 + }, + { + "epoch": 8.95, + "grad_norm": 2.54939866065979, + "learning_rate": 0.0002108622532531845, + "loss": 1.7211, + "step": 305000 + }, + { + "epoch": 8.97, + "grad_norm": 4.3445234298706055, + "learning_rate": 0.00021071516126185317, + "loss": 1.7038, + "step": 305500 + }, + { + "epoch": 8.98, + "grad_norm": 2.9025015830993652, + "learning_rate": 0.00021056806927052178, + "loss": 1.6987, + "step": 306000 + }, + { + "epoch": 9.0, + "grad_norm": 3.951500177383423, + "learning_rate": 0.0002104209772791904, + "loss": 1.7063, + "step": 306500 + }, + { + "epoch": 9.01, + "grad_norm": 2.2812907695770264, + "learning_rate": 0.00021027388528785902, + "loss": 1.6754, + "step": 307000 + }, + { + "epoch": 9.03, + "grad_norm": 1.313977837562561, + "learning_rate": 0.00021012679329652763, + "loss": 1.6384, + "step": 307500 + }, + { + "epoch": 9.04, + "grad_norm": 1.1051950454711914, + "learning_rate": 0.00020997970130519624, + "loss": 1.6372, + "step": 308000 + }, + { + "epoch": 9.06, + "grad_norm": 2.6401305198669434, + "learning_rate": 0.00020983260931386487, + "loss": 1.6535, + "step": 308500 + }, 
+ { + "epoch": 9.07, + "grad_norm": 1.477562665939331, + "learning_rate": 0.00020968551732253348, + "loss": 1.6342, + "step": 309000 + }, + { + "epoch": 9.09, + "grad_norm": 1.7968777418136597, + "learning_rate": 0.00020953842533120214, + "loss": 1.6163, + "step": 309500 + }, + { + "epoch": 9.1, + "grad_norm": 1.8777967691421509, + "learning_rate": 0.00020939133333987075, + "loss": 1.6466, + "step": 310000 + }, + { + "epoch": 9.12, + "grad_norm": 1.970590353012085, + "learning_rate": 0.00020924424134853936, + "loss": 1.6481, + "step": 310500 + }, + { + "epoch": 9.13, + "grad_norm": 11.109952926635742, + "learning_rate": 0.000209097149357208, + "loss": 1.6634, + "step": 311000 + }, + { + "epoch": 9.15, + "grad_norm": 1.5415191650390625, + "learning_rate": 0.0002089500573658766, + "loss": 1.637, + "step": 311500 + }, + { + "epoch": 9.16, + "grad_norm": 2.2349135875701904, + "learning_rate": 0.0002088029653745452, + "loss": 1.6373, + "step": 312000 + }, + { + "epoch": 9.18, + "grad_norm": 2.3896191120147705, + "learning_rate": 0.00020865587338321384, + "loss": 1.6318, + "step": 312500 + }, + { + "epoch": 9.19, + "grad_norm": 1.5920429229736328, + "learning_rate": 0.00020850878139188245, + "loss": 1.6383, + "step": 313000 + }, + { + "epoch": 9.2, + "grad_norm": 1.485474944114685, + "learning_rate": 0.0002083616894005511, + "loss": 1.6392, + "step": 313500 + }, + { + "epoch": 9.22, + "grad_norm": 9.920788764953613, + "learning_rate": 0.00020821459740921972, + "loss": 1.6407, + "step": 314000 + }, + { + "epoch": 9.23, + "grad_norm": 5.970518589019775, + "learning_rate": 0.00020806750541788833, + "loss": 1.6389, + "step": 314500 + }, + { + "epoch": 9.25, + "grad_norm": 1.5371183156967163, + "learning_rate": 0.00020792041342655696, + "loss": 1.6485, + "step": 315000 + }, + { + "epoch": 9.26, + "grad_norm": 2.242612600326538, + "learning_rate": 0.00020777332143522557, + "loss": 1.6344, + "step": 315500 + }, + { + "epoch": 9.28, + "grad_norm": 3.137301445007324, + "learning_rate": 0.00020762622944389418, + "loss": 1.6269, + "step": 316000 + }, + { + "epoch": 9.29, + "grad_norm": 3.391427993774414, + "learning_rate": 0.0002074791374525628, + "loss": 1.6575, + "step": 316500 + }, + { + "epoch": 9.31, + "grad_norm": 7.92528772354126, + "learning_rate": 0.00020733204546123142, + "loss": 1.6333, + "step": 317000 + }, + { + "epoch": 9.32, + "grad_norm": 1.0485262870788574, + "learning_rate": 0.00020718495346990008, + "loss": 1.6574, + "step": 317500 + }, + { + "epoch": 9.34, + "grad_norm": 1.6511611938476562, + "learning_rate": 0.0002070378614785687, + "loss": 1.6474, + "step": 318000 + }, + { + "epoch": 9.35, + "grad_norm": 1.3929743766784668, + "learning_rate": 0.0002068907694872373, + "loss": 1.6484, + "step": 318500 + }, + { + "epoch": 9.37, + "grad_norm": 1.0993067026138306, + "learning_rate": 0.00020674367749590593, + "loss": 1.6312, + "step": 319000 + }, + { + "epoch": 9.38, + "grad_norm": 2.8097622394561768, + "learning_rate": 0.00020659658550457454, + "loss": 1.6438, + "step": 319500 + }, + { + "epoch": 9.4, + "grad_norm": 2.274348020553589, + "learning_rate": 0.00020644949351324314, + "loss": 1.6258, + "step": 320000 + }, + { + "epoch": 9.41, + "grad_norm": 1.9707189798355103, + "learning_rate": 0.00020630240152191178, + "loss": 1.6619, + "step": 320500 + }, + { + "epoch": 9.42, + "grad_norm": 2.7805778980255127, + "learning_rate": 0.0002061553095305804, + "loss": 1.6537, + "step": 321000 + }, + { + "epoch": 9.44, + "grad_norm": 2.856705665588379, + "learning_rate": 0.00020600821753924905, + 
"loss": 1.6745, + "step": 321500 + }, + { + "epoch": 9.45, + "grad_norm": 1.3102498054504395, + "learning_rate": 0.00020586112554791766, + "loss": 1.6662, + "step": 322000 + }, + { + "epoch": 9.47, + "grad_norm": 4.1165690422058105, + "learning_rate": 0.00020571403355658626, + "loss": 1.6531, + "step": 322500 + }, + { + "epoch": 9.48, + "grad_norm": 5.66050386428833, + "learning_rate": 0.0002055669415652549, + "loss": 1.669, + "step": 323000 + }, + { + "epoch": 9.5, + "grad_norm": 1.3612234592437744, + "learning_rate": 0.0002054198495739235, + "loss": 1.642, + "step": 323500 + }, + { + "epoch": 9.51, + "grad_norm": 2.5399467945098877, + "learning_rate": 0.00020527275758259214, + "loss": 1.677, + "step": 324000 + }, + { + "epoch": 9.53, + "grad_norm": 5.590959548950195, + "learning_rate": 0.00020512566559126075, + "loss": 1.6673, + "step": 324500 + }, + { + "epoch": 9.54, + "grad_norm": 1.6591867208480835, + "learning_rate": 0.00020497857359992936, + "loss": 1.6523, + "step": 325000 + }, + { + "epoch": 9.56, + "grad_norm": 2.3825442790985107, + "learning_rate": 0.00020483148160859802, + "loss": 1.6472, + "step": 325500 + }, + { + "epoch": 9.57, + "grad_norm": 1.509970784187317, + "learning_rate": 0.00020468438961726663, + "loss": 1.6095, + "step": 326000 + }, + { + "epoch": 9.59, + "grad_norm": 1.0791665315628052, + "learning_rate": 0.00020453729762593526, + "loss": 1.6545, + "step": 326500 + }, + { + "epoch": 9.6, + "grad_norm": 1.480979323387146, + "learning_rate": 0.00020439020563460387, + "loss": 1.6691, + "step": 327000 + }, + { + "epoch": 9.62, + "grad_norm": 1.9298070669174194, + "learning_rate": 0.00020424311364327248, + "loss": 1.6561, + "step": 327500 + }, + { + "epoch": 9.63, + "grad_norm": 2.1420977115631104, + "learning_rate": 0.0002040960216519411, + "loss": 1.6332, + "step": 328000 + }, + { + "epoch": 9.64, + "grad_norm": 1.6322475671768188, + "learning_rate": 0.00020394892966060972, + "loss": 1.6576, + "step": 328500 + }, + { + "epoch": 9.66, + "grad_norm": 1.7701492309570312, + "learning_rate": 0.00020380183766927833, + "loss": 1.6603, + "step": 329000 + }, + { + "epoch": 9.67, + "grad_norm": 22.195663452148438, + "learning_rate": 0.000203654745677947, + "loss": 1.6605, + "step": 329500 + }, + { + "epoch": 9.69, + "grad_norm": 1.735055685043335, + "learning_rate": 0.0002035076536866156, + "loss": 1.6728, + "step": 330000 + }, + { + "epoch": 9.7, + "grad_norm": 1.3548341989517212, + "learning_rate": 0.00020336056169528423, + "loss": 1.6606, + "step": 330500 + }, + { + "epoch": 9.72, + "grad_norm": 2.0401058197021484, + "learning_rate": 0.00020321346970395284, + "loss": 1.6687, + "step": 331000 + }, + { + "epoch": 9.73, + "grad_norm": 16.856937408447266, + "learning_rate": 0.00020306637771262144, + "loss": 1.6369, + "step": 331500 + }, + { + "epoch": 9.75, + "grad_norm": 3.8989596366882324, + "learning_rate": 0.00020291928572129008, + "loss": 1.6454, + "step": 332000 + }, + { + "epoch": 9.76, + "grad_norm": 1.6723474264144897, + "learning_rate": 0.0002027721937299587, + "loss": 1.6723, + "step": 332500 + }, + { + "epoch": 9.78, + "grad_norm": 1.7381031513214111, + "learning_rate": 0.0002026251017386273, + "loss": 1.6672, + "step": 333000 + }, + { + "epoch": 9.79, + "grad_norm": 1.7013987302780151, + "learning_rate": 0.00020247800974729596, + "loss": 1.6666, + "step": 333500 + }, + { + "epoch": 9.81, + "grad_norm": 1.6793595552444458, + "learning_rate": 0.00020233091775596456, + "loss": 1.6594, + "step": 334000 + }, + { + "epoch": 9.82, + "grad_norm": 1.2351815700531006, + 
"learning_rate": 0.0002021838257646332, + "loss": 1.6475, + "step": 334500 + }, + { + "epoch": 9.84, + "grad_norm": 3.5721399784088135, + "learning_rate": 0.0002020367337733018, + "loss": 1.6584, + "step": 335000 + }, + { + "epoch": 9.85, + "grad_norm": 1.5881208181381226, + "learning_rate": 0.0002018896417819704, + "loss": 1.6584, + "step": 335500 + }, + { + "epoch": 9.87, + "grad_norm": 2.2431886196136475, + "learning_rate": 0.00020174254979063905, + "loss": 1.6632, + "step": 336000 + }, + { + "epoch": 9.88, + "grad_norm": 1.9759477376937866, + "learning_rate": 0.00020159545779930766, + "loss": 1.6098, + "step": 336500 + }, + { + "epoch": 9.89, + "grad_norm": 2.4007914066314697, + "learning_rate": 0.00020144836580797626, + "loss": 1.6553, + "step": 337000 + }, + { + "epoch": 9.91, + "grad_norm": 1.1646785736083984, + "learning_rate": 0.00020130127381664492, + "loss": 1.6917, + "step": 337500 + }, + { + "epoch": 9.92, + "grad_norm": 2.7321279048919678, + "learning_rate": 0.00020115418182531353, + "loss": 1.6165, + "step": 338000 + }, + { + "epoch": 9.94, + "grad_norm": 2.1226370334625244, + "learning_rate": 0.00020100708983398217, + "loss": 1.6583, + "step": 338500 + }, + { + "epoch": 9.95, + "grad_norm": 16.655624389648438, + "learning_rate": 0.00020085999784265077, + "loss": 1.6594, + "step": 339000 + }, + { + "epoch": 9.97, + "grad_norm": 6.142359256744385, + "learning_rate": 0.00020071290585131938, + "loss": 1.6343, + "step": 339500 + }, + { + "epoch": 9.98, + "grad_norm": 2.1720430850982666, + "learning_rate": 0.00020056581385998802, + "loss": 1.6509, + "step": 340000 + }, + { + "epoch": 10.0, + "grad_norm": 2.870986223220825, + "learning_rate": 0.00020041872186865662, + "loss": 1.6707, + "step": 340500 + }, + { + "epoch": 10.01, + "grad_norm": 1.382546067237854, + "learning_rate": 0.00020027162987732526, + "loss": 1.5759, + "step": 341000 + }, + { + "epoch": 10.03, + "grad_norm": 1.2603851556777954, + "learning_rate": 0.0002001245378859939, + "loss": 1.5937, + "step": 341500 + }, + { + "epoch": 10.04, + "grad_norm": 2.455425262451172, + "learning_rate": 0.0001999774458946625, + "loss": 1.5859, + "step": 342000 + }, + { + "epoch": 10.06, + "grad_norm": 1.6993873119354248, + "learning_rate": 0.00019983035390333114, + "loss": 1.6102, + "step": 342500 + }, + { + "epoch": 10.07, + "grad_norm": 2.338402509689331, + "learning_rate": 0.00019968326191199974, + "loss": 1.6058, + "step": 343000 + }, + { + "epoch": 10.09, + "grad_norm": 1.4525296688079834, + "learning_rate": 0.00019953616992066838, + "loss": 1.5925, + "step": 343500 + }, + { + "epoch": 10.1, + "grad_norm": 1.7461038827896118, + "learning_rate": 0.00019938907792933699, + "loss": 1.5824, + "step": 344000 + }, + { + "epoch": 10.11, + "grad_norm": 20.686992645263672, + "learning_rate": 0.0001992419859380056, + "loss": 1.6173, + "step": 344500 + }, + { + "epoch": 10.13, + "grad_norm": 11.99207592010498, + "learning_rate": 0.00019909489394667423, + "loss": 1.5998, + "step": 345000 + }, + { + "epoch": 10.14, + "grad_norm": 2.926992177963257, + "learning_rate": 0.00019894780195534286, + "loss": 1.5864, + "step": 345500 + }, + { + "epoch": 10.16, + "grad_norm": 3.7215046882629395, + "learning_rate": 0.0001988007099640115, + "loss": 1.5856, + "step": 346000 + }, + { + "epoch": 10.17, + "grad_norm": 1.7094500064849854, + "learning_rate": 0.0001986536179726801, + "loss": 1.5964, + "step": 346500 + }, + { + "epoch": 10.19, + "grad_norm": 2.5829222202301025, + "learning_rate": 0.0001985065259813487, + "loss": 1.5943, + "step": 347000 + }, + 
{ + "epoch": 10.2, + "grad_norm": 1.6218924522399902, + "learning_rate": 0.00019835943399001735, + "loss": 1.5808, + "step": 347500 + }, + { + "epoch": 10.22, + "grad_norm": 1.952506184577942, + "learning_rate": 0.00019821234199868595, + "loss": 1.5548, + "step": 348000 + }, + { + "epoch": 10.23, + "grad_norm": 2.1660783290863037, + "learning_rate": 0.00019806525000735456, + "loss": 1.585, + "step": 348500 + }, + { + "epoch": 10.25, + "grad_norm": 1.8295018672943115, + "learning_rate": 0.0001979181580160232, + "loss": 1.5975, + "step": 349000 + }, + { + "epoch": 10.26, + "grad_norm": 2.3375236988067627, + "learning_rate": 0.00019777106602469183, + "loss": 1.617, + "step": 349500 + }, + { + "epoch": 10.28, + "grad_norm": 1.323554515838623, + "learning_rate": 0.00019762397403336047, + "loss": 1.5915, + "step": 350000 + }, + { + "epoch": 10.29, + "grad_norm": 1.259614109992981, + "learning_rate": 0.00019747688204202907, + "loss": 1.6126, + "step": 350500 + }, + { + "epoch": 10.31, + "grad_norm": 27.243751525878906, + "learning_rate": 0.00019732979005069768, + "loss": 1.5886, + "step": 351000 + }, + { + "epoch": 10.32, + "grad_norm": 1.4931347370147705, + "learning_rate": 0.00019718269805936632, + "loss": 1.6165, + "step": 351500 + }, + { + "epoch": 10.33, + "grad_norm": 1.3573870658874512, + "learning_rate": 0.00019703560606803492, + "loss": 1.5967, + "step": 352000 + }, + { + "epoch": 10.35, + "grad_norm": 2.393355369567871, + "learning_rate": 0.00019688851407670353, + "loss": 1.6033, + "step": 352500 + }, + { + "epoch": 10.36, + "grad_norm": 17.975017547607422, + "learning_rate": 0.00019674142208537217, + "loss": 1.6176, + "step": 353000 + }, + { + "epoch": 10.38, + "grad_norm": 2.195270538330078, + "learning_rate": 0.0001965943300940408, + "loss": 1.6022, + "step": 353500 + }, + { + "epoch": 10.39, + "grad_norm": 1.5013370513916016, + "learning_rate": 0.00019644723810270944, + "loss": 1.5999, + "step": 354000 + }, + { + "epoch": 10.41, + "grad_norm": 16.84026527404785, + "learning_rate": 0.00019630014611137804, + "loss": 1.6061, + "step": 354500 + }, + { + "epoch": 10.42, + "grad_norm": 2.8341875076293945, + "learning_rate": 0.00019615305412004665, + "loss": 1.5742, + "step": 355000 + }, + { + "epoch": 10.44, + "grad_norm": 1.6109588146209717, + "learning_rate": 0.00019600596212871529, + "loss": 1.5941, + "step": 355500 + }, + { + "epoch": 10.45, + "grad_norm": 5.435859203338623, + "learning_rate": 0.0001958588701373839, + "loss": 1.5875, + "step": 356000 + }, + { + "epoch": 10.47, + "grad_norm": 4.453019142150879, + "learning_rate": 0.0001957117781460525, + "loss": 1.5901, + "step": 356500 + }, + { + "epoch": 10.48, + "grad_norm": 2.145413875579834, + "learning_rate": 0.00019556468615472116, + "loss": 1.5946, + "step": 357000 + }, + { + "epoch": 10.5, + "grad_norm": 2.531158208847046, + "learning_rate": 0.00019541759416338977, + "loss": 1.5865, + "step": 357500 + }, + { + "epoch": 10.51, + "grad_norm": 1.8745834827423096, + "learning_rate": 0.0001952705021720584, + "loss": 1.6033, + "step": 358000 + }, + { + "epoch": 10.53, + "grad_norm": 1.3398380279541016, + "learning_rate": 0.000195123410180727, + "loss": 1.6205, + "step": 358500 + }, + { + "epoch": 10.54, + "grad_norm": 6.710257053375244, + "learning_rate": 0.00019497631818939562, + "loss": 1.6171, + "step": 359000 + }, + { + "epoch": 10.56, + "grad_norm": 1.5463893413543701, + "learning_rate": 0.00019482922619806425, + "loss": 1.5913, + "step": 359500 + }, + { + "epoch": 10.57, + "grad_norm": 1.32403564453125, + "learning_rate": 
0.00019468213420673286, + "loss": 1.6129, + "step": 360000 + }, + { + "epoch": 10.58, + "grad_norm": 1.1276665925979614, + "learning_rate": 0.00019453504221540147, + "loss": 1.6063, + "step": 360500 + }, + { + "epoch": 10.6, + "grad_norm": 3.2467503547668457, + "learning_rate": 0.00019438795022407013, + "loss": 1.6036, + "step": 361000 + }, + { + "epoch": 10.61, + "grad_norm": 2.091618537902832, + "learning_rate": 0.00019424085823273874, + "loss": 1.5919, + "step": 361500 + }, + { + "epoch": 10.63, + "grad_norm": 2.571202516555786, + "learning_rate": 0.00019409376624140737, + "loss": 1.6218, + "step": 362000 + }, + { + "epoch": 10.64, + "grad_norm": 1.7798309326171875, + "learning_rate": 0.00019394667425007598, + "loss": 1.5919, + "step": 362500 + }, + { + "epoch": 10.66, + "grad_norm": 1.96729576587677, + "learning_rate": 0.0001937995822587446, + "loss": 1.6116, + "step": 363000 + }, + { + "epoch": 10.67, + "grad_norm": 3.229706048965454, + "learning_rate": 0.00019365249026741322, + "loss": 1.5993, + "step": 363500 + }, + { + "epoch": 10.69, + "grad_norm": 6.615269184112549, + "learning_rate": 0.00019350539827608183, + "loss": 1.6034, + "step": 364000 + }, + { + "epoch": 10.7, + "grad_norm": 6.847731113433838, + "learning_rate": 0.00019335830628475047, + "loss": 1.5785, + "step": 364500 + }, + { + "epoch": 10.72, + "grad_norm": 1.3055931329727173, + "learning_rate": 0.0001932112142934191, + "loss": 1.6059, + "step": 365000 + }, + { + "epoch": 10.73, + "grad_norm": 5.202936172485352, + "learning_rate": 0.0001930641223020877, + "loss": 1.6145, + "step": 365500 + }, + { + "epoch": 10.75, + "grad_norm": 1.704176902770996, + "learning_rate": 0.00019291703031075634, + "loss": 1.6349, + "step": 366000 + }, + { + "epoch": 10.76, + "grad_norm": 2.4655399322509766, + "learning_rate": 0.00019276993831942495, + "loss": 1.5845, + "step": 366500 + }, + { + "epoch": 10.78, + "grad_norm": 6.769374370574951, + "learning_rate": 0.00019262284632809358, + "loss": 1.6054, + "step": 367000 + }, + { + "epoch": 10.79, + "grad_norm": 3.4355087280273438, + "learning_rate": 0.0001924757543367622, + "loss": 1.6205, + "step": 367500 + }, + { + "epoch": 10.8, + "grad_norm": 1.796799659729004, + "learning_rate": 0.0001923286623454308, + "loss": 1.6138, + "step": 368000 + }, + { + "epoch": 10.82, + "grad_norm": 4.333281993865967, + "learning_rate": 0.00019218157035409943, + "loss": 1.6273, + "step": 368500 + }, + { + "epoch": 10.83, + "grad_norm": 10.447500228881836, + "learning_rate": 0.00019203447836276807, + "loss": 1.6497, + "step": 369000 + }, + { + "epoch": 10.85, + "grad_norm": 1.9053871631622314, + "learning_rate": 0.0001918873863714367, + "loss": 1.5815, + "step": 369500 + }, + { + "epoch": 10.86, + "grad_norm": 6.775355339050293, + "learning_rate": 0.0001917402943801053, + "loss": 1.5986, + "step": 370000 + }, + { + "epoch": 10.88, + "grad_norm": 19.00932502746582, + "learning_rate": 0.00019159320238877392, + "loss": 1.5916, + "step": 370500 + }, + { + "epoch": 10.89, + "grad_norm": 2.1029250621795654, + "learning_rate": 0.00019144611039744255, + "loss": 1.6072, + "step": 371000 + }, + { + "epoch": 10.91, + "grad_norm": 1.3043904304504395, + "learning_rate": 0.00019129901840611116, + "loss": 1.5885, + "step": 371500 + }, + { + "epoch": 10.92, + "grad_norm": 1.8047374486923218, + "learning_rate": 0.00019115192641477977, + "loss": 1.6265, + "step": 372000 + }, + { + "epoch": 10.94, + "grad_norm": 2.591125011444092, + "learning_rate": 0.0001910048344234484, + "loss": 1.6028, + "step": 372500 + }, + { + "epoch": 
10.95, + "grad_norm": 1.8858853578567505, + "learning_rate": 0.00019085774243211704, + "loss": 1.6212, + "step": 373000 + }, + { + "epoch": 10.97, + "grad_norm": 1.4778261184692383, + "learning_rate": 0.00019071065044078567, + "loss": 1.6049, + "step": 373500 + }, + { + "epoch": 10.98, + "grad_norm": 7.7040934562683105, + "learning_rate": 0.00019056355844945428, + "loss": 1.5926, + "step": 374000 + }, + { + "epoch": 11.0, + "grad_norm": 1.5968307256698608, + "learning_rate": 0.0001904164664581229, + "loss": 1.6438, + "step": 374500 + }, + { + "epoch": 11.01, + "grad_norm": 1.4240366220474243, + "learning_rate": 0.00019026937446679152, + "loss": 1.5451, + "step": 375000 + }, + { + "epoch": 11.02, + "grad_norm": 2.0358214378356934, + "learning_rate": 0.00019012228247546013, + "loss": 1.5232, + "step": 375500 + }, + { + "epoch": 11.04, + "grad_norm": 1.2968121767044067, + "learning_rate": 0.00018997519048412874, + "loss": 1.5417, + "step": 376000 + }, + { + "epoch": 11.05, + "grad_norm": 3.3961257934570312, + "learning_rate": 0.00018982809849279737, + "loss": 1.5513, + "step": 376500 + }, + { + "epoch": 11.07, + "grad_norm": 2.7855663299560547, + "learning_rate": 0.000189681006501466, + "loss": 1.5437, + "step": 377000 + }, + { + "epoch": 11.08, + "grad_norm": 5.572605609893799, + "learning_rate": 0.00018953391451013464, + "loss": 1.5246, + "step": 377500 + }, + { + "epoch": 11.1, + "grad_norm": 2.3603007793426514, + "learning_rate": 0.00018938682251880325, + "loss": 1.5246, + "step": 378000 + }, + { + "epoch": 11.11, + "grad_norm": 19.196584701538086, + "learning_rate": 0.00018923973052747186, + "loss": 1.5223, + "step": 378500 + }, + { + "epoch": 11.13, + "grad_norm": 2.4410743713378906, + "learning_rate": 0.0001890926385361405, + "loss": 1.5202, + "step": 379000 + }, + { + "epoch": 11.14, + "grad_norm": 1.8335517644882202, + "learning_rate": 0.0001889455465448091, + "loss": 1.5625, + "step": 379500 + }, + { + "epoch": 11.16, + "grad_norm": 2.2342159748077393, + "learning_rate": 0.0001887984545534777, + "loss": 1.5391, + "step": 380000 + }, + { + "epoch": 11.17, + "grad_norm": 5.949936389923096, + "learning_rate": 0.00018865136256214634, + "loss": 1.569, + "step": 380500 + }, + { + "epoch": 11.19, + "grad_norm": 6.047088623046875, + "learning_rate": 0.00018850427057081498, + "loss": 1.5366, + "step": 381000 + }, + { + "epoch": 11.2, + "grad_norm": 16.05063819885254, + "learning_rate": 0.0001883571785794836, + "loss": 1.5343, + "step": 381500 + }, + { + "epoch": 11.22, + "grad_norm": 1.327913761138916, + "learning_rate": 0.00018821008658815222, + "loss": 1.5641, + "step": 382000 + }, + { + "epoch": 11.23, + "grad_norm": 4.132845878601074, + "learning_rate": 0.00018806299459682083, + "loss": 1.5722, + "step": 382500 + }, + { + "epoch": 11.25, + "grad_norm": 1.9688639640808105, + "learning_rate": 0.00018791590260548946, + "loss": 1.5477, + "step": 383000 + }, + { + "epoch": 11.26, + "grad_norm": 1.6657782793045044, + "learning_rate": 0.00018776881061415807, + "loss": 1.5398, + "step": 383500 + }, + { + "epoch": 11.27, + "grad_norm": 2.9266579151153564, + "learning_rate": 0.00018762171862282668, + "loss": 1.556, + "step": 384000 + }, + { + "epoch": 11.29, + "grad_norm": 5.755486011505127, + "learning_rate": 0.0001874746266314953, + "loss": 1.5415, + "step": 384500 + }, + { + "epoch": 11.3, + "grad_norm": 2.1186070442199707, + "learning_rate": 0.00018732753464016395, + "loss": 1.5379, + "step": 385000 + }, + { + "epoch": 11.32, + "grad_norm": 3.1836068630218506, + "learning_rate": 
0.00018718044264883258, + "loss": 1.5621, + "step": 385500 + }, + { + "epoch": 11.33, + "grad_norm": 4.884739875793457, + "learning_rate": 0.0001870333506575012, + "loss": 1.5791, + "step": 386000 + }, + { + "epoch": 11.35, + "grad_norm": 2.4326329231262207, + "learning_rate": 0.0001868862586661698, + "loss": 1.5647, + "step": 386500 + }, + { + "epoch": 11.36, + "grad_norm": 5.392803192138672, + "learning_rate": 0.00018673916667483843, + "loss": 1.5703, + "step": 387000 + }, + { + "epoch": 11.38, + "grad_norm": 6.940735340118408, + "learning_rate": 0.00018659207468350704, + "loss": 1.5579, + "step": 387500 + }, + { + "epoch": 11.39, + "grad_norm": 2.8021979331970215, + "learning_rate": 0.00018644498269217567, + "loss": 1.5469, + "step": 388000 + }, + { + "epoch": 11.41, + "grad_norm": 1.8717076778411865, + "learning_rate": 0.00018629789070084428, + "loss": 1.5624, + "step": 388500 + }, + { + "epoch": 11.42, + "grad_norm": 3.1023306846618652, + "learning_rate": 0.00018615079870951291, + "loss": 1.5751, + "step": 389000 + }, + { + "epoch": 11.44, + "grad_norm": 1.5794864892959595, + "learning_rate": 0.00018600370671818155, + "loss": 1.5652, + "step": 389500 + }, + { + "epoch": 11.45, + "grad_norm": 3.9795074462890625, + "learning_rate": 0.00018585661472685016, + "loss": 1.5663, + "step": 390000 + }, + { + "epoch": 11.47, + "grad_norm": 1.4186744689941406, + "learning_rate": 0.0001857095227355188, + "loss": 1.5508, + "step": 390500 + }, + { + "epoch": 11.48, + "grad_norm": 3.268982172012329, + "learning_rate": 0.0001855624307441874, + "loss": 1.5666, + "step": 391000 + }, + { + "epoch": 11.49, + "grad_norm": 1.246903419494629, + "learning_rate": 0.000185415338752856, + "loss": 1.5797, + "step": 391500 + }, + { + "epoch": 11.51, + "grad_norm": 1.4437174797058105, + "learning_rate": 0.00018526824676152464, + "loss": 1.562, + "step": 392000 + }, + { + "epoch": 11.52, + "grad_norm": 1.8195850849151611, + "learning_rate": 0.00018512115477019325, + "loss": 1.5748, + "step": 392500 + }, + { + "epoch": 11.54, + "grad_norm": 1.6077349185943604, + "learning_rate": 0.0001849740627788619, + "loss": 1.5628, + "step": 393000 + }, + { + "epoch": 11.55, + "grad_norm": 1.614617943763733, + "learning_rate": 0.00018482697078753052, + "loss": 1.5992, + "step": 393500 + }, + { + "epoch": 11.57, + "grad_norm": 2.860440731048584, + "learning_rate": 0.00018467987879619913, + "loss": 1.5635, + "step": 394000 + }, + { + "epoch": 11.58, + "grad_norm": 4.223182201385498, + "learning_rate": 0.00018453278680486776, + "loss": 1.5678, + "step": 394500 + }, + { + "epoch": 11.6, + "grad_norm": 2.143765687942505, + "learning_rate": 0.00018438569481353637, + "loss": 1.5714, + "step": 395000 + }, + { + "epoch": 11.61, + "grad_norm": 14.07819652557373, + "learning_rate": 0.00018423860282220498, + "loss": 1.5662, + "step": 395500 + }, + { + "epoch": 11.63, + "grad_norm": 1.6693955659866333, + "learning_rate": 0.0001840915108308736, + "loss": 1.5431, + "step": 396000 + }, + { + "epoch": 11.64, + "grad_norm": 2.4783432483673096, + "learning_rate": 0.00018394441883954222, + "loss": 1.5439, + "step": 396500 + }, + { + "epoch": 11.66, + "grad_norm": 2.1813952922821045, + "learning_rate": 0.00018379732684821088, + "loss": 1.5539, + "step": 397000 + }, + { + "epoch": 11.67, + "grad_norm": 4.754596710205078, + "learning_rate": 0.0001836502348568795, + "loss": 1.5675, + "step": 397500 + }, + { + "epoch": 11.69, + "grad_norm": 1.9912656545639038, + "learning_rate": 0.0001835031428655481, + "loss": 1.5909, + "step": 398000 + }, + { + 
"epoch": 11.7, + "grad_norm": 1.5389857292175293, + "learning_rate": 0.00018335605087421673, + "loss": 1.5843, + "step": 398500 + }, + { + "epoch": 11.71, + "grad_norm": 3.8105554580688477, + "learning_rate": 0.00018320895888288534, + "loss": 1.5804, + "step": 399000 + }, + { + "epoch": 11.73, + "grad_norm": 2.6107397079467773, + "learning_rate": 0.00018306186689155394, + "loss": 1.5685, + "step": 399500 + }, + { + "epoch": 11.74, + "grad_norm": 4.173858642578125, + "learning_rate": 0.00018291477490022258, + "loss": 1.5823, + "step": 400000 + }, + { + "epoch": 11.76, + "grad_norm": 1.6713734865188599, + "learning_rate": 0.0001827676829088912, + "loss": 1.5672, + "step": 400500 + }, + { + "epoch": 11.77, + "grad_norm": 4.650341033935547, + "learning_rate": 0.00018262059091755985, + "loss": 1.5297, + "step": 401000 + }, + { + "epoch": 11.79, + "grad_norm": 1.5853548049926758, + "learning_rate": 0.00018247349892622846, + "loss": 1.5814, + "step": 401500 + }, + { + "epoch": 11.8, + "grad_norm": 5.71536111831665, + "learning_rate": 0.00018232640693489706, + "loss": 1.5444, + "step": 402000 + }, + { + "epoch": 11.82, + "grad_norm": 2.5603206157684326, + "learning_rate": 0.0001821793149435657, + "loss": 1.593, + "step": 402500 + }, + { + "epoch": 11.83, + "grad_norm": 2.556755304336548, + "learning_rate": 0.0001820322229522343, + "loss": 1.5717, + "step": 403000 + }, + { + "epoch": 11.85, + "grad_norm": 2.1441986560821533, + "learning_rate": 0.0001818851309609029, + "loss": 1.5766, + "step": 403500 + }, + { + "epoch": 11.86, + "grad_norm": 1.762389898300171, + "learning_rate": 0.00018173803896957155, + "loss": 1.5857, + "step": 404000 + }, + { + "epoch": 11.88, + "grad_norm": 1.7397152185440063, + "learning_rate": 0.00018159094697824016, + "loss": 1.5767, + "step": 404500 + }, + { + "epoch": 11.89, + "grad_norm": 2.2762222290039062, + "learning_rate": 0.00018144385498690882, + "loss": 1.5735, + "step": 405000 + }, + { + "epoch": 11.91, + "grad_norm": 2.9703893661499023, + "learning_rate": 0.00018129676299557742, + "loss": 1.5618, + "step": 405500 + }, + { + "epoch": 11.92, + "grad_norm": 3.4913887977600098, + "learning_rate": 0.00018114967100424603, + "loss": 1.589, + "step": 406000 + }, + { + "epoch": 11.93, + "grad_norm": 1.7856007814407349, + "learning_rate": 0.00018100257901291467, + "loss": 1.5924, + "step": 406500 + }, + { + "epoch": 11.95, + "grad_norm": 5.869511604309082, + "learning_rate": 0.00018085548702158327, + "loss": 1.5516, + "step": 407000 + }, + { + "epoch": 11.96, + "grad_norm": 1.4495360851287842, + "learning_rate": 0.00018070839503025188, + "loss": 1.5721, + "step": 407500 + }, + { + "epoch": 11.98, + "grad_norm": 5.264606475830078, + "learning_rate": 0.00018056130303892052, + "loss": 1.5684, + "step": 408000 + }, + { + "epoch": 11.99, + "grad_norm": 1.1457246541976929, + "learning_rate": 0.00018041421104758912, + "loss": 1.5747, + "step": 408500 + }, + { + "epoch": 12.01, + "grad_norm": 2.9236228466033936, + "learning_rate": 0.00018026711905625779, + "loss": 1.5313, + "step": 409000 + }, + { + "epoch": 12.02, + "grad_norm": 11.91308307647705, + "learning_rate": 0.0001801200270649264, + "loss": 1.4883, + "step": 409500 + }, + { + "epoch": 12.04, + "grad_norm": 1.8393372297286987, + "learning_rate": 0.000179972935073595, + "loss": 1.4828, + "step": 410000 + }, + { + "epoch": 12.05, + "grad_norm": 1.9591150283813477, + "learning_rate": 0.00017982584308226364, + "loss": 1.4832, + "step": 410500 + }, + { + "epoch": 12.07, + "grad_norm": 1.5424935817718506, + "learning_rate": 
0.00017967875109093224, + "loss": 1.4863, + "step": 411000 + }, + { + "epoch": 12.08, + "grad_norm": 4.338646411895752, + "learning_rate": 0.00017953165909960088, + "loss": 1.5376, + "step": 411500 + }, + { + "epoch": 12.1, + "grad_norm": 4.14846134185791, + "learning_rate": 0.00017938456710826949, + "loss": 1.506, + "step": 412000 + }, + { + "epoch": 12.11, + "grad_norm": 6.690493106842041, + "learning_rate": 0.00017923747511693812, + "loss": 1.5015, + "step": 412500 + }, + { + "epoch": 12.13, + "grad_norm": 4.801180362701416, + "learning_rate": 0.00017909038312560676, + "loss": 1.53, + "step": 413000 + }, + { + "epoch": 12.14, + "grad_norm": 1.8997361660003662, + "learning_rate": 0.00017894329113427536, + "loss": 1.5191, + "step": 413500 + }, + { + "epoch": 12.16, + "grad_norm": 19.748411178588867, + "learning_rate": 0.000178796199142944, + "loss": 1.5213, + "step": 414000 + }, + { + "epoch": 12.17, + "grad_norm": 3.8095591068267822, + "learning_rate": 0.0001786491071516126, + "loss": 1.496, + "step": 414500 + }, + { + "epoch": 12.18, + "grad_norm": 3.752877950668335, + "learning_rate": 0.0001785020151602812, + "loss": 1.5014, + "step": 415000 + }, + { + "epoch": 12.2, + "grad_norm": 2.396458625793457, + "learning_rate": 0.00017835492316894985, + "loss": 1.4683, + "step": 415500 + }, + { + "epoch": 12.21, + "grad_norm": 2.928061008453369, + "learning_rate": 0.00017820783117761846, + "loss": 1.5093, + "step": 416000 + }, + { + "epoch": 12.23, + "grad_norm": 6.80102014541626, + "learning_rate": 0.00017806073918628712, + "loss": 1.5085, + "step": 416500 + }, + { + "epoch": 12.24, + "grad_norm": 16.06849479675293, + "learning_rate": 0.00017791364719495572, + "loss": 1.5023, + "step": 417000 + }, + { + "epoch": 12.26, + "grad_norm": 9.823871612548828, + "learning_rate": 0.00017776655520362433, + "loss": 1.5677, + "step": 417500 + }, + { + "epoch": 12.27, + "grad_norm": 1.8364531993865967, + "learning_rate": 0.00017761946321229297, + "loss": 1.5146, + "step": 418000 + }, + { + "epoch": 12.29, + "grad_norm": 2.6699376106262207, + "learning_rate": 0.00017747237122096157, + "loss": 1.5106, + "step": 418500 + }, + { + "epoch": 12.3, + "grad_norm": 7.037272930145264, + "learning_rate": 0.00017732527922963018, + "loss": 1.5276, + "step": 419000 + }, + { + "epoch": 12.32, + "grad_norm": 1.9254209995269775, + "learning_rate": 0.00017717818723829882, + "loss": 1.4907, + "step": 419500 + }, + { + "epoch": 12.33, + "grad_norm": 5.405977249145508, + "learning_rate": 0.00017703109524696742, + "loss": 1.5039, + "step": 420000 + }, + { + "epoch": 12.35, + "grad_norm": 6.036567211151123, + "learning_rate": 0.00017688400325563609, + "loss": 1.5258, + "step": 420500 + }, + { + "epoch": 12.36, + "grad_norm": 3.601088523864746, + "learning_rate": 0.0001767369112643047, + "loss": 1.5316, + "step": 421000 + }, + { + "epoch": 12.38, + "grad_norm": 3.247906446456909, + "learning_rate": 0.0001765898192729733, + "loss": 1.5006, + "step": 421500 + }, + { + "epoch": 12.39, + "grad_norm": 2.138275384902954, + "learning_rate": 0.00017644272728164194, + "loss": 1.5125, + "step": 422000 + }, + { + "epoch": 12.4, + "grad_norm": 2.024502992630005, + "learning_rate": 0.00017629563529031054, + "loss": 1.5323, + "step": 422500 + }, + { + "epoch": 12.42, + "grad_norm": 21.49098777770996, + "learning_rate": 0.00017614854329897915, + "loss": 1.5607, + "step": 423000 + }, + { + "epoch": 12.43, + "grad_norm": 3.5568199157714844, + "learning_rate": 0.00017600145130764779, + "loss": 1.4843, + "step": 423500 + }, + { + "epoch": 12.45, + 
"grad_norm": 1.6709766387939453, + "learning_rate": 0.0001758543593163164, + "loss": 1.5621, + "step": 424000 + }, + { + "epoch": 12.46, + "grad_norm": 2.4930756092071533, + "learning_rate": 0.00017570726732498505, + "loss": 1.5202, + "step": 424500 + }, + { + "epoch": 12.48, + "grad_norm": 1.8919436931610107, + "learning_rate": 0.00017556017533365366, + "loss": 1.4859, + "step": 425000 + }, + { + "epoch": 12.49, + "grad_norm": 1.2928180694580078, + "learning_rate": 0.00017541308334232227, + "loss": 1.5423, + "step": 425500 + }, + { + "epoch": 12.51, + "grad_norm": 3.5068464279174805, + "learning_rate": 0.0001752659913509909, + "loss": 1.5326, + "step": 426000 + }, + { + "epoch": 12.52, + "grad_norm": 2.7532618045806885, + "learning_rate": 0.0001751188993596595, + "loss": 1.5276, + "step": 426500 + }, + { + "epoch": 12.54, + "grad_norm": 1.5163066387176514, + "learning_rate": 0.00017497180736832812, + "loss": 1.5306, + "step": 427000 + }, + { + "epoch": 12.55, + "grad_norm": 21.672080993652344, + "learning_rate": 0.00017482471537699675, + "loss": 1.5018, + "step": 427500 + }, + { + "epoch": 12.57, + "grad_norm": 4.169219970703125, + "learning_rate": 0.00017467762338566536, + "loss": 1.5314, + "step": 428000 + }, + { + "epoch": 12.58, + "grad_norm": 1.7433300018310547, + "learning_rate": 0.00017453053139433402, + "loss": 1.5408, + "step": 428500 + }, + { + "epoch": 12.6, + "grad_norm": 1.6866792440414429, + "learning_rate": 0.00017438343940300263, + "loss": 1.5405, + "step": 429000 + }, + { + "epoch": 12.61, + "grad_norm": 12.107425689697266, + "learning_rate": 0.00017423634741167124, + "loss": 1.5375, + "step": 429500 + }, + { + "epoch": 12.62, + "grad_norm": 2.0178098678588867, + "learning_rate": 0.00017408925542033987, + "loss": 1.5137, + "step": 430000 + }, + { + "epoch": 12.64, + "grad_norm": 3.7658607959747314, + "learning_rate": 0.00017394216342900848, + "loss": 1.5138, + "step": 430500 + }, + { + "epoch": 12.65, + "grad_norm": 2.0998480319976807, + "learning_rate": 0.00017379507143767712, + "loss": 1.5005, + "step": 431000 + }, + { + "epoch": 12.67, + "grad_norm": 1.7215161323547363, + "learning_rate": 0.00017364797944634572, + "loss": 1.5083, + "step": 431500 + }, + { + "epoch": 12.68, + "grad_norm": 3.4021968841552734, + "learning_rate": 0.00017350088745501433, + "loss": 1.5103, + "step": 432000 + }, + { + "epoch": 12.7, + "grad_norm": 5.235382080078125, + "learning_rate": 0.000173353795463683, + "loss": 1.5491, + "step": 432500 + }, + { + "epoch": 12.71, + "grad_norm": 5.455018997192383, + "learning_rate": 0.0001732067034723516, + "loss": 1.5486, + "step": 433000 + }, + { + "epoch": 12.73, + "grad_norm": 2.4046807289123535, + "learning_rate": 0.00017305961148102024, + "loss": 1.5241, + "step": 433500 + }, + { + "epoch": 12.74, + "grad_norm": 2.2005653381347656, + "learning_rate": 0.00017291251948968884, + "loss": 1.5392, + "step": 434000 + }, + { + "epoch": 12.76, + "grad_norm": 1.7510486841201782, + "learning_rate": 0.00017276542749835745, + "loss": 1.5076, + "step": 434500 + }, + { + "epoch": 12.77, + "grad_norm": 2.4069275856018066, + "learning_rate": 0.00017261833550702608, + "loss": 1.5189, + "step": 435000 + }, + { + "epoch": 12.79, + "grad_norm": 3.2867445945739746, + "learning_rate": 0.0001724712435156947, + "loss": 1.5082, + "step": 435500 + }, + { + "epoch": 12.8, + "grad_norm": 1.321496605873108, + "learning_rate": 0.0001723241515243633, + "loss": 1.5364, + "step": 436000 + }, + { + "epoch": 12.82, + "grad_norm": 2.841768980026245, + "learning_rate": 
0.00017217705953303196, + "loss": 1.5062, + "step": 436500 + }, + { + "epoch": 12.83, + "grad_norm": 2.678611993789673, + "learning_rate": 0.00017202996754170057, + "loss": 1.5048, + "step": 437000 + }, + { + "epoch": 12.85, + "grad_norm": 3.197082996368408, + "learning_rate": 0.0001718828755503692, + "loss": 1.5553, + "step": 437500 + }, + { + "epoch": 12.86, + "grad_norm": 2.398364543914795, + "learning_rate": 0.0001717357835590378, + "loss": 1.4866, + "step": 438000 + }, + { + "epoch": 12.87, + "grad_norm": 2.2497568130493164, + "learning_rate": 0.00017158869156770642, + "loss": 1.504, + "step": 438500 + }, + { + "epoch": 12.89, + "grad_norm": 3.9616646766662598, + "learning_rate": 0.00017144159957637505, + "loss": 1.5049, + "step": 439000 + }, + { + "epoch": 12.9, + "grad_norm": 1.5139656066894531, + "learning_rate": 0.00017129450758504366, + "loss": 1.549, + "step": 439500 + }, + { + "epoch": 12.92, + "grad_norm": 2.305665969848633, + "learning_rate": 0.00017114741559371227, + "loss": 1.5255, + "step": 440000 + }, + { + "epoch": 12.93, + "grad_norm": 2.3823301792144775, + "learning_rate": 0.00017100032360238093, + "loss": 1.4995, + "step": 440500 + }, + { + "epoch": 12.95, + "grad_norm": 1.6001240015029907, + "learning_rate": 0.00017085323161104954, + "loss": 1.5494, + "step": 441000 + }, + { + "epoch": 12.96, + "grad_norm": 2.843163013458252, + "learning_rate": 0.00017070613961971817, + "loss": 1.5394, + "step": 441500 + }, + { + "epoch": 12.98, + "grad_norm": 2.1387860774993896, + "learning_rate": 0.00017055904762838678, + "loss": 1.5456, + "step": 442000 + }, + { + "epoch": 12.99, + "grad_norm": 4.817657470703125, + "learning_rate": 0.0001704119556370554, + "loss": 1.5453, + "step": 442500 + }, + { + "epoch": 13.01, + "grad_norm": 1.376753807067871, + "learning_rate": 0.00017026486364572402, + "loss": 1.5093, + "step": 443000 + }, + { + "epoch": 13.02, + "grad_norm": 1.1947274208068848, + "learning_rate": 0.00017011777165439263, + "loss": 1.457, + "step": 443500 + }, + { + "epoch": 13.04, + "grad_norm": 3.931847333908081, + "learning_rate": 0.00016997067966306124, + "loss": 1.4286, + "step": 444000 + }, + { + "epoch": 13.05, + "grad_norm": 14.636221885681152, + "learning_rate": 0.0001698235876717299, + "loss": 1.4712, + "step": 444500 + }, + { + "epoch": 13.07, + "grad_norm": 3.1448845863342285, + "learning_rate": 0.0001696764956803985, + "loss": 1.4628, + "step": 445000 + }, + { + "epoch": 13.08, + "grad_norm": 0.9527159929275513, + "learning_rate": 0.00016952940368906714, + "loss": 1.4179, + "step": 445500 + }, + { + "epoch": 13.09, + "grad_norm": 2.0560357570648193, + "learning_rate": 0.00016938231169773575, + "loss": 1.4716, + "step": 446000 + }, + { + "epoch": 13.11, + "grad_norm": 2.8239283561706543, + "learning_rate": 0.00016923521970640436, + "loss": 1.477, + "step": 446500 + }, + { + "epoch": 13.12, + "grad_norm": 3.246338367462158, + "learning_rate": 0.000169088127715073, + "loss": 1.4739, + "step": 447000 + }, + { + "epoch": 13.14, + "grad_norm": 4.943697929382324, + "learning_rate": 0.0001689410357237416, + "loss": 1.4772, + "step": 447500 + }, + { + "epoch": 13.15, + "grad_norm": 4.003148555755615, + "learning_rate": 0.0001687939437324102, + "loss": 1.4723, + "step": 448000 + }, + { + "epoch": 13.17, + "grad_norm": 1.5625710487365723, + "learning_rate": 0.00016864685174107887, + "loss": 1.4424, + "step": 448500 + }, + { + "epoch": 13.18, + "grad_norm": 2.3796448707580566, + "learning_rate": 0.00016849975974974748, + "loss": 1.512, + "step": 449000 + }, + { + "epoch": 
13.2, + "grad_norm": 2.314465284347534, + "learning_rate": 0.0001683526677584161, + "loss": 1.5012, + "step": 449500 + }, + { + "epoch": 13.21, + "grad_norm": 1.8954664468765259, + "learning_rate": 0.00016820557576708472, + "loss": 1.4634, + "step": 450000 + }, + { + "epoch": 13.23, + "grad_norm": 1.6504507064819336, + "learning_rate": 0.00016805848377575333, + "loss": 1.496, + "step": 450500 + }, + { + "epoch": 13.24, + "grad_norm": 1.8685587644577026, + "learning_rate": 0.00016791139178442196, + "loss": 1.5054, + "step": 451000 + }, + { + "epoch": 13.26, + "grad_norm": 3.0073273181915283, + "learning_rate": 0.00016776429979309057, + "loss": 1.4714, + "step": 451500 + }, + { + "epoch": 13.27, + "grad_norm": 1.5322887897491455, + "learning_rate": 0.0001676172078017592, + "loss": 1.4683, + "step": 452000 + }, + { + "epoch": 13.29, + "grad_norm": 10.244864463806152, + "learning_rate": 0.00016747011581042784, + "loss": 1.4834, + "step": 452500 + }, + { + "epoch": 13.3, + "grad_norm": 2.7981672286987305, + "learning_rate": 0.00016732302381909645, + "loss": 1.4919, + "step": 453000 + }, + { + "epoch": 13.31, + "grad_norm": 1.3389350175857544, + "learning_rate": 0.00016717593182776508, + "loss": 1.4589, + "step": 453500 + }, + { + "epoch": 13.33, + "grad_norm": 2.1007466316223145, + "learning_rate": 0.0001670288398364337, + "loss": 1.4844, + "step": 454000 + }, + { + "epoch": 13.34, + "grad_norm": 1.9225057363510132, + "learning_rate": 0.00016688174784510232, + "loss": 1.479, + "step": 454500 + }, + { + "epoch": 13.36, + "grad_norm": 6.121819972991943, + "learning_rate": 0.00016673465585377093, + "loss": 1.4713, + "step": 455000 + }, + { + "epoch": 13.37, + "grad_norm": 2.738173246383667, + "learning_rate": 0.00016658756386243954, + "loss": 1.5046, + "step": 455500 + }, + { + "epoch": 13.39, + "grad_norm": 1.3760331869125366, + "learning_rate": 0.00016644047187110817, + "loss": 1.4919, + "step": 456000 + }, + { + "epoch": 13.4, + "grad_norm": 9.234387397766113, + "learning_rate": 0.0001662933798797768, + "loss": 1.4531, + "step": 456500 + }, + { + "epoch": 13.42, + "grad_norm": 1.6355879306793213, + "learning_rate": 0.00016614628788844544, + "loss": 1.442, + "step": 457000 + }, + { + "epoch": 13.43, + "grad_norm": 5.203847885131836, + "learning_rate": 0.00016599919589711405, + "loss": 1.4672, + "step": 457500 + }, + { + "epoch": 13.45, + "grad_norm": 5.590662956237793, + "learning_rate": 0.00016585210390578266, + "loss": 1.459, + "step": 458000 + }, + { + "epoch": 13.46, + "grad_norm": 1.945825457572937, + "learning_rate": 0.0001657050119144513, + "loss": 1.4632, + "step": 458500 + }, + { + "epoch": 13.48, + "grad_norm": 5.772292137145996, + "learning_rate": 0.0001655579199231199, + "loss": 1.4581, + "step": 459000 + }, + { + "epoch": 13.49, + "grad_norm": 1.2538419961929321, + "learning_rate": 0.0001654108279317885, + "loss": 1.4735, + "step": 459500 + }, + { + "epoch": 13.51, + "grad_norm": 5.774242401123047, + "learning_rate": 0.00016526373594045714, + "loss": 1.4833, + "step": 460000 + }, + { + "epoch": 13.52, + "grad_norm": 1.8276828527450562, + "learning_rate": 0.00016511664394912578, + "loss": 1.5017, + "step": 460500 + }, + { + "epoch": 13.54, + "grad_norm": 2.792278528213501, + "learning_rate": 0.0001649695519577944, + "loss": 1.4677, + "step": 461000 + }, + { + "epoch": 13.55, + "grad_norm": 2.325228691101074, + "learning_rate": 0.00016482245996646302, + "loss": 1.482, + "step": 461500 + }, + { + "epoch": 13.56, + "grad_norm": 2.3256492614746094, + "learning_rate": 
0.00016467536797513163, + "loss": 1.4808, + "step": 462000 + }, + { + "epoch": 13.58, + "grad_norm": 1.2064257860183716, + "learning_rate": 0.00016452827598380026, + "loss": 1.4695, + "step": 462500 + }, + { + "epoch": 13.59, + "grad_norm": 3.7189693450927734, + "learning_rate": 0.00016438118399246887, + "loss": 1.4593, + "step": 463000 + }, + { + "epoch": 13.61, + "grad_norm": 1.6200836896896362, + "learning_rate": 0.00016423409200113748, + "loss": 1.5176, + "step": 463500 + }, + { + "epoch": 13.62, + "grad_norm": 3.3271920680999756, + "learning_rate": 0.0001640870000098061, + "loss": 1.4941, + "step": 464000 + }, + { + "epoch": 13.64, + "grad_norm": 1.7106289863586426, + "learning_rate": 0.00016393990801847474, + "loss": 1.4414, + "step": 464500 + }, + { + "epoch": 13.65, + "grad_norm": 2.1748921871185303, + "learning_rate": 0.00016379281602714338, + "loss": 1.4887, + "step": 465000 + }, + { + "epoch": 13.67, + "grad_norm": 1.4012078046798706, + "learning_rate": 0.000163645724035812, + "loss": 1.4773, + "step": 465500 + }, + { + "epoch": 13.68, + "grad_norm": 1.6043895483016968, + "learning_rate": 0.0001634986320444806, + "loss": 1.5182, + "step": 466000 + }, + { + "epoch": 13.7, + "grad_norm": 1.53175950050354, + "learning_rate": 0.00016335154005314923, + "loss": 1.4939, + "step": 466500 + }, + { + "epoch": 13.71, + "grad_norm": 7.83038854598999, + "learning_rate": 0.00016320444806181784, + "loss": 1.4997, + "step": 467000 + }, + { + "epoch": 13.73, + "grad_norm": 20.90247344970703, + "learning_rate": 0.00016305735607048644, + "loss": 1.4717, + "step": 467500 + }, + { + "epoch": 13.74, + "grad_norm": 1.7639943361282349, + "learning_rate": 0.00016291026407915508, + "loss": 1.4682, + "step": 468000 + }, + { + "epoch": 13.76, + "grad_norm": 1.8345731496810913, + "learning_rate": 0.00016276317208782371, + "loss": 1.4941, + "step": 468500 + }, + { + "epoch": 13.77, + "grad_norm": 13.04284381866455, + "learning_rate": 0.00016261608009649235, + "loss": 1.508, + "step": 469000 + }, + { + "epoch": 13.78, + "grad_norm": 3.476304531097412, + "learning_rate": 0.00016246898810516096, + "loss": 1.4849, + "step": 469500 + }, + { + "epoch": 13.8, + "grad_norm": 76.50082397460938, + "learning_rate": 0.00016232189611382956, + "loss": 1.5388, + "step": 470000 + }, + { + "epoch": 13.81, + "grad_norm": 2.03076171875, + "learning_rate": 0.0001621748041224982, + "loss": 1.4964, + "step": 470500 + }, + { + "epoch": 13.83, + "grad_norm": 5.736513137817383, + "learning_rate": 0.0001620277121311668, + "loss": 1.5211, + "step": 471000 + }, + { + "epoch": 13.84, + "grad_norm": 1.7192264795303345, + "learning_rate": 0.0001618806201398354, + "loss": 1.4806, + "step": 471500 + }, + { + "epoch": 13.86, + "grad_norm": 1.4231237173080444, + "learning_rate": 0.00016173352814850408, + "loss": 1.465, + "step": 472000 + }, + { + "epoch": 13.87, + "grad_norm": 1.2601704597473145, + "learning_rate": 0.00016158643615717268, + "loss": 1.5081, + "step": 472500 + }, + { + "epoch": 13.89, + "grad_norm": 1.2420154809951782, + "learning_rate": 0.00016143934416584132, + "loss": 1.5014, + "step": 473000 + }, + { + "epoch": 13.9, + "grad_norm": 2.446563720703125, + "learning_rate": 0.00016129225217450993, + "loss": 1.4975, + "step": 473500 + }, + { + "epoch": 13.92, + "grad_norm": 2.2312917709350586, + "learning_rate": 0.00016114516018317853, + "loss": 1.4827, + "step": 474000 + }, + { + "epoch": 13.93, + "grad_norm": 2.2286717891693115, + "learning_rate": 0.00016099806819184717, + "loss": 1.4932, + "step": 474500 + }, + { + "epoch": 
13.95, + "grad_norm": 3.0326056480407715, + "learning_rate": 0.00016085097620051577, + "loss": 1.5015, + "step": 475000 + }, + { + "epoch": 13.96, + "grad_norm": 4.350718975067139, + "learning_rate": 0.0001607038842091844, + "loss": 1.4917, + "step": 475500 + }, + { + "epoch": 13.98, + "grad_norm": 1.2791180610656738, + "learning_rate": 0.00016055679221785304, + "loss": 1.5066, + "step": 476000 + }, + { + "epoch": 13.99, + "grad_norm": 2.8099005222320557, + "learning_rate": 0.00016040970022652165, + "loss": 1.4746, + "step": 476500 + }, + { + "epoch": 14.0, + "grad_norm": 2.087524175643921, + "learning_rate": 0.0001602626082351903, + "loss": 1.4278, + "step": 477000 + }, + { + "epoch": 14.02, + "grad_norm": 5.598818778991699, + "learning_rate": 0.0001601155162438589, + "loss": 1.4266, + "step": 477500 + }, + { + "epoch": 14.03, + "grad_norm": 1.573020100593567, + "learning_rate": 0.00015996842425252753, + "loss": 1.4198, + "step": 478000 + }, + { + "epoch": 14.05, + "grad_norm": 2.8413734436035156, + "learning_rate": 0.00015982133226119614, + "loss": 1.421, + "step": 478500 + }, + { + "epoch": 14.06, + "grad_norm": 1.3834174871444702, + "learning_rate": 0.00015967424026986474, + "loss": 1.4446, + "step": 479000 + }, + { + "epoch": 14.08, + "grad_norm": 1.9328341484069824, + "learning_rate": 0.00015952714827853338, + "loss": 1.4332, + "step": 479500 + }, + { + "epoch": 14.09, + "grad_norm": 4.331046104431152, + "learning_rate": 0.000159380056287202, + "loss": 1.4373, + "step": 480000 + }, + { + "epoch": 14.11, + "grad_norm": 8.744680404663086, + "learning_rate": 0.00015923296429587065, + "loss": 1.4372, + "step": 480500 + }, + { + "epoch": 14.12, + "grad_norm": 1.4437713623046875, + "learning_rate": 0.00015908587230453926, + "loss": 1.467, + "step": 481000 + }, + { + "epoch": 14.14, + "grad_norm": 1.5478661060333252, + "learning_rate": 0.00015893878031320786, + "loss": 1.4209, + "step": 481500 + }, + { + "epoch": 14.15, + "grad_norm": 12.385702133178711, + "learning_rate": 0.0001587916883218765, + "loss": 1.4529, + "step": 482000 + }, + { + "epoch": 14.17, + "grad_norm": 1.585307002067566, + "learning_rate": 0.0001586445963305451, + "loss": 1.442, + "step": 482500 + }, + { + "epoch": 14.18, + "grad_norm": 2.214268207550049, + "learning_rate": 0.0001584975043392137, + "loss": 1.4349, + "step": 483000 + }, + { + "epoch": 14.2, + "grad_norm": 9.036513328552246, + "learning_rate": 0.00015835041234788235, + "loss": 1.4179, + "step": 483500 + }, + { + "epoch": 14.21, + "grad_norm": 2.498725175857544, + "learning_rate": 0.00015820332035655098, + "loss": 1.4394, + "step": 484000 + }, + { + "epoch": 14.23, + "grad_norm": 7.920095920562744, + "learning_rate": 0.00015805622836521962, + "loss": 1.4494, + "step": 484500 + }, + { + "epoch": 14.24, + "grad_norm": 2.6044135093688965, + "learning_rate": 0.00015790913637388822, + "loss": 1.4159, + "step": 485000 + }, + { + "epoch": 14.25, + "grad_norm": 1.906570315361023, + "learning_rate": 0.00015776204438255683, + "loss": 1.446, + "step": 485500 + }, + { + "epoch": 14.27, + "grad_norm": 1.8452608585357666, + "learning_rate": 0.00015761495239122547, + "loss": 1.3961, + "step": 486000 + }, + { + "epoch": 14.28, + "grad_norm": 1.6884217262268066, + "learning_rate": 0.00015746786039989407, + "loss": 1.4385, + "step": 486500 + }, + { + "epoch": 14.3, + "grad_norm": 1.6386513710021973, + "learning_rate": 0.00015732076840856268, + "loss": 1.3842, + "step": 487000 + }, + { + "epoch": 14.31, + "grad_norm": 5.906566619873047, + "learning_rate": 
0.00015717367641723132, + "loss": 1.4241, + "step": 487500 + }, + { + "epoch": 14.33, + "grad_norm": 2.964063882827759, + "learning_rate": 0.00015702658442589995, + "loss": 1.4454, + "step": 488000 + }, + { + "epoch": 14.34, + "grad_norm": 1.53037428855896, + "learning_rate": 0.00015687949243456859, + "loss": 1.4067, + "step": 488500 + }, + { + "epoch": 14.36, + "grad_norm": 1.5273714065551758, + "learning_rate": 0.0001567324004432372, + "loss": 1.4382, + "step": 489000 + }, + { + "epoch": 14.37, + "grad_norm": 1.7097736597061157, + "learning_rate": 0.0001565853084519058, + "loss": 1.4167, + "step": 489500 + }, + { + "epoch": 14.39, + "grad_norm": 2.170961380004883, + "learning_rate": 0.00015643821646057444, + "loss": 1.4541, + "step": 490000 + }, + { + "epoch": 14.4, + "grad_norm": 5.699864387512207, + "learning_rate": 0.00015629112446924304, + "loss": 1.407, + "step": 490500 + }, + { + "epoch": 14.42, + "grad_norm": 1.7430428266525269, + "learning_rate": 0.00015614403247791165, + "loss": 1.4418, + "step": 491000 + }, + { + "epoch": 14.43, + "grad_norm": 2.179090976715088, + "learning_rate": 0.00015599694048658029, + "loss": 1.4331, + "step": 491500 + }, + { + "epoch": 14.45, + "grad_norm": 1.534424901008606, + "learning_rate": 0.00015584984849524892, + "loss": 1.4536, + "step": 492000 + }, + { + "epoch": 14.46, + "grad_norm": 1.464789867401123, + "learning_rate": 0.00015570275650391755, + "loss": 1.4459, + "step": 492500 + }, + { + "epoch": 14.47, + "grad_norm": 1.9043675661087036, + "learning_rate": 0.00015555566451258616, + "loss": 1.4396, + "step": 493000 + }, + { + "epoch": 14.49, + "grad_norm": 5.7054762840271, + "learning_rate": 0.00015540857252125477, + "loss": 1.4243, + "step": 493500 + }, + { + "epoch": 14.5, + "grad_norm": 4.648599624633789, + "learning_rate": 0.0001552614805299234, + "loss": 1.4607, + "step": 494000 + }, + { + "epoch": 14.52, + "grad_norm": 1.4880977869033813, + "learning_rate": 0.000155114388538592, + "loss": 1.4457, + "step": 494500 + }, + { + "epoch": 14.53, + "grad_norm": 2.5349297523498535, + "learning_rate": 0.00015496729654726062, + "loss": 1.4675, + "step": 495000 + }, + { + "epoch": 14.55, + "grad_norm": 3.5980210304260254, + "learning_rate": 0.00015482020455592925, + "loss": 1.4704, + "step": 495500 + }, + { + "epoch": 14.56, + "grad_norm": 3.0215611457824707, + "learning_rate": 0.0001546731125645979, + "loss": 1.4202, + "step": 496000 + }, + { + "epoch": 14.58, + "grad_norm": 1.6257708072662354, + "learning_rate": 0.00015452602057326652, + "loss": 1.4569, + "step": 496500 + }, + { + "epoch": 14.59, + "grad_norm": 3.856419324874878, + "learning_rate": 0.00015437892858193513, + "loss": 1.4142, + "step": 497000 + }, + { + "epoch": 14.61, + "grad_norm": 4.544717788696289, + "learning_rate": 0.00015423183659060374, + "loss": 1.4437, + "step": 497500 + }, + { + "epoch": 14.62, + "grad_norm": 3.5249671936035156, + "learning_rate": 0.00015408474459927237, + "loss": 1.4432, + "step": 498000 + }, + { + "epoch": 14.64, + "grad_norm": 1.8191946744918823, + "learning_rate": 0.00015393765260794098, + "loss": 1.435, + "step": 498500 + }, + { + "epoch": 14.65, + "grad_norm": 2.6009254455566406, + "learning_rate": 0.00015379056061660962, + "loss": 1.411, + "step": 499000 + }, + { + "epoch": 14.67, + "grad_norm": 2.475637674331665, + "learning_rate": 0.00015364346862527822, + "loss": 1.4568, + "step": 499500 + }, + { + "epoch": 14.68, + "grad_norm": 3.965743064880371, + "learning_rate": 0.00015349637663394686, + "loss": 1.455, + "step": 500000 + }, + { + "epoch": 
14.69, + "grad_norm": 1.3109550476074219, + "learning_rate": 0.0001533492846426155, + "loss": 1.4547, + "step": 500500 + }, + { + "epoch": 14.71, + "grad_norm": 2.0032811164855957, + "learning_rate": 0.0001532021926512841, + "loss": 1.4451, + "step": 501000 + }, + { + "epoch": 14.72, + "grad_norm": 1.4996896982192993, + "learning_rate": 0.00015305510065995274, + "loss": 1.4497, + "step": 501500 + }, + { + "epoch": 14.74, + "grad_norm": 2.1643595695495605, + "learning_rate": 0.00015290800866862134, + "loss": 1.4335, + "step": 502000 + }, + { + "epoch": 14.75, + "grad_norm": 1.5510461330413818, + "learning_rate": 0.00015276091667728995, + "loss": 1.44, + "step": 502500 + }, + { + "epoch": 14.77, + "grad_norm": 1.9841787815093994, + "learning_rate": 0.00015261382468595858, + "loss": 1.392, + "step": 503000 + }, + { + "epoch": 14.78, + "grad_norm": 7.774050712585449, + "learning_rate": 0.0001524667326946272, + "loss": 1.4521, + "step": 503500 + }, + { + "epoch": 14.8, + "grad_norm": 1.3330835103988647, + "learning_rate": 0.00015231964070329585, + "loss": 1.4562, + "step": 504000 + }, + { + "epoch": 14.81, + "grad_norm": 2.6969919204711914, + "learning_rate": 0.00015217254871196446, + "loss": 1.4313, + "step": 504500 + }, + { + "epoch": 14.83, + "grad_norm": 1.6068675518035889, + "learning_rate": 0.00015202545672063307, + "loss": 1.4515, + "step": 505000 + }, + { + "epoch": 14.84, + "grad_norm": 3.470465660095215, + "learning_rate": 0.0001518783647293017, + "loss": 1.418, + "step": 505500 + }, + { + "epoch": 14.86, + "grad_norm": 5.018684387207031, + "learning_rate": 0.0001517312727379703, + "loss": 1.4236, + "step": 506000 + }, + { + "epoch": 14.87, + "grad_norm": 1.603996753692627, + "learning_rate": 0.00015158418074663892, + "loss": 1.4478, + "step": 506500 + }, + { + "epoch": 14.89, + "grad_norm": 2.150404930114746, + "learning_rate": 0.00015143708875530755, + "loss": 1.4523, + "step": 507000 + }, + { + "epoch": 14.9, + "grad_norm": 1.252964735031128, + "learning_rate": 0.00015128999676397616, + "loss": 1.4546, + "step": 507500 + }, + { + "epoch": 14.92, + "grad_norm": 3.1075639724731445, + "learning_rate": 0.00015114290477264482, + "loss": 1.4547, + "step": 508000 + }, + { + "epoch": 14.93, + "grad_norm": 1.4597883224487305, + "learning_rate": 0.00015099581278131343, + "loss": 1.4902, + "step": 508500 + }, + { + "epoch": 14.94, + "grad_norm": 2.405596971511841, + "learning_rate": 0.00015084872078998204, + "loss": 1.4306, + "step": 509000 + }, + { + "epoch": 14.96, + "grad_norm": 1.952222228050232, + "learning_rate": 0.00015070162879865067, + "loss": 1.4209, + "step": 509500 + }, + { + "epoch": 14.97, + "grad_norm": 2.3458521366119385, + "learning_rate": 0.00015055453680731928, + "loss": 1.468, + "step": 510000 + }, + { + "epoch": 14.99, + "grad_norm": 1.6442558765411377, + "learning_rate": 0.0001504074448159879, + "loss": 1.4541, + "step": 510500 + }, + { + "epoch": 15.0, + "grad_norm": 2.1197829246520996, + "learning_rate": 0.00015026035282465652, + "loss": 1.4195, + "step": 511000 + }, + { + "epoch": 15.02, + "grad_norm": 2.8808157444000244, + "learning_rate": 0.00015011326083332513, + "loss": 1.3719, + "step": 511500 + }, + { + "epoch": 15.03, + "grad_norm": 1.939942717552185, + "learning_rate": 0.00014996616884199377, + "loss": 1.3627, + "step": 512000 + }, + { + "epoch": 15.05, + "grad_norm": 4.798866271972656, + "learning_rate": 0.00014981907685066237, + "loss": 1.3839, + "step": 512500 + }, + { + "epoch": 15.06, + "grad_norm": 2.553893566131592, + "learning_rate": 
0.000149671984859331, + "loss": 1.4047, + "step": 513000 + }, + { + "epoch": 15.08, + "grad_norm": 1.915277123451233, + "learning_rate": 0.00014952489286799964, + "loss": 1.3933, + "step": 513500 + }, + { + "epoch": 15.09, + "grad_norm": 1.5252777338027954, + "learning_rate": 0.00014937780087666825, + "loss": 1.3644, + "step": 514000 + }, + { + "epoch": 15.11, + "grad_norm": 1.6960341930389404, + "learning_rate": 0.00014923070888533686, + "loss": 1.3946, + "step": 514500 + }, + { + "epoch": 15.12, + "grad_norm": 9.384785652160645, + "learning_rate": 0.0001490836168940055, + "loss": 1.3706, + "step": 515000 + }, + { + "epoch": 15.14, + "grad_norm": 2.280630350112915, + "learning_rate": 0.00014893652490267413, + "loss": 1.3836, + "step": 515500 + }, + { + "epoch": 15.15, + "grad_norm": 1.8966234922409058, + "learning_rate": 0.00014878943291134273, + "loss": 1.373, + "step": 516000 + }, + { + "epoch": 15.16, + "grad_norm": 2.4201481342315674, + "learning_rate": 0.00014864234092001134, + "loss": 1.4014, + "step": 516500 + }, + { + "epoch": 15.18, + "grad_norm": 2.8996338844299316, + "learning_rate": 0.00014849524892867998, + "loss": 1.3985, + "step": 517000 + }, + { + "epoch": 15.19, + "grad_norm": 2.2007150650024414, + "learning_rate": 0.0001483481569373486, + "loss": 1.3823, + "step": 517500 + }, + { + "epoch": 15.21, + "grad_norm": 2.7956199645996094, + "learning_rate": 0.00014820106494601722, + "loss": 1.3936, + "step": 518000 + }, + { + "epoch": 15.22, + "grad_norm": 2.891369104385376, + "learning_rate": 0.00014805397295468585, + "loss": 1.3991, + "step": 518500 + }, + { + "epoch": 15.24, + "grad_norm": 7.358971118927002, + "learning_rate": 0.00014790688096335446, + "loss": 1.4115, + "step": 519000 + }, + { + "epoch": 15.25, + "grad_norm": 2.1062731742858887, + "learning_rate": 0.0001477597889720231, + "loss": 1.398, + "step": 519500 + }, + { + "epoch": 15.27, + "grad_norm": 3.914013385772705, + "learning_rate": 0.0001476126969806917, + "loss": 1.4129, + "step": 520000 + }, + { + "epoch": 15.28, + "grad_norm": 1.342411756515503, + "learning_rate": 0.00014746560498936034, + "loss": 1.4022, + "step": 520500 + }, + { + "epoch": 15.3, + "grad_norm": 1.3836804628372192, + "learning_rate": 0.00014731851299802897, + "loss": 1.3849, + "step": 521000 + }, + { + "epoch": 15.31, + "grad_norm": 4.170617580413818, + "learning_rate": 0.00014717142100669758, + "loss": 1.3855, + "step": 521500 + }, + { + "epoch": 15.33, + "grad_norm": 6.060724258422852, + "learning_rate": 0.0001470243290153662, + "loss": 1.3645, + "step": 522000 + }, + { + "epoch": 15.34, + "grad_norm": 5.009141445159912, + "learning_rate": 0.00014687723702403482, + "loss": 1.4106, + "step": 522500 + }, + { + "epoch": 15.36, + "grad_norm": 4.241628646850586, + "learning_rate": 0.00014673014503270346, + "loss": 1.3983, + "step": 523000 + }, + { + "epoch": 15.37, + "grad_norm": 1.6837831735610962, + "learning_rate": 0.00014658305304137206, + "loss": 1.4023, + "step": 523500 + }, + { + "epoch": 15.38, + "grad_norm": 2.455502510070801, + "learning_rate": 0.00014643596105004067, + "loss": 1.3607, + "step": 524000 + }, + { + "epoch": 15.4, + "grad_norm": 6.113306522369385, + "learning_rate": 0.0001462888690587093, + "loss": 1.4292, + "step": 524500 + }, + { + "epoch": 15.41, + "grad_norm": 1.7169700860977173, + "learning_rate": 0.00014614177706737794, + "loss": 1.364, + "step": 525000 + }, + { + "epoch": 15.43, + "grad_norm": 3.563976287841797, + "learning_rate": 0.00014599468507604655, + "loss": 1.4007, + "step": 525500 + }, + { + "epoch": 
15.44, + "grad_norm": 3.6780059337615967, + "learning_rate": 0.00014584759308471516, + "loss": 1.4072, + "step": 526000 + }, + { + "epoch": 15.46, + "grad_norm": 1.8734827041625977, + "learning_rate": 0.0001457005010933838, + "loss": 1.4088, + "step": 526500 + }, + { + "epoch": 15.47, + "grad_norm": 6.644404411315918, + "learning_rate": 0.00014555340910205243, + "loss": 1.3958, + "step": 527000 + }, + { + "epoch": 15.49, + "grad_norm": 1.7657749652862549, + "learning_rate": 0.00014540631711072103, + "loss": 1.418, + "step": 527500 + }, + { + "epoch": 15.5, + "grad_norm": 4.092434406280518, + "learning_rate": 0.00014525922511938964, + "loss": 1.3643, + "step": 528000 + }, + { + "epoch": 15.52, + "grad_norm": 30.432186126708984, + "learning_rate": 0.00014511213312805828, + "loss": 1.3797, + "step": 528500 + }, + { + "epoch": 15.53, + "grad_norm": 10.275367736816406, + "learning_rate": 0.0001449650411367269, + "loss": 1.3928, + "step": 529000 + }, + { + "epoch": 15.55, + "grad_norm": 20.94750213623047, + "learning_rate": 0.00014481794914539552, + "loss": 1.4131, + "step": 529500 + }, + { + "epoch": 15.56, + "grad_norm": 1.7463383674621582, + "learning_rate": 0.00014467085715406413, + "loss": 1.4173, + "step": 530000 + }, + { + "epoch": 15.58, + "grad_norm": 2.380938768386841, + "learning_rate": 0.00014452376516273276, + "loss": 1.4092, + "step": 530500 + }, + { + "epoch": 15.59, + "grad_norm": 1.8386043310165405, + "learning_rate": 0.0001443766731714014, + "loss": 1.3972, + "step": 531000 + }, + { + "epoch": 15.61, + "grad_norm": 1.3829760551452637, + "learning_rate": 0.00014422958118007, + "loss": 1.391, + "step": 531500 + }, + { + "epoch": 15.62, + "grad_norm": 2.171069383621216, + "learning_rate": 0.0001440824891887386, + "loss": 1.3993, + "step": 532000 + }, + { + "epoch": 15.63, + "grad_norm": 1.716299057006836, + "learning_rate": 0.00014393539719740724, + "loss": 1.398, + "step": 532500 + }, + { + "epoch": 15.65, + "grad_norm": 1.7643611431121826, + "learning_rate": 0.00014378830520607588, + "loss": 1.3901, + "step": 533000 + }, + { + "epoch": 15.66, + "grad_norm": 1.68152916431427, + "learning_rate": 0.0001436412132147445, + "loss": 1.3873, + "step": 533500 + }, + { + "epoch": 15.68, + "grad_norm": 2.581348419189453, + "learning_rate": 0.0001434941212234131, + "loss": 1.3713, + "step": 534000 + }, + { + "epoch": 15.69, + "grad_norm": 3.0933191776275635, + "learning_rate": 0.00014334702923208173, + "loss": 1.4283, + "step": 534500 + }, + { + "epoch": 15.71, + "grad_norm": 6.795374870300293, + "learning_rate": 0.00014319993724075036, + "loss": 1.4088, + "step": 535000 + }, + { + "epoch": 15.72, + "grad_norm": 2.514035701751709, + "learning_rate": 0.00014305284524941897, + "loss": 1.4042, + "step": 535500 + }, + { + "epoch": 15.74, + "grad_norm": 2.5651772022247314, + "learning_rate": 0.00014290575325808758, + "loss": 1.4236, + "step": 536000 + }, + { + "epoch": 15.75, + "grad_norm": 1.6859495639801025, + "learning_rate": 0.00014275866126675621, + "loss": 1.4243, + "step": 536500 + }, + { + "epoch": 15.77, + "grad_norm": 1.8449592590332031, + "learning_rate": 0.00014261156927542485, + "loss": 1.406, + "step": 537000 + }, + { + "epoch": 15.78, + "grad_norm": 2.2886695861816406, + "learning_rate": 0.00014246447728409346, + "loss": 1.4095, + "step": 537500 + }, + { + "epoch": 15.8, + "grad_norm": 2.669768810272217, + "learning_rate": 0.00014231738529276206, + "loss": 1.4172, + "step": 538000 + }, + { + "epoch": 15.81, + "grad_norm": 5.082691192626953, + "learning_rate": 
0.0001421702933014307, + "loss": 1.4307, + "step": 538500 + }, + { + "epoch": 15.83, + "grad_norm": 2.9871368408203125, + "learning_rate": 0.00014202320131009933, + "loss": 1.3937, + "step": 539000 + }, + { + "epoch": 15.84, + "grad_norm": 1.900804877281189, + "learning_rate": 0.00014187610931876794, + "loss": 1.4226, + "step": 539500 + }, + { + "epoch": 15.85, + "grad_norm": 1.668407678604126, + "learning_rate": 0.00014172901732743655, + "loss": 1.3807, + "step": 540000 + }, + { + "epoch": 15.87, + "grad_norm": 5.046024799346924, + "learning_rate": 0.00014158192533610518, + "loss": 1.4203, + "step": 540500 + }, + { + "epoch": 15.88, + "grad_norm": 2.8824052810668945, + "learning_rate": 0.00014143483334477382, + "loss": 1.4188, + "step": 541000 + }, + { + "epoch": 15.9, + "grad_norm": 2.688316583633423, + "learning_rate": 0.00014128774135344243, + "loss": 1.418, + "step": 541500 + }, + { + "epoch": 15.91, + "grad_norm": 2.323672294616699, + "learning_rate": 0.00014114064936211106, + "loss": 1.3825, + "step": 542000 + }, + { + "epoch": 15.93, + "grad_norm": 23.119873046875, + "learning_rate": 0.00014099355737077967, + "loss": 1.4148, + "step": 542500 + }, + { + "epoch": 15.94, + "grad_norm": 3.257922649383545, + "learning_rate": 0.0001408464653794483, + "loss": 1.4327, + "step": 543000 + }, + { + "epoch": 15.96, + "grad_norm": 1.8719940185546875, + "learning_rate": 0.0001406993733881169, + "loss": 1.3868, + "step": 543500 + }, + { + "epoch": 15.97, + "grad_norm": 2.128316640853882, + "learning_rate": 0.00014055228139678554, + "loss": 1.4102, + "step": 544000 + }, + { + "epoch": 15.99, + "grad_norm": 2.277371644973755, + "learning_rate": 0.00014040518940545418, + "loss": 1.4014, + "step": 544500 + }, + { + "epoch": 16.0, + "grad_norm": 41.53245162963867, + "learning_rate": 0.0001402580974141228, + "loss": 1.4178, + "step": 545000 + }, + { + "epoch": 16.02, + "grad_norm": 1.792492151260376, + "learning_rate": 0.0001401110054227914, + "loss": 1.3353, + "step": 545500 + }, + { + "epoch": 16.03, + "grad_norm": 1.582220435142517, + "learning_rate": 0.00013996391343146003, + "loss": 1.3722, + "step": 546000 + }, + { + "epoch": 16.05, + "grad_norm": 3.7628824710845947, + "learning_rate": 0.00013981682144012866, + "loss": 1.3593, + "step": 546500 + }, + { + "epoch": 16.06, + "grad_norm": 2.833401918411255, + "learning_rate": 0.00013966972944879727, + "loss": 1.3627, + "step": 547000 + }, + { + "epoch": 16.07, + "grad_norm": 2.1202030181884766, + "learning_rate": 0.00013952263745746588, + "loss": 1.3528, + "step": 547500 + }, + { + "epoch": 16.09, + "grad_norm": 1.919555425643921, + "learning_rate": 0.0001393755454661345, + "loss": 1.3087, + "step": 548000 + }, + { + "epoch": 16.1, + "grad_norm": 2.3834056854248047, + "learning_rate": 0.00013922845347480315, + "loss": 1.3303, + "step": 548500 + }, + { + "epoch": 16.12, + "grad_norm": 2.0871472358703613, + "learning_rate": 0.00013908136148347176, + "loss": 1.3423, + "step": 549000 + }, + { + "epoch": 16.13, + "grad_norm": 3.040555953979492, + "learning_rate": 0.00013893426949214036, + "loss": 1.3402, + "step": 549500 + }, + { + "epoch": 16.15, + "grad_norm": 2.8366496562957764, + "learning_rate": 0.000138787177500809, + "loss": 1.3586, + "step": 550000 + }, + { + "epoch": 16.16, + "grad_norm": 10.008976936340332, + "learning_rate": 0.00013864008550947763, + "loss": 1.3321, + "step": 550500 + }, + { + "epoch": 16.18, + "grad_norm": 8.792502403259277, + "learning_rate": 0.00013849299351814624, + "loss": 1.3781, + "step": 551000 + }, + { + "epoch": 
16.19, + "grad_norm": 8.872962951660156, + "learning_rate": 0.00013834590152681485, + "loss": 1.3687, + "step": 551500 + }, + { + "epoch": 16.21, + "grad_norm": 5.631560802459717, + "learning_rate": 0.00013819880953548348, + "loss": 1.3419, + "step": 552000 + }, + { + "epoch": 16.22, + "grad_norm": 2.04437255859375, + "learning_rate": 0.00013805171754415212, + "loss": 1.3335, + "step": 552500 + }, + { + "epoch": 16.24, + "grad_norm": 2.526149272918701, + "learning_rate": 0.00013790462555282072, + "loss": 1.3446, + "step": 553000 + }, + { + "epoch": 16.25, + "grad_norm": 8.970195770263672, + "learning_rate": 0.00013775753356148933, + "loss": 1.3795, + "step": 553500 + }, + { + "epoch": 16.27, + "grad_norm": 4.344628810882568, + "learning_rate": 0.00013761044157015797, + "loss": 1.3231, + "step": 554000 + }, + { + "epoch": 16.28, + "grad_norm": 2.5846548080444336, + "learning_rate": 0.0001374633495788266, + "loss": 1.3571, + "step": 554500 + }, + { + "epoch": 16.3, + "grad_norm": 20.92795181274414, + "learning_rate": 0.0001373162575874952, + "loss": 1.38, + "step": 555000 + }, + { + "epoch": 16.31, + "grad_norm": 2.3088529109954834, + "learning_rate": 0.00013716916559616382, + "loss": 1.3473, + "step": 555500 + }, + { + "epoch": 16.32, + "grad_norm": 13.217586517333984, + "learning_rate": 0.00013702207360483245, + "loss": 1.3558, + "step": 556000 + }, + { + "epoch": 16.34, + "grad_norm": 8.642449378967285, + "learning_rate": 0.00013687498161350109, + "loss": 1.382, + "step": 556500 + }, + { + "epoch": 16.35, + "grad_norm": 1.4824799299240112, + "learning_rate": 0.0001367278896221697, + "loss": 1.384, + "step": 557000 + }, + { + "epoch": 16.37, + "grad_norm": 1.741585373878479, + "learning_rate": 0.0001365807976308383, + "loss": 1.3664, + "step": 557500 + }, + { + "epoch": 16.38, + "grad_norm": 1.7038291692733765, + "learning_rate": 0.00013643370563950694, + "loss": 1.3634, + "step": 558000 + }, + { + "epoch": 16.4, + "grad_norm": 17.11383628845215, + "learning_rate": 0.00013628661364817557, + "loss": 1.3604, + "step": 558500 + }, + { + "epoch": 16.41, + "grad_norm": 19.561166763305664, + "learning_rate": 0.00013613952165684418, + "loss": 1.3731, + "step": 559000 + }, + { + "epoch": 16.43, + "grad_norm": 2.2067980766296387, + "learning_rate": 0.00013599242966551279, + "loss": 1.3472, + "step": 559500 + }, + { + "epoch": 16.44, + "grad_norm": 3.9638609886169434, + "learning_rate": 0.00013584533767418142, + "loss": 1.365, + "step": 560000 + }, + { + "epoch": 16.46, + "grad_norm": 2.2947542667388916, + "learning_rate": 0.00013569824568285006, + "loss": 1.3618, + "step": 560500 + }, + { + "epoch": 16.47, + "grad_norm": 2.3892598152160645, + "learning_rate": 0.00013555115369151866, + "loss": 1.3331, + "step": 561000 + }, + { + "epoch": 16.49, + "grad_norm": 1.9236092567443848, + "learning_rate": 0.00013540406170018727, + "loss": 1.3508, + "step": 561500 + }, + { + "epoch": 16.5, + "grad_norm": 5.180337429046631, + "learning_rate": 0.0001352569697088559, + "loss": 1.3612, + "step": 562000 + }, + { + "epoch": 16.52, + "grad_norm": 8.786672592163086, + "learning_rate": 0.00013510987771752454, + "loss": 1.3776, + "step": 562500 + }, + { + "epoch": 16.53, + "grad_norm": 4.111878871917725, + "learning_rate": 0.00013496278572619315, + "loss": 1.3661, + "step": 563000 + }, + { + "epoch": 16.54, + "grad_norm": 4.706780433654785, + "learning_rate": 0.00013481569373486178, + "loss": 1.3731, + "step": 563500 + }, + { + "epoch": 16.56, + "grad_norm": 1.3788596391677856, + "learning_rate": 
0.0001346686017435304, + "loss": 1.3316, + "step": 564000 + }, + { + "epoch": 16.57, + "grad_norm": 2.970449924468994, + "learning_rate": 0.00013452150975219902, + "loss": 1.3612, + "step": 564500 + }, + { + "epoch": 16.59, + "grad_norm": 2.0503463745117188, + "learning_rate": 0.00013437441776086763, + "loss": 1.3635, + "step": 565000 + }, + { + "epoch": 16.6, + "grad_norm": 10.668388366699219, + "learning_rate": 0.00013422732576953627, + "loss": 1.3764, + "step": 565500 + }, + { + "epoch": 16.62, + "grad_norm": 8.57248592376709, + "learning_rate": 0.00013408023377820487, + "loss": 1.3521, + "step": 566000 + }, + { + "epoch": 16.63, + "grad_norm": 1.6269396543502808, + "learning_rate": 0.0001339331417868735, + "loss": 1.3406, + "step": 566500 + }, + { + "epoch": 16.65, + "grad_norm": 1.764863133430481, + "learning_rate": 0.00013378604979554212, + "loss": 1.3325, + "step": 567000 + }, + { + "epoch": 16.66, + "grad_norm": 1.6542813777923584, + "learning_rate": 0.00013363895780421075, + "loss": 1.3601, + "step": 567500 + }, + { + "epoch": 16.68, + "grad_norm": 1.9206827878952026, + "learning_rate": 0.00013349186581287936, + "loss": 1.3515, + "step": 568000 + }, + { + "epoch": 16.69, + "grad_norm": 2.092914581298828, + "learning_rate": 0.000133344773821548, + "loss": 1.3617, + "step": 568500 + }, + { + "epoch": 16.71, + "grad_norm": 6.570430278778076, + "learning_rate": 0.0001331976818302166, + "loss": 1.3728, + "step": 569000 + }, + { + "epoch": 16.72, + "grad_norm": 2.719400644302368, + "learning_rate": 0.00013305058983888524, + "loss": 1.3406, + "step": 569500 + }, + { + "epoch": 16.74, + "grad_norm": 9.820898056030273, + "learning_rate": 0.00013290349784755384, + "loss": 1.4036, + "step": 570000 + }, + { + "epoch": 16.75, + "grad_norm": 2.3756299018859863, + "learning_rate": 0.00013275640585622248, + "loss": 1.3735, + "step": 570500 + }, + { + "epoch": 16.76, + "grad_norm": 6.200007438659668, + "learning_rate": 0.00013260931386489109, + "loss": 1.3792, + "step": 571000 + }, + { + "epoch": 16.78, + "grad_norm": 7.884439468383789, + "learning_rate": 0.00013246222187355972, + "loss": 1.3666, + "step": 571500 + }, + { + "epoch": 16.79, + "grad_norm": 2.011915683746338, + "learning_rate": 0.00013231512988222833, + "loss": 1.3541, + "step": 572000 + }, + { + "epoch": 16.81, + "grad_norm": 6.167238712310791, + "learning_rate": 0.00013216803789089696, + "loss": 1.3781, + "step": 572500 + }, + { + "epoch": 16.82, + "grad_norm": 38.23750686645508, + "learning_rate": 0.00013202094589956557, + "loss": 1.3546, + "step": 573000 + }, + { + "epoch": 16.84, + "grad_norm": 2.6406400203704834, + "learning_rate": 0.0001318738539082342, + "loss": 1.3959, + "step": 573500 + }, + { + "epoch": 16.85, + "grad_norm": 1.713273525238037, + "learning_rate": 0.0001317267619169028, + "loss": 1.3625, + "step": 574000 + }, + { + "epoch": 16.87, + "grad_norm": 2.819561004638672, + "learning_rate": 0.00013157966992557145, + "loss": 1.354, + "step": 574500 + }, + { + "epoch": 16.88, + "grad_norm": 5.471235275268555, + "learning_rate": 0.00013143257793424005, + "loss": 1.3988, + "step": 575000 + }, + { + "epoch": 16.9, + "grad_norm": 2.4000468254089355, + "learning_rate": 0.0001312854859429087, + "loss": 1.3432, + "step": 575500 + }, + { + "epoch": 16.91, + "grad_norm": 2.053870439529419, + "learning_rate": 0.0001311383939515773, + "loss": 1.3626, + "step": 576000 + }, + { + "epoch": 16.93, + "grad_norm": 1.577664852142334, + "learning_rate": 0.00013099130196024593, + "loss": 1.3806, + "step": 576500 + }, + { + "epoch": 
16.94, + "grad_norm": 16.11113166809082, + "learning_rate": 0.00013084420996891454, + "loss": 1.3787, + "step": 577000 + }, + { + "epoch": 16.96, + "grad_norm": 4.257967948913574, + "learning_rate": 0.00013069711797758317, + "loss": 1.355, + "step": 577500 + }, + { + "epoch": 16.97, + "grad_norm": 1.8505833148956299, + "learning_rate": 0.00013055002598625178, + "loss": 1.3702, + "step": 578000 + }, + { + "epoch": 16.98, + "grad_norm": 2.1093640327453613, + "learning_rate": 0.00013040293399492042, + "loss": 1.3775, + "step": 578500 + }, + { + "epoch": 17.0, + "grad_norm": 1.8386383056640625, + "learning_rate": 0.00013025584200358902, + "loss": 1.3776, + "step": 579000 + }, + { + "epoch": 17.01, + "grad_norm": 7.34138298034668, + "learning_rate": 0.00013010875001225766, + "loss": 1.3215, + "step": 579500 + }, + { + "epoch": 17.03, + "grad_norm": 1.0172581672668457, + "learning_rate": 0.00012996165802092627, + "loss": 1.3147, + "step": 580000 + }, + { + "epoch": 17.04, + "grad_norm": 2.6336045265197754, + "learning_rate": 0.0001298145660295949, + "loss": 1.3365, + "step": 580500 + }, + { + "epoch": 17.06, + "grad_norm": 3.9906227588653564, + "learning_rate": 0.0001296674740382635, + "loss": 1.3245, + "step": 581000 + }, + { + "epoch": 17.07, + "grad_norm": 3.1087284088134766, + "learning_rate": 0.00012952038204693214, + "loss": 1.2864, + "step": 581500 + }, + { + "epoch": 17.09, + "grad_norm": 2.5867342948913574, + "learning_rate": 0.00012937329005560075, + "loss": 1.2947, + "step": 582000 + }, + { + "epoch": 17.1, + "grad_norm": 1.2118226289749146, + "learning_rate": 0.00012922619806426938, + "loss": 1.3, + "step": 582500 + }, + { + "epoch": 17.12, + "grad_norm": 2.56510329246521, + "learning_rate": 0.000129079106072938, + "loss": 1.3147, + "step": 583000 + }, + { + "epoch": 17.13, + "grad_norm": 10.831042289733887, + "learning_rate": 0.00012893201408160663, + "loss": 1.3125, + "step": 583500 + }, + { + "epoch": 17.15, + "grad_norm": 10.36989688873291, + "learning_rate": 0.00012878492209027523, + "loss": 1.2969, + "step": 584000 + }, + { + "epoch": 17.16, + "grad_norm": 2.0238804817199707, + "learning_rate": 0.00012863783009894387, + "loss": 1.3044, + "step": 584500 + }, + { + "epoch": 17.18, + "grad_norm": 4.501575469970703, + "learning_rate": 0.00012849073810761248, + "loss": 1.3013, + "step": 585000 + }, + { + "epoch": 17.19, + "grad_norm": 4.236315727233887, + "learning_rate": 0.0001283436461162811, + "loss": 1.2991, + "step": 585500 + }, + { + "epoch": 17.21, + "grad_norm": 4.139219760894775, + "learning_rate": 0.00012819655412494972, + "loss": 1.2979, + "step": 586000 + }, + { + "epoch": 17.22, + "grad_norm": 2.1206071376800537, + "learning_rate": 0.00012804946213361835, + "loss": 1.3377, + "step": 586500 + }, + { + "epoch": 17.23, + "grad_norm": 1.7728540897369385, + "learning_rate": 0.000127902370142287, + "loss": 1.2992, + "step": 587000 + }, + { + "epoch": 17.25, + "grad_norm": 1.9550994634628296, + "learning_rate": 0.0001277552781509556, + "loss": 1.3311, + "step": 587500 + }, + { + "epoch": 17.26, + "grad_norm": 1.6412031650543213, + "learning_rate": 0.00012760818615962423, + "loss": 1.3101, + "step": 588000 + }, + { + "epoch": 17.28, + "grad_norm": 1.6908353567123413, + "learning_rate": 0.00012746109416829284, + "loss": 1.3357, + "step": 588500 + }, + { + "epoch": 17.29, + "grad_norm": 1.6260554790496826, + "learning_rate": 0.00012731400217696147, + "loss": 1.3128, + "step": 589000 + }, + { + "epoch": 17.31, + "grad_norm": 4.529758453369141, + "learning_rate": 
0.00012716691018563008, + "loss": 1.3282, + "step": 589500 + }, + { + "epoch": 17.32, + "grad_norm": 1.872877836227417, + "learning_rate": 0.00012701981819429871, + "loss": 1.3271, + "step": 590000 + }, + { + "epoch": 17.34, + "grad_norm": 3.3399105072021484, + "learning_rate": 0.00012687272620296732, + "loss": 1.3104, + "step": 590500 + }, + { + "epoch": 17.35, + "grad_norm": 3.178553819656372, + "learning_rate": 0.00012672563421163596, + "loss": 1.3222, + "step": 591000 + }, + { + "epoch": 17.37, + "grad_norm": 2.042067527770996, + "learning_rate": 0.00012657854222030456, + "loss": 1.332, + "step": 591500 + }, + { + "epoch": 17.38, + "grad_norm": 2.1601064205169678, + "learning_rate": 0.0001264314502289732, + "loss": 1.3198, + "step": 592000 + }, + { + "epoch": 17.4, + "grad_norm": 1.6580477952957153, + "learning_rate": 0.0001262843582376418, + "loss": 1.3059, + "step": 592500 + }, + { + "epoch": 17.41, + "grad_norm": 3.175902843475342, + "learning_rate": 0.00012613726624631044, + "loss": 1.3262, + "step": 593000 + }, + { + "epoch": 17.43, + "grad_norm": 2.7562525272369385, + "learning_rate": 0.00012599017425497905, + "loss": 1.3389, + "step": 593500 + }, + { + "epoch": 17.44, + "grad_norm": 1.7579740285873413, + "learning_rate": 0.00012584308226364768, + "loss": 1.3308, + "step": 594000 + }, + { + "epoch": 17.45, + "grad_norm": 4.593905448913574, + "learning_rate": 0.0001256959902723163, + "loss": 1.3059, + "step": 594500 + }, + { + "epoch": 17.47, + "grad_norm": 1.438839077949524, + "learning_rate": 0.00012554889828098493, + "loss": 1.3159, + "step": 595000 + }, + { + "epoch": 17.48, + "grad_norm": 2.5471925735473633, + "learning_rate": 0.00012540180628965353, + "loss": 1.2989, + "step": 595500 + }, + { + "epoch": 17.5, + "grad_norm": 1.8019795417785645, + "learning_rate": 0.00012525471429832217, + "loss": 1.3441, + "step": 596000 + }, + { + "epoch": 17.51, + "grad_norm": 2.0826618671417236, + "learning_rate": 0.00012510762230699078, + "loss": 1.3309, + "step": 596500 + }, + { + "epoch": 17.53, + "grad_norm": 1.820566177368164, + "learning_rate": 0.0001249605303156594, + "loss": 1.3224, + "step": 597000 + }, + { + "epoch": 17.54, + "grad_norm": 24.11446762084961, + "learning_rate": 0.00012481343832432802, + "loss": 1.2934, + "step": 597500 + }, + { + "epoch": 17.56, + "grad_norm": 1.5428298711776733, + "learning_rate": 0.00012466634633299665, + "loss": 1.3277, + "step": 598000 + }, + { + "epoch": 17.57, + "grad_norm": 2.171504259109497, + "learning_rate": 0.00012451925434166526, + "loss": 1.3365, + "step": 598500 + }, + { + "epoch": 17.59, + "grad_norm": 2.862025022506714, + "learning_rate": 0.0001243721623503339, + "loss": 1.3351, + "step": 599000 + }, + { + "epoch": 17.6, + "grad_norm": 2.7436113357543945, + "learning_rate": 0.0001242250703590025, + "loss": 1.3203, + "step": 599500 + }, + { + "epoch": 17.62, + "grad_norm": 1.9902766942977905, + "learning_rate": 0.00012407797836767114, + "loss": 1.3324, + "step": 600000 + }, + { + "epoch": 17.63, + "grad_norm": 3.6419193744659424, + "learning_rate": 0.00012393088637633975, + "loss": 1.347, + "step": 600500 + }, + { + "epoch": 17.65, + "grad_norm": 1.8328200578689575, + "learning_rate": 0.00012378379438500838, + "loss": 1.3261, + "step": 601000 + }, + { + "epoch": 17.66, + "grad_norm": 3.2816476821899414, + "learning_rate": 0.000123636702393677, + "loss": 1.3228, + "step": 601500 + }, + { + "epoch": 17.67, + "grad_norm": 1.142104148864746, + "learning_rate": 0.00012348961040234562, + "loss": 1.3183, + "step": 602000 + }, + { + "epoch": 
17.69, + "grad_norm": 2.1242527961730957, + "learning_rate": 0.00012334251841101423, + "loss": 1.3162, + "step": 602500 + }, + { + "epoch": 17.7, + "grad_norm": 9.647717475891113, + "learning_rate": 0.00012319542641968286, + "loss": 1.3473, + "step": 603000 + }, + { + "epoch": 17.72, + "grad_norm": 3.551119804382324, + "learning_rate": 0.00012304833442835147, + "loss": 1.3513, + "step": 603500 + }, + { + "epoch": 17.73, + "grad_norm": 1.6903916597366333, + "learning_rate": 0.0001229012424370201, + "loss": 1.3393, + "step": 604000 + }, + { + "epoch": 17.75, + "grad_norm": 2.54724383354187, + "learning_rate": 0.00012275415044568871, + "loss": 1.3402, + "step": 604500 + }, + { + "epoch": 17.76, + "grad_norm": 5.089727878570557, + "learning_rate": 0.00012260705845435735, + "loss": 1.303, + "step": 605000 + }, + { + "epoch": 17.78, + "grad_norm": 12.237126350402832, + "learning_rate": 0.00012245996646302596, + "loss": 1.3319, + "step": 605500 + }, + { + "epoch": 17.79, + "grad_norm": 1.576462745666504, + "learning_rate": 0.0001223128744716946, + "loss": 1.3265, + "step": 606000 + }, + { + "epoch": 17.81, + "grad_norm": 1.7792675495147705, + "learning_rate": 0.0001221657824803632, + "loss": 1.3335, + "step": 606500 + }, + { + "epoch": 17.82, + "grad_norm": 5.532106876373291, + "learning_rate": 0.00012201869048903183, + "loss": 1.3456, + "step": 607000 + }, + { + "epoch": 17.84, + "grad_norm": 3.333435297012329, + "learning_rate": 0.00012187159849770044, + "loss": 1.3332, + "step": 607500 + }, + { + "epoch": 17.85, + "grad_norm": 3.1190874576568604, + "learning_rate": 0.00012172450650636908, + "loss": 1.3368, + "step": 608000 + }, + { + "epoch": 17.87, + "grad_norm": 1.656201720237732, + "learning_rate": 0.0001215774145150377, + "loss": 1.3266, + "step": 608500 + }, + { + "epoch": 17.88, + "grad_norm": 2.088050365447998, + "learning_rate": 0.00012143032252370632, + "loss": 1.3128, + "step": 609000 + }, + { + "epoch": 17.9, + "grad_norm": 2.941950798034668, + "learning_rate": 0.00012128323053237493, + "loss": 1.3393, + "step": 609500 + }, + { + "epoch": 17.91, + "grad_norm": 1.8323218822479248, + "learning_rate": 0.00012113613854104356, + "loss": 1.3255, + "step": 610000 + }, + { + "epoch": 17.92, + "grad_norm": 1.3735178709030151, + "learning_rate": 0.00012098904654971218, + "loss": 1.3095, + "step": 610500 + }, + { + "epoch": 17.94, + "grad_norm": 2.9366865158081055, + "learning_rate": 0.0001208419545583808, + "loss": 1.3401, + "step": 611000 + }, + { + "epoch": 17.95, + "grad_norm": 3.2511463165283203, + "learning_rate": 0.00012069486256704941, + "loss": 1.3396, + "step": 611500 + }, + { + "epoch": 17.97, + "grad_norm": 2.2338857650756836, + "learning_rate": 0.00012054777057571804, + "loss": 1.3286, + "step": 612000 + }, + { + "epoch": 17.98, + "grad_norm": 1.5889378786087036, + "learning_rate": 0.00012040067858438667, + "loss": 1.3163, + "step": 612500 + }, + { + "epoch": 18.0, + "grad_norm": 2.792966365814209, + "learning_rate": 0.00012025358659305529, + "loss": 1.2973, + "step": 613000 + }, + { + "epoch": 18.01, + "grad_norm": 2.239032745361328, + "learning_rate": 0.00012010649460172391, + "loss": 1.3099, + "step": 613500 + }, + { + "epoch": 18.03, + "grad_norm": 2.293813705444336, + "learning_rate": 0.00011995940261039253, + "loss": 1.2568, + "step": 614000 + }, + { + "epoch": 18.04, + "grad_norm": 2.175294876098633, + "learning_rate": 0.00011981231061906115, + "loss": 1.2661, + "step": 614500 + }, + { + "epoch": 18.06, + "grad_norm": 1.7673249244689941, + "learning_rate": 
0.00011966521862772977, + "loss": 1.2747, + "step": 615000 + }, + { + "epoch": 18.07, + "grad_norm": 2.6049957275390625, + "learning_rate": 0.00011951812663639839, + "loss": 1.2851, + "step": 615500 + }, + { + "epoch": 18.09, + "grad_norm": 2.0433642864227295, + "learning_rate": 0.00011937103464506703, + "loss": 1.2846, + "step": 616000 + }, + { + "epoch": 18.1, + "grad_norm": 2.120561122894287, + "learning_rate": 0.00011922394265373563, + "loss": 1.2634, + "step": 616500 + }, + { + "epoch": 18.12, + "grad_norm": 2.6130003929138184, + "learning_rate": 0.00011907685066240426, + "loss": 1.2844, + "step": 617000 + }, + { + "epoch": 18.13, + "grad_norm": 1.8312240839004517, + "learning_rate": 0.00011892975867107288, + "loss": 1.289, + "step": 617500 + }, + { + "epoch": 18.14, + "grad_norm": 1.5402841567993164, + "learning_rate": 0.00011878266667974151, + "loss": 1.2578, + "step": 618000 + }, + { + "epoch": 18.16, + "grad_norm": 2.0804052352905273, + "learning_rate": 0.00011863557468841012, + "loss": 1.2558, + "step": 618500 + }, + { + "epoch": 18.17, + "grad_norm": 1.77811861038208, + "learning_rate": 0.00011848848269707874, + "loss": 1.2853, + "step": 619000 + }, + { + "epoch": 18.19, + "grad_norm": 6.067068099975586, + "learning_rate": 0.00011834139070574736, + "loss": 1.2941, + "step": 619500 + }, + { + "epoch": 18.2, + "grad_norm": 2.4684622287750244, + "learning_rate": 0.000118194298714416, + "loss": 1.2749, + "step": 620000 + }, + { + "epoch": 18.22, + "grad_norm": 3.7913448810577393, + "learning_rate": 0.0001180472067230846, + "loss": 1.298, + "step": 620500 + }, + { + "epoch": 18.23, + "grad_norm": 1.343802571296692, + "learning_rate": 0.00011790011473175322, + "loss": 1.2954, + "step": 621000 + }, + { + "epoch": 18.25, + "grad_norm": 1.797194480895996, + "learning_rate": 0.00011775302274042185, + "loss": 1.2718, + "step": 621500 + }, + { + "epoch": 18.26, + "grad_norm": 2.3011558055877686, + "learning_rate": 0.00011760593074909048, + "loss": 1.2966, + "step": 622000 + }, + { + "epoch": 18.28, + "grad_norm": 2.5689167976379395, + "learning_rate": 0.00011745883875775909, + "loss": 1.286, + "step": 622500 + }, + { + "epoch": 18.29, + "grad_norm": 8.428597450256348, + "learning_rate": 0.00011731174676642771, + "loss": 1.2872, + "step": 623000 + }, + { + "epoch": 18.31, + "grad_norm": 1.1936590671539307, + "learning_rate": 0.00011716465477509633, + "loss": 1.2621, + "step": 623500 + }, + { + "epoch": 18.32, + "grad_norm": 1.5052251815795898, + "learning_rate": 0.00011701756278376497, + "loss": 1.2758, + "step": 624000 + }, + { + "epoch": 18.34, + "grad_norm": 1.296823263168335, + "learning_rate": 0.00011687047079243357, + "loss": 1.266, + "step": 624500 + }, + { + "epoch": 18.35, + "grad_norm": 3.121631383895874, + "learning_rate": 0.0001167233788011022, + "loss": 1.2858, + "step": 625000 + }, + { + "epoch": 18.36, + "grad_norm": 7.022789478302002, + "learning_rate": 0.00011657628680977081, + "loss": 1.2986, + "step": 625500 + }, + { + "epoch": 18.38, + "grad_norm": 11.550426483154297, + "learning_rate": 0.00011642919481843945, + "loss": 1.2736, + "step": 626000 + }, + { + "epoch": 18.39, + "grad_norm": 2.277326822280884, + "learning_rate": 0.00011628210282710807, + "loss": 1.3052, + "step": 626500 + }, + { + "epoch": 18.41, + "grad_norm": 3.439568519592285, + "learning_rate": 0.00011613501083577668, + "loss": 1.3042, + "step": 627000 + }, + { + "epoch": 18.42, + "grad_norm": 8.868010520935059, + "learning_rate": 0.0001159879188444453, + "loss": 1.2612, + "step": 627500 + }, + { + "epoch": 
18.44, + "grad_norm": 1.5216681957244873, + "learning_rate": 0.00011584082685311393, + "loss": 1.2634, + "step": 628000 + }, + { + "epoch": 18.45, + "grad_norm": 2.113112688064575, + "learning_rate": 0.00011569373486178256, + "loss": 1.2681, + "step": 628500 + }, + { + "epoch": 18.47, + "grad_norm": 2.9850034713745117, + "learning_rate": 0.00011554664287045116, + "loss": 1.2865, + "step": 629000 + }, + { + "epoch": 18.48, + "grad_norm": 2.309042453765869, + "learning_rate": 0.00011539955087911978, + "loss": 1.2674, + "step": 629500 + }, + { + "epoch": 18.5, + "grad_norm": 3.1562564373016357, + "learning_rate": 0.00011525245888778842, + "loss": 1.262, + "step": 630000 + }, + { + "epoch": 18.51, + "grad_norm": 3.4192593097686768, + "learning_rate": 0.00011510536689645704, + "loss": 1.2729, + "step": 630500 + }, + { + "epoch": 18.53, + "grad_norm": 2.193237543106079, + "learning_rate": 0.00011495827490512565, + "loss": 1.2586, + "step": 631000 + }, + { + "epoch": 18.54, + "grad_norm": 8.091324806213379, + "learning_rate": 0.00011481118291379427, + "loss": 1.2649, + "step": 631500 + }, + { + "epoch": 18.56, + "grad_norm": 2.0321621894836426, + "learning_rate": 0.0001146640909224629, + "loss": 1.2792, + "step": 632000 + }, + { + "epoch": 18.57, + "grad_norm": 1.623028039932251, + "learning_rate": 0.00011451699893113152, + "loss": 1.3013, + "step": 632500 + }, + { + "epoch": 18.59, + "grad_norm": 2.7462871074676514, + "learning_rate": 0.00011436990693980013, + "loss": 1.2858, + "step": 633000 + }, + { + "epoch": 18.6, + "grad_norm": 22.743488311767578, + "learning_rate": 0.00011422281494846875, + "loss": 1.2739, + "step": 633500 + }, + { + "epoch": 18.61, + "grad_norm": 4.659852981567383, + "learning_rate": 0.00011407572295713739, + "loss": 1.2773, + "step": 634000 + }, + { + "epoch": 18.63, + "grad_norm": 1.8134876489639282, + "learning_rate": 0.00011392863096580601, + "loss": 1.274, + "step": 634500 + }, + { + "epoch": 18.64, + "grad_norm": 2.266272783279419, + "learning_rate": 0.00011378153897447463, + "loss": 1.3084, + "step": 635000 + }, + { + "epoch": 18.66, + "grad_norm": 1.4625264406204224, + "learning_rate": 0.00011363444698314324, + "loss": 1.2794, + "step": 635500 + }, + { + "epoch": 18.67, + "grad_norm": 1.57483971118927, + "learning_rate": 0.00011348735499181187, + "loss": 1.2872, + "step": 636000 + }, + { + "epoch": 18.69, + "grad_norm": 1.3935645818710327, + "learning_rate": 0.00011334026300048049, + "loss": 1.2809, + "step": 636500 + }, + { + "epoch": 18.7, + "grad_norm": 3.3207247257232666, + "learning_rate": 0.00011319317100914911, + "loss": 1.2764, + "step": 637000 + }, + { + "epoch": 18.72, + "grad_norm": 4.18394660949707, + "learning_rate": 0.00011304607901781772, + "loss": 1.2855, + "step": 637500 + }, + { + "epoch": 18.73, + "grad_norm": 1.6960937976837158, + "learning_rate": 0.00011289898702648636, + "loss": 1.2807, + "step": 638000 + }, + { + "epoch": 18.75, + "grad_norm": 6.864727020263672, + "learning_rate": 0.00011275189503515498, + "loss": 1.2736, + "step": 638500 + }, + { + "epoch": 18.76, + "grad_norm": 1.7619363069534302, + "learning_rate": 0.0001126048030438236, + "loss": 1.3063, + "step": 639000 + }, + { + "epoch": 18.78, + "grad_norm": 1.6224156618118286, + "learning_rate": 0.0001124577110524922, + "loss": 1.3031, + "step": 639500 + }, + { + "epoch": 18.79, + "grad_norm": 4.564239025115967, + "learning_rate": 0.00011231061906116084, + "loss": 1.3185, + "step": 640000 + }, + { + "epoch": 18.81, + "grad_norm": 2.237443208694458, + "learning_rate": 
0.00011216352706982946, + "loss": 1.2612, + "step": 640500 + }, + { + "epoch": 18.82, + "grad_norm": 1.7501612901687622, + "learning_rate": 0.00011201643507849808, + "loss": 1.278, + "step": 641000 + }, + { + "epoch": 18.83, + "grad_norm": 1.1969166994094849, + "learning_rate": 0.00011186934308716669, + "loss": 1.2979, + "step": 641500 + }, + { + "epoch": 18.85, + "grad_norm": 2.23476505279541, + "learning_rate": 0.00011172225109583533, + "loss": 1.2796, + "step": 642000 + }, + { + "epoch": 18.86, + "grad_norm": 3.1486740112304688, + "learning_rate": 0.00011157515910450395, + "loss": 1.3212, + "step": 642500 + }, + { + "epoch": 18.88, + "grad_norm": 3.5119261741638184, + "learning_rate": 0.00011142806711317257, + "loss": 1.3082, + "step": 643000 + }, + { + "epoch": 18.89, + "grad_norm": 3.0694100856781006, + "learning_rate": 0.00011128097512184118, + "loss": 1.2643, + "step": 643500 + }, + { + "epoch": 18.91, + "grad_norm": 2.235492467880249, + "learning_rate": 0.00011113388313050981, + "loss": 1.2832, + "step": 644000 + }, + { + "epoch": 18.92, + "grad_norm": 2.627898931503296, + "learning_rate": 0.00011098679113917843, + "loss": 1.2863, + "step": 644500 + }, + { + "epoch": 18.94, + "grad_norm": 1.390758991241455, + "learning_rate": 0.00011083969914784705, + "loss": 1.2949, + "step": 645000 + }, + { + "epoch": 18.95, + "grad_norm": 1.9412230253219604, + "learning_rate": 0.00011069260715651567, + "loss": 1.2765, + "step": 645500 + }, + { + "epoch": 18.97, + "grad_norm": 7.697941780090332, + "learning_rate": 0.0001105455151651843, + "loss": 1.2872, + "step": 646000 + }, + { + "epoch": 18.98, + "grad_norm": 2.0770368576049805, + "learning_rate": 0.00011039842317385292, + "loss": 1.3058, + "step": 646500 + }, + { + "epoch": 19.0, + "grad_norm": 1.8311492204666138, + "learning_rate": 0.00011025133118252154, + "loss": 1.3229, + "step": 647000 + }, + { + "epoch": 19.01, + "grad_norm": 2.744004726409912, + "learning_rate": 0.00011010423919119017, + "loss": 1.253, + "step": 647500 + }, + { + "epoch": 19.03, + "grad_norm": 2.0883147716522217, + "learning_rate": 0.00010995714719985879, + "loss": 1.2348, + "step": 648000 + }, + { + "epoch": 19.04, + "grad_norm": 13.648962020874023, + "learning_rate": 0.0001098100552085274, + "loss": 1.2572, + "step": 648500 + }, + { + "epoch": 19.05, + "grad_norm": 3.89188814163208, + "learning_rate": 0.00010966296321719602, + "loss": 1.2312, + "step": 649000 + }, + { + "epoch": 19.07, + "grad_norm": 12.498522758483887, + "learning_rate": 0.00010951587122586466, + "loss": 1.2285, + "step": 649500 + }, + { + "epoch": 19.08, + "grad_norm": 1.9123058319091797, + "learning_rate": 0.00010936877923453328, + "loss": 1.2223, + "step": 650000 + }, + { + "epoch": 19.1, + "grad_norm": 1.9629552364349365, + "learning_rate": 0.00010922168724320188, + "loss": 1.2323, + "step": 650500 + }, + { + "epoch": 19.11, + "grad_norm": 16.135618209838867, + "learning_rate": 0.0001090745952518705, + "loss": 1.1976, + "step": 651000 + }, + { + "epoch": 19.13, + "grad_norm": 2.948089361190796, + "learning_rate": 0.00010892750326053914, + "loss": 1.2367, + "step": 651500 + }, + { + "epoch": 19.14, + "grad_norm": 2.4549195766448975, + "learning_rate": 0.00010878041126920776, + "loss": 1.2345, + "step": 652000 + }, + { + "epoch": 19.16, + "grad_norm": 1.7298622131347656, + "learning_rate": 0.00010863331927787637, + "loss": 1.2477, + "step": 652500 + }, + { + "epoch": 19.17, + "grad_norm": 3.237170696258545, + "learning_rate": 0.00010848622728654499, + "loss": 1.2578, + "step": 653000 + }, + { + 
"epoch": 19.19, + "grad_norm": 2.876091718673706, + "learning_rate": 0.00010833913529521362, + "loss": 1.2657, + "step": 653500 + }, + { + "epoch": 19.2, + "grad_norm": 2.6806657314300537, + "learning_rate": 0.00010819204330388225, + "loss": 1.2369, + "step": 654000 + }, + { + "epoch": 19.22, + "grad_norm": 1.682861328125, + "learning_rate": 0.00010804495131255085, + "loss": 1.2353, + "step": 654500 + }, + { + "epoch": 19.23, + "grad_norm": 1.420599102973938, + "learning_rate": 0.00010789785932121947, + "loss": 1.2292, + "step": 655000 + }, + { + "epoch": 19.25, + "grad_norm": 2.785423517227173, + "learning_rate": 0.00010775076732988811, + "loss": 1.2299, + "step": 655500 + }, + { + "epoch": 19.26, + "grad_norm": 3.515298843383789, + "learning_rate": 0.00010760367533855673, + "loss": 1.2343, + "step": 656000 + }, + { + "epoch": 19.28, + "grad_norm": 8.15224552154541, + "learning_rate": 0.00010745658334722534, + "loss": 1.262, + "step": 656500 + }, + { + "epoch": 19.29, + "grad_norm": 2.5358471870422363, + "learning_rate": 0.00010730949135589396, + "loss": 1.2579, + "step": 657000 + }, + { + "epoch": 19.3, + "grad_norm": 2.8267860412597656, + "learning_rate": 0.0001071623993645626, + "loss": 1.2195, + "step": 657500 + }, + { + "epoch": 19.32, + "grad_norm": 2.0857648849487305, + "learning_rate": 0.00010701530737323122, + "loss": 1.2244, + "step": 658000 + }, + { + "epoch": 19.33, + "grad_norm": 2.2825379371643066, + "learning_rate": 0.00010686821538189984, + "loss": 1.2502, + "step": 658500 + }, + { + "epoch": 19.35, + "grad_norm": 2.1249475479125977, + "learning_rate": 0.00010672112339056844, + "loss": 1.2408, + "step": 659000 + }, + { + "epoch": 19.36, + "grad_norm": 1.9578863382339478, + "learning_rate": 0.00010657403139923708, + "loss": 1.2599, + "step": 659500 + }, + { + "epoch": 19.38, + "grad_norm": 1.7473647594451904, + "learning_rate": 0.0001064269394079057, + "loss": 1.2482, + "step": 660000 + }, + { + "epoch": 19.39, + "grad_norm": 1.9577364921569824, + "learning_rate": 0.00010627984741657432, + "loss": 1.2389, + "step": 660500 + }, + { + "epoch": 19.41, + "grad_norm": 6.437145233154297, + "learning_rate": 0.00010613275542524293, + "loss": 1.2327, + "step": 661000 + }, + { + "epoch": 19.42, + "grad_norm": 36.84048080444336, + "learning_rate": 0.00010598566343391156, + "loss": 1.2553, + "step": 661500 + }, + { + "epoch": 19.44, + "grad_norm": 2.4691734313964844, + "learning_rate": 0.00010583857144258018, + "loss": 1.2551, + "step": 662000 + }, + { + "epoch": 19.45, + "grad_norm": 3.6141631603240967, + "learning_rate": 0.0001056914794512488, + "loss": 1.2459, + "step": 662500 + }, + { + "epoch": 19.47, + "grad_norm": 4.1790008544921875, + "learning_rate": 0.00010554438745991741, + "loss": 1.2299, + "step": 663000 + }, + { + "epoch": 19.48, + "grad_norm": 2.3077800273895264, + "learning_rate": 0.00010539729546858605, + "loss": 1.2354, + "step": 663500 + }, + { + "epoch": 19.5, + "grad_norm": 2.0375678539276123, + "learning_rate": 0.00010525020347725467, + "loss": 1.2332, + "step": 664000 + }, + { + "epoch": 19.51, + "grad_norm": 2.190852403640747, + "learning_rate": 0.00010510311148592329, + "loss": 1.2343, + "step": 664500 + }, + { + "epoch": 19.52, + "grad_norm": 1.7503656148910522, + "learning_rate": 0.0001049560194945919, + "loss": 1.2328, + "step": 665000 + }, + { + "epoch": 19.54, + "grad_norm": 2.364180326461792, + "learning_rate": 0.00010480892750326053, + "loss": 1.245, + "step": 665500 + }, + { + "epoch": 19.55, + "grad_norm": 6.837544918060303, + "learning_rate": 
0.00010466183551192915, + "loss": 1.2361, + "step": 666000 + }, + { + "epoch": 19.57, + "grad_norm": 18.344188690185547, + "learning_rate": 0.00010451474352059777, + "loss": 1.2527, + "step": 666500 + }, + { + "epoch": 19.58, + "grad_norm": 4.090867519378662, + "learning_rate": 0.00010436765152926638, + "loss": 1.2573, + "step": 667000 + }, + { + "epoch": 19.6, + "grad_norm": 61.80951690673828, + "learning_rate": 0.00010422055953793502, + "loss": 1.2776, + "step": 667500 + }, + { + "epoch": 19.61, + "grad_norm": 1.9649507999420166, + "learning_rate": 0.00010407346754660364, + "loss": 1.2479, + "step": 668000 + }, + { + "epoch": 19.63, + "grad_norm": 4.030837535858154, + "learning_rate": 0.00010392637555527226, + "loss": 1.2458, + "step": 668500 + }, + { + "epoch": 19.64, + "grad_norm": 3.310805082321167, + "learning_rate": 0.00010377928356394088, + "loss": 1.2491, + "step": 669000 + }, + { + "epoch": 19.66, + "grad_norm": 6.558318138122559, + "learning_rate": 0.0001036321915726095, + "loss": 1.2562, + "step": 669500 + }, + { + "epoch": 19.67, + "grad_norm": 2.0995540618896484, + "learning_rate": 0.00010348509958127812, + "loss": 1.2529, + "step": 670000 + }, + { + "epoch": 19.69, + "grad_norm": 3.003690242767334, + "learning_rate": 0.00010333800758994674, + "loss": 1.2717, + "step": 670500 + }, + { + "epoch": 19.7, + "grad_norm": 2.099637269973755, + "learning_rate": 0.00010319091559861536, + "loss": 1.2632, + "step": 671000 + }, + { + "epoch": 19.72, + "grad_norm": 35.86410140991211, + "learning_rate": 0.000103043823607284, + "loss": 1.2763, + "step": 671500 + }, + { + "epoch": 19.73, + "grad_norm": 1.7510465383529663, + "learning_rate": 0.0001028967316159526, + "loss": 1.2535, + "step": 672000 + }, + { + "epoch": 19.74, + "grad_norm": 10.661267280578613, + "learning_rate": 0.00010274963962462123, + "loss": 1.2346, + "step": 672500 + }, + { + "epoch": 19.76, + "grad_norm": 1.9645477533340454, + "learning_rate": 0.00010260254763328985, + "loss": 1.2398, + "step": 673000 + }, + { + "epoch": 19.77, + "grad_norm": 2.795703172683716, + "learning_rate": 0.00010245545564195848, + "loss": 1.2683, + "step": 673500 + }, + { + "epoch": 19.79, + "grad_norm": 3.7908451557159424, + "learning_rate": 0.00010230836365062709, + "loss": 1.2439, + "step": 674000 + }, + { + "epoch": 19.8, + "grad_norm": 2.028703451156616, + "learning_rate": 0.00010216127165929571, + "loss": 1.2518, + "step": 674500 + }, + { + "epoch": 19.82, + "grad_norm": 2.059154510498047, + "learning_rate": 0.00010201417966796433, + "loss": 1.2359, + "step": 675000 + }, + { + "epoch": 19.83, + "grad_norm": 1.3817317485809326, + "learning_rate": 0.00010186708767663297, + "loss": 1.27, + "step": 675500 + }, + { + "epoch": 19.85, + "grad_norm": 1.8642240762710571, + "learning_rate": 0.00010171999568530158, + "loss": 1.2604, + "step": 676000 + }, + { + "epoch": 19.86, + "grad_norm": 1.7178900241851807, + "learning_rate": 0.0001015729036939702, + "loss": 1.2509, + "step": 676500 + }, + { + "epoch": 19.88, + "grad_norm": 6.73402738571167, + "learning_rate": 0.00010142581170263882, + "loss": 1.2566, + "step": 677000 + }, + { + "epoch": 19.89, + "grad_norm": 1.7260433435440063, + "learning_rate": 0.00010127871971130745, + "loss": 1.258, + "step": 677500 + }, + { + "epoch": 19.91, + "grad_norm": 2.0348527431488037, + "learning_rate": 0.00010113162771997606, + "loss": 1.234, + "step": 678000 + }, + { + "epoch": 19.92, + "grad_norm": 3.1736955642700195, + "learning_rate": 0.00010098453572864468, + "loss": 1.241, + "step": 678500 + }, + { + "epoch": 
19.94, + "grad_norm": 10.23302173614502, + "learning_rate": 0.0001008374437373133, + "loss": 1.2604, + "step": 679000 + }, + { + "epoch": 19.95, + "grad_norm": 2.974153995513916, + "learning_rate": 0.00010069035174598194, + "loss": 1.2525, + "step": 679500 + }, + { + "epoch": 19.97, + "grad_norm": 2.5226101875305176, + "learning_rate": 0.00010054325975465056, + "loss": 1.264, + "step": 680000 + }, + { + "epoch": 19.98, + "grad_norm": 2.472259521484375, + "learning_rate": 0.00010039616776331917, + "loss": 1.2299, + "step": 680500 + }, + { + "epoch": 19.99, + "grad_norm": 2.3573238849639893, + "learning_rate": 0.00010024907577198779, + "loss": 1.2657, + "step": 681000 + }, + { + "epoch": 20.01, + "grad_norm": 3.6109812259674072, + "learning_rate": 0.00010010198378065642, + "loss": 1.2247, + "step": 681500 + }, + { + "epoch": 20.02, + "grad_norm": 1.8781336545944214, + "learning_rate": 9.995489178932504e-05, + "loss": 1.2068, + "step": 682000 + }, + { + "epoch": 20.04, + "grad_norm": 1.4420850276947021, + "learning_rate": 9.980779979799365e-05, + "loss": 1.2, + "step": 682500 + }, + { + "epoch": 20.05, + "grad_norm": 3.082235097885132, + "learning_rate": 9.966070780666227e-05, + "loss": 1.2062, + "step": 683000 + }, + { + "epoch": 20.07, + "grad_norm": 1.9524058103561401, + "learning_rate": 9.95136158153309e-05, + "loss": 1.1999, + "step": 683500 + }, + { + "epoch": 20.08, + "grad_norm": 1.9696966409683228, + "learning_rate": 9.936652382399953e-05, + "loss": 1.1944, + "step": 684000 + }, + { + "epoch": 20.1, + "grad_norm": 3.851034641265869, + "learning_rate": 9.921943183266813e-05, + "loss": 1.1913, + "step": 684500 + }, + { + "epoch": 20.11, + "grad_norm": 1.7799595594406128, + "learning_rate": 9.907233984133676e-05, + "loss": 1.21, + "step": 685000 + }, + { + "epoch": 20.13, + "grad_norm": 2.3180084228515625, + "learning_rate": 9.892524785000539e-05, + "loss": 1.1741, + "step": 685500 + }, + { + "epoch": 20.14, + "grad_norm": 2.012601375579834, + "learning_rate": 9.877815585867401e-05, + "loss": 1.1956, + "step": 686000 + }, + { + "epoch": 20.16, + "grad_norm": 2.5793349742889404, + "learning_rate": 9.863106386734262e-05, + "loss": 1.2011, + "step": 686500 + }, + { + "epoch": 20.17, + "grad_norm": 1.7113804817199707, + "learning_rate": 9.848397187601124e-05, + "loss": 1.2382, + "step": 687000 + }, + { + "epoch": 20.19, + "grad_norm": 1.8239011764526367, + "learning_rate": 9.833687988467988e-05, + "loss": 1.1904, + "step": 687500 + }, + { + "epoch": 20.2, + "grad_norm": 8.440109252929688, + "learning_rate": 9.81897878933485e-05, + "loss": 1.2087, + "step": 688000 + }, + { + "epoch": 20.21, + "grad_norm": 2.4333038330078125, + "learning_rate": 9.80426959020171e-05, + "loss": 1.209, + "step": 688500 + }, + { + "epoch": 20.23, + "grad_norm": 3.217632532119751, + "learning_rate": 9.789560391068572e-05, + "loss": 1.1964, + "step": 689000 + }, + { + "epoch": 20.24, + "grad_norm": 3.0806314945220947, + "learning_rate": 9.774851191935436e-05, + "loss": 1.2227, + "step": 689500 + }, + { + "epoch": 20.26, + "grad_norm": 21.035554885864258, + "learning_rate": 9.760141992802298e-05, + "loss": 1.1899, + "step": 690000 + }, + { + "epoch": 20.27, + "grad_norm": 4.15212345123291, + "learning_rate": 9.74543279366916e-05, + "loss": 1.1919, + "step": 690500 + }, + { + "epoch": 20.29, + "grad_norm": 8.08483600616455, + "learning_rate": 9.730723594536021e-05, + "loss": 1.2083, + "step": 691000 + }, + { + "epoch": 20.3, + "grad_norm": 2.5341944694519043, + "learning_rate": 9.716014395402884e-05, + "loss": 1.181, 
+ "step": 691500 + }, + { + "epoch": 20.32, + "grad_norm": 7.360681533813477, + "learning_rate": 9.701305196269747e-05, + "loss": 1.1975, + "step": 692000 + }, + { + "epoch": 20.33, + "grad_norm": 1.5373992919921875, + "learning_rate": 9.686595997136609e-05, + "loss": 1.191, + "step": 692500 + }, + { + "epoch": 20.35, + "grad_norm": 1.8679372072219849, + "learning_rate": 9.67188679800347e-05, + "loss": 1.213, + "step": 693000 + }, + { + "epoch": 20.36, + "grad_norm": 3.206306219100952, + "learning_rate": 9.657177598870333e-05, + "loss": 1.2051, + "step": 693500 + }, + { + "epoch": 20.38, + "grad_norm": 2.6320650577545166, + "learning_rate": 9.642468399737195e-05, + "loss": 1.1851, + "step": 694000 + }, + { + "epoch": 20.39, + "grad_norm": 3.6654086112976074, + "learning_rate": 9.627759200604057e-05, + "loss": 1.2063, + "step": 694500 + }, + { + "epoch": 20.41, + "grad_norm": 3.010348081588745, + "learning_rate": 9.613050001470918e-05, + "loss": 1.2239, + "step": 695000 + }, + { + "epoch": 20.42, + "grad_norm": 27.742935180664062, + "learning_rate": 9.598340802337781e-05, + "loss": 1.2084, + "step": 695500 + }, + { + "epoch": 20.43, + "grad_norm": 2.783250570297241, + "learning_rate": 9.583631603204643e-05, + "loss": 1.2263, + "step": 696000 + }, + { + "epoch": 20.45, + "grad_norm": 2.516063690185547, + "learning_rate": 9.568922404071506e-05, + "loss": 1.2084, + "step": 696500 + }, + { + "epoch": 20.46, + "grad_norm": 1.8266417980194092, + "learning_rate": 9.554213204938366e-05, + "loss": 1.2052, + "step": 697000 + }, + { + "epoch": 20.48, + "grad_norm": 1.2132940292358398, + "learning_rate": 9.53950400580523e-05, + "loss": 1.2132, + "step": 697500 + }, + { + "epoch": 20.49, + "grad_norm": 4.550230503082275, + "learning_rate": 9.524794806672092e-05, + "loss": 1.2263, + "step": 698000 + }, + { + "epoch": 20.51, + "grad_norm": 2.1874680519104004, + "learning_rate": 9.510085607538954e-05, + "loss": 1.1855, + "step": 698500 + }, + { + "epoch": 20.52, + "grad_norm": 1.8161512613296509, + "learning_rate": 9.495376408405815e-05, + "loss": 1.1949, + "step": 699000 + }, + { + "epoch": 20.54, + "grad_norm": 1.3943161964416504, + "learning_rate": 9.480667209272678e-05, + "loss": 1.1944, + "step": 699500 + }, + { + "epoch": 20.55, + "grad_norm": 3.2997055053710938, + "learning_rate": 9.46595801013954e-05, + "loss": 1.2213, + "step": 700000 + }, + { + "epoch": 20.57, + "grad_norm": 1.9309003353118896, + "learning_rate": 9.451248811006402e-05, + "loss": 1.2163, + "step": 700500 + }, + { + "epoch": 20.58, + "grad_norm": 2.734384775161743, + "learning_rate": 9.436539611873265e-05, + "loss": 1.1912, + "step": 701000 + }, + { + "epoch": 20.6, + "grad_norm": 2.1459152698516846, + "learning_rate": 9.421830412740127e-05, + "loss": 1.2343, + "step": 701500 + }, + { + "epoch": 20.61, + "grad_norm": 2.039071798324585, + "learning_rate": 9.407121213606989e-05, + "loss": 1.228, + "step": 702000 + }, + { + "epoch": 20.63, + "grad_norm": 3.1236155033111572, + "learning_rate": 9.392412014473851e-05, + "loss": 1.2362, + "step": 702500 + }, + { + "epoch": 20.64, + "grad_norm": 6.471434593200684, + "learning_rate": 9.377702815340713e-05, + "loss": 1.2159, + "step": 703000 + }, + { + "epoch": 20.66, + "grad_norm": 3.2641398906707764, + "learning_rate": 9.362993616207576e-05, + "loss": 1.1702, + "step": 703500 + }, + { + "epoch": 20.67, + "grad_norm": 2.0481953620910645, + "learning_rate": 9.348284417074437e-05, + "loss": 1.218, + "step": 704000 + }, + { + "epoch": 20.68, + "grad_norm": 1.6071834564208984, + 
"learning_rate": 9.3335752179413e-05, + "loss": 1.2422, + "step": 704500 + }, + { + "epoch": 20.7, + "grad_norm": 1.8020005226135254, + "learning_rate": 9.318866018808163e-05, + "loss": 1.2268, + "step": 705000 + }, + { + "epoch": 20.71, + "grad_norm": 4.970008373260498, + "learning_rate": 9.304156819675025e-05, + "loss": 1.2093, + "step": 705500 + }, + { + "epoch": 20.73, + "grad_norm": 2.152195453643799, + "learning_rate": 9.289447620541886e-05, + "loss": 1.2361, + "step": 706000 + }, + { + "epoch": 20.74, + "grad_norm": 5.956038475036621, + "learning_rate": 9.274738421408748e-05, + "loss": 1.1908, + "step": 706500 + }, + { + "epoch": 20.76, + "grad_norm": 2.429544687271118, + "learning_rate": 9.260029222275611e-05, + "loss": 1.1805, + "step": 707000 + }, + { + "epoch": 20.77, + "grad_norm": 2.6851959228515625, + "learning_rate": 9.245320023142473e-05, + "loss": 1.204, + "step": 707500 + }, + { + "epoch": 20.79, + "grad_norm": 2.6581480503082275, + "learning_rate": 9.230610824009334e-05, + "loss": 1.2357, + "step": 708000 + }, + { + "epoch": 20.8, + "grad_norm": 2.938990354537964, + "learning_rate": 9.215901624876196e-05, + "loss": 1.2326, + "step": 708500 + }, + { + "epoch": 20.82, + "grad_norm": 1.9101125001907349, + "learning_rate": 9.20119242574306e-05, + "loss": 1.2013, + "step": 709000 + }, + { + "epoch": 20.83, + "grad_norm": 3.4203646183013916, + "learning_rate": 9.186483226609922e-05, + "loss": 1.2021, + "step": 709500 + }, + { + "epoch": 20.85, + "grad_norm": 2.2764744758605957, + "learning_rate": 9.171774027476783e-05, + "loss": 1.223, + "step": 710000 + }, + { + "epoch": 20.86, + "grad_norm": 1.5339056253433228, + "learning_rate": 9.157064828343645e-05, + "loss": 1.2001, + "step": 710500 + }, + { + "epoch": 20.88, + "grad_norm": 1.3566075563430786, + "learning_rate": 9.142355629210508e-05, + "loss": 1.2324, + "step": 711000 + }, + { + "epoch": 20.89, + "grad_norm": 1.5568170547485352, + "learning_rate": 9.12764643007737e-05, + "loss": 1.2065, + "step": 711500 + }, + { + "epoch": 20.9, + "grad_norm": 26.298019409179688, + "learning_rate": 9.112937230944231e-05, + "loss": 1.2283, + "step": 712000 + }, + { + "epoch": 20.92, + "grad_norm": 1.2808449268341064, + "learning_rate": 9.098228031811093e-05, + "loss": 1.2531, + "step": 712500 + }, + { + "epoch": 20.93, + "grad_norm": 1.9005004167556763, + "learning_rate": 9.083518832677957e-05, + "loss": 1.2108, + "step": 713000 + }, + { + "epoch": 20.95, + "grad_norm": 6.3783721923828125, + "learning_rate": 9.068809633544819e-05, + "loss": 1.2015, + "step": 713500 + }, + { + "epoch": 20.96, + "grad_norm": 2.5035910606384277, + "learning_rate": 9.054100434411681e-05, + "loss": 1.2342, + "step": 714000 + }, + { + "epoch": 20.98, + "grad_norm": 9.022185325622559, + "learning_rate": 9.039391235278542e-05, + "loss": 1.2176, + "step": 714500 + }, + { + "epoch": 20.99, + "grad_norm": 1.442610502243042, + "learning_rate": 9.024682036145405e-05, + "loss": 1.2406, + "step": 715000 + }, + { + "epoch": 21.01, + "grad_norm": 2.419548273086548, + "learning_rate": 9.009972837012267e-05, + "loss": 1.1987, + "step": 715500 + }, + { + "epoch": 21.02, + "grad_norm": 9.644920349121094, + "learning_rate": 8.995263637879129e-05, + "loss": 1.141, + "step": 716000 + }, + { + "epoch": 21.04, + "grad_norm": 2.2750537395477295, + "learning_rate": 8.98055443874599e-05, + "loss": 1.1621, + "step": 716500 + }, + { + "epoch": 21.05, + "grad_norm": 12.630777359008789, + "learning_rate": 8.965845239612853e-05, + "loss": 1.1495, + "step": 717000 + }, + { + "epoch": 
21.07, + "grad_norm": 1.7819632291793823, + "learning_rate": 8.951136040479716e-05, + "loss": 1.1687, + "step": 717500 + }, + { + "epoch": 21.08, + "grad_norm": 2.4531936645507812, + "learning_rate": 8.936426841346578e-05, + "loss": 1.1722, + "step": 718000 + }, + { + "epoch": 21.1, + "grad_norm": 4.26564884185791, + "learning_rate": 8.921717642213438e-05, + "loss": 1.1804, + "step": 718500 + }, + { + "epoch": 21.11, + "grad_norm": 3.8713653087615967, + "learning_rate": 8.907008443080302e-05, + "loss": 1.1954, + "step": 719000 + }, + { + "epoch": 21.12, + "grad_norm": 2.3605353832244873, + "learning_rate": 8.892299243947164e-05, + "loss": 1.1688, + "step": 719500 + }, + { + "epoch": 21.14, + "grad_norm": 2.322021007537842, + "learning_rate": 8.877590044814026e-05, + "loss": 1.1403, + "step": 720000 + }, + { + "epoch": 21.15, + "grad_norm": 2.5785558223724365, + "learning_rate": 8.862880845680887e-05, + "loss": 1.1812, + "step": 720500 + }, + { + "epoch": 21.17, + "grad_norm": 1.8795828819274902, + "learning_rate": 8.84817164654775e-05, + "loss": 1.1756, + "step": 721000 + }, + { + "epoch": 21.18, + "grad_norm": 1.7690235376358032, + "learning_rate": 8.833462447414613e-05, + "loss": 1.1592, + "step": 721500 + }, + { + "epoch": 21.2, + "grad_norm": 2.3973443508148193, + "learning_rate": 8.818753248281475e-05, + "loss": 1.1586, + "step": 722000 + }, + { + "epoch": 21.21, + "grad_norm": 11.784666061401367, + "learning_rate": 8.804044049148335e-05, + "loss": 1.1522, + "step": 722500 + }, + { + "epoch": 21.23, + "grad_norm": 1.8928039073944092, + "learning_rate": 8.789334850015199e-05, + "loss": 1.1739, + "step": 723000 + }, + { + "epoch": 21.24, + "grad_norm": 28.14616584777832, + "learning_rate": 8.774625650882061e-05, + "loss": 1.1844, + "step": 723500 + }, + { + "epoch": 21.26, + "grad_norm": 22.186017990112305, + "learning_rate": 8.759916451748923e-05, + "loss": 1.1501, + "step": 724000 + }, + { + "epoch": 21.27, + "grad_norm": 3.089933395385742, + "learning_rate": 8.745207252615785e-05, + "loss": 1.1687, + "step": 724500 + }, + { + "epoch": 21.29, + "grad_norm": 3.315870523452759, + "learning_rate": 8.730498053482647e-05, + "loss": 1.1695, + "step": 725000 + }, + { + "epoch": 21.3, + "grad_norm": 2.118852138519287, + "learning_rate": 8.71578885434951e-05, + "loss": 1.1831, + "step": 725500 + }, + { + "epoch": 21.32, + "grad_norm": 2.3328022956848145, + "learning_rate": 8.701079655216372e-05, + "loss": 1.1972, + "step": 726000 + }, + { + "epoch": 21.33, + "grad_norm": 2.6026008129119873, + "learning_rate": 8.686370456083234e-05, + "loss": 1.177, + "step": 726500 + }, + { + "epoch": 21.34, + "grad_norm": 1.873986840248108, + "learning_rate": 8.671661256950097e-05, + "loss": 1.1856, + "step": 727000 + }, + { + "epoch": 21.36, + "grad_norm": 2.4924209117889404, + "learning_rate": 8.656952057816958e-05, + "loss": 1.1587, + "step": 727500 + }, + { + "epoch": 21.37, + "grad_norm": 3.228402614593506, + "learning_rate": 8.64224285868382e-05, + "loss": 1.1654, + "step": 728000 + }, + { + "epoch": 21.39, + "grad_norm": 2.2008731365203857, + "learning_rate": 8.627533659550682e-05, + "loss": 1.1953, + "step": 728500 + }, + { + "epoch": 21.4, + "grad_norm": 16.22712516784668, + "learning_rate": 8.612824460417546e-05, + "loss": 1.1871, + "step": 729000 + }, + { + "epoch": 21.42, + "grad_norm": 4.7584919929504395, + "learning_rate": 8.598115261284406e-05, + "loss": 1.1865, + "step": 729500 + }, + { + "epoch": 21.43, + "grad_norm": 3.2225780487060547, + "learning_rate": 8.583406062151268e-05, + "loss": 
1.1776, + "step": 730000 + }, + { + "epoch": 21.45, + "grad_norm": 5.469396591186523, + "learning_rate": 8.56869686301813e-05, + "loss": 1.1897, + "step": 730500 + }, + { + "epoch": 21.46, + "grad_norm": 1.8942010402679443, + "learning_rate": 8.553987663884994e-05, + "loss": 1.1484, + "step": 731000 + }, + { + "epoch": 21.48, + "grad_norm": 2.122103452682495, + "learning_rate": 8.539278464751855e-05, + "loss": 1.1707, + "step": 731500 + }, + { + "epoch": 21.49, + "grad_norm": 3.3333709239959717, + "learning_rate": 8.524569265618717e-05, + "loss": 1.1471, + "step": 732000 + }, + { + "epoch": 21.51, + "grad_norm": 3.002875566482544, + "learning_rate": 8.509860066485579e-05, + "loss": 1.1785, + "step": 732500 + }, + { + "epoch": 21.52, + "grad_norm": 3.3468096256256104, + "learning_rate": 8.495150867352442e-05, + "loss": 1.1705, + "step": 733000 + }, + { + "epoch": 21.54, + "grad_norm": 20.700353622436523, + "learning_rate": 8.480441668219303e-05, + "loss": 1.1792, + "step": 733500 + }, + { + "epoch": 21.55, + "grad_norm": 5.712481498718262, + "learning_rate": 8.465732469086165e-05, + "loss": 1.1633, + "step": 734000 + }, + { + "epoch": 21.57, + "grad_norm": 2.2254440784454346, + "learning_rate": 8.451023269953027e-05, + "loss": 1.1809, + "step": 734500 + }, + { + "epoch": 21.58, + "grad_norm": 5.889398574829102, + "learning_rate": 8.436314070819891e-05, + "loss": 1.1732, + "step": 735000 + }, + { + "epoch": 21.59, + "grad_norm": 6.2943572998046875, + "learning_rate": 8.421604871686753e-05, + "loss": 1.1896, + "step": 735500 + }, + { + "epoch": 21.61, + "grad_norm": 2.1915676593780518, + "learning_rate": 8.406895672553614e-05, + "loss": 1.1884, + "step": 736000 + }, + { + "epoch": 21.62, + "grad_norm": 2.950507164001465, + "learning_rate": 8.392186473420476e-05, + "loss": 1.1825, + "step": 736500 + }, + { + "epoch": 21.64, + "grad_norm": 2.338834047317505, + "learning_rate": 8.37747727428734e-05, + "loss": 1.1787, + "step": 737000 + }, + { + "epoch": 21.65, + "grad_norm": 2.806655168533325, + "learning_rate": 8.362768075154201e-05, + "loss": 1.1988, + "step": 737500 + }, + { + "epoch": 21.67, + "grad_norm": 2.509188652038574, + "learning_rate": 8.348058876021062e-05, + "loss": 1.1678, + "step": 738000 + }, + { + "epoch": 21.68, + "grad_norm": 2.3886609077453613, + "learning_rate": 8.333349676887924e-05, + "loss": 1.2138, + "step": 738500 + }, + { + "epoch": 21.7, + "grad_norm": 8.767037391662598, + "learning_rate": 8.318640477754788e-05, + "loss": 1.1682, + "step": 739000 + }, + { + "epoch": 21.71, + "grad_norm": 2.7434234619140625, + "learning_rate": 8.30393127862165e-05, + "loss": 1.1675, + "step": 739500 + }, + { + "epoch": 21.73, + "grad_norm": 3.837946891784668, + "learning_rate": 8.28922207948851e-05, + "loss": 1.1746, + "step": 740000 + }, + { + "epoch": 21.74, + "grad_norm": 2.2261147499084473, + "learning_rate": 8.274512880355373e-05, + "loss": 1.1805, + "step": 740500 + }, + { + "epoch": 21.76, + "grad_norm": 3.7343368530273438, + "learning_rate": 8.259803681222236e-05, + "loss": 1.1829, + "step": 741000 + }, + { + "epoch": 21.77, + "grad_norm": 2.696150779724121, + "learning_rate": 8.245094482089098e-05, + "loss": 1.1791, + "step": 741500 + }, + { + "epoch": 21.79, + "grad_norm": 3.374664783477783, + "learning_rate": 8.230385282955959e-05, + "loss": 1.186, + "step": 742000 + }, + { + "epoch": 21.8, + "grad_norm": 2.0599725246429443, + "learning_rate": 8.215676083822821e-05, + "loss": 1.1704, + "step": 742500 + }, + { + "epoch": 21.81, + "grad_norm": 2.960590362548828, + 
"learning_rate": 8.200966884689685e-05, + "loss": 1.2078, + "step": 743000 + }, + { + "epoch": 21.83, + "grad_norm": 3.514355421066284, + "learning_rate": 8.186257685556547e-05, + "loss": 1.1853, + "step": 743500 + }, + { + "epoch": 21.84, + "grad_norm": 3.7044436931610107, + "learning_rate": 8.171548486423408e-05, + "loss": 1.1775, + "step": 744000 + }, + { + "epoch": 21.86, + "grad_norm": 2.7482707500457764, + "learning_rate": 8.15683928729027e-05, + "loss": 1.1927, + "step": 744500 + }, + { + "epoch": 21.87, + "grad_norm": 3.462541103363037, + "learning_rate": 8.142130088157133e-05, + "loss": 1.155, + "step": 745000 + }, + { + "epoch": 21.89, + "grad_norm": 1.3520176410675049, + "learning_rate": 8.127420889023995e-05, + "loss": 1.1887, + "step": 745500 + }, + { + "epoch": 21.9, + "grad_norm": 3.2204971313476562, + "learning_rate": 8.112711689890857e-05, + "loss": 1.17, + "step": 746000 + }, + { + "epoch": 21.92, + "grad_norm": 2.8493075370788574, + "learning_rate": 8.098002490757718e-05, + "loss": 1.1804, + "step": 746500 + }, + { + "epoch": 21.93, + "grad_norm": 28.339750289916992, + "learning_rate": 8.083293291624582e-05, + "loss": 1.1832, + "step": 747000 + }, + { + "epoch": 21.95, + "grad_norm": 3.3736674785614014, + "learning_rate": 8.068584092491444e-05, + "loss": 1.1903, + "step": 747500 + }, + { + "epoch": 21.96, + "grad_norm": 2.166250228881836, + "learning_rate": 8.053874893358306e-05, + "loss": 1.1712, + "step": 748000 + }, + { + "epoch": 21.98, + "grad_norm": 3.713568925857544, + "learning_rate": 8.039165694225167e-05, + "loss": 1.1896, + "step": 748500 + }, + { + "epoch": 21.99, + "grad_norm": 3.077561140060425, + "learning_rate": 8.02445649509203e-05, + "loss": 1.1999, + "step": 749000 + }, + { + "epoch": 22.01, + "grad_norm": 1.1793521642684937, + "learning_rate": 8.009747295958892e-05, + "loss": 1.166, + "step": 749500 + }, + { + "epoch": 22.02, + "grad_norm": 1.4397152662277222, + "learning_rate": 7.995038096825754e-05, + "loss": 1.1327, + "step": 750000 + }, + { + "epoch": 22.03, + "grad_norm": 6.889225959777832, + "learning_rate": 7.980328897692615e-05, + "loss": 1.1533, + "step": 750500 + }, + { + "epoch": 22.05, + "grad_norm": 1.7650978565216064, + "learning_rate": 7.965619698559479e-05, + "loss": 1.1301, + "step": 751000 + }, + { + "epoch": 22.06, + "grad_norm": 1.922248125076294, + "learning_rate": 7.95091049942634e-05, + "loss": 1.1313, + "step": 751500 + }, + { + "epoch": 22.08, + "grad_norm": 9.232244491577148, + "learning_rate": 7.936201300293203e-05, + "loss": 1.1601, + "step": 752000 + }, + { + "epoch": 22.09, + "grad_norm": 2.162410259246826, + "learning_rate": 7.921492101160063e-05, + "loss": 1.1394, + "step": 752500 + }, + { + "epoch": 22.11, + "grad_norm": 2.0230438709259033, + "learning_rate": 7.906782902026927e-05, + "loss": 1.131, + "step": 753000 + }, + { + "epoch": 22.12, + "grad_norm": 3.8333919048309326, + "learning_rate": 7.892073702893789e-05, + "loss": 1.1264, + "step": 753500 + }, + { + "epoch": 22.14, + "grad_norm": 1.487121343612671, + "learning_rate": 7.877364503760651e-05, + "loss": 1.1127, + "step": 754000 + }, + { + "epoch": 22.15, + "grad_norm": 2.5997982025146484, + "learning_rate": 7.862655304627512e-05, + "loss": 1.1072, + "step": 754500 + }, + { + "epoch": 22.17, + "grad_norm": 2.5122156143188477, + "learning_rate": 7.847946105494375e-05, + "loss": 1.1091, + "step": 755000 + }, + { + "epoch": 22.18, + "grad_norm": 2.6082444190979004, + "learning_rate": 7.833236906361238e-05, + "loss": 1.142, + "step": 755500 + }, + { + "epoch": 
22.2, + "grad_norm": 8.16508960723877, + "learning_rate": 7.8185277072281e-05, + "loss": 1.1247, + "step": 756000 + }, + { + "epoch": 22.21, + "grad_norm": 3.061530113220215, + "learning_rate": 7.803818508094962e-05, + "loss": 1.1526, + "step": 756500 + }, + { + "epoch": 22.23, + "grad_norm": 3.9026131629943848, + "learning_rate": 7.789109308961824e-05, + "loss": 1.152, + "step": 757000 + }, + { + "epoch": 22.24, + "grad_norm": 2.3439905643463135, + "learning_rate": 7.774400109828686e-05, + "loss": 1.1416, + "step": 757500 + }, + { + "epoch": 22.26, + "grad_norm": 0.9024741053581238, + "learning_rate": 7.759690910695548e-05, + "loss": 1.1272, + "step": 758000 + }, + { + "epoch": 22.27, + "grad_norm": 3.3680572509765625, + "learning_rate": 7.74498171156241e-05, + "loss": 1.1445, + "step": 758500 + }, + { + "epoch": 22.28, + "grad_norm": 1.695378065109253, + "learning_rate": 7.730272512429274e-05, + "loss": 1.1373, + "step": 759000 + }, + { + "epoch": 22.3, + "grad_norm": 4.116308689117432, + "learning_rate": 7.715563313296134e-05, + "loss": 1.1453, + "step": 759500 + }, + { + "epoch": 22.31, + "grad_norm": 2.014496088027954, + "learning_rate": 7.700854114162997e-05, + "loss": 1.146, + "step": 760000 + }, + { + "epoch": 22.33, + "grad_norm": 1.5276461839675903, + "learning_rate": 7.686144915029859e-05, + "loss": 1.13, + "step": 760500 + }, + { + "epoch": 22.34, + "grad_norm": 2.514599084854126, + "learning_rate": 7.671435715896722e-05, + "loss": 1.142, + "step": 761000 + }, + { + "epoch": 22.36, + "grad_norm": 2.2747581005096436, + "learning_rate": 7.656726516763583e-05, + "loss": 1.1552, + "step": 761500 + }, + { + "epoch": 22.37, + "grad_norm": 2.406036615371704, + "learning_rate": 7.642017317630445e-05, + "loss": 1.1453, + "step": 762000 + }, + { + "epoch": 22.39, + "grad_norm": 2.231992721557617, + "learning_rate": 7.627308118497307e-05, + "loss": 1.1386, + "step": 762500 + }, + { + "epoch": 22.4, + "grad_norm": 3.1717188358306885, + "learning_rate": 7.61259891936417e-05, + "loss": 1.1549, + "step": 763000 + }, + { + "epoch": 22.42, + "grad_norm": 22.27747917175293, + "learning_rate": 7.597889720231031e-05, + "loss": 1.1142, + "step": 763500 + }, + { + "epoch": 22.43, + "grad_norm": 2.7918033599853516, + "learning_rate": 7.583180521097893e-05, + "loss": 1.1108, + "step": 764000 + }, + { + "epoch": 22.45, + "grad_norm": 1.4565542936325073, + "learning_rate": 7.568471321964757e-05, + "loss": 1.1509, + "step": 764500 + }, + { + "epoch": 22.46, + "grad_norm": 3.1026453971862793, + "learning_rate": 7.553762122831619e-05, + "loss": 1.1513, + "step": 765000 + }, + { + "epoch": 22.48, + "grad_norm": 1.6490851640701294, + "learning_rate": 7.53905292369848e-05, + "loss": 1.1242, + "step": 765500 + }, + { + "epoch": 22.49, + "grad_norm": 1.7027279138565063, + "learning_rate": 7.524343724565342e-05, + "loss": 1.1561, + "step": 766000 + }, + { + "epoch": 22.5, + "grad_norm": 1.2775441408157349, + "learning_rate": 7.509634525432205e-05, + "loss": 1.1168, + "step": 766500 + }, + { + "epoch": 22.52, + "grad_norm": 5.64236307144165, + "learning_rate": 7.494925326299067e-05, + "loss": 1.1465, + "step": 767000 + }, + { + "epoch": 22.53, + "grad_norm": 2.019430637359619, + "learning_rate": 7.480216127165928e-05, + "loss": 1.1162, + "step": 767500 + }, + { + "epoch": 22.55, + "grad_norm": 1.3938747644424438, + "learning_rate": 7.465506928032792e-05, + "loss": 1.1253, + "step": 768000 + }, + { + "epoch": 22.56, + "grad_norm": 2.122945785522461, + "learning_rate": 7.450797728899654e-05, + "loss": 1.1409, + 
"step": 768500 + }, + { + "epoch": 22.58, + "grad_norm": 1.826390266418457, + "learning_rate": 7.436088529766516e-05, + "loss": 1.1473, + "step": 769000 + }, + { + "epoch": 22.59, + "grad_norm": 1.8003076314926147, + "learning_rate": 7.421379330633378e-05, + "loss": 1.1827, + "step": 769500 + }, + { + "epoch": 22.61, + "grad_norm": 2.6314899921417236, + "learning_rate": 7.40667013150024e-05, + "loss": 1.1426, + "step": 770000 + }, + { + "epoch": 22.62, + "grad_norm": 2.7484140396118164, + "learning_rate": 7.391960932367102e-05, + "loss": 1.1603, + "step": 770500 + }, + { + "epoch": 22.64, + "grad_norm": 1.5445499420166016, + "learning_rate": 7.377251733233964e-05, + "loss": 1.1609, + "step": 771000 + }, + { + "epoch": 22.65, + "grad_norm": 2.3308262825012207, + "learning_rate": 7.362542534100826e-05, + "loss": 1.1126, + "step": 771500 + }, + { + "epoch": 22.67, + "grad_norm": 2.675467014312744, + "learning_rate": 7.347833334967689e-05, + "loss": 1.1717, + "step": 772000 + }, + { + "epoch": 22.68, + "grad_norm": 7.849360466003418, + "learning_rate": 7.333124135834551e-05, + "loss": 1.1304, + "step": 772500 + }, + { + "epoch": 22.7, + "grad_norm": 1.596022129058838, + "learning_rate": 7.318414936701413e-05, + "loss": 1.1563, + "step": 773000 + }, + { + "epoch": 22.71, + "grad_norm": 1.8678432703018188, + "learning_rate": 7.303705737568275e-05, + "loss": 1.1668, + "step": 773500 + }, + { + "epoch": 22.72, + "grad_norm": 1.3757662773132324, + "learning_rate": 7.288996538435137e-05, + "loss": 1.1484, + "step": 774000 + }, + { + "epoch": 22.74, + "grad_norm": 2.819344997406006, + "learning_rate": 7.274287339301999e-05, + "loss": 1.1213, + "step": 774500 + }, + { + "epoch": 22.75, + "grad_norm": 2.370375871658325, + "learning_rate": 7.259578140168861e-05, + "loss": 1.1654, + "step": 775000 + }, + { + "epoch": 22.77, + "grad_norm": 3.6313276290893555, + "learning_rate": 7.244868941035723e-05, + "loss": 1.13, + "step": 775500 + }, + { + "epoch": 22.78, + "grad_norm": 1.737453579902649, + "learning_rate": 7.230159741902585e-05, + "loss": 1.1283, + "step": 776000 + }, + { + "epoch": 22.8, + "grad_norm": 2.530315637588501, + "learning_rate": 7.215450542769448e-05, + "loss": 1.1413, + "step": 776500 + }, + { + "epoch": 22.81, + "grad_norm": 2.049701452255249, + "learning_rate": 7.20074134363631e-05, + "loss": 1.1863, + "step": 777000 + }, + { + "epoch": 22.83, + "grad_norm": 2.112698554992676, + "learning_rate": 7.186032144503172e-05, + "loss": 1.1217, + "step": 777500 + }, + { + "epoch": 22.84, + "grad_norm": 1.982596755027771, + "learning_rate": 7.171322945370034e-05, + "loss": 1.151, + "step": 778000 + }, + { + "epoch": 22.86, + "grad_norm": 2.8483023643493652, + "learning_rate": 7.156613746236896e-05, + "loss": 1.1371, + "step": 778500 + }, + { + "epoch": 22.87, + "grad_norm": 2.3431105613708496, + "learning_rate": 7.141904547103758e-05, + "loss": 1.1549, + "step": 779000 + }, + { + "epoch": 22.89, + "grad_norm": 2.256725549697876, + "learning_rate": 7.12719534797062e-05, + "loss": 1.1387, + "step": 779500 + }, + { + "epoch": 22.9, + "grad_norm": 3.2769088745117188, + "learning_rate": 7.112486148837482e-05, + "loss": 1.1365, + "step": 780000 + }, + { + "epoch": 22.92, + "grad_norm": 7.680245399475098, + "learning_rate": 7.097776949704344e-05, + "loss": 1.1524, + "step": 780500 + }, + { + "epoch": 22.93, + "grad_norm": 2.0272703170776367, + "learning_rate": 7.083067750571207e-05, + "loss": 1.1483, + "step": 781000 + }, + { + "epoch": 22.95, + "grad_norm": 3.6463356018066406, + "learning_rate": 
7.068358551438069e-05, + "loss": 1.1188, + "step": 781500 + }, + { + "epoch": 22.96, + "grad_norm": 2.499232769012451, + "learning_rate": 7.053649352304931e-05, + "loss": 1.1434, + "step": 782000 + }, + { + "epoch": 22.97, + "grad_norm": 2.1540205478668213, + "learning_rate": 7.038940153171793e-05, + "loss": 1.1394, + "step": 782500 + }, + { + "epoch": 22.99, + "grad_norm": 2.569671154022217, + "learning_rate": 7.024230954038655e-05, + "loss": 1.1581, + "step": 783000 + }, + { + "epoch": 23.0, + "grad_norm": 5.3423333168029785, + "learning_rate": 7.009521754905517e-05, + "loss": 1.1392, + "step": 783500 + }, + { + "epoch": 23.02, + "grad_norm": 1.8991897106170654, + "learning_rate": 6.994812555772379e-05, + "loss": 1.0877, + "step": 784000 + }, + { + "epoch": 23.03, + "grad_norm": 2.094095230102539, + "learning_rate": 6.980103356639241e-05, + "loss": 1.1106, + "step": 784500 + }, + { + "epoch": 23.05, + "grad_norm": 2.4271163940429688, + "learning_rate": 6.965394157506104e-05, + "loss": 1.0893, + "step": 785000 + }, + { + "epoch": 23.06, + "grad_norm": 3.5016427040100098, + "learning_rate": 6.950684958372966e-05, + "loss": 1.1119, + "step": 785500 + }, + { + "epoch": 23.08, + "grad_norm": 3.388533592224121, + "learning_rate": 6.935975759239828e-05, + "loss": 1.1135, + "step": 786000 + }, + { + "epoch": 23.09, + "grad_norm": 1.610478401184082, + "learning_rate": 6.92126656010669e-05, + "loss": 1.1064, + "step": 786500 + }, + { + "epoch": 23.11, + "grad_norm": 2.6857564449310303, + "learning_rate": 6.906557360973552e-05, + "loss": 1.105, + "step": 787000 + }, + { + "epoch": 23.12, + "grad_norm": 3.785341739654541, + "learning_rate": 6.891848161840414e-05, + "loss": 1.1109, + "step": 787500 + }, + { + "epoch": 23.14, + "grad_norm": 1.6604160070419312, + "learning_rate": 6.877138962707276e-05, + "loss": 1.1124, + "step": 788000 + }, + { + "epoch": 23.15, + "grad_norm": 1.9755994081497192, + "learning_rate": 6.862429763574138e-05, + "loss": 1.1034, + "step": 788500 + }, + { + "epoch": 23.17, + "grad_norm": 2.989527940750122, + "learning_rate": 6.847720564441e-05, + "loss": 1.1055, + "step": 789000 + }, + { + "epoch": 23.18, + "grad_norm": 2.2718911170959473, + "learning_rate": 6.833011365307863e-05, + "loss": 1.104, + "step": 789500 + }, + { + "epoch": 23.19, + "grad_norm": 4.303855895996094, + "learning_rate": 6.818302166174725e-05, + "loss": 1.1174, + "step": 790000 + }, + { + "epoch": 23.21, + "grad_norm": 2.6747548580169678, + "learning_rate": 6.803592967041587e-05, + "loss": 1.1046, + "step": 790500 + }, + { + "epoch": 23.22, + "grad_norm": 2.6597251892089844, + "learning_rate": 6.78888376790845e-05, + "loss": 1.1246, + "step": 791000 + }, + { + "epoch": 23.24, + "grad_norm": 1.6096960306167603, + "learning_rate": 6.774174568775311e-05, + "loss": 1.1103, + "step": 791500 + }, + { + "epoch": 23.25, + "grad_norm": 1.8474167585372925, + "learning_rate": 6.759465369642174e-05, + "loss": 1.1136, + "step": 792000 + }, + { + "epoch": 23.27, + "grad_norm": 6.043608665466309, + "learning_rate": 6.744756170509035e-05, + "loss": 1.1118, + "step": 792500 + }, + { + "epoch": 23.28, + "grad_norm": 10.027447700500488, + "learning_rate": 6.730046971375899e-05, + "loss": 1.115, + "step": 793000 + }, + { + "epoch": 23.3, + "grad_norm": 2.5558454990386963, + "learning_rate": 6.71533777224276e-05, + "loss": 1.0993, + "step": 793500 + }, + { + "epoch": 23.31, + "grad_norm": 2.075737476348877, + "learning_rate": 6.700628573109623e-05, + "loss": 1.1111, + "step": 794000 + }, + { + "epoch": 23.33, + "grad_norm": 
2.4479551315307617, + "learning_rate": 6.685919373976484e-05, + "loss": 1.0848, + "step": 794500 + }, + { + "epoch": 23.34, + "grad_norm": 2.503129243850708, + "learning_rate": 6.671210174843347e-05, + "loss": 1.1315, + "step": 795000 + }, + { + "epoch": 23.36, + "grad_norm": 2.2025949954986572, + "learning_rate": 6.656500975710208e-05, + "loss": 1.1061, + "step": 795500 + }, + { + "epoch": 23.37, + "grad_norm": 2.5942320823669434, + "learning_rate": 6.641791776577071e-05, + "loss": 1.1244, + "step": 796000 + }, + { + "epoch": 23.39, + "grad_norm": 2.2347915172576904, + "learning_rate": 6.627082577443932e-05, + "loss": 1.0868, + "step": 796500 + }, + { + "epoch": 23.4, + "grad_norm": 2.3821372985839844, + "learning_rate": 6.612373378310796e-05, + "loss": 1.1175, + "step": 797000 + }, + { + "epoch": 23.41, + "grad_norm": 2.4445548057556152, + "learning_rate": 6.597664179177656e-05, + "loss": 1.1163, + "step": 797500 + }, + { + "epoch": 23.43, + "grad_norm": 4.338953018188477, + "learning_rate": 6.58295498004452e-05, + "loss": 1.104, + "step": 798000 + }, + { + "epoch": 23.44, + "grad_norm": 1.766503930091858, + "learning_rate": 6.56824578091138e-05, + "loss": 1.1178, + "step": 798500 + }, + { + "epoch": 23.46, + "grad_norm": 1.7907469272613525, + "learning_rate": 6.553536581778244e-05, + "loss": 1.1183, + "step": 799000 + }, + { + "epoch": 23.47, + "grad_norm": 1.722770094871521, + "learning_rate": 6.538827382645105e-05, + "loss": 1.1009, + "step": 799500 + }, + { + "epoch": 23.49, + "grad_norm": 1.7940298318862915, + "learning_rate": 6.524118183511968e-05, + "loss": 1.0938, + "step": 800000 + }, + { + "epoch": 23.5, + "grad_norm": 1.5882244110107422, + "learning_rate": 6.50940898437883e-05, + "loss": 1.1024, + "step": 800500 + }, + { + "epoch": 23.52, + "grad_norm": 1.7875205278396606, + "learning_rate": 6.494699785245692e-05, + "loss": 1.092, + "step": 801000 + }, + { + "epoch": 23.53, + "grad_norm": 2.0258092880249023, + "learning_rate": 6.479990586112555e-05, + "loss": 1.1136, + "step": 801500 + }, + { + "epoch": 23.55, + "grad_norm": 1.6038285493850708, + "learning_rate": 6.465281386979417e-05, + "loss": 1.1013, + "step": 802000 + }, + { + "epoch": 23.56, + "grad_norm": 2.411367654800415, + "learning_rate": 6.450572187846279e-05, + "loss": 1.0971, + "step": 802500 + }, + { + "epoch": 23.58, + "grad_norm": 18.325363159179688, + "learning_rate": 6.435862988713141e-05, + "loss": 1.1292, + "step": 803000 + }, + { + "epoch": 23.59, + "grad_norm": 2.520692825317383, + "learning_rate": 6.421153789580003e-05, + "loss": 1.1108, + "step": 803500 + }, + { + "epoch": 23.61, + "grad_norm": 2.115523099899292, + "learning_rate": 6.406444590446865e-05, + "loss": 1.0878, + "step": 804000 + }, + { + "epoch": 23.62, + "grad_norm": 2.4668285846710205, + "learning_rate": 6.391735391313727e-05, + "loss": 1.1089, + "step": 804500 + }, + { + "epoch": 23.64, + "grad_norm": 2.2170443534851074, + "learning_rate": 6.37702619218059e-05, + "loss": 1.1189, + "step": 805000 + }, + { + "epoch": 23.65, + "grad_norm": 2.2153406143188477, + "learning_rate": 6.362316993047451e-05, + "loss": 1.1053, + "step": 805500 + }, + { + "epoch": 23.66, + "grad_norm": 2.937535285949707, + "learning_rate": 6.347607793914314e-05, + "loss": 1.108, + "step": 806000 + }, + { + "epoch": 23.68, + "grad_norm": 1.9660471677780151, + "learning_rate": 6.332898594781176e-05, + "loss": 1.1203, + "step": 806500 + }, + { + "epoch": 23.69, + "grad_norm": 3.5854222774505615, + "learning_rate": 6.318189395648038e-05, + "loss": 1.1082, + "step": 807000 
+ }, + { + "epoch": 23.71, + "grad_norm": 6.527500152587891, + "learning_rate": 6.3034801965149e-05, + "loss": 1.1146, + "step": 807500 + }, + { + "epoch": 23.72, + "grad_norm": 7.003795623779297, + "learning_rate": 6.288770997381762e-05, + "loss": 1.1137, + "step": 808000 + }, + { + "epoch": 23.74, + "grad_norm": 2.606776714324951, + "learning_rate": 6.274061798248624e-05, + "loss": 1.0925, + "step": 808500 + }, + { + "epoch": 23.75, + "grad_norm": 3.0098581314086914, + "learning_rate": 6.259352599115486e-05, + "loss": 1.1041, + "step": 809000 + }, + { + "epoch": 23.77, + "grad_norm": 1.7633291482925415, + "learning_rate": 6.244643399982348e-05, + "loss": 1.1257, + "step": 809500 + }, + { + "epoch": 23.78, + "grad_norm": 2.6187331676483154, + "learning_rate": 6.22993420084921e-05, + "loss": 1.0956, + "step": 810000 + }, + { + "epoch": 23.8, + "grad_norm": 1.919297456741333, + "learning_rate": 6.215225001716073e-05, + "loss": 1.0954, + "step": 810500 + }, + { + "epoch": 23.81, + "grad_norm": 3.422133445739746, + "learning_rate": 6.200515802582935e-05, + "loss": 1.0997, + "step": 811000 + }, + { + "epoch": 23.83, + "grad_norm": 1.8277616500854492, + "learning_rate": 6.185806603449797e-05, + "loss": 1.1357, + "step": 811500 + }, + { + "epoch": 23.84, + "grad_norm": 1.641455054283142, + "learning_rate": 6.171097404316659e-05, + "loss": 1.1186, + "step": 812000 + }, + { + "epoch": 23.86, + "grad_norm": 2.629831075668335, + "learning_rate": 6.156388205183521e-05, + "loss": 1.0844, + "step": 812500 + }, + { + "epoch": 23.87, + "grad_norm": 2.9981462955474854, + "learning_rate": 6.141679006050383e-05, + "loss": 1.1323, + "step": 813000 + }, + { + "epoch": 23.88, + "grad_norm": 7.224218845367432, + "learning_rate": 6.126969806917245e-05, + "loss": 1.1091, + "step": 813500 + }, + { + "epoch": 23.9, + "grad_norm": 1.445144772529602, + "learning_rate": 6.112260607784107e-05, + "loss": 1.0885, + "step": 814000 + }, + { + "epoch": 23.91, + "grad_norm": 6.933565616607666, + "learning_rate": 6.0975514086509695e-05, + "loss": 1.1264, + "step": 814500 + }, + { + "epoch": 23.93, + "grad_norm": 1.5660400390625, + "learning_rate": 6.082842209517832e-05, + "loss": 1.102, + "step": 815000 + }, + { + "epoch": 23.94, + "grad_norm": 3.0154061317443848, + "learning_rate": 6.068133010384694e-05, + "loss": 1.1043, + "step": 815500 + }, + { + "epoch": 23.96, + "grad_norm": 4.825801372528076, + "learning_rate": 6.0534238112515565e-05, + "loss": 1.1289, + "step": 816000 + }, + { + "epoch": 23.97, + "grad_norm": 2.10929012298584, + "learning_rate": 6.038714612118418e-05, + "loss": 1.112, + "step": 816500 + }, + { + "epoch": 23.99, + "grad_norm": 15.725834846496582, + "learning_rate": 6.024005412985281e-05, + "loss": 1.0857, + "step": 817000 + }, + { + "epoch": 24.0, + "grad_norm": 3.745626926422119, + "learning_rate": 6.009296213852142e-05, + "loss": 1.0957, + "step": 817500 + }, + { + "epoch": 24.02, + "grad_norm": 1.5650004148483276, + "learning_rate": 5.994587014719005e-05, + "loss": 1.0553, + "step": 818000 + }, + { + "epoch": 24.03, + "grad_norm": 1.279994249343872, + "learning_rate": 5.9798778155858664e-05, + "loss": 1.0524, + "step": 818500 + }, + { + "epoch": 24.05, + "grad_norm": 1.6556503772735596, + "learning_rate": 5.965168616452729e-05, + "loss": 1.0709, + "step": 819000 + }, + { + "epoch": 24.06, + "grad_norm": 1.5529954433441162, + "learning_rate": 5.9504594173195906e-05, + "loss": 1.0952, + "step": 819500 + }, + { + "epoch": 24.08, + "grad_norm": 1.7145531177520752, + "learning_rate": 
5.9357502181864534e-05, + "loss": 1.0526, + "step": 820000 + }, + { + "epoch": 24.09, + "grad_norm": 1.5692338943481445, + "learning_rate": 5.921041019053315e-05, + "loss": 1.0493, + "step": 820500 + }, + { + "epoch": 24.1, + "grad_norm": 37.72039031982422, + "learning_rate": 5.9063318199201776e-05, + "loss": 1.068, + "step": 821000 + }, + { + "epoch": 24.12, + "grad_norm": 2.0122039318084717, + "learning_rate": 5.891622620787039e-05, + "loss": 1.0775, + "step": 821500 + }, + { + "epoch": 24.13, + "grad_norm": 5.577430248260498, + "learning_rate": 5.876913421653902e-05, + "loss": 1.1098, + "step": 822000 + }, + { + "epoch": 24.15, + "grad_norm": 3.4199016094207764, + "learning_rate": 5.8622042225207647e-05, + "loss": 1.0893, + "step": 822500 + }, + { + "epoch": 24.16, + "grad_norm": 3.654630184173584, + "learning_rate": 5.847495023387626e-05, + "loss": 1.0719, + "step": 823000 + }, + { + "epoch": 24.18, + "grad_norm": 2.0985825061798096, + "learning_rate": 5.832785824254489e-05, + "loss": 1.0561, + "step": 823500 + }, + { + "epoch": 24.19, + "grad_norm": 3.724655866622925, + "learning_rate": 5.81807662512135e-05, + "loss": 1.047, + "step": 824000 + }, + { + "epoch": 24.21, + "grad_norm": 2.0398764610290527, + "learning_rate": 5.803367425988213e-05, + "loss": 1.0806, + "step": 824500 + }, + { + "epoch": 24.22, + "grad_norm": 2.9711148738861084, + "learning_rate": 5.7886582268550745e-05, + "loss": 1.104, + "step": 825000 + }, + { + "epoch": 24.24, + "grad_norm": 1.4870119094848633, + "learning_rate": 5.773949027721937e-05, + "loss": 1.0812, + "step": 825500 + }, + { + "epoch": 24.25, + "grad_norm": 2.184882640838623, + "learning_rate": 5.759239828588799e-05, + "loss": 1.071, + "step": 826000 + }, + { + "epoch": 24.27, + "grad_norm": 2.135782241821289, + "learning_rate": 5.7445306294556616e-05, + "loss": 1.081, + "step": 826500 + }, + { + "epoch": 24.28, + "grad_norm": 1.952862024307251, + "learning_rate": 5.729821430322523e-05, + "loss": 1.0688, + "step": 827000 + }, + { + "epoch": 24.3, + "grad_norm": 3.28511381149292, + "learning_rate": 5.715112231189386e-05, + "loss": 1.0719, + "step": 827500 + }, + { + "epoch": 24.31, + "grad_norm": 3.095506191253662, + "learning_rate": 5.700403032056247e-05, + "loss": 1.1089, + "step": 828000 + }, + { + "epoch": 24.33, + "grad_norm": 1.744498372077942, + "learning_rate": 5.68569383292311e-05, + "loss": 1.0666, + "step": 828500 + }, + { + "epoch": 24.34, + "grad_norm": 1.5370303392410278, + "learning_rate": 5.6709846337899714e-05, + "loss": 1.0834, + "step": 829000 + }, + { + "epoch": 24.35, + "grad_norm": 3.65459942817688, + "learning_rate": 5.656275434656834e-05, + "loss": 1.0767, + "step": 829500 + }, + { + "epoch": 24.37, + "grad_norm": 2.0333411693573, + "learning_rate": 5.641566235523696e-05, + "loss": 1.0965, + "step": 830000 + }, + { + "epoch": 24.38, + "grad_norm": 2.9158904552459717, + "learning_rate": 5.6268570363905585e-05, + "loss": 1.0726, + "step": 830500 + }, + { + "epoch": 24.4, + "grad_norm": 2.0873067378997803, + "learning_rate": 5.6121478372574206e-05, + "loss": 1.0664, + "step": 831000 + }, + { + "epoch": 24.41, + "grad_norm": 2.0477983951568604, + "learning_rate": 5.597438638124283e-05, + "loss": 1.0588, + "step": 831500 + }, + { + "epoch": 24.43, + "grad_norm": 1.9659167528152466, + "learning_rate": 5.582729438991145e-05, + "loss": 1.0986, + "step": 832000 + }, + { + "epoch": 24.44, + "grad_norm": 2.130729913711548, + "learning_rate": 5.568020239858007e-05, + "loss": 1.0713, + "step": 832500 + }, + { + "epoch": 24.46, + 
"grad_norm": 99.76852416992188, + "learning_rate": 5.553311040724869e-05, + "loss": 1.0803, + "step": 833000 + }, + { + "epoch": 24.47, + "grad_norm": 5.594266414642334, + "learning_rate": 5.538601841591731e-05, + "loss": 1.08, + "step": 833500 + }, + { + "epoch": 24.49, + "grad_norm": 2.3737406730651855, + "learning_rate": 5.523892642458593e-05, + "loss": 1.0931, + "step": 834000 + }, + { + "epoch": 24.5, + "grad_norm": 1.6415624618530273, + "learning_rate": 5.5091834433254554e-05, + "loss": 1.0798, + "step": 834500 + }, + { + "epoch": 24.52, + "grad_norm": 7.322782516479492, + "learning_rate": 5.4944742441923175e-05, + "loss": 1.0865, + "step": 835000 + }, + { + "epoch": 24.53, + "grad_norm": 2.1509947776794434, + "learning_rate": 5.4797650450591796e-05, + "loss": 1.0782, + "step": 835500 + }, + { + "epoch": 24.55, + "grad_norm": 2.485328197479248, + "learning_rate": 5.465055845926042e-05, + "loss": 1.0585, + "step": 836000 + }, + { + "epoch": 24.56, + "grad_norm": 2.5374958515167236, + "learning_rate": 5.450346646792904e-05, + "loss": 1.0577, + "step": 836500 + }, + { + "epoch": 24.57, + "grad_norm": 2.196706533432007, + "learning_rate": 5.435637447659766e-05, + "loss": 1.0894, + "step": 837000 + }, + { + "epoch": 24.59, + "grad_norm": 2.9057724475860596, + "learning_rate": 5.420928248526629e-05, + "loss": 1.075, + "step": 837500 + }, + { + "epoch": 24.6, + "grad_norm": 2.4338793754577637, + "learning_rate": 5.40621904939349e-05, + "loss": 1.0829, + "step": 838000 + }, + { + "epoch": 24.62, + "grad_norm": 2.225640058517456, + "learning_rate": 5.391509850260353e-05, + "loss": 1.0798, + "step": 838500 + }, + { + "epoch": 24.63, + "grad_norm": 2.3546741008758545, + "learning_rate": 5.3768006511272144e-05, + "loss": 1.0887, + "step": 839000 + }, + { + "epoch": 24.65, + "grad_norm": 1.3033528327941895, + "learning_rate": 5.362091451994077e-05, + "loss": 1.0809, + "step": 839500 + }, + { + "epoch": 24.66, + "grad_norm": 2.5136497020721436, + "learning_rate": 5.3473822528609386e-05, + "loss": 1.0851, + "step": 840000 + }, + { + "epoch": 24.68, + "grad_norm": 2.0871777534484863, + "learning_rate": 5.3326730537278014e-05, + "loss": 1.0744, + "step": 840500 + }, + { + "epoch": 24.69, + "grad_norm": 5.352667808532715, + "learning_rate": 5.317963854594663e-05, + "loss": 1.102, + "step": 841000 + }, + { + "epoch": 24.71, + "grad_norm": 1.8060741424560547, + "learning_rate": 5.3032546554615256e-05, + "loss": 1.0541, + "step": 841500 + }, + { + "epoch": 24.72, + "grad_norm": 2.5160653591156006, + "learning_rate": 5.288545456328387e-05, + "loss": 1.077, + "step": 842000 + }, + { + "epoch": 24.74, + "grad_norm": 4.854517936706543, + "learning_rate": 5.27383625719525e-05, + "loss": 1.084, + "step": 842500 + }, + { + "epoch": 24.75, + "grad_norm": 7.554323196411133, + "learning_rate": 5.259127058062111e-05, + "loss": 1.0974, + "step": 843000 + }, + { + "epoch": 24.77, + "grad_norm": 1.8624228239059448, + "learning_rate": 5.244417858928974e-05, + "loss": 1.0793, + "step": 843500 + }, + { + "epoch": 24.78, + "grad_norm": 1.6777387857437134, + "learning_rate": 5.2297086597958355e-05, + "loss": 1.0746, + "step": 844000 + }, + { + "epoch": 24.79, + "grad_norm": 1.9839929342269897, + "learning_rate": 5.214999460662698e-05, + "loss": 1.0813, + "step": 844500 + }, + { + "epoch": 24.81, + "grad_norm": 1.5621155500411987, + "learning_rate": 5.20029026152956e-05, + "loss": 1.0936, + "step": 845000 + }, + { + "epoch": 24.82, + "grad_norm": 2.5677313804626465, + "learning_rate": 5.1855810623964225e-05, + "loss": 
1.0845, + "step": 845500 + }, + { + "epoch": 24.84, + "grad_norm": 1.3594130277633667, + "learning_rate": 5.170871863263284e-05, + "loss": 1.0563, + "step": 846000 + }, + { + "epoch": 24.85, + "grad_norm": 1.3905824422836304, + "learning_rate": 5.156162664130147e-05, + "loss": 1.0661, + "step": 846500 + }, + { + "epoch": 24.87, + "grad_norm": 1.6563278436660767, + "learning_rate": 5.141453464997008e-05, + "loss": 1.079, + "step": 847000 + }, + { + "epoch": 24.88, + "grad_norm": 2.1937708854675293, + "learning_rate": 5.126744265863871e-05, + "loss": 1.0714, + "step": 847500 + }, + { + "epoch": 24.9, + "grad_norm": 1.874009609222412, + "learning_rate": 5.112035066730733e-05, + "loss": 1.064, + "step": 848000 + }, + { + "epoch": 24.91, + "grad_norm": 1.4228936433792114, + "learning_rate": 5.097325867597595e-05, + "loss": 1.0954, + "step": 848500 + }, + { + "epoch": 24.93, + "grad_norm": 1.7444462776184082, + "learning_rate": 5.082616668464457e-05, + "loss": 1.0921, + "step": 849000 + }, + { + "epoch": 24.94, + "grad_norm": 186.56033325195312, + "learning_rate": 5.0679074693313194e-05, + "loss": 1.0757, + "step": 849500 + }, + { + "epoch": 24.96, + "grad_norm": 4.014658451080322, + "learning_rate": 5.0531982701981815e-05, + "loss": 1.0864, + "step": 850000 + }, + { + "epoch": 24.97, + "grad_norm": 1.8258497714996338, + "learning_rate": 5.0384890710650436e-05, + "loss": 1.0691, + "step": 850500 + }, + { + "epoch": 24.99, + "grad_norm": 2.100780487060547, + "learning_rate": 5.023779871931906e-05, + "loss": 1.0722, + "step": 851000 + }, + { + "epoch": 25.0, + "grad_norm": 2.250474452972412, + "learning_rate": 5.009070672798768e-05, + "loss": 1.0853, + "step": 851500 + }, + { + "epoch": 25.02, + "grad_norm": 2.6746740341186523, + "learning_rate": 4.99436147366563e-05, + "loss": 1.0277, + "step": 852000 + }, + { + "epoch": 25.03, + "grad_norm": 2.413872480392456, + "learning_rate": 4.979652274532492e-05, + "loss": 1.0804, + "step": 852500 + }, + { + "epoch": 25.04, + "grad_norm": 3.563396453857422, + "learning_rate": 4.964943075399354e-05, + "loss": 1.0638, + "step": 853000 + }, + { + "epoch": 25.06, + "grad_norm": 1.7918055057525635, + "learning_rate": 4.950233876266217e-05, + "loss": 1.0541, + "step": 853500 + }, + { + "epoch": 25.07, + "grad_norm": 2.508848190307617, + "learning_rate": 4.9355246771330784e-05, + "loss": 1.0083, + "step": 854000 + }, + { + "epoch": 25.09, + "grad_norm": 9.781774520874023, + "learning_rate": 4.920815477999941e-05, + "loss": 1.0236, + "step": 854500 + }, + { + "epoch": 25.1, + "grad_norm": 2.6363613605499268, + "learning_rate": 4.9061062788668026e-05, + "loss": 1.0356, + "step": 855000 + }, + { + "epoch": 25.12, + "grad_norm": 5.132301330566406, + "learning_rate": 4.8913970797336654e-05, + "loss": 1.0509, + "step": 855500 + }, + { + "epoch": 25.13, + "grad_norm": 2.983057737350464, + "learning_rate": 4.876687880600527e-05, + "loss": 1.0486, + "step": 856000 + }, + { + "epoch": 25.15, + "grad_norm": 3.175165891647339, + "learning_rate": 4.8619786814673897e-05, + "loss": 1.0521, + "step": 856500 + }, + { + "epoch": 25.16, + "grad_norm": 5.045122146606445, + "learning_rate": 4.847269482334251e-05, + "loss": 1.0494, + "step": 857000 + }, + { + "epoch": 25.18, + "grad_norm": 2.601104259490967, + "learning_rate": 4.832560283201114e-05, + "loss": 1.0301, + "step": 857500 + }, + { + "epoch": 25.19, + "grad_norm": 2.546464443206787, + "learning_rate": 4.817851084067975e-05, + "loss": 1.0237, + "step": 858000 + }, + { + "epoch": 25.21, + "grad_norm": 2.9643750190734863, + 
"learning_rate": 4.803141884934838e-05, + "loss": 1.0573, + "step": 858500 + }, + { + "epoch": 25.22, + "grad_norm": 5.178891658782959, + "learning_rate": 4.7884326858016995e-05, + "loss": 1.0588, + "step": 859000 + }, + { + "epoch": 25.24, + "grad_norm": 2.786492347717285, + "learning_rate": 4.773723486668562e-05, + "loss": 1.0537, + "step": 859500 + }, + { + "epoch": 25.25, + "grad_norm": 3.3391730785369873, + "learning_rate": 4.759014287535424e-05, + "loss": 1.0452, + "step": 860000 + }, + { + "epoch": 25.26, + "grad_norm": 2.8821425437927246, + "learning_rate": 4.7443050884022866e-05, + "loss": 1.0538, + "step": 860500 + }, + { + "epoch": 25.28, + "grad_norm": 8.122342109680176, + "learning_rate": 4.729595889269148e-05, + "loss": 1.0326, + "step": 861000 + }, + { + "epoch": 25.29, + "grad_norm": 5.042423725128174, + "learning_rate": 4.714886690136011e-05, + "loss": 1.0586, + "step": 861500 + }, + { + "epoch": 25.31, + "grad_norm": 3.068471670150757, + "learning_rate": 4.700177491002872e-05, + "loss": 1.032, + "step": 862000 + }, + { + "epoch": 25.32, + "grad_norm": 3.833307981491089, + "learning_rate": 4.685468291869735e-05, + "loss": 1.0581, + "step": 862500 + }, + { + "epoch": 25.34, + "grad_norm": 2.662344455718994, + "learning_rate": 4.6707590927365964e-05, + "loss": 1.0526, + "step": 863000 + }, + { + "epoch": 25.35, + "grad_norm": 3.626770257949829, + "learning_rate": 4.656049893603459e-05, + "loss": 1.0484, + "step": 863500 + }, + { + "epoch": 25.37, + "grad_norm": 2.1823232173919678, + "learning_rate": 4.6413406944703213e-05, + "loss": 1.0524, + "step": 864000 + }, + { + "epoch": 25.38, + "grad_norm": 2.051774740219116, + "learning_rate": 4.6266314953371835e-05, + "loss": 1.0406, + "step": 864500 + }, + { + "epoch": 25.4, + "grad_norm": 9.218499183654785, + "learning_rate": 4.6119222962040456e-05, + "loss": 1.0605, + "step": 865000 + }, + { + "epoch": 25.41, + "grad_norm": 2.4243879318237305, + "learning_rate": 4.597213097070908e-05, + "loss": 1.033, + "step": 865500 + }, + { + "epoch": 25.43, + "grad_norm": 5.0782599449157715, + "learning_rate": 4.58250389793777e-05, + "loss": 1.059, + "step": 866000 + }, + { + "epoch": 25.44, + "grad_norm": 2.4752907752990723, + "learning_rate": 4.567794698804632e-05, + "loss": 1.0348, + "step": 866500 + }, + { + "epoch": 25.46, + "grad_norm": 3.958376407623291, + "learning_rate": 4.553085499671494e-05, + "loss": 1.0677, + "step": 867000 + }, + { + "epoch": 25.47, + "grad_norm": 2.9827880859375, + "learning_rate": 4.538376300538356e-05, + "loss": 1.0371, + "step": 867500 + }, + { + "epoch": 25.48, + "grad_norm": 4.071894645690918, + "learning_rate": 4.523667101405218e-05, + "loss": 1.0578, + "step": 868000 + }, + { + "epoch": 25.5, + "grad_norm": 34.7899169921875, + "learning_rate": 4.5089579022720804e-05, + "loss": 1.0903, + "step": 868500 + }, + { + "epoch": 25.51, + "grad_norm": 2.133716344833374, + "learning_rate": 4.4942487031389425e-05, + "loss": 1.0471, + "step": 869000 + }, + { + "epoch": 25.53, + "grad_norm": 2.6652045249938965, + "learning_rate": 4.4795395040058046e-05, + "loss": 1.0358, + "step": 869500 + }, + { + "epoch": 25.54, + "grad_norm": 2.98919677734375, + "learning_rate": 4.464830304872667e-05, + "loss": 1.0621, + "step": 870000 + }, + { + "epoch": 25.56, + "grad_norm": 2.7026760578155518, + "learning_rate": 4.4501211057395295e-05, + "loss": 1.0621, + "step": 870500 + }, + { + "epoch": 25.57, + "grad_norm": 3.262974977493286, + "learning_rate": 4.435411906606391e-05, + "loss": 1.0321, + "step": 871000 + }, + { + "epoch": 
25.59, + "grad_norm": 1.9586899280548096, + "learning_rate": 4.420702707473254e-05, + "loss": 1.0347, + "step": 871500 + }, + { + "epoch": 25.6, + "grad_norm": 7.560853481292725, + "learning_rate": 4.405993508340115e-05, + "loss": 1.0221, + "step": 872000 + }, + { + "epoch": 25.62, + "grad_norm": 3.056516408920288, + "learning_rate": 4.391284309206978e-05, + "loss": 1.043, + "step": 872500 + }, + { + "epoch": 25.63, + "grad_norm": 2.7999415397644043, + "learning_rate": 4.3765751100738394e-05, + "loss": 1.0515, + "step": 873000 + }, + { + "epoch": 25.65, + "grad_norm": 2.5458314418792725, + "learning_rate": 4.361865910940702e-05, + "loss": 1.0599, + "step": 873500 + }, + { + "epoch": 25.66, + "grad_norm": 3.634589910507202, + "learning_rate": 4.3471567118075636e-05, + "loss": 1.0428, + "step": 874000 + }, + { + "epoch": 25.68, + "grad_norm": 2.8104031085968018, + "learning_rate": 4.3324475126744264e-05, + "loss": 1.0396, + "step": 874500 + }, + { + "epoch": 25.69, + "grad_norm": 2.319333791732788, + "learning_rate": 4.317738313541288e-05, + "loss": 1.0426, + "step": 875000 + }, + { + "epoch": 25.71, + "grad_norm": 8.262571334838867, + "learning_rate": 4.3030291144081506e-05, + "loss": 1.0616, + "step": 875500 + }, + { + "epoch": 25.72, + "grad_norm": 3.307812213897705, + "learning_rate": 4.288319915275012e-05, + "loss": 1.0706, + "step": 876000 + }, + { + "epoch": 25.73, + "grad_norm": 7.4953508377075195, + "learning_rate": 4.273610716141875e-05, + "loss": 1.0457, + "step": 876500 + }, + { + "epoch": 25.75, + "grad_norm": 3.5895800590515137, + "learning_rate": 4.258901517008736e-05, + "loss": 1.0219, + "step": 877000 + }, + { + "epoch": 25.76, + "grad_norm": 3.023827075958252, + "learning_rate": 4.244192317875599e-05, + "loss": 1.05, + "step": 877500 + }, + { + "epoch": 25.78, + "grad_norm": 2.1299614906311035, + "learning_rate": 4.2294831187424605e-05, + "loss": 1.0505, + "step": 878000 + }, + { + "epoch": 25.79, + "grad_norm": 3.042360305786133, + "learning_rate": 4.214773919609323e-05, + "loss": 1.0481, + "step": 878500 + }, + { + "epoch": 25.81, + "grad_norm": 2.3197755813598633, + "learning_rate": 4.200064720476185e-05, + "loss": 1.0181, + "step": 879000 + }, + { + "epoch": 25.82, + "grad_norm": 3.230426788330078, + "learning_rate": 4.1853555213430475e-05, + "loss": 1.0327, + "step": 879500 + }, + { + "epoch": 25.84, + "grad_norm": 7.685334205627441, + "learning_rate": 4.1706463222099096e-05, + "loss": 1.0334, + "step": 880000 + }, + { + "epoch": 25.85, + "grad_norm": 3.2607569694519043, + "learning_rate": 4.155937123076772e-05, + "loss": 1.0342, + "step": 880500 + }, + { + "epoch": 25.87, + "grad_norm": 3.470470428466797, + "learning_rate": 4.1412279239436345e-05, + "loss": 1.0508, + "step": 881000 + }, + { + "epoch": 25.88, + "grad_norm": 16.141782760620117, + "learning_rate": 4.126518724810496e-05, + "loss": 1.046, + "step": 881500 + }, + { + "epoch": 25.9, + "grad_norm": 2.900520086288452, + "learning_rate": 4.111809525677359e-05, + "loss": 1.0791, + "step": 882000 + }, + { + "epoch": 25.91, + "grad_norm": 2.5581626892089844, + "learning_rate": 4.09710032654422e-05, + "loss": 1.0386, + "step": 882500 + }, + { + "epoch": 25.93, + "grad_norm": 2.301318883895874, + "learning_rate": 4.082391127411083e-05, + "loss": 1.078, + "step": 883000 + }, + { + "epoch": 25.94, + "grad_norm": 2.5168445110321045, + "learning_rate": 4.0676819282779444e-05, + "loss": 1.0524, + "step": 883500 + }, + { + "epoch": 25.95, + "grad_norm": 3.2487659454345703, + "learning_rate": 4.052972729144807e-05, + 
"loss": 1.0476, + "step": 884000 + }, + { + "epoch": 25.97, + "grad_norm": 3.8711748123168945, + "learning_rate": 4.0382635300116686e-05, + "loss": 1.0349, + "step": 884500 + }, + { + "epoch": 25.98, + "grad_norm": 2.8647818565368652, + "learning_rate": 4.0235543308785314e-05, + "loss": 1.0681, + "step": 885000 + }, + { + "epoch": 26.0, + "grad_norm": 2.307555675506592, + "learning_rate": 4.008845131745393e-05, + "loss": 1.0299, + "step": 885500 + }, + { + "epoch": 26.01, + "grad_norm": 5.377994537353516, + "learning_rate": 3.9941359326122557e-05, + "loss": 1.0274, + "step": 886000 + }, + { + "epoch": 26.03, + "grad_norm": 3.1154091358184814, + "learning_rate": 3.979426733479118e-05, + "loss": 1.0313, + "step": 886500 + }, + { + "epoch": 26.04, + "grad_norm": 1.3295516967773438, + "learning_rate": 3.96471753434598e-05, + "loss": 0.9962, + "step": 887000 + }, + { + "epoch": 26.06, + "grad_norm": 2.4110894203186035, + "learning_rate": 3.950008335212842e-05, + "loss": 1.0051, + "step": 887500 + }, + { + "epoch": 26.07, + "grad_norm": 1.6697067022323608, + "learning_rate": 3.935299136079704e-05, + "loss": 1.0119, + "step": 888000 + }, + { + "epoch": 26.09, + "grad_norm": 2.4017999172210693, + "learning_rate": 3.920589936946566e-05, + "loss": 1.0253, + "step": 888500 + }, + { + "epoch": 26.1, + "grad_norm": 3.1681103706359863, + "learning_rate": 3.905880737813428e-05, + "loss": 1.0213, + "step": 889000 + }, + { + "epoch": 26.12, + "grad_norm": 2.9184956550598145, + "learning_rate": 3.8911715386802904e-05, + "loss": 1.0298, + "step": 889500 + }, + { + "epoch": 26.13, + "grad_norm": 1.6290541887283325, + "learning_rate": 3.8764623395471526e-05, + "loss": 1.0094, + "step": 890000 + }, + { + "epoch": 26.15, + "grad_norm": 2.779973030090332, + "learning_rate": 3.861753140414015e-05, + "loss": 1.016, + "step": 890500 + }, + { + "epoch": 26.16, + "grad_norm": 4.393610000610352, + "learning_rate": 3.847043941280877e-05, + "loss": 1.0131, + "step": 891000 + }, + { + "epoch": 26.17, + "grad_norm": 2.503702402114868, + "learning_rate": 3.832334742147739e-05, + "loss": 1.0207, + "step": 891500 + }, + { + "epoch": 26.19, + "grad_norm": 1.3769996166229248, + "learning_rate": 3.817625543014601e-05, + "loss": 1.0305, + "step": 892000 + }, + { + "epoch": 26.2, + "grad_norm": 9.221471786499023, + "learning_rate": 3.802916343881463e-05, + "loss": 1.0258, + "step": 892500 + }, + { + "epoch": 26.22, + "grad_norm": 1.706425666809082, + "learning_rate": 3.788207144748326e-05, + "loss": 0.9808, + "step": 893000 + }, + { + "epoch": 26.23, + "grad_norm": 15.878872871398926, + "learning_rate": 3.7734979456151873e-05, + "loss": 1.0225, + "step": 893500 + }, + { + "epoch": 26.25, + "grad_norm": 9.141250610351562, + "learning_rate": 3.75878874648205e-05, + "loss": 1.0234, + "step": 894000 + }, + { + "epoch": 26.26, + "grad_norm": 1.698013424873352, + "learning_rate": 3.7440795473489116e-05, + "loss": 1.0207, + "step": 894500 + }, + { + "epoch": 26.28, + "grad_norm": 2.8760321140289307, + "learning_rate": 3.729370348215774e-05, + "loss": 1.0054, + "step": 895000 + }, + { + "epoch": 26.29, + "grad_norm": 10.192665100097656, + "learning_rate": 3.714661149082636e-05, + "loss": 1.0202, + "step": 895500 + }, + { + "epoch": 26.31, + "grad_norm": 1.8373937606811523, + "learning_rate": 3.699951949949498e-05, + "loss": 1.0399, + "step": 896000 + }, + { + "epoch": 26.32, + "grad_norm": 0.8489872217178345, + "learning_rate": 3.68524275081636e-05, + "loss": 1.0525, + "step": 896500 + }, + { + "epoch": 26.34, + "grad_norm": 
1.7799832820892334, + "learning_rate": 3.670533551683222e-05, + "loss": 1.0271, + "step": 897000 + }, + { + "epoch": 26.35, + "grad_norm": 1.6424106359481812, + "learning_rate": 3.655824352550084e-05, + "loss": 1.0093, + "step": 897500 + }, + { + "epoch": 26.37, + "grad_norm": 4.4057135581970215, + "learning_rate": 3.6411151534169464e-05, + "loss": 1.0214, + "step": 898000 + }, + { + "epoch": 26.38, + "grad_norm": 1.971706509590149, + "learning_rate": 3.6264059542838085e-05, + "loss": 1.0316, + "step": 898500 + }, + { + "epoch": 26.39, + "grad_norm": 1.934280276298523, + "learning_rate": 3.6116967551506706e-05, + "loss": 1.0368, + "step": 899000 + }, + { + "epoch": 26.41, + "grad_norm": 1.6416791677474976, + "learning_rate": 3.596987556017533e-05, + "loss": 1.0123, + "step": 899500 + }, + { + "epoch": 26.42, + "grad_norm": 1.6787991523742676, + "learning_rate": 3.582278356884395e-05, + "loss": 1.0026, + "step": 900000 + }, + { + "epoch": 26.44, + "grad_norm": 2.6135306358337402, + "learning_rate": 3.567569157751257e-05, + "loss": 0.9887, + "step": 900500 + }, + { + "epoch": 26.45, + "grad_norm": 1.4282991886138916, + "learning_rate": 3.552859958618119e-05, + "loss": 1.0442, + "step": 901000 + }, + { + "epoch": 26.47, + "grad_norm": 1.8328664302825928, + "learning_rate": 3.538150759484981e-05, + "loss": 0.9983, + "step": 901500 + }, + { + "epoch": 26.48, + "grad_norm": 2.659299850463867, + "learning_rate": 3.523441560351843e-05, + "loss": 1.0397, + "step": 902000 + }, + { + "epoch": 26.5, + "grad_norm": 2.29178524017334, + "learning_rate": 3.508732361218706e-05, + "loss": 1.0182, + "step": 902500 + }, + { + "epoch": 26.51, + "grad_norm": 9.058496475219727, + "learning_rate": 3.494023162085568e-05, + "loss": 1.0113, + "step": 903000 + }, + { + "epoch": 26.53, + "grad_norm": 1.5530019998550415, + "learning_rate": 3.47931396295243e-05, + "loss": 1.0243, + "step": 903500 + }, + { + "epoch": 26.54, + "grad_norm": 1.9570651054382324, + "learning_rate": 3.4646047638192924e-05, + "loss": 1.0012, + "step": 904000 + }, + { + "epoch": 26.56, + "grad_norm": 1.154820203781128, + "learning_rate": 3.4498955646861545e-05, + "loss": 1.0231, + "step": 904500 + }, + { + "epoch": 26.57, + "grad_norm": 1.5875574350357056, + "learning_rate": 3.4351863655530166e-05, + "loss": 1.0239, + "step": 905000 + }, + { + "epoch": 26.59, + "grad_norm": 2.007080554962158, + "learning_rate": 3.420477166419879e-05, + "loss": 1.0119, + "step": 905500 + }, + { + "epoch": 26.6, + "grad_norm": 1.6997588872909546, + "learning_rate": 3.405767967286741e-05, + "loss": 1.0235, + "step": 906000 + }, + { + "epoch": 26.62, + "grad_norm": 2.8754279613494873, + "learning_rate": 3.391058768153603e-05, + "loss": 1.0152, + "step": 906500 + }, + { + "epoch": 26.63, + "grad_norm": 3.1198441982269287, + "learning_rate": 3.376349569020465e-05, + "loss": 1.0351, + "step": 907000 + }, + { + "epoch": 26.64, + "grad_norm": 11.747318267822266, + "learning_rate": 3.361640369887327e-05, + "loss": 1.0166, + "step": 907500 + }, + { + "epoch": 26.66, + "grad_norm": 1.7430450916290283, + "learning_rate": 3.346931170754189e-05, + "loss": 1.0268, + "step": 908000 + }, + { + "epoch": 26.67, + "grad_norm": 1.4505376815795898, + "learning_rate": 3.3322219716210514e-05, + "loss": 1.0297, + "step": 908500 + }, + { + "epoch": 26.69, + "grad_norm": 4.441274642944336, + "learning_rate": 3.3175127724879135e-05, + "loss": 1.0145, + "step": 909000 + }, + { + "epoch": 26.7, + "grad_norm": 4.974277973175049, + "learning_rate": 3.302803573354776e-05, + "loss": 1.0163, + 
"step": 909500 + }, + { + "epoch": 26.72, + "grad_norm": 2.1050703525543213, + "learning_rate": 3.2880943742216384e-05, + "loss": 1.0125, + "step": 910000 + }, + { + "epoch": 26.73, + "grad_norm": 4.370170593261719, + "learning_rate": 3.2733851750885005e-05, + "loss": 1.0134, + "step": 910500 + }, + { + "epoch": 26.75, + "grad_norm": 1.755359172821045, + "learning_rate": 3.2586759759553626e-05, + "loss": 1.0174, + "step": 911000 + }, + { + "epoch": 26.76, + "grad_norm": 2.099271774291992, + "learning_rate": 3.243966776822225e-05, + "loss": 1.0142, + "step": 911500 + }, + { + "epoch": 26.78, + "grad_norm": 1.560284972190857, + "learning_rate": 3.229257577689087e-05, + "loss": 1.0393, + "step": 912000 + }, + { + "epoch": 26.79, + "grad_norm": 2.1582424640655518, + "learning_rate": 3.214548378555949e-05, + "loss": 1.016, + "step": 912500 + }, + { + "epoch": 26.81, + "grad_norm": 2.342576026916504, + "learning_rate": 3.199839179422811e-05, + "loss": 1.0262, + "step": 913000 + }, + { + "epoch": 26.82, + "grad_norm": 1.4274934530258179, + "learning_rate": 3.185129980289673e-05, + "loss": 0.9991, + "step": 913500 + }, + { + "epoch": 26.84, + "grad_norm": 3.858720064163208, + "learning_rate": 3.170420781156535e-05, + "loss": 1.0171, + "step": 914000 + }, + { + "epoch": 26.85, + "grad_norm": 2.058018922805786, + "learning_rate": 3.1557115820233974e-05, + "loss": 1.0263, + "step": 914500 + }, + { + "epoch": 26.86, + "grad_norm": 2.6626269817352295, + "learning_rate": 3.1410023828902595e-05, + "loss": 1.0257, + "step": 915000 + }, + { + "epoch": 26.88, + "grad_norm": 5.859118938446045, + "learning_rate": 3.1262931837571216e-05, + "loss": 1.023, + "step": 915500 + }, + { + "epoch": 26.89, + "grad_norm": 6.2187957763671875, + "learning_rate": 3.111583984623984e-05, + "loss": 1.0107, + "step": 916000 + }, + { + "epoch": 26.91, + "grad_norm": 2.6373274326324463, + "learning_rate": 3.096874785490846e-05, + "loss": 1.0172, + "step": 916500 + }, + { + "epoch": 26.92, + "grad_norm": 3.272045850753784, + "learning_rate": 3.082165586357708e-05, + "loss": 1.0175, + "step": 917000 + }, + { + "epoch": 26.94, + "grad_norm": 1.6807574033737183, + "learning_rate": 3.06745638722457e-05, + "loss": 1.0295, + "step": 917500 + }, + { + "epoch": 26.95, + "grad_norm": 4.664640426635742, + "learning_rate": 3.052747188091432e-05, + "loss": 1.0368, + "step": 918000 + }, + { + "epoch": 26.97, + "grad_norm": 1.5610125064849854, + "learning_rate": 3.0380379889582943e-05, + "loss": 1.0365, + "step": 918500 + }, + { + "epoch": 26.98, + "grad_norm": 2.0391860008239746, + "learning_rate": 3.0233287898251564e-05, + "loss": 1.0289, + "step": 919000 + }, + { + "epoch": 27.0, + "grad_norm": 47.55400848388672, + "learning_rate": 3.0086195906920185e-05, + "loss": 1.0292, + "step": 919500 + }, + { + "epoch": 27.01, + "grad_norm": 2.6204073429107666, + "learning_rate": 2.9939103915588807e-05, + "loss": 1.0069, + "step": 920000 + }, + { + "epoch": 27.03, + "grad_norm": 9.747673034667969, + "learning_rate": 2.9792011924257428e-05, + "loss": 0.9997, + "step": 920500 + }, + { + "epoch": 27.04, + "grad_norm": 2.057465076446533, + "learning_rate": 2.9644919932926052e-05, + "loss": 0.9993, + "step": 921000 + }, + { + "epoch": 27.06, + "grad_norm": 8.264533996582031, + "learning_rate": 2.9497827941594673e-05, + "loss": 0.9862, + "step": 921500 + }, + { + "epoch": 27.07, + "grad_norm": 4.143492698669434, + "learning_rate": 2.9350735950263294e-05, + "loss": 0.9774, + "step": 922000 + }, + { + "epoch": 27.08, + "grad_norm": 2.2518653869628906, + 
"learning_rate": 2.9203643958931916e-05, + "loss": 1.0113, + "step": 922500 + }, + { + "epoch": 27.1, + "grad_norm": 2.745732307434082, + "learning_rate": 2.9056551967600537e-05, + "loss": 0.9734, + "step": 923000 + }, + { + "epoch": 27.11, + "grad_norm": 1.3990339040756226, + "learning_rate": 2.8909459976269158e-05, + "loss": 0.9918, + "step": 923500 + }, + { + "epoch": 27.13, + "grad_norm": 2.024524211883545, + "learning_rate": 2.876236798493778e-05, + "loss": 1.0033, + "step": 924000 + }, + { + "epoch": 27.14, + "grad_norm": 2.5025253295898438, + "learning_rate": 2.86152759936064e-05, + "loss": 1.0004, + "step": 924500 + }, + { + "epoch": 27.16, + "grad_norm": 2.7478108406066895, + "learning_rate": 2.846818400227502e-05, + "loss": 0.9901, + "step": 925000 + }, + { + "epoch": 27.17, + "grad_norm": 1.7393391132354736, + "learning_rate": 2.8321092010943642e-05, + "loss": 1.0035, + "step": 925500 + }, + { + "epoch": 27.19, + "grad_norm": 1.3920613527297974, + "learning_rate": 2.8174000019612263e-05, + "loss": 1.0194, + "step": 926000 + }, + { + "epoch": 27.2, + "grad_norm": 2.2148451805114746, + "learning_rate": 2.8026908028280885e-05, + "loss": 0.9858, + "step": 926500 + }, + { + "epoch": 27.22, + "grad_norm": 1.7391964197158813, + "learning_rate": 2.7879816036949506e-05, + "loss": 1.0069, + "step": 927000 + }, + { + "epoch": 27.23, + "grad_norm": 2.359649419784546, + "learning_rate": 2.7732724045618127e-05, + "loss": 0.9989, + "step": 927500 + }, + { + "epoch": 27.25, + "grad_norm": 2.8471827507019043, + "learning_rate": 2.7585632054286748e-05, + "loss": 0.9832, + "step": 928000 + }, + { + "epoch": 27.26, + "grad_norm": 1.788493037223816, + "learning_rate": 2.743854006295537e-05, + "loss": 1.0104, + "step": 928500 + }, + { + "epoch": 27.28, + "grad_norm": 1.7663925886154175, + "learning_rate": 2.7291448071623994e-05, + "loss": 1.0031, + "step": 929000 + }, + { + "epoch": 27.29, + "grad_norm": 2.699171304702759, + "learning_rate": 2.7144356080292615e-05, + "loss": 1.0037, + "step": 929500 + }, + { + "epoch": 27.31, + "grad_norm": 12.16844367980957, + "learning_rate": 2.6997264088961236e-05, + "loss": 1.0109, + "step": 930000 + }, + { + "epoch": 27.32, + "grad_norm": 2.0973167419433594, + "learning_rate": 2.6850172097629857e-05, + "loss": 1.0123, + "step": 930500 + }, + { + "epoch": 27.33, + "grad_norm": 1.7330328226089478, + "learning_rate": 2.6703080106298478e-05, + "loss": 1.0055, + "step": 931000 + }, + { + "epoch": 27.35, + "grad_norm": 5.502919673919678, + "learning_rate": 2.65559881149671e-05, + "loss": 1.003, + "step": 931500 + }, + { + "epoch": 27.36, + "grad_norm": 2.3308217525482178, + "learning_rate": 2.640889612363572e-05, + "loss": 1.0059, + "step": 932000 + }, + { + "epoch": 27.38, + "grad_norm": 1.6248698234558105, + "learning_rate": 2.626180413230434e-05, + "loss": 0.9849, + "step": 932500 + }, + { + "epoch": 27.39, + "grad_norm": 3.2510266304016113, + "learning_rate": 2.6114712140972963e-05, + "loss": 0.9862, + "step": 933000 + }, + { + "epoch": 27.41, + "grad_norm": 2.0555245876312256, + "learning_rate": 2.5967620149641584e-05, + "loss": 0.9844, + "step": 933500 + }, + { + "epoch": 27.42, + "grad_norm": 7.706765651702881, + "learning_rate": 2.5820528158310205e-05, + "loss": 0.9913, + "step": 934000 + }, + { + "epoch": 27.44, + "grad_norm": 1.9011601209640503, + "learning_rate": 2.5673436166978826e-05, + "loss": 1.0102, + "step": 934500 + }, + { + "epoch": 27.45, + "grad_norm": 6.408154010772705, + "learning_rate": 2.5526344175647447e-05, + "loss": 0.996, + "step": 935000 
+ }, + { + "epoch": 27.47, + "grad_norm": 6.746776580810547, + "learning_rate": 2.5379252184316068e-05, + "loss": 1.0273, + "step": 935500 + }, + { + "epoch": 27.48, + "grad_norm": 33.82996368408203, + "learning_rate": 2.523216019298469e-05, + "loss": 1.007, + "step": 936000 + }, + { + "epoch": 27.5, + "grad_norm": 1.6131765842437744, + "learning_rate": 2.508506820165331e-05, + "loss": 0.9855, + "step": 936500 + }, + { + "epoch": 27.51, + "grad_norm": 2.5696966648101807, + "learning_rate": 2.493797621032193e-05, + "loss": 1.0153, + "step": 937000 + }, + { + "epoch": 27.53, + "grad_norm": 1.5355910062789917, + "learning_rate": 2.4790884218990556e-05, + "loss": 0.9714, + "step": 937500 + }, + { + "epoch": 27.54, + "grad_norm": 2.0435054302215576, + "learning_rate": 2.4643792227659177e-05, + "loss": 0.9997, + "step": 938000 + }, + { + "epoch": 27.55, + "grad_norm": 1.6511478424072266, + "learning_rate": 2.44967002363278e-05, + "loss": 0.9865, + "step": 938500 + }, + { + "epoch": 27.57, + "grad_norm": 1.9227514266967773, + "learning_rate": 2.434960824499642e-05, + "loss": 1.0103, + "step": 939000 + }, + { + "epoch": 27.58, + "grad_norm": 16.868221282958984, + "learning_rate": 2.420251625366504e-05, + "loss": 0.9854, + "step": 939500 + }, + { + "epoch": 27.6, + "grad_norm": 1.7769354581832886, + "learning_rate": 2.4055424262333662e-05, + "loss": 1.0176, + "step": 940000 + }, + { + "epoch": 27.61, + "grad_norm": 1.6719156503677368, + "learning_rate": 2.3908332271002283e-05, + "loss": 0.9951, + "step": 940500 + }, + { + "epoch": 27.63, + "grad_norm": 3.5106630325317383, + "learning_rate": 2.3761240279670904e-05, + "loss": 0.994, + "step": 941000 + }, + { + "epoch": 27.64, + "grad_norm": 3.102487564086914, + "learning_rate": 2.3614148288339525e-05, + "loss": 0.9802, + "step": 941500 + }, + { + "epoch": 27.66, + "grad_norm": 12.020054817199707, + "learning_rate": 2.3467056297008146e-05, + "loss": 0.9997, + "step": 942000 + }, + { + "epoch": 27.67, + "grad_norm": 2.8283021450042725, + "learning_rate": 2.3319964305676767e-05, + "loss": 0.9892, + "step": 942500 + }, + { + "epoch": 27.69, + "grad_norm": 37.08485412597656, + "learning_rate": 2.317287231434539e-05, + "loss": 0.9969, + "step": 943000 + }, + { + "epoch": 27.7, + "grad_norm": 1.4365429878234863, + "learning_rate": 2.302578032301401e-05, + "loss": 0.9748, + "step": 943500 + }, + { + "epoch": 27.72, + "grad_norm": 1.8424559831619263, + "learning_rate": 2.287868833168263e-05, + "loss": 0.9741, + "step": 944000 + }, + { + "epoch": 27.73, + "grad_norm": 2.4849884510040283, + "learning_rate": 2.2731596340351252e-05, + "loss": 0.9896, + "step": 944500 + }, + { + "epoch": 27.75, + "grad_norm": 2.0955851078033447, + "learning_rate": 2.2584504349019873e-05, + "loss": 1.012, + "step": 945000 + }, + { + "epoch": 27.76, + "grad_norm": 5.421468257904053, + "learning_rate": 2.2437412357688498e-05, + "loss": 0.9681, + "step": 945500 + }, + { + "epoch": 27.77, + "grad_norm": 3.2725253105163574, + "learning_rate": 2.229032036635712e-05, + "loss": 1.0059, + "step": 946000 + }, + { + "epoch": 27.79, + "grad_norm": 2.741119861602783, + "learning_rate": 2.214322837502574e-05, + "loss": 0.9933, + "step": 946500 + }, + { + "epoch": 27.8, + "grad_norm": 1.600335717201233, + "learning_rate": 2.199613638369436e-05, + "loss": 0.997, + "step": 947000 + }, + { + "epoch": 27.82, + "grad_norm": 3.4619009494781494, + "learning_rate": 2.1849044392362982e-05, + "loss": 0.9976, + "step": 947500 + }, + { + "epoch": 27.83, + "grad_norm": 10.748187065124512, + "learning_rate": 
2.1701952401031603e-05, + "loss": 0.9727, + "step": 948000 + }, + { + "epoch": 27.85, + "grad_norm": 3.452791690826416, + "learning_rate": 2.1554860409700224e-05, + "loss": 1.001, + "step": 948500 + }, + { + "epoch": 27.86, + "grad_norm": 1.3464235067367554, + "learning_rate": 2.1407768418368845e-05, + "loss": 0.9876, + "step": 949000 + }, + { + "epoch": 27.88, + "grad_norm": 1.6416382789611816, + "learning_rate": 2.1260676427037467e-05, + "loss": 1.0062, + "step": 949500 + }, + { + "epoch": 27.89, + "grad_norm": 1.434910774230957, + "learning_rate": 2.1113584435706088e-05, + "loss": 0.9934, + "step": 950000 + }, + { + "epoch": 27.91, + "grad_norm": 5.8801045417785645, + "learning_rate": 2.096649244437471e-05, + "loss": 0.9899, + "step": 950500 + }, + { + "epoch": 27.92, + "grad_norm": 1.864310622215271, + "learning_rate": 2.081940045304333e-05, + "loss": 1.013, + "step": 951000 + }, + { + "epoch": 27.94, + "grad_norm": 2.070906639099121, + "learning_rate": 2.067230846171195e-05, + "loss": 0.9947, + "step": 951500 + }, + { + "epoch": 27.95, + "grad_norm": 2.5215396881103516, + "learning_rate": 2.0525216470380572e-05, + "loss": 0.984, + "step": 952000 + }, + { + "epoch": 27.97, + "grad_norm": 1.2144149541854858, + "learning_rate": 2.0378124479049193e-05, + "loss": 0.9993, + "step": 952500 + }, + { + "epoch": 27.98, + "grad_norm": 2.4629998207092285, + "learning_rate": 2.0231032487717814e-05, + "loss": 1.0146, + "step": 953000 + }, + { + "epoch": 28.0, + "grad_norm": 4.50878381729126, + "learning_rate": 2.008394049638644e-05, + "loss": 1.0005, + "step": 953500 + }, + { + "epoch": 28.01, + "grad_norm": 2.623155355453491, + "learning_rate": 1.993684850505506e-05, + "loss": 0.9928, + "step": 954000 + }, + { + "epoch": 28.02, + "grad_norm": 2.886497735977173, + "learning_rate": 1.978975651372368e-05, + "loss": 0.9886, + "step": 954500 + }, + { + "epoch": 28.04, + "grad_norm": 2.8885116577148438, + "learning_rate": 1.9642664522392302e-05, + "loss": 0.9788, + "step": 955000 + }, + { + "epoch": 28.05, + "grad_norm": 1.946003794670105, + "learning_rate": 1.9495572531060923e-05, + "loss": 0.9647, + "step": 955500 + }, + { + "epoch": 28.07, + "grad_norm": 12.426846504211426, + "learning_rate": 1.9348480539729545e-05, + "loss": 0.9753, + "step": 956000 + }, + { + "epoch": 28.08, + "grad_norm": 1.905735969543457, + "learning_rate": 1.9201388548398166e-05, + "loss": 0.9855, + "step": 956500 + }, + { + "epoch": 28.1, + "grad_norm": 2.33889102935791, + "learning_rate": 1.9054296557066787e-05, + "loss": 0.9744, + "step": 957000 + }, + { + "epoch": 28.11, + "grad_norm": 2.3063101768493652, + "learning_rate": 1.8907204565735408e-05, + "loss": 0.9702, + "step": 957500 + }, + { + "epoch": 28.13, + "grad_norm": 2.292107582092285, + "learning_rate": 1.876011257440403e-05, + "loss": 0.9888, + "step": 958000 + }, + { + "epoch": 28.14, + "grad_norm": 2.144972324371338, + "learning_rate": 1.8613020583072654e-05, + "loss": 0.9754, + "step": 958500 + }, + { + "epoch": 28.16, + "grad_norm": 1.5302867889404297, + "learning_rate": 1.8465928591741275e-05, + "loss": 0.9816, + "step": 959000 + }, + { + "epoch": 28.17, + "grad_norm": 1.6462048292160034, + "learning_rate": 1.8318836600409896e-05, + "loss": 0.9403, + "step": 959500 + }, + { + "epoch": 28.19, + "grad_norm": 2.6783134937286377, + "learning_rate": 1.8171744609078517e-05, + "loss": 0.9762, + "step": 960000 + }, + { + "epoch": 28.2, + "grad_norm": 2.0703890323638916, + "learning_rate": 1.8024652617747138e-05, + "loss": 0.9756, + "step": 960500 + }, + { + "epoch": 
28.22, + "grad_norm": 2.4419829845428467, + "learning_rate": 1.787756062641576e-05, + "loss": 0.9684, + "step": 961000 + }, + { + "epoch": 28.23, + "grad_norm": 4.775966644287109, + "learning_rate": 1.773046863508438e-05, + "loss": 0.9912, + "step": 961500 + }, + { + "epoch": 28.24, + "grad_norm": 1.0155693292617798, + "learning_rate": 1.7583376643753e-05, + "loss": 0.97, + "step": 962000 + }, + { + "epoch": 28.26, + "grad_norm": 2.105178117752075, + "learning_rate": 1.7436284652421623e-05, + "loss": 0.9699, + "step": 962500 + }, + { + "epoch": 28.27, + "grad_norm": 11.809367179870605, + "learning_rate": 1.7289192661090244e-05, + "loss": 0.9869, + "step": 963000 + }, + { + "epoch": 28.29, + "grad_norm": 2.5389130115509033, + "learning_rate": 1.7142100669758865e-05, + "loss": 0.9807, + "step": 963500 + }, + { + "epoch": 28.3, + "grad_norm": 2.3344273567199707, + "learning_rate": 1.6995008678427486e-05, + "loss": 0.9861, + "step": 964000 + }, + { + "epoch": 28.32, + "grad_norm": 2.3741703033447266, + "learning_rate": 1.6847916687096107e-05, + "loss": 0.9592, + "step": 964500 + }, + { + "epoch": 28.33, + "grad_norm": 5.04816198348999, + "learning_rate": 1.670082469576473e-05, + "loss": 0.9903, + "step": 965000 + }, + { + "epoch": 28.35, + "grad_norm": 1.2963931560516357, + "learning_rate": 1.6553732704433353e-05, + "loss": 0.9676, + "step": 965500 + }, + { + "epoch": 28.36, + "grad_norm": 3.1595191955566406, + "learning_rate": 1.6406640713101974e-05, + "loss": 0.9942, + "step": 966000 + }, + { + "epoch": 28.38, + "grad_norm": 1.5514929294586182, + "learning_rate": 1.6259548721770595e-05, + "loss": 0.9741, + "step": 966500 + }, + { + "epoch": 28.39, + "grad_norm": 1.9268239736557007, + "learning_rate": 1.6112456730439216e-05, + "loss": 0.9869, + "step": 967000 + }, + { + "epoch": 28.41, + "grad_norm": 2.7622382640838623, + "learning_rate": 1.5965364739107837e-05, + "loss": 0.9598, + "step": 967500 + }, + { + "epoch": 28.42, + "grad_norm": 1.418778657913208, + "learning_rate": 1.581827274777646e-05, + "loss": 0.9464, + "step": 968000 + }, + { + "epoch": 28.44, + "grad_norm": 2.1014456748962402, + "learning_rate": 1.567118075644508e-05, + "loss": 0.9846, + "step": 968500 + }, + { + "epoch": 28.45, + "grad_norm": 14.58630657196045, + "learning_rate": 1.55240887651137e-05, + "loss": 0.9695, + "step": 969000 + }, + { + "epoch": 28.46, + "grad_norm": 1.556394100189209, + "learning_rate": 1.5376996773782322e-05, + "loss": 0.9916, + "step": 969500 + }, + { + "epoch": 28.48, + "grad_norm": 1.8601114749908447, + "learning_rate": 1.5229904782450943e-05, + "loss": 0.9552, + "step": 970000 + }, + { + "epoch": 28.49, + "grad_norm": 2.2888152599334717, + "learning_rate": 1.5082812791119564e-05, + "loss": 0.972, + "step": 970500 + }, + { + "epoch": 28.51, + "grad_norm": 2.9334394931793213, + "learning_rate": 1.4935720799788187e-05, + "loss": 0.9986, + "step": 971000 + }, + { + "epoch": 28.52, + "grad_norm": 2.88075590133667, + "learning_rate": 1.4788628808456808e-05, + "loss": 0.9935, + "step": 971500 + }, + { + "epoch": 28.54, + "grad_norm": 4.609189510345459, + "learning_rate": 1.4641536817125429e-05, + "loss": 0.9715, + "step": 972000 + }, + { + "epoch": 28.55, + "grad_norm": 1.6899704933166504, + "learning_rate": 1.449444482579405e-05, + "loss": 0.9737, + "step": 972500 + }, + { + "epoch": 28.57, + "grad_norm": 18.26349449157715, + "learning_rate": 1.4347352834462671e-05, + "loss": 0.9728, + "step": 973000 + }, + { + "epoch": 28.58, + "grad_norm": 5.080548286437988, + "learning_rate": 
1.4200260843131292e-05, + "loss": 0.9819, + "step": 973500 + }, + { + "epoch": 28.6, + "grad_norm": 3.9338316917419434, + "learning_rate": 1.4053168851799914e-05, + "loss": 0.9758, + "step": 974000 + }, + { + "epoch": 28.61, + "grad_norm": 1.4379346370697021, + "learning_rate": 1.3906076860468535e-05, + "loss": 0.9634, + "step": 974500 + }, + { + "epoch": 28.63, + "grad_norm": 2.943136215209961, + "learning_rate": 1.3758984869137157e-05, + "loss": 0.9519, + "step": 975000 + }, + { + "epoch": 28.64, + "grad_norm": 3.1957294940948486, + "learning_rate": 1.3611892877805779e-05, + "loss": 0.9727, + "step": 975500 + }, + { + "epoch": 28.66, + "grad_norm": 2.38423752784729, + "learning_rate": 1.34648008864744e-05, + "loss": 0.9787, + "step": 976000 + }, + { + "epoch": 28.67, + "grad_norm": 2.059079885482788, + "learning_rate": 1.331770889514302e-05, + "loss": 0.9851, + "step": 976500 + }, + { + "epoch": 28.69, + "grad_norm": 1.8208869695663452, + "learning_rate": 1.3170616903811642e-05, + "loss": 0.9665, + "step": 977000 + }, + { + "epoch": 28.7, + "grad_norm": 2.108597755432129, + "learning_rate": 1.3023524912480263e-05, + "loss": 0.9941, + "step": 977500 + }, + { + "epoch": 28.71, + "grad_norm": 1.5259730815887451, + "learning_rate": 1.2876432921148884e-05, + "loss": 0.98, + "step": 978000 + }, + { + "epoch": 28.73, + "grad_norm": 3.095134735107422, + "learning_rate": 1.2729340929817505e-05, + "loss": 0.9962, + "step": 978500 + }, + { + "epoch": 28.74, + "grad_norm": 1.49867582321167, + "learning_rate": 1.2582248938486128e-05, + "loss": 0.9714, + "step": 979000 + }, + { + "epoch": 28.76, + "grad_norm": 1.5933908224105835, + "learning_rate": 1.243515694715475e-05, + "loss": 0.999, + "step": 979500 + }, + { + "epoch": 28.77, + "grad_norm": 2.663355588912964, + "learning_rate": 1.228806495582337e-05, + "loss": 0.9682, + "step": 980000 + }, + { + "epoch": 28.79, + "grad_norm": 104.24024200439453, + "learning_rate": 1.2140972964491992e-05, + "loss": 0.9774, + "step": 980500 + }, + { + "epoch": 28.8, + "grad_norm": 1.7283943891525269, + "learning_rate": 1.1993880973160613e-05, + "loss": 0.9607, + "step": 981000 + }, + { + "epoch": 28.82, + "grad_norm": 2.0852441787719727, + "learning_rate": 1.1846788981829234e-05, + "loss": 1.0005, + "step": 981500 + }, + { + "epoch": 28.83, + "grad_norm": 2.0442757606506348, + "learning_rate": 1.1699696990497855e-05, + "loss": 0.9836, + "step": 982000 + }, + { + "epoch": 28.85, + "grad_norm": 1.5879892110824585, + "learning_rate": 1.1552604999166476e-05, + "loss": 0.9973, + "step": 982500 + }, + { + "epoch": 28.86, + "grad_norm": 2.7892065048217773, + "learning_rate": 1.14055130078351e-05, + "loss": 0.9615, + "step": 983000 + }, + { + "epoch": 28.88, + "grad_norm": 3.826296329498291, + "learning_rate": 1.1258421016503722e-05, + "loss": 0.9746, + "step": 983500 + }, + { + "epoch": 28.89, + "grad_norm": 7.535625457763672, + "learning_rate": 1.1111329025172343e-05, + "loss": 0.9685, + "step": 984000 + }, + { + "epoch": 28.91, + "grad_norm": 1.7329784631729126, + "learning_rate": 1.0964237033840964e-05, + "loss": 0.9745, + "step": 984500 + }, + { + "epoch": 28.92, + "grad_norm": 1.9252818822860718, + "learning_rate": 1.0817145042509585e-05, + "loss": 0.9693, + "step": 985000 + }, + { + "epoch": 28.93, + "grad_norm": 2.0630569458007812, + "learning_rate": 1.0670053051178206e-05, + "loss": 0.9634, + "step": 985500 + }, + { + "epoch": 28.95, + "grad_norm": 1.6531202793121338, + "learning_rate": 1.0522961059846827e-05, + "loss": 0.9702, + "step": 986000 + }, + { + "epoch": 
28.96, + "grad_norm": 32.64999771118164, + "learning_rate": 1.037586906851545e-05, + "loss": 0.9593, + "step": 986500 + }, + { + "epoch": 28.98, + "grad_norm": 5.578742980957031, + "learning_rate": 1.0228777077184071e-05, + "loss": 0.9624, + "step": 987000 + }, + { + "epoch": 28.99, + "grad_norm": 1.9109100103378296, + "learning_rate": 1.0081685085852692e-05, + "loss": 0.9698, + "step": 987500 + }, + { + "epoch": 29.01, + "grad_norm": 3.434389352798462, + "learning_rate": 9.934593094521313e-06, + "loss": 0.9824, + "step": 988000 + }, + { + "epoch": 29.02, + "grad_norm": 3.804184913635254, + "learning_rate": 9.787501103189935e-06, + "loss": 0.9534, + "step": 988500 + }, + { + "epoch": 29.04, + "grad_norm": 2.4718267917633057, + "learning_rate": 9.640409111858556e-06, + "loss": 0.9533, + "step": 989000 + }, + { + "epoch": 29.05, + "grad_norm": 2.81648588180542, + "learning_rate": 9.493317120527177e-06, + "loss": 0.9688, + "step": 989500 + }, + { + "epoch": 29.07, + "grad_norm": 3.936281681060791, + "learning_rate": 9.346225129195798e-06, + "loss": 0.9486, + "step": 990000 + }, + { + "epoch": 29.08, + "grad_norm": 4.693751335144043, + "learning_rate": 9.19913313786442e-06, + "loss": 0.9505, + "step": 990500 + }, + { + "epoch": 29.1, + "grad_norm": 3.5278286933898926, + "learning_rate": 9.052041146533042e-06, + "loss": 0.9338, + "step": 991000 + }, + { + "epoch": 29.11, + "grad_norm": 2.9467105865478516, + "learning_rate": 8.904949155201663e-06, + "loss": 0.9602, + "step": 991500 + }, + { + "epoch": 29.13, + "grad_norm": 2.929056406021118, + "learning_rate": 8.757857163870284e-06, + "loss": 0.9469, + "step": 992000 + }, + { + "epoch": 29.14, + "grad_norm": 4.979098320007324, + "learning_rate": 8.610765172538905e-06, + "loss": 0.9364, + "step": 992500 + }, + { + "epoch": 29.15, + "grad_norm": 3.7940807342529297, + "learning_rate": 8.463673181207526e-06, + "loss": 0.9407, + "step": 993000 + }, + { + "epoch": 29.17, + "grad_norm": 2.5154898166656494, + "learning_rate": 8.316581189876148e-06, + "loss": 0.9504, + "step": 993500 + }, + { + "epoch": 29.18, + "grad_norm": 3.7596309185028076, + "learning_rate": 8.169489198544769e-06, + "loss": 0.9762, + "step": 994000 + }, + { + "epoch": 29.2, + "grad_norm": 2.13797926902771, + "learning_rate": 8.022397207213392e-06, + "loss": 0.9605, + "step": 994500 + }, + { + "epoch": 29.21, + "grad_norm": 3.1647770404815674, + "learning_rate": 7.875305215882013e-06, + "loss": 0.9611, + "step": 995000 + }, + { + "epoch": 29.23, + "grad_norm": 3.417736768722534, + "learning_rate": 7.728213224550634e-06, + "loss": 0.9611, + "step": 995500 + }, + { + "epoch": 29.24, + "grad_norm": 4.05425500869751, + "learning_rate": 7.581121233219255e-06, + "loss": 0.9733, + "step": 996000 + }, + { + "epoch": 29.26, + "grad_norm": 2.117457866668701, + "learning_rate": 7.434029241887876e-06, + "loss": 0.9494, + "step": 996500 + }, + { + "epoch": 29.27, + "grad_norm": 9.484780311584473, + "learning_rate": 7.286937250556497e-06, + "loss": 0.9842, + "step": 997000 + }, + { + "epoch": 29.29, + "grad_norm": 5.586018085479736, + "learning_rate": 7.139845259225119e-06, + "loss": 0.955, + "step": 997500 + }, + { + "epoch": 29.3, + "grad_norm": 6.439801216125488, + "learning_rate": 6.99275326789374e-06, + "loss": 0.9608, + "step": 998000 + }, + { + "epoch": 29.32, + "grad_norm": 2.890882730484009, + "learning_rate": 6.845661276562361e-06, + "loss": 0.9649, + "step": 998500 + }, + { + "epoch": 29.33, + "grad_norm": 2.0991222858428955, + "learning_rate": 6.6985692852309825e-06, + "loss": 0.9476, 
+ "step": 999000 + }, + { + "epoch": 29.35, + "grad_norm": 2.40987229347229, + "learning_rate": 6.5514772938996045e-06, + "loss": 0.9537, + "step": 999500 + }, + { + "epoch": 29.36, + "grad_norm": 2.95318865776062, + "learning_rate": 6.4043853025682256e-06, + "loss": 0.9749, + "step": 1000000 + }, + { + "epoch": 29.38, + "grad_norm": 5.403099060058594, + "learning_rate": 6.257293311236847e-06, + "loss": 0.9496, + "step": 1000500 + }, + { + "epoch": 29.39, + "grad_norm": 3.3474481105804443, + "learning_rate": 6.110201319905468e-06, + "loss": 0.9756, + "step": 1001000 + }, + { + "epoch": 29.4, + "grad_norm": 2.7617027759552, + "learning_rate": 5.96310932857409e-06, + "loss": 0.944, + "step": 1001500 + }, + { + "epoch": 29.42, + "grad_norm": 3.7849531173706055, + "learning_rate": 5.816017337242711e-06, + "loss": 0.9616, + "step": 1002000 + }, + { + "epoch": 29.43, + "grad_norm": 2.1899170875549316, + "learning_rate": 5.668925345911332e-06, + "loss": 0.9578, + "step": 1002500 + }, + { + "epoch": 29.45, + "grad_norm": 2.423264265060425, + "learning_rate": 5.521833354579953e-06, + "loss": 0.9886, + "step": 1003000 + }, + { + "epoch": 29.46, + "grad_norm": 10.377019882202148, + "learning_rate": 5.374741363248575e-06, + "loss": 0.9559, + "step": 1003500 + }, + { + "epoch": 29.48, + "grad_norm": 3.244654655456543, + "learning_rate": 5.227649371917196e-06, + "loss": 0.9467, + "step": 1004000 + }, + { + "epoch": 29.49, + "grad_norm": 3.0262067317962646, + "learning_rate": 5.080557380585817e-06, + "loss": 0.9825, + "step": 1004500 + }, + { + "epoch": 29.51, + "grad_norm": 1.8769322633743286, + "learning_rate": 4.933465389254439e-06, + "loss": 0.9835, + "step": 1005000 + }, + { + "epoch": 29.52, + "grad_norm": 3.122030735015869, + "learning_rate": 4.786373397923061e-06, + "loss": 0.9686, + "step": 1005500 + }, + { + "epoch": 29.54, + "grad_norm": 3.3955576419830322, + "learning_rate": 4.639281406591682e-06, + "loss": 0.9549, + "step": 1006000 + }, + { + "epoch": 29.55, + "grad_norm": 11.812577247619629, + "learning_rate": 4.492189415260304e-06, + "loss": 0.944, + "step": 1006500 + }, + { + "epoch": 29.57, + "grad_norm": 3.2719273567199707, + "learning_rate": 4.345097423928925e-06, + "loss": 0.9675, + "step": 1007000 + }, + { + "epoch": 29.58, + "grad_norm": 2.0601563453674316, + "learning_rate": 4.198005432597546e-06, + "loss": 0.9816, + "step": 1007500 + }, + { + "epoch": 29.6, + "grad_norm": 2.1465187072753906, + "learning_rate": 4.050913441266167e-06, + "loss": 0.9386, + "step": 1008000 + }, + { + "epoch": 29.61, + "grad_norm": 3.070183753967285, + "learning_rate": 3.903821449934789e-06, + "loss": 0.9729, + "step": 1008500 + }, + { + "epoch": 29.62, + "grad_norm": 8.74717903137207, + "learning_rate": 3.7567294586034105e-06, + "loss": 0.9483, + "step": 1009000 + }, + { + "epoch": 29.64, + "grad_norm": 4.731165409088135, + "learning_rate": 3.6096374672720316e-06, + "loss": 0.9699, + "step": 1009500 + }, + { + "epoch": 29.65, + "grad_norm": 3.4259703159332275, + "learning_rate": 3.462545475940653e-06, + "loss": 0.9768, + "step": 1010000 + }, + { + "epoch": 29.67, + "grad_norm": 2.1280946731567383, + "learning_rate": 3.3154534846092743e-06, + "loss": 0.9732, + "step": 1010500 + }, + { + "epoch": 29.68, + "grad_norm": 2.901146411895752, + "learning_rate": 3.168361493277896e-06, + "loss": 0.968, + "step": 1011000 + }, + { + "epoch": 29.7, + "grad_norm": 3.9223461151123047, + "learning_rate": 3.021269501946517e-06, + "loss": 0.9499, + "step": 1011500 + }, + { + "epoch": 29.71, + "grad_norm": 
2.9555001258850098, + "learning_rate": 2.8741775106151385e-06, + "loss": 0.9306, + "step": 1012000 + }, + { + "epoch": 29.73, + "grad_norm": 3.057680606842041, + "learning_rate": 2.7270855192837596e-06, + "loss": 0.9558, + "step": 1012500 + }, + { + "epoch": 29.74, + "grad_norm": 2.978799343109131, + "learning_rate": 2.579993527952381e-06, + "loss": 0.9701, + "step": 1013000 + }, + { + "epoch": 29.76, + "grad_norm": 2.0970911979675293, + "learning_rate": 2.4329015366210023e-06, + "loss": 0.9577, + "step": 1013500 + }, + { + "epoch": 29.77, + "grad_norm": 2.8144404888153076, + "learning_rate": 2.285809545289624e-06, + "loss": 0.9587, + "step": 1014000 + }, + { + "epoch": 29.79, + "grad_norm": 2.756420135498047, + "learning_rate": 2.1387175539582454e-06, + "loss": 0.9554, + "step": 1014500 + }, + { + "epoch": 29.8, + "grad_norm": 3.5521059036254883, + "learning_rate": 1.9916255626268665e-06, + "loss": 0.95, + "step": 1015000 + }, + { + "epoch": 29.82, + "grad_norm": 34.70965576171875, + "learning_rate": 1.844533571295488e-06, + "loss": 0.9708, + "step": 1015500 + }, + { + "epoch": 29.83, + "grad_norm": 3.2949025630950928, + "learning_rate": 1.6974415799641094e-06, + "loss": 0.9673, + "step": 1016000 + }, + { + "epoch": 29.84, + "grad_norm": 2.9649112224578857, + "learning_rate": 1.5503495886327307e-06, + "loss": 0.9758, + "step": 1016500 + }, + { + "epoch": 29.86, + "grad_norm": 2.4358630180358887, + "learning_rate": 1.403257597301352e-06, + "loss": 0.9793, + "step": 1017000 + }, + { + "epoch": 29.87, + "grad_norm": 3.3332042694091797, + "learning_rate": 1.2561656059699736e-06, + "loss": 0.9783, + "step": 1017500 + }, + { + "epoch": 29.89, + "grad_norm": 5.788940906524658, + "learning_rate": 1.1090736146385948e-06, + "loss": 0.9519, + "step": 1018000 + }, + { + "epoch": 29.9, + "grad_norm": 2.6301000118255615, + "learning_rate": 9.619816233072163e-07, + "loss": 0.9558, + "step": 1018500 + }, + { + "epoch": 29.92, + "grad_norm": 2.9536306858062744, + "learning_rate": 8.148896319758376e-07, + "loss": 0.9466, + "step": 1019000 + }, + { + "epoch": 29.93, + "grad_norm": 2.9202470779418945, + "learning_rate": 6.67797640644459e-07, + "loss": 0.9695, + "step": 1019500 + }, + { + "epoch": 29.95, + "grad_norm": 3.0824179649353027, + "learning_rate": 5.207056493130803e-07, + "loss": 0.9361, + "step": 1020000 + }, + { + "epoch": 29.96, + "grad_norm": 2.5186848640441895, + "learning_rate": 3.736136579817017e-07, + "loss": 0.9476, + "step": 1020500 + }, + { + "epoch": 29.98, + "grad_norm": 5.588615894317627, + "learning_rate": 2.265216666503231e-07, + "loss": 0.9431, + "step": 1021000 + }, + { + "epoch": 29.99, + "grad_norm": 2.025895595550537, + "learning_rate": 7.942967531894445e-08, + "loss": 0.9567, + "step": 1021500 + }, + { + "epoch": 30.0, + "step": 1021770, + "total_flos": 1.7792790799000776e+21, + "train_loss": 1.519880951870647, + "train_runtime": 591190.4296, + "train_samples_per_second": 27.653, + "train_steps_per_second": 1.728 + } + ], + "logging_steps": 500, + "max_steps": 1021770, + "num_input_tokens_seen": 0, + "num_train_epochs": 30, + "save_steps": 500, + "total_flos": 1.7792790799000776e+21, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}