{ "best_metric": null, "best_model_checkpoint": null, "epoch": 29.999559594239493, "eval_steps": 500, "global_step": 1021770, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 1.2330610752105713, "learning_rate": 7.5e-05, "loss": 47.9921, "step": 500 }, { "epoch": 0.03, "grad_norm": 1.4777156114578247, "learning_rate": 0.00015, "loss": 6.0359, "step": 1000 }, { "epoch": 0.04, "grad_norm": 2.119438886642456, "learning_rate": 0.000225, "loss": 5.8858, "step": 1500 }, { "epoch": 0.06, "grad_norm": 2.3987252712249756, "learning_rate": 0.0003, "loss": 5.7995, "step": 2000 }, { "epoch": 0.07, "grad_norm": 1.466189980506897, "learning_rate": 0.0002998529080086686, "loss": 5.5973, "step": 2500 }, { "epoch": 0.09, "grad_norm": 1.6185613870620728, "learning_rate": 0.0002997058160173372, "loss": 4.5547, "step": 3000 }, { "epoch": 0.1, "grad_norm": 2.422328472137451, "learning_rate": 0.0002995587240260058, "loss": 3.8361, "step": 3500 }, { "epoch": 0.12, "grad_norm": 2.422811269760132, "learning_rate": 0.00029941163203467446, "loss": 3.5267, "step": 4000 }, { "epoch": 0.13, "grad_norm": 1.8635873794555664, "learning_rate": 0.0002992645400433431, "loss": 3.3318, "step": 4500 }, { "epoch": 0.15, "grad_norm": 2.5039353370666504, "learning_rate": 0.00029911744805201173, "loss": 3.2049, "step": 5000 }, { "epoch": 0.16, "grad_norm": 3.018263578414917, "learning_rate": 0.0002989703560606803, "loss": 3.1146, "step": 5500 }, { "epoch": 0.18, "grad_norm": 2.2091281414031982, "learning_rate": 0.00029882326406934894, "loss": 3.0412, "step": 6000 }, { "epoch": 0.19, "grad_norm": 2.7145626544952393, "learning_rate": 0.0002986761720780176, "loss": 2.9849, "step": 6500 }, { "epoch": 0.21, "grad_norm": 3.0529212951660156, "learning_rate": 0.00029852908008668616, "loss": 2.9731, "step": 7000 }, { "epoch": 0.22, "grad_norm": 2.4529542922973633, "learning_rate": 0.0002983819880953548, "loss": 2.9193, "step": 7500 }, { "epoch": 0.23, "grad_norm": 2.009725332260132, "learning_rate": 0.00029823489610402343, "loss": 2.8839, "step": 8000 }, { "epoch": 0.25, "grad_norm": 3.02736759185791, "learning_rate": 0.00029808780411269206, "loss": 2.8483, "step": 8500 }, { "epoch": 0.26, "grad_norm": 2.597303867340088, "learning_rate": 0.0002979407121213607, "loss": 2.84, "step": 9000 }, { "epoch": 0.28, "grad_norm": 2.826665163040161, "learning_rate": 0.0002977936201300293, "loss": 2.8151, "step": 9500 }, { "epoch": 0.29, "grad_norm": 2.141164779663086, "learning_rate": 0.0002976465281386979, "loss": 2.7756, "step": 10000 }, { "epoch": 0.31, "grad_norm": 3.0721323490142822, "learning_rate": 0.00029749943614736655, "loss": 2.7468, "step": 10500 }, { "epoch": 0.32, "grad_norm": 3.0305256843566895, "learning_rate": 0.0002973523441560351, "loss": 2.7548, "step": 11000 }, { "epoch": 0.34, "grad_norm": 1.7159794569015503, "learning_rate": 0.00029720525216470376, "loss": 2.739, "step": 11500 }, { "epoch": 0.35, "grad_norm": 4.015839099884033, "learning_rate": 0.0002970581601733724, "loss": 2.7338, "step": 12000 }, { "epoch": 0.37, "grad_norm": 2.1496126651763916, "learning_rate": 0.00029691106818204103, "loss": 2.7047, "step": 12500 }, { "epoch": 0.38, "grad_norm": 12.543700218200684, "learning_rate": 0.00029676397619070967, "loss": 2.6905, "step": 13000 }, { "epoch": 0.4, "grad_norm": 2.919098138809204, "learning_rate": 0.00029661688419937825, "loss": 2.6511, "step": 13500 }, { "epoch": 0.41, "grad_norm": 2.164848566055298, "learning_rate": 0.0002964697922080469, "loss": 2.6765, "step": 14000 }, { "epoch": 0.43, "grad_norm": 2.595182418823242, "learning_rate": 0.0002963227002167155, "loss": 2.6406, "step": 14500 }, { "epoch": 0.44, "grad_norm": 3.517666816711426, "learning_rate": 0.0002961756082253841, "loss": 2.6752, "step": 15000 }, { "epoch": 0.46, "grad_norm": 3.788487434387207, "learning_rate": 0.00029602851623405273, "loss": 2.6248, "step": 15500 }, { "epoch": 0.47, "grad_norm": 1.8919869661331177, "learning_rate": 0.00029588142424272137, "loss": 2.6751, "step": 16000 }, { "epoch": 0.48, "grad_norm": 7.1151628494262695, "learning_rate": 0.00029573433225139, "loss": 2.6024, "step": 16500 }, { "epoch": 0.5, "grad_norm": 1.6645219326019287, "learning_rate": 0.00029558724026005863, "loss": 2.6017, "step": 17000 }, { "epoch": 0.51, "grad_norm": 4.183793067932129, "learning_rate": 0.0002954401482687272, "loss": 2.5482, "step": 17500 }, { "epoch": 0.53, "grad_norm": 3.3193671703338623, "learning_rate": 0.00029529305627739585, "loss": 2.6058, "step": 18000 }, { "epoch": 0.54, "grad_norm": 1.861599326133728, "learning_rate": 0.0002951459642860645, "loss": 2.5766, "step": 18500 }, { "epoch": 0.56, "grad_norm": 2.443558931350708, "learning_rate": 0.00029499887229473306, "loss": 2.5901, "step": 19000 }, { "epoch": 0.57, "grad_norm": 2.1485493183135986, "learning_rate": 0.0002948517803034017, "loss": 2.549, "step": 19500 }, { "epoch": 0.59, "grad_norm": 2.802183151245117, "learning_rate": 0.00029470468831207033, "loss": 2.568, "step": 20000 }, { "epoch": 0.6, "grad_norm": 2.226308822631836, "learning_rate": 0.00029455759632073897, "loss": 2.5599, "step": 20500 }, { "epoch": 0.62, "grad_norm": 8.404243469238281, "learning_rate": 0.0002944105043294076, "loss": 2.5574, "step": 21000 }, { "epoch": 0.63, "grad_norm": 4.14946985244751, "learning_rate": 0.0002942634123380762, "loss": 2.5266, "step": 21500 }, { "epoch": 0.65, "grad_norm": 1.674615502357483, "learning_rate": 0.0002941163203467448, "loss": 2.5094, "step": 22000 }, { "epoch": 0.66, "grad_norm": 1.943082332611084, "learning_rate": 0.00029396922835541345, "loss": 2.4967, "step": 22500 }, { "epoch": 0.68, "grad_norm": 1.9026265144348145, "learning_rate": 0.0002938221363640821, "loss": 2.4974, "step": 23000 }, { "epoch": 0.69, "grad_norm": 4.214877128601074, "learning_rate": 0.00029367504437275067, "loss": 2.5329, "step": 23500 }, { "epoch": 0.7, "grad_norm": 2.1662731170654297, "learning_rate": 0.0002935279523814193, "loss": 2.508, "step": 24000 }, { "epoch": 0.72, "grad_norm": 2.6833200454711914, "learning_rate": 0.00029338086039008794, "loss": 2.4891, "step": 24500 }, { "epoch": 0.73, "grad_norm": 10.436029434204102, "learning_rate": 0.00029323376839875657, "loss": 2.506, "step": 25000 }, { "epoch": 0.75, "grad_norm": 1.6364326477050781, "learning_rate": 0.0002930866764074252, "loss": 2.5091, "step": 25500 }, { "epoch": 0.76, "grad_norm": 2.408642292022705, "learning_rate": 0.0002929395844160938, "loss": 2.4987, "step": 26000 }, { "epoch": 0.78, "grad_norm": 2.324516534805298, "learning_rate": 0.0002927924924247624, "loss": 2.5619, "step": 26500 }, { "epoch": 0.79, "grad_norm": 2.3488011360168457, "learning_rate": 0.00029264540043343106, "loss": 2.4594, "step": 27000 }, { "epoch": 0.81, "grad_norm": 2.130359411239624, "learning_rate": 0.00029249830844209964, "loss": 2.4677, "step": 27500 }, { "epoch": 0.82, "grad_norm": 3.6321048736572266, "learning_rate": 0.0002923512164507683, "loss": 2.4925, "step": 28000 }, { "epoch": 0.84, "grad_norm": 2.2566709518432617, "learning_rate": 0.0002922041244594369, "loss": 2.4779, "step": 28500 }, { "epoch": 0.85, "grad_norm": 1.4957294464111328, "learning_rate": 0.00029205703246810554, "loss": 2.4482, "step": 29000 }, { "epoch": 0.87, "grad_norm": 2.031252384185791, "learning_rate": 0.0002919099404767742, "loss": 2.4215, "step": 29500 }, { "epoch": 0.88, "grad_norm": 3.686828136444092, "learning_rate": 0.00029176284848544276, "loss": 2.4463, "step": 30000 }, { "epoch": 0.9, "grad_norm": 1.746907114982605, "learning_rate": 0.0002916157564941114, "loss": 2.4436, "step": 30500 }, { "epoch": 0.91, "grad_norm": 2.379333257675171, "learning_rate": 0.00029146866450278, "loss": 2.4244, "step": 31000 }, { "epoch": 0.92, "grad_norm": 2.696253776550293, "learning_rate": 0.0002913215725114486, "loss": 2.4218, "step": 31500 }, { "epoch": 0.94, "grad_norm": 4.1202826499938965, "learning_rate": 0.0002911744805201173, "loss": 2.438, "step": 32000 }, { "epoch": 0.95, "grad_norm": 3.3910372257232666, "learning_rate": 0.0002910273885287859, "loss": 2.4405, "step": 32500 }, { "epoch": 0.97, "grad_norm": 2.9638476371765137, "learning_rate": 0.0002908802965374545, "loss": 2.4461, "step": 33000 }, { "epoch": 0.98, "grad_norm": 3.280348062515259, "learning_rate": 0.00029073320454612315, "loss": 2.4177, "step": 33500 }, { "epoch": 1.0, "grad_norm": 2.654728651046753, "learning_rate": 0.0002905861125547917, "loss": 2.414, "step": 34000 }, { "epoch": 1.01, "grad_norm": 2.3180739879608154, "learning_rate": 0.00029043902056346036, "loss": 2.367, "step": 34500 }, { "epoch": 1.03, "grad_norm": 5.21975040435791, "learning_rate": 0.000290291928572129, "loss": 2.4158, "step": 35000 }, { "epoch": 1.04, "grad_norm": 1.7568458318710327, "learning_rate": 0.0002901448365807976, "loss": 2.3574, "step": 35500 }, { "epoch": 1.06, "grad_norm": 2.872269630432129, "learning_rate": 0.00028999774458946626, "loss": 2.3842, "step": 36000 }, { "epoch": 1.07, "grad_norm": 2.3413188457489014, "learning_rate": 0.00028985065259813484, "loss": 2.3313, "step": 36500 }, { "epoch": 1.09, "grad_norm": 1.4920777082443237, "learning_rate": 0.0002897035606068035, "loss": 2.4049, "step": 37000 }, { "epoch": 1.1, "grad_norm": 2.5813608169555664, "learning_rate": 0.0002895564686154721, "loss": 2.3641, "step": 37500 }, { "epoch": 1.12, "grad_norm": 2.110539674758911, "learning_rate": 0.0002894093766241407, "loss": 2.3704, "step": 38000 }, { "epoch": 1.13, "grad_norm": 2.397954225540161, "learning_rate": 0.00028926228463280933, "loss": 2.3962, "step": 38500 }, { "epoch": 1.15, "grad_norm": 1.290098786354065, "learning_rate": 0.00028911519264147796, "loss": 2.3359, "step": 39000 }, { "epoch": 1.16, "grad_norm": 1.7063418626785278, "learning_rate": 0.00028896810065014654, "loss": 2.34, "step": 39500 }, { "epoch": 1.17, "grad_norm": 1.603582501411438, "learning_rate": 0.00028882100865881523, "loss": 2.3439, "step": 40000 }, { "epoch": 1.19, "grad_norm": 1.6357001066207886, "learning_rate": 0.0002886739166674838, "loss": 2.3689, "step": 40500 }, { "epoch": 1.2, "grad_norm": 2.4969911575317383, "learning_rate": 0.00028852682467615245, "loss": 2.3621, "step": 41000 }, { "epoch": 1.22, "grad_norm": 1.7131437063217163, "learning_rate": 0.0002883797326848211, "loss": 2.3402, "step": 41500 }, { "epoch": 1.23, "grad_norm": 1.681779384613037, "learning_rate": 0.00028823264069348966, "loss": 2.3047, "step": 42000 }, { "epoch": 1.25, "grad_norm": 3.0767245292663574, "learning_rate": 0.0002880855487021583, "loss": 2.3529, "step": 42500 }, { "epoch": 1.26, "grad_norm": 2.4065330028533936, "learning_rate": 0.00028793845671082693, "loss": 2.3332, "step": 43000 }, { "epoch": 1.28, "grad_norm": 3.0359294414520264, "learning_rate": 0.0002877913647194955, "loss": 2.3477, "step": 43500 }, { "epoch": 1.29, "grad_norm": 2.010524272918701, "learning_rate": 0.0002876442727281642, "loss": 2.3123, "step": 44000 }, { "epoch": 1.31, "grad_norm": 9.476223945617676, "learning_rate": 0.0002874971807368328, "loss": 2.3187, "step": 44500 }, { "epoch": 1.32, "grad_norm": 1.5241541862487793, "learning_rate": 0.0002873500887455014, "loss": 2.3473, "step": 45000 }, { "epoch": 1.34, "grad_norm": 2.111661434173584, "learning_rate": 0.00028720299675417005, "loss": 2.3122, "step": 45500 }, { "epoch": 1.35, "grad_norm": 1.5378273725509644, "learning_rate": 0.00028705590476283863, "loss": 2.3185, "step": 46000 }, { "epoch": 1.37, "grad_norm": 2.218921184539795, "learning_rate": 0.00028690881277150727, "loss": 2.3426, "step": 46500 }, { "epoch": 1.38, "grad_norm": 2.234276294708252, "learning_rate": 0.0002867617207801759, "loss": 2.2885, "step": 47000 }, { "epoch": 1.39, "grad_norm": 2.929063081741333, "learning_rate": 0.0002866146287888445, "loss": 2.3047, "step": 47500 }, { "epoch": 1.41, "grad_norm": 1.764546513557434, "learning_rate": 0.00028646753679751317, "loss": 2.275, "step": 48000 }, { "epoch": 1.42, "grad_norm": 3.143376111984253, "learning_rate": 0.00028632044480618175, "loss": 2.2591, "step": 48500 }, { "epoch": 1.44, "grad_norm": 2.8672120571136475, "learning_rate": 0.0002861733528148504, "loss": 2.2462, "step": 49000 }, { "epoch": 1.45, "grad_norm": 3.577282667160034, "learning_rate": 0.000286026260823519, "loss": 2.2689, "step": 49500 }, { "epoch": 1.47, "grad_norm": 1.5024135112762451, "learning_rate": 0.0002858791688321876, "loss": 2.2886, "step": 50000 }, { "epoch": 1.48, "grad_norm": 2.36291241645813, "learning_rate": 0.00028573207684085624, "loss": 2.2102, "step": 50500 }, { "epoch": 1.5, "grad_norm": 2.7048864364624023, "learning_rate": 0.00028558498484952487, "loss": 2.2734, "step": 51000 }, { "epoch": 1.51, "grad_norm": 15.023186683654785, "learning_rate": 0.00028543789285819345, "loss": 2.2859, "step": 51500 }, { "epoch": 1.53, "grad_norm": 1.5235730409622192, "learning_rate": 0.00028529080086686214, "loss": 2.2459, "step": 52000 }, { "epoch": 1.54, "grad_norm": 3.157870292663574, "learning_rate": 0.0002851437088755307, "loss": 2.2679, "step": 52500 }, { "epoch": 1.56, "grad_norm": 5.78060245513916, "learning_rate": 0.00028499661688419936, "loss": 2.2781, "step": 53000 }, { "epoch": 1.57, "grad_norm": 3.3039705753326416, "learning_rate": 0.000284849524892868, "loss": 2.2761, "step": 53500 }, { "epoch": 1.59, "grad_norm": 2.286205530166626, "learning_rate": 0.00028470243290153657, "loss": 2.2549, "step": 54000 }, { "epoch": 1.6, "grad_norm": 2.2150983810424805, "learning_rate": 0.0002845553409102052, "loss": 2.2443, "step": 54500 }, { "epoch": 1.61, "grad_norm": 3.0205180644989014, "learning_rate": 0.00028440824891887384, "loss": 2.2676, "step": 55000 }, { "epoch": 1.63, "grad_norm": 1.5430960655212402, "learning_rate": 0.0002842611569275424, "loss": 2.2403, "step": 55500 }, { "epoch": 1.64, "grad_norm": 1.979278802871704, "learning_rate": 0.0002841140649362111, "loss": 2.2409, "step": 56000 }, { "epoch": 1.66, "grad_norm": 1.6833264827728271, "learning_rate": 0.0002839669729448797, "loss": 2.2657, "step": 56500 }, { "epoch": 1.67, "grad_norm": 1.9541752338409424, "learning_rate": 0.0002838198809535483, "loss": 2.2767, "step": 57000 }, { "epoch": 1.69, "grad_norm": 5.107935905456543, "learning_rate": 0.00028367278896221696, "loss": 2.2774, "step": 57500 }, { "epoch": 1.7, "grad_norm": 1.5604417324066162, "learning_rate": 0.00028352569697088554, "loss": 2.2405, "step": 58000 }, { "epoch": 1.72, "grad_norm": 1.6418412923812866, "learning_rate": 0.0002833786049795542, "loss": 2.2434, "step": 58500 }, { "epoch": 1.73, "grad_norm": 1.5278984308242798, "learning_rate": 0.0002832315129882228, "loss": 2.2977, "step": 59000 }, { "epoch": 1.75, "grad_norm": 2.3131840229034424, "learning_rate": 0.0002830844209968914, "loss": 2.2236, "step": 59500 }, { "epoch": 1.76, "grad_norm": 1.7341011762619019, "learning_rate": 0.0002829373290055601, "loss": 2.2235, "step": 60000 }, { "epoch": 1.78, "grad_norm": 2.8039567470550537, "learning_rate": 0.00028279023701422866, "loss": 2.2475, "step": 60500 }, { "epoch": 1.79, "grad_norm": 4.142283916473389, "learning_rate": 0.0002826431450228973, "loss": 2.2128, "step": 61000 }, { "epoch": 1.81, "grad_norm": 1.2194138765335083, "learning_rate": 0.00028249605303156593, "loss": 2.2568, "step": 61500 }, { "epoch": 1.82, "grad_norm": 1.2129665613174438, "learning_rate": 0.0002823489610402345, "loss": 2.2282, "step": 62000 }, { "epoch": 1.84, "grad_norm": 8.789237022399902, "learning_rate": 0.00028220186904890314, "loss": 2.2071, "step": 62500 }, { "epoch": 1.85, "grad_norm": 1.4489936828613281, "learning_rate": 0.0002820547770575718, "loss": 2.2389, "step": 63000 }, { "epoch": 1.86, "grad_norm": 1.591963768005371, "learning_rate": 0.0002819076850662404, "loss": 2.2277, "step": 63500 }, { "epoch": 1.88, "grad_norm": 2.642183303833008, "learning_rate": 0.00028176059307490905, "loss": 2.2195, "step": 64000 }, { "epoch": 1.89, "grad_norm": 2.7950453758239746, "learning_rate": 0.00028161350108357763, "loss": 2.2179, "step": 64500 }, { "epoch": 1.91, "grad_norm": 2.3858351707458496, "learning_rate": 0.00028146640909224626, "loss": 2.2266, "step": 65000 }, { "epoch": 1.92, "grad_norm": 2.2545394897460938, "learning_rate": 0.0002813193171009149, "loss": 2.2538, "step": 65500 }, { "epoch": 1.94, "grad_norm": 2.3457772731781006, "learning_rate": 0.0002811722251095835, "loss": 2.2351, "step": 66000 }, { "epoch": 1.95, "grad_norm": 2.0313475131988525, "learning_rate": 0.0002810251331182521, "loss": 2.2574, "step": 66500 }, { "epoch": 1.97, "grad_norm": 4.175018310546875, "learning_rate": 0.00028087804112692075, "loss": 2.2269, "step": 67000 }, { "epoch": 1.98, "grad_norm": 1.963953971862793, "learning_rate": 0.0002807309491355894, "loss": 2.2408, "step": 67500 }, { "epoch": 2.0, "grad_norm": 1.8645119667053223, "learning_rate": 0.000280583857144258, "loss": 2.2588, "step": 68000 }, { "epoch": 2.01, "grad_norm": 1.8845419883728027, "learning_rate": 0.0002804367651529266, "loss": 2.1532, "step": 68500 }, { "epoch": 2.03, "grad_norm": 2.3963561058044434, "learning_rate": 0.00028028967316159523, "loss": 2.1441, "step": 69000 }, { "epoch": 2.04, "grad_norm": 3.194061040878296, "learning_rate": 0.00028014258117026387, "loss": 2.1184, "step": 69500 }, { "epoch": 2.06, "grad_norm": 1.4685026407241821, "learning_rate": 0.0002799954891789325, "loss": 2.1504, "step": 70000 }, { "epoch": 2.07, "grad_norm": 6.733902931213379, "learning_rate": 0.0002798483971876011, "loss": 2.1449, "step": 70500 }, { "epoch": 2.08, "grad_norm": 1.5692353248596191, "learning_rate": 0.0002797013051962697, "loss": 2.1132, "step": 71000 }, { "epoch": 2.1, "grad_norm": 2.4298503398895264, "learning_rate": 0.00027955421320493835, "loss": 2.1557, "step": 71500 }, { "epoch": 2.11, "grad_norm": 1.9930598735809326, "learning_rate": 0.000279407121213607, "loss": 2.1509, "step": 72000 }, { "epoch": 2.13, "grad_norm": 3.2199394702911377, "learning_rate": 0.0002792600292222756, "loss": 2.1637, "step": 72500 }, { "epoch": 2.14, "grad_norm": 11.737617492675781, "learning_rate": 0.0002791129372309442, "loss": 2.1145, "step": 73000 }, { "epoch": 2.16, "grad_norm": 3.2281816005706787, "learning_rate": 0.00027896584523961284, "loss": 2.205, "step": 73500 }, { "epoch": 2.17, "grad_norm": 1.6318072080612183, "learning_rate": 0.00027881875324828147, "loss": 2.1355, "step": 74000 }, { "epoch": 2.19, "grad_norm": 1.4795607328414917, "learning_rate": 0.00027867166125695005, "loss": 2.1594, "step": 74500 }, { "epoch": 2.2, "grad_norm": 3.246556043624878, "learning_rate": 0.0002785245692656187, "loss": 2.1225, "step": 75000 }, { "epoch": 2.22, "grad_norm": 5.301700115203857, "learning_rate": 0.0002783774772742873, "loss": 2.1477, "step": 75500 }, { "epoch": 2.23, "grad_norm": 1.3937615156173706, "learning_rate": 0.00027823038528295595, "loss": 2.1377, "step": 76000 }, { "epoch": 2.25, "grad_norm": 5.8993611335754395, "learning_rate": 0.0002780832932916246, "loss": 2.156, "step": 76500 }, { "epoch": 2.26, "grad_norm": 3.7031867504119873, "learning_rate": 0.00027793620130029317, "loss": 2.1246, "step": 77000 }, { "epoch": 2.28, "grad_norm": 3.8393964767456055, "learning_rate": 0.0002777891093089618, "loss": 2.1503, "step": 77500 }, { "epoch": 2.29, "grad_norm": 1.3489915132522583, "learning_rate": 0.00027764201731763044, "loss": 2.1506, "step": 78000 }, { "epoch": 2.3, "grad_norm": 6.942753791809082, "learning_rate": 0.000277494925326299, "loss": 2.1575, "step": 78500 }, { "epoch": 2.32, "grad_norm": 3.2154929637908936, "learning_rate": 0.00027734783333496765, "loss": 2.1279, "step": 79000 }, { "epoch": 2.33, "grad_norm": 1.8168853521347046, "learning_rate": 0.0002772007413436363, "loss": 2.0715, "step": 79500 }, { "epoch": 2.35, "grad_norm": 2.112091541290283, "learning_rate": 0.0002770536493523049, "loss": 2.1344, "step": 80000 }, { "epoch": 2.36, "grad_norm": 2.050875663757324, "learning_rate": 0.00027690655736097356, "loss": 2.1416, "step": 80500 }, { "epoch": 2.38, "grad_norm": 1.736621618270874, "learning_rate": 0.00027675946536964214, "loss": 2.1196, "step": 81000 }, { "epoch": 2.39, "grad_norm": 2.051025629043579, "learning_rate": 0.0002766123733783108, "loss": 2.1272, "step": 81500 }, { "epoch": 2.41, "grad_norm": 4.314809799194336, "learning_rate": 0.0002764652813869794, "loss": 2.1618, "step": 82000 }, { "epoch": 2.42, "grad_norm": 2.920485496520996, "learning_rate": 0.000276318189395648, "loss": 2.1354, "step": 82500 }, { "epoch": 2.44, "grad_norm": 6.310970783233643, "learning_rate": 0.0002761710974043166, "loss": 2.1495, "step": 83000 }, { "epoch": 2.45, "grad_norm": 1.929152250289917, "learning_rate": 0.00027602400541298526, "loss": 2.1271, "step": 83500 }, { "epoch": 2.47, "grad_norm": 2.9410946369171143, "learning_rate": 0.0002758769134216539, "loss": 2.1646, "step": 84000 }, { "epoch": 2.48, "grad_norm": 2.474297523498535, "learning_rate": 0.00027572982143032253, "loss": 2.127, "step": 84500 }, { "epoch": 2.5, "grad_norm": 1.6442033052444458, "learning_rate": 0.0002755827294389911, "loss": 2.1433, "step": 85000 }, { "epoch": 2.51, "grad_norm": 1.3729546070098877, "learning_rate": 0.00027543563744765974, "loss": 2.1121, "step": 85500 }, { "epoch": 2.52, "grad_norm": 1.6497186422348022, "learning_rate": 0.0002752885454563284, "loss": 2.1566, "step": 86000 }, { "epoch": 2.54, "grad_norm": 2.6162164211273193, "learning_rate": 0.00027514145346499696, "loss": 2.1513, "step": 86500 }, { "epoch": 2.55, "grad_norm": 1.7324166297912598, "learning_rate": 0.0002749943614736656, "loss": 2.1482, "step": 87000 }, { "epoch": 2.57, "grad_norm": 1.3059158325195312, "learning_rate": 0.0002748472694823342, "loss": 2.1346, "step": 87500 }, { "epoch": 2.58, "grad_norm": 1.1019114255905151, "learning_rate": 0.00027470017749100286, "loss": 2.123, "step": 88000 }, { "epoch": 2.6, "grad_norm": 7.39063024520874, "learning_rate": 0.0002745530854996715, "loss": 2.1278, "step": 88500 }, { "epoch": 2.61, "grad_norm": 1.9237990379333496, "learning_rate": 0.0002744059935083401, "loss": 2.115, "step": 89000 }, { "epoch": 2.63, "grad_norm": 1.4632532596588135, "learning_rate": 0.0002742589015170087, "loss": 2.137, "step": 89500 }, { "epoch": 2.64, "grad_norm": 1.9587647914886475, "learning_rate": 0.00027411180952567735, "loss": 2.1056, "step": 90000 }, { "epoch": 2.66, "grad_norm": 7.492849349975586, "learning_rate": 0.0002739647175343459, "loss": 2.1208, "step": 90500 }, { "epoch": 2.67, "grad_norm": 1.7255750894546509, "learning_rate": 0.00027381762554301456, "loss": 2.1015, "step": 91000 }, { "epoch": 2.69, "grad_norm": 8.091690063476562, "learning_rate": 0.0002736705335516832, "loss": 2.1073, "step": 91500 }, { "epoch": 2.7, "grad_norm": 6.672662734985352, "learning_rate": 0.00027352344156035183, "loss": 2.1059, "step": 92000 }, { "epoch": 2.72, "grad_norm": 1.948398470878601, "learning_rate": 0.00027337634956902047, "loss": 2.0943, "step": 92500 }, { "epoch": 2.73, "grad_norm": 1.4260573387145996, "learning_rate": 0.00027322925757768905, "loss": 2.1283, "step": 93000 }, { "epoch": 2.75, "grad_norm": 1.5472320318222046, "learning_rate": 0.0002730821655863577, "loss": 2.0882, "step": 93500 }, { "epoch": 2.76, "grad_norm": 1.3860925436019897, "learning_rate": 0.0002729350735950263, "loss": 2.1323, "step": 94000 }, { "epoch": 2.77, "grad_norm": 2.232808828353882, "learning_rate": 0.0002727879816036949, "loss": 2.1485, "step": 94500 }, { "epoch": 2.79, "grad_norm": 1.8404427766799927, "learning_rate": 0.00027264088961236353, "loss": 2.1116, "step": 95000 }, { "epoch": 2.8, "grad_norm": 2.124656915664673, "learning_rate": 0.00027249379762103216, "loss": 2.1076, "step": 95500 }, { "epoch": 2.82, "grad_norm": 2.2169177532196045, "learning_rate": 0.0002723467056297008, "loss": 2.1035, "step": 96000 }, { "epoch": 2.83, "grad_norm": 2.7737512588500977, "learning_rate": 0.00027219961363836943, "loss": 2.1369, "step": 96500 }, { "epoch": 2.85, "grad_norm": 2.6662325859069824, "learning_rate": 0.000272052521647038, "loss": 2.1131, "step": 97000 }, { "epoch": 2.86, "grad_norm": 2.6663177013397217, "learning_rate": 0.00027190542965570665, "loss": 2.1175, "step": 97500 }, { "epoch": 2.88, "grad_norm": 1.446356177330017, "learning_rate": 0.0002717583376643753, "loss": 2.1076, "step": 98000 }, { "epoch": 2.89, "grad_norm": 2.3340353965759277, "learning_rate": 0.00027161124567304386, "loss": 2.0877, "step": 98500 }, { "epoch": 2.91, "grad_norm": 3.389127254486084, "learning_rate": 0.0002714641536817125, "loss": 2.0904, "step": 99000 }, { "epoch": 2.92, "grad_norm": 2.044728994369507, "learning_rate": 0.00027131706169038113, "loss": 2.0608, "step": 99500 }, { "epoch": 2.94, "grad_norm": 17.932655334472656, "learning_rate": 0.00027116996969904977, "loss": 2.1136, "step": 100000 }, { "epoch": 2.95, "grad_norm": 1.4239097833633423, "learning_rate": 0.0002710228777077184, "loss": 2.0783, "step": 100500 }, { "epoch": 2.97, "grad_norm": 2.1303598880767822, "learning_rate": 0.000270875785716387, "loss": 2.0981, "step": 101000 }, { "epoch": 2.98, "grad_norm": 2.2777888774871826, "learning_rate": 0.0002707286937250556, "loss": 2.0862, "step": 101500 }, { "epoch": 2.99, "grad_norm": 2.2035715579986572, "learning_rate": 0.00027058160173372425, "loss": 2.1089, "step": 102000 }, { "epoch": 3.01, "grad_norm": 2.7756547927856445, "learning_rate": 0.00027043450974239283, "loss": 2.0614, "step": 102500 }, { "epoch": 3.02, "grad_norm": 1.4222705364227295, "learning_rate": 0.00027028741775106147, "loss": 2.0216, "step": 103000 }, { "epoch": 3.04, "grad_norm": 1.5257524251937866, "learning_rate": 0.0002701403257597301, "loss": 2.0388, "step": 103500 }, { "epoch": 3.05, "grad_norm": 2.274021625518799, "learning_rate": 0.00026999323376839874, "loss": 2.048, "step": 104000 }, { "epoch": 3.07, "grad_norm": 5.903868198394775, "learning_rate": 0.00026984614177706737, "loss": 2.0008, "step": 104500 }, { "epoch": 3.08, "grad_norm": 1.561962604522705, "learning_rate": 0.00026969904978573595, "loss": 2.05, "step": 105000 }, { "epoch": 3.1, "grad_norm": 2.002523422241211, "learning_rate": 0.0002695519577944046, "loss": 2.0217, "step": 105500 }, { "epoch": 3.11, "grad_norm": 1.9345905780792236, "learning_rate": 0.0002694048658030732, "loss": 2.0343, "step": 106000 }, { "epoch": 3.13, "grad_norm": 1.5755488872528076, "learning_rate": 0.0002692577738117418, "loss": 2.0133, "step": 106500 }, { "epoch": 3.14, "grad_norm": 1.7304731607437134, "learning_rate": 0.00026911068182041044, "loss": 2.0177, "step": 107000 }, { "epoch": 3.16, "grad_norm": 2.4045250415802, "learning_rate": 0.00026896358982907907, "loss": 2.034, "step": 107500 }, { "epoch": 3.17, "grad_norm": 1.5433346033096313, "learning_rate": 0.0002688164978377477, "loss": 2.0228, "step": 108000 }, { "epoch": 3.19, "grad_norm": 2.033250093460083, "learning_rate": 0.00026866940584641634, "loss": 2.0076, "step": 108500 }, { "epoch": 3.2, "grad_norm": 3.0048422813415527, "learning_rate": 0.0002685223138550849, "loss": 1.9876, "step": 109000 }, { "epoch": 3.21, "grad_norm": 1.6355750560760498, "learning_rate": 0.00026837522186375356, "loss": 2.0259, "step": 109500 }, { "epoch": 3.23, "grad_norm": 7.954076290130615, "learning_rate": 0.0002682281298724222, "loss": 2.0408, "step": 110000 }, { "epoch": 3.24, "grad_norm": 1.3474091291427612, "learning_rate": 0.0002680810378810908, "loss": 2.0114, "step": 110500 }, { "epoch": 3.26, "grad_norm": 1.8665661811828613, "learning_rate": 0.0002679339458897594, "loss": 2.0388, "step": 111000 }, { "epoch": 3.27, "grad_norm": 1.452719807624817, "learning_rate": 0.00026778685389842804, "loss": 2.0411, "step": 111500 }, { "epoch": 3.29, "grad_norm": 1.9409444332122803, "learning_rate": 0.0002676397619070967, "loss": 2.0355, "step": 112000 }, { "epoch": 3.3, "grad_norm": 1.4139233827590942, "learning_rate": 0.0002674926699157653, "loss": 2.0306, "step": 112500 }, { "epoch": 3.32, "grad_norm": 2.548440933227539, "learning_rate": 0.00026734557792443394, "loss": 2.013, "step": 113000 }, { "epoch": 3.33, "grad_norm": 16.565500259399414, "learning_rate": 0.0002671984859331025, "loss": 2.0435, "step": 113500 }, { "epoch": 3.35, "grad_norm": 2.008643865585327, "learning_rate": 0.00026705139394177116, "loss": 2.0094, "step": 114000 }, { "epoch": 3.36, "grad_norm": 5.655598163604736, "learning_rate": 0.0002669043019504398, "loss": 2.0169, "step": 114500 }, { "epoch": 3.38, "grad_norm": 13.43127727508545, "learning_rate": 0.0002667572099591084, "loss": 2.0273, "step": 115000 }, { "epoch": 3.39, "grad_norm": 4.050173282623291, "learning_rate": 0.00026661011796777706, "loss": 2.0406, "step": 115500 }, { "epoch": 3.41, "grad_norm": 2.2970187664031982, "learning_rate": 0.00026646302597644564, "loss": 2.0374, "step": 116000 }, { "epoch": 3.42, "grad_norm": 1.8749233484268188, "learning_rate": 0.0002663159339851143, "loss": 1.991, "step": 116500 }, { "epoch": 3.44, "grad_norm": 2.4106335639953613, "learning_rate": 0.0002661688419937829, "loss": 2.0425, "step": 117000 }, { "epoch": 3.45, "grad_norm": 1.8036812543869019, "learning_rate": 0.0002660217500024515, "loss": 1.9968, "step": 117500 }, { "epoch": 3.46, "grad_norm": 1.831998348236084, "learning_rate": 0.00026587465801112013, "loss": 2.0291, "step": 118000 }, { "epoch": 3.48, "grad_norm": 2.4199304580688477, "learning_rate": 0.00026572756601978876, "loss": 2.0186, "step": 118500 }, { "epoch": 3.49, "grad_norm": 4.254742622375488, "learning_rate": 0.00026558047402845734, "loss": 2.0226, "step": 119000 }, { "epoch": 3.51, "grad_norm": 12.558996200561523, "learning_rate": 0.00026543338203712603, "loss": 2.0368, "step": 119500 }, { "epoch": 3.52, "grad_norm": 1.9092944860458374, "learning_rate": 0.0002652862900457946, "loss": 2.02, "step": 120000 }, { "epoch": 3.54, "grad_norm": 1.9921791553497314, "learning_rate": 0.00026513919805446325, "loss": 2.019, "step": 120500 }, { "epoch": 3.55, "grad_norm": 2.1361401081085205, "learning_rate": 0.0002649921060631319, "loss": 1.9927, "step": 121000 }, { "epoch": 3.57, "grad_norm": 1.6607388257980347, "learning_rate": 0.00026484501407180046, "loss": 2.0722, "step": 121500 }, { "epoch": 3.58, "grad_norm": 2.246613025665283, "learning_rate": 0.0002646979220804691, "loss": 2.0091, "step": 122000 }, { "epoch": 3.6, "grad_norm": 5.497014045715332, "learning_rate": 0.00026455083008913773, "loss": 2.0182, "step": 122500 }, { "epoch": 3.61, "grad_norm": 1.5741316080093384, "learning_rate": 0.00026440373809780637, "loss": 2.0113, "step": 123000 }, { "epoch": 3.63, "grad_norm": 2.1419429779052734, "learning_rate": 0.000264256646106475, "loss": 2.0112, "step": 123500 }, { "epoch": 3.64, "grad_norm": 2.309093952178955, "learning_rate": 0.0002641095541151436, "loss": 1.9943, "step": 124000 }, { "epoch": 3.66, "grad_norm": 1.7635866403579712, "learning_rate": 0.0002639624621238122, "loss": 2.0081, "step": 124500 }, { "epoch": 3.67, "grad_norm": 2.3035855293273926, "learning_rate": 0.00026381537013248085, "loss": 1.9658, "step": 125000 }, { "epoch": 3.68, "grad_norm": 1.8141978979110718, "learning_rate": 0.00026366827814114943, "loss": 2.0203, "step": 125500 }, { "epoch": 3.7, "grad_norm": 1.8287400007247925, "learning_rate": 0.00026352118614981807, "loss": 1.9636, "step": 126000 }, { "epoch": 3.71, "grad_norm": 1.356780767440796, "learning_rate": 0.0002633740941584867, "loss": 2.0116, "step": 126500 }, { "epoch": 3.73, "grad_norm": 5.973343849182129, "learning_rate": 0.00026322700216715534, "loss": 1.9837, "step": 127000 }, { "epoch": 3.74, "grad_norm": 2.090287446975708, "learning_rate": 0.00026307991017582397, "loss": 2.0161, "step": 127500 }, { "epoch": 3.76, "grad_norm": 1.422405481338501, "learning_rate": 0.00026293281818449255, "loss": 2.0096, "step": 128000 }, { "epoch": 3.77, "grad_norm": 5.0919365882873535, "learning_rate": 0.0002627857261931612, "loss": 1.997, "step": 128500 }, { "epoch": 3.79, "grad_norm": 2.7366178035736084, "learning_rate": 0.0002626386342018298, "loss": 2.001, "step": 129000 }, { "epoch": 3.8, "grad_norm": 1.3529456853866577, "learning_rate": 0.0002624915422104984, "loss": 2.0208, "step": 129500 }, { "epoch": 3.82, "grad_norm": 2.3386831283569336, "learning_rate": 0.00026234445021916704, "loss": 2.0049, "step": 130000 }, { "epoch": 3.83, "grad_norm": 2.074753999710083, "learning_rate": 0.00026219735822783567, "loss": 1.988, "step": 130500 }, { "epoch": 3.85, "grad_norm": 4.412100791931152, "learning_rate": 0.0002620502662365043, "loss": 1.9851, "step": 131000 }, { "epoch": 3.86, "grad_norm": 2.1146128177642822, "learning_rate": 0.00026190317424517294, "loss": 2.0049, "step": 131500 }, { "epoch": 3.88, "grad_norm": 7.746240615844727, "learning_rate": 0.0002617560822538415, "loss": 1.9978, "step": 132000 }, { "epoch": 3.89, "grad_norm": 1.8406715393066406, "learning_rate": 0.00026160899026251016, "loss": 1.9677, "step": 132500 }, { "epoch": 3.9, "grad_norm": 6.467206954956055, "learning_rate": 0.0002614618982711788, "loss": 1.9906, "step": 133000 }, { "epoch": 3.92, "grad_norm": 2.756176471710205, "learning_rate": 0.00026131480627984737, "loss": 2.0179, "step": 133500 }, { "epoch": 3.93, "grad_norm": 2.753931760787964, "learning_rate": 0.000261167714288516, "loss": 2.0166, "step": 134000 }, { "epoch": 3.95, "grad_norm": 32.68937301635742, "learning_rate": 0.00026102062229718464, "loss": 2.002, "step": 134500 }, { "epoch": 3.96, "grad_norm": 2.8605151176452637, "learning_rate": 0.0002608735303058533, "loss": 1.9997, "step": 135000 }, { "epoch": 3.98, "grad_norm": 1.3387537002563477, "learning_rate": 0.0002607264383145219, "loss": 1.9992, "step": 135500 }, { "epoch": 3.99, "grad_norm": 2.1721653938293457, "learning_rate": 0.0002605793463231905, "loss": 1.9946, "step": 136000 }, { "epoch": 4.01, "grad_norm": 15.696208953857422, "learning_rate": 0.0002604322543318591, "loss": 1.9808, "step": 136500 }, { "epoch": 4.02, "grad_norm": 2.118614912033081, "learning_rate": 0.00026028516234052776, "loss": 1.9334, "step": 137000 }, { "epoch": 4.04, "grad_norm": 4.258530616760254, "learning_rate": 0.00026013807034919634, "loss": 1.9505, "step": 137500 }, { "epoch": 4.05, "grad_norm": 1.8109312057495117, "learning_rate": 0.000259990978357865, "loss": 1.909, "step": 138000 }, { "epoch": 4.07, "grad_norm": 1.503454327583313, "learning_rate": 0.0002598438863665336, "loss": 1.9283, "step": 138500 }, { "epoch": 4.08, "grad_norm": 5.558799743652344, "learning_rate": 0.00025969679437520224, "loss": 1.959, "step": 139000 }, { "epoch": 4.1, "grad_norm": 2.509039878845215, "learning_rate": 0.0002595497023838709, "loss": 1.9111, "step": 139500 }, { "epoch": 4.11, "grad_norm": 1.6668161153793335, "learning_rate": 0.00025940261039253946, "loss": 1.9116, "step": 140000 }, { "epoch": 4.13, "grad_norm": 1.4484660625457764, "learning_rate": 0.0002592555184012081, "loss": 1.9337, "step": 140500 }, { "epoch": 4.14, "grad_norm": 2.352369785308838, "learning_rate": 0.00025910842640987673, "loss": 1.922, "step": 141000 }, { "epoch": 4.15, "grad_norm": 17.899917602539062, "learning_rate": 0.0002589613344185453, "loss": 1.9144, "step": 141500 }, { "epoch": 4.17, "grad_norm": 2.010423421859741, "learning_rate": 0.00025881424242721394, "loss": 1.9909, "step": 142000 }, { "epoch": 4.18, "grad_norm": 2.928180694580078, "learning_rate": 0.0002586671504358826, "loss": 1.9295, "step": 142500 }, { "epoch": 4.2, "grad_norm": 2.5269832611083984, "learning_rate": 0.0002585200584445512, "loss": 1.9473, "step": 143000 }, { "epoch": 4.21, "grad_norm": 4.112135410308838, "learning_rate": 0.00025837296645321985, "loss": 1.9538, "step": 143500 }, { "epoch": 4.23, "grad_norm": 2.3555727005004883, "learning_rate": 0.00025822587446188843, "loss": 1.951, "step": 144000 }, { "epoch": 4.24, "grad_norm": 2.098503828048706, "learning_rate": 0.00025807878247055706, "loss": 1.9492, "step": 144500 }, { "epoch": 4.26, "grad_norm": 2.459561824798584, "learning_rate": 0.0002579316904792257, "loss": 1.9427, "step": 145000 }, { "epoch": 4.27, "grad_norm": 2.0252935886383057, "learning_rate": 0.0002577845984878943, "loss": 1.9161, "step": 145500 }, { "epoch": 4.29, "grad_norm": 1.384768009185791, "learning_rate": 0.0002576375064965629, "loss": 1.9325, "step": 146000 }, { "epoch": 4.3, "grad_norm": 1.6305208206176758, "learning_rate": 0.00025749041450523155, "loss": 1.9892, "step": 146500 }, { "epoch": 4.32, "grad_norm": 2.4807193279266357, "learning_rate": 0.0002573433225139002, "loss": 1.9288, "step": 147000 }, { "epoch": 4.33, "grad_norm": 1.798276424407959, "learning_rate": 0.0002571962305225688, "loss": 1.9124, "step": 147500 }, { "epoch": 4.35, "grad_norm": 3.1262764930725098, "learning_rate": 0.0002570491385312374, "loss": 1.9356, "step": 148000 }, { "epoch": 4.36, "grad_norm": 2.5284624099731445, "learning_rate": 0.00025690204653990603, "loss": 1.9519, "step": 148500 }, { "epoch": 4.37, "grad_norm": 2.51084566116333, "learning_rate": 0.00025675495454857467, "loss": 1.948, "step": 149000 }, { "epoch": 4.39, "grad_norm": 1.7908506393432617, "learning_rate": 0.00025660786255724325, "loss": 1.9214, "step": 149500 }, { "epoch": 4.4, "grad_norm": 1.8897744417190552, "learning_rate": 0.0002564607705659119, "loss": 1.9492, "step": 150000 }, { "epoch": 4.42, "grad_norm": 2.1504998207092285, "learning_rate": 0.0002563136785745805, "loss": 1.9261, "step": 150500 }, { "epoch": 4.43, "grad_norm": 1.7902778387069702, "learning_rate": 0.00025616658658324915, "loss": 1.9332, "step": 151000 }, { "epoch": 4.45, "grad_norm": 1.6913944482803345, "learning_rate": 0.0002560194945919178, "loss": 1.9391, "step": 151500 }, { "epoch": 4.46, "grad_norm": 1.7668604850769043, "learning_rate": 0.00025587240260058637, "loss": 1.961, "step": 152000 }, { "epoch": 4.48, "grad_norm": 2.929547071456909, "learning_rate": 0.000255725310609255, "loss": 1.9578, "step": 152500 }, { "epoch": 4.49, "grad_norm": 2.5235610008239746, "learning_rate": 0.00025557821861792363, "loss": 1.8975, "step": 153000 }, { "epoch": 4.51, "grad_norm": 5.4155097007751465, "learning_rate": 0.0002554311266265922, "loss": 1.9549, "step": 153500 }, { "epoch": 4.52, "grad_norm": 2.9047696590423584, "learning_rate": 0.00025528403463526085, "loss": 1.9409, "step": 154000 }, { "epoch": 4.54, "grad_norm": 3.789259910583496, "learning_rate": 0.0002551369426439295, "loss": 1.9267, "step": 154500 }, { "epoch": 4.55, "grad_norm": 2.7584846019744873, "learning_rate": 0.0002549898506525981, "loss": 1.9138, "step": 155000 }, { "epoch": 4.57, "grad_norm": 4.33261251449585, "learning_rate": 0.00025484275866126675, "loss": 1.9611, "step": 155500 }, { "epoch": 4.58, "grad_norm": 3.2369813919067383, "learning_rate": 0.00025469566666993533, "loss": 1.9354, "step": 156000 }, { "epoch": 4.59, "grad_norm": 2.6031434535980225, "learning_rate": 0.00025454857467860397, "loss": 1.9172, "step": 156500 }, { "epoch": 4.61, "grad_norm": 2.9214913845062256, "learning_rate": 0.0002544014826872726, "loss": 1.9356, "step": 157000 }, { "epoch": 4.62, "grad_norm": 2.684860944747925, "learning_rate": 0.00025425439069594124, "loss": 1.9494, "step": 157500 }, { "epoch": 4.64, "grad_norm": 2.98620867729187, "learning_rate": 0.0002541072987046098, "loss": 1.9277, "step": 158000 }, { "epoch": 4.65, "grad_norm": 1.6060062646865845, "learning_rate": 0.00025396020671327845, "loss": 1.9417, "step": 158500 }, { "epoch": 4.67, "grad_norm": 1.4754972457885742, "learning_rate": 0.0002538131147219471, "loss": 1.9691, "step": 159000 }, { "epoch": 4.68, "grad_norm": 1.5829949378967285, "learning_rate": 0.0002536660227306157, "loss": 1.9201, "step": 159500 }, { "epoch": 4.7, "grad_norm": 2.13678240776062, "learning_rate": 0.00025351893073928436, "loss": 1.9697, "step": 160000 }, { "epoch": 4.71, "grad_norm": 2.030682325363159, "learning_rate": 0.00025337183874795294, "loss": 1.9364, "step": 160500 }, { "epoch": 4.73, "grad_norm": 2.715879201889038, "learning_rate": 0.00025322474675662157, "loss": 1.9574, "step": 161000 }, { "epoch": 4.74, "grad_norm": 48.37791442871094, "learning_rate": 0.0002530776547652902, "loss": 1.9372, "step": 161500 }, { "epoch": 4.76, "grad_norm": 3.232931613922119, "learning_rate": 0.0002529305627739588, "loss": 1.9093, "step": 162000 }, { "epoch": 4.77, "grad_norm": 2.2022180557250977, "learning_rate": 0.0002527834707826274, "loss": 1.8969, "step": 162500 }, { "epoch": 4.79, "grad_norm": 1.3633408546447754, "learning_rate": 0.00025263637879129606, "loss": 1.9476, "step": 163000 }, { "epoch": 4.8, "grad_norm": 1.8852393627166748, "learning_rate": 0.0002524892867999647, "loss": 1.9467, "step": 163500 }, { "epoch": 4.82, "grad_norm": 2.2378666400909424, "learning_rate": 0.0002523421948086333, "loss": 1.9216, "step": 164000 }, { "epoch": 4.83, "grad_norm": 6.068374156951904, "learning_rate": 0.0002521951028173019, "loss": 1.9188, "step": 164500 }, { "epoch": 4.84, "grad_norm": 2.9869394302368164, "learning_rate": 0.00025204801082597054, "loss": 1.8981, "step": 165000 }, { "epoch": 4.86, "grad_norm": 5.120233058929443, "learning_rate": 0.0002519009188346392, "loss": 1.9119, "step": 165500 }, { "epoch": 4.87, "grad_norm": 24.895925521850586, "learning_rate": 0.00025175382684330776, "loss": 1.9263, "step": 166000 }, { "epoch": 4.89, "grad_norm": 4.085075378417969, "learning_rate": 0.0002516067348519764, "loss": 1.9464, "step": 166500 }, { "epoch": 4.9, "grad_norm": 2.498352527618408, "learning_rate": 0.000251459642860645, "loss": 1.9349, "step": 167000 }, { "epoch": 4.92, "grad_norm": 1.9715830087661743, "learning_rate": 0.00025131255086931366, "loss": 1.9198, "step": 167500 }, { "epoch": 4.93, "grad_norm": 6.1134161949157715, "learning_rate": 0.0002511654588779823, "loss": 1.9064, "step": 168000 }, { "epoch": 4.95, "grad_norm": 1.4753895998001099, "learning_rate": 0.0002510183668866509, "loss": 1.9277, "step": 168500 }, { "epoch": 4.96, "grad_norm": 22.2166805267334, "learning_rate": 0.0002508712748953195, "loss": 1.9105, "step": 169000 }, { "epoch": 4.98, "grad_norm": 9.678267478942871, "learning_rate": 0.00025072418290398815, "loss": 1.9361, "step": 169500 }, { "epoch": 4.99, "grad_norm": 5.086581230163574, "learning_rate": 0.0002505770909126567, "loss": 1.9035, "step": 170000 }, { "epoch": 5.01, "grad_norm": 3.46391224861145, "learning_rate": 0.00025042999892132536, "loss": 1.9096, "step": 170500 }, { "epoch": 5.02, "grad_norm": 1.564864158630371, "learning_rate": 0.000250282906929994, "loss": 1.8412, "step": 171000 }, { "epoch": 5.04, "grad_norm": 3.917158603668213, "learning_rate": 0.00025013581493866263, "loss": 1.8263, "step": 171500 }, { "epoch": 5.05, "grad_norm": 2.3305134773254395, "learning_rate": 0.00024998872294733126, "loss": 1.8263, "step": 172000 }, { "epoch": 5.06, "grad_norm": 2.812856435775757, "learning_rate": 0.00024984163095599985, "loss": 1.8238, "step": 172500 }, { "epoch": 5.08, "grad_norm": 12.236431121826172, "learning_rate": 0.0002496945389646685, "loss": 1.8475, "step": 173000 }, { "epoch": 5.09, "grad_norm": 1.9508650302886963, "learning_rate": 0.0002495474469733371, "loss": 1.8671, "step": 173500 }, { "epoch": 5.11, "grad_norm": 4.95379638671875, "learning_rate": 0.0002494003549820057, "loss": 1.8541, "step": 174000 }, { "epoch": 5.12, "grad_norm": 2.8690032958984375, "learning_rate": 0.00024925326299067433, "loss": 1.8622, "step": 174500 }, { "epoch": 5.14, "grad_norm": 2.8076915740966797, "learning_rate": 0.00024910617099934296, "loss": 1.8394, "step": 175000 }, { "epoch": 5.15, "grad_norm": 1.6845248937606812, "learning_rate": 0.0002489590790080116, "loss": 1.8572, "step": 175500 }, { "epoch": 5.17, "grad_norm": 1.74410080909729, "learning_rate": 0.00024881198701668023, "loss": 1.8387, "step": 176000 }, { "epoch": 5.18, "grad_norm": 2.655266761779785, "learning_rate": 0.0002486648950253488, "loss": 1.8776, "step": 176500 }, { "epoch": 5.2, "grad_norm": 2.5884244441986084, "learning_rate": 0.00024851780303401745, "loss": 1.8564, "step": 177000 }, { "epoch": 5.21, "grad_norm": 4.314496040344238, "learning_rate": 0.0002483707110426861, "loss": 1.8573, "step": 177500 }, { "epoch": 5.23, "grad_norm": 2.15973162651062, "learning_rate": 0.00024822361905135466, "loss": 1.8671, "step": 178000 }, { "epoch": 5.24, "grad_norm": 1.7802814245224, "learning_rate": 0.00024807652706002335, "loss": 1.8722, "step": 178500 }, { "epoch": 5.26, "grad_norm": 29.04892921447754, "learning_rate": 0.00024792943506869193, "loss": 1.8747, "step": 179000 }, { "epoch": 5.27, "grad_norm": 2.0370380878448486, "learning_rate": 0.00024778234307736057, "loss": 1.8615, "step": 179500 }, { "epoch": 5.28, "grad_norm": 2.681140661239624, "learning_rate": 0.0002476352510860292, "loss": 1.8603, "step": 180000 }, { "epoch": 5.3, "grad_norm": 1.609506368637085, "learning_rate": 0.0002474881590946978, "loss": 1.857, "step": 180500 }, { "epoch": 5.31, "grad_norm": 1.4114452600479126, "learning_rate": 0.0002473410671033664, "loss": 1.8849, "step": 181000 }, { "epoch": 5.33, "grad_norm": 12.606436729431152, "learning_rate": 0.00024719397511203505, "loss": 1.8576, "step": 181500 }, { "epoch": 5.34, "grad_norm": 1.5284700393676758, "learning_rate": 0.00024704688312070363, "loss": 1.8565, "step": 182000 }, { "epoch": 5.36, "grad_norm": 2.134824514389038, "learning_rate": 0.0002468997911293723, "loss": 1.8447, "step": 182500 }, { "epoch": 5.37, "grad_norm": 2.755667209625244, "learning_rate": 0.0002467526991380409, "loss": 1.8894, "step": 183000 }, { "epoch": 5.39, "grad_norm": 1.8626573085784912, "learning_rate": 0.00024660560714670954, "loss": 1.8788, "step": 183500 }, { "epoch": 5.4, "grad_norm": 1.2436336278915405, "learning_rate": 0.00024645851515537817, "loss": 1.888, "step": 184000 }, { "epoch": 5.42, "grad_norm": 1.6450908184051514, "learning_rate": 0.00024631142316404675, "loss": 1.8576, "step": 184500 }, { "epoch": 5.43, "grad_norm": 2.5818614959716797, "learning_rate": 0.0002461643311727154, "loss": 1.8634, "step": 185000 }, { "epoch": 5.45, "grad_norm": 2.4842381477355957, "learning_rate": 0.000246017239181384, "loss": 1.8777, "step": 185500 }, { "epoch": 5.46, "grad_norm": 5.475472927093506, "learning_rate": 0.0002458701471900526, "loss": 1.8537, "step": 186000 }, { "epoch": 5.48, "grad_norm": 6.852906227111816, "learning_rate": 0.0002457230551987213, "loss": 1.863, "step": 186500 }, { "epoch": 5.49, "grad_norm": 2.8549883365631104, "learning_rate": 0.00024557596320738987, "loss": 1.8643, "step": 187000 }, { "epoch": 5.51, "grad_norm": 4.323022365570068, "learning_rate": 0.0002454288712160585, "loss": 1.8489, "step": 187500 }, { "epoch": 5.52, "grad_norm": 1.4015731811523438, "learning_rate": 0.00024528177922472714, "loss": 1.879, "step": 188000 }, { "epoch": 5.53, "grad_norm": 2.5605356693267822, "learning_rate": 0.0002451346872333957, "loss": 1.8564, "step": 188500 }, { "epoch": 5.55, "grad_norm": 1.567606806755066, "learning_rate": 0.00024498759524206436, "loss": 1.8902, "step": 189000 }, { "epoch": 5.56, "grad_norm": 1.4582362174987793, "learning_rate": 0.000244840503250733, "loss": 1.8659, "step": 189500 }, { "epoch": 5.58, "grad_norm": 1.8250012397766113, "learning_rate": 0.00024469341125940157, "loss": 1.8558, "step": 190000 }, { "epoch": 5.59, "grad_norm": 6.115236759185791, "learning_rate": 0.00024454631926807026, "loss": 1.839, "step": 190500 }, { "epoch": 5.61, "grad_norm": 4.505608081817627, "learning_rate": 0.00024439922727673884, "loss": 1.846, "step": 191000 }, { "epoch": 5.62, "grad_norm": 2.1621007919311523, "learning_rate": 0.0002442521352854075, "loss": 1.8287, "step": 191500 }, { "epoch": 5.64, "grad_norm": 2.337688684463501, "learning_rate": 0.0002441050432940761, "loss": 1.88, "step": 192000 }, { "epoch": 5.65, "grad_norm": 1.921249508857727, "learning_rate": 0.00024395795130274472, "loss": 1.8345, "step": 192500 }, { "epoch": 5.67, "grad_norm": 1.396031379699707, "learning_rate": 0.00024381085931141332, "loss": 1.8527, "step": 193000 }, { "epoch": 5.68, "grad_norm": 2.6552846431732178, "learning_rate": 0.00024366376732008196, "loss": 1.8628, "step": 193500 }, { "epoch": 5.7, "grad_norm": 4.992696285247803, "learning_rate": 0.00024351667532875057, "loss": 1.8727, "step": 194000 }, { "epoch": 5.71, "grad_norm": 4.128008842468262, "learning_rate": 0.00024336958333741923, "loss": 1.8677, "step": 194500 }, { "epoch": 5.73, "grad_norm": 2.2797539234161377, "learning_rate": 0.00024322249134608784, "loss": 1.8495, "step": 195000 }, { "epoch": 5.74, "grad_norm": 1.987032175064087, "learning_rate": 0.00024307539935475644, "loss": 1.8555, "step": 195500 }, { "epoch": 5.75, "grad_norm": 2.1700329780578613, "learning_rate": 0.00024292830736342508, "loss": 1.8745, "step": 196000 }, { "epoch": 5.77, "grad_norm": 2.0741491317749023, "learning_rate": 0.00024278121537209369, "loss": 1.8859, "step": 196500 }, { "epoch": 5.78, "grad_norm": 3.1270885467529297, "learning_rate": 0.0002426341233807623, "loss": 1.8778, "step": 197000 }, { "epoch": 5.8, "grad_norm": 2.7290618419647217, "learning_rate": 0.00024248703138943093, "loss": 1.8418, "step": 197500 }, { "epoch": 5.81, "grad_norm": 1.5647644996643066, "learning_rate": 0.00024233993939809954, "loss": 1.8519, "step": 198000 }, { "epoch": 5.83, "grad_norm": 2.4231462478637695, "learning_rate": 0.0002421928474067682, "loss": 1.868, "step": 198500 }, { "epoch": 5.84, "grad_norm": 3.1139655113220215, "learning_rate": 0.0002420457554154368, "loss": 1.8255, "step": 199000 }, { "epoch": 5.86, "grad_norm": 4.595509052276611, "learning_rate": 0.0002418986634241054, "loss": 1.8626, "step": 199500 }, { "epoch": 5.87, "grad_norm": 7.253945827484131, "learning_rate": 0.00024175157143277405, "loss": 1.8714, "step": 200000 }, { "epoch": 5.89, "grad_norm": 1.5077595710754395, "learning_rate": 0.00024160447944144266, "loss": 1.8743, "step": 200500 }, { "epoch": 5.9, "grad_norm": 5.791823387145996, "learning_rate": 0.00024145738745011126, "loss": 1.8499, "step": 201000 }, { "epoch": 5.92, "grad_norm": 2.558816909790039, "learning_rate": 0.0002413102954587799, "loss": 1.8757, "step": 201500 }, { "epoch": 5.93, "grad_norm": 2.4598610401153564, "learning_rate": 0.0002411632034674485, "loss": 1.8396, "step": 202000 }, { "epoch": 5.95, "grad_norm": 1.2680063247680664, "learning_rate": 0.00024101611147611717, "loss": 1.9265, "step": 202500 }, { "epoch": 5.96, "grad_norm": 13.615561485290527, "learning_rate": 0.00024086901948478577, "loss": 1.833, "step": 203000 }, { "epoch": 5.97, "grad_norm": 1.367218017578125, "learning_rate": 0.00024072192749345438, "loss": 1.836, "step": 203500 }, { "epoch": 5.99, "grad_norm": 1.8230597972869873, "learning_rate": 0.00024057483550212302, "loss": 1.8833, "step": 204000 }, { "epoch": 6.0, "grad_norm": 3.095012903213501, "learning_rate": 0.00024042774351079162, "loss": 1.8106, "step": 204500 }, { "epoch": 6.02, "grad_norm": 1.357537031173706, "learning_rate": 0.00024028065151946023, "loss": 1.809, "step": 205000 }, { "epoch": 6.03, "grad_norm": 1.8202615976333618, "learning_rate": 0.00024013355952812887, "loss": 1.788, "step": 205500 }, { "epoch": 6.05, "grad_norm": 2.0572142601013184, "learning_rate": 0.00023998646753679747, "loss": 1.7786, "step": 206000 }, { "epoch": 6.06, "grad_norm": 2.056692123413086, "learning_rate": 0.00023983937554546614, "loss": 1.801, "step": 206500 }, { "epoch": 6.08, "grad_norm": 1.6310349702835083, "learning_rate": 0.00023969228355413474, "loss": 1.794, "step": 207000 }, { "epoch": 6.09, "grad_norm": 2.9542150497436523, "learning_rate": 0.00023954519156280335, "loss": 1.8013, "step": 207500 }, { "epoch": 6.11, "grad_norm": 2.0560832023620605, "learning_rate": 0.00023939809957147199, "loss": 1.809, "step": 208000 }, { "epoch": 6.12, "grad_norm": 2.991140842437744, "learning_rate": 0.0002392510075801406, "loss": 1.8122, "step": 208500 }, { "epoch": 6.14, "grad_norm": 4.056896686553955, "learning_rate": 0.0002391039155888092, "loss": 1.7895, "step": 209000 }, { "epoch": 6.15, "grad_norm": 1.4278576374053955, "learning_rate": 0.00023895682359747784, "loss": 1.7778, "step": 209500 }, { "epoch": 6.17, "grad_norm": 2.918614387512207, "learning_rate": 0.00023880973160614644, "loss": 1.7882, "step": 210000 }, { "epoch": 6.18, "grad_norm": 6.002877712249756, "learning_rate": 0.0002386626396148151, "loss": 1.8129, "step": 210500 }, { "epoch": 6.2, "grad_norm": 1.8039432764053345, "learning_rate": 0.0002385155476234837, "loss": 1.7879, "step": 211000 }, { "epoch": 6.21, "grad_norm": 2.1559925079345703, "learning_rate": 0.00023836845563215232, "loss": 1.825, "step": 211500 }, { "epoch": 6.22, "grad_norm": 1.6617883443832397, "learning_rate": 0.00023822136364082095, "loss": 1.8048, "step": 212000 }, { "epoch": 6.24, "grad_norm": 3.7312443256378174, "learning_rate": 0.00023807427164948956, "loss": 1.7759, "step": 212500 }, { "epoch": 6.25, "grad_norm": 4.757487773895264, "learning_rate": 0.0002379271796581582, "loss": 1.8357, "step": 213000 }, { "epoch": 6.27, "grad_norm": 1.4285889863967896, "learning_rate": 0.0002377800876668268, "loss": 1.8127, "step": 213500 }, { "epoch": 6.28, "grad_norm": 1.7722175121307373, "learning_rate": 0.0002376329956754954, "loss": 1.79, "step": 214000 }, { "epoch": 6.3, "grad_norm": 2.17244291305542, "learning_rate": 0.00023748590368416407, "loss": 1.7949, "step": 214500 }, { "epoch": 6.31, "grad_norm": 2.136143922805786, "learning_rate": 0.00023733881169283268, "loss": 1.8199, "step": 215000 }, { "epoch": 6.33, "grad_norm": 1.36685049533844, "learning_rate": 0.00023719171970150132, "loss": 1.8101, "step": 215500 }, { "epoch": 6.34, "grad_norm": 1.5481481552124023, "learning_rate": 0.00023704462771016992, "loss": 1.819, "step": 216000 }, { "epoch": 6.36, "grad_norm": 2.1855366230010986, "learning_rate": 0.00023689753571883853, "loss": 1.8162, "step": 216500 }, { "epoch": 6.37, "grad_norm": 1.5019465684890747, "learning_rate": 0.00023675044372750717, "loss": 1.7929, "step": 217000 }, { "epoch": 6.39, "grad_norm": 1.4323623180389404, "learning_rate": 0.00023660335173617577, "loss": 1.7664, "step": 217500 }, { "epoch": 6.4, "grad_norm": 3.942918539047241, "learning_rate": 0.00023645625974484438, "loss": 1.8165, "step": 218000 }, { "epoch": 6.42, "grad_norm": 1.4521820545196533, "learning_rate": 0.00023630916775351304, "loss": 1.7966, "step": 218500 }, { "epoch": 6.43, "grad_norm": 3.9033968448638916, "learning_rate": 0.00023616207576218165, "loss": 1.7952, "step": 219000 }, { "epoch": 6.44, "grad_norm": 2.7818336486816406, "learning_rate": 0.00023601498377085029, "loss": 1.7963, "step": 219500 }, { "epoch": 6.46, "grad_norm": 4.406651496887207, "learning_rate": 0.0002358678917795189, "loss": 1.7901, "step": 220000 }, { "epoch": 6.47, "grad_norm": 2.1108040809631348, "learning_rate": 0.0002357207997881875, "loss": 1.7887, "step": 220500 }, { "epoch": 6.49, "grad_norm": 6.802318096160889, "learning_rate": 0.00023557370779685613, "loss": 1.7811, "step": 221000 }, { "epoch": 6.5, "grad_norm": 2.7774646282196045, "learning_rate": 0.00023542661580552474, "loss": 1.782, "step": 221500 }, { "epoch": 6.52, "grad_norm": 1.5817331075668335, "learning_rate": 0.00023527952381419335, "loss": 1.7883, "step": 222000 }, { "epoch": 6.53, "grad_norm": 1.3639189004898071, "learning_rate": 0.000235132431822862, "loss": 1.8137, "step": 222500 }, { "epoch": 6.55, "grad_norm": 3.018841505050659, "learning_rate": 0.00023498533983153062, "loss": 1.8012, "step": 223000 }, { "epoch": 6.56, "grad_norm": 1.6388349533081055, "learning_rate": 0.00023483824784019925, "loss": 1.7776, "step": 223500 }, { "epoch": 6.58, "grad_norm": 2.2828805446624756, "learning_rate": 0.00023469115584886786, "loss": 1.793, "step": 224000 }, { "epoch": 6.59, "grad_norm": 1.5646781921386719, "learning_rate": 0.00023454406385753647, "loss": 1.8049, "step": 224500 }, { "epoch": 6.61, "grad_norm": 2.5620057582855225, "learning_rate": 0.0002343969718662051, "loss": 1.8082, "step": 225000 }, { "epoch": 6.62, "grad_norm": 1.1866906881332397, "learning_rate": 0.0002342498798748737, "loss": 1.8197, "step": 225500 }, { "epoch": 6.64, "grad_norm": 8.060154914855957, "learning_rate": 0.00023410278788354232, "loss": 1.7765, "step": 226000 }, { "epoch": 6.65, "grad_norm": 2.2780892848968506, "learning_rate": 0.00023395569589221098, "loss": 1.7903, "step": 226500 }, { "epoch": 6.66, "grad_norm": 8.265579223632812, "learning_rate": 0.0002338086039008796, "loss": 1.8143, "step": 227000 }, { "epoch": 6.68, "grad_norm": 1.2793028354644775, "learning_rate": 0.00023366151190954822, "loss": 1.7729, "step": 227500 }, { "epoch": 6.69, "grad_norm": 1.7103880643844604, "learning_rate": 0.00023351441991821683, "loss": 1.8019, "step": 228000 }, { "epoch": 6.71, "grad_norm": 6.143857479095459, "learning_rate": 0.00023336732792688544, "loss": 1.7781, "step": 228500 }, { "epoch": 6.72, "grad_norm": 3.1208572387695312, "learning_rate": 0.00023322023593555407, "loss": 1.791, "step": 229000 }, { "epoch": 6.74, "grad_norm": 1.774464726448059, "learning_rate": 0.00023307314394422268, "loss": 1.7596, "step": 229500 }, { "epoch": 6.75, "grad_norm": 1.9884730577468872, "learning_rate": 0.0002329260519528913, "loss": 1.7842, "step": 230000 }, { "epoch": 6.77, "grad_norm": 0.9491617679595947, "learning_rate": 0.00023277895996155995, "loss": 1.8013, "step": 230500 }, { "epoch": 6.78, "grad_norm": 1.6505547761917114, "learning_rate": 0.00023263186797022856, "loss": 1.7913, "step": 231000 }, { "epoch": 6.8, "grad_norm": 1.262868046760559, "learning_rate": 0.0002324847759788972, "loss": 1.8212, "step": 231500 }, { "epoch": 6.81, "grad_norm": 1.1688213348388672, "learning_rate": 0.0002323376839875658, "loss": 1.815, "step": 232000 }, { "epoch": 6.83, "grad_norm": 1.4348254203796387, "learning_rate": 0.0002321905919962344, "loss": 1.8335, "step": 232500 }, { "epoch": 6.84, "grad_norm": 1.977734923362732, "learning_rate": 0.00023204350000490304, "loss": 1.8231, "step": 233000 }, { "epoch": 6.86, "grad_norm": 12.462953567504883, "learning_rate": 0.00023189640801357165, "loss": 1.8086, "step": 233500 }, { "epoch": 6.87, "grad_norm": 1.9486029148101807, "learning_rate": 0.00023174931602224028, "loss": 1.7932, "step": 234000 }, { "epoch": 6.89, "grad_norm": 6.036319732666016, "learning_rate": 0.00023160222403090892, "loss": 1.7906, "step": 234500 }, { "epoch": 6.9, "grad_norm": 1.6775847673416138, "learning_rate": 0.00023145513203957753, "loss": 1.8026, "step": 235000 }, { "epoch": 6.91, "grad_norm": 2.1960055828094482, "learning_rate": 0.00023130804004824616, "loss": 1.8093, "step": 235500 }, { "epoch": 6.93, "grad_norm": 4.2877197265625, "learning_rate": 0.00023116094805691477, "loss": 1.8198, "step": 236000 }, { "epoch": 6.94, "grad_norm": 1.4908256530761719, "learning_rate": 0.0002310138560655834, "loss": 1.7882, "step": 236500 }, { "epoch": 6.96, "grad_norm": 5.148675918579102, "learning_rate": 0.000230866764074252, "loss": 1.786, "step": 237000 }, { "epoch": 6.97, "grad_norm": 5.5215744972229, "learning_rate": 0.00023071967208292062, "loss": 1.8168, "step": 237500 }, { "epoch": 6.99, "grad_norm": 21.99406623840332, "learning_rate": 0.00023057258009158928, "loss": 1.7993, "step": 238000 }, { "epoch": 7.0, "grad_norm": 3.8185484409332275, "learning_rate": 0.0002304254881002579, "loss": 1.8105, "step": 238500 }, { "epoch": 7.02, "grad_norm": 7.100237846374512, "learning_rate": 0.00023027839610892652, "loss": 1.7102, "step": 239000 }, { "epoch": 7.03, "grad_norm": 5.007193088531494, "learning_rate": 0.00023013130411759513, "loss": 1.7293, "step": 239500 }, { "epoch": 7.05, "grad_norm": 1.6522769927978516, "learning_rate": 0.00022998421212626374, "loss": 1.7199, "step": 240000 }, { "epoch": 7.06, "grad_norm": 1.7635401487350464, "learning_rate": 0.00022983712013493237, "loss": 1.7304, "step": 240500 }, { "epoch": 7.08, "grad_norm": 2.441962957382202, "learning_rate": 0.00022969002814360098, "loss": 1.7291, "step": 241000 }, { "epoch": 7.09, "grad_norm": 1.6155414581298828, "learning_rate": 0.0002295429361522696, "loss": 1.7477, "step": 241500 }, { "epoch": 7.11, "grad_norm": 1.3782049417495728, "learning_rate": 0.00022939584416093825, "loss": 1.7552, "step": 242000 }, { "epoch": 7.12, "grad_norm": 1.6964011192321777, "learning_rate": 0.00022924875216960686, "loss": 1.7233, "step": 242500 }, { "epoch": 7.13, "grad_norm": 1.9475128650665283, "learning_rate": 0.0002291016601782755, "loss": 1.7603, "step": 243000 }, { "epoch": 7.15, "grad_norm": 2.674274444580078, "learning_rate": 0.0002289545681869441, "loss": 1.7472, "step": 243500 }, { "epoch": 7.16, "grad_norm": 1.6163575649261475, "learning_rate": 0.0002288074761956127, "loss": 1.7754, "step": 244000 }, { "epoch": 7.18, "grad_norm": 1.3212227821350098, "learning_rate": 0.00022866038420428134, "loss": 1.7828, "step": 244500 }, { "epoch": 7.19, "grad_norm": 1.9953806400299072, "learning_rate": 0.00022851329221294995, "loss": 1.7289, "step": 245000 }, { "epoch": 7.21, "grad_norm": 1.5477960109710693, "learning_rate": 0.00022836620022161856, "loss": 1.7651, "step": 245500 }, { "epoch": 7.22, "grad_norm": 16.499107360839844, "learning_rate": 0.00022821910823028722, "loss": 1.7207, "step": 246000 }, { "epoch": 7.24, "grad_norm": 2.4671411514282227, "learning_rate": 0.00022807201623895583, "loss": 1.7438, "step": 246500 }, { "epoch": 7.25, "grad_norm": 4.774916648864746, "learning_rate": 0.00022792492424762446, "loss": 1.7721, "step": 247000 }, { "epoch": 7.27, "grad_norm": 2.282515287399292, "learning_rate": 0.00022777783225629307, "loss": 1.7145, "step": 247500 }, { "epoch": 7.28, "grad_norm": 1.9206039905548096, "learning_rate": 0.00022763074026496168, "loss": 1.7314, "step": 248000 }, { "epoch": 7.3, "grad_norm": 2.7325820922851562, "learning_rate": 0.0002274836482736303, "loss": 1.725, "step": 248500 }, { "epoch": 7.31, "grad_norm": 1.7279201745986938, "learning_rate": 0.00022733655628229892, "loss": 1.7095, "step": 249000 }, { "epoch": 7.33, "grad_norm": 1.8310534954071045, "learning_rate": 0.00022718946429096753, "loss": 1.7325, "step": 249500 }, { "epoch": 7.34, "grad_norm": 5.919315338134766, "learning_rate": 0.0002270423722996362, "loss": 1.7301, "step": 250000 }, { "epoch": 7.35, "grad_norm": 2.3758251667022705, "learning_rate": 0.0002268952803083048, "loss": 1.7529, "step": 250500 }, { "epoch": 7.37, "grad_norm": 3.524482011795044, "learning_rate": 0.00022674818831697343, "loss": 1.7822, "step": 251000 }, { "epoch": 7.38, "grad_norm": 2.1281306743621826, "learning_rate": 0.00022660109632564204, "loss": 1.7315, "step": 251500 }, { "epoch": 7.4, "grad_norm": 1.6832419633865356, "learning_rate": 0.00022645400433431064, "loss": 1.7553, "step": 252000 }, { "epoch": 7.41, "grad_norm": 13.24813175201416, "learning_rate": 0.00022630691234297928, "loss": 1.7623, "step": 252500 }, { "epoch": 7.43, "grad_norm": 1.5083931684494019, "learning_rate": 0.0002261598203516479, "loss": 1.7579, "step": 253000 }, { "epoch": 7.44, "grad_norm": 3.207942247390747, "learning_rate": 0.00022601272836031652, "loss": 1.7248, "step": 253500 }, { "epoch": 7.46, "grad_norm": 7.897548675537109, "learning_rate": 0.00022586563636898516, "loss": 1.7309, "step": 254000 }, { "epoch": 7.47, "grad_norm": 2.2789628505706787, "learning_rate": 0.00022571854437765376, "loss": 1.7444, "step": 254500 }, { "epoch": 7.49, "grad_norm": 2.129913568496704, "learning_rate": 0.0002255714523863224, "loss": 1.7505, "step": 255000 }, { "epoch": 7.5, "grad_norm": 2.0121564865112305, "learning_rate": 0.000225424360394991, "loss": 1.8085, "step": 255500 }, { "epoch": 7.52, "grad_norm": 3.321162462234497, "learning_rate": 0.00022527726840365964, "loss": 1.7601, "step": 256000 }, { "epoch": 7.53, "grad_norm": 8.002218246459961, "learning_rate": 0.00022513017641232825, "loss": 1.7719, "step": 256500 }, { "epoch": 7.55, "grad_norm": 1.610587477684021, "learning_rate": 0.00022498308442099686, "loss": 1.7373, "step": 257000 }, { "epoch": 7.56, "grad_norm": 5.272850513458252, "learning_rate": 0.0002248359924296655, "loss": 1.7481, "step": 257500 }, { "epoch": 7.57, "grad_norm": 1.519241452217102, "learning_rate": 0.00022468890043833413, "loss": 1.7722, "step": 258000 }, { "epoch": 7.59, "grad_norm": 3.9072492122650146, "learning_rate": 0.00022454180844700276, "loss": 1.7425, "step": 258500 }, { "epoch": 7.6, "grad_norm": 16.145322799682617, "learning_rate": 0.00022439471645567137, "loss": 1.7645, "step": 259000 }, { "epoch": 7.62, "grad_norm": 1.7309913635253906, "learning_rate": 0.00022424762446433998, "loss": 1.7388, "step": 259500 }, { "epoch": 7.63, "grad_norm": 1.4854837656021118, "learning_rate": 0.0002241005324730086, "loss": 1.7728, "step": 260000 }, { "epoch": 7.65, "grad_norm": 1.9065077304840088, "learning_rate": 0.00022395344048167722, "loss": 1.7306, "step": 260500 }, { "epoch": 7.66, "grad_norm": 2.6869423389434814, "learning_rate": 0.00022380634849034582, "loss": 1.751, "step": 261000 }, { "epoch": 7.68, "grad_norm": 2.3300232887268066, "learning_rate": 0.00022365925649901446, "loss": 1.7643, "step": 261500 }, { "epoch": 7.69, "grad_norm": 1.6109992265701294, "learning_rate": 0.0002235121645076831, "loss": 1.7848, "step": 262000 }, { "epoch": 7.71, "grad_norm": 3.2768895626068115, "learning_rate": 0.00022336507251635173, "loss": 1.7638, "step": 262500 }, { "epoch": 7.72, "grad_norm": 4.605926036834717, "learning_rate": 0.00022321798052502034, "loss": 1.7801, "step": 263000 }, { "epoch": 7.74, "grad_norm": 1.451865553855896, "learning_rate": 0.00022307088853368894, "loss": 1.7229, "step": 263500 }, { "epoch": 7.75, "grad_norm": 1.938750982284546, "learning_rate": 0.00022292379654235758, "loss": 1.7575, "step": 264000 }, { "epoch": 7.77, "grad_norm": 8.690546989440918, "learning_rate": 0.00022277670455102619, "loss": 1.7572, "step": 264500 }, { "epoch": 7.78, "grad_norm": 6.99992036819458, "learning_rate": 0.0002226296125596948, "loss": 1.7912, "step": 265000 }, { "epoch": 7.8, "grad_norm": 1.375183343887329, "learning_rate": 0.00022248252056836343, "loss": 1.7588, "step": 265500 }, { "epoch": 7.81, "grad_norm": 1.4782127141952515, "learning_rate": 0.00022233542857703206, "loss": 1.7929, "step": 266000 }, { "epoch": 7.82, "grad_norm": 3.282301187515259, "learning_rate": 0.0002221883365857007, "loss": 1.7855, "step": 266500 }, { "epoch": 7.84, "grad_norm": 2.2392537593841553, "learning_rate": 0.0002220412445943693, "loss": 1.7603, "step": 267000 }, { "epoch": 7.85, "grad_norm": 3.2060225009918213, "learning_rate": 0.0002218941526030379, "loss": 1.7416, "step": 267500 }, { "epoch": 7.87, "grad_norm": 2.6562771797180176, "learning_rate": 0.00022174706061170655, "loss": 1.766, "step": 268000 }, { "epoch": 7.88, "grad_norm": 3.6191391944885254, "learning_rate": 0.00022159996862037516, "loss": 1.7746, "step": 268500 }, { "epoch": 7.9, "grad_norm": 1.9865895509719849, "learning_rate": 0.00022145287662904376, "loss": 1.7841, "step": 269000 }, { "epoch": 7.91, "grad_norm": 1.3776191473007202, "learning_rate": 0.0002213057846377124, "loss": 1.7709, "step": 269500 }, { "epoch": 7.93, "grad_norm": 2.238267183303833, "learning_rate": 0.00022115869264638103, "loss": 1.7764, "step": 270000 }, { "epoch": 7.94, "grad_norm": 3.3425824642181396, "learning_rate": 0.00022101160065504967, "loss": 1.7901, "step": 270500 }, { "epoch": 7.96, "grad_norm": 2.183436632156372, "learning_rate": 0.00022086450866371827, "loss": 1.757, "step": 271000 }, { "epoch": 7.97, "grad_norm": 3.393548011779785, "learning_rate": 0.00022071741667238688, "loss": 1.7538, "step": 271500 }, { "epoch": 7.99, "grad_norm": 4.727015018463135, "learning_rate": 0.00022057032468105552, "loss": 1.7585, "step": 272000 }, { "epoch": 8.0, "grad_norm": 2.0307464599609375, "learning_rate": 0.00022042323268972412, "loss": 1.7659, "step": 272500 }, { "epoch": 8.02, "grad_norm": 4.098311424255371, "learning_rate": 0.00022027614069839273, "loss": 1.7191, "step": 273000 }, { "epoch": 8.03, "grad_norm": 2.290457248687744, "learning_rate": 0.00022012904870706137, "loss": 1.7206, "step": 273500 }, { "epoch": 8.04, "grad_norm": 2.1348979473114014, "learning_rate": 0.00021998195671573, "loss": 1.6935, "step": 274000 }, { "epoch": 8.06, "grad_norm": 2.5333049297332764, "learning_rate": 0.00021983486472439864, "loss": 1.6709, "step": 274500 }, { "epoch": 8.07, "grad_norm": 2.1333959102630615, "learning_rate": 0.00021968777273306724, "loss": 1.7318, "step": 275000 }, { "epoch": 8.09, "grad_norm": 1.6256059408187866, "learning_rate": 0.00021954068074173585, "loss": 1.7007, "step": 275500 }, { "epoch": 8.1, "grad_norm": 1.4450695514678955, "learning_rate": 0.00021939358875040449, "loss": 1.7038, "step": 276000 }, { "epoch": 8.12, "grad_norm": 4.230172634124756, "learning_rate": 0.0002192464967590731, "loss": 1.6738, "step": 276500 }, { "epoch": 8.13, "grad_norm": 2.046879529953003, "learning_rate": 0.00021909940476774173, "loss": 1.6658, "step": 277000 }, { "epoch": 8.15, "grad_norm": 2.2633895874023438, "learning_rate": 0.00021895231277641034, "loss": 1.6929, "step": 277500 }, { "epoch": 8.16, "grad_norm": 34.2008171081543, "learning_rate": 0.00021880522078507897, "loss": 1.6956, "step": 278000 }, { "epoch": 8.18, "grad_norm": 11.11546516418457, "learning_rate": 0.0002186581287937476, "loss": 1.7103, "step": 278500 }, { "epoch": 8.19, "grad_norm": 5.735045909881592, "learning_rate": 0.0002185110368024162, "loss": 1.6991, "step": 279000 }, { "epoch": 8.21, "grad_norm": 2.9397599697113037, "learning_rate": 0.00021836394481108485, "loss": 1.6909, "step": 279500 }, { "epoch": 8.22, "grad_norm": 1.9226768016815186, "learning_rate": 0.00021821685281975345, "loss": 1.6894, "step": 280000 }, { "epoch": 8.24, "grad_norm": 2.0962398052215576, "learning_rate": 0.00021806976082842206, "loss": 1.7094, "step": 280500 }, { "epoch": 8.25, "grad_norm": 13.345786094665527, "learning_rate": 0.0002179226688370907, "loss": 1.6927, "step": 281000 }, { "epoch": 8.26, "grad_norm": 3.1707746982574463, "learning_rate": 0.0002177755768457593, "loss": 1.7004, "step": 281500 }, { "epoch": 8.28, "grad_norm": 3.3897292613983154, "learning_rate": 0.00021762848485442797, "loss": 1.7094, "step": 282000 }, { "epoch": 8.29, "grad_norm": 5.791382312774658, "learning_rate": 0.00021748139286309657, "loss": 1.7279, "step": 282500 }, { "epoch": 8.31, "grad_norm": 2.3419294357299805, "learning_rate": 0.00021733430087176518, "loss": 1.7053, "step": 283000 }, { "epoch": 8.32, "grad_norm": 1.9053447246551514, "learning_rate": 0.00021718720888043382, "loss": 1.7225, "step": 283500 }, { "epoch": 8.34, "grad_norm": 1.8822693824768066, "learning_rate": 0.00021704011688910242, "loss": 1.6968, "step": 284000 }, { "epoch": 8.35, "grad_norm": 1.4814122915267944, "learning_rate": 0.00021689302489777103, "loss": 1.6697, "step": 284500 }, { "epoch": 8.37, "grad_norm": 8.579392433166504, "learning_rate": 0.00021674593290643967, "loss": 1.7206, "step": 285000 }, { "epoch": 8.38, "grad_norm": 1.9921379089355469, "learning_rate": 0.00021659884091510827, "loss": 1.7113, "step": 285500 }, { "epoch": 8.4, "grad_norm": 2.2425730228424072, "learning_rate": 0.00021645174892377694, "loss": 1.6893, "step": 286000 }, { "epoch": 8.41, "grad_norm": 4.465683460235596, "learning_rate": 0.00021630465693244554, "loss": 1.6997, "step": 286500 }, { "epoch": 8.43, "grad_norm": 11.377062797546387, "learning_rate": 0.00021615756494111415, "loss": 1.691, "step": 287000 }, { "epoch": 8.44, "grad_norm": 4.9282636642456055, "learning_rate": 0.00021601047294978279, "loss": 1.6722, "step": 287500 }, { "epoch": 8.46, "grad_norm": 4.008066177368164, "learning_rate": 0.0002158633809584514, "loss": 1.6498, "step": 288000 }, { "epoch": 8.47, "grad_norm": 1.7437248229980469, "learning_rate": 0.00021571628896712, "loss": 1.7336, "step": 288500 }, { "epoch": 8.49, "grad_norm": 1.627820372581482, "learning_rate": 0.00021556919697578864, "loss": 1.6973, "step": 289000 }, { "epoch": 8.5, "grad_norm": 5.1567254066467285, "learning_rate": 0.00021542210498445724, "loss": 1.6974, "step": 289500 }, { "epoch": 8.51, "grad_norm": 1.3468036651611328, "learning_rate": 0.0002152750129931259, "loss": 1.6815, "step": 290000 }, { "epoch": 8.53, "grad_norm": 1.6024253368377686, "learning_rate": 0.0002151279210017945, "loss": 1.6784, "step": 290500 }, { "epoch": 8.54, "grad_norm": 1.915425181388855, "learning_rate": 0.00021498082901046312, "loss": 1.7297, "step": 291000 }, { "epoch": 8.56, "grad_norm": 2.750544548034668, "learning_rate": 0.00021483373701913175, "loss": 1.717, "step": 291500 }, { "epoch": 8.57, "grad_norm": 8.173510551452637, "learning_rate": 0.00021468664502780036, "loss": 1.6971, "step": 292000 }, { "epoch": 8.59, "grad_norm": 2.2209818363189697, "learning_rate": 0.00021453955303646897, "loss": 1.6803, "step": 292500 }, { "epoch": 8.6, "grad_norm": 4.283031463623047, "learning_rate": 0.0002143924610451376, "loss": 1.7101, "step": 293000 }, { "epoch": 8.62, "grad_norm": 2.29679799079895, "learning_rate": 0.0002142453690538062, "loss": 1.7021, "step": 293500 }, { "epoch": 8.63, "grad_norm": 3.301649570465088, "learning_rate": 0.00021409827706247487, "loss": 1.727, "step": 294000 }, { "epoch": 8.65, "grad_norm": 38.071407318115234, "learning_rate": 0.00021395118507114348, "loss": 1.7318, "step": 294500 }, { "epoch": 8.66, "grad_norm": 1.9748557806015015, "learning_rate": 0.0002138040930798121, "loss": 1.7327, "step": 295000 }, { "epoch": 8.68, "grad_norm": 1.8591829538345337, "learning_rate": 0.00021365700108848072, "loss": 1.6944, "step": 295500 }, { "epoch": 8.69, "grad_norm": 6.686271667480469, "learning_rate": 0.00021350990909714933, "loss": 1.7289, "step": 296000 }, { "epoch": 8.71, "grad_norm": 2.0325803756713867, "learning_rate": 0.00021336281710581794, "loss": 1.7074, "step": 296500 }, { "epoch": 8.72, "grad_norm": 2.5102651119232178, "learning_rate": 0.00021321572511448657, "loss": 1.7061, "step": 297000 }, { "epoch": 8.73, "grad_norm": 5.305497646331787, "learning_rate": 0.0002130686331231552, "loss": 1.7047, "step": 297500 }, { "epoch": 8.75, "grad_norm": 2.878256320953369, "learning_rate": 0.00021292154113182384, "loss": 1.7109, "step": 298000 }, { "epoch": 8.76, "grad_norm": 1.7807819843292236, "learning_rate": 0.00021277444914049245, "loss": 1.7136, "step": 298500 }, { "epoch": 8.78, "grad_norm": 1.8971943855285645, "learning_rate": 0.00021262735714916106, "loss": 1.7319, "step": 299000 }, { "epoch": 8.79, "grad_norm": 2.650453805923462, "learning_rate": 0.0002124802651578297, "loss": 1.7436, "step": 299500 }, { "epoch": 8.81, "grad_norm": 3.341278076171875, "learning_rate": 0.0002123331731664983, "loss": 1.6982, "step": 300000 }, { "epoch": 8.82, "grad_norm": 4.539509296417236, "learning_rate": 0.00021218608117516693, "loss": 1.7149, "step": 300500 }, { "epoch": 8.84, "grad_norm": 1.6525373458862305, "learning_rate": 0.00021203898918383554, "loss": 1.7154, "step": 301000 }, { "epoch": 8.85, "grad_norm": 2.1805810928344727, "learning_rate": 0.00021189189719250418, "loss": 1.7244, "step": 301500 }, { "epoch": 8.87, "grad_norm": 1.623687744140625, "learning_rate": 0.0002117448052011728, "loss": 1.7109, "step": 302000 }, { "epoch": 8.88, "grad_norm": 1.8576191663742065, "learning_rate": 0.00021159771320984142, "loss": 1.71, "step": 302500 }, { "epoch": 8.9, "grad_norm": 2.7003326416015625, "learning_rate": 0.00021145062121851005, "loss": 1.717, "step": 303000 }, { "epoch": 8.91, "grad_norm": 1.5555850267410278, "learning_rate": 0.00021130352922717866, "loss": 1.6771, "step": 303500 }, { "epoch": 8.93, "grad_norm": 1.8485440015792847, "learning_rate": 0.00021115643723584727, "loss": 1.6907, "step": 304000 }, { "epoch": 8.94, "grad_norm": 1.9917840957641602, "learning_rate": 0.0002110093452445159, "loss": 1.6929, "step": 304500 }, { "epoch": 8.95, "grad_norm": 2.54939866065979, "learning_rate": 0.0002108622532531845, "loss": 1.7211, "step": 305000 }, { "epoch": 8.97, "grad_norm": 4.3445234298706055, "learning_rate": 0.00021071516126185317, "loss": 1.7038, "step": 305500 }, { "epoch": 8.98, "grad_norm": 2.9025015830993652, "learning_rate": 0.00021056806927052178, "loss": 1.6987, "step": 306000 }, { "epoch": 9.0, "grad_norm": 3.951500177383423, "learning_rate": 0.0002104209772791904, "loss": 1.7063, "step": 306500 }, { "epoch": 9.01, "grad_norm": 2.2812907695770264, "learning_rate": 0.00021027388528785902, "loss": 1.6754, "step": 307000 }, { "epoch": 9.03, "grad_norm": 1.313977837562561, "learning_rate": 0.00021012679329652763, "loss": 1.6384, "step": 307500 }, { "epoch": 9.04, "grad_norm": 1.1051950454711914, "learning_rate": 0.00020997970130519624, "loss": 1.6372, "step": 308000 }, { "epoch": 9.06, "grad_norm": 2.6401305198669434, "learning_rate": 0.00020983260931386487, "loss": 1.6535, "step": 308500 }, { "epoch": 9.07, "grad_norm": 1.477562665939331, "learning_rate": 0.00020968551732253348, "loss": 1.6342, "step": 309000 }, { "epoch": 9.09, "grad_norm": 1.7968777418136597, "learning_rate": 0.00020953842533120214, "loss": 1.6163, "step": 309500 }, { "epoch": 9.1, "grad_norm": 1.8777967691421509, "learning_rate": 0.00020939133333987075, "loss": 1.6466, "step": 310000 }, { "epoch": 9.12, "grad_norm": 1.970590353012085, "learning_rate": 0.00020924424134853936, "loss": 1.6481, "step": 310500 }, { "epoch": 9.13, "grad_norm": 11.109952926635742, "learning_rate": 0.000209097149357208, "loss": 1.6634, "step": 311000 }, { "epoch": 9.15, "grad_norm": 1.5415191650390625, "learning_rate": 0.0002089500573658766, "loss": 1.637, "step": 311500 }, { "epoch": 9.16, "grad_norm": 2.2349135875701904, "learning_rate": 0.0002088029653745452, "loss": 1.6373, "step": 312000 }, { "epoch": 9.18, "grad_norm": 2.3896191120147705, "learning_rate": 0.00020865587338321384, "loss": 1.6318, "step": 312500 }, { "epoch": 9.19, "grad_norm": 1.5920429229736328, "learning_rate": 0.00020850878139188245, "loss": 1.6383, "step": 313000 }, { "epoch": 9.2, "grad_norm": 1.485474944114685, "learning_rate": 0.0002083616894005511, "loss": 1.6392, "step": 313500 }, { "epoch": 9.22, "grad_norm": 9.920788764953613, "learning_rate": 0.00020821459740921972, "loss": 1.6407, "step": 314000 }, { "epoch": 9.23, "grad_norm": 5.970518589019775, "learning_rate": 0.00020806750541788833, "loss": 1.6389, "step": 314500 }, { "epoch": 9.25, "grad_norm": 1.5371183156967163, "learning_rate": 0.00020792041342655696, "loss": 1.6485, "step": 315000 }, { "epoch": 9.26, "grad_norm": 2.242612600326538, "learning_rate": 0.00020777332143522557, "loss": 1.6344, "step": 315500 }, { "epoch": 9.28, "grad_norm": 3.137301445007324, "learning_rate": 0.00020762622944389418, "loss": 1.6269, "step": 316000 }, { "epoch": 9.29, "grad_norm": 3.391427993774414, "learning_rate": 0.0002074791374525628, "loss": 1.6575, "step": 316500 }, { "epoch": 9.31, "grad_norm": 7.92528772354126, "learning_rate": 0.00020733204546123142, "loss": 1.6333, "step": 317000 }, { "epoch": 9.32, "grad_norm": 1.0485262870788574, "learning_rate": 0.00020718495346990008, "loss": 1.6574, "step": 317500 }, { "epoch": 9.34, "grad_norm": 1.6511611938476562, "learning_rate": 0.0002070378614785687, "loss": 1.6474, "step": 318000 }, { "epoch": 9.35, "grad_norm": 1.3929743766784668, "learning_rate": 0.0002068907694872373, "loss": 1.6484, "step": 318500 }, { "epoch": 9.37, "grad_norm": 1.0993067026138306, "learning_rate": 0.00020674367749590593, "loss": 1.6312, "step": 319000 }, { "epoch": 9.38, "grad_norm": 2.8097622394561768, "learning_rate": 0.00020659658550457454, "loss": 1.6438, "step": 319500 }, { "epoch": 9.4, "grad_norm": 2.274348020553589, "learning_rate": 0.00020644949351324314, "loss": 1.6258, "step": 320000 }, { "epoch": 9.41, "grad_norm": 1.9707189798355103, "learning_rate": 0.00020630240152191178, "loss": 1.6619, "step": 320500 }, { "epoch": 9.42, "grad_norm": 2.7805778980255127, "learning_rate": 0.0002061553095305804, "loss": 1.6537, "step": 321000 }, { "epoch": 9.44, "grad_norm": 2.856705665588379, "learning_rate": 0.00020600821753924905, "loss": 1.6745, "step": 321500 }, { "epoch": 9.45, "grad_norm": 1.3102498054504395, "learning_rate": 0.00020586112554791766, "loss": 1.6662, "step": 322000 }, { "epoch": 9.47, "grad_norm": 4.1165690422058105, "learning_rate": 0.00020571403355658626, "loss": 1.6531, "step": 322500 }, { "epoch": 9.48, "grad_norm": 5.66050386428833, "learning_rate": 0.0002055669415652549, "loss": 1.669, "step": 323000 }, { "epoch": 9.5, "grad_norm": 1.3612234592437744, "learning_rate": 0.0002054198495739235, "loss": 1.642, "step": 323500 }, { "epoch": 9.51, "grad_norm": 2.5399467945098877, "learning_rate": 0.00020527275758259214, "loss": 1.677, "step": 324000 }, { "epoch": 9.53, "grad_norm": 5.590959548950195, "learning_rate": 0.00020512566559126075, "loss": 1.6673, "step": 324500 }, { "epoch": 9.54, "grad_norm": 1.6591867208480835, "learning_rate": 0.00020497857359992936, "loss": 1.6523, "step": 325000 }, { "epoch": 9.56, "grad_norm": 2.3825442790985107, "learning_rate": 0.00020483148160859802, "loss": 1.6472, "step": 325500 }, { "epoch": 9.57, "grad_norm": 1.509970784187317, "learning_rate": 0.00020468438961726663, "loss": 1.6095, "step": 326000 }, { "epoch": 9.59, "grad_norm": 1.0791665315628052, "learning_rate": 0.00020453729762593526, "loss": 1.6545, "step": 326500 }, { "epoch": 9.6, "grad_norm": 1.480979323387146, "learning_rate": 0.00020439020563460387, "loss": 1.6691, "step": 327000 }, { "epoch": 9.62, "grad_norm": 1.9298070669174194, "learning_rate": 0.00020424311364327248, "loss": 1.6561, "step": 327500 }, { "epoch": 9.63, "grad_norm": 2.1420977115631104, "learning_rate": 0.0002040960216519411, "loss": 1.6332, "step": 328000 }, { "epoch": 9.64, "grad_norm": 1.6322475671768188, "learning_rate": 0.00020394892966060972, "loss": 1.6576, "step": 328500 }, { "epoch": 9.66, "grad_norm": 1.7701492309570312, "learning_rate": 0.00020380183766927833, "loss": 1.6603, "step": 329000 }, { "epoch": 9.67, "grad_norm": 22.195663452148438, "learning_rate": 0.000203654745677947, "loss": 1.6605, "step": 329500 }, { "epoch": 9.69, "grad_norm": 1.735055685043335, "learning_rate": 0.0002035076536866156, "loss": 1.6728, "step": 330000 }, { "epoch": 9.7, "grad_norm": 1.3548341989517212, "learning_rate": 0.00020336056169528423, "loss": 1.6606, "step": 330500 }, { "epoch": 9.72, "grad_norm": 2.0401058197021484, "learning_rate": 0.00020321346970395284, "loss": 1.6687, "step": 331000 }, { "epoch": 9.73, "grad_norm": 16.856937408447266, "learning_rate": 0.00020306637771262144, "loss": 1.6369, "step": 331500 }, { "epoch": 9.75, "grad_norm": 3.8989596366882324, "learning_rate": 0.00020291928572129008, "loss": 1.6454, "step": 332000 }, { "epoch": 9.76, "grad_norm": 1.6723474264144897, "learning_rate": 0.0002027721937299587, "loss": 1.6723, "step": 332500 }, { "epoch": 9.78, "grad_norm": 1.7381031513214111, "learning_rate": 0.0002026251017386273, "loss": 1.6672, "step": 333000 }, { "epoch": 9.79, "grad_norm": 1.7013987302780151, "learning_rate": 0.00020247800974729596, "loss": 1.6666, "step": 333500 }, { "epoch": 9.81, "grad_norm": 1.6793595552444458, "learning_rate": 0.00020233091775596456, "loss": 1.6594, "step": 334000 }, { "epoch": 9.82, "grad_norm": 1.2351815700531006, "learning_rate": 0.0002021838257646332, "loss": 1.6475, "step": 334500 }, { "epoch": 9.84, "grad_norm": 3.5721399784088135, "learning_rate": 0.0002020367337733018, "loss": 1.6584, "step": 335000 }, { "epoch": 9.85, "grad_norm": 1.5881208181381226, "learning_rate": 0.0002018896417819704, "loss": 1.6584, "step": 335500 }, { "epoch": 9.87, "grad_norm": 2.2431886196136475, "learning_rate": 0.00020174254979063905, "loss": 1.6632, "step": 336000 }, { "epoch": 9.88, "grad_norm": 1.9759477376937866, "learning_rate": 0.00020159545779930766, "loss": 1.6098, "step": 336500 }, { "epoch": 9.89, "grad_norm": 2.4007914066314697, "learning_rate": 0.00020144836580797626, "loss": 1.6553, "step": 337000 }, { "epoch": 9.91, "grad_norm": 1.1646785736083984, "learning_rate": 0.00020130127381664492, "loss": 1.6917, "step": 337500 }, { "epoch": 9.92, "grad_norm": 2.7321279048919678, "learning_rate": 0.00020115418182531353, "loss": 1.6165, "step": 338000 }, { "epoch": 9.94, "grad_norm": 2.1226370334625244, "learning_rate": 0.00020100708983398217, "loss": 1.6583, "step": 338500 }, { "epoch": 9.95, "grad_norm": 16.655624389648438, "learning_rate": 0.00020085999784265077, "loss": 1.6594, "step": 339000 }, { "epoch": 9.97, "grad_norm": 6.142359256744385, "learning_rate": 0.00020071290585131938, "loss": 1.6343, "step": 339500 }, { "epoch": 9.98, "grad_norm": 2.1720430850982666, "learning_rate": 0.00020056581385998802, "loss": 1.6509, "step": 340000 }, { "epoch": 10.0, "grad_norm": 2.870986223220825, "learning_rate": 0.00020041872186865662, "loss": 1.6707, "step": 340500 }, { "epoch": 10.01, "grad_norm": 1.382546067237854, "learning_rate": 0.00020027162987732526, "loss": 1.5759, "step": 341000 }, { "epoch": 10.03, "grad_norm": 1.2603851556777954, "learning_rate": 0.0002001245378859939, "loss": 1.5937, "step": 341500 }, { "epoch": 10.04, "grad_norm": 2.455425262451172, "learning_rate": 0.0001999774458946625, "loss": 1.5859, "step": 342000 }, { "epoch": 10.06, "grad_norm": 1.6993873119354248, "learning_rate": 0.00019983035390333114, "loss": 1.6102, "step": 342500 }, { "epoch": 10.07, "grad_norm": 2.338402509689331, "learning_rate": 0.00019968326191199974, "loss": 1.6058, "step": 343000 }, { "epoch": 10.09, "grad_norm": 1.4525296688079834, "learning_rate": 0.00019953616992066838, "loss": 1.5925, "step": 343500 }, { "epoch": 10.1, "grad_norm": 1.7461038827896118, "learning_rate": 0.00019938907792933699, "loss": 1.5824, "step": 344000 }, { "epoch": 10.11, "grad_norm": 20.686992645263672, "learning_rate": 0.0001992419859380056, "loss": 1.6173, "step": 344500 }, { "epoch": 10.13, "grad_norm": 11.99207592010498, "learning_rate": 0.00019909489394667423, "loss": 1.5998, "step": 345000 }, { "epoch": 10.14, "grad_norm": 2.926992177963257, "learning_rate": 0.00019894780195534286, "loss": 1.5864, "step": 345500 }, { "epoch": 10.16, "grad_norm": 3.7215046882629395, "learning_rate": 0.0001988007099640115, "loss": 1.5856, "step": 346000 }, { "epoch": 10.17, "grad_norm": 1.7094500064849854, "learning_rate": 0.0001986536179726801, "loss": 1.5964, "step": 346500 }, { "epoch": 10.19, "grad_norm": 2.5829222202301025, "learning_rate": 0.0001985065259813487, "loss": 1.5943, "step": 347000 }, { "epoch": 10.2, "grad_norm": 1.6218924522399902, "learning_rate": 0.00019835943399001735, "loss": 1.5808, "step": 347500 }, { "epoch": 10.22, "grad_norm": 1.952506184577942, "learning_rate": 0.00019821234199868595, "loss": 1.5548, "step": 348000 }, { "epoch": 10.23, "grad_norm": 2.1660783290863037, "learning_rate": 0.00019806525000735456, "loss": 1.585, "step": 348500 }, { "epoch": 10.25, "grad_norm": 1.8295018672943115, "learning_rate": 0.0001979181580160232, "loss": 1.5975, "step": 349000 }, { "epoch": 10.26, "grad_norm": 2.3375236988067627, "learning_rate": 0.00019777106602469183, "loss": 1.617, "step": 349500 }, { "epoch": 10.28, "grad_norm": 1.323554515838623, "learning_rate": 0.00019762397403336047, "loss": 1.5915, "step": 350000 }, { "epoch": 10.29, "grad_norm": 1.259614109992981, "learning_rate": 0.00019747688204202907, "loss": 1.6126, "step": 350500 }, { "epoch": 10.31, "grad_norm": 27.243751525878906, "learning_rate": 0.00019732979005069768, "loss": 1.5886, "step": 351000 }, { "epoch": 10.32, "grad_norm": 1.4931347370147705, "learning_rate": 0.00019718269805936632, "loss": 1.6165, "step": 351500 }, { "epoch": 10.33, "grad_norm": 1.3573870658874512, "learning_rate": 0.00019703560606803492, "loss": 1.5967, "step": 352000 }, { "epoch": 10.35, "grad_norm": 2.393355369567871, "learning_rate": 0.00019688851407670353, "loss": 1.6033, "step": 352500 }, { "epoch": 10.36, "grad_norm": 17.975017547607422, "learning_rate": 0.00019674142208537217, "loss": 1.6176, "step": 353000 }, { "epoch": 10.38, "grad_norm": 2.195270538330078, "learning_rate": 0.0001965943300940408, "loss": 1.6022, "step": 353500 }, { "epoch": 10.39, "grad_norm": 1.5013370513916016, "learning_rate": 0.00019644723810270944, "loss": 1.5999, "step": 354000 }, { "epoch": 10.41, "grad_norm": 16.84026527404785, "learning_rate": 0.00019630014611137804, "loss": 1.6061, "step": 354500 }, { "epoch": 10.42, "grad_norm": 2.8341875076293945, "learning_rate": 0.00019615305412004665, "loss": 1.5742, "step": 355000 }, { "epoch": 10.44, "grad_norm": 1.6109588146209717, "learning_rate": 0.00019600596212871529, "loss": 1.5941, "step": 355500 }, { "epoch": 10.45, "grad_norm": 5.435859203338623, "learning_rate": 0.0001958588701373839, "loss": 1.5875, "step": 356000 }, { "epoch": 10.47, "grad_norm": 4.453019142150879, "learning_rate": 0.0001957117781460525, "loss": 1.5901, "step": 356500 }, { "epoch": 10.48, "grad_norm": 2.145413875579834, "learning_rate": 0.00019556468615472116, "loss": 1.5946, "step": 357000 }, { "epoch": 10.5, "grad_norm": 2.531158208847046, "learning_rate": 0.00019541759416338977, "loss": 1.5865, "step": 357500 }, { "epoch": 10.51, "grad_norm": 1.8745834827423096, "learning_rate": 0.0001952705021720584, "loss": 1.6033, "step": 358000 }, { "epoch": 10.53, "grad_norm": 1.3398380279541016, "learning_rate": 0.000195123410180727, "loss": 1.6205, "step": 358500 }, { "epoch": 10.54, "grad_norm": 6.710257053375244, "learning_rate": 0.00019497631818939562, "loss": 1.6171, "step": 359000 }, { "epoch": 10.56, "grad_norm": 1.5463893413543701, "learning_rate": 0.00019482922619806425, "loss": 1.5913, "step": 359500 }, { "epoch": 10.57, "grad_norm": 1.32403564453125, "learning_rate": 0.00019468213420673286, "loss": 1.6129, "step": 360000 }, { "epoch": 10.58, "grad_norm": 1.1276665925979614, "learning_rate": 0.00019453504221540147, "loss": 1.6063, "step": 360500 }, { "epoch": 10.6, "grad_norm": 3.2467503547668457, "learning_rate": 0.00019438795022407013, "loss": 1.6036, "step": 361000 }, { "epoch": 10.61, "grad_norm": 2.091618537902832, "learning_rate": 0.00019424085823273874, "loss": 1.5919, "step": 361500 }, { "epoch": 10.63, "grad_norm": 2.571202516555786, "learning_rate": 0.00019409376624140737, "loss": 1.6218, "step": 362000 }, { "epoch": 10.64, "grad_norm": 1.7798309326171875, "learning_rate": 0.00019394667425007598, "loss": 1.5919, "step": 362500 }, { "epoch": 10.66, "grad_norm": 1.96729576587677, "learning_rate": 0.0001937995822587446, "loss": 1.6116, "step": 363000 }, { "epoch": 10.67, "grad_norm": 3.229706048965454, "learning_rate": 0.00019365249026741322, "loss": 1.5993, "step": 363500 }, { "epoch": 10.69, "grad_norm": 6.615269184112549, "learning_rate": 0.00019350539827608183, "loss": 1.6034, "step": 364000 }, { "epoch": 10.7, "grad_norm": 6.847731113433838, "learning_rate": 0.00019335830628475047, "loss": 1.5785, "step": 364500 }, { "epoch": 10.72, "grad_norm": 1.3055931329727173, "learning_rate": 0.0001932112142934191, "loss": 1.6059, "step": 365000 }, { "epoch": 10.73, "grad_norm": 5.202936172485352, "learning_rate": 0.0001930641223020877, "loss": 1.6145, "step": 365500 }, { "epoch": 10.75, "grad_norm": 1.704176902770996, "learning_rate": 0.00019291703031075634, "loss": 1.6349, "step": 366000 }, { "epoch": 10.76, "grad_norm": 2.4655399322509766, "learning_rate": 0.00019276993831942495, "loss": 1.5845, "step": 366500 }, { "epoch": 10.78, "grad_norm": 6.769374370574951, "learning_rate": 0.00019262284632809358, "loss": 1.6054, "step": 367000 }, { "epoch": 10.79, "grad_norm": 3.4355087280273438, "learning_rate": 0.0001924757543367622, "loss": 1.6205, "step": 367500 }, { "epoch": 10.8, "grad_norm": 1.796799659729004, "learning_rate": 0.0001923286623454308, "loss": 1.6138, "step": 368000 }, { "epoch": 10.82, "grad_norm": 4.333281993865967, "learning_rate": 0.00019218157035409943, "loss": 1.6273, "step": 368500 }, { "epoch": 10.83, "grad_norm": 10.447500228881836, "learning_rate": 0.00019203447836276807, "loss": 1.6497, "step": 369000 }, { "epoch": 10.85, "grad_norm": 1.9053871631622314, "learning_rate": 0.0001918873863714367, "loss": 1.5815, "step": 369500 }, { "epoch": 10.86, "grad_norm": 6.775355339050293, "learning_rate": 0.0001917402943801053, "loss": 1.5986, "step": 370000 }, { "epoch": 10.88, "grad_norm": 19.00932502746582, "learning_rate": 0.00019159320238877392, "loss": 1.5916, "step": 370500 }, { "epoch": 10.89, "grad_norm": 2.1029250621795654, "learning_rate": 0.00019144611039744255, "loss": 1.6072, "step": 371000 }, { "epoch": 10.91, "grad_norm": 1.3043904304504395, "learning_rate": 0.00019129901840611116, "loss": 1.5885, "step": 371500 }, { "epoch": 10.92, "grad_norm": 1.8047374486923218, "learning_rate": 0.00019115192641477977, "loss": 1.6265, "step": 372000 }, { "epoch": 10.94, "grad_norm": 2.591125011444092, "learning_rate": 0.0001910048344234484, "loss": 1.6028, "step": 372500 }, { "epoch": 10.95, "grad_norm": 1.8858853578567505, "learning_rate": 0.00019085774243211704, "loss": 1.6212, "step": 373000 }, { "epoch": 10.97, "grad_norm": 1.4778261184692383, "learning_rate": 0.00019071065044078567, "loss": 1.6049, "step": 373500 }, { "epoch": 10.98, "grad_norm": 7.7040934562683105, "learning_rate": 0.00019056355844945428, "loss": 1.5926, "step": 374000 }, { "epoch": 11.0, "grad_norm": 1.5968307256698608, "learning_rate": 0.0001904164664581229, "loss": 1.6438, "step": 374500 }, { "epoch": 11.01, "grad_norm": 1.4240366220474243, "learning_rate": 0.00019026937446679152, "loss": 1.5451, "step": 375000 }, { "epoch": 11.02, "grad_norm": 2.0358214378356934, "learning_rate": 0.00019012228247546013, "loss": 1.5232, "step": 375500 }, { "epoch": 11.04, "grad_norm": 1.2968121767044067, "learning_rate": 0.00018997519048412874, "loss": 1.5417, "step": 376000 }, { "epoch": 11.05, "grad_norm": 3.3961257934570312, "learning_rate": 0.00018982809849279737, "loss": 1.5513, "step": 376500 }, { "epoch": 11.07, "grad_norm": 2.7855663299560547, "learning_rate": 0.000189681006501466, "loss": 1.5437, "step": 377000 }, { "epoch": 11.08, "grad_norm": 5.572605609893799, "learning_rate": 0.00018953391451013464, "loss": 1.5246, "step": 377500 }, { "epoch": 11.1, "grad_norm": 2.3603007793426514, "learning_rate": 0.00018938682251880325, "loss": 1.5246, "step": 378000 }, { "epoch": 11.11, "grad_norm": 19.196584701538086, "learning_rate": 0.00018923973052747186, "loss": 1.5223, "step": 378500 }, { "epoch": 11.13, "grad_norm": 2.4410743713378906, "learning_rate": 0.0001890926385361405, "loss": 1.5202, "step": 379000 }, { "epoch": 11.14, "grad_norm": 1.8335517644882202, "learning_rate": 0.0001889455465448091, "loss": 1.5625, "step": 379500 }, { "epoch": 11.16, "grad_norm": 2.2342159748077393, "learning_rate": 0.0001887984545534777, "loss": 1.5391, "step": 380000 }, { "epoch": 11.17, "grad_norm": 5.949936389923096, "learning_rate": 0.00018865136256214634, "loss": 1.569, "step": 380500 }, { "epoch": 11.19, "grad_norm": 6.047088623046875, "learning_rate": 0.00018850427057081498, "loss": 1.5366, "step": 381000 }, { "epoch": 11.2, "grad_norm": 16.05063819885254, "learning_rate": 0.0001883571785794836, "loss": 1.5343, "step": 381500 }, { "epoch": 11.22, "grad_norm": 1.327913761138916, "learning_rate": 0.00018821008658815222, "loss": 1.5641, "step": 382000 }, { "epoch": 11.23, "grad_norm": 4.132845878601074, "learning_rate": 0.00018806299459682083, "loss": 1.5722, "step": 382500 }, { "epoch": 11.25, "grad_norm": 1.9688639640808105, "learning_rate": 0.00018791590260548946, "loss": 1.5477, "step": 383000 }, { "epoch": 11.26, "grad_norm": 1.6657782793045044, "learning_rate": 0.00018776881061415807, "loss": 1.5398, "step": 383500 }, { "epoch": 11.27, "grad_norm": 2.9266579151153564, "learning_rate": 0.00018762171862282668, "loss": 1.556, "step": 384000 }, { "epoch": 11.29, "grad_norm": 5.755486011505127, "learning_rate": 0.0001874746266314953, "loss": 1.5415, "step": 384500 }, { "epoch": 11.3, "grad_norm": 2.1186070442199707, "learning_rate": 0.00018732753464016395, "loss": 1.5379, "step": 385000 }, { "epoch": 11.32, "grad_norm": 3.1836068630218506, "learning_rate": 0.00018718044264883258, "loss": 1.5621, "step": 385500 }, { "epoch": 11.33, "grad_norm": 4.884739875793457, "learning_rate": 0.0001870333506575012, "loss": 1.5791, "step": 386000 }, { "epoch": 11.35, "grad_norm": 2.4326329231262207, "learning_rate": 0.0001868862586661698, "loss": 1.5647, "step": 386500 }, { "epoch": 11.36, "grad_norm": 5.392803192138672, "learning_rate": 0.00018673916667483843, "loss": 1.5703, "step": 387000 }, { "epoch": 11.38, "grad_norm": 6.940735340118408, "learning_rate": 0.00018659207468350704, "loss": 1.5579, "step": 387500 }, { "epoch": 11.39, "grad_norm": 2.8021979331970215, "learning_rate": 0.00018644498269217567, "loss": 1.5469, "step": 388000 }, { "epoch": 11.41, "grad_norm": 1.8717076778411865, "learning_rate": 0.00018629789070084428, "loss": 1.5624, "step": 388500 }, { "epoch": 11.42, "grad_norm": 3.1023306846618652, "learning_rate": 0.00018615079870951291, "loss": 1.5751, "step": 389000 }, { "epoch": 11.44, "grad_norm": 1.5794864892959595, "learning_rate": 0.00018600370671818155, "loss": 1.5652, "step": 389500 }, { "epoch": 11.45, "grad_norm": 3.9795074462890625, "learning_rate": 0.00018585661472685016, "loss": 1.5663, "step": 390000 }, { "epoch": 11.47, "grad_norm": 1.4186744689941406, "learning_rate": 0.0001857095227355188, "loss": 1.5508, "step": 390500 }, { "epoch": 11.48, "grad_norm": 3.268982172012329, "learning_rate": 0.0001855624307441874, "loss": 1.5666, "step": 391000 }, { "epoch": 11.49, "grad_norm": 1.246903419494629, "learning_rate": 0.000185415338752856, "loss": 1.5797, "step": 391500 }, { "epoch": 11.51, "grad_norm": 1.4437174797058105, "learning_rate": 0.00018526824676152464, "loss": 1.562, "step": 392000 }, { "epoch": 11.52, "grad_norm": 1.8195850849151611, "learning_rate": 0.00018512115477019325, "loss": 1.5748, "step": 392500 }, { "epoch": 11.54, "grad_norm": 1.6077349185943604, "learning_rate": 0.0001849740627788619, "loss": 1.5628, "step": 393000 }, { "epoch": 11.55, "grad_norm": 1.614617943763733, "learning_rate": 0.00018482697078753052, "loss": 1.5992, "step": 393500 }, { "epoch": 11.57, "grad_norm": 2.860440731048584, "learning_rate": 0.00018467987879619913, "loss": 1.5635, "step": 394000 }, { "epoch": 11.58, "grad_norm": 4.223182201385498, "learning_rate": 0.00018453278680486776, "loss": 1.5678, "step": 394500 }, { "epoch": 11.6, "grad_norm": 2.143765687942505, "learning_rate": 0.00018438569481353637, "loss": 1.5714, "step": 395000 }, { "epoch": 11.61, "grad_norm": 14.07819652557373, "learning_rate": 0.00018423860282220498, "loss": 1.5662, "step": 395500 }, { "epoch": 11.63, "grad_norm": 1.6693955659866333, "learning_rate": 0.0001840915108308736, "loss": 1.5431, "step": 396000 }, { "epoch": 11.64, "grad_norm": 2.4783432483673096, "learning_rate": 0.00018394441883954222, "loss": 1.5439, "step": 396500 }, { "epoch": 11.66, "grad_norm": 2.1813952922821045, "learning_rate": 0.00018379732684821088, "loss": 1.5539, "step": 397000 }, { "epoch": 11.67, "grad_norm": 4.754596710205078, "learning_rate": 0.0001836502348568795, "loss": 1.5675, "step": 397500 }, { "epoch": 11.69, "grad_norm": 1.9912656545639038, "learning_rate": 0.0001835031428655481, "loss": 1.5909, "step": 398000 }, { "epoch": 11.7, "grad_norm": 1.5389857292175293, "learning_rate": 0.00018335605087421673, "loss": 1.5843, "step": 398500 }, { "epoch": 11.71, "grad_norm": 3.8105554580688477, "learning_rate": 0.00018320895888288534, "loss": 1.5804, "step": 399000 }, { "epoch": 11.73, "grad_norm": 2.6107397079467773, "learning_rate": 0.00018306186689155394, "loss": 1.5685, "step": 399500 }, { "epoch": 11.74, "grad_norm": 4.173858642578125, "learning_rate": 0.00018291477490022258, "loss": 1.5823, "step": 400000 }, { "epoch": 11.76, "grad_norm": 1.6713734865188599, "learning_rate": 0.0001827676829088912, "loss": 1.5672, "step": 400500 }, { "epoch": 11.77, "grad_norm": 4.650341033935547, "learning_rate": 0.00018262059091755985, "loss": 1.5297, "step": 401000 }, { "epoch": 11.79, "grad_norm": 1.5853548049926758, "learning_rate": 0.00018247349892622846, "loss": 1.5814, "step": 401500 }, { "epoch": 11.8, "grad_norm": 5.71536111831665, "learning_rate": 0.00018232640693489706, "loss": 1.5444, "step": 402000 }, { "epoch": 11.82, "grad_norm": 2.5603206157684326, "learning_rate": 0.0001821793149435657, "loss": 1.593, "step": 402500 }, { "epoch": 11.83, "grad_norm": 2.556755304336548, "learning_rate": 0.0001820322229522343, "loss": 1.5717, "step": 403000 }, { "epoch": 11.85, "grad_norm": 2.1441986560821533, "learning_rate": 0.0001818851309609029, "loss": 1.5766, "step": 403500 }, { "epoch": 11.86, "grad_norm": 1.762389898300171, "learning_rate": 0.00018173803896957155, "loss": 1.5857, "step": 404000 }, { "epoch": 11.88, "grad_norm": 1.7397152185440063, "learning_rate": 0.00018159094697824016, "loss": 1.5767, "step": 404500 }, { "epoch": 11.89, "grad_norm": 2.2762222290039062, "learning_rate": 0.00018144385498690882, "loss": 1.5735, "step": 405000 }, { "epoch": 11.91, "grad_norm": 2.9703893661499023, "learning_rate": 0.00018129676299557742, "loss": 1.5618, "step": 405500 }, { "epoch": 11.92, "grad_norm": 3.4913887977600098, "learning_rate": 0.00018114967100424603, "loss": 1.589, "step": 406000 }, { "epoch": 11.93, "grad_norm": 1.7856007814407349, "learning_rate": 0.00018100257901291467, "loss": 1.5924, "step": 406500 }, { "epoch": 11.95, "grad_norm": 5.869511604309082, "learning_rate": 0.00018085548702158327, "loss": 1.5516, "step": 407000 }, { "epoch": 11.96, "grad_norm": 1.4495360851287842, "learning_rate": 0.00018070839503025188, "loss": 1.5721, "step": 407500 }, { "epoch": 11.98, "grad_norm": 5.264606475830078, "learning_rate": 0.00018056130303892052, "loss": 1.5684, "step": 408000 }, { "epoch": 11.99, "grad_norm": 1.1457246541976929, "learning_rate": 0.00018041421104758912, "loss": 1.5747, "step": 408500 }, { "epoch": 12.01, "grad_norm": 2.9236228466033936, "learning_rate": 0.00018026711905625779, "loss": 1.5313, "step": 409000 }, { "epoch": 12.02, "grad_norm": 11.91308307647705, "learning_rate": 0.0001801200270649264, "loss": 1.4883, "step": 409500 }, { "epoch": 12.04, "grad_norm": 1.8393372297286987, "learning_rate": 0.000179972935073595, "loss": 1.4828, "step": 410000 }, { "epoch": 12.05, "grad_norm": 1.9591150283813477, "learning_rate": 0.00017982584308226364, "loss": 1.4832, "step": 410500 }, { "epoch": 12.07, "grad_norm": 1.5424935817718506, "learning_rate": 0.00017967875109093224, "loss": 1.4863, "step": 411000 }, { "epoch": 12.08, "grad_norm": 4.338646411895752, "learning_rate": 0.00017953165909960088, "loss": 1.5376, "step": 411500 }, { "epoch": 12.1, "grad_norm": 4.14846134185791, "learning_rate": 0.00017938456710826949, "loss": 1.506, "step": 412000 }, { "epoch": 12.11, "grad_norm": 6.690493106842041, "learning_rate": 0.00017923747511693812, "loss": 1.5015, "step": 412500 }, { "epoch": 12.13, "grad_norm": 4.801180362701416, "learning_rate": 0.00017909038312560676, "loss": 1.53, "step": 413000 }, { "epoch": 12.14, "grad_norm": 1.8997361660003662, "learning_rate": 0.00017894329113427536, "loss": 1.5191, "step": 413500 }, { "epoch": 12.16, "grad_norm": 19.748411178588867, "learning_rate": 0.000178796199142944, "loss": 1.5213, "step": 414000 }, { "epoch": 12.17, "grad_norm": 3.8095591068267822, "learning_rate": 0.0001786491071516126, "loss": 1.496, "step": 414500 }, { "epoch": 12.18, "grad_norm": 3.752877950668335, "learning_rate": 0.0001785020151602812, "loss": 1.5014, "step": 415000 }, { "epoch": 12.2, "grad_norm": 2.396458625793457, "learning_rate": 0.00017835492316894985, "loss": 1.4683, "step": 415500 }, { "epoch": 12.21, "grad_norm": 2.928061008453369, "learning_rate": 0.00017820783117761846, "loss": 1.5093, "step": 416000 }, { "epoch": 12.23, "grad_norm": 6.80102014541626, "learning_rate": 0.00017806073918628712, "loss": 1.5085, "step": 416500 }, { "epoch": 12.24, "grad_norm": 16.06849479675293, "learning_rate": 0.00017791364719495572, "loss": 1.5023, "step": 417000 }, { "epoch": 12.26, "grad_norm": 9.823871612548828, "learning_rate": 0.00017776655520362433, "loss": 1.5677, "step": 417500 }, { "epoch": 12.27, "grad_norm": 1.8364531993865967, "learning_rate": 0.00017761946321229297, "loss": 1.5146, "step": 418000 }, { "epoch": 12.29, "grad_norm": 2.6699376106262207, "learning_rate": 0.00017747237122096157, "loss": 1.5106, "step": 418500 }, { "epoch": 12.3, "grad_norm": 7.037272930145264, "learning_rate": 0.00017732527922963018, "loss": 1.5276, "step": 419000 }, { "epoch": 12.32, "grad_norm": 1.9254209995269775, "learning_rate": 0.00017717818723829882, "loss": 1.4907, "step": 419500 }, { "epoch": 12.33, "grad_norm": 5.405977249145508, "learning_rate": 0.00017703109524696742, "loss": 1.5039, "step": 420000 }, { "epoch": 12.35, "grad_norm": 6.036567211151123, "learning_rate": 0.00017688400325563609, "loss": 1.5258, "step": 420500 }, { "epoch": 12.36, "grad_norm": 3.601088523864746, "learning_rate": 0.0001767369112643047, "loss": 1.5316, "step": 421000 }, { "epoch": 12.38, "grad_norm": 3.247906446456909, "learning_rate": 0.0001765898192729733, "loss": 1.5006, "step": 421500 }, { "epoch": 12.39, "grad_norm": 2.138275384902954, "learning_rate": 0.00017644272728164194, "loss": 1.5125, "step": 422000 }, { "epoch": 12.4, "grad_norm": 2.024502992630005, "learning_rate": 0.00017629563529031054, "loss": 1.5323, "step": 422500 }, { "epoch": 12.42, "grad_norm": 21.49098777770996, "learning_rate": 0.00017614854329897915, "loss": 1.5607, "step": 423000 }, { "epoch": 12.43, "grad_norm": 3.5568199157714844, "learning_rate": 0.00017600145130764779, "loss": 1.4843, "step": 423500 }, { "epoch": 12.45, "grad_norm": 1.6709766387939453, "learning_rate": 0.0001758543593163164, "loss": 1.5621, "step": 424000 }, { "epoch": 12.46, "grad_norm": 2.4930756092071533, "learning_rate": 0.00017570726732498505, "loss": 1.5202, "step": 424500 }, { "epoch": 12.48, "grad_norm": 1.8919436931610107, "learning_rate": 0.00017556017533365366, "loss": 1.4859, "step": 425000 }, { "epoch": 12.49, "grad_norm": 1.2928180694580078, "learning_rate": 0.00017541308334232227, "loss": 1.5423, "step": 425500 }, { "epoch": 12.51, "grad_norm": 3.5068464279174805, "learning_rate": 0.0001752659913509909, "loss": 1.5326, "step": 426000 }, { "epoch": 12.52, "grad_norm": 2.7532618045806885, "learning_rate": 0.0001751188993596595, "loss": 1.5276, "step": 426500 }, { "epoch": 12.54, "grad_norm": 1.5163066387176514, "learning_rate": 0.00017497180736832812, "loss": 1.5306, "step": 427000 }, { "epoch": 12.55, "grad_norm": 21.672080993652344, "learning_rate": 0.00017482471537699675, "loss": 1.5018, "step": 427500 }, { "epoch": 12.57, "grad_norm": 4.169219970703125, "learning_rate": 0.00017467762338566536, "loss": 1.5314, "step": 428000 }, { "epoch": 12.58, "grad_norm": 1.7433300018310547, "learning_rate": 0.00017453053139433402, "loss": 1.5408, "step": 428500 }, { "epoch": 12.6, "grad_norm": 1.6866792440414429, "learning_rate": 0.00017438343940300263, "loss": 1.5405, "step": 429000 }, { "epoch": 12.61, "grad_norm": 12.107425689697266, "learning_rate": 0.00017423634741167124, "loss": 1.5375, "step": 429500 }, { "epoch": 12.62, "grad_norm": 2.0178098678588867, "learning_rate": 0.00017408925542033987, "loss": 1.5137, "step": 430000 }, { "epoch": 12.64, "grad_norm": 3.7658607959747314, "learning_rate": 0.00017394216342900848, "loss": 1.5138, "step": 430500 }, { "epoch": 12.65, "grad_norm": 2.0998480319976807, "learning_rate": 0.00017379507143767712, "loss": 1.5005, "step": 431000 }, { "epoch": 12.67, "grad_norm": 1.7215161323547363, "learning_rate": 0.00017364797944634572, "loss": 1.5083, "step": 431500 }, { "epoch": 12.68, "grad_norm": 3.4021968841552734, "learning_rate": 0.00017350088745501433, "loss": 1.5103, "step": 432000 }, { "epoch": 12.7, "grad_norm": 5.235382080078125, "learning_rate": 0.000173353795463683, "loss": 1.5491, "step": 432500 }, { "epoch": 12.71, "grad_norm": 5.455018997192383, "learning_rate": 0.0001732067034723516, "loss": 1.5486, "step": 433000 }, { "epoch": 12.73, "grad_norm": 2.4046807289123535, "learning_rate": 0.00017305961148102024, "loss": 1.5241, "step": 433500 }, { "epoch": 12.74, "grad_norm": 2.2005653381347656, "learning_rate": 0.00017291251948968884, "loss": 1.5392, "step": 434000 }, { "epoch": 12.76, "grad_norm": 1.7510486841201782, "learning_rate": 0.00017276542749835745, "loss": 1.5076, "step": 434500 }, { "epoch": 12.77, "grad_norm": 2.4069275856018066, "learning_rate": 0.00017261833550702608, "loss": 1.5189, "step": 435000 }, { "epoch": 12.79, "grad_norm": 3.2867445945739746, "learning_rate": 0.0001724712435156947, "loss": 1.5082, "step": 435500 }, { "epoch": 12.8, "grad_norm": 1.321496605873108, "learning_rate": 0.0001723241515243633, "loss": 1.5364, "step": 436000 }, { "epoch": 12.82, "grad_norm": 2.841768980026245, "learning_rate": 0.00017217705953303196, "loss": 1.5062, "step": 436500 }, { "epoch": 12.83, "grad_norm": 2.678611993789673, "learning_rate": 0.00017202996754170057, "loss": 1.5048, "step": 437000 }, { "epoch": 12.85, "grad_norm": 3.197082996368408, "learning_rate": 0.0001718828755503692, "loss": 1.5553, "step": 437500 }, { "epoch": 12.86, "grad_norm": 2.398364543914795, "learning_rate": 0.0001717357835590378, "loss": 1.4866, "step": 438000 }, { "epoch": 12.87, "grad_norm": 2.2497568130493164, "learning_rate": 0.00017158869156770642, "loss": 1.504, "step": 438500 }, { "epoch": 12.89, "grad_norm": 3.9616646766662598, "learning_rate": 0.00017144159957637505, "loss": 1.5049, "step": 439000 }, { "epoch": 12.9, "grad_norm": 1.5139656066894531, "learning_rate": 0.00017129450758504366, "loss": 1.549, "step": 439500 }, { "epoch": 12.92, "grad_norm": 2.305665969848633, "learning_rate": 0.00017114741559371227, "loss": 1.5255, "step": 440000 }, { "epoch": 12.93, "grad_norm": 2.3823301792144775, "learning_rate": 0.00017100032360238093, "loss": 1.4995, "step": 440500 }, { "epoch": 12.95, "grad_norm": 1.6001240015029907, "learning_rate": 0.00017085323161104954, "loss": 1.5494, "step": 441000 }, { "epoch": 12.96, "grad_norm": 2.843163013458252, "learning_rate": 0.00017070613961971817, "loss": 1.5394, "step": 441500 }, { "epoch": 12.98, "grad_norm": 2.1387860774993896, "learning_rate": 0.00017055904762838678, "loss": 1.5456, "step": 442000 }, { "epoch": 12.99, "grad_norm": 4.817657470703125, "learning_rate": 0.0001704119556370554, "loss": 1.5453, "step": 442500 }, { "epoch": 13.01, "grad_norm": 1.376753807067871, "learning_rate": 0.00017026486364572402, "loss": 1.5093, "step": 443000 }, { "epoch": 13.02, "grad_norm": 1.1947274208068848, "learning_rate": 0.00017011777165439263, "loss": 1.457, "step": 443500 }, { "epoch": 13.04, "grad_norm": 3.931847333908081, "learning_rate": 0.00016997067966306124, "loss": 1.4286, "step": 444000 }, { "epoch": 13.05, "grad_norm": 14.636221885681152, "learning_rate": 0.0001698235876717299, "loss": 1.4712, "step": 444500 }, { "epoch": 13.07, "grad_norm": 3.1448845863342285, "learning_rate": 0.0001696764956803985, "loss": 1.4628, "step": 445000 }, { "epoch": 13.08, "grad_norm": 0.9527159929275513, "learning_rate": 0.00016952940368906714, "loss": 1.4179, "step": 445500 }, { "epoch": 13.09, "grad_norm": 2.0560357570648193, "learning_rate": 0.00016938231169773575, "loss": 1.4716, "step": 446000 }, { "epoch": 13.11, "grad_norm": 2.8239283561706543, "learning_rate": 0.00016923521970640436, "loss": 1.477, "step": 446500 }, { "epoch": 13.12, "grad_norm": 3.246338367462158, "learning_rate": 0.000169088127715073, "loss": 1.4739, "step": 447000 }, { "epoch": 13.14, "grad_norm": 4.943697929382324, "learning_rate": 0.0001689410357237416, "loss": 1.4772, "step": 447500 }, { "epoch": 13.15, "grad_norm": 4.003148555755615, "learning_rate": 0.0001687939437324102, "loss": 1.4723, "step": 448000 }, { "epoch": 13.17, "grad_norm": 1.5625710487365723, "learning_rate": 0.00016864685174107887, "loss": 1.4424, "step": 448500 }, { "epoch": 13.18, "grad_norm": 2.3796448707580566, "learning_rate": 0.00016849975974974748, "loss": 1.512, "step": 449000 }, { "epoch": 13.2, "grad_norm": 2.314465284347534, "learning_rate": 0.0001683526677584161, "loss": 1.5012, "step": 449500 }, { "epoch": 13.21, "grad_norm": 1.8954664468765259, "learning_rate": 0.00016820557576708472, "loss": 1.4634, "step": 450000 }, { "epoch": 13.23, "grad_norm": 1.6504507064819336, "learning_rate": 0.00016805848377575333, "loss": 1.496, "step": 450500 }, { "epoch": 13.24, "grad_norm": 1.8685587644577026, "learning_rate": 0.00016791139178442196, "loss": 1.5054, "step": 451000 }, { "epoch": 13.26, "grad_norm": 3.0073273181915283, "learning_rate": 0.00016776429979309057, "loss": 1.4714, "step": 451500 }, { "epoch": 13.27, "grad_norm": 1.5322887897491455, "learning_rate": 0.0001676172078017592, "loss": 1.4683, "step": 452000 }, { "epoch": 13.29, "grad_norm": 10.244864463806152, "learning_rate": 0.00016747011581042784, "loss": 1.4834, "step": 452500 }, { "epoch": 13.3, "grad_norm": 2.7981672286987305, "learning_rate": 0.00016732302381909645, "loss": 1.4919, "step": 453000 }, { "epoch": 13.31, "grad_norm": 1.3389350175857544, "learning_rate": 0.00016717593182776508, "loss": 1.4589, "step": 453500 }, { "epoch": 13.33, "grad_norm": 2.1007466316223145, "learning_rate": 0.0001670288398364337, "loss": 1.4844, "step": 454000 }, { "epoch": 13.34, "grad_norm": 1.9225057363510132, "learning_rate": 0.00016688174784510232, "loss": 1.479, "step": 454500 }, { "epoch": 13.36, "grad_norm": 6.121819972991943, "learning_rate": 0.00016673465585377093, "loss": 1.4713, "step": 455000 }, { "epoch": 13.37, "grad_norm": 2.738173246383667, "learning_rate": 0.00016658756386243954, "loss": 1.5046, "step": 455500 }, { "epoch": 13.39, "grad_norm": 1.3760331869125366, "learning_rate": 0.00016644047187110817, "loss": 1.4919, "step": 456000 }, { "epoch": 13.4, "grad_norm": 9.234387397766113, "learning_rate": 0.0001662933798797768, "loss": 1.4531, "step": 456500 }, { "epoch": 13.42, "grad_norm": 1.6355879306793213, "learning_rate": 0.00016614628788844544, "loss": 1.442, "step": 457000 }, { "epoch": 13.43, "grad_norm": 5.203847885131836, "learning_rate": 0.00016599919589711405, "loss": 1.4672, "step": 457500 }, { "epoch": 13.45, "grad_norm": 5.590662956237793, "learning_rate": 0.00016585210390578266, "loss": 1.459, "step": 458000 }, { "epoch": 13.46, "grad_norm": 1.945825457572937, "learning_rate": 0.0001657050119144513, "loss": 1.4632, "step": 458500 }, { "epoch": 13.48, "grad_norm": 5.772292137145996, "learning_rate": 0.0001655579199231199, "loss": 1.4581, "step": 459000 }, { "epoch": 13.49, "grad_norm": 1.2538419961929321, "learning_rate": 0.0001654108279317885, "loss": 1.4735, "step": 459500 }, { "epoch": 13.51, "grad_norm": 5.774242401123047, "learning_rate": 0.00016526373594045714, "loss": 1.4833, "step": 460000 }, { "epoch": 13.52, "grad_norm": 1.8276828527450562, "learning_rate": 0.00016511664394912578, "loss": 1.5017, "step": 460500 }, { "epoch": 13.54, "grad_norm": 2.792278528213501, "learning_rate": 0.0001649695519577944, "loss": 1.4677, "step": 461000 }, { "epoch": 13.55, "grad_norm": 2.325228691101074, "learning_rate": 0.00016482245996646302, "loss": 1.482, "step": 461500 }, { "epoch": 13.56, "grad_norm": 2.3256492614746094, "learning_rate": 0.00016467536797513163, "loss": 1.4808, "step": 462000 }, { "epoch": 13.58, "grad_norm": 1.2064257860183716, "learning_rate": 0.00016452827598380026, "loss": 1.4695, "step": 462500 }, { "epoch": 13.59, "grad_norm": 3.7189693450927734, "learning_rate": 0.00016438118399246887, "loss": 1.4593, "step": 463000 }, { "epoch": 13.61, "grad_norm": 1.6200836896896362, "learning_rate": 0.00016423409200113748, "loss": 1.5176, "step": 463500 }, { "epoch": 13.62, "grad_norm": 3.3271920680999756, "learning_rate": 0.0001640870000098061, "loss": 1.4941, "step": 464000 }, { "epoch": 13.64, "grad_norm": 1.7106289863586426, "learning_rate": 0.00016393990801847474, "loss": 1.4414, "step": 464500 }, { "epoch": 13.65, "grad_norm": 2.1748921871185303, "learning_rate": 0.00016379281602714338, "loss": 1.4887, "step": 465000 }, { "epoch": 13.67, "grad_norm": 1.4012078046798706, "learning_rate": 0.000163645724035812, "loss": 1.4773, "step": 465500 }, { "epoch": 13.68, "grad_norm": 1.6043895483016968, "learning_rate": 0.0001634986320444806, "loss": 1.5182, "step": 466000 }, { "epoch": 13.7, "grad_norm": 1.53175950050354, "learning_rate": 0.00016335154005314923, "loss": 1.4939, "step": 466500 }, { "epoch": 13.71, "grad_norm": 7.83038854598999, "learning_rate": 0.00016320444806181784, "loss": 1.4997, "step": 467000 }, { "epoch": 13.73, "grad_norm": 20.90247344970703, "learning_rate": 0.00016305735607048644, "loss": 1.4717, "step": 467500 }, { "epoch": 13.74, "grad_norm": 1.7639943361282349, "learning_rate": 0.00016291026407915508, "loss": 1.4682, "step": 468000 }, { "epoch": 13.76, "grad_norm": 1.8345731496810913, "learning_rate": 0.00016276317208782371, "loss": 1.4941, "step": 468500 }, { "epoch": 13.77, "grad_norm": 13.04284381866455, "learning_rate": 0.00016261608009649235, "loss": 1.508, "step": 469000 }, { "epoch": 13.78, "grad_norm": 3.476304531097412, "learning_rate": 0.00016246898810516096, "loss": 1.4849, "step": 469500 }, { "epoch": 13.8, "grad_norm": 76.50082397460938, "learning_rate": 0.00016232189611382956, "loss": 1.5388, "step": 470000 }, { "epoch": 13.81, "grad_norm": 2.03076171875, "learning_rate": 0.0001621748041224982, "loss": 1.4964, "step": 470500 }, { "epoch": 13.83, "grad_norm": 5.736513137817383, "learning_rate": 0.0001620277121311668, "loss": 1.5211, "step": 471000 }, { "epoch": 13.84, "grad_norm": 1.7192264795303345, "learning_rate": 0.0001618806201398354, "loss": 1.4806, "step": 471500 }, { "epoch": 13.86, "grad_norm": 1.4231237173080444, "learning_rate": 0.00016173352814850408, "loss": 1.465, "step": 472000 }, { "epoch": 13.87, "grad_norm": 1.2601704597473145, "learning_rate": 0.00016158643615717268, "loss": 1.5081, "step": 472500 }, { "epoch": 13.89, "grad_norm": 1.2420154809951782, "learning_rate": 0.00016143934416584132, "loss": 1.5014, "step": 473000 }, { "epoch": 13.9, "grad_norm": 2.446563720703125, "learning_rate": 0.00016129225217450993, "loss": 1.4975, "step": 473500 }, { "epoch": 13.92, "grad_norm": 2.2312917709350586, "learning_rate": 0.00016114516018317853, "loss": 1.4827, "step": 474000 }, { "epoch": 13.93, "grad_norm": 2.2286717891693115, "learning_rate": 0.00016099806819184717, "loss": 1.4932, "step": 474500 }, { "epoch": 13.95, "grad_norm": 3.0326056480407715, "learning_rate": 0.00016085097620051577, "loss": 1.5015, "step": 475000 }, { "epoch": 13.96, "grad_norm": 4.350718975067139, "learning_rate": 0.0001607038842091844, "loss": 1.4917, "step": 475500 }, { "epoch": 13.98, "grad_norm": 1.2791180610656738, "learning_rate": 0.00016055679221785304, "loss": 1.5066, "step": 476000 }, { "epoch": 13.99, "grad_norm": 2.8099005222320557, "learning_rate": 0.00016040970022652165, "loss": 1.4746, "step": 476500 }, { "epoch": 14.0, "grad_norm": 2.087524175643921, "learning_rate": 0.0001602626082351903, "loss": 1.4278, "step": 477000 }, { "epoch": 14.02, "grad_norm": 5.598818778991699, "learning_rate": 0.0001601155162438589, "loss": 1.4266, "step": 477500 }, { "epoch": 14.03, "grad_norm": 1.573020100593567, "learning_rate": 0.00015996842425252753, "loss": 1.4198, "step": 478000 }, { "epoch": 14.05, "grad_norm": 2.8413734436035156, "learning_rate": 0.00015982133226119614, "loss": 1.421, "step": 478500 }, { "epoch": 14.06, "grad_norm": 1.3834174871444702, "learning_rate": 0.00015967424026986474, "loss": 1.4446, "step": 479000 }, { "epoch": 14.08, "grad_norm": 1.9328341484069824, "learning_rate": 0.00015952714827853338, "loss": 1.4332, "step": 479500 }, { "epoch": 14.09, "grad_norm": 4.331046104431152, "learning_rate": 0.000159380056287202, "loss": 1.4373, "step": 480000 }, { "epoch": 14.11, "grad_norm": 8.744680404663086, "learning_rate": 0.00015923296429587065, "loss": 1.4372, "step": 480500 }, { "epoch": 14.12, "grad_norm": 1.4437713623046875, "learning_rate": 0.00015908587230453926, "loss": 1.467, "step": 481000 }, { "epoch": 14.14, "grad_norm": 1.5478661060333252, "learning_rate": 0.00015893878031320786, "loss": 1.4209, "step": 481500 }, { "epoch": 14.15, "grad_norm": 12.385702133178711, "learning_rate": 0.0001587916883218765, "loss": 1.4529, "step": 482000 }, { "epoch": 14.17, "grad_norm": 1.585307002067566, "learning_rate": 0.0001586445963305451, "loss": 1.442, "step": 482500 }, { "epoch": 14.18, "grad_norm": 2.214268207550049, "learning_rate": 0.0001584975043392137, "loss": 1.4349, "step": 483000 }, { "epoch": 14.2, "grad_norm": 9.036513328552246, "learning_rate": 0.00015835041234788235, "loss": 1.4179, "step": 483500 }, { "epoch": 14.21, "grad_norm": 2.498725175857544, "learning_rate": 0.00015820332035655098, "loss": 1.4394, "step": 484000 }, { "epoch": 14.23, "grad_norm": 7.920095920562744, "learning_rate": 0.00015805622836521962, "loss": 1.4494, "step": 484500 }, { "epoch": 14.24, "grad_norm": 2.6044135093688965, "learning_rate": 0.00015790913637388822, "loss": 1.4159, "step": 485000 }, { "epoch": 14.25, "grad_norm": 1.906570315361023, "learning_rate": 0.00015776204438255683, "loss": 1.446, "step": 485500 }, { "epoch": 14.27, "grad_norm": 1.8452608585357666, "learning_rate": 0.00015761495239122547, "loss": 1.3961, "step": 486000 }, { "epoch": 14.28, "grad_norm": 1.6884217262268066, "learning_rate": 0.00015746786039989407, "loss": 1.4385, "step": 486500 }, { "epoch": 14.3, "grad_norm": 1.6386513710021973, "learning_rate": 0.00015732076840856268, "loss": 1.3842, "step": 487000 }, { "epoch": 14.31, "grad_norm": 5.906566619873047, "learning_rate": 0.00015717367641723132, "loss": 1.4241, "step": 487500 }, { "epoch": 14.33, "grad_norm": 2.964063882827759, "learning_rate": 0.00015702658442589995, "loss": 1.4454, "step": 488000 }, { "epoch": 14.34, "grad_norm": 1.53037428855896, "learning_rate": 0.00015687949243456859, "loss": 1.4067, "step": 488500 }, { "epoch": 14.36, "grad_norm": 1.5273714065551758, "learning_rate": 0.0001567324004432372, "loss": 1.4382, "step": 489000 }, { "epoch": 14.37, "grad_norm": 1.7097736597061157, "learning_rate": 0.0001565853084519058, "loss": 1.4167, "step": 489500 }, { "epoch": 14.39, "grad_norm": 2.170961380004883, "learning_rate": 0.00015643821646057444, "loss": 1.4541, "step": 490000 }, { "epoch": 14.4, "grad_norm": 5.699864387512207, "learning_rate": 0.00015629112446924304, "loss": 1.407, "step": 490500 }, { "epoch": 14.42, "grad_norm": 1.7430428266525269, "learning_rate": 0.00015614403247791165, "loss": 1.4418, "step": 491000 }, { "epoch": 14.43, "grad_norm": 2.179090976715088, "learning_rate": 0.00015599694048658029, "loss": 1.4331, "step": 491500 }, { "epoch": 14.45, "grad_norm": 1.534424901008606, "learning_rate": 0.00015584984849524892, "loss": 1.4536, "step": 492000 }, { "epoch": 14.46, "grad_norm": 1.464789867401123, "learning_rate": 0.00015570275650391755, "loss": 1.4459, "step": 492500 }, { "epoch": 14.47, "grad_norm": 1.9043675661087036, "learning_rate": 0.00015555566451258616, "loss": 1.4396, "step": 493000 }, { "epoch": 14.49, "grad_norm": 5.7054762840271, "learning_rate": 0.00015540857252125477, "loss": 1.4243, "step": 493500 }, { "epoch": 14.5, "grad_norm": 4.648599624633789, "learning_rate": 0.0001552614805299234, "loss": 1.4607, "step": 494000 }, { "epoch": 14.52, "grad_norm": 1.4880977869033813, "learning_rate": 0.000155114388538592, "loss": 1.4457, "step": 494500 }, { "epoch": 14.53, "grad_norm": 2.5349297523498535, "learning_rate": 0.00015496729654726062, "loss": 1.4675, "step": 495000 }, { "epoch": 14.55, "grad_norm": 3.5980210304260254, "learning_rate": 0.00015482020455592925, "loss": 1.4704, "step": 495500 }, { "epoch": 14.56, "grad_norm": 3.0215611457824707, "learning_rate": 0.0001546731125645979, "loss": 1.4202, "step": 496000 }, { "epoch": 14.58, "grad_norm": 1.6257708072662354, "learning_rate": 0.00015452602057326652, "loss": 1.4569, "step": 496500 }, { "epoch": 14.59, "grad_norm": 3.856419324874878, "learning_rate": 0.00015437892858193513, "loss": 1.4142, "step": 497000 }, { "epoch": 14.61, "grad_norm": 4.544717788696289, "learning_rate": 0.00015423183659060374, "loss": 1.4437, "step": 497500 }, { "epoch": 14.62, "grad_norm": 3.5249671936035156, "learning_rate": 0.00015408474459927237, "loss": 1.4432, "step": 498000 }, { "epoch": 14.64, "grad_norm": 1.8191946744918823, "learning_rate": 0.00015393765260794098, "loss": 1.435, "step": 498500 }, { "epoch": 14.65, "grad_norm": 2.6009254455566406, "learning_rate": 0.00015379056061660962, "loss": 1.411, "step": 499000 }, { "epoch": 14.67, "grad_norm": 2.475637674331665, "learning_rate": 0.00015364346862527822, "loss": 1.4568, "step": 499500 }, { "epoch": 14.68, "grad_norm": 3.965743064880371, "learning_rate": 0.00015349637663394686, "loss": 1.455, "step": 500000 }, { "epoch": 14.69, "grad_norm": 1.3109550476074219, "learning_rate": 0.0001533492846426155, "loss": 1.4547, "step": 500500 }, { "epoch": 14.71, "grad_norm": 2.0032811164855957, "learning_rate": 0.0001532021926512841, "loss": 1.4451, "step": 501000 }, { "epoch": 14.72, "grad_norm": 1.4996896982192993, "learning_rate": 0.00015305510065995274, "loss": 1.4497, "step": 501500 }, { "epoch": 14.74, "grad_norm": 2.1643595695495605, "learning_rate": 0.00015290800866862134, "loss": 1.4335, "step": 502000 }, { "epoch": 14.75, "grad_norm": 1.5510461330413818, "learning_rate": 0.00015276091667728995, "loss": 1.44, "step": 502500 }, { "epoch": 14.77, "grad_norm": 1.9841787815093994, "learning_rate": 0.00015261382468595858, "loss": 1.392, "step": 503000 }, { "epoch": 14.78, "grad_norm": 7.774050712585449, "learning_rate": 0.0001524667326946272, "loss": 1.4521, "step": 503500 }, { "epoch": 14.8, "grad_norm": 1.3330835103988647, "learning_rate": 0.00015231964070329585, "loss": 1.4562, "step": 504000 }, { "epoch": 14.81, "grad_norm": 2.6969919204711914, "learning_rate": 0.00015217254871196446, "loss": 1.4313, "step": 504500 }, { "epoch": 14.83, "grad_norm": 1.6068675518035889, "learning_rate": 0.00015202545672063307, "loss": 1.4515, "step": 505000 }, { "epoch": 14.84, "grad_norm": 3.470465660095215, "learning_rate": 0.0001518783647293017, "loss": 1.418, "step": 505500 }, { "epoch": 14.86, "grad_norm": 5.018684387207031, "learning_rate": 0.0001517312727379703, "loss": 1.4236, "step": 506000 }, { "epoch": 14.87, "grad_norm": 1.603996753692627, "learning_rate": 0.00015158418074663892, "loss": 1.4478, "step": 506500 }, { "epoch": 14.89, "grad_norm": 2.150404930114746, "learning_rate": 0.00015143708875530755, "loss": 1.4523, "step": 507000 }, { "epoch": 14.9, "grad_norm": 1.252964735031128, "learning_rate": 0.00015128999676397616, "loss": 1.4546, "step": 507500 }, { "epoch": 14.92, "grad_norm": 3.1075639724731445, "learning_rate": 0.00015114290477264482, "loss": 1.4547, "step": 508000 }, { "epoch": 14.93, "grad_norm": 1.4597883224487305, "learning_rate": 0.00015099581278131343, "loss": 1.4902, "step": 508500 }, { "epoch": 14.94, "grad_norm": 2.405596971511841, "learning_rate": 0.00015084872078998204, "loss": 1.4306, "step": 509000 }, { "epoch": 14.96, "grad_norm": 1.952222228050232, "learning_rate": 0.00015070162879865067, "loss": 1.4209, "step": 509500 }, { "epoch": 14.97, "grad_norm": 2.3458521366119385, "learning_rate": 0.00015055453680731928, "loss": 1.468, "step": 510000 }, { "epoch": 14.99, "grad_norm": 1.6442558765411377, "learning_rate": 0.0001504074448159879, "loss": 1.4541, "step": 510500 }, { "epoch": 15.0, "grad_norm": 2.1197829246520996, "learning_rate": 0.00015026035282465652, "loss": 1.4195, "step": 511000 }, { "epoch": 15.02, "grad_norm": 2.8808157444000244, "learning_rate": 0.00015011326083332513, "loss": 1.3719, "step": 511500 }, { "epoch": 15.03, "grad_norm": 1.939942717552185, "learning_rate": 0.00014996616884199377, "loss": 1.3627, "step": 512000 }, { "epoch": 15.05, "grad_norm": 4.798866271972656, "learning_rate": 0.00014981907685066237, "loss": 1.3839, "step": 512500 }, { "epoch": 15.06, "grad_norm": 2.553893566131592, "learning_rate": 0.000149671984859331, "loss": 1.4047, "step": 513000 }, { "epoch": 15.08, "grad_norm": 1.915277123451233, "learning_rate": 0.00014952489286799964, "loss": 1.3933, "step": 513500 }, { "epoch": 15.09, "grad_norm": 1.5252777338027954, "learning_rate": 0.00014937780087666825, "loss": 1.3644, "step": 514000 }, { "epoch": 15.11, "grad_norm": 1.6960341930389404, "learning_rate": 0.00014923070888533686, "loss": 1.3946, "step": 514500 }, { "epoch": 15.12, "grad_norm": 9.384785652160645, "learning_rate": 0.0001490836168940055, "loss": 1.3706, "step": 515000 }, { "epoch": 15.14, "grad_norm": 2.280630350112915, "learning_rate": 0.00014893652490267413, "loss": 1.3836, "step": 515500 }, { "epoch": 15.15, "grad_norm": 1.8966234922409058, "learning_rate": 0.00014878943291134273, "loss": 1.373, "step": 516000 }, { "epoch": 15.16, "grad_norm": 2.4201481342315674, "learning_rate": 0.00014864234092001134, "loss": 1.4014, "step": 516500 }, { "epoch": 15.18, "grad_norm": 2.8996338844299316, "learning_rate": 0.00014849524892867998, "loss": 1.3985, "step": 517000 }, { "epoch": 15.19, "grad_norm": 2.2007150650024414, "learning_rate": 0.0001483481569373486, "loss": 1.3823, "step": 517500 }, { "epoch": 15.21, "grad_norm": 2.7956199645996094, "learning_rate": 0.00014820106494601722, "loss": 1.3936, "step": 518000 }, { "epoch": 15.22, "grad_norm": 2.891369104385376, "learning_rate": 0.00014805397295468585, "loss": 1.3991, "step": 518500 }, { "epoch": 15.24, "grad_norm": 7.358971118927002, "learning_rate": 0.00014790688096335446, "loss": 1.4115, "step": 519000 }, { "epoch": 15.25, "grad_norm": 2.1062731742858887, "learning_rate": 0.0001477597889720231, "loss": 1.398, "step": 519500 }, { "epoch": 15.27, "grad_norm": 3.914013385772705, "learning_rate": 0.0001476126969806917, "loss": 1.4129, "step": 520000 }, { "epoch": 15.28, "grad_norm": 1.342411756515503, "learning_rate": 0.00014746560498936034, "loss": 1.4022, "step": 520500 }, { "epoch": 15.3, "grad_norm": 1.3836804628372192, "learning_rate": 0.00014731851299802897, "loss": 1.3849, "step": 521000 }, { "epoch": 15.31, "grad_norm": 4.170617580413818, "learning_rate": 0.00014717142100669758, "loss": 1.3855, "step": 521500 }, { "epoch": 15.33, "grad_norm": 6.060724258422852, "learning_rate": 0.0001470243290153662, "loss": 1.3645, "step": 522000 }, { "epoch": 15.34, "grad_norm": 5.009141445159912, "learning_rate": 0.00014687723702403482, "loss": 1.4106, "step": 522500 }, { "epoch": 15.36, "grad_norm": 4.241628646850586, "learning_rate": 0.00014673014503270346, "loss": 1.3983, "step": 523000 }, { "epoch": 15.37, "grad_norm": 1.6837831735610962, "learning_rate": 0.00014658305304137206, "loss": 1.4023, "step": 523500 }, { "epoch": 15.38, "grad_norm": 2.455502510070801, "learning_rate": 0.00014643596105004067, "loss": 1.3607, "step": 524000 }, { "epoch": 15.4, "grad_norm": 6.113306522369385, "learning_rate": 0.0001462888690587093, "loss": 1.4292, "step": 524500 }, { "epoch": 15.41, "grad_norm": 1.7169700860977173, "learning_rate": 0.00014614177706737794, "loss": 1.364, "step": 525000 }, { "epoch": 15.43, "grad_norm": 3.563976287841797, "learning_rate": 0.00014599468507604655, "loss": 1.4007, "step": 525500 }, { "epoch": 15.44, "grad_norm": 3.6780059337615967, "learning_rate": 0.00014584759308471516, "loss": 1.4072, "step": 526000 }, { "epoch": 15.46, "grad_norm": 1.8734827041625977, "learning_rate": 0.0001457005010933838, "loss": 1.4088, "step": 526500 }, { "epoch": 15.47, "grad_norm": 6.644404411315918, "learning_rate": 0.00014555340910205243, "loss": 1.3958, "step": 527000 }, { "epoch": 15.49, "grad_norm": 1.7657749652862549, "learning_rate": 0.00014540631711072103, "loss": 1.418, "step": 527500 }, { "epoch": 15.5, "grad_norm": 4.092434406280518, "learning_rate": 0.00014525922511938964, "loss": 1.3643, "step": 528000 }, { "epoch": 15.52, "grad_norm": 30.432186126708984, "learning_rate": 0.00014511213312805828, "loss": 1.3797, "step": 528500 }, { "epoch": 15.53, "grad_norm": 10.275367736816406, "learning_rate": 0.0001449650411367269, "loss": 1.3928, "step": 529000 }, { "epoch": 15.55, "grad_norm": 20.94750213623047, "learning_rate": 0.00014481794914539552, "loss": 1.4131, "step": 529500 }, { "epoch": 15.56, "grad_norm": 1.7463383674621582, "learning_rate": 0.00014467085715406413, "loss": 1.4173, "step": 530000 }, { "epoch": 15.58, "grad_norm": 2.380938768386841, "learning_rate": 0.00014452376516273276, "loss": 1.4092, "step": 530500 }, { "epoch": 15.59, "grad_norm": 1.8386043310165405, "learning_rate": 0.0001443766731714014, "loss": 1.3972, "step": 531000 }, { "epoch": 15.61, "grad_norm": 1.3829760551452637, "learning_rate": 0.00014422958118007, "loss": 1.391, "step": 531500 }, { "epoch": 15.62, "grad_norm": 2.171069383621216, "learning_rate": 0.0001440824891887386, "loss": 1.3993, "step": 532000 }, { "epoch": 15.63, "grad_norm": 1.716299057006836, "learning_rate": 0.00014393539719740724, "loss": 1.398, "step": 532500 }, { "epoch": 15.65, "grad_norm": 1.7643611431121826, "learning_rate": 0.00014378830520607588, "loss": 1.3901, "step": 533000 }, { "epoch": 15.66, "grad_norm": 1.68152916431427, "learning_rate": 0.0001436412132147445, "loss": 1.3873, "step": 533500 }, { "epoch": 15.68, "grad_norm": 2.581348419189453, "learning_rate": 0.0001434941212234131, "loss": 1.3713, "step": 534000 }, { "epoch": 15.69, "grad_norm": 3.0933191776275635, "learning_rate": 0.00014334702923208173, "loss": 1.4283, "step": 534500 }, { "epoch": 15.71, "grad_norm": 6.795374870300293, "learning_rate": 0.00014319993724075036, "loss": 1.4088, "step": 535000 }, { "epoch": 15.72, "grad_norm": 2.514035701751709, "learning_rate": 0.00014305284524941897, "loss": 1.4042, "step": 535500 }, { "epoch": 15.74, "grad_norm": 2.5651772022247314, "learning_rate": 0.00014290575325808758, "loss": 1.4236, "step": 536000 }, { "epoch": 15.75, "grad_norm": 1.6859495639801025, "learning_rate": 0.00014275866126675621, "loss": 1.4243, "step": 536500 }, { "epoch": 15.77, "grad_norm": 1.8449592590332031, "learning_rate": 0.00014261156927542485, "loss": 1.406, "step": 537000 }, { "epoch": 15.78, "grad_norm": 2.2886695861816406, "learning_rate": 0.00014246447728409346, "loss": 1.4095, "step": 537500 }, { "epoch": 15.8, "grad_norm": 2.669768810272217, "learning_rate": 0.00014231738529276206, "loss": 1.4172, "step": 538000 }, { "epoch": 15.81, "grad_norm": 5.082691192626953, "learning_rate": 0.0001421702933014307, "loss": 1.4307, "step": 538500 }, { "epoch": 15.83, "grad_norm": 2.9871368408203125, "learning_rate": 0.00014202320131009933, "loss": 1.3937, "step": 539000 }, { "epoch": 15.84, "grad_norm": 1.900804877281189, "learning_rate": 0.00014187610931876794, "loss": 1.4226, "step": 539500 }, { "epoch": 15.85, "grad_norm": 1.668407678604126, "learning_rate": 0.00014172901732743655, "loss": 1.3807, "step": 540000 }, { "epoch": 15.87, "grad_norm": 5.046024799346924, "learning_rate": 0.00014158192533610518, "loss": 1.4203, "step": 540500 }, { "epoch": 15.88, "grad_norm": 2.8824052810668945, "learning_rate": 0.00014143483334477382, "loss": 1.4188, "step": 541000 }, { "epoch": 15.9, "grad_norm": 2.688316583633423, "learning_rate": 0.00014128774135344243, "loss": 1.418, "step": 541500 }, { "epoch": 15.91, "grad_norm": 2.323672294616699, "learning_rate": 0.00014114064936211106, "loss": 1.3825, "step": 542000 }, { "epoch": 15.93, "grad_norm": 23.119873046875, "learning_rate": 0.00014099355737077967, "loss": 1.4148, "step": 542500 }, { "epoch": 15.94, "grad_norm": 3.257922649383545, "learning_rate": 0.0001408464653794483, "loss": 1.4327, "step": 543000 }, { "epoch": 15.96, "grad_norm": 1.8719940185546875, "learning_rate": 0.0001406993733881169, "loss": 1.3868, "step": 543500 }, { "epoch": 15.97, "grad_norm": 2.128316640853882, "learning_rate": 0.00014055228139678554, "loss": 1.4102, "step": 544000 }, { "epoch": 15.99, "grad_norm": 2.277371644973755, "learning_rate": 0.00014040518940545418, "loss": 1.4014, "step": 544500 }, { "epoch": 16.0, "grad_norm": 41.53245162963867, "learning_rate": 0.0001402580974141228, "loss": 1.4178, "step": 545000 }, { "epoch": 16.02, "grad_norm": 1.792492151260376, "learning_rate": 0.0001401110054227914, "loss": 1.3353, "step": 545500 }, { "epoch": 16.03, "grad_norm": 1.582220435142517, "learning_rate": 0.00013996391343146003, "loss": 1.3722, "step": 546000 }, { "epoch": 16.05, "grad_norm": 3.7628824710845947, "learning_rate": 0.00013981682144012866, "loss": 1.3593, "step": 546500 }, { "epoch": 16.06, "grad_norm": 2.833401918411255, "learning_rate": 0.00013966972944879727, "loss": 1.3627, "step": 547000 }, { "epoch": 16.07, "grad_norm": 2.1202030181884766, "learning_rate": 0.00013952263745746588, "loss": 1.3528, "step": 547500 }, { "epoch": 16.09, "grad_norm": 1.919555425643921, "learning_rate": 0.0001393755454661345, "loss": 1.3087, "step": 548000 }, { "epoch": 16.1, "grad_norm": 2.3834056854248047, "learning_rate": 0.00013922845347480315, "loss": 1.3303, "step": 548500 }, { "epoch": 16.12, "grad_norm": 2.0871472358703613, "learning_rate": 0.00013908136148347176, "loss": 1.3423, "step": 549000 }, { "epoch": 16.13, "grad_norm": 3.040555953979492, "learning_rate": 0.00013893426949214036, "loss": 1.3402, "step": 549500 }, { "epoch": 16.15, "grad_norm": 2.8366496562957764, "learning_rate": 0.000138787177500809, "loss": 1.3586, "step": 550000 }, { "epoch": 16.16, "grad_norm": 10.008976936340332, "learning_rate": 0.00013864008550947763, "loss": 1.3321, "step": 550500 }, { "epoch": 16.18, "grad_norm": 8.792502403259277, "learning_rate": 0.00013849299351814624, "loss": 1.3781, "step": 551000 }, { "epoch": 16.19, "grad_norm": 8.872962951660156, "learning_rate": 0.00013834590152681485, "loss": 1.3687, "step": 551500 }, { "epoch": 16.21, "grad_norm": 5.631560802459717, "learning_rate": 0.00013819880953548348, "loss": 1.3419, "step": 552000 }, { "epoch": 16.22, "grad_norm": 2.04437255859375, "learning_rate": 0.00013805171754415212, "loss": 1.3335, "step": 552500 }, { "epoch": 16.24, "grad_norm": 2.526149272918701, "learning_rate": 0.00013790462555282072, "loss": 1.3446, "step": 553000 }, { "epoch": 16.25, "grad_norm": 8.970195770263672, "learning_rate": 0.00013775753356148933, "loss": 1.3795, "step": 553500 }, { "epoch": 16.27, "grad_norm": 4.344628810882568, "learning_rate": 0.00013761044157015797, "loss": 1.3231, "step": 554000 }, { "epoch": 16.28, "grad_norm": 2.5846548080444336, "learning_rate": 0.0001374633495788266, "loss": 1.3571, "step": 554500 }, { "epoch": 16.3, "grad_norm": 20.92795181274414, "learning_rate": 0.0001373162575874952, "loss": 1.38, "step": 555000 }, { "epoch": 16.31, "grad_norm": 2.3088529109954834, "learning_rate": 0.00013716916559616382, "loss": 1.3473, "step": 555500 }, { "epoch": 16.32, "grad_norm": 13.217586517333984, "learning_rate": 0.00013702207360483245, "loss": 1.3558, "step": 556000 }, { "epoch": 16.34, "grad_norm": 8.642449378967285, "learning_rate": 0.00013687498161350109, "loss": 1.382, "step": 556500 }, { "epoch": 16.35, "grad_norm": 1.4824799299240112, "learning_rate": 0.0001367278896221697, "loss": 1.384, "step": 557000 }, { "epoch": 16.37, "grad_norm": 1.741585373878479, "learning_rate": 0.0001365807976308383, "loss": 1.3664, "step": 557500 }, { "epoch": 16.38, "grad_norm": 1.7038291692733765, "learning_rate": 0.00013643370563950694, "loss": 1.3634, "step": 558000 }, { "epoch": 16.4, "grad_norm": 17.11383628845215, "learning_rate": 0.00013628661364817557, "loss": 1.3604, "step": 558500 }, { "epoch": 16.41, "grad_norm": 19.561166763305664, "learning_rate": 0.00013613952165684418, "loss": 1.3731, "step": 559000 }, { "epoch": 16.43, "grad_norm": 2.2067980766296387, "learning_rate": 0.00013599242966551279, "loss": 1.3472, "step": 559500 }, { "epoch": 16.44, "grad_norm": 3.9638609886169434, "learning_rate": 0.00013584533767418142, "loss": 1.365, "step": 560000 }, { "epoch": 16.46, "grad_norm": 2.2947542667388916, "learning_rate": 0.00013569824568285006, "loss": 1.3618, "step": 560500 }, { "epoch": 16.47, "grad_norm": 2.3892598152160645, "learning_rate": 0.00013555115369151866, "loss": 1.3331, "step": 561000 }, { "epoch": 16.49, "grad_norm": 1.9236092567443848, "learning_rate": 0.00013540406170018727, "loss": 1.3508, "step": 561500 }, { "epoch": 16.5, "grad_norm": 5.180337429046631, "learning_rate": 0.0001352569697088559, "loss": 1.3612, "step": 562000 }, { "epoch": 16.52, "grad_norm": 8.786672592163086, "learning_rate": 0.00013510987771752454, "loss": 1.3776, "step": 562500 }, { "epoch": 16.53, "grad_norm": 4.111878871917725, "learning_rate": 0.00013496278572619315, "loss": 1.3661, "step": 563000 }, { "epoch": 16.54, "grad_norm": 4.706780433654785, "learning_rate": 0.00013481569373486178, "loss": 1.3731, "step": 563500 }, { "epoch": 16.56, "grad_norm": 1.3788596391677856, "learning_rate": 0.0001346686017435304, "loss": 1.3316, "step": 564000 }, { "epoch": 16.57, "grad_norm": 2.970449924468994, "learning_rate": 0.00013452150975219902, "loss": 1.3612, "step": 564500 }, { "epoch": 16.59, "grad_norm": 2.0503463745117188, "learning_rate": 0.00013437441776086763, "loss": 1.3635, "step": 565000 }, { "epoch": 16.6, "grad_norm": 10.668388366699219, "learning_rate": 0.00013422732576953627, "loss": 1.3764, "step": 565500 }, { "epoch": 16.62, "grad_norm": 8.57248592376709, "learning_rate": 0.00013408023377820487, "loss": 1.3521, "step": 566000 }, { "epoch": 16.63, "grad_norm": 1.6269396543502808, "learning_rate": 0.0001339331417868735, "loss": 1.3406, "step": 566500 }, { "epoch": 16.65, "grad_norm": 1.764863133430481, "learning_rate": 0.00013378604979554212, "loss": 1.3325, "step": 567000 }, { "epoch": 16.66, "grad_norm": 1.6542813777923584, "learning_rate": 0.00013363895780421075, "loss": 1.3601, "step": 567500 }, { "epoch": 16.68, "grad_norm": 1.9206827878952026, "learning_rate": 0.00013349186581287936, "loss": 1.3515, "step": 568000 }, { "epoch": 16.69, "grad_norm": 2.092914581298828, "learning_rate": 0.000133344773821548, "loss": 1.3617, "step": 568500 }, { "epoch": 16.71, "grad_norm": 6.570430278778076, "learning_rate": 0.0001331976818302166, "loss": 1.3728, "step": 569000 }, { "epoch": 16.72, "grad_norm": 2.719400644302368, "learning_rate": 0.00013305058983888524, "loss": 1.3406, "step": 569500 }, { "epoch": 16.74, "grad_norm": 9.820898056030273, "learning_rate": 0.00013290349784755384, "loss": 1.4036, "step": 570000 }, { "epoch": 16.75, "grad_norm": 2.3756299018859863, "learning_rate": 0.00013275640585622248, "loss": 1.3735, "step": 570500 }, { "epoch": 16.76, "grad_norm": 6.200007438659668, "learning_rate": 0.00013260931386489109, "loss": 1.3792, "step": 571000 }, { "epoch": 16.78, "grad_norm": 7.884439468383789, "learning_rate": 0.00013246222187355972, "loss": 1.3666, "step": 571500 }, { "epoch": 16.79, "grad_norm": 2.011915683746338, "learning_rate": 0.00013231512988222833, "loss": 1.3541, "step": 572000 }, { "epoch": 16.81, "grad_norm": 6.167238712310791, "learning_rate": 0.00013216803789089696, "loss": 1.3781, "step": 572500 }, { "epoch": 16.82, "grad_norm": 38.23750686645508, "learning_rate": 0.00013202094589956557, "loss": 1.3546, "step": 573000 }, { "epoch": 16.84, "grad_norm": 2.6406400203704834, "learning_rate": 0.0001318738539082342, "loss": 1.3959, "step": 573500 }, { "epoch": 16.85, "grad_norm": 1.713273525238037, "learning_rate": 0.0001317267619169028, "loss": 1.3625, "step": 574000 }, { "epoch": 16.87, "grad_norm": 2.819561004638672, "learning_rate": 0.00013157966992557145, "loss": 1.354, "step": 574500 }, { "epoch": 16.88, "grad_norm": 5.471235275268555, "learning_rate": 0.00013143257793424005, "loss": 1.3988, "step": 575000 }, { "epoch": 16.9, "grad_norm": 2.4000468254089355, "learning_rate": 0.0001312854859429087, "loss": 1.3432, "step": 575500 }, { "epoch": 16.91, "grad_norm": 2.053870439529419, "learning_rate": 0.0001311383939515773, "loss": 1.3626, "step": 576000 }, { "epoch": 16.93, "grad_norm": 1.577664852142334, "learning_rate": 0.00013099130196024593, "loss": 1.3806, "step": 576500 }, { "epoch": 16.94, "grad_norm": 16.11113166809082, "learning_rate": 0.00013084420996891454, "loss": 1.3787, "step": 577000 }, { "epoch": 16.96, "grad_norm": 4.257967948913574, "learning_rate": 0.00013069711797758317, "loss": 1.355, "step": 577500 }, { "epoch": 16.97, "grad_norm": 1.8505833148956299, "learning_rate": 0.00013055002598625178, "loss": 1.3702, "step": 578000 }, { "epoch": 16.98, "grad_norm": 2.1093640327453613, "learning_rate": 0.00013040293399492042, "loss": 1.3775, "step": 578500 }, { "epoch": 17.0, "grad_norm": 1.8386383056640625, "learning_rate": 0.00013025584200358902, "loss": 1.3776, "step": 579000 }, { "epoch": 17.01, "grad_norm": 7.34138298034668, "learning_rate": 0.00013010875001225766, "loss": 1.3215, "step": 579500 }, { "epoch": 17.03, "grad_norm": 1.0172581672668457, "learning_rate": 0.00012996165802092627, "loss": 1.3147, "step": 580000 }, { "epoch": 17.04, "grad_norm": 2.6336045265197754, "learning_rate": 0.0001298145660295949, "loss": 1.3365, "step": 580500 }, { "epoch": 17.06, "grad_norm": 3.9906227588653564, "learning_rate": 0.0001296674740382635, "loss": 1.3245, "step": 581000 }, { "epoch": 17.07, "grad_norm": 3.1087284088134766, "learning_rate": 0.00012952038204693214, "loss": 1.2864, "step": 581500 }, { "epoch": 17.09, "grad_norm": 2.5867342948913574, "learning_rate": 0.00012937329005560075, "loss": 1.2947, "step": 582000 }, { "epoch": 17.1, "grad_norm": 1.2118226289749146, "learning_rate": 0.00012922619806426938, "loss": 1.3, "step": 582500 }, { "epoch": 17.12, "grad_norm": 2.56510329246521, "learning_rate": 0.000129079106072938, "loss": 1.3147, "step": 583000 }, { "epoch": 17.13, "grad_norm": 10.831042289733887, "learning_rate": 0.00012893201408160663, "loss": 1.3125, "step": 583500 }, { "epoch": 17.15, "grad_norm": 10.36989688873291, "learning_rate": 0.00012878492209027523, "loss": 1.2969, "step": 584000 }, { "epoch": 17.16, "grad_norm": 2.0238804817199707, "learning_rate": 0.00012863783009894387, "loss": 1.3044, "step": 584500 }, { "epoch": 17.18, "grad_norm": 4.501575469970703, "learning_rate": 0.00012849073810761248, "loss": 1.3013, "step": 585000 }, { "epoch": 17.19, "grad_norm": 4.236315727233887, "learning_rate": 0.0001283436461162811, "loss": 1.2991, "step": 585500 }, { "epoch": 17.21, "grad_norm": 4.139219760894775, "learning_rate": 0.00012819655412494972, "loss": 1.2979, "step": 586000 }, { "epoch": 17.22, "grad_norm": 2.1206071376800537, "learning_rate": 0.00012804946213361835, "loss": 1.3377, "step": 586500 }, { "epoch": 17.23, "grad_norm": 1.7728540897369385, "learning_rate": 0.000127902370142287, "loss": 1.2992, "step": 587000 }, { "epoch": 17.25, "grad_norm": 1.9550994634628296, "learning_rate": 0.0001277552781509556, "loss": 1.3311, "step": 587500 }, { "epoch": 17.26, "grad_norm": 1.6412031650543213, "learning_rate": 0.00012760818615962423, "loss": 1.3101, "step": 588000 }, { "epoch": 17.28, "grad_norm": 1.6908353567123413, "learning_rate": 0.00012746109416829284, "loss": 1.3357, "step": 588500 }, { "epoch": 17.29, "grad_norm": 1.6260554790496826, "learning_rate": 0.00012731400217696147, "loss": 1.3128, "step": 589000 }, { "epoch": 17.31, "grad_norm": 4.529758453369141, "learning_rate": 0.00012716691018563008, "loss": 1.3282, "step": 589500 }, { "epoch": 17.32, "grad_norm": 1.872877836227417, "learning_rate": 0.00012701981819429871, "loss": 1.3271, "step": 590000 }, { "epoch": 17.34, "grad_norm": 3.3399105072021484, "learning_rate": 0.00012687272620296732, "loss": 1.3104, "step": 590500 }, { "epoch": 17.35, "grad_norm": 3.178553819656372, "learning_rate": 0.00012672563421163596, "loss": 1.3222, "step": 591000 }, { "epoch": 17.37, "grad_norm": 2.042067527770996, "learning_rate": 0.00012657854222030456, "loss": 1.332, "step": 591500 }, { "epoch": 17.38, "grad_norm": 2.1601064205169678, "learning_rate": 0.0001264314502289732, "loss": 1.3198, "step": 592000 }, { "epoch": 17.4, "grad_norm": 1.6580477952957153, "learning_rate": 0.0001262843582376418, "loss": 1.3059, "step": 592500 }, { "epoch": 17.41, "grad_norm": 3.175902843475342, "learning_rate": 0.00012613726624631044, "loss": 1.3262, "step": 593000 }, { "epoch": 17.43, "grad_norm": 2.7562525272369385, "learning_rate": 0.00012599017425497905, "loss": 1.3389, "step": 593500 }, { "epoch": 17.44, "grad_norm": 1.7579740285873413, "learning_rate": 0.00012584308226364768, "loss": 1.3308, "step": 594000 }, { "epoch": 17.45, "grad_norm": 4.593905448913574, "learning_rate": 0.0001256959902723163, "loss": 1.3059, "step": 594500 }, { "epoch": 17.47, "grad_norm": 1.438839077949524, "learning_rate": 0.00012554889828098493, "loss": 1.3159, "step": 595000 }, { "epoch": 17.48, "grad_norm": 2.5471925735473633, "learning_rate": 0.00012540180628965353, "loss": 1.2989, "step": 595500 }, { "epoch": 17.5, "grad_norm": 1.8019795417785645, "learning_rate": 0.00012525471429832217, "loss": 1.3441, "step": 596000 }, { "epoch": 17.51, "grad_norm": 2.0826618671417236, "learning_rate": 0.00012510762230699078, "loss": 1.3309, "step": 596500 }, { "epoch": 17.53, "grad_norm": 1.820566177368164, "learning_rate": 0.0001249605303156594, "loss": 1.3224, "step": 597000 }, { "epoch": 17.54, "grad_norm": 24.11446762084961, "learning_rate": 0.00012481343832432802, "loss": 1.2934, "step": 597500 }, { "epoch": 17.56, "grad_norm": 1.5428298711776733, "learning_rate": 0.00012466634633299665, "loss": 1.3277, "step": 598000 }, { "epoch": 17.57, "grad_norm": 2.171504259109497, "learning_rate": 0.00012451925434166526, "loss": 1.3365, "step": 598500 }, { "epoch": 17.59, "grad_norm": 2.862025022506714, "learning_rate": 0.0001243721623503339, "loss": 1.3351, "step": 599000 }, { "epoch": 17.6, "grad_norm": 2.7436113357543945, "learning_rate": 0.0001242250703590025, "loss": 1.3203, "step": 599500 }, { "epoch": 17.62, "grad_norm": 1.9902766942977905, "learning_rate": 0.00012407797836767114, "loss": 1.3324, "step": 600000 }, { "epoch": 17.63, "grad_norm": 3.6419193744659424, "learning_rate": 0.00012393088637633975, "loss": 1.347, "step": 600500 }, { "epoch": 17.65, "grad_norm": 1.8328200578689575, "learning_rate": 0.00012378379438500838, "loss": 1.3261, "step": 601000 }, { "epoch": 17.66, "grad_norm": 3.2816476821899414, "learning_rate": 0.000123636702393677, "loss": 1.3228, "step": 601500 }, { "epoch": 17.67, "grad_norm": 1.142104148864746, "learning_rate": 0.00012348961040234562, "loss": 1.3183, "step": 602000 }, { "epoch": 17.69, "grad_norm": 2.1242527961730957, "learning_rate": 0.00012334251841101423, "loss": 1.3162, "step": 602500 }, { "epoch": 17.7, "grad_norm": 9.647717475891113, "learning_rate": 0.00012319542641968286, "loss": 1.3473, "step": 603000 }, { "epoch": 17.72, "grad_norm": 3.551119804382324, "learning_rate": 0.00012304833442835147, "loss": 1.3513, "step": 603500 }, { "epoch": 17.73, "grad_norm": 1.6903916597366333, "learning_rate": 0.0001229012424370201, "loss": 1.3393, "step": 604000 }, { "epoch": 17.75, "grad_norm": 2.54724383354187, "learning_rate": 0.00012275415044568871, "loss": 1.3402, "step": 604500 }, { "epoch": 17.76, "grad_norm": 5.089727878570557, "learning_rate": 0.00012260705845435735, "loss": 1.303, "step": 605000 }, { "epoch": 17.78, "grad_norm": 12.237126350402832, "learning_rate": 0.00012245996646302596, "loss": 1.3319, "step": 605500 }, { "epoch": 17.79, "grad_norm": 1.576462745666504, "learning_rate": 0.0001223128744716946, "loss": 1.3265, "step": 606000 }, { "epoch": 17.81, "grad_norm": 1.7792675495147705, "learning_rate": 0.0001221657824803632, "loss": 1.3335, "step": 606500 }, { "epoch": 17.82, "grad_norm": 5.532106876373291, "learning_rate": 0.00012201869048903183, "loss": 1.3456, "step": 607000 }, { "epoch": 17.84, "grad_norm": 3.333435297012329, "learning_rate": 0.00012187159849770044, "loss": 1.3332, "step": 607500 }, { "epoch": 17.85, "grad_norm": 3.1190874576568604, "learning_rate": 0.00012172450650636908, "loss": 1.3368, "step": 608000 }, { "epoch": 17.87, "grad_norm": 1.656201720237732, "learning_rate": 0.0001215774145150377, "loss": 1.3266, "step": 608500 }, { "epoch": 17.88, "grad_norm": 2.088050365447998, "learning_rate": 0.00012143032252370632, "loss": 1.3128, "step": 609000 }, { "epoch": 17.9, "grad_norm": 2.941950798034668, "learning_rate": 0.00012128323053237493, "loss": 1.3393, "step": 609500 }, { "epoch": 17.91, "grad_norm": 1.8323218822479248, "learning_rate": 0.00012113613854104356, "loss": 1.3255, "step": 610000 }, { "epoch": 17.92, "grad_norm": 1.3735178709030151, "learning_rate": 0.00012098904654971218, "loss": 1.3095, "step": 610500 }, { "epoch": 17.94, "grad_norm": 2.9366865158081055, "learning_rate": 0.0001208419545583808, "loss": 1.3401, "step": 611000 }, { "epoch": 17.95, "grad_norm": 3.2511463165283203, "learning_rate": 0.00012069486256704941, "loss": 1.3396, "step": 611500 }, { "epoch": 17.97, "grad_norm": 2.2338857650756836, "learning_rate": 0.00012054777057571804, "loss": 1.3286, "step": 612000 }, { "epoch": 17.98, "grad_norm": 1.5889378786087036, "learning_rate": 0.00012040067858438667, "loss": 1.3163, "step": 612500 }, { "epoch": 18.0, "grad_norm": 2.792966365814209, "learning_rate": 0.00012025358659305529, "loss": 1.2973, "step": 613000 }, { "epoch": 18.01, "grad_norm": 2.239032745361328, "learning_rate": 0.00012010649460172391, "loss": 1.3099, "step": 613500 }, { "epoch": 18.03, "grad_norm": 2.293813705444336, "learning_rate": 0.00011995940261039253, "loss": 1.2568, "step": 614000 }, { "epoch": 18.04, "grad_norm": 2.175294876098633, "learning_rate": 0.00011981231061906115, "loss": 1.2661, "step": 614500 }, { "epoch": 18.06, "grad_norm": 1.7673249244689941, "learning_rate": 0.00011966521862772977, "loss": 1.2747, "step": 615000 }, { "epoch": 18.07, "grad_norm": 2.6049957275390625, "learning_rate": 0.00011951812663639839, "loss": 1.2851, "step": 615500 }, { "epoch": 18.09, "grad_norm": 2.0433642864227295, "learning_rate": 0.00011937103464506703, "loss": 1.2846, "step": 616000 }, { "epoch": 18.1, "grad_norm": 2.120561122894287, "learning_rate": 0.00011922394265373563, "loss": 1.2634, "step": 616500 }, { "epoch": 18.12, "grad_norm": 2.6130003929138184, "learning_rate": 0.00011907685066240426, "loss": 1.2844, "step": 617000 }, { "epoch": 18.13, "grad_norm": 1.8312240839004517, "learning_rate": 0.00011892975867107288, "loss": 1.289, "step": 617500 }, { "epoch": 18.14, "grad_norm": 1.5402841567993164, "learning_rate": 0.00011878266667974151, "loss": 1.2578, "step": 618000 }, { "epoch": 18.16, "grad_norm": 2.0804052352905273, "learning_rate": 0.00011863557468841012, "loss": 1.2558, "step": 618500 }, { "epoch": 18.17, "grad_norm": 1.77811861038208, "learning_rate": 0.00011848848269707874, "loss": 1.2853, "step": 619000 }, { "epoch": 18.19, "grad_norm": 6.067068099975586, "learning_rate": 0.00011834139070574736, "loss": 1.2941, "step": 619500 }, { "epoch": 18.2, "grad_norm": 2.4684622287750244, "learning_rate": 0.000118194298714416, "loss": 1.2749, "step": 620000 }, { "epoch": 18.22, "grad_norm": 3.7913448810577393, "learning_rate": 0.0001180472067230846, "loss": 1.298, "step": 620500 }, { "epoch": 18.23, "grad_norm": 1.343802571296692, "learning_rate": 0.00011790011473175322, "loss": 1.2954, "step": 621000 }, { "epoch": 18.25, "grad_norm": 1.797194480895996, "learning_rate": 0.00011775302274042185, "loss": 1.2718, "step": 621500 }, { "epoch": 18.26, "grad_norm": 2.3011558055877686, "learning_rate": 0.00011760593074909048, "loss": 1.2966, "step": 622000 }, { "epoch": 18.28, "grad_norm": 2.5689167976379395, "learning_rate": 0.00011745883875775909, "loss": 1.286, "step": 622500 }, { "epoch": 18.29, "grad_norm": 8.428597450256348, "learning_rate": 0.00011731174676642771, "loss": 1.2872, "step": 623000 }, { "epoch": 18.31, "grad_norm": 1.1936590671539307, "learning_rate": 0.00011716465477509633, "loss": 1.2621, "step": 623500 }, { "epoch": 18.32, "grad_norm": 1.5052251815795898, "learning_rate": 0.00011701756278376497, "loss": 1.2758, "step": 624000 }, { "epoch": 18.34, "grad_norm": 1.296823263168335, "learning_rate": 0.00011687047079243357, "loss": 1.266, "step": 624500 }, { "epoch": 18.35, "grad_norm": 3.121631383895874, "learning_rate": 0.0001167233788011022, "loss": 1.2858, "step": 625000 }, { "epoch": 18.36, "grad_norm": 7.022789478302002, "learning_rate": 0.00011657628680977081, "loss": 1.2986, "step": 625500 }, { "epoch": 18.38, "grad_norm": 11.550426483154297, "learning_rate": 0.00011642919481843945, "loss": 1.2736, "step": 626000 }, { "epoch": 18.39, "grad_norm": 2.277326822280884, "learning_rate": 0.00011628210282710807, "loss": 1.3052, "step": 626500 }, { "epoch": 18.41, "grad_norm": 3.439568519592285, "learning_rate": 0.00011613501083577668, "loss": 1.3042, "step": 627000 }, { "epoch": 18.42, "grad_norm": 8.868010520935059, "learning_rate": 0.0001159879188444453, "loss": 1.2612, "step": 627500 }, { "epoch": 18.44, "grad_norm": 1.5216681957244873, "learning_rate": 0.00011584082685311393, "loss": 1.2634, "step": 628000 }, { "epoch": 18.45, "grad_norm": 2.113112688064575, "learning_rate": 0.00011569373486178256, "loss": 1.2681, "step": 628500 }, { "epoch": 18.47, "grad_norm": 2.9850034713745117, "learning_rate": 0.00011554664287045116, "loss": 1.2865, "step": 629000 }, { "epoch": 18.48, "grad_norm": 2.309042453765869, "learning_rate": 0.00011539955087911978, "loss": 1.2674, "step": 629500 }, { "epoch": 18.5, "grad_norm": 3.1562564373016357, "learning_rate": 0.00011525245888778842, "loss": 1.262, "step": 630000 }, { "epoch": 18.51, "grad_norm": 3.4192593097686768, "learning_rate": 0.00011510536689645704, "loss": 1.2729, "step": 630500 }, { "epoch": 18.53, "grad_norm": 2.193237543106079, "learning_rate": 0.00011495827490512565, "loss": 1.2586, "step": 631000 }, { "epoch": 18.54, "grad_norm": 8.091324806213379, "learning_rate": 0.00011481118291379427, "loss": 1.2649, "step": 631500 }, { "epoch": 18.56, "grad_norm": 2.0321621894836426, "learning_rate": 0.0001146640909224629, "loss": 1.2792, "step": 632000 }, { "epoch": 18.57, "grad_norm": 1.623028039932251, "learning_rate": 0.00011451699893113152, "loss": 1.3013, "step": 632500 }, { "epoch": 18.59, "grad_norm": 2.7462871074676514, "learning_rate": 0.00011436990693980013, "loss": 1.2858, "step": 633000 }, { "epoch": 18.6, "grad_norm": 22.743488311767578, "learning_rate": 0.00011422281494846875, "loss": 1.2739, "step": 633500 }, { "epoch": 18.61, "grad_norm": 4.659852981567383, "learning_rate": 0.00011407572295713739, "loss": 1.2773, "step": 634000 }, { "epoch": 18.63, "grad_norm": 1.8134876489639282, "learning_rate": 0.00011392863096580601, "loss": 1.274, "step": 634500 }, { "epoch": 18.64, "grad_norm": 2.266272783279419, "learning_rate": 0.00011378153897447463, "loss": 1.3084, "step": 635000 }, { "epoch": 18.66, "grad_norm": 1.4625264406204224, "learning_rate": 0.00011363444698314324, "loss": 1.2794, "step": 635500 }, { "epoch": 18.67, "grad_norm": 1.57483971118927, "learning_rate": 0.00011348735499181187, "loss": 1.2872, "step": 636000 }, { "epoch": 18.69, "grad_norm": 1.3935645818710327, "learning_rate": 0.00011334026300048049, "loss": 1.2809, "step": 636500 }, { "epoch": 18.7, "grad_norm": 3.3207247257232666, "learning_rate": 0.00011319317100914911, "loss": 1.2764, "step": 637000 }, { "epoch": 18.72, "grad_norm": 4.18394660949707, "learning_rate": 0.00011304607901781772, "loss": 1.2855, "step": 637500 }, { "epoch": 18.73, "grad_norm": 1.6960937976837158, "learning_rate": 0.00011289898702648636, "loss": 1.2807, "step": 638000 }, { "epoch": 18.75, "grad_norm": 6.864727020263672, "learning_rate": 0.00011275189503515498, "loss": 1.2736, "step": 638500 }, { "epoch": 18.76, "grad_norm": 1.7619363069534302, "learning_rate": 0.0001126048030438236, "loss": 1.3063, "step": 639000 }, { "epoch": 18.78, "grad_norm": 1.6224156618118286, "learning_rate": 0.0001124577110524922, "loss": 1.3031, "step": 639500 }, { "epoch": 18.79, "grad_norm": 4.564239025115967, "learning_rate": 0.00011231061906116084, "loss": 1.3185, "step": 640000 }, { "epoch": 18.81, "grad_norm": 2.237443208694458, "learning_rate": 0.00011216352706982946, "loss": 1.2612, "step": 640500 }, { "epoch": 18.82, "grad_norm": 1.7501612901687622, "learning_rate": 0.00011201643507849808, "loss": 1.278, "step": 641000 }, { "epoch": 18.83, "grad_norm": 1.1969166994094849, "learning_rate": 0.00011186934308716669, "loss": 1.2979, "step": 641500 }, { "epoch": 18.85, "grad_norm": 2.23476505279541, "learning_rate": 0.00011172225109583533, "loss": 1.2796, "step": 642000 }, { "epoch": 18.86, "grad_norm": 3.1486740112304688, "learning_rate": 0.00011157515910450395, "loss": 1.3212, "step": 642500 }, { "epoch": 18.88, "grad_norm": 3.5119261741638184, "learning_rate": 0.00011142806711317257, "loss": 1.3082, "step": 643000 }, { "epoch": 18.89, "grad_norm": 3.0694100856781006, "learning_rate": 0.00011128097512184118, "loss": 1.2643, "step": 643500 }, { "epoch": 18.91, "grad_norm": 2.235492467880249, "learning_rate": 0.00011113388313050981, "loss": 1.2832, "step": 644000 }, { "epoch": 18.92, "grad_norm": 2.627898931503296, "learning_rate": 0.00011098679113917843, "loss": 1.2863, "step": 644500 }, { "epoch": 18.94, "grad_norm": 1.390758991241455, "learning_rate": 0.00011083969914784705, "loss": 1.2949, "step": 645000 }, { "epoch": 18.95, "grad_norm": 1.9412230253219604, "learning_rate": 0.00011069260715651567, "loss": 1.2765, "step": 645500 }, { "epoch": 18.97, "grad_norm": 7.697941780090332, "learning_rate": 0.0001105455151651843, "loss": 1.2872, "step": 646000 }, { "epoch": 18.98, "grad_norm": 2.0770368576049805, "learning_rate": 0.00011039842317385292, "loss": 1.3058, "step": 646500 }, { "epoch": 19.0, "grad_norm": 1.8311492204666138, "learning_rate": 0.00011025133118252154, "loss": 1.3229, "step": 647000 }, { "epoch": 19.01, "grad_norm": 2.744004726409912, "learning_rate": 0.00011010423919119017, "loss": 1.253, "step": 647500 }, { "epoch": 19.03, "grad_norm": 2.0883147716522217, "learning_rate": 0.00010995714719985879, "loss": 1.2348, "step": 648000 }, { "epoch": 19.04, "grad_norm": 13.648962020874023, "learning_rate": 0.0001098100552085274, "loss": 1.2572, "step": 648500 }, { "epoch": 19.05, "grad_norm": 3.89188814163208, "learning_rate": 0.00010966296321719602, "loss": 1.2312, "step": 649000 }, { "epoch": 19.07, "grad_norm": 12.498522758483887, "learning_rate": 0.00010951587122586466, "loss": 1.2285, "step": 649500 }, { "epoch": 19.08, "grad_norm": 1.9123058319091797, "learning_rate": 0.00010936877923453328, "loss": 1.2223, "step": 650000 }, { "epoch": 19.1, "grad_norm": 1.9629552364349365, "learning_rate": 0.00010922168724320188, "loss": 1.2323, "step": 650500 }, { "epoch": 19.11, "grad_norm": 16.135618209838867, "learning_rate": 0.0001090745952518705, "loss": 1.1976, "step": 651000 }, { "epoch": 19.13, "grad_norm": 2.948089361190796, "learning_rate": 0.00010892750326053914, "loss": 1.2367, "step": 651500 }, { "epoch": 19.14, "grad_norm": 2.4549195766448975, "learning_rate": 0.00010878041126920776, "loss": 1.2345, "step": 652000 }, { "epoch": 19.16, "grad_norm": 1.7298622131347656, "learning_rate": 0.00010863331927787637, "loss": 1.2477, "step": 652500 }, { "epoch": 19.17, "grad_norm": 3.237170696258545, "learning_rate": 0.00010848622728654499, "loss": 1.2578, "step": 653000 }, { "epoch": 19.19, "grad_norm": 2.876091718673706, "learning_rate": 0.00010833913529521362, "loss": 1.2657, "step": 653500 }, { "epoch": 19.2, "grad_norm": 2.6806657314300537, "learning_rate": 0.00010819204330388225, "loss": 1.2369, "step": 654000 }, { "epoch": 19.22, "grad_norm": 1.682861328125, "learning_rate": 0.00010804495131255085, "loss": 1.2353, "step": 654500 }, { "epoch": 19.23, "grad_norm": 1.420599102973938, "learning_rate": 0.00010789785932121947, "loss": 1.2292, "step": 655000 }, { "epoch": 19.25, "grad_norm": 2.785423517227173, "learning_rate": 0.00010775076732988811, "loss": 1.2299, "step": 655500 }, { "epoch": 19.26, "grad_norm": 3.515298843383789, "learning_rate": 0.00010760367533855673, "loss": 1.2343, "step": 656000 }, { "epoch": 19.28, "grad_norm": 8.15224552154541, "learning_rate": 0.00010745658334722534, "loss": 1.262, "step": 656500 }, { "epoch": 19.29, "grad_norm": 2.5358471870422363, "learning_rate": 0.00010730949135589396, "loss": 1.2579, "step": 657000 }, { "epoch": 19.3, "grad_norm": 2.8267860412597656, "learning_rate": 0.0001071623993645626, "loss": 1.2195, "step": 657500 }, { "epoch": 19.32, "grad_norm": 2.0857648849487305, "learning_rate": 0.00010701530737323122, "loss": 1.2244, "step": 658000 }, { "epoch": 19.33, "grad_norm": 2.2825379371643066, "learning_rate": 0.00010686821538189984, "loss": 1.2502, "step": 658500 }, { "epoch": 19.35, "grad_norm": 2.1249475479125977, "learning_rate": 0.00010672112339056844, "loss": 1.2408, "step": 659000 }, { "epoch": 19.36, "grad_norm": 1.9578863382339478, "learning_rate": 0.00010657403139923708, "loss": 1.2599, "step": 659500 }, { "epoch": 19.38, "grad_norm": 1.7473647594451904, "learning_rate": 0.0001064269394079057, "loss": 1.2482, "step": 660000 }, { "epoch": 19.39, "grad_norm": 1.9577364921569824, "learning_rate": 0.00010627984741657432, "loss": 1.2389, "step": 660500 }, { "epoch": 19.41, "grad_norm": 6.437145233154297, "learning_rate": 0.00010613275542524293, "loss": 1.2327, "step": 661000 }, { "epoch": 19.42, "grad_norm": 36.84048080444336, "learning_rate": 0.00010598566343391156, "loss": 1.2553, "step": 661500 }, { "epoch": 19.44, "grad_norm": 2.4691734313964844, "learning_rate": 0.00010583857144258018, "loss": 1.2551, "step": 662000 }, { "epoch": 19.45, "grad_norm": 3.6141631603240967, "learning_rate": 0.0001056914794512488, "loss": 1.2459, "step": 662500 }, { "epoch": 19.47, "grad_norm": 4.1790008544921875, "learning_rate": 0.00010554438745991741, "loss": 1.2299, "step": 663000 }, { "epoch": 19.48, "grad_norm": 2.3077800273895264, "learning_rate": 0.00010539729546858605, "loss": 1.2354, "step": 663500 }, { "epoch": 19.5, "grad_norm": 2.0375678539276123, "learning_rate": 0.00010525020347725467, "loss": 1.2332, "step": 664000 }, { "epoch": 19.51, "grad_norm": 2.190852403640747, "learning_rate": 0.00010510311148592329, "loss": 1.2343, "step": 664500 }, { "epoch": 19.52, "grad_norm": 1.7503656148910522, "learning_rate": 0.0001049560194945919, "loss": 1.2328, "step": 665000 }, { "epoch": 19.54, "grad_norm": 2.364180326461792, "learning_rate": 0.00010480892750326053, "loss": 1.245, "step": 665500 }, { "epoch": 19.55, "grad_norm": 6.837544918060303, "learning_rate": 0.00010466183551192915, "loss": 1.2361, "step": 666000 }, { "epoch": 19.57, "grad_norm": 18.344188690185547, "learning_rate": 0.00010451474352059777, "loss": 1.2527, "step": 666500 }, { "epoch": 19.58, "grad_norm": 4.090867519378662, "learning_rate": 0.00010436765152926638, "loss": 1.2573, "step": 667000 }, { "epoch": 19.6, "grad_norm": 61.80951690673828, "learning_rate": 0.00010422055953793502, "loss": 1.2776, "step": 667500 }, { "epoch": 19.61, "grad_norm": 1.9649507999420166, "learning_rate": 0.00010407346754660364, "loss": 1.2479, "step": 668000 }, { "epoch": 19.63, "grad_norm": 4.030837535858154, "learning_rate": 0.00010392637555527226, "loss": 1.2458, "step": 668500 }, { "epoch": 19.64, "grad_norm": 3.310805082321167, "learning_rate": 0.00010377928356394088, "loss": 1.2491, "step": 669000 }, { "epoch": 19.66, "grad_norm": 6.558318138122559, "learning_rate": 0.0001036321915726095, "loss": 1.2562, "step": 669500 }, { "epoch": 19.67, "grad_norm": 2.0995540618896484, "learning_rate": 0.00010348509958127812, "loss": 1.2529, "step": 670000 }, { "epoch": 19.69, "grad_norm": 3.003690242767334, "learning_rate": 0.00010333800758994674, "loss": 1.2717, "step": 670500 }, { "epoch": 19.7, "grad_norm": 2.099637269973755, "learning_rate": 0.00010319091559861536, "loss": 1.2632, "step": 671000 }, { "epoch": 19.72, "grad_norm": 35.86410140991211, "learning_rate": 0.000103043823607284, "loss": 1.2763, "step": 671500 }, { "epoch": 19.73, "grad_norm": 1.7510465383529663, "learning_rate": 0.0001028967316159526, "loss": 1.2535, "step": 672000 }, { "epoch": 19.74, "grad_norm": 10.661267280578613, "learning_rate": 0.00010274963962462123, "loss": 1.2346, "step": 672500 }, { "epoch": 19.76, "grad_norm": 1.9645477533340454, "learning_rate": 0.00010260254763328985, "loss": 1.2398, "step": 673000 }, { "epoch": 19.77, "grad_norm": 2.795703172683716, "learning_rate": 0.00010245545564195848, "loss": 1.2683, "step": 673500 }, { "epoch": 19.79, "grad_norm": 3.7908451557159424, "learning_rate": 0.00010230836365062709, "loss": 1.2439, "step": 674000 }, { "epoch": 19.8, "grad_norm": 2.028703451156616, "learning_rate": 0.00010216127165929571, "loss": 1.2518, "step": 674500 }, { "epoch": 19.82, "grad_norm": 2.059154510498047, "learning_rate": 0.00010201417966796433, "loss": 1.2359, "step": 675000 }, { "epoch": 19.83, "grad_norm": 1.3817317485809326, "learning_rate": 0.00010186708767663297, "loss": 1.27, "step": 675500 }, { "epoch": 19.85, "grad_norm": 1.8642240762710571, "learning_rate": 0.00010171999568530158, "loss": 1.2604, "step": 676000 }, { "epoch": 19.86, "grad_norm": 1.7178900241851807, "learning_rate": 0.0001015729036939702, "loss": 1.2509, "step": 676500 }, { "epoch": 19.88, "grad_norm": 6.73402738571167, "learning_rate": 0.00010142581170263882, "loss": 1.2566, "step": 677000 }, { "epoch": 19.89, "grad_norm": 1.7260433435440063, "learning_rate": 0.00010127871971130745, "loss": 1.258, "step": 677500 }, { "epoch": 19.91, "grad_norm": 2.0348527431488037, "learning_rate": 0.00010113162771997606, "loss": 1.234, "step": 678000 }, { "epoch": 19.92, "grad_norm": 3.1736955642700195, "learning_rate": 0.00010098453572864468, "loss": 1.241, "step": 678500 }, { "epoch": 19.94, "grad_norm": 10.23302173614502, "learning_rate": 0.0001008374437373133, "loss": 1.2604, "step": 679000 }, { "epoch": 19.95, "grad_norm": 2.974153995513916, "learning_rate": 0.00010069035174598194, "loss": 1.2525, "step": 679500 }, { "epoch": 19.97, "grad_norm": 2.5226101875305176, "learning_rate": 0.00010054325975465056, "loss": 1.264, "step": 680000 }, { "epoch": 19.98, "grad_norm": 2.472259521484375, "learning_rate": 0.00010039616776331917, "loss": 1.2299, "step": 680500 }, { "epoch": 19.99, "grad_norm": 2.3573238849639893, "learning_rate": 0.00010024907577198779, "loss": 1.2657, "step": 681000 }, { "epoch": 20.01, "grad_norm": 3.6109812259674072, "learning_rate": 0.00010010198378065642, "loss": 1.2247, "step": 681500 }, { "epoch": 20.02, "grad_norm": 1.8781336545944214, "learning_rate": 9.995489178932504e-05, "loss": 1.2068, "step": 682000 }, { "epoch": 20.04, "grad_norm": 1.4420850276947021, "learning_rate": 9.980779979799365e-05, "loss": 1.2, "step": 682500 }, { "epoch": 20.05, "grad_norm": 3.082235097885132, "learning_rate": 9.966070780666227e-05, "loss": 1.2062, "step": 683000 }, { "epoch": 20.07, "grad_norm": 1.9524058103561401, "learning_rate": 9.95136158153309e-05, "loss": 1.1999, "step": 683500 }, { "epoch": 20.08, "grad_norm": 1.9696966409683228, "learning_rate": 9.936652382399953e-05, "loss": 1.1944, "step": 684000 }, { "epoch": 20.1, "grad_norm": 3.851034641265869, "learning_rate": 9.921943183266813e-05, "loss": 1.1913, "step": 684500 }, { "epoch": 20.11, "grad_norm": 1.7799595594406128, "learning_rate": 9.907233984133676e-05, "loss": 1.21, "step": 685000 }, { "epoch": 20.13, "grad_norm": 2.3180084228515625, "learning_rate": 9.892524785000539e-05, "loss": 1.1741, "step": 685500 }, { "epoch": 20.14, "grad_norm": 2.012601375579834, "learning_rate": 9.877815585867401e-05, "loss": 1.1956, "step": 686000 }, { "epoch": 20.16, "grad_norm": 2.5793349742889404, "learning_rate": 9.863106386734262e-05, "loss": 1.2011, "step": 686500 }, { "epoch": 20.17, "grad_norm": 1.7113804817199707, "learning_rate": 9.848397187601124e-05, "loss": 1.2382, "step": 687000 }, { "epoch": 20.19, "grad_norm": 1.8239011764526367, "learning_rate": 9.833687988467988e-05, "loss": 1.1904, "step": 687500 }, { "epoch": 20.2, "grad_norm": 8.440109252929688, "learning_rate": 9.81897878933485e-05, "loss": 1.2087, "step": 688000 }, { "epoch": 20.21, "grad_norm": 2.4333038330078125, "learning_rate": 9.80426959020171e-05, "loss": 1.209, "step": 688500 }, { "epoch": 20.23, "grad_norm": 3.217632532119751, "learning_rate": 9.789560391068572e-05, "loss": 1.1964, "step": 689000 }, { "epoch": 20.24, "grad_norm": 3.0806314945220947, "learning_rate": 9.774851191935436e-05, "loss": 1.2227, "step": 689500 }, { "epoch": 20.26, "grad_norm": 21.035554885864258, "learning_rate": 9.760141992802298e-05, "loss": 1.1899, "step": 690000 }, { "epoch": 20.27, "grad_norm": 4.15212345123291, "learning_rate": 9.74543279366916e-05, "loss": 1.1919, "step": 690500 }, { "epoch": 20.29, "grad_norm": 8.08483600616455, "learning_rate": 9.730723594536021e-05, "loss": 1.2083, "step": 691000 }, { "epoch": 20.3, "grad_norm": 2.5341944694519043, "learning_rate": 9.716014395402884e-05, "loss": 1.181, "step": 691500 }, { "epoch": 20.32, "grad_norm": 7.360681533813477, "learning_rate": 9.701305196269747e-05, "loss": 1.1975, "step": 692000 }, { "epoch": 20.33, "grad_norm": 1.5373992919921875, "learning_rate": 9.686595997136609e-05, "loss": 1.191, "step": 692500 }, { "epoch": 20.35, "grad_norm": 1.8679372072219849, "learning_rate": 9.67188679800347e-05, "loss": 1.213, "step": 693000 }, { "epoch": 20.36, "grad_norm": 3.206306219100952, "learning_rate": 9.657177598870333e-05, "loss": 1.2051, "step": 693500 }, { "epoch": 20.38, "grad_norm": 2.6320650577545166, "learning_rate": 9.642468399737195e-05, "loss": 1.1851, "step": 694000 }, { "epoch": 20.39, "grad_norm": 3.6654086112976074, "learning_rate": 9.627759200604057e-05, "loss": 1.2063, "step": 694500 }, { "epoch": 20.41, "grad_norm": 3.010348081588745, "learning_rate": 9.613050001470918e-05, "loss": 1.2239, "step": 695000 }, { "epoch": 20.42, "grad_norm": 27.742935180664062, "learning_rate": 9.598340802337781e-05, "loss": 1.2084, "step": 695500 }, { "epoch": 20.43, "grad_norm": 2.783250570297241, "learning_rate": 9.583631603204643e-05, "loss": 1.2263, "step": 696000 }, { "epoch": 20.45, "grad_norm": 2.516063690185547, "learning_rate": 9.568922404071506e-05, "loss": 1.2084, "step": 696500 }, { "epoch": 20.46, "grad_norm": 1.8266417980194092, "learning_rate": 9.554213204938366e-05, "loss": 1.2052, "step": 697000 }, { "epoch": 20.48, "grad_norm": 1.2132940292358398, "learning_rate": 9.53950400580523e-05, "loss": 1.2132, "step": 697500 }, { "epoch": 20.49, "grad_norm": 4.550230503082275, "learning_rate": 9.524794806672092e-05, "loss": 1.2263, "step": 698000 }, { "epoch": 20.51, "grad_norm": 2.1874680519104004, "learning_rate": 9.510085607538954e-05, "loss": 1.1855, "step": 698500 }, { "epoch": 20.52, "grad_norm": 1.8161512613296509, "learning_rate": 9.495376408405815e-05, "loss": 1.1949, "step": 699000 }, { "epoch": 20.54, "grad_norm": 1.3943161964416504, "learning_rate": 9.480667209272678e-05, "loss": 1.1944, "step": 699500 }, { "epoch": 20.55, "grad_norm": 3.2997055053710938, "learning_rate": 9.46595801013954e-05, "loss": 1.2213, "step": 700000 }, { "epoch": 20.57, "grad_norm": 1.9309003353118896, "learning_rate": 9.451248811006402e-05, "loss": 1.2163, "step": 700500 }, { "epoch": 20.58, "grad_norm": 2.734384775161743, "learning_rate": 9.436539611873265e-05, "loss": 1.1912, "step": 701000 }, { "epoch": 20.6, "grad_norm": 2.1459152698516846, "learning_rate": 9.421830412740127e-05, "loss": 1.2343, "step": 701500 }, { "epoch": 20.61, "grad_norm": 2.039071798324585, "learning_rate": 9.407121213606989e-05, "loss": 1.228, "step": 702000 }, { "epoch": 20.63, "grad_norm": 3.1236155033111572, "learning_rate": 9.392412014473851e-05, "loss": 1.2362, "step": 702500 }, { "epoch": 20.64, "grad_norm": 6.471434593200684, "learning_rate": 9.377702815340713e-05, "loss": 1.2159, "step": 703000 }, { "epoch": 20.66, "grad_norm": 3.2641398906707764, "learning_rate": 9.362993616207576e-05, "loss": 1.1702, "step": 703500 }, { "epoch": 20.67, "grad_norm": 2.0481953620910645, "learning_rate": 9.348284417074437e-05, "loss": 1.218, "step": 704000 }, { "epoch": 20.68, "grad_norm": 1.6071834564208984, "learning_rate": 9.3335752179413e-05, "loss": 1.2422, "step": 704500 }, { "epoch": 20.7, "grad_norm": 1.8020005226135254, "learning_rate": 9.318866018808163e-05, "loss": 1.2268, "step": 705000 }, { "epoch": 20.71, "grad_norm": 4.970008373260498, "learning_rate": 9.304156819675025e-05, "loss": 1.2093, "step": 705500 }, { "epoch": 20.73, "grad_norm": 2.152195453643799, "learning_rate": 9.289447620541886e-05, "loss": 1.2361, "step": 706000 }, { "epoch": 20.74, "grad_norm": 5.956038475036621, "learning_rate": 9.274738421408748e-05, "loss": 1.1908, "step": 706500 }, { "epoch": 20.76, "grad_norm": 2.429544687271118, "learning_rate": 9.260029222275611e-05, "loss": 1.1805, "step": 707000 }, { "epoch": 20.77, "grad_norm": 2.6851959228515625, "learning_rate": 9.245320023142473e-05, "loss": 1.204, "step": 707500 }, { "epoch": 20.79, "grad_norm": 2.6581480503082275, "learning_rate": 9.230610824009334e-05, "loss": 1.2357, "step": 708000 }, { "epoch": 20.8, "grad_norm": 2.938990354537964, "learning_rate": 9.215901624876196e-05, "loss": 1.2326, "step": 708500 }, { "epoch": 20.82, "grad_norm": 1.9101125001907349, "learning_rate": 9.20119242574306e-05, "loss": 1.2013, "step": 709000 }, { "epoch": 20.83, "grad_norm": 3.4203646183013916, "learning_rate": 9.186483226609922e-05, "loss": 1.2021, "step": 709500 }, { "epoch": 20.85, "grad_norm": 2.2764744758605957, "learning_rate": 9.171774027476783e-05, "loss": 1.223, "step": 710000 }, { "epoch": 20.86, "grad_norm": 1.5339056253433228, "learning_rate": 9.157064828343645e-05, "loss": 1.2001, "step": 710500 }, { "epoch": 20.88, "grad_norm": 1.3566075563430786, "learning_rate": 9.142355629210508e-05, "loss": 1.2324, "step": 711000 }, { "epoch": 20.89, "grad_norm": 1.5568170547485352, "learning_rate": 9.12764643007737e-05, "loss": 1.2065, "step": 711500 }, { "epoch": 20.9, "grad_norm": 26.298019409179688, "learning_rate": 9.112937230944231e-05, "loss": 1.2283, "step": 712000 }, { "epoch": 20.92, "grad_norm": 1.2808449268341064, "learning_rate": 9.098228031811093e-05, "loss": 1.2531, "step": 712500 }, { "epoch": 20.93, "grad_norm": 1.9005004167556763, "learning_rate": 9.083518832677957e-05, "loss": 1.2108, "step": 713000 }, { "epoch": 20.95, "grad_norm": 6.3783721923828125, "learning_rate": 9.068809633544819e-05, "loss": 1.2015, "step": 713500 }, { "epoch": 20.96, "grad_norm": 2.5035910606384277, "learning_rate": 9.054100434411681e-05, "loss": 1.2342, "step": 714000 }, { "epoch": 20.98, "grad_norm": 9.022185325622559, "learning_rate": 9.039391235278542e-05, "loss": 1.2176, "step": 714500 }, { "epoch": 20.99, "grad_norm": 1.442610502243042, "learning_rate": 9.024682036145405e-05, "loss": 1.2406, "step": 715000 }, { "epoch": 21.01, "grad_norm": 2.419548273086548, "learning_rate": 9.009972837012267e-05, "loss": 1.1987, "step": 715500 }, { "epoch": 21.02, "grad_norm": 9.644920349121094, "learning_rate": 8.995263637879129e-05, "loss": 1.141, "step": 716000 }, { "epoch": 21.04, "grad_norm": 2.2750537395477295, "learning_rate": 8.98055443874599e-05, "loss": 1.1621, "step": 716500 }, { "epoch": 21.05, "grad_norm": 12.630777359008789, "learning_rate": 8.965845239612853e-05, "loss": 1.1495, "step": 717000 }, { "epoch": 21.07, "grad_norm": 1.7819632291793823, "learning_rate": 8.951136040479716e-05, "loss": 1.1687, "step": 717500 }, { "epoch": 21.08, "grad_norm": 2.4531936645507812, "learning_rate": 8.936426841346578e-05, "loss": 1.1722, "step": 718000 }, { "epoch": 21.1, "grad_norm": 4.26564884185791, "learning_rate": 8.921717642213438e-05, "loss": 1.1804, "step": 718500 }, { "epoch": 21.11, "grad_norm": 3.8713653087615967, "learning_rate": 8.907008443080302e-05, "loss": 1.1954, "step": 719000 }, { "epoch": 21.12, "grad_norm": 2.3605353832244873, "learning_rate": 8.892299243947164e-05, "loss": 1.1688, "step": 719500 }, { "epoch": 21.14, "grad_norm": 2.322021007537842, "learning_rate": 8.877590044814026e-05, "loss": 1.1403, "step": 720000 }, { "epoch": 21.15, "grad_norm": 2.5785558223724365, "learning_rate": 8.862880845680887e-05, "loss": 1.1812, "step": 720500 }, { "epoch": 21.17, "grad_norm": 1.8795828819274902, "learning_rate": 8.84817164654775e-05, "loss": 1.1756, "step": 721000 }, { "epoch": 21.18, "grad_norm": 1.7690235376358032, "learning_rate": 8.833462447414613e-05, "loss": 1.1592, "step": 721500 }, { "epoch": 21.2, "grad_norm": 2.3973443508148193, "learning_rate": 8.818753248281475e-05, "loss": 1.1586, "step": 722000 }, { "epoch": 21.21, "grad_norm": 11.784666061401367, "learning_rate": 8.804044049148335e-05, "loss": 1.1522, "step": 722500 }, { "epoch": 21.23, "grad_norm": 1.8928039073944092, "learning_rate": 8.789334850015199e-05, "loss": 1.1739, "step": 723000 }, { "epoch": 21.24, "grad_norm": 28.14616584777832, "learning_rate": 8.774625650882061e-05, "loss": 1.1844, "step": 723500 }, { "epoch": 21.26, "grad_norm": 22.186017990112305, "learning_rate": 8.759916451748923e-05, "loss": 1.1501, "step": 724000 }, { "epoch": 21.27, "grad_norm": 3.089933395385742, "learning_rate": 8.745207252615785e-05, "loss": 1.1687, "step": 724500 }, { "epoch": 21.29, "grad_norm": 3.315870523452759, "learning_rate": 8.730498053482647e-05, "loss": 1.1695, "step": 725000 }, { "epoch": 21.3, "grad_norm": 2.118852138519287, "learning_rate": 8.71578885434951e-05, "loss": 1.1831, "step": 725500 }, { "epoch": 21.32, "grad_norm": 2.3328022956848145, "learning_rate": 8.701079655216372e-05, "loss": 1.1972, "step": 726000 }, { "epoch": 21.33, "grad_norm": 2.6026008129119873, "learning_rate": 8.686370456083234e-05, "loss": 1.177, "step": 726500 }, { "epoch": 21.34, "grad_norm": 1.873986840248108, "learning_rate": 8.671661256950097e-05, "loss": 1.1856, "step": 727000 }, { "epoch": 21.36, "grad_norm": 2.4924209117889404, "learning_rate": 8.656952057816958e-05, "loss": 1.1587, "step": 727500 }, { "epoch": 21.37, "grad_norm": 3.228402614593506, "learning_rate": 8.64224285868382e-05, "loss": 1.1654, "step": 728000 }, { "epoch": 21.39, "grad_norm": 2.2008731365203857, "learning_rate": 8.627533659550682e-05, "loss": 1.1953, "step": 728500 }, { "epoch": 21.4, "grad_norm": 16.22712516784668, "learning_rate": 8.612824460417546e-05, "loss": 1.1871, "step": 729000 }, { "epoch": 21.42, "grad_norm": 4.7584919929504395, "learning_rate": 8.598115261284406e-05, "loss": 1.1865, "step": 729500 }, { "epoch": 21.43, "grad_norm": 3.2225780487060547, "learning_rate": 8.583406062151268e-05, "loss": 1.1776, "step": 730000 }, { "epoch": 21.45, "grad_norm": 5.469396591186523, "learning_rate": 8.56869686301813e-05, "loss": 1.1897, "step": 730500 }, { "epoch": 21.46, "grad_norm": 1.8942010402679443, "learning_rate": 8.553987663884994e-05, "loss": 1.1484, "step": 731000 }, { "epoch": 21.48, "grad_norm": 2.122103452682495, "learning_rate": 8.539278464751855e-05, "loss": 1.1707, "step": 731500 }, { "epoch": 21.49, "grad_norm": 3.3333709239959717, "learning_rate": 8.524569265618717e-05, "loss": 1.1471, "step": 732000 }, { "epoch": 21.51, "grad_norm": 3.002875566482544, "learning_rate": 8.509860066485579e-05, "loss": 1.1785, "step": 732500 }, { "epoch": 21.52, "grad_norm": 3.3468096256256104, "learning_rate": 8.495150867352442e-05, "loss": 1.1705, "step": 733000 }, { "epoch": 21.54, "grad_norm": 20.700353622436523, "learning_rate": 8.480441668219303e-05, "loss": 1.1792, "step": 733500 }, { "epoch": 21.55, "grad_norm": 5.712481498718262, "learning_rate": 8.465732469086165e-05, "loss": 1.1633, "step": 734000 }, { "epoch": 21.57, "grad_norm": 2.2254440784454346, "learning_rate": 8.451023269953027e-05, "loss": 1.1809, "step": 734500 }, { "epoch": 21.58, "grad_norm": 5.889398574829102, "learning_rate": 8.436314070819891e-05, "loss": 1.1732, "step": 735000 }, { "epoch": 21.59, "grad_norm": 6.2943572998046875, "learning_rate": 8.421604871686753e-05, "loss": 1.1896, "step": 735500 }, { "epoch": 21.61, "grad_norm": 2.1915676593780518, "learning_rate": 8.406895672553614e-05, "loss": 1.1884, "step": 736000 }, { "epoch": 21.62, "grad_norm": 2.950507164001465, "learning_rate": 8.392186473420476e-05, "loss": 1.1825, "step": 736500 }, { "epoch": 21.64, "grad_norm": 2.338834047317505, "learning_rate": 8.37747727428734e-05, "loss": 1.1787, "step": 737000 }, { "epoch": 21.65, "grad_norm": 2.806655168533325, "learning_rate": 8.362768075154201e-05, "loss": 1.1988, "step": 737500 }, { "epoch": 21.67, "grad_norm": 2.509188652038574, "learning_rate": 8.348058876021062e-05, "loss": 1.1678, "step": 738000 }, { "epoch": 21.68, "grad_norm": 2.3886609077453613, "learning_rate": 8.333349676887924e-05, "loss": 1.2138, "step": 738500 }, { "epoch": 21.7, "grad_norm": 8.767037391662598, "learning_rate": 8.318640477754788e-05, "loss": 1.1682, "step": 739000 }, { "epoch": 21.71, "grad_norm": 2.7434234619140625, "learning_rate": 8.30393127862165e-05, "loss": 1.1675, "step": 739500 }, { "epoch": 21.73, "grad_norm": 3.837946891784668, "learning_rate": 8.28922207948851e-05, "loss": 1.1746, "step": 740000 }, { "epoch": 21.74, "grad_norm": 2.2261147499084473, "learning_rate": 8.274512880355373e-05, "loss": 1.1805, "step": 740500 }, { "epoch": 21.76, "grad_norm": 3.7343368530273438, "learning_rate": 8.259803681222236e-05, "loss": 1.1829, "step": 741000 }, { "epoch": 21.77, "grad_norm": 2.696150779724121, "learning_rate": 8.245094482089098e-05, "loss": 1.1791, "step": 741500 }, { "epoch": 21.79, "grad_norm": 3.374664783477783, "learning_rate": 8.230385282955959e-05, "loss": 1.186, "step": 742000 }, { "epoch": 21.8, "grad_norm": 2.0599725246429443, "learning_rate": 8.215676083822821e-05, "loss": 1.1704, "step": 742500 }, { "epoch": 21.81, "grad_norm": 2.960590362548828, "learning_rate": 8.200966884689685e-05, "loss": 1.2078, "step": 743000 }, { "epoch": 21.83, "grad_norm": 3.514355421066284, "learning_rate": 8.186257685556547e-05, "loss": 1.1853, "step": 743500 }, { "epoch": 21.84, "grad_norm": 3.7044436931610107, "learning_rate": 8.171548486423408e-05, "loss": 1.1775, "step": 744000 }, { "epoch": 21.86, "grad_norm": 2.7482707500457764, "learning_rate": 8.15683928729027e-05, "loss": 1.1927, "step": 744500 }, { "epoch": 21.87, "grad_norm": 3.462541103363037, "learning_rate": 8.142130088157133e-05, "loss": 1.155, "step": 745000 }, { "epoch": 21.89, "grad_norm": 1.3520176410675049, "learning_rate": 8.127420889023995e-05, "loss": 1.1887, "step": 745500 }, { "epoch": 21.9, "grad_norm": 3.2204971313476562, "learning_rate": 8.112711689890857e-05, "loss": 1.17, "step": 746000 }, { "epoch": 21.92, "grad_norm": 2.8493075370788574, "learning_rate": 8.098002490757718e-05, "loss": 1.1804, "step": 746500 }, { "epoch": 21.93, "grad_norm": 28.339750289916992, "learning_rate": 8.083293291624582e-05, "loss": 1.1832, "step": 747000 }, { "epoch": 21.95, "grad_norm": 3.3736674785614014, "learning_rate": 8.068584092491444e-05, "loss": 1.1903, "step": 747500 }, { "epoch": 21.96, "grad_norm": 2.166250228881836, "learning_rate": 8.053874893358306e-05, "loss": 1.1712, "step": 748000 }, { "epoch": 21.98, "grad_norm": 3.713568925857544, "learning_rate": 8.039165694225167e-05, "loss": 1.1896, "step": 748500 }, { "epoch": 21.99, "grad_norm": 3.077561140060425, "learning_rate": 8.02445649509203e-05, "loss": 1.1999, "step": 749000 }, { "epoch": 22.01, "grad_norm": 1.1793521642684937, "learning_rate": 8.009747295958892e-05, "loss": 1.166, "step": 749500 }, { "epoch": 22.02, "grad_norm": 1.4397152662277222, "learning_rate": 7.995038096825754e-05, "loss": 1.1327, "step": 750000 }, { "epoch": 22.03, "grad_norm": 6.889225959777832, "learning_rate": 7.980328897692615e-05, "loss": 1.1533, "step": 750500 }, { "epoch": 22.05, "grad_norm": 1.7650978565216064, "learning_rate": 7.965619698559479e-05, "loss": 1.1301, "step": 751000 }, { "epoch": 22.06, "grad_norm": 1.922248125076294, "learning_rate": 7.95091049942634e-05, "loss": 1.1313, "step": 751500 }, { "epoch": 22.08, "grad_norm": 9.232244491577148, "learning_rate": 7.936201300293203e-05, "loss": 1.1601, "step": 752000 }, { "epoch": 22.09, "grad_norm": 2.162410259246826, "learning_rate": 7.921492101160063e-05, "loss": 1.1394, "step": 752500 }, { "epoch": 22.11, "grad_norm": 2.0230438709259033, "learning_rate": 7.906782902026927e-05, "loss": 1.131, "step": 753000 }, { "epoch": 22.12, "grad_norm": 3.8333919048309326, "learning_rate": 7.892073702893789e-05, "loss": 1.1264, "step": 753500 }, { "epoch": 22.14, "grad_norm": 1.487121343612671, "learning_rate": 7.877364503760651e-05, "loss": 1.1127, "step": 754000 }, { "epoch": 22.15, "grad_norm": 2.5997982025146484, "learning_rate": 7.862655304627512e-05, "loss": 1.1072, "step": 754500 }, { "epoch": 22.17, "grad_norm": 2.5122156143188477, "learning_rate": 7.847946105494375e-05, "loss": 1.1091, "step": 755000 }, { "epoch": 22.18, "grad_norm": 2.6082444190979004, "learning_rate": 7.833236906361238e-05, "loss": 1.142, "step": 755500 }, { "epoch": 22.2, "grad_norm": 8.16508960723877, "learning_rate": 7.8185277072281e-05, "loss": 1.1247, "step": 756000 }, { "epoch": 22.21, "grad_norm": 3.061530113220215, "learning_rate": 7.803818508094962e-05, "loss": 1.1526, "step": 756500 }, { "epoch": 22.23, "grad_norm": 3.9026131629943848, "learning_rate": 7.789109308961824e-05, "loss": 1.152, "step": 757000 }, { "epoch": 22.24, "grad_norm": 2.3439905643463135, "learning_rate": 7.774400109828686e-05, "loss": 1.1416, "step": 757500 }, { "epoch": 22.26, "grad_norm": 0.9024741053581238, "learning_rate": 7.759690910695548e-05, "loss": 1.1272, "step": 758000 }, { "epoch": 22.27, "grad_norm": 3.3680572509765625, "learning_rate": 7.74498171156241e-05, "loss": 1.1445, "step": 758500 }, { "epoch": 22.28, "grad_norm": 1.695378065109253, "learning_rate": 7.730272512429274e-05, "loss": 1.1373, "step": 759000 }, { "epoch": 22.3, "grad_norm": 4.116308689117432, "learning_rate": 7.715563313296134e-05, "loss": 1.1453, "step": 759500 }, { "epoch": 22.31, "grad_norm": 2.014496088027954, "learning_rate": 7.700854114162997e-05, "loss": 1.146, "step": 760000 }, { "epoch": 22.33, "grad_norm": 1.5276461839675903, "learning_rate": 7.686144915029859e-05, "loss": 1.13, "step": 760500 }, { "epoch": 22.34, "grad_norm": 2.514599084854126, "learning_rate": 7.671435715896722e-05, "loss": 1.142, "step": 761000 }, { "epoch": 22.36, "grad_norm": 2.2747581005096436, "learning_rate": 7.656726516763583e-05, "loss": 1.1552, "step": 761500 }, { "epoch": 22.37, "grad_norm": 2.406036615371704, "learning_rate": 7.642017317630445e-05, "loss": 1.1453, "step": 762000 }, { "epoch": 22.39, "grad_norm": 2.231992721557617, "learning_rate": 7.627308118497307e-05, "loss": 1.1386, "step": 762500 }, { "epoch": 22.4, "grad_norm": 3.1717188358306885, "learning_rate": 7.61259891936417e-05, "loss": 1.1549, "step": 763000 }, { "epoch": 22.42, "grad_norm": 22.27747917175293, "learning_rate": 7.597889720231031e-05, "loss": 1.1142, "step": 763500 }, { "epoch": 22.43, "grad_norm": 2.7918033599853516, "learning_rate": 7.583180521097893e-05, "loss": 1.1108, "step": 764000 }, { "epoch": 22.45, "grad_norm": 1.4565542936325073, "learning_rate": 7.568471321964757e-05, "loss": 1.1509, "step": 764500 }, { "epoch": 22.46, "grad_norm": 3.1026453971862793, "learning_rate": 7.553762122831619e-05, "loss": 1.1513, "step": 765000 }, { "epoch": 22.48, "grad_norm": 1.6490851640701294, "learning_rate": 7.53905292369848e-05, "loss": 1.1242, "step": 765500 }, { "epoch": 22.49, "grad_norm": 1.7027279138565063, "learning_rate": 7.524343724565342e-05, "loss": 1.1561, "step": 766000 }, { "epoch": 22.5, "grad_norm": 1.2775441408157349, "learning_rate": 7.509634525432205e-05, "loss": 1.1168, "step": 766500 }, { "epoch": 22.52, "grad_norm": 5.64236307144165, "learning_rate": 7.494925326299067e-05, "loss": 1.1465, "step": 767000 }, { "epoch": 22.53, "grad_norm": 2.019430637359619, "learning_rate": 7.480216127165928e-05, "loss": 1.1162, "step": 767500 }, { "epoch": 22.55, "grad_norm": 1.3938747644424438, "learning_rate": 7.465506928032792e-05, "loss": 1.1253, "step": 768000 }, { "epoch": 22.56, "grad_norm": 2.122945785522461, "learning_rate": 7.450797728899654e-05, "loss": 1.1409, "step": 768500 }, { "epoch": 22.58, "grad_norm": 1.826390266418457, "learning_rate": 7.436088529766516e-05, "loss": 1.1473, "step": 769000 }, { "epoch": 22.59, "grad_norm": 1.8003076314926147, "learning_rate": 7.421379330633378e-05, "loss": 1.1827, "step": 769500 }, { "epoch": 22.61, "grad_norm": 2.6314899921417236, "learning_rate": 7.40667013150024e-05, "loss": 1.1426, "step": 770000 }, { "epoch": 22.62, "grad_norm": 2.7484140396118164, "learning_rate": 7.391960932367102e-05, "loss": 1.1603, "step": 770500 }, { "epoch": 22.64, "grad_norm": 1.5445499420166016, "learning_rate": 7.377251733233964e-05, "loss": 1.1609, "step": 771000 }, { "epoch": 22.65, "grad_norm": 2.3308262825012207, "learning_rate": 7.362542534100826e-05, "loss": 1.1126, "step": 771500 }, { "epoch": 22.67, "grad_norm": 2.675467014312744, "learning_rate": 7.347833334967689e-05, "loss": 1.1717, "step": 772000 }, { "epoch": 22.68, "grad_norm": 7.849360466003418, "learning_rate": 7.333124135834551e-05, "loss": 1.1304, "step": 772500 }, { "epoch": 22.7, "grad_norm": 1.596022129058838, "learning_rate": 7.318414936701413e-05, "loss": 1.1563, "step": 773000 }, { "epoch": 22.71, "grad_norm": 1.8678432703018188, "learning_rate": 7.303705737568275e-05, "loss": 1.1668, "step": 773500 }, { "epoch": 22.72, "grad_norm": 1.3757662773132324, "learning_rate": 7.288996538435137e-05, "loss": 1.1484, "step": 774000 }, { "epoch": 22.74, "grad_norm": 2.819344997406006, "learning_rate": 7.274287339301999e-05, "loss": 1.1213, "step": 774500 }, { "epoch": 22.75, "grad_norm": 2.370375871658325, "learning_rate": 7.259578140168861e-05, "loss": 1.1654, "step": 775000 }, { "epoch": 22.77, "grad_norm": 3.6313276290893555, "learning_rate": 7.244868941035723e-05, "loss": 1.13, "step": 775500 }, { "epoch": 22.78, "grad_norm": 1.737453579902649, "learning_rate": 7.230159741902585e-05, "loss": 1.1283, "step": 776000 }, { "epoch": 22.8, "grad_norm": 2.530315637588501, "learning_rate": 7.215450542769448e-05, "loss": 1.1413, "step": 776500 }, { "epoch": 22.81, "grad_norm": 2.049701452255249, "learning_rate": 7.20074134363631e-05, "loss": 1.1863, "step": 777000 }, { "epoch": 22.83, "grad_norm": 2.112698554992676, "learning_rate": 7.186032144503172e-05, "loss": 1.1217, "step": 777500 }, { "epoch": 22.84, "grad_norm": 1.982596755027771, "learning_rate": 7.171322945370034e-05, "loss": 1.151, "step": 778000 }, { "epoch": 22.86, "grad_norm": 2.8483023643493652, "learning_rate": 7.156613746236896e-05, "loss": 1.1371, "step": 778500 }, { "epoch": 22.87, "grad_norm": 2.3431105613708496, "learning_rate": 7.141904547103758e-05, "loss": 1.1549, "step": 779000 }, { "epoch": 22.89, "grad_norm": 2.256725549697876, "learning_rate": 7.12719534797062e-05, "loss": 1.1387, "step": 779500 }, { "epoch": 22.9, "grad_norm": 3.2769088745117188, "learning_rate": 7.112486148837482e-05, "loss": 1.1365, "step": 780000 }, { "epoch": 22.92, "grad_norm": 7.680245399475098, "learning_rate": 7.097776949704344e-05, "loss": 1.1524, "step": 780500 }, { "epoch": 22.93, "grad_norm": 2.0272703170776367, "learning_rate": 7.083067750571207e-05, "loss": 1.1483, "step": 781000 }, { "epoch": 22.95, "grad_norm": 3.6463356018066406, "learning_rate": 7.068358551438069e-05, "loss": 1.1188, "step": 781500 }, { "epoch": 22.96, "grad_norm": 2.499232769012451, "learning_rate": 7.053649352304931e-05, "loss": 1.1434, "step": 782000 }, { "epoch": 22.97, "grad_norm": 2.1540205478668213, "learning_rate": 7.038940153171793e-05, "loss": 1.1394, "step": 782500 }, { "epoch": 22.99, "grad_norm": 2.569671154022217, "learning_rate": 7.024230954038655e-05, "loss": 1.1581, "step": 783000 }, { "epoch": 23.0, "grad_norm": 5.3423333168029785, "learning_rate": 7.009521754905517e-05, "loss": 1.1392, "step": 783500 }, { "epoch": 23.02, "grad_norm": 1.8991897106170654, "learning_rate": 6.994812555772379e-05, "loss": 1.0877, "step": 784000 }, { "epoch": 23.03, "grad_norm": 2.094095230102539, "learning_rate": 6.980103356639241e-05, "loss": 1.1106, "step": 784500 }, { "epoch": 23.05, "grad_norm": 2.4271163940429688, "learning_rate": 6.965394157506104e-05, "loss": 1.0893, "step": 785000 }, { "epoch": 23.06, "grad_norm": 3.5016427040100098, "learning_rate": 6.950684958372966e-05, "loss": 1.1119, "step": 785500 }, { "epoch": 23.08, "grad_norm": 3.388533592224121, "learning_rate": 6.935975759239828e-05, "loss": 1.1135, "step": 786000 }, { "epoch": 23.09, "grad_norm": 1.610478401184082, "learning_rate": 6.92126656010669e-05, "loss": 1.1064, "step": 786500 }, { "epoch": 23.11, "grad_norm": 2.6857564449310303, "learning_rate": 6.906557360973552e-05, "loss": 1.105, "step": 787000 }, { "epoch": 23.12, "grad_norm": 3.785341739654541, "learning_rate": 6.891848161840414e-05, "loss": 1.1109, "step": 787500 }, { "epoch": 23.14, "grad_norm": 1.6604160070419312, "learning_rate": 6.877138962707276e-05, "loss": 1.1124, "step": 788000 }, { "epoch": 23.15, "grad_norm": 1.9755994081497192, "learning_rate": 6.862429763574138e-05, "loss": 1.1034, "step": 788500 }, { "epoch": 23.17, "grad_norm": 2.989527940750122, "learning_rate": 6.847720564441e-05, "loss": 1.1055, "step": 789000 }, { "epoch": 23.18, "grad_norm": 2.2718911170959473, "learning_rate": 6.833011365307863e-05, "loss": 1.104, "step": 789500 }, { "epoch": 23.19, "grad_norm": 4.303855895996094, "learning_rate": 6.818302166174725e-05, "loss": 1.1174, "step": 790000 }, { "epoch": 23.21, "grad_norm": 2.6747548580169678, "learning_rate": 6.803592967041587e-05, "loss": 1.1046, "step": 790500 }, { "epoch": 23.22, "grad_norm": 2.6597251892089844, "learning_rate": 6.78888376790845e-05, "loss": 1.1246, "step": 791000 }, { "epoch": 23.24, "grad_norm": 1.6096960306167603, "learning_rate": 6.774174568775311e-05, "loss": 1.1103, "step": 791500 }, { "epoch": 23.25, "grad_norm": 1.8474167585372925, "learning_rate": 6.759465369642174e-05, "loss": 1.1136, "step": 792000 }, { "epoch": 23.27, "grad_norm": 6.043608665466309, "learning_rate": 6.744756170509035e-05, "loss": 1.1118, "step": 792500 }, { "epoch": 23.28, "grad_norm": 10.027447700500488, "learning_rate": 6.730046971375899e-05, "loss": 1.115, "step": 793000 }, { "epoch": 23.3, "grad_norm": 2.5558454990386963, "learning_rate": 6.71533777224276e-05, "loss": 1.0993, "step": 793500 }, { "epoch": 23.31, "grad_norm": 2.075737476348877, "learning_rate": 6.700628573109623e-05, "loss": 1.1111, "step": 794000 }, { "epoch": 23.33, "grad_norm": 2.4479551315307617, "learning_rate": 6.685919373976484e-05, "loss": 1.0848, "step": 794500 }, { "epoch": 23.34, "grad_norm": 2.503129243850708, "learning_rate": 6.671210174843347e-05, "loss": 1.1315, "step": 795000 }, { "epoch": 23.36, "grad_norm": 2.2025949954986572, "learning_rate": 6.656500975710208e-05, "loss": 1.1061, "step": 795500 }, { "epoch": 23.37, "grad_norm": 2.5942320823669434, "learning_rate": 6.641791776577071e-05, "loss": 1.1244, "step": 796000 }, { "epoch": 23.39, "grad_norm": 2.2347915172576904, "learning_rate": 6.627082577443932e-05, "loss": 1.0868, "step": 796500 }, { "epoch": 23.4, "grad_norm": 2.3821372985839844, "learning_rate": 6.612373378310796e-05, "loss": 1.1175, "step": 797000 }, { "epoch": 23.41, "grad_norm": 2.4445548057556152, "learning_rate": 6.597664179177656e-05, "loss": 1.1163, "step": 797500 }, { "epoch": 23.43, "grad_norm": 4.338953018188477, "learning_rate": 6.58295498004452e-05, "loss": 1.104, "step": 798000 }, { "epoch": 23.44, "grad_norm": 1.766503930091858, "learning_rate": 6.56824578091138e-05, "loss": 1.1178, "step": 798500 }, { "epoch": 23.46, "grad_norm": 1.7907469272613525, "learning_rate": 6.553536581778244e-05, "loss": 1.1183, "step": 799000 }, { "epoch": 23.47, "grad_norm": 1.722770094871521, "learning_rate": 6.538827382645105e-05, "loss": 1.1009, "step": 799500 }, { "epoch": 23.49, "grad_norm": 1.7940298318862915, "learning_rate": 6.524118183511968e-05, "loss": 1.0938, "step": 800000 }, { "epoch": 23.5, "grad_norm": 1.5882244110107422, "learning_rate": 6.50940898437883e-05, "loss": 1.1024, "step": 800500 }, { "epoch": 23.52, "grad_norm": 1.7875205278396606, "learning_rate": 6.494699785245692e-05, "loss": 1.092, "step": 801000 }, { "epoch": 23.53, "grad_norm": 2.0258092880249023, "learning_rate": 6.479990586112555e-05, "loss": 1.1136, "step": 801500 }, { "epoch": 23.55, "grad_norm": 1.6038285493850708, "learning_rate": 6.465281386979417e-05, "loss": 1.1013, "step": 802000 }, { "epoch": 23.56, "grad_norm": 2.411367654800415, "learning_rate": 6.450572187846279e-05, "loss": 1.0971, "step": 802500 }, { "epoch": 23.58, "grad_norm": 18.325363159179688, "learning_rate": 6.435862988713141e-05, "loss": 1.1292, "step": 803000 }, { "epoch": 23.59, "grad_norm": 2.520692825317383, "learning_rate": 6.421153789580003e-05, "loss": 1.1108, "step": 803500 }, { "epoch": 23.61, "grad_norm": 2.115523099899292, "learning_rate": 6.406444590446865e-05, "loss": 1.0878, "step": 804000 }, { "epoch": 23.62, "grad_norm": 2.4668285846710205, "learning_rate": 6.391735391313727e-05, "loss": 1.1089, "step": 804500 }, { "epoch": 23.64, "grad_norm": 2.2170443534851074, "learning_rate": 6.37702619218059e-05, "loss": 1.1189, "step": 805000 }, { "epoch": 23.65, "grad_norm": 2.2153406143188477, "learning_rate": 6.362316993047451e-05, "loss": 1.1053, "step": 805500 }, { "epoch": 23.66, "grad_norm": 2.937535285949707, "learning_rate": 6.347607793914314e-05, "loss": 1.108, "step": 806000 }, { "epoch": 23.68, "grad_norm": 1.9660471677780151, "learning_rate": 6.332898594781176e-05, "loss": 1.1203, "step": 806500 }, { "epoch": 23.69, "grad_norm": 3.5854222774505615, "learning_rate": 6.318189395648038e-05, "loss": 1.1082, "step": 807000 }, { "epoch": 23.71, "grad_norm": 6.527500152587891, "learning_rate": 6.3034801965149e-05, "loss": 1.1146, "step": 807500 }, { "epoch": 23.72, "grad_norm": 7.003795623779297, "learning_rate": 6.288770997381762e-05, "loss": 1.1137, "step": 808000 }, { "epoch": 23.74, "grad_norm": 2.606776714324951, "learning_rate": 6.274061798248624e-05, "loss": 1.0925, "step": 808500 }, { "epoch": 23.75, "grad_norm": 3.0098581314086914, "learning_rate": 6.259352599115486e-05, "loss": 1.1041, "step": 809000 }, { "epoch": 23.77, "grad_norm": 1.7633291482925415, "learning_rate": 6.244643399982348e-05, "loss": 1.1257, "step": 809500 }, { "epoch": 23.78, "grad_norm": 2.6187331676483154, "learning_rate": 6.22993420084921e-05, "loss": 1.0956, "step": 810000 }, { "epoch": 23.8, "grad_norm": 1.919297456741333, "learning_rate": 6.215225001716073e-05, "loss": 1.0954, "step": 810500 }, { "epoch": 23.81, "grad_norm": 3.422133445739746, "learning_rate": 6.200515802582935e-05, "loss": 1.0997, "step": 811000 }, { "epoch": 23.83, "grad_norm": 1.8277616500854492, "learning_rate": 6.185806603449797e-05, "loss": 1.1357, "step": 811500 }, { "epoch": 23.84, "grad_norm": 1.641455054283142, "learning_rate": 6.171097404316659e-05, "loss": 1.1186, "step": 812000 }, { "epoch": 23.86, "grad_norm": 2.629831075668335, "learning_rate": 6.156388205183521e-05, "loss": 1.0844, "step": 812500 }, { "epoch": 23.87, "grad_norm": 2.9981462955474854, "learning_rate": 6.141679006050383e-05, "loss": 1.1323, "step": 813000 }, { "epoch": 23.88, "grad_norm": 7.224218845367432, "learning_rate": 6.126969806917245e-05, "loss": 1.1091, "step": 813500 }, { "epoch": 23.9, "grad_norm": 1.445144772529602, "learning_rate": 6.112260607784107e-05, "loss": 1.0885, "step": 814000 }, { "epoch": 23.91, "grad_norm": 6.933565616607666, "learning_rate": 6.0975514086509695e-05, "loss": 1.1264, "step": 814500 }, { "epoch": 23.93, "grad_norm": 1.5660400390625, "learning_rate": 6.082842209517832e-05, "loss": 1.102, "step": 815000 }, { "epoch": 23.94, "grad_norm": 3.0154061317443848, "learning_rate": 6.068133010384694e-05, "loss": 1.1043, "step": 815500 }, { "epoch": 23.96, "grad_norm": 4.825801372528076, "learning_rate": 6.0534238112515565e-05, "loss": 1.1289, "step": 816000 }, { "epoch": 23.97, "grad_norm": 2.10929012298584, "learning_rate": 6.038714612118418e-05, "loss": 1.112, "step": 816500 }, { "epoch": 23.99, "grad_norm": 15.725834846496582, "learning_rate": 6.024005412985281e-05, "loss": 1.0857, "step": 817000 }, { "epoch": 24.0, "grad_norm": 3.745626926422119, "learning_rate": 6.009296213852142e-05, "loss": 1.0957, "step": 817500 }, { "epoch": 24.02, "grad_norm": 1.5650004148483276, "learning_rate": 5.994587014719005e-05, "loss": 1.0553, "step": 818000 }, { "epoch": 24.03, "grad_norm": 1.279994249343872, "learning_rate": 5.9798778155858664e-05, "loss": 1.0524, "step": 818500 }, { "epoch": 24.05, "grad_norm": 1.6556503772735596, "learning_rate": 5.965168616452729e-05, "loss": 1.0709, "step": 819000 }, { "epoch": 24.06, "grad_norm": 1.5529954433441162, "learning_rate": 5.9504594173195906e-05, "loss": 1.0952, "step": 819500 }, { "epoch": 24.08, "grad_norm": 1.7145531177520752, "learning_rate": 5.9357502181864534e-05, "loss": 1.0526, "step": 820000 }, { "epoch": 24.09, "grad_norm": 1.5692338943481445, "learning_rate": 5.921041019053315e-05, "loss": 1.0493, "step": 820500 }, { "epoch": 24.1, "grad_norm": 37.72039031982422, "learning_rate": 5.9063318199201776e-05, "loss": 1.068, "step": 821000 }, { "epoch": 24.12, "grad_norm": 2.0122039318084717, "learning_rate": 5.891622620787039e-05, "loss": 1.0775, "step": 821500 }, { "epoch": 24.13, "grad_norm": 5.577430248260498, "learning_rate": 5.876913421653902e-05, "loss": 1.1098, "step": 822000 }, { "epoch": 24.15, "grad_norm": 3.4199016094207764, "learning_rate": 5.8622042225207647e-05, "loss": 1.0893, "step": 822500 }, { "epoch": 24.16, "grad_norm": 3.654630184173584, "learning_rate": 5.847495023387626e-05, "loss": 1.0719, "step": 823000 }, { "epoch": 24.18, "grad_norm": 2.0985825061798096, "learning_rate": 5.832785824254489e-05, "loss": 1.0561, "step": 823500 }, { "epoch": 24.19, "grad_norm": 3.724655866622925, "learning_rate": 5.81807662512135e-05, "loss": 1.047, "step": 824000 }, { "epoch": 24.21, "grad_norm": 2.0398764610290527, "learning_rate": 5.803367425988213e-05, "loss": 1.0806, "step": 824500 }, { "epoch": 24.22, "grad_norm": 2.9711148738861084, "learning_rate": 5.7886582268550745e-05, "loss": 1.104, "step": 825000 }, { "epoch": 24.24, "grad_norm": 1.4870119094848633, "learning_rate": 5.773949027721937e-05, "loss": 1.0812, "step": 825500 }, { "epoch": 24.25, "grad_norm": 2.184882640838623, "learning_rate": 5.759239828588799e-05, "loss": 1.071, "step": 826000 }, { "epoch": 24.27, "grad_norm": 2.135782241821289, "learning_rate": 5.7445306294556616e-05, "loss": 1.081, "step": 826500 }, { "epoch": 24.28, "grad_norm": 1.952862024307251, "learning_rate": 5.729821430322523e-05, "loss": 1.0688, "step": 827000 }, { "epoch": 24.3, "grad_norm": 3.28511381149292, "learning_rate": 5.715112231189386e-05, "loss": 1.0719, "step": 827500 }, { "epoch": 24.31, "grad_norm": 3.095506191253662, "learning_rate": 5.700403032056247e-05, "loss": 1.1089, "step": 828000 }, { "epoch": 24.33, "grad_norm": 1.744498372077942, "learning_rate": 5.68569383292311e-05, "loss": 1.0666, "step": 828500 }, { "epoch": 24.34, "grad_norm": 1.5370303392410278, "learning_rate": 5.6709846337899714e-05, "loss": 1.0834, "step": 829000 }, { "epoch": 24.35, "grad_norm": 3.65459942817688, "learning_rate": 5.656275434656834e-05, "loss": 1.0767, "step": 829500 }, { "epoch": 24.37, "grad_norm": 2.0333411693573, "learning_rate": 5.641566235523696e-05, "loss": 1.0965, "step": 830000 }, { "epoch": 24.38, "grad_norm": 2.9158904552459717, "learning_rate": 5.6268570363905585e-05, "loss": 1.0726, "step": 830500 }, { "epoch": 24.4, "grad_norm": 2.0873067378997803, "learning_rate": 5.6121478372574206e-05, "loss": 1.0664, "step": 831000 }, { "epoch": 24.41, "grad_norm": 2.0477983951568604, "learning_rate": 5.597438638124283e-05, "loss": 1.0588, "step": 831500 }, { "epoch": 24.43, "grad_norm": 1.9659167528152466, "learning_rate": 5.582729438991145e-05, "loss": 1.0986, "step": 832000 }, { "epoch": 24.44, "grad_norm": 2.130729913711548, "learning_rate": 5.568020239858007e-05, "loss": 1.0713, "step": 832500 }, { "epoch": 24.46, "grad_norm": 99.76852416992188, "learning_rate": 5.553311040724869e-05, "loss": 1.0803, "step": 833000 }, { "epoch": 24.47, "grad_norm": 5.594266414642334, "learning_rate": 5.538601841591731e-05, "loss": 1.08, "step": 833500 }, { "epoch": 24.49, "grad_norm": 2.3737406730651855, "learning_rate": 5.523892642458593e-05, "loss": 1.0931, "step": 834000 }, { "epoch": 24.5, "grad_norm": 1.6415624618530273, "learning_rate": 5.5091834433254554e-05, "loss": 1.0798, "step": 834500 }, { "epoch": 24.52, "grad_norm": 7.322782516479492, "learning_rate": 5.4944742441923175e-05, "loss": 1.0865, "step": 835000 }, { "epoch": 24.53, "grad_norm": 2.1509947776794434, "learning_rate": 5.4797650450591796e-05, "loss": 1.0782, "step": 835500 }, { "epoch": 24.55, "grad_norm": 2.485328197479248, "learning_rate": 5.465055845926042e-05, "loss": 1.0585, "step": 836000 }, { "epoch": 24.56, "grad_norm": 2.5374958515167236, "learning_rate": 5.450346646792904e-05, "loss": 1.0577, "step": 836500 }, { "epoch": 24.57, "grad_norm": 2.196706533432007, "learning_rate": 5.435637447659766e-05, "loss": 1.0894, "step": 837000 }, { "epoch": 24.59, "grad_norm": 2.9057724475860596, "learning_rate": 5.420928248526629e-05, "loss": 1.075, "step": 837500 }, { "epoch": 24.6, "grad_norm": 2.4338793754577637, "learning_rate": 5.40621904939349e-05, "loss": 1.0829, "step": 838000 }, { "epoch": 24.62, "grad_norm": 2.225640058517456, "learning_rate": 5.391509850260353e-05, "loss": 1.0798, "step": 838500 }, { "epoch": 24.63, "grad_norm": 2.3546741008758545, "learning_rate": 5.3768006511272144e-05, "loss": 1.0887, "step": 839000 }, { "epoch": 24.65, "grad_norm": 1.3033528327941895, "learning_rate": 5.362091451994077e-05, "loss": 1.0809, "step": 839500 }, { "epoch": 24.66, "grad_norm": 2.5136497020721436, "learning_rate": 5.3473822528609386e-05, "loss": 1.0851, "step": 840000 }, { "epoch": 24.68, "grad_norm": 2.0871777534484863, "learning_rate": 5.3326730537278014e-05, "loss": 1.0744, "step": 840500 }, { "epoch": 24.69, "grad_norm": 5.352667808532715, "learning_rate": 5.317963854594663e-05, "loss": 1.102, "step": 841000 }, { "epoch": 24.71, "grad_norm": 1.8060741424560547, "learning_rate": 5.3032546554615256e-05, "loss": 1.0541, "step": 841500 }, { "epoch": 24.72, "grad_norm": 2.5160653591156006, "learning_rate": 5.288545456328387e-05, "loss": 1.077, "step": 842000 }, { "epoch": 24.74, "grad_norm": 4.854517936706543, "learning_rate": 5.27383625719525e-05, "loss": 1.084, "step": 842500 }, { "epoch": 24.75, "grad_norm": 7.554323196411133, "learning_rate": 5.259127058062111e-05, "loss": 1.0974, "step": 843000 }, { "epoch": 24.77, "grad_norm": 1.8624228239059448, "learning_rate": 5.244417858928974e-05, "loss": 1.0793, "step": 843500 }, { "epoch": 24.78, "grad_norm": 1.6777387857437134, "learning_rate": 5.2297086597958355e-05, "loss": 1.0746, "step": 844000 }, { "epoch": 24.79, "grad_norm": 1.9839929342269897, "learning_rate": 5.214999460662698e-05, "loss": 1.0813, "step": 844500 }, { "epoch": 24.81, "grad_norm": 1.5621155500411987, "learning_rate": 5.20029026152956e-05, "loss": 1.0936, "step": 845000 }, { "epoch": 24.82, "grad_norm": 2.5677313804626465, "learning_rate": 5.1855810623964225e-05, "loss": 1.0845, "step": 845500 }, { "epoch": 24.84, "grad_norm": 1.3594130277633667, "learning_rate": 5.170871863263284e-05, "loss": 1.0563, "step": 846000 }, { "epoch": 24.85, "grad_norm": 1.3905824422836304, "learning_rate": 5.156162664130147e-05, "loss": 1.0661, "step": 846500 }, { "epoch": 24.87, "grad_norm": 1.6563278436660767, "learning_rate": 5.141453464997008e-05, "loss": 1.079, "step": 847000 }, { "epoch": 24.88, "grad_norm": 2.1937708854675293, "learning_rate": 5.126744265863871e-05, "loss": 1.0714, "step": 847500 }, { "epoch": 24.9, "grad_norm": 1.874009609222412, "learning_rate": 5.112035066730733e-05, "loss": 1.064, "step": 848000 }, { "epoch": 24.91, "grad_norm": 1.4228936433792114, "learning_rate": 5.097325867597595e-05, "loss": 1.0954, "step": 848500 }, { "epoch": 24.93, "grad_norm": 1.7444462776184082, "learning_rate": 5.082616668464457e-05, "loss": 1.0921, "step": 849000 }, { "epoch": 24.94, "grad_norm": 186.56033325195312, "learning_rate": 5.0679074693313194e-05, "loss": 1.0757, "step": 849500 }, { "epoch": 24.96, "grad_norm": 4.014658451080322, "learning_rate": 5.0531982701981815e-05, "loss": 1.0864, "step": 850000 }, { "epoch": 24.97, "grad_norm": 1.8258497714996338, "learning_rate": 5.0384890710650436e-05, "loss": 1.0691, "step": 850500 }, { "epoch": 24.99, "grad_norm": 2.100780487060547, "learning_rate": 5.023779871931906e-05, "loss": 1.0722, "step": 851000 }, { "epoch": 25.0, "grad_norm": 2.250474452972412, "learning_rate": 5.009070672798768e-05, "loss": 1.0853, "step": 851500 }, { "epoch": 25.02, "grad_norm": 2.6746740341186523, "learning_rate": 4.99436147366563e-05, "loss": 1.0277, "step": 852000 }, { "epoch": 25.03, "grad_norm": 2.413872480392456, "learning_rate": 4.979652274532492e-05, "loss": 1.0804, "step": 852500 }, { "epoch": 25.04, "grad_norm": 3.563396453857422, "learning_rate": 4.964943075399354e-05, "loss": 1.0638, "step": 853000 }, { "epoch": 25.06, "grad_norm": 1.7918055057525635, "learning_rate": 4.950233876266217e-05, "loss": 1.0541, "step": 853500 }, { "epoch": 25.07, "grad_norm": 2.508848190307617, "learning_rate": 4.9355246771330784e-05, "loss": 1.0083, "step": 854000 }, { "epoch": 25.09, "grad_norm": 9.781774520874023, "learning_rate": 4.920815477999941e-05, "loss": 1.0236, "step": 854500 }, { "epoch": 25.1, "grad_norm": 2.6363613605499268, "learning_rate": 4.9061062788668026e-05, "loss": 1.0356, "step": 855000 }, { "epoch": 25.12, "grad_norm": 5.132301330566406, "learning_rate": 4.8913970797336654e-05, "loss": 1.0509, "step": 855500 }, { "epoch": 25.13, "grad_norm": 2.983057737350464, "learning_rate": 4.876687880600527e-05, "loss": 1.0486, "step": 856000 }, { "epoch": 25.15, "grad_norm": 3.175165891647339, "learning_rate": 4.8619786814673897e-05, "loss": 1.0521, "step": 856500 }, { "epoch": 25.16, "grad_norm": 5.045122146606445, "learning_rate": 4.847269482334251e-05, "loss": 1.0494, "step": 857000 }, { "epoch": 25.18, "grad_norm": 2.601104259490967, "learning_rate": 4.832560283201114e-05, "loss": 1.0301, "step": 857500 }, { "epoch": 25.19, "grad_norm": 2.546464443206787, "learning_rate": 4.817851084067975e-05, "loss": 1.0237, "step": 858000 }, { "epoch": 25.21, "grad_norm": 2.9643750190734863, "learning_rate": 4.803141884934838e-05, "loss": 1.0573, "step": 858500 }, { "epoch": 25.22, "grad_norm": 5.178891658782959, "learning_rate": 4.7884326858016995e-05, "loss": 1.0588, "step": 859000 }, { "epoch": 25.24, "grad_norm": 2.786492347717285, "learning_rate": 4.773723486668562e-05, "loss": 1.0537, "step": 859500 }, { "epoch": 25.25, "grad_norm": 3.3391730785369873, "learning_rate": 4.759014287535424e-05, "loss": 1.0452, "step": 860000 }, { "epoch": 25.26, "grad_norm": 2.8821425437927246, "learning_rate": 4.7443050884022866e-05, "loss": 1.0538, "step": 860500 }, { "epoch": 25.28, "grad_norm": 8.122342109680176, "learning_rate": 4.729595889269148e-05, "loss": 1.0326, "step": 861000 }, { "epoch": 25.29, "grad_norm": 5.042423725128174, "learning_rate": 4.714886690136011e-05, "loss": 1.0586, "step": 861500 }, { "epoch": 25.31, "grad_norm": 3.068471670150757, "learning_rate": 4.700177491002872e-05, "loss": 1.032, "step": 862000 }, { "epoch": 25.32, "grad_norm": 3.833307981491089, "learning_rate": 4.685468291869735e-05, "loss": 1.0581, "step": 862500 }, { "epoch": 25.34, "grad_norm": 2.662344455718994, "learning_rate": 4.6707590927365964e-05, "loss": 1.0526, "step": 863000 }, { "epoch": 25.35, "grad_norm": 3.626770257949829, "learning_rate": 4.656049893603459e-05, "loss": 1.0484, "step": 863500 }, { "epoch": 25.37, "grad_norm": 2.1823232173919678, "learning_rate": 4.6413406944703213e-05, "loss": 1.0524, "step": 864000 }, { "epoch": 25.38, "grad_norm": 2.051774740219116, "learning_rate": 4.6266314953371835e-05, "loss": 1.0406, "step": 864500 }, { "epoch": 25.4, "grad_norm": 9.218499183654785, "learning_rate": 4.6119222962040456e-05, "loss": 1.0605, "step": 865000 }, { "epoch": 25.41, "grad_norm": 2.4243879318237305, "learning_rate": 4.597213097070908e-05, "loss": 1.033, "step": 865500 }, { "epoch": 25.43, "grad_norm": 5.0782599449157715, "learning_rate": 4.58250389793777e-05, "loss": 1.059, "step": 866000 }, { "epoch": 25.44, "grad_norm": 2.4752907752990723, "learning_rate": 4.567794698804632e-05, "loss": 1.0348, "step": 866500 }, { "epoch": 25.46, "grad_norm": 3.958376407623291, "learning_rate": 4.553085499671494e-05, "loss": 1.0677, "step": 867000 }, { "epoch": 25.47, "grad_norm": 2.9827880859375, "learning_rate": 4.538376300538356e-05, "loss": 1.0371, "step": 867500 }, { "epoch": 25.48, "grad_norm": 4.071894645690918, "learning_rate": 4.523667101405218e-05, "loss": 1.0578, "step": 868000 }, { "epoch": 25.5, "grad_norm": 34.7899169921875, "learning_rate": 4.5089579022720804e-05, "loss": 1.0903, "step": 868500 }, { "epoch": 25.51, "grad_norm": 2.133716344833374, "learning_rate": 4.4942487031389425e-05, "loss": 1.0471, "step": 869000 }, { "epoch": 25.53, "grad_norm": 2.6652045249938965, "learning_rate": 4.4795395040058046e-05, "loss": 1.0358, "step": 869500 }, { "epoch": 25.54, "grad_norm": 2.98919677734375, "learning_rate": 4.464830304872667e-05, "loss": 1.0621, "step": 870000 }, { "epoch": 25.56, "grad_norm": 2.7026760578155518, "learning_rate": 4.4501211057395295e-05, "loss": 1.0621, "step": 870500 }, { "epoch": 25.57, "grad_norm": 3.262974977493286, "learning_rate": 4.435411906606391e-05, "loss": 1.0321, "step": 871000 }, { "epoch": 25.59, "grad_norm": 1.9586899280548096, "learning_rate": 4.420702707473254e-05, "loss": 1.0347, "step": 871500 }, { "epoch": 25.6, "grad_norm": 7.560853481292725, "learning_rate": 4.405993508340115e-05, "loss": 1.0221, "step": 872000 }, { "epoch": 25.62, "grad_norm": 3.056516408920288, "learning_rate": 4.391284309206978e-05, "loss": 1.043, "step": 872500 }, { "epoch": 25.63, "grad_norm": 2.7999415397644043, "learning_rate": 4.3765751100738394e-05, "loss": 1.0515, "step": 873000 }, { "epoch": 25.65, "grad_norm": 2.5458314418792725, "learning_rate": 4.361865910940702e-05, "loss": 1.0599, "step": 873500 }, { "epoch": 25.66, "grad_norm": 3.634589910507202, "learning_rate": 4.3471567118075636e-05, "loss": 1.0428, "step": 874000 }, { "epoch": 25.68, "grad_norm": 2.8104031085968018, "learning_rate": 4.3324475126744264e-05, "loss": 1.0396, "step": 874500 }, { "epoch": 25.69, "grad_norm": 2.319333791732788, "learning_rate": 4.317738313541288e-05, "loss": 1.0426, "step": 875000 }, { "epoch": 25.71, "grad_norm": 8.262571334838867, "learning_rate": 4.3030291144081506e-05, "loss": 1.0616, "step": 875500 }, { "epoch": 25.72, "grad_norm": 3.307812213897705, "learning_rate": 4.288319915275012e-05, "loss": 1.0706, "step": 876000 }, { "epoch": 25.73, "grad_norm": 7.4953508377075195, "learning_rate": 4.273610716141875e-05, "loss": 1.0457, "step": 876500 }, { "epoch": 25.75, "grad_norm": 3.5895800590515137, "learning_rate": 4.258901517008736e-05, "loss": 1.0219, "step": 877000 }, { "epoch": 25.76, "grad_norm": 3.023827075958252, "learning_rate": 4.244192317875599e-05, "loss": 1.05, "step": 877500 }, { "epoch": 25.78, "grad_norm": 2.1299614906311035, "learning_rate": 4.2294831187424605e-05, "loss": 1.0505, "step": 878000 }, { "epoch": 25.79, "grad_norm": 3.042360305786133, "learning_rate": 4.214773919609323e-05, "loss": 1.0481, "step": 878500 }, { "epoch": 25.81, "grad_norm": 2.3197755813598633, "learning_rate": 4.200064720476185e-05, "loss": 1.0181, "step": 879000 }, { "epoch": 25.82, "grad_norm": 3.230426788330078, "learning_rate": 4.1853555213430475e-05, "loss": 1.0327, "step": 879500 }, { "epoch": 25.84, "grad_norm": 7.685334205627441, "learning_rate": 4.1706463222099096e-05, "loss": 1.0334, "step": 880000 }, { "epoch": 25.85, "grad_norm": 3.2607569694519043, "learning_rate": 4.155937123076772e-05, "loss": 1.0342, "step": 880500 }, { "epoch": 25.87, "grad_norm": 3.470470428466797, "learning_rate": 4.1412279239436345e-05, "loss": 1.0508, "step": 881000 }, { "epoch": 25.88, "grad_norm": 16.141782760620117, "learning_rate": 4.126518724810496e-05, "loss": 1.046, "step": 881500 }, { "epoch": 25.9, "grad_norm": 2.900520086288452, "learning_rate": 4.111809525677359e-05, "loss": 1.0791, "step": 882000 }, { "epoch": 25.91, "grad_norm": 2.5581626892089844, "learning_rate": 4.09710032654422e-05, "loss": 1.0386, "step": 882500 }, { "epoch": 25.93, "grad_norm": 2.301318883895874, "learning_rate": 4.082391127411083e-05, "loss": 1.078, "step": 883000 }, { "epoch": 25.94, "grad_norm": 2.5168445110321045, "learning_rate": 4.0676819282779444e-05, "loss": 1.0524, "step": 883500 }, { "epoch": 25.95, "grad_norm": 3.2487659454345703, "learning_rate": 4.052972729144807e-05, "loss": 1.0476, "step": 884000 }, { "epoch": 25.97, "grad_norm": 3.8711748123168945, "learning_rate": 4.0382635300116686e-05, "loss": 1.0349, "step": 884500 }, { "epoch": 25.98, "grad_norm": 2.8647818565368652, "learning_rate": 4.0235543308785314e-05, "loss": 1.0681, "step": 885000 }, { "epoch": 26.0, "grad_norm": 2.307555675506592, "learning_rate": 4.008845131745393e-05, "loss": 1.0299, "step": 885500 }, { "epoch": 26.01, "grad_norm": 5.377994537353516, "learning_rate": 3.9941359326122557e-05, "loss": 1.0274, "step": 886000 }, { "epoch": 26.03, "grad_norm": 3.1154091358184814, "learning_rate": 3.979426733479118e-05, "loss": 1.0313, "step": 886500 }, { "epoch": 26.04, "grad_norm": 1.3295516967773438, "learning_rate": 3.96471753434598e-05, "loss": 0.9962, "step": 887000 }, { "epoch": 26.06, "grad_norm": 2.4110894203186035, "learning_rate": 3.950008335212842e-05, "loss": 1.0051, "step": 887500 }, { "epoch": 26.07, "grad_norm": 1.6697067022323608, "learning_rate": 3.935299136079704e-05, "loss": 1.0119, "step": 888000 }, { "epoch": 26.09, "grad_norm": 2.4017999172210693, "learning_rate": 3.920589936946566e-05, "loss": 1.0253, "step": 888500 }, { "epoch": 26.1, "grad_norm": 3.1681103706359863, "learning_rate": 3.905880737813428e-05, "loss": 1.0213, "step": 889000 }, { "epoch": 26.12, "grad_norm": 2.9184956550598145, "learning_rate": 3.8911715386802904e-05, "loss": 1.0298, "step": 889500 }, { "epoch": 26.13, "grad_norm": 1.6290541887283325, "learning_rate": 3.8764623395471526e-05, "loss": 1.0094, "step": 890000 }, { "epoch": 26.15, "grad_norm": 2.779973030090332, "learning_rate": 3.861753140414015e-05, "loss": 1.016, "step": 890500 }, { "epoch": 26.16, "grad_norm": 4.393610000610352, "learning_rate": 3.847043941280877e-05, "loss": 1.0131, "step": 891000 }, { "epoch": 26.17, "grad_norm": 2.503702402114868, "learning_rate": 3.832334742147739e-05, "loss": 1.0207, "step": 891500 }, { "epoch": 26.19, "grad_norm": 1.3769996166229248, "learning_rate": 3.817625543014601e-05, "loss": 1.0305, "step": 892000 }, { "epoch": 26.2, "grad_norm": 9.221471786499023, "learning_rate": 3.802916343881463e-05, "loss": 1.0258, "step": 892500 }, { "epoch": 26.22, "grad_norm": 1.706425666809082, "learning_rate": 3.788207144748326e-05, "loss": 0.9808, "step": 893000 }, { "epoch": 26.23, "grad_norm": 15.878872871398926, "learning_rate": 3.7734979456151873e-05, "loss": 1.0225, "step": 893500 }, { "epoch": 26.25, "grad_norm": 9.141250610351562, "learning_rate": 3.75878874648205e-05, "loss": 1.0234, "step": 894000 }, { "epoch": 26.26, "grad_norm": 1.698013424873352, "learning_rate": 3.7440795473489116e-05, "loss": 1.0207, "step": 894500 }, { "epoch": 26.28, "grad_norm": 2.8760321140289307, "learning_rate": 3.729370348215774e-05, "loss": 1.0054, "step": 895000 }, { "epoch": 26.29, "grad_norm": 10.192665100097656, "learning_rate": 3.714661149082636e-05, "loss": 1.0202, "step": 895500 }, { "epoch": 26.31, "grad_norm": 1.8373937606811523, "learning_rate": 3.699951949949498e-05, "loss": 1.0399, "step": 896000 }, { "epoch": 26.32, "grad_norm": 0.8489872217178345, "learning_rate": 3.68524275081636e-05, "loss": 1.0525, "step": 896500 }, { "epoch": 26.34, "grad_norm": 1.7799832820892334, "learning_rate": 3.670533551683222e-05, "loss": 1.0271, "step": 897000 }, { "epoch": 26.35, "grad_norm": 1.6424106359481812, "learning_rate": 3.655824352550084e-05, "loss": 1.0093, "step": 897500 }, { "epoch": 26.37, "grad_norm": 4.4057135581970215, "learning_rate": 3.6411151534169464e-05, "loss": 1.0214, "step": 898000 }, { "epoch": 26.38, "grad_norm": 1.971706509590149, "learning_rate": 3.6264059542838085e-05, "loss": 1.0316, "step": 898500 }, { "epoch": 26.39, "grad_norm": 1.934280276298523, "learning_rate": 3.6116967551506706e-05, "loss": 1.0368, "step": 899000 }, { "epoch": 26.41, "grad_norm": 1.6416791677474976, "learning_rate": 3.596987556017533e-05, "loss": 1.0123, "step": 899500 }, { "epoch": 26.42, "grad_norm": 1.6787991523742676, "learning_rate": 3.582278356884395e-05, "loss": 1.0026, "step": 900000 }, { "epoch": 26.44, "grad_norm": 2.6135306358337402, "learning_rate": 3.567569157751257e-05, "loss": 0.9887, "step": 900500 }, { "epoch": 26.45, "grad_norm": 1.4282991886138916, "learning_rate": 3.552859958618119e-05, "loss": 1.0442, "step": 901000 }, { "epoch": 26.47, "grad_norm": 1.8328664302825928, "learning_rate": 3.538150759484981e-05, "loss": 0.9983, "step": 901500 }, { "epoch": 26.48, "grad_norm": 2.659299850463867, "learning_rate": 3.523441560351843e-05, "loss": 1.0397, "step": 902000 }, { "epoch": 26.5, "grad_norm": 2.29178524017334, "learning_rate": 3.508732361218706e-05, "loss": 1.0182, "step": 902500 }, { "epoch": 26.51, "grad_norm": 9.058496475219727, "learning_rate": 3.494023162085568e-05, "loss": 1.0113, "step": 903000 }, { "epoch": 26.53, "grad_norm": 1.5530019998550415, "learning_rate": 3.47931396295243e-05, "loss": 1.0243, "step": 903500 }, { "epoch": 26.54, "grad_norm": 1.9570651054382324, "learning_rate": 3.4646047638192924e-05, "loss": 1.0012, "step": 904000 }, { "epoch": 26.56, "grad_norm": 1.154820203781128, "learning_rate": 3.4498955646861545e-05, "loss": 1.0231, "step": 904500 }, { "epoch": 26.57, "grad_norm": 1.5875574350357056, "learning_rate": 3.4351863655530166e-05, "loss": 1.0239, "step": 905000 }, { "epoch": 26.59, "grad_norm": 2.007080554962158, "learning_rate": 3.420477166419879e-05, "loss": 1.0119, "step": 905500 }, { "epoch": 26.6, "grad_norm": 1.6997588872909546, "learning_rate": 3.405767967286741e-05, "loss": 1.0235, "step": 906000 }, { "epoch": 26.62, "grad_norm": 2.8754279613494873, "learning_rate": 3.391058768153603e-05, "loss": 1.0152, "step": 906500 }, { "epoch": 26.63, "grad_norm": 3.1198441982269287, "learning_rate": 3.376349569020465e-05, "loss": 1.0351, "step": 907000 }, { "epoch": 26.64, "grad_norm": 11.747318267822266, "learning_rate": 3.361640369887327e-05, "loss": 1.0166, "step": 907500 }, { "epoch": 26.66, "grad_norm": 1.7430450916290283, "learning_rate": 3.346931170754189e-05, "loss": 1.0268, "step": 908000 }, { "epoch": 26.67, "grad_norm": 1.4505376815795898, "learning_rate": 3.3322219716210514e-05, "loss": 1.0297, "step": 908500 }, { "epoch": 26.69, "grad_norm": 4.441274642944336, "learning_rate": 3.3175127724879135e-05, "loss": 1.0145, "step": 909000 }, { "epoch": 26.7, "grad_norm": 4.974277973175049, "learning_rate": 3.302803573354776e-05, "loss": 1.0163, "step": 909500 }, { "epoch": 26.72, "grad_norm": 2.1050703525543213, "learning_rate": 3.2880943742216384e-05, "loss": 1.0125, "step": 910000 }, { "epoch": 26.73, "grad_norm": 4.370170593261719, "learning_rate": 3.2733851750885005e-05, "loss": 1.0134, "step": 910500 }, { "epoch": 26.75, "grad_norm": 1.755359172821045, "learning_rate": 3.2586759759553626e-05, "loss": 1.0174, "step": 911000 }, { "epoch": 26.76, "grad_norm": 2.099271774291992, "learning_rate": 3.243966776822225e-05, "loss": 1.0142, "step": 911500 }, { "epoch": 26.78, "grad_norm": 1.560284972190857, "learning_rate": 3.229257577689087e-05, "loss": 1.0393, "step": 912000 }, { "epoch": 26.79, "grad_norm": 2.1582424640655518, "learning_rate": 3.214548378555949e-05, "loss": 1.016, "step": 912500 }, { "epoch": 26.81, "grad_norm": 2.342576026916504, "learning_rate": 3.199839179422811e-05, "loss": 1.0262, "step": 913000 }, { "epoch": 26.82, "grad_norm": 1.4274934530258179, "learning_rate": 3.185129980289673e-05, "loss": 0.9991, "step": 913500 }, { "epoch": 26.84, "grad_norm": 3.858720064163208, "learning_rate": 3.170420781156535e-05, "loss": 1.0171, "step": 914000 }, { "epoch": 26.85, "grad_norm": 2.058018922805786, "learning_rate": 3.1557115820233974e-05, "loss": 1.0263, "step": 914500 }, { "epoch": 26.86, "grad_norm": 2.6626269817352295, "learning_rate": 3.1410023828902595e-05, "loss": 1.0257, "step": 915000 }, { "epoch": 26.88, "grad_norm": 5.859118938446045, "learning_rate": 3.1262931837571216e-05, "loss": 1.023, "step": 915500 }, { "epoch": 26.89, "grad_norm": 6.2187957763671875, "learning_rate": 3.111583984623984e-05, "loss": 1.0107, "step": 916000 }, { "epoch": 26.91, "grad_norm": 2.6373274326324463, "learning_rate": 3.096874785490846e-05, "loss": 1.0172, "step": 916500 }, { "epoch": 26.92, "grad_norm": 3.272045850753784, "learning_rate": 3.082165586357708e-05, "loss": 1.0175, "step": 917000 }, { "epoch": 26.94, "grad_norm": 1.6807574033737183, "learning_rate": 3.06745638722457e-05, "loss": 1.0295, "step": 917500 }, { "epoch": 26.95, "grad_norm": 4.664640426635742, "learning_rate": 3.052747188091432e-05, "loss": 1.0368, "step": 918000 }, { "epoch": 26.97, "grad_norm": 1.5610125064849854, "learning_rate": 3.0380379889582943e-05, "loss": 1.0365, "step": 918500 }, { "epoch": 26.98, "grad_norm": 2.0391860008239746, "learning_rate": 3.0233287898251564e-05, "loss": 1.0289, "step": 919000 }, { "epoch": 27.0, "grad_norm": 47.55400848388672, "learning_rate": 3.0086195906920185e-05, "loss": 1.0292, "step": 919500 }, { "epoch": 27.01, "grad_norm": 2.6204073429107666, "learning_rate": 2.9939103915588807e-05, "loss": 1.0069, "step": 920000 }, { "epoch": 27.03, "grad_norm": 9.747673034667969, "learning_rate": 2.9792011924257428e-05, "loss": 0.9997, "step": 920500 }, { "epoch": 27.04, "grad_norm": 2.057465076446533, "learning_rate": 2.9644919932926052e-05, "loss": 0.9993, "step": 921000 }, { "epoch": 27.06, "grad_norm": 8.264533996582031, "learning_rate": 2.9497827941594673e-05, "loss": 0.9862, "step": 921500 }, { "epoch": 27.07, "grad_norm": 4.143492698669434, "learning_rate": 2.9350735950263294e-05, "loss": 0.9774, "step": 922000 }, { "epoch": 27.08, "grad_norm": 2.2518653869628906, "learning_rate": 2.9203643958931916e-05, "loss": 1.0113, "step": 922500 }, { "epoch": 27.1, "grad_norm": 2.745732307434082, "learning_rate": 2.9056551967600537e-05, "loss": 0.9734, "step": 923000 }, { "epoch": 27.11, "grad_norm": 1.3990339040756226, "learning_rate": 2.8909459976269158e-05, "loss": 0.9918, "step": 923500 }, { "epoch": 27.13, "grad_norm": 2.024524211883545, "learning_rate": 2.876236798493778e-05, "loss": 1.0033, "step": 924000 }, { "epoch": 27.14, "grad_norm": 2.5025253295898438, "learning_rate": 2.86152759936064e-05, "loss": 1.0004, "step": 924500 }, { "epoch": 27.16, "grad_norm": 2.7478108406066895, "learning_rate": 2.846818400227502e-05, "loss": 0.9901, "step": 925000 }, { "epoch": 27.17, "grad_norm": 1.7393391132354736, "learning_rate": 2.8321092010943642e-05, "loss": 1.0035, "step": 925500 }, { "epoch": 27.19, "grad_norm": 1.3920613527297974, "learning_rate": 2.8174000019612263e-05, "loss": 1.0194, "step": 926000 }, { "epoch": 27.2, "grad_norm": 2.2148451805114746, "learning_rate": 2.8026908028280885e-05, "loss": 0.9858, "step": 926500 }, { "epoch": 27.22, "grad_norm": 1.7391964197158813, "learning_rate": 2.7879816036949506e-05, "loss": 1.0069, "step": 927000 }, { "epoch": 27.23, "grad_norm": 2.359649419784546, "learning_rate": 2.7732724045618127e-05, "loss": 0.9989, "step": 927500 }, { "epoch": 27.25, "grad_norm": 2.8471827507019043, "learning_rate": 2.7585632054286748e-05, "loss": 0.9832, "step": 928000 }, { "epoch": 27.26, "grad_norm": 1.788493037223816, "learning_rate": 2.743854006295537e-05, "loss": 1.0104, "step": 928500 }, { "epoch": 27.28, "grad_norm": 1.7663925886154175, "learning_rate": 2.7291448071623994e-05, "loss": 1.0031, "step": 929000 }, { "epoch": 27.29, "grad_norm": 2.699171304702759, "learning_rate": 2.7144356080292615e-05, "loss": 1.0037, "step": 929500 }, { "epoch": 27.31, "grad_norm": 12.16844367980957, "learning_rate": 2.6997264088961236e-05, "loss": 1.0109, "step": 930000 }, { "epoch": 27.32, "grad_norm": 2.0973167419433594, "learning_rate": 2.6850172097629857e-05, "loss": 1.0123, "step": 930500 }, { "epoch": 27.33, "grad_norm": 1.7330328226089478, "learning_rate": 2.6703080106298478e-05, "loss": 1.0055, "step": 931000 }, { "epoch": 27.35, "grad_norm": 5.502919673919678, "learning_rate": 2.65559881149671e-05, "loss": 1.003, "step": 931500 }, { "epoch": 27.36, "grad_norm": 2.3308217525482178, "learning_rate": 2.640889612363572e-05, "loss": 1.0059, "step": 932000 }, { "epoch": 27.38, "grad_norm": 1.6248698234558105, "learning_rate": 2.626180413230434e-05, "loss": 0.9849, "step": 932500 }, { "epoch": 27.39, "grad_norm": 3.2510266304016113, "learning_rate": 2.6114712140972963e-05, "loss": 0.9862, "step": 933000 }, { "epoch": 27.41, "grad_norm": 2.0555245876312256, "learning_rate": 2.5967620149641584e-05, "loss": 0.9844, "step": 933500 }, { "epoch": 27.42, "grad_norm": 7.706765651702881, "learning_rate": 2.5820528158310205e-05, "loss": 0.9913, "step": 934000 }, { "epoch": 27.44, "grad_norm": 1.9011601209640503, "learning_rate": 2.5673436166978826e-05, "loss": 1.0102, "step": 934500 }, { "epoch": 27.45, "grad_norm": 6.408154010772705, "learning_rate": 2.5526344175647447e-05, "loss": 0.996, "step": 935000 }, { "epoch": 27.47, "grad_norm": 6.746776580810547, "learning_rate": 2.5379252184316068e-05, "loss": 1.0273, "step": 935500 }, { "epoch": 27.48, "grad_norm": 33.82996368408203, "learning_rate": 2.523216019298469e-05, "loss": 1.007, "step": 936000 }, { "epoch": 27.5, "grad_norm": 1.6131765842437744, "learning_rate": 2.508506820165331e-05, "loss": 0.9855, "step": 936500 }, { "epoch": 27.51, "grad_norm": 2.5696966648101807, "learning_rate": 2.493797621032193e-05, "loss": 1.0153, "step": 937000 }, { "epoch": 27.53, "grad_norm": 1.5355910062789917, "learning_rate": 2.4790884218990556e-05, "loss": 0.9714, "step": 937500 }, { "epoch": 27.54, "grad_norm": 2.0435054302215576, "learning_rate": 2.4643792227659177e-05, "loss": 0.9997, "step": 938000 }, { "epoch": 27.55, "grad_norm": 1.6511478424072266, "learning_rate": 2.44967002363278e-05, "loss": 0.9865, "step": 938500 }, { "epoch": 27.57, "grad_norm": 1.9227514266967773, "learning_rate": 2.434960824499642e-05, "loss": 1.0103, "step": 939000 }, { "epoch": 27.58, "grad_norm": 16.868221282958984, "learning_rate": 2.420251625366504e-05, "loss": 0.9854, "step": 939500 }, { "epoch": 27.6, "grad_norm": 1.7769354581832886, "learning_rate": 2.4055424262333662e-05, "loss": 1.0176, "step": 940000 }, { "epoch": 27.61, "grad_norm": 1.6719156503677368, "learning_rate": 2.3908332271002283e-05, "loss": 0.9951, "step": 940500 }, { "epoch": 27.63, "grad_norm": 3.5106630325317383, "learning_rate": 2.3761240279670904e-05, "loss": 0.994, "step": 941000 }, { "epoch": 27.64, "grad_norm": 3.102487564086914, "learning_rate": 2.3614148288339525e-05, "loss": 0.9802, "step": 941500 }, { "epoch": 27.66, "grad_norm": 12.020054817199707, "learning_rate": 2.3467056297008146e-05, "loss": 0.9997, "step": 942000 }, { "epoch": 27.67, "grad_norm": 2.8283021450042725, "learning_rate": 2.3319964305676767e-05, "loss": 0.9892, "step": 942500 }, { "epoch": 27.69, "grad_norm": 37.08485412597656, "learning_rate": 2.317287231434539e-05, "loss": 0.9969, "step": 943000 }, { "epoch": 27.7, "grad_norm": 1.4365429878234863, "learning_rate": 2.302578032301401e-05, "loss": 0.9748, "step": 943500 }, { "epoch": 27.72, "grad_norm": 1.8424559831619263, "learning_rate": 2.287868833168263e-05, "loss": 0.9741, "step": 944000 }, { "epoch": 27.73, "grad_norm": 2.4849884510040283, "learning_rate": 2.2731596340351252e-05, "loss": 0.9896, "step": 944500 }, { "epoch": 27.75, "grad_norm": 2.0955851078033447, "learning_rate": 2.2584504349019873e-05, "loss": 1.012, "step": 945000 }, { "epoch": 27.76, "grad_norm": 5.421468257904053, "learning_rate": 2.2437412357688498e-05, "loss": 0.9681, "step": 945500 }, { "epoch": 27.77, "grad_norm": 3.2725253105163574, "learning_rate": 2.229032036635712e-05, "loss": 1.0059, "step": 946000 }, { "epoch": 27.79, "grad_norm": 2.741119861602783, "learning_rate": 2.214322837502574e-05, "loss": 0.9933, "step": 946500 }, { "epoch": 27.8, "grad_norm": 1.600335717201233, "learning_rate": 2.199613638369436e-05, "loss": 0.997, "step": 947000 }, { "epoch": 27.82, "grad_norm": 3.4619009494781494, "learning_rate": 2.1849044392362982e-05, "loss": 0.9976, "step": 947500 }, { "epoch": 27.83, "grad_norm": 10.748187065124512, "learning_rate": 2.1701952401031603e-05, "loss": 0.9727, "step": 948000 }, { "epoch": 27.85, "grad_norm": 3.452791690826416, "learning_rate": 2.1554860409700224e-05, "loss": 1.001, "step": 948500 }, { "epoch": 27.86, "grad_norm": 1.3464235067367554, "learning_rate": 2.1407768418368845e-05, "loss": 0.9876, "step": 949000 }, { "epoch": 27.88, "grad_norm": 1.6416382789611816, "learning_rate": 2.1260676427037467e-05, "loss": 1.0062, "step": 949500 }, { "epoch": 27.89, "grad_norm": 1.434910774230957, "learning_rate": 2.1113584435706088e-05, "loss": 0.9934, "step": 950000 }, { "epoch": 27.91, "grad_norm": 5.8801045417785645, "learning_rate": 2.096649244437471e-05, "loss": 0.9899, "step": 950500 }, { "epoch": 27.92, "grad_norm": 1.864310622215271, "learning_rate": 2.081940045304333e-05, "loss": 1.013, "step": 951000 }, { "epoch": 27.94, "grad_norm": 2.070906639099121, "learning_rate": 2.067230846171195e-05, "loss": 0.9947, "step": 951500 }, { "epoch": 27.95, "grad_norm": 2.5215396881103516, "learning_rate": 2.0525216470380572e-05, "loss": 0.984, "step": 952000 }, { "epoch": 27.97, "grad_norm": 1.2144149541854858, "learning_rate": 2.0378124479049193e-05, "loss": 0.9993, "step": 952500 }, { "epoch": 27.98, "grad_norm": 2.4629998207092285, "learning_rate": 2.0231032487717814e-05, "loss": 1.0146, "step": 953000 }, { "epoch": 28.0, "grad_norm": 4.50878381729126, "learning_rate": 2.008394049638644e-05, "loss": 1.0005, "step": 953500 }, { "epoch": 28.01, "grad_norm": 2.623155355453491, "learning_rate": 1.993684850505506e-05, "loss": 0.9928, "step": 954000 }, { "epoch": 28.02, "grad_norm": 2.886497735977173, "learning_rate": 1.978975651372368e-05, "loss": 0.9886, "step": 954500 }, { "epoch": 28.04, "grad_norm": 2.8885116577148438, "learning_rate": 1.9642664522392302e-05, "loss": 0.9788, "step": 955000 }, { "epoch": 28.05, "grad_norm": 1.946003794670105, "learning_rate": 1.9495572531060923e-05, "loss": 0.9647, "step": 955500 }, { "epoch": 28.07, "grad_norm": 12.426846504211426, "learning_rate": 1.9348480539729545e-05, "loss": 0.9753, "step": 956000 }, { "epoch": 28.08, "grad_norm": 1.905735969543457, "learning_rate": 1.9201388548398166e-05, "loss": 0.9855, "step": 956500 }, { "epoch": 28.1, "grad_norm": 2.33889102935791, "learning_rate": 1.9054296557066787e-05, "loss": 0.9744, "step": 957000 }, { "epoch": 28.11, "grad_norm": 2.3063101768493652, "learning_rate": 1.8907204565735408e-05, "loss": 0.9702, "step": 957500 }, { "epoch": 28.13, "grad_norm": 2.292107582092285, "learning_rate": 1.876011257440403e-05, "loss": 0.9888, "step": 958000 }, { "epoch": 28.14, "grad_norm": 2.144972324371338, "learning_rate": 1.8613020583072654e-05, "loss": 0.9754, "step": 958500 }, { "epoch": 28.16, "grad_norm": 1.5302867889404297, "learning_rate": 1.8465928591741275e-05, "loss": 0.9816, "step": 959000 }, { "epoch": 28.17, "grad_norm": 1.6462048292160034, "learning_rate": 1.8318836600409896e-05, "loss": 0.9403, "step": 959500 }, { "epoch": 28.19, "grad_norm": 2.6783134937286377, "learning_rate": 1.8171744609078517e-05, "loss": 0.9762, "step": 960000 }, { "epoch": 28.2, "grad_norm": 2.0703890323638916, "learning_rate": 1.8024652617747138e-05, "loss": 0.9756, "step": 960500 }, { "epoch": 28.22, "grad_norm": 2.4419829845428467, "learning_rate": 1.787756062641576e-05, "loss": 0.9684, "step": 961000 }, { "epoch": 28.23, "grad_norm": 4.775966644287109, "learning_rate": 1.773046863508438e-05, "loss": 0.9912, "step": 961500 }, { "epoch": 28.24, "grad_norm": 1.0155693292617798, "learning_rate": 1.7583376643753e-05, "loss": 0.97, "step": 962000 }, { "epoch": 28.26, "grad_norm": 2.105178117752075, "learning_rate": 1.7436284652421623e-05, "loss": 0.9699, "step": 962500 }, { "epoch": 28.27, "grad_norm": 11.809367179870605, "learning_rate": 1.7289192661090244e-05, "loss": 0.9869, "step": 963000 }, { "epoch": 28.29, "grad_norm": 2.5389130115509033, "learning_rate": 1.7142100669758865e-05, "loss": 0.9807, "step": 963500 }, { "epoch": 28.3, "grad_norm": 2.3344273567199707, "learning_rate": 1.6995008678427486e-05, "loss": 0.9861, "step": 964000 }, { "epoch": 28.32, "grad_norm": 2.3741703033447266, "learning_rate": 1.6847916687096107e-05, "loss": 0.9592, "step": 964500 }, { "epoch": 28.33, "grad_norm": 5.04816198348999, "learning_rate": 1.670082469576473e-05, "loss": 0.9903, "step": 965000 }, { "epoch": 28.35, "grad_norm": 1.2963931560516357, "learning_rate": 1.6553732704433353e-05, "loss": 0.9676, "step": 965500 }, { "epoch": 28.36, "grad_norm": 3.1595191955566406, "learning_rate": 1.6406640713101974e-05, "loss": 0.9942, "step": 966000 }, { "epoch": 28.38, "grad_norm": 1.5514929294586182, "learning_rate": 1.6259548721770595e-05, "loss": 0.9741, "step": 966500 }, { "epoch": 28.39, "grad_norm": 1.9268239736557007, "learning_rate": 1.6112456730439216e-05, "loss": 0.9869, "step": 967000 }, { "epoch": 28.41, "grad_norm": 2.7622382640838623, "learning_rate": 1.5965364739107837e-05, "loss": 0.9598, "step": 967500 }, { "epoch": 28.42, "grad_norm": 1.418778657913208, "learning_rate": 1.581827274777646e-05, "loss": 0.9464, "step": 968000 }, { "epoch": 28.44, "grad_norm": 2.1014456748962402, "learning_rate": 1.567118075644508e-05, "loss": 0.9846, "step": 968500 }, { "epoch": 28.45, "grad_norm": 14.58630657196045, "learning_rate": 1.55240887651137e-05, "loss": 0.9695, "step": 969000 }, { "epoch": 28.46, "grad_norm": 1.556394100189209, "learning_rate": 1.5376996773782322e-05, "loss": 0.9916, "step": 969500 }, { "epoch": 28.48, "grad_norm": 1.8601114749908447, "learning_rate": 1.5229904782450943e-05, "loss": 0.9552, "step": 970000 }, { "epoch": 28.49, "grad_norm": 2.2888152599334717, "learning_rate": 1.5082812791119564e-05, "loss": 0.972, "step": 970500 }, { "epoch": 28.51, "grad_norm": 2.9334394931793213, "learning_rate": 1.4935720799788187e-05, "loss": 0.9986, "step": 971000 }, { "epoch": 28.52, "grad_norm": 2.88075590133667, "learning_rate": 1.4788628808456808e-05, "loss": 0.9935, "step": 971500 }, { "epoch": 28.54, "grad_norm": 4.609189510345459, "learning_rate": 1.4641536817125429e-05, "loss": 0.9715, "step": 972000 }, { "epoch": 28.55, "grad_norm": 1.6899704933166504, "learning_rate": 1.449444482579405e-05, "loss": 0.9737, "step": 972500 }, { "epoch": 28.57, "grad_norm": 18.26349449157715, "learning_rate": 1.4347352834462671e-05, "loss": 0.9728, "step": 973000 }, { "epoch": 28.58, "grad_norm": 5.080548286437988, "learning_rate": 1.4200260843131292e-05, "loss": 0.9819, "step": 973500 }, { "epoch": 28.6, "grad_norm": 3.9338316917419434, "learning_rate": 1.4053168851799914e-05, "loss": 0.9758, "step": 974000 }, { "epoch": 28.61, "grad_norm": 1.4379346370697021, "learning_rate": 1.3906076860468535e-05, "loss": 0.9634, "step": 974500 }, { "epoch": 28.63, "grad_norm": 2.943136215209961, "learning_rate": 1.3758984869137157e-05, "loss": 0.9519, "step": 975000 }, { "epoch": 28.64, "grad_norm": 3.1957294940948486, "learning_rate": 1.3611892877805779e-05, "loss": 0.9727, "step": 975500 }, { "epoch": 28.66, "grad_norm": 2.38423752784729, "learning_rate": 1.34648008864744e-05, "loss": 0.9787, "step": 976000 }, { "epoch": 28.67, "grad_norm": 2.059079885482788, "learning_rate": 1.331770889514302e-05, "loss": 0.9851, "step": 976500 }, { "epoch": 28.69, "grad_norm": 1.8208869695663452, "learning_rate": 1.3170616903811642e-05, "loss": 0.9665, "step": 977000 }, { "epoch": 28.7, "grad_norm": 2.108597755432129, "learning_rate": 1.3023524912480263e-05, "loss": 0.9941, "step": 977500 }, { "epoch": 28.71, "grad_norm": 1.5259730815887451, "learning_rate": 1.2876432921148884e-05, "loss": 0.98, "step": 978000 }, { "epoch": 28.73, "grad_norm": 3.095134735107422, "learning_rate": 1.2729340929817505e-05, "loss": 0.9962, "step": 978500 }, { "epoch": 28.74, "grad_norm": 1.49867582321167, "learning_rate": 1.2582248938486128e-05, "loss": 0.9714, "step": 979000 }, { "epoch": 28.76, "grad_norm": 1.5933908224105835, "learning_rate": 1.243515694715475e-05, "loss": 0.999, "step": 979500 }, { "epoch": 28.77, "grad_norm": 2.663355588912964, "learning_rate": 1.228806495582337e-05, "loss": 0.9682, "step": 980000 }, { "epoch": 28.79, "grad_norm": 104.24024200439453, "learning_rate": 1.2140972964491992e-05, "loss": 0.9774, "step": 980500 }, { "epoch": 28.8, "grad_norm": 1.7283943891525269, "learning_rate": 1.1993880973160613e-05, "loss": 0.9607, "step": 981000 }, { "epoch": 28.82, "grad_norm": 2.0852441787719727, "learning_rate": 1.1846788981829234e-05, "loss": 1.0005, "step": 981500 }, { "epoch": 28.83, "grad_norm": 2.0442757606506348, "learning_rate": 1.1699696990497855e-05, "loss": 0.9836, "step": 982000 }, { "epoch": 28.85, "grad_norm": 1.5879892110824585, "learning_rate": 1.1552604999166476e-05, "loss": 0.9973, "step": 982500 }, { "epoch": 28.86, "grad_norm": 2.7892065048217773, "learning_rate": 1.14055130078351e-05, "loss": 0.9615, "step": 983000 }, { "epoch": 28.88, "grad_norm": 3.826296329498291, "learning_rate": 1.1258421016503722e-05, "loss": 0.9746, "step": 983500 }, { "epoch": 28.89, "grad_norm": 7.535625457763672, "learning_rate": 1.1111329025172343e-05, "loss": 0.9685, "step": 984000 }, { "epoch": 28.91, "grad_norm": 1.7329784631729126, "learning_rate": 1.0964237033840964e-05, "loss": 0.9745, "step": 984500 }, { "epoch": 28.92, "grad_norm": 1.9252818822860718, "learning_rate": 1.0817145042509585e-05, "loss": 0.9693, "step": 985000 }, { "epoch": 28.93, "grad_norm": 2.0630569458007812, "learning_rate": 1.0670053051178206e-05, "loss": 0.9634, "step": 985500 }, { "epoch": 28.95, "grad_norm": 1.6531202793121338, "learning_rate": 1.0522961059846827e-05, "loss": 0.9702, "step": 986000 }, { "epoch": 28.96, "grad_norm": 32.64999771118164, "learning_rate": 1.037586906851545e-05, "loss": 0.9593, "step": 986500 }, { "epoch": 28.98, "grad_norm": 5.578742980957031, "learning_rate": 1.0228777077184071e-05, "loss": 0.9624, "step": 987000 }, { "epoch": 28.99, "grad_norm": 1.9109100103378296, "learning_rate": 1.0081685085852692e-05, "loss": 0.9698, "step": 987500 }, { "epoch": 29.01, "grad_norm": 3.434389352798462, "learning_rate": 9.934593094521313e-06, "loss": 0.9824, "step": 988000 }, { "epoch": 29.02, "grad_norm": 3.804184913635254, "learning_rate": 9.787501103189935e-06, "loss": 0.9534, "step": 988500 }, { "epoch": 29.04, "grad_norm": 2.4718267917633057, "learning_rate": 9.640409111858556e-06, "loss": 0.9533, "step": 989000 }, { "epoch": 29.05, "grad_norm": 2.81648588180542, "learning_rate": 9.493317120527177e-06, "loss": 0.9688, "step": 989500 }, { "epoch": 29.07, "grad_norm": 3.936281681060791, "learning_rate": 9.346225129195798e-06, "loss": 0.9486, "step": 990000 }, { "epoch": 29.08, "grad_norm": 4.693751335144043, "learning_rate": 9.19913313786442e-06, "loss": 0.9505, "step": 990500 }, { "epoch": 29.1, "grad_norm": 3.5278286933898926, "learning_rate": 9.052041146533042e-06, "loss": 0.9338, "step": 991000 }, { "epoch": 29.11, "grad_norm": 2.9467105865478516, "learning_rate": 8.904949155201663e-06, "loss": 0.9602, "step": 991500 }, { "epoch": 29.13, "grad_norm": 2.929056406021118, "learning_rate": 8.757857163870284e-06, "loss": 0.9469, "step": 992000 }, { "epoch": 29.14, "grad_norm": 4.979098320007324, "learning_rate": 8.610765172538905e-06, "loss": 0.9364, "step": 992500 }, { "epoch": 29.15, "grad_norm": 3.7940807342529297, "learning_rate": 8.463673181207526e-06, "loss": 0.9407, "step": 993000 }, { "epoch": 29.17, "grad_norm": 2.5154898166656494, "learning_rate": 8.316581189876148e-06, "loss": 0.9504, "step": 993500 }, { "epoch": 29.18, "grad_norm": 3.7596309185028076, "learning_rate": 8.169489198544769e-06, "loss": 0.9762, "step": 994000 }, { "epoch": 29.2, "grad_norm": 2.13797926902771, "learning_rate": 8.022397207213392e-06, "loss": 0.9605, "step": 994500 }, { "epoch": 29.21, "grad_norm": 3.1647770404815674, "learning_rate": 7.875305215882013e-06, "loss": 0.9611, "step": 995000 }, { "epoch": 29.23, "grad_norm": 3.417736768722534, "learning_rate": 7.728213224550634e-06, "loss": 0.9611, "step": 995500 }, { "epoch": 29.24, "grad_norm": 4.05425500869751, "learning_rate": 7.581121233219255e-06, "loss": 0.9733, "step": 996000 }, { "epoch": 29.26, "grad_norm": 2.117457866668701, "learning_rate": 7.434029241887876e-06, "loss": 0.9494, "step": 996500 }, { "epoch": 29.27, "grad_norm": 9.484780311584473, "learning_rate": 7.286937250556497e-06, "loss": 0.9842, "step": 997000 }, { "epoch": 29.29, "grad_norm": 5.586018085479736, "learning_rate": 7.139845259225119e-06, "loss": 0.955, "step": 997500 }, { "epoch": 29.3, "grad_norm": 6.439801216125488, "learning_rate": 6.99275326789374e-06, "loss": 0.9608, "step": 998000 }, { "epoch": 29.32, "grad_norm": 2.890882730484009, "learning_rate": 6.845661276562361e-06, "loss": 0.9649, "step": 998500 }, { "epoch": 29.33, "grad_norm": 2.0991222858428955, "learning_rate": 6.6985692852309825e-06, "loss": 0.9476, "step": 999000 }, { "epoch": 29.35, "grad_norm": 2.40987229347229, "learning_rate": 6.5514772938996045e-06, "loss": 0.9537, "step": 999500 }, { "epoch": 29.36, "grad_norm": 2.95318865776062, "learning_rate": 6.4043853025682256e-06, "loss": 0.9749, "step": 1000000 }, { "epoch": 29.38, "grad_norm": 5.403099060058594, "learning_rate": 6.257293311236847e-06, "loss": 0.9496, "step": 1000500 }, { "epoch": 29.39, "grad_norm": 3.3474481105804443, "learning_rate": 6.110201319905468e-06, "loss": 0.9756, "step": 1001000 }, { "epoch": 29.4, "grad_norm": 2.7617027759552, "learning_rate": 5.96310932857409e-06, "loss": 0.944, "step": 1001500 }, { "epoch": 29.42, "grad_norm": 3.7849531173706055, "learning_rate": 5.816017337242711e-06, "loss": 0.9616, "step": 1002000 }, { "epoch": 29.43, "grad_norm": 2.1899170875549316, "learning_rate": 5.668925345911332e-06, "loss": 0.9578, "step": 1002500 }, { "epoch": 29.45, "grad_norm": 2.423264265060425, "learning_rate": 5.521833354579953e-06, "loss": 0.9886, "step": 1003000 }, { "epoch": 29.46, "grad_norm": 10.377019882202148, "learning_rate": 5.374741363248575e-06, "loss": 0.9559, "step": 1003500 }, { "epoch": 29.48, "grad_norm": 3.244654655456543, "learning_rate": 5.227649371917196e-06, "loss": 0.9467, "step": 1004000 }, { "epoch": 29.49, "grad_norm": 3.0262067317962646, "learning_rate": 5.080557380585817e-06, "loss": 0.9825, "step": 1004500 }, { "epoch": 29.51, "grad_norm": 1.8769322633743286, "learning_rate": 4.933465389254439e-06, "loss": 0.9835, "step": 1005000 }, { "epoch": 29.52, "grad_norm": 3.122030735015869, "learning_rate": 4.786373397923061e-06, "loss": 0.9686, "step": 1005500 }, { "epoch": 29.54, "grad_norm": 3.3955576419830322, "learning_rate": 4.639281406591682e-06, "loss": 0.9549, "step": 1006000 }, { "epoch": 29.55, "grad_norm": 11.812577247619629, "learning_rate": 4.492189415260304e-06, "loss": 0.944, "step": 1006500 }, { "epoch": 29.57, "grad_norm": 3.2719273567199707, "learning_rate": 4.345097423928925e-06, "loss": 0.9675, "step": 1007000 }, { "epoch": 29.58, "grad_norm": 2.0601563453674316, "learning_rate": 4.198005432597546e-06, "loss": 0.9816, "step": 1007500 }, { "epoch": 29.6, "grad_norm": 2.1465187072753906, "learning_rate": 4.050913441266167e-06, "loss": 0.9386, "step": 1008000 }, { "epoch": 29.61, "grad_norm": 3.070183753967285, "learning_rate": 3.903821449934789e-06, "loss": 0.9729, "step": 1008500 }, { "epoch": 29.62, "grad_norm": 8.74717903137207, "learning_rate": 3.7567294586034105e-06, "loss": 0.9483, "step": 1009000 }, { "epoch": 29.64, "grad_norm": 4.731165409088135, "learning_rate": 3.6096374672720316e-06, "loss": 0.9699, "step": 1009500 }, { "epoch": 29.65, "grad_norm": 3.4259703159332275, "learning_rate": 3.462545475940653e-06, "loss": 0.9768, "step": 1010000 }, { "epoch": 29.67, "grad_norm": 2.1280946731567383, "learning_rate": 3.3154534846092743e-06, "loss": 0.9732, "step": 1010500 }, { "epoch": 29.68, "grad_norm": 2.901146411895752, "learning_rate": 3.168361493277896e-06, "loss": 0.968, "step": 1011000 }, { "epoch": 29.7, "grad_norm": 3.9223461151123047, "learning_rate": 3.021269501946517e-06, "loss": 0.9499, "step": 1011500 }, { "epoch": 29.71, "grad_norm": 2.9555001258850098, "learning_rate": 2.8741775106151385e-06, "loss": 0.9306, "step": 1012000 }, { "epoch": 29.73, "grad_norm": 3.057680606842041, "learning_rate": 2.7270855192837596e-06, "loss": 0.9558, "step": 1012500 }, { "epoch": 29.74, "grad_norm": 2.978799343109131, "learning_rate": 2.579993527952381e-06, "loss": 0.9701, "step": 1013000 }, { "epoch": 29.76, "grad_norm": 2.0970911979675293, "learning_rate": 2.4329015366210023e-06, "loss": 0.9577, "step": 1013500 }, { "epoch": 29.77, "grad_norm": 2.8144404888153076, "learning_rate": 2.285809545289624e-06, "loss": 0.9587, "step": 1014000 }, { "epoch": 29.79, "grad_norm": 2.756420135498047, "learning_rate": 2.1387175539582454e-06, "loss": 0.9554, "step": 1014500 }, { "epoch": 29.8, "grad_norm": 3.5521059036254883, "learning_rate": 1.9916255626268665e-06, "loss": 0.95, "step": 1015000 }, { "epoch": 29.82, "grad_norm": 34.70965576171875, "learning_rate": 1.844533571295488e-06, "loss": 0.9708, "step": 1015500 }, { "epoch": 29.83, "grad_norm": 3.2949025630950928, "learning_rate": 1.6974415799641094e-06, "loss": 0.9673, "step": 1016000 }, { "epoch": 29.84, "grad_norm": 2.9649112224578857, "learning_rate": 1.5503495886327307e-06, "loss": 0.9758, "step": 1016500 }, { "epoch": 29.86, "grad_norm": 2.4358630180358887, "learning_rate": 1.403257597301352e-06, "loss": 0.9793, "step": 1017000 }, { "epoch": 29.87, "grad_norm": 3.3332042694091797, "learning_rate": 1.2561656059699736e-06, "loss": 0.9783, "step": 1017500 }, { "epoch": 29.89, "grad_norm": 5.788940906524658, "learning_rate": 1.1090736146385948e-06, "loss": 0.9519, "step": 1018000 }, { "epoch": 29.9, "grad_norm": 2.6301000118255615, "learning_rate": 9.619816233072163e-07, "loss": 0.9558, "step": 1018500 }, { "epoch": 29.92, "grad_norm": 2.9536306858062744, "learning_rate": 8.148896319758376e-07, "loss": 0.9466, "step": 1019000 }, { "epoch": 29.93, "grad_norm": 2.9202470779418945, "learning_rate": 6.67797640644459e-07, "loss": 0.9695, "step": 1019500 }, { "epoch": 29.95, "grad_norm": 3.0824179649353027, "learning_rate": 5.207056493130803e-07, "loss": 0.9361, "step": 1020000 }, { "epoch": 29.96, "grad_norm": 2.5186848640441895, "learning_rate": 3.736136579817017e-07, "loss": 0.9476, "step": 1020500 }, { "epoch": 29.98, "grad_norm": 5.588615894317627, "learning_rate": 2.265216666503231e-07, "loss": 0.9431, "step": 1021000 }, { "epoch": 29.99, "grad_norm": 2.025895595550537, "learning_rate": 7.942967531894445e-08, "loss": 0.9567, "step": 1021500 }, { "epoch": 30.0, "step": 1021770, "total_flos": 1.7792790799000776e+21, "train_loss": 1.519880951870647, "train_runtime": 591190.4296, "train_samples_per_second": 27.653, "train_steps_per_second": 1.728 } ], "logging_steps": 500, "max_steps": 1021770, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "total_flos": 1.7792790799000776e+21, "train_batch_size": 2, "trial_name": null, "trial_params": null }